Completed
Pull Request — master (#649)
by Adrien
03:03 queued 33s
created

SharedStringsManager   A

Complexity

Total Complexity 22

Size/Duplication

Total Lines 235
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 6

Test Coverage

Coverage 98.39%

Importance

Changes 0
Metric Value
wmc 22
lcom 1
cbo 6
dl 0
loc 235
ccs 61
cts 62
cp 0.9839
rs 10
c 0
b 0
f 0

10 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 15 1
A hasSharedStrings() 0 4 1
A extractSharedStrings() 0 31 4
A getSharedStringsUniqueCount() 0 19 5
A getBestSharedStringsCachingStrategy() 0 5 1
A processSharedStringsItem() 0 19 4
A shouldExtractTextNodeValue() 0 6 2
A shouldPreserveWhitespace() 0 6 1
A getStringAtIndex() 0 4 1
A cleanup() 0 6 2
1
<?php
2
3
namespace Box\Spout\Reader\XLSX\Manager;
4
5
use Box\Spout\Common\Exception\IOException;
6
use Box\Spout\Reader\Exception\XMLProcessingException;
7
use Box\Spout\Reader\Wrapper\XMLReader;
8
use Box\Spout\Reader\XLSX\Creator\HelperFactory;
9
use Box\Spout\Reader\XLSX\Creator\InternalEntityFactory;
10
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
11
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
12
13
/**
 * Class SharedStringsManager
 * This class manages the shared strings defined in the associated XML file.
 *
 * Typical usage, as far as this class shows: check hasSharedStrings(), call
 * extractSharedStrings() once to fill the cache, look strings up with
 * getStringAtIndex(), and finally call cleanup() to discard the cache.
 */
class SharedStringsManager
{
    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** Definition of XML nodes names used to parse data */
    const XML_NODE_SST = 'sst';
    const XML_NODE_SI = 'si';
    const XML_NODE_R = 'r';
    const XML_NODE_T = 't';

    /** Definition of XML attributes used to parse data */
    const XML_ATTRIBUTE_COUNT = 'count';
    const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
    protected $workbookRelationshipsManager;

    /** @var InternalEntityFactory Factory to create entities */
    protected $entityFactory;

    /** @var HelperFactory $helperFactory Factory to create helpers */
    protected $helperFactory;

    /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
    protected $cachingStrategyFactory;

    /** @var CachingStrategyInterface|null The best caching strategy for storing shared strings (unset until extractSharedStrings() picks one) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param WorkbookRelationshipsManager $workbookRelationshipsManager Helps retrieving workbook relationships
     * @param InternalEntityFactory $entityFactory Factory to create entities
     * @param HelperFactory $helperFactory Factory to create helpers
     * @param CachingStrategyFactory $cachingStrategyFactory Factory to create shared strings caching strategies
     */
    public function __construct(
        $filePath,
        $tempFolder,
        $workbookRelationshipsManager,
        $entityFactory,
        $helperFactory,
        $cachingStrategyFactory
    ) {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
        $this->workbookRelationshipsManager = $workbookRelationshipsManager;
        $this->entityFactory = $entityFactory;
        $this->helperFactory = $helperFactory;
        $this->cachingStrategyFactory = $cachingStrategyFactory;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads all the shared strings of the workbook and stores them using the best caching strategy.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * It is then accessed by the sheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     *
     * Once the file has been opened, the XML reader is always closed before this method
     * returns or throws, so a malformed file does not leave the reader open.
     *
     * @throws \Box\Spout\Common\Exception\IOException If shared strings XML file can't be read
     * @return void
     */
    public function extractSharedStrings()
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = $this->entityFactory->createXMLReader();
        $sharedStringIndex = 0;

        if ($xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath) === false) {
            throw new IOException('Could not open "' . $sharedStringsXMLFilePath . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SI) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                $sharedStringIndex++;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
        } finally {
            // Release the reader on every exit path (success, invalid XML, unexpected error)
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException Presumably raised by the reader when the XML is invalid (the caller converts it to an IOException) - confirm against the XMLReader wrapper
     * @return int|null Number of unique shared strings in the sharedStrings.xml file (NULL if neither attribute is present)
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return ($uniqueCount !== null) ? (int) $uniqueCount : null;
    }

    /**
     * Returns the best shared strings caching strategy.
     *
     * The decision is delegated to the caching strategy factory, which receives the
     * unique count, the temporary folder and the helper factory.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return $this->cachingStrategyFactory
                ->createBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder, $this->helperFactory);
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on.
     *
     * The values of all the relevant "<t>" nodes of the item are concatenated, in document
     * order, and the resulting string is stored in the cache under the given index.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
     * @param int $sharedStringIndex Index of the processed shared strings item
     * @return void
     */
    protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if ($this->shouldExtractTextNodeValue($textNode)) {
                $textNodeValue = $textNode->nodeValue;
                $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);

                // Surrounding whitespace is only kept when the node explicitly asks for it
                $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : trim($textNodeValue);
            }
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param \DOMElement $textNode Text node to check
     * @return bool Whether the given text node's value must be extracted
     */
    protected function shouldExtractTextNodeValue($textNode)
    {
        $parentTagName = $textNode->parentNode->localName;

        return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     *
     * NOTE: the caching strategy is only set by extractSharedStrings(), so that method
     * must have been called beforehand.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     * @return string The shared string at the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts
     *
     * Does nothing when no caching strategy has been chosen yet.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy) {
            $this->cachingStrategy->clearCache();
        }
    }
}
252