Completed
Push — master ( 2fa01c...687c32 )
by Adrien
02:55
created

SharedStringsHelper::extractTextValueForNodes()   A

Complexity

Conditions 4
Paths 5

Size

Total Lines 18
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 4

Importance

Changes 0
Metric Value
dl 0
loc 18
rs 9.2
c 0
b 0
f 0
ccs 11
cts 11
cp 1
cc 4
eloc 9
nc 5
nop 1
crap 4
1
<?php
2
3
namespace Box\Spout\Reader\XLSX\Helper;
4
5
use Box\Spout\Common\Exception\IOException;
6
use Box\Spout\Reader\Exception\XMLProcessingException;
7
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
8
use Box\Spout\Reader\Wrapper\XMLReader;
9
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
10
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
11
12
/**
 * Class SharedStringsHelper
 * This class provides helper functions for reading the sharedStrings XML file
 * of an XLSX file: detecting it, extracting its strings into a cache and
 * retrieving a string by its index.
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class SharedStringsHelper
{
    /** Path of sharedStrings XML file inside the XLSX file */
    const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';

    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string|null Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var CachingStrategyInterface|null The caching strategy storing the shared strings (NULL until extraction) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     */
    public function __construct($filePath, $tempFolder = null)
    {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     * If the XLSX file cannot be opened as a ZIP archive, FALSE is returned.
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        $hasSharedStrings = false;
        $zip = new \ZipArchive();

        // ZipArchive::open() returns TRUE on success and an error code otherwise,
        // hence the strict comparison
        if ($zip->open($this->filePath) === true) {
            $hasSharedStrings = ($zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH) !== false);
            $zip->close();
        }

        return $hasSharedStrings;
    }

    /**
     * Reads all the shared strings of the XLSX file and stores them, indexed by their
     * position, using the caching strategy selected for the number of strings.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * It is then accessed by the sheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     * Please note that SimpleXML does not provide such a functionality but since it is faster
     * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose.
     *
     * The XML reader is closed on every exit path once it has been opened,
     * including when an exception interrupts the extraction.
     *
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
     */
    public function extractSharedStrings()
    {
        $xmlReader = new XMLReader();
        $sharedStringIndex = 0;
        /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
        $escaper = \Box\Spout\Common\Escaper\XLSX::getInstance();

        $sharedStringsFilePath = $this->getSharedStringsFilePath();
        if ($xmlReader->open($sharedStringsFilePath) === false) {
            throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound('si');

            while ($xmlReader->name === 'si') {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex, $escaper);
                $sharedStringIndex++;

                // jump to the next 'si' tag
                $xmlReader->next('si');
            }

            $this->cachingStrategy->closeCache();

        } catch (XMLProcessingException $exception) {
            // release the reader before reporting the failure
            $xmlReader->close();
            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
        } catch (\Exception $exception) {
            // any other failure (e.g. the IOException raised for an unreadable "si" node)
            // must not leave the reader open either; the exception is rethrown untouched
            $xmlReader->close();
            throw $exception;
        }

        $xmlReader->close();
    }

    /**
     * Builds the stream path giving access to the shared strings XML file
     * inside the XLSX archive, using the "zip://" stream wrapper.
     *
     * @return string The path to the shared strings XML file
     */
    protected function getSharedStringsFilePath()
    {
        return 'zip://' . $this->filePath . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     * Falls back to the "count" attribute when "uniqueCount" is missing.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @return int|null Number of unique shared strings in the sharedStrings.xml file (NULL if neither attribute is set)
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next('sst');

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->name === 'sst' && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute('uniqueCount');

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute('count');
        }

        return ($uniqueCount !== null) ? intval($uniqueCount) : null;
    }

    /**
     * Returns the best shared strings caching strategy, as chosen by the caching strategy
     * factory from the number of unique shared strings and the configured temporary folder.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return CachingStrategyFactory::getInstance()
                ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on:
     * the text of its "t" nodes is extracted, unescaped and added to the cache at the given index.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader
     * @param int $sharedStringIndex Index of the processed shared strings item
     * @param \Box\Spout\Common\Escaper\XLSX $escaper Helper to escape values
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
     */
    protected function processSharedStringsItem($xmlReader, $sharedStringIndex, $escaper)
    {
        $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
        // the "ns" prefix is the one used by all the XPath queries run against this node
        $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);

        // removes nodes that should not be read, like the pronunciation of the Kanji characters
        $cleanNode = $this->removeSuperfluousTextNodes($node);

        // find all text nodes "t"; there can be multiple if the cell contains formatting
        $textNodes = $cleanNode->xpath('//ns:t');

        $textValue = $this->extractTextValueForNodes($textNodes);
        $unescapedTextValue = $escaper->unescape($textValue);

        $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);
    }

    /**
     * Returns a SimpleXMLElement node from the current node in the given XMLReader instance.
     * This is to simplify the parsing of the subtree.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement
     * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
     */
    protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
    {
        try {
            return new SimpleXMLElement($xmlReader->readOuterXml());
        } catch (XMLProcessingException $exception) {
            throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}].");
        }
    }

    /**
     * Removes nodes that should not be read, like the pronunciation of the Kanji characters.
     * By keeping them, their text content would be added to the read string.
     * The given node is modified in place and returned.
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node
     */
    protected function removeSuperfluousTextNodes($parentNode)
    {
        $tagsToRemove = [
            'rPh', // Pronunciation of the text
            'pPr', // Paragraph Properties / Previous Paragraph Properties
            'rPr', // Run Properties for the Paragraph Mark / Previous Run Properties for the Paragraph Mark
        ];

        foreach ($tagsToRemove as $tagToRemove) {
            $xpath = '//ns:' . $tagToRemove;
            $parentNode->removeNodesMatchingXPath($xpath);
        }

        return $parentNode;
    }

    /**
     * Concatenates the values of the given text nodes, separated by a single space.
     * The value of a node is trimmed unless the node asks for whitespace to be preserved.
     *
     * @param array $textNodes Text XML nodes ("<t>"), indexed from 0
     * @return string The value associated with the given text node(s)
     */
    protected function extractTextValueForNodes($textNodes)
    {
        $textValue = '';

        foreach ($textNodes as $nodeIndex => $textNode) {
            if ($nodeIndex !== 0) {
                // add a space between each "t" node
                $textValue .= ' ';
            }

            $textNodeAsString = $textNode->__toString();
            $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);

            $textValue .= ($shouldPreserveWhitespace) ? $textNodeAsString : trim($textNodeAsString);
        }

        return $textValue;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute('space', 'xml');
        return ($spaceValue === 'preserve');
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        // No strategy means the shared strings were never (successfully) extracted,
        // so no string can exist for any index: report it as "not found"
        // instead of failing on a call to a member function on null.
        if ($this->cachingStrategy === null) {
            /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
            throw new \Box\Spout\Reader\Exception\SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
        }

        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing if no caching strategy has been set up yet.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy) {
            $this->cachingStrategy->clearCache();
        }
    }
}
287