Completed
Pull Request — master (#418)
by
unknown
11:48
created

SharedStringsHelper::shouldExtractTextNodeValue()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 5
ccs 2
cts 2
cp 1
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 3
nc 2
nop 1
crap 2
1
<?php
2
3
namespace Box\Spout\Reader\XLSX\Helper;
4
5
use Box\Spout\Common\Exception\IOException;
6
use Box\Spout\Reader\Exception\XMLProcessingException;
7
use Box\Spout\Reader\Wrapper\XMLReader;
8
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
9
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
10
11
/**
 * Class SharedStringsHelper
 * This class provides helper functions for reading the sharedStrings XML file
 * contained in an XLSX file.
 *
 * @package Box\Spout\Reader\XLSX\Helper
 */
class SharedStringsHelper
{
    /** Path of sharedStrings XML file inside the XLSX file */
    const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';

    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** Definition of XML nodes names used to parse data */
    const XML_NODE_SST = 'sst';
    const XML_NODE_SI = 'si';
    const XML_NODE_R = 'r';
    const XML_NODE_T = 't';

    /** Definition of XML attributes used to parse data */
    const XML_ATTRIBUTE_COUNT = 'count';
    const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string|null Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var CachingStrategyInterface|null The caching strategy chosen for storing shared strings (set by extractSharedStrings()) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     */
    public function __construct($filePath, $tempFolder = null)
    {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        return $this->getSharedStringsPath() !== null;
    }

    /**
     * Returns the path of the shared strings XML file, as it is actually stored in the archive.
     * The lookup is case-insensitive, so the returned name may differ in case
     * from SHARED_STRINGS_XML_FILE_PATH.
     *
     * @return string|null The entry name inside the archive, or NULL if the archive
     *                     cannot be opened or has no shared strings file
     */
    public function getSharedStringsPath()
    {
        $sharedStringsPath = null;
        $zip = new \ZipArchive();

        // ZipArchive::open() returns TRUE on success and an error code otherwise,
        // hence the strict comparison.
        if ($zip->open($this->filePath) === true) {
            $index = $zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH, \ZipArchive::FL_NOCASE);
            if ($index !== false) {
                $sharedStringsPath = $zip->getNameIndex($index);
            }
            $zip->close();
        }

        return $sharedStringsPath;
    }

    /**
     * Reads all the shared strings of the XLSX file and stores them using the caching
     * strategy returned by the caching strategy factory.
     * All the strings are stored in an XML file, located at 'xl/sharedStrings.xml' by default.
     * They are then accessed by the sheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use an XML reader that provides streaming like the XMLReader library.
     * Each "<si>" node is then expanded into a DOM node, since it is more handy
     * to parse the few XML nodes it contains that way.
     *
     * @param string $stringsPath Path of the XLSX's shared strings, inside the archive
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
     */
    public function extractSharedStrings($stringsPath = self::SHARED_STRINGS_XML_FILE_PATH)
    {
        $xmlReader = new XMLReader();
        $sharedStringIndex = 0;

        $sharedStringsFilePath = $this->getSharedStringsFilePath($stringsPath);
        if ($xmlReader->open($sharedStringsFilePath) === false) {
            throw new IOException('Could not open "' . $stringsPath . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            while ($xmlReader->name === self::XML_NODE_SI) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                $sharedStringIndex++;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();

        } catch (XMLProcessingException $exception) {
            // Close the reader before propagating the error: otherwise the stream
            // opened above would stay open on this failure path.
            $xmlReader->close();

            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
        }

        $xmlReader->close();
    }

    /**
     * Builds the "zip://" stream path pointing to the given entry of the XLSX file.
     *
     * @param string $path Path of the shared strings XML file inside the XLSX file
     * @return string The path to the shared strings XML file
     */
    protected function getSharedStringsFilePath($path = self::SHARED_STRINGS_XML_FILE_PATH)
    {
        return 'zip://' . $this->filePath . '#' . $path;
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @return int|null Number of unique shared strings in the sharedStrings.xml file (NULL if not specified)
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->name === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return ($uniqueCount !== null) ? (int) $uniqueCount : null;
    }

    /**
     * Returns the best shared strings caching strategy.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return CachingStrategyFactory::getInstance()
                ->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on.
     * The values of all the relevant "<t>" nodes of the item are concatenated and
     * the resulting string is added to the cache, at the given index.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
     * @param int $sharedStringIndex Index of the processed shared strings item
     * @return void
     */
    protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if ($this->shouldExtractTextNodeValue($textNode)) {
                $textNodeValue = $textNode->nodeValue;
                $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);

                // Whitespace is only significant when explicitly marked as preserved
                $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : trim($textNodeValue);
            }
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param \DOMElement $textNode Text node to check
     * @return bool Whether the given text node's value must be extracted
     */
    protected function shouldExtractTextNodeValue($textNode)
    {
        $parentTagName = $textNode->parentNode->localName;
        return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);
        return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * extractSharedStrings() must have been called beforehand, as it is the one
     * setting the caching strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing if no caching strategy has been set yet.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy) {
            $this->cachingStrategy->clearCache();
        }
    }
}
260