1 | <?php |
||
17 | class SharedStringsHelper |
||
18 | { |
||
19 | /** Path of sharedStrings XML file inside the XLSX file */ |
||
20 | const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml'; |
||
21 | |||
22 | /** Main namespace for the sharedStrings.xml file */ |
||
23 | const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'; |
||
24 | |||
25 | /** Definition of XML nodes names used to parse data */ |
||
26 | const XML_NODE_SST = 'sst'; |
||
27 | const XML_NODE_SI = 'si'; |
||
28 | const XML_NODE_R = 'r'; |
||
29 | const XML_NODE_T = 't'; |
||
30 | |||
31 | /** Definition of XML attributes used to parse data */ |
||
32 | const XML_ATTRIBUTE_COUNT = 'count'; |
||
33 | const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount'; |
||
34 | const XML_ATTRIBUTE_XML_SPACE = 'xml:space'; |
||
35 | const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve'; |
||
36 | |||
37 | /** @var string Path of the XLSX file being read */ |
||
38 | protected $filePath; |
||
39 | |||
40 | /** @var string Temporary folder where the temporary files to store shared strings will be stored */ |
||
41 | protected $tempFolder; |
||
42 | |||
43 | /** @var CachingStrategyInterface The best caching strategy for storing shared strings */ |
||
44 | protected $cachingStrategy; |
||
45 | |||
46 | /** |
||
47 | * @param string $filePath Path of the XLSX file being read |
||
48 | * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored |
||
49 | */ |
||
50 | 41 | public function __construct($filePath, $tempFolder = null) |
|
55 | |||
56 | /** |
||
57 | * Returns whether the XLSX file contains a shared strings XML file |
||
58 | * |
||
59 | * @return bool |
||
60 | */ |
||
public function hasSharedStrings()
{
    $archive = new \ZipArchive();

    // An XLSX archive that cannot be opened is reported as having no shared strings.
    if ($archive->open($this->filePath) !== true) {
        return false;
    }

    // locateName() yields the index of the entry, or false when the entry does not exist.
    $entryIndex = $archive->locateName(self::SHARED_STRINGS_XML_FILE_PATH);
    $archive->close();

    return ($entryIndex !== false);
}
||
73 | |||
74 | /** |
||
75 | * Builds a cache containing all the shared strings of the sheet, stored using the best caching strategy. |
||
76 | * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'. |
||
77 | * It is then accessed by the sheet data, via the string index in the built table. |
||
78 | * |
||
79 | * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx |
||
80 | * |
||
81 | * The XML file can be really big with sheets containing a lot of data. That is why |
||
82 | * we need to use a XML reader that provides streaming like the XMLReader library. |
||
83 | * Please note that SimpleXML does not provide such a functionality but since it is faster |
||
84 | * and more handy to parse few XML nodes, it is used in combination with XMLReader for that purpose. |
||
85 | * |
||
86 | * @return void |
||
87 | * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read |
||
88 | */ |
||
public function extractSharedStrings()
{
    $xmlReader = new XMLReader();
    $sharedStringIndex = 0;

    $sharedStringsFilePath = $this->getSharedStringsFilePath();
    if ($xmlReader->open($sharedStringsFilePath) === false) {
        throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
    }

    try {
        // The declared number of unique strings drives the choice of caching strategy.
        $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
        $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

        $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

        // Each "<si>" node is one shared string; its position in the file is its index.
        while ($xmlReader->name === self::XML_NODE_SI) {
            $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
            $sharedStringIndex++;

            // jump to the next '<si>' tag
            $xmlReader->next(self::XML_NODE_SI);
        }

        $this->cachingStrategy->closeCache();
    } catch (XMLProcessingException $exception) {
        // Release the reader before rethrowing: the close() call at the end of this
        // method is not reached on this path, which would leave the file handle open.
        $xmlReader->close();

        throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
    }

    $xmlReader->close();
}
|
121 | |||
122 | /** |
||
123 | * @return string The path to the shared strings XML file |
||
124 | */ |
||
125 | 35 | protected function getSharedStringsFilePath() |
|
129 | |||
130 | /** |
||
131 | * Returns the shared strings unique count, as specified in <sst> tag. |
||
132 | * |
||
133 | * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance |
||
134 | * @return int|null Number of unique shared strings in the sharedStrings.xml file |
||
135 | * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read |
||
136 | */ |
||
protected function getSharedStringsUniqueCount($xmlReader)
{
    $xmlReader->next(self::XML_NODE_SST);

    // A DOCTYPE declaration is also named "sst": keep reading until the node named
    // "sst" is the actual element (or the reader has moved on to something else).
    while ($xmlReader->name === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
        $xmlReader->read();
    }

    $declaredCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

    // Some software does not write the "uniqueCount" attribute and only provides "count"
    // @see https://github.com/box/spout/issues/254
    if ($declaredCount === null) {
        $declaredCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
    }

    // Neither attribute is present: the number of shared strings is unknown.
    if ($declaredCount === null) {
        return null;
    }

    return (int) $declaredCount;
}
||
156 | |||
157 | /** |
||
158 | * Returns the best shared strings caching strategy. |
||
159 | * |
||
160 | * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown) |
||
161 | * @return CachingStrategyInterface |
||
162 | */ |
||
163 | 34 | protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount) |
|
168 | |||
169 | /** |
||
170 | * Processes the shared strings item XML node which the given XML reader is positioned on. |
||
171 | * |
||
172 | * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node |
||
173 | * @param int $sharedStringIndex Index of the processed shared strings item |
||
174 | * @return void |
||
175 | */ |
||
protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
{
    // NOTE: expand() will automatically decode all XML entities of the child nodes
    $textNodes = $xmlReader->expand()->getElementsByTagName(self::XML_NODE_T);

    // Collect the value of every relevant "<t>" node, in document order.
    $fragments = array();
    foreach ($textNodes as $textNode) {
        if (!$this->shouldExtractTextNodeValue($textNode)) {
            continue;
        }

        $fragment = $textNode->nodeValue;

        // Surrounding whitespace is only kept when the node asks for it.
        $fragments[] = $this->shouldPreserveWhitespace($textNode) ? $fragment : trim($fragment);
    }

    // The shared string is the concatenation of all collected fragments.
    $this->cachingStrategy->addStringForIndex(implode('', $fragments), $sharedStringIndex);
}
|
195 | |||
196 | /** |
||
197 | * Not all text nodes' values must be extracted. |
||
198 | * Some text nodes are part of a node describing the pronunciation for instance. |
||
199 | * We'll only consider the nodes whose parents are "<si>" or "<r>". |
||
200 | * |
||
201 | * @param \DOMElement $textNode Text node to check |
||
202 | * @return bool Whether the given text node's value must be extracted |
||
203 | */ |
||
204 | 24 | protected function shouldExtractTextNodeValue($textNode) |
|
209 | |||
210 | /** |
||
211 | * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace. |
||
212 | * |
||
213 | * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved |
||
214 | * @return bool Whether whitespace should be preserved |
||
215 | */ |
||
216 | 24 | protected function shouldPreserveWhitespace($textNode) |
|
221 | |||
222 | /** |
||
223 | * Returns the shared string at the given index, using the previously chosen caching strategy. |
||
224 | * |
||
225 | * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file |
||
226 | * @return string The shared string at the given index |
||
227 | * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index |
||
228 | */ |
||
229 | 24 | public function getStringAtIndex($sharedStringIndex) |
|
233 | |||
234 | /** |
||
235 | * Destroys the cache, freeing memory and removing any created artifacts |
||
236 | * |
||
237 | * @return void |
||
238 | */ |
||
239 | 38 | public function cleanup() |
|
245 | } |
||
246 |