Completed
Pull Request — master (#229)
by Adrien
02:36
created

getBestSharedStringsCachingStrategy()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 5
rs 9.4285
ccs 3
cts 3
cp 1
cc 1
eloc 3
nc 1
nop 1
crap 1
1
<?php
2
3
namespace Box\Spout\Reader\XLSX\Helper;
4
5
use Box\Spout\Common\Exception\IOException;
6
use Box\Spout\Reader\Exception\XMLProcessingException;
7
use Box\Spout\Reader\Wrapper\SimpleXMLElement;
8
use Box\Spout\Reader\Wrapper\XMLReader;
9
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyFactory;
10
use Box\Spout\Reader\XLSX\Helper\SharedStringsCaching\CachingStrategyInterface;
11
12
/**
13
 * Class SharedStringsHelper
14
 * This class provides helper functions for reading sharedStrings XML file
15
 *
16
 * @package Box\Spout\Reader\XLSX\Helper
17
 */
18
class SharedStringsHelper
19
{
20
    /** Path of sharedStrings XML file inside the XLSX file */
21
    const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';
22
23
    /** Main namespace for the sharedStrings.xml file */
24
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
25
26
    /** @var string Path of the XLSX file being read */
27
    protected $filePath;
28
29
    /** @var string Temporary folder where the temporary files to store shared strings will be stored */
30
    protected $tempFolder;
31
32
    /** @var CachingStrategyInterface The best caching strategy for storing shared strings */
33
    protected $cachingStrategy;
34
35
    /**
     * Stores the location of the XLSX file and of the optional temporary folder.
     *
     * @param string $filePath Path of the XLSX file being read
     * @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     */
    public function __construct($filePath, $tempFolder = null)
    {
        // Nothing is opened or read here; both values are only kept for later use.
        $this->tempFolder = $tempFolder;
        $this->filePath = $filePath;
    }
44
45
    /**
     * Tells whether the XLSX archive holds a shared strings XML file.
     *
     * @return bool TRUE when the shared strings entry is found in the archive,
     *              FALSE when it is missing or when the archive cannot be opened
     */
    public function hasSharedStrings()
    {
        $archive = new \ZipArchive();

        // ZipArchive::open() yields TRUE on success and an error code otherwise,
        // hence the strict comparison.
        if ($archive->open($this->filePath) !== true) {
            return false;
        }

        // locateName() yields the entry index, or FALSE when the entry does not exist
        $entryIndex = $archive->locateName(self::SHARED_STRINGS_XML_FILE_PATH);
        $archive->close();

        return ($entryIndex !== false);
    }
62
63
    /**
     * Reads all the shared strings of the workbook and stores them through the
     * caching strategy selected for the announced number of unique strings.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * They are then accessed by the sheet data, via their index (see getStringAtIndex()).
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * a streaming XML reader is used to walk through the file. Each "si" node is then
     * converted to a SimpleXMLElement, which is more convenient to query a small subtree.
     *
     * The XML reader is closed on every exit path once it has been opened,
     * including when an exception interrupts the extraction.
     *
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be read
     */
    public function extractSharedStrings()
    {
        $xmlReader = new XMLReader();
        $sharedStringIndex = 0;
        /** @noinspection PhpUnnecessaryFullyQualifiedNameInspection */
        $escaper = new \Box\Spout\Common\Escaper\XLSX();

        $sharedStringsFilePath = $this->getSharedStringsFilePath();
        if ($xmlReader->open($sharedStringsFilePath) === false) {
            throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound('si');

            while ($xmlReader->name === 'si') {
                $node = $this->getSimpleXmlElementNodeFromXMLReader($xmlReader);
                $node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);

                // removes nodes that should not be read, like the pronunciation of the Kanji characters
                $cleanNode = $this->removeSuperfluousTextNodes($node);

                // find all text nodes "t"; there can be multiple if the cell contains formatting
                $textNodes = $cleanNode->xpath('//ns:t');

                $textValue = '';
                foreach ($textNodes as $nodeIndex => $textNode) {
                    if ($nodeIndex !== 0) {
                        // add a space between each "t" node
                        $textValue .= ' ';
                    }

                    if ($this->shouldPreserveWhitespace($textNode)) {
                        $textValue .= $textNode->__toString();
                    } else {
                        $textValue .= trim($textNode->__toString());
                    }
                }

                $unescapedTextValue = $escaper->unescape($textValue);
                $this->cachingStrategy->addStringForIndex($unescapedTextValue, $sharedStringIndex);

                $sharedStringIndex++;

                // jump to the next 'si' tag
                $xmlReader->next('si');
            }
        } catch (XMLProcessingException $exception) {
            // release the reader before reporting the failure, so it is not left open
            $xmlReader->close();
            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
        } catch (\Exception $exception) {
            // any other failure (e.g. the IOException raised when a node cannot be converted)
            // is propagated unchanged, but only after the reader has been released
            $xmlReader->close();
            throw $exception;
        }

        $this->cachingStrategy->closeCache();

        $xmlReader->close();
    }
137
138
    /**
     * Builds the "zip://" stream path pointing at the shared strings XML entry
     * located inside the XLSX file.
     *
     * @return string The path to the shared strings XML file
     */
    protected function getSharedStringsFilePath()
    {
        // "zip://<archive>#<entry>" addresses a single entry inside the archive
        $archiveUri = 'zip://' . $this->filePath;

        return $archiveUri . '#' . self::SHARED_STRINGS_XML_FILE_PATH;
    }
145
146
    /**
     * Reads the "uniqueCount" attribute of the <sst> tag.
     *
     * NOTE(review): no exception is thrown directly here; failures presumably come from
     * the reader wrapper and are converted to an IOException by the caller - confirm.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @return int Number of unique shared strings announced by the sharedStrings.xml file
     *             (0 when the attribute does not convert to a number)
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next('sst');

        // A node named "sst" may be reported without being the actual element (e.g. a DOCTYPE):
        // keep reading until the reader leaves such nodes.
        while (true) {
            $isSstNode = ($xmlReader->name === 'sst');
            if (!$isSstNode || $xmlReader->nodeType === XMLReader::ELEMENT) {
                break;
            }

            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute('uniqueCount');

        return (int) $uniqueCount;
    }
164
165
    /**
     * Asks the caching strategy factory for the strategy best suited to the
     * given number of unique shared strings.
     *
     * @param int $sharedStringsUniqueCount Number of unique shared strings to store
     * @return CachingStrategyInterface The strategy chosen by the factory
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        $strategyFactory = CachingStrategyFactory::getInstance();

        // the temporary folder is forwarded so the factory can hand it to the strategy it creates
        return $strategyFactory->getBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder);
    }
176
177
    /**
     * Converts the node the given XMLReader currently points at into a SimpleXMLElement,
     * which makes the parsing of the subtree simpler.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement
     * @throws \Box\Spout\Common\Exception\IOException If the current node cannot be read
     */
    protected function getSimpleXmlElementNodeFromXMLReader($xmlReader)
    {
        try {
            // readOuterXml() returns the XML of the current node, tag included
            return new SimpleXMLElement($xmlReader->readOuterXml());
        } catch (XMLProcessingException $exception) {
            throw new IOException("The sharedStrings.xml file contains unreadable data [{$exception->getMessage()}].");
        }
    }
196
197
    /**
     * Strips the nodes whose text must not end up in the read string,
     * like the pronunciation of the Kanji characters.
     * The given node is modified and returned.
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $parentNode Parent node that may contain nodes to remove
     * @return \Box\Spout\Reader\Wrapper\SimpleXMLElement Cleaned parent node
     */
    protected function removeSuperfluousTextNodes($parentNode)
    {
        // rPh: Pronunciation of the text
        // pPr: Paragraph Properties / Previous Paragraph Properties
        // rPr: Run Properties for the Paragraph Mark / Previous Run Properties for the Paragraph Mark
        foreach (['rPh', 'pPr', 'rPr'] as $tagName) {
            $parentNode->removeNodesMatchingXPath('//ns:' . $tagName);
        }

        return $parentNode;
    }
219
220
    /**
     * Whitespace must be preserved when the text node carries the attribute 'xml:space="preserve"'.
     *
     * @param \Box\Spout\Reader\Wrapper\SimpleXMLElement $textNode The text node element (<t>) whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        // reads the "space" attribute from the "xml" namespace
        return ($textNode->getAttribute('space', 'xml') === 'preserve');
    }
231
232
    /**
     * Looks up a shared string through the caching strategy.
     * The strategy is only set by extractSharedStrings(), which therefore has to be called first.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        $sharedString = $this->cachingStrategy->getStringAtIndex($sharedStringIndex);

        return $sharedString;
    }
243
244
    /**
     * Clears the cache held by the caching strategy, if one was created.
     *
     * @return void
     */
    public function cleanup()
    {
        // no strategy is set until extractSharedStrings() has selected one
        if (!$this->cachingStrategy) {
            return;
        }

        $this->cachingStrategy->clearCache();
    }
255
}
256