Completed
Pull Request — develop_3.0 (#460)
by Adrien
02:25
created

SharedStringsManager::shouldPreserveWhitespace()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 5
ccs 3
cts 3
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 3
nc 1
nop 1
crap 1
1
<?php
2
3
namespace Box\Spout\Reader\XLSX\Manager;
4
5
use Box\Spout\Common\Exception\IOException;
6
use Box\Spout\Reader\Exception\XMLProcessingException;
7
use Box\Spout\Reader\Wrapper\XMLReader;
8
use Box\Spout\Reader\XLSX\Creator\EntityFactory;
9
use Box\Spout\Reader\XLSX\Creator\HelperFactory;
10
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
11
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
12
13
/**
14
 * Class SharedStringsManager
15
 * This class manages the shared strings defined in the associated XML file
16
 *
17
 * @package Box\Spout\Reader\XLSX\Manager
18
 */
19
class SharedStringsManager
{
    /** Path of sharedStrings XML file inside the XLSX file */
    const SHARED_STRINGS_XML_FILE_PATH = 'xl/sharedStrings.xml';

    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** Definition of XML nodes names used to parse data */
    const XML_NODE_SST = 'sst';
    const XML_NODE_SI = 'si';
    const XML_NODE_R = 'r';
    const XML_NODE_T = 't';

    /** Definition of XML attributes used to parse data */
    const XML_ATTRIBUTE_COUNT = 'count';
    const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var EntityFactory Factory to create entities */
    protected $entityFactory;

    /** @var HelperFactory $helperFactory Factory to create helpers */
    protected $helperFactory;

    /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
    protected $cachingStrategyFactory;

    /** @var CachingStrategyInterface|null Caching strategy chosen by extractSharedStrings(); unset until then */
    protected $cachingStrategy;

    /**
     * Stores the collaborators; no I/O is performed at construction time.
     *
     * @param string $filePath Path of the XLSX file being read
     * @param string $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param EntityFactory $entityFactory Factory to create entities
     * @param HelperFactory $helperFactory Factory to create helpers
     * @param CachingStrategyFactory $cachingStrategyFactory Factory to create shared strings caching strategies
     */
    public function __construct($filePath, $tempFolder, $entityFactory, $helperFactory, $cachingStrategyFactory)
    {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
        $this->entityFactory = $entityFactory;
        $this->helperFactory = $helperFactory;
        $this->cachingStrategyFactory = $cachingStrategyFactory;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     * If the archive cannot be opened, this returns false.
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        $zip = $this->entityFactory->createZipArchive();

        if ($zip->open($this->filePath) !== true) {
            return false;
        }

        // locateName() is compared strictly against false: a valid entry index of 0 must count as "found"
        $hasSharedStrings = ($zip->locateName(self::SHARED_STRINGS_XML_FILE_PATH) !== false);
        $zip->close();

        return $hasSharedStrings;
    }

    /**
     * Reads every "<si>" item of 'xl/sharedStrings.xml' and hands each resulting string,
     * together with its zero-based position, to the chosen caching strategy.
     * The sheet data later refers to these strings by that index.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * a streaming XML reader is used instead of loading the whole document.
     *
     * The XML reader is always closed once it has been opened, whether extraction
     * succeeds or fails.
     *
     * @return void
     * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml can't be opened or is invalid
     */
    public function extractSharedStrings()
    {
        $xmlReader = $this->entityFactory->createXMLReader();

        if ($xmlReader->openFileInZip($this->filePath, self::SHARED_STRINGS_XML_FILE_PATH) === false) {
            throw new IOException('Could not open "' . self::SHARED_STRINGS_XML_FILE_PATH . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            $sharedStringIndex = 0;
            while ($xmlReader->name === self::XML_NODE_SI) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                $sharedStringIndex++;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            // Keep the original exception attached so the root cause stays visible to callers
            throw new IOException(
                "The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]",
                0,
                $exception
            );
        } finally {
            // Release the reader on every exit path, not only on success
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     * Falls back to the "count" attribute when "uniqueCount" is absent.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @return int|null Number of unique shared strings in the sharedStrings.xml file, or NULL if neither attribute is set
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->name === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        if ($uniqueCount === null) {
            return null;
        }

        return (int) $uniqueCount;
    }

    /**
     * Asks the caching strategy factory for the strategy to use, given the expected number
     * of unique strings, the temporary folder and the helper factory.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return $this->cachingStrategyFactory->createBestCachingStrategy(
            $sharedStringsUniqueCount,
            $this->tempFolder,
            $this->helperFactory
        );
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on:
     * concatenates the values of its relevant "<t>" descendants and stores the result
     * in the caching strategy under the given index.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
     * @param int $sharedStringIndex Index of the processed shared strings item
     * @return void
     */
    protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if (!$this->shouldExtractTextNodeValue($textNode)) {
                continue;
            }

            // Surrounding whitespace is only kept when the node explicitly asks for it
            $textNodeValue = $textNode->nodeValue;
            $sharedStringValue .= $this->shouldPreserveWhitespace($textNode)
                ? $textNodeValue
                : trim($textNodeValue);
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param \DOMElement $textNode Text node to check
     * @return bool Whether the given text node's value must be extracted
     */
    protected function shouldExtractTextNodeValue($textNode)
    {
        $parentTagName = $textNode->parentNode->localName;

        return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * extractSharedStrings() must have been called first, as it is what sets the caching strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @return string The shared string at the given index
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     *                                                                   (raised by the caching strategy; not visible here)
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Clears the caching strategy's cache, if a strategy was ever chosen.
     * Safe to call when extractSharedStrings() has not run.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy !== null) {
            $this->cachingStrategy->clearCache();
        }
    }
}
252