Passed
Push — develop_3.0 ( 0c8a53...e2b519 )
by Adrien
02:43
created

SharedStringsManager::processSharedStringsItem()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 19
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 4

Importance

Changes 0
Metric Value
dl 0
loc 19
ccs 11
cts 11
cp 1
rs 9.2
c 0
b 0
f 0
cc 4
eloc 10
nc 4
nop 2
crap 4
1
<?php
2
3
namespace Box\Spout\Reader\XLSX\Manager;
4
5
use Box\Spout\Common\Exception\IOException;
6
use Box\Spout\Reader\Exception\XMLProcessingException;
7
use Box\Spout\Reader\Wrapper\XMLReader;
8
use Box\Spout\Reader\XLSX\Creator\EntityFactory;
9
use Box\Spout\Reader\XLSX\Creator\HelperFactory;
10
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
11
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
12
use Box\Spout\Writer\Common\Entity\Workbook;
13
14
/**
 * Class SharedStringsManager
 * This class manages the shared strings defined in the associated XML file.
 *
 * Typical usage, as far as this class shows: check hasSharedStrings(), call extractSharedStrings()
 * once to fill the cache, look values up with getStringAtIndex() and finally call cleanup().
 */
class SharedStringsManager
{
    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** Definition of XML nodes names used to parse data */
    const XML_NODE_SST = 'sst';
    const XML_NODE_SI = 'si';
    const XML_NODE_R = 'r';
    const XML_NODE_T = 't';

    /** Definition of XML attributes used to parse data */
    const XML_ATTRIBUTE_COUNT = 'count';
    const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
    protected $workbookRelationshipsManager;

    /** @var EntityFactory Factory to create entities */
    protected $entityFactory;

    /** @var HelperFactory $helperFactory Factory to create helpers */
    protected $helperFactory;

    /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
    protected $cachingStrategyFactory;

    /** @var CachingStrategyInterface|null The best caching strategy for storing shared strings (NULL until extractSharedStrings() has selected one) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param WorkbookRelationshipsManager $workbookRelationshipsManager Helps retrieving workbook relationships
     * @param EntityFactory $entityFactory Factory to create entities
     * @param HelperFactory $helperFactory Factory to create helpers
     * @param CachingStrategyFactory $cachingStrategyFactory Factory to create shared strings caching strategies
     */
    public function __construct(
        $filePath,
        $tempFolder,
        $workbookRelationshipsManager,
        $entityFactory,
        $helperFactory,
        $cachingStrategyFactory
    ) {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
        $this->workbookRelationshipsManager = $workbookRelationshipsManager;
        $this->entityFactory = $entityFactory;
        $this->helperFactory = $helperFactory;
        $this->cachingStrategyFactory = $cachingStrategyFactory;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     * The answer is delegated to the workbook relationships manager.
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads every "<si>" item of the shared strings XML file and stores its value in the
     * caching strategy, keyed by its position (0-based) in the file.
     * The sheet data later accesses these values via the string index (see getStringAtIndex()).
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     *
     * @throws \Box\Spout\Common\Exception\IOException If shared strings XML file can't be opened or is invalid
     * @return void
     */
    public function extractSharedStrings()
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = $this->entityFactory->createXMLReader();
        $sharedStringIndex = 0;

        if ($xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath) === false) {
            throw new IOException('Could not open "' . $sharedStringsXMLFilePath . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SI) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                $sharedStringIndex++;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
        } finally {
            // The reader was successfully opened above: always release it, including when
            // the file turns out to be invalid and an exception is propagated to the caller.
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     * Falls back to the "count" attribute when "uniqueCount" is absent.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException Presumably raised by the reader when the XML cannot be parsed; extractSharedStrings() converts it into an IOException
     * @return int|null Number of unique shared strings in the sharedStrings.xml file (NULL if neither attribute is present)
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return ($uniqueCount !== null) ? (int) $uniqueCount : null;
    }

    /**
     * Returns the best shared strings caching strategy, as chosen by the caching strategy factory
     * from the number of unique strings, the temporary folder and the helper factory.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return $this->cachingStrategyFactory
                ->createBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder, $this->helperFactory);
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on.
     * The values of all relevant "<t>" descendants are concatenated (trimmed unless the node asks
     * for whitespace preservation) and the result is added to the cache at the given index.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
     * @param int $sharedStringIndex Index of the processed shared strings item
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException If the "<si>" node cannot be expanded into a DOM node
     * @return void
     */
    protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();

        // PHP's XMLReader::expand() returns FALSE on failure. Fail with the exception type that
        // extractSharedStrings() already handles, instead of a fatal "member function on bool".
        if ($siNode === false) {
            throw new XMLProcessingException('Could not expand the "<si>" node at index ' . $sharedStringIndex . '.');
        }

        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if ($this->shouldExtractTextNodeValue($textNode)) {
                $textNodeValue = $textNode->nodeValue;
                $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);

                $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : trim($textNodeValue);
            }
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param \DOMElement $textNode Text node to check
     * @return bool Whether the given text node's value must be extracted
     */
    protected function shouldExtractTextNodeValue($textNode)
    {
        $parentTagName = $textNode->parentNode->localName;

        return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * extractSharedStrings() must have been called first, since it is what selects the strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     * @return string The shared string at the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing when no caching strategy has been selected yet.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy !== null) {
            $this->cachingStrategy->clearCache();
        }
    }
}
253