XLSX::escapeControlCharacters()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 13
ccs 6
cts 6
cp 1
rs 9.8333
c 0
b 0
f 0
cc 2
nc 2
nop 1
crap 2
1
<?php
2
3
namespace Box\Spout\Common\Helper\Escaper;
4
5
/**
 * Class XLSX
 * Provides functions to escape and unescape data for XLSX files.
 *
 * Escapable control characters are written as "_xHHHH_" (HHHH being the
 * upper-case, zero-padded hex code of the character). A literal "_xHHHH_"
 * found in the data is protected by encoding its leading underscore
 * as "_x005F_".
 */
class XLSX implements EscaperInterface
{
    /** @var bool Whether the escaper has already been initialized */
    private $isAlreadyInitialized = false;

    /** @var string Regex character class (without delimiters) matching the control characters that need to be escaped */
    private $escapableControlCharactersPattern;

    /** @var string[] Map containing escaped values (key, e.g. "_x0001_") and the control character they stand for (value) */
    private $controlCharactersEscapingMap;

    /** @var string[] Map containing control characters to be escaped (key) and their escaped value (value) */
    private $controlCharactersEscapingReverseMap;

    /**
     * Builds the pattern and both escaping maps the first time it is called;
     * subsequent calls are no-ops.
     */
    protected function initIfNeeded()
    {
        if ($this->isAlreadyInitialized) {
            return;
        }

        $this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();
        $this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap();
        $this->controlCharactersEscapingReverseMap = \array_flip($this->controlCharactersEscapingMap);

        $this->isAlreadyInitialized = true;
    }

    /**
     * Escapes the given string to make it compatible with XLSX
     *
     * Control characters are escaped first, then XML special characters.
     *
     * @param string $string The string to escape
     * @return string The escaped string
     */
    public function escape($string)
    {
        $this->initIfNeeded();

        $escapedString = $this->escapeControlCharacters($string);

        // @NOTE: Using ENT_QUOTES as XML entities ('<', '>', '&') as well as
        //        single/double quotes (for XML attributes) need to be encoded.
        return \htmlspecialchars($escapedString, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Unescapes the given string to make it compatible with XLSX
     *
     * @param string $string The string to unescape
     * @return string The unescaped string
     */
    public function unescape($string)
    {
        $this->initIfNeeded();

        // ==============
        // =   WARNING  =
        // ==============
        // It is assumed that the given string has already had its XML entities decoded.
        // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation).
        // Therefore there is no need to call "htmlspecialchars_decode()".
        return $this->unescapeControlCharacters($string);
    }

    /**
     * @return string Regex character class (no delimiters) containing all escapable control characters
     */
    protected function getEscapableControlCharactersPattern()
    {
        // control characters values are from 0 to 1F (hex values) in the ASCII table
        // some characters should not be escaped though: "\t", "\r" and "\n".
        return '[\x00-\x08' .
                // skipping "\t" (0x9) and "\n" (0xA)
                '\x0B-\x0C' .
                // skipping "\r" (0xD)
                '\x0E-\x1F]';
    }

    /**
     * Builds the map containing the escaped values (key, "_xHHHH_")
     * mapped to the control characters they represent (value).
     * "\t", "\r" and "\n" don't need to be escaped.
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @return string[]
     */
    protected function getControlCharactersEscapingMap()
    {
        // Read the pattern from its getter rather than from the property, so that
        // this method does not depend on initIfNeeded() having set the property first.
        $escapablePattern = '/' . $this->getEscapableControlCharactersPattern() . '/';
        $controlCharactersEscapingMap = [];

        // control characters values are from 0 to 1F (hex values) in the ASCII table
        for ($charValue = 0x00; $charValue <= 0x1F; $charValue++) {
            $character = \chr($charValue);

            if (\preg_match($escapablePattern, $character)) {
                // "%04X" => upper-case hex, zero-padded to 4 digits (e.g. 0x1F => "_x001F_")
                $controlCharactersEscapingMap[\sprintf('_x%04X_', $charValue)] = $character;
            }
        }

        return $controlCharactersEscapingMap;
    }

    /**
     * Converts PHP control characters from the given string to OpenXML escaped control characters
     *
     * Excel escapes control characters with _xHHHH_ and also escapes any
     * literal strings of that type by encoding the leading underscore.
     * So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @param string $string String to escape
     * @return string
     */
    protected function escapeControlCharacters($string)
    {
        // Literal "_xHHHH_" sequences must be protected BEFORE control characters
        // are turned into "_xHHHH_", otherwise the two could not be told apart.
        $escapedString = $this->escapeEscapeCharacter($string);

        // Each control character maps to exactly one fixed replacement, so a plain
        // translation table is enough (and leaves the string untouched when there is nothing to escape).
        return \strtr($escapedString, $this->controlCharactersEscapingReverseMap);
    }

    /**
     * Escapes the escape character: "_x0000_" -> "_x005F_x0000_"
     *
     * Every underscore that opens an "xHHHH_" sequence is escaped, including when
     * two sequences share an underscore: "_x0000_x0001_" -> "_x005F_x0000_x005F_x0001_".
     *
     * @param string $string String to escape
     * @return string The escaped string
     */
    protected function escapeEscapeCharacter($string)
    {
        // The lookahead does not consume the closing underscore, so that underscore
        // can itself be recognized as the opening one of the next sequence.
        return \preg_replace('/_(?=x[\dA-F]{4}_)/', '_x005F_', $string);
    }

    /**
     * Converts OpenXML escaped control characters from the given string to PHP control characters
     *
     * Excel escapes control characters with _xHHHH_ and also escapes any
     * literal strings of that type by encoding the leading underscore.
     * So "_x0000_" -> "\0" and "_x005F_x0000_" -> "_x0000_"
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @param string $string String to unescape
     * @return string
     */
    protected function unescapeControlCharacters($string)
    {
        $unescapedString = $string;

        foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) {
            // only unescape characters that don't contain the escaped escape character for now
            $unescapedString = \preg_replace("/(?<!_x005F)($escapedCharValue)/", $charValue, $unescapedString);
        }

        return $this->unescapeEscapeCharacter($unescapedString);
    }

    /**
     * Unescapes the escape character: "_x005F_x0000_" => "_x0000_"
     *
     * Chained sequences are handled as well:
     * "_x005F_x0000_x005F_x0001_" => "_x0000_x0001_".
     *
     * @param string $string String to unescape
     * @return string The unescaped string
     */
    protected function unescapeEscapeCharacter($string)
    {
        // Consume the whole "_x005F_" unit (it stands for one underscore) and only look
        // ahead at the protected "xHHHH_", so the closing underscore of that sequence
        // stays available to start the next "_x005F_" unit.
        return \preg_replace('/_x005F_(?=x[\dA-F]{4}_)/', '_', $string);
    }
}
191