Completed
Pull Request — develop_3.0 (#457)
by Adrien
02:34
created

XLSX::unescape()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 14
ccs 4
cts 4
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 4
nc 1
nop 1
crap 1
1
<?php
2
3
namespace Box\Spout\Common\Helper\Escaper;
4
5
/**
 * Class XLSX
 * Provides functions to escape and unescape data for XLSX files
 *
 * Control characters are written as "_xHHHH_" sequences. A literal "_xHHHH_"
 * found in the data is protected by encoding its leading underscore as "_x005F_".
 *
 * @package Box\Spout\Common\Helper\Escaper
 */
class XLSX implements EscaperInterface
{
    /** @var bool Whether the escaper has already been initialized */
    private $isAlreadyInitialized = false;

    /** @var string Regex character class (without delimiters) matching the control characters that need to be escaped */
    private $escapableControlCharactersPattern;

    /** @var string[] Map containing escaped values (key, e.g. "_x0001_") and the control characters they stand for (value) */
    private $controlCharactersEscapingMap;

    /** @var string[] Map containing control characters to be escaped (key) and their escaped value (value) */
    private $controlCharactersEscapingReverseMap;

    /**
     * Initializes the control characters pattern and maps if not already done
     *
     * @return void
     */
    protected function initIfNeeded()
    {
        if (!$this->isAlreadyInitialized) {
            // The pattern must be assigned first: getControlCharactersEscapingMap() reads it.
            $this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();
            $this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap();
            $this->controlCharactersEscapingReverseMap = array_flip($this->controlCharactersEscapingMap);

            $this->isAlreadyInitialized = true;
        }
    }

    /**
     * Escapes the given string to make it compatible with XLSX
     *
     * @param string $string The string to escape
     * @return string The escaped string
     */
    public function escape($string)
    {
        $this->initIfNeeded();

        $escapedString = $this->escapeControlCharacters($string);
        // @NOTE: Using ENT_NOQUOTES as only XML entities ('<', '>', '&') need to be encoded.
        //        Single and double quotes can be left as is.
        $escapedString = htmlspecialchars($escapedString, ENT_NOQUOTES);

        return $escapedString;
    }

    /**
     * Unescapes the given string, as read from an XLSX file, back to its original value
     *
     * @param string $string The string to unescape
     * @return string The unescaped string
     */
    public function unescape($string)
    {
        $this->initIfNeeded();

        // ==============
        // =   WARNING  =
        // ==============
        // It is assumed that the given string has already had its XML entities decoded.
        // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation).
        // Therefore there is no need to call "htmlspecialchars_decode()".
        $unescapedString = $this->unescapeControlCharacters($string);

        return $unescapedString;
    }

    /**
     * @return string Regex pattern containing all escapable control characters
     */
    protected function getEscapableControlCharactersPattern()
    {
        // control characters values are from 0 to 1F (hex values) in the ASCII table
        // some characters should not be escaped though: "\t", "\r" and "\n".
        return '[\x00-\x08' .
                // skipping "\t" (0x9) and "\n" (0xA)
                '\x0B-\x0C' .
                // skipping "\r" (0xD)
                '\x0E-\x1F]';
    }

    /**
     * Builds the map containing the escaped values (key)
     * mapped to the control characters they represent (value).
     * "\t", "\r" and "\n" don't need to be escaped.
     *
     * Relies on $this->escapableControlCharactersPattern being set beforehand
     * (see initIfNeeded()).
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @link https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @return string[]
     */
    protected function getControlCharactersEscapingMap()
    {
        $controlCharactersEscapingMap = [];

        // control characters values are from 0 to 1F (hex values) in the ASCII table
        for ($charValue = 0x00; $charValue <= 0x1F; $charValue++) {
            $character = chr($charValue);
            if (preg_match("/{$this->escapableControlCharactersPattern}/", $character)) {
                // 4-digit, zero-padded, upper-case hex value: 0x0B => "_x000B_"
                $escapedChar = sprintf('_x%04X_', $charValue);
                $controlCharactersEscapingMap[$escapedChar] = $character;
            }
        }

        return $controlCharactersEscapingMap;
    }

    /**
     * Converts PHP control characters from the given string to OpenXML escaped control characters
     *
     * Excel escapes control characters with _xHHHH_ and also escapes any
     * literal strings of that type by encoding the leading underscore.
     * So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @link https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @param string $string String to escape
     * @return string
     */
    protected function escapeControlCharacters($string)
    {
        // Literal "_xHHHH_" sequences must be protected first, so that they can be
        // told apart from the sequences generated below.
        $escapedString = $this->escapeEscapeCharacter($string);

        // if no control characters
        if (!preg_match("/{$this->escapableControlCharactersPattern}/", $escapedString)) {
            return $escapedString;
        }

        return preg_replace_callback("/({$this->escapableControlCharactersPattern})/", function($matches) {
            return $this->controlCharactersEscapingReverseMap[$matches[0]];
        }, $escapedString);
    }

    /**
     * Escapes the escape character: "_x0000_" -> "_x005F_x0000_"
     *
     * Only the leading underscore is consumed by the match (the rest is a lookahead),
     * so that adjacent sequences sharing an underscore are all escaped:
     * "_x0000_x0001_" -> "_x005F_x0000_x005F_x0001_"
     *
     * @param string $string String to escape
     * @return string The escaped string
     */
    protected function escapeEscapeCharacter($string)
    {
        return preg_replace('/_(?=x[\dA-F]{4}_)/', '_x005F_', $string);
    }

    /**
     * Converts OpenXML escaped control characters from the given string to PHP control characters
     *
     * Excel escapes control characters with _xHHHH_ and also escapes any
     * literal strings of that type by encoding the leading underscore.
     * So "_x0000_" -> "\0" and "_x005F_x0000_" -> "_x0000_"
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @link https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @param string $string String to unescape
     * @return string
     */
    protected function unescapeControlCharacters($string)
    {
        $unescapedString = $string;

        foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) {
            // only unescape characters that don't contain the escaped escape character for now
            $unescapedString = preg_replace("/(?<!_x005F)($escapedCharValue)/", $charValue, $unescapedString);
        }

        return $this->unescapeEscapeCharacter($unescapedString);
    }

    /**
     * Unescapes the escape character: "_x005F_x0000_" => "_x0000_"
     *
     * Only "_x005F_" is consumed by the match (the rest is a lookahead),
     * so that adjacent escaped sequences are all unescaped:
     * "_x005F_x0000_x005F_x0001_" => "_x0000_x0001_"
     *
     * @param string $string String to unescape
     * @return string The unescaped string
     */
    protected function unescapeEscapeCharacter($string)
    {
        return preg_replace('/_x005F_(?=x[\dA-F]{4}_)/', '_', $string);
    }
}
193