Completed
Pull Request — master (#557)
by Adrien
03:10
created

XLSX::escapeControlCharacters()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 13
ccs 6
cts 6
cp 1
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 7
nc 2
nop 1
crap 2
1
<?php
2
3
namespace Box\Spout\Common\Helper\Escaper;
4
5
/**
 * Class XLSX
 * Provides functions to escape and unescape data for XLSX files.
 *
 * Control characters that cannot be stored as-is are written as "_xHHHH_"
 * (four upper-case hexadecimal digits). A literal "_xHHHH_" occurring in the
 * data is protected by encoding its leading underscore as "_x005F_".
 */
class XLSX implements EscaperInterface
{
    /** @var bool Whether the escaper has already been initialized */
    private $isAlreadyInitialized = false;

    /** @var string Regex character class (without delimiters) matching the control characters that need to be escaped */
    private $escapableControlCharactersPattern;

    /** @var string[] Map containing the escaped values (key) and the control characters they stand for (value) */
    private $controlCharactersEscapingMap;

    /** @var string[] Map containing the control characters to be escaped (key) and their escaped value (value) */
    private $controlCharactersEscapingReverseMap;

    /**
     * Initializes the control characters pattern and maps if not already done
     *
     * @return void
     */
    protected function initIfNeeded()
    {
        if ($this->isAlreadyInitialized) {
            return;
        }

        // The pattern must be set first: getControlCharactersEscapingMap() reads it.
        $this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();
        $this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap();
        $this->controlCharactersEscapingReverseMap = array_flip($this->controlCharactersEscapingMap);

        $this->isAlreadyInitialized = true;
    }

    /**
     * Escapes the given string to make it compatible with XLSX
     *
     * @param string $string The string to escape
     * @return string The escaped string
     */
    public function escape($string)
    {
        $this->initIfNeeded();

        $escapedString = $this->escapeControlCharacters($string);

        // @NOTE: Using ENT_NOQUOTES as only XML entities ('<', '>', '&') need to be encoded.
        //        Single and double quotes can be left as is.
        //        The encoding is passed explicitly so the result does not depend on
        //        the "default_charset" ini setting.
        return htmlspecialchars($escapedString, ENT_NOQUOTES, 'UTF-8');
    }

    /**
     * Unescapes the given string, read from an XLSX file, to get its original value
     *
     * @param string $string The string to unescape
     * @return string The unescaped string
     */
    public function unescape($string)
    {
        $this->initIfNeeded();

        // ==============
        // =   WARNING  =
        // ==============
        // It is assumed that the given string has already had its XML entities decoded.
        // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation).
        // Therefore there is no need to call "htmlspecialchars_decode()".
        return $this->unescapeControlCharacters($string);
    }

    /**
     * @return string Regex pattern containing all escapable control characters
     */
    protected function getEscapableControlCharactersPattern()
    {
        // control characters values are from 0 to 1F (hex values) in the ASCII table
        // some characters should not be escaped though: "\t", "\r" and "\n".
        return '[\x00-\x08' .
                // skipping "\t" (0x9) and "\n" (0xA)
                '\x0B-\x0C' .
                // skipping "\r" (0xD)
                '\x0E-\x1F]';
    }

    /**
     * Builds the map containing the escaped values (key)
     * mapped to the control characters they stand for (value).
     * "\t", "\r" and "\n" don't need to be escaped.
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @return string[]
     */
    protected function getControlCharactersEscapingMap()
    {
        $controlCharactersEscapingMap = [];

        // control characters values are from 0 to 1F (hex values) in the ASCII table
        for ($charValue = 0x00; $charValue <= 0x1F; $charValue++) {
            $character = chr($charValue);

            if (preg_match("/{$this->escapableControlCharactersPattern}/", $character)) {
                // "_x" + 4 upper-case hex digits, zero padded + "_" (e.g. 0x0B => "_x000B_")
                $escapedChar = sprintf('_x%04X_', $charValue);
                $controlCharactersEscapingMap[$escapedChar] = $character;
            }
        }

        return $controlCharactersEscapingMap;
    }

    /**
     * Converts PHP control characters from the given string to OpenXML escaped control characters
     *
     * Excel escapes control characters with _xHHHH_ and also escapes any
     * literal strings of that type by encoding the leading underscore.
     * So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @param string $string String to escape
     * @return string
     */
    protected function escapeControlCharacters($string)
    {
        // Literal "_xHHHH_" sequences must be protected first, otherwise the
        // sequences generated below would get escaped a second time.
        $escapedString = $this->escapeEscapeCharacter($string);

        // Every key of the reverse map is a single control character, so a plain
        // character translation is enough; strings without any control character
        // are returned unchanged.
        return strtr($escapedString, $this->controlCharactersEscapingReverseMap);
    }

    /**
     * Escapes the escape character: "_x0000_" -> "_x005F_x0000_"
     *
     * Each underscore that starts a "_xHHHH_" sequence is replaced by "_x005F_".
     * A lookahead is used so that the closing underscore of one sequence can
     * also be recognized as the opening underscore of the next one:
     * "_x0000_x0001_" -> "_x005F_x0000_x005F_x0001_".
     *
     * @param string $string String to escape
     * @return string The escaped string
     */
    protected function escapeEscapeCharacter($string)
    {
        return preg_replace('/_(?=x[\dA-F]{4}_)/', '_x005F_', $string);
    }

    /**
     * Converts OpenXML escaped control characters from the given string to PHP control characters
     *
     * Excel escapes control characters with _xHHHH_ and also escapes any
     * literal strings of that type by encoding the leading underscore.
     * So "_x0000_" -> "\0" and "_x005F_x0000_" -> "_x0000_"
     *
     * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
     * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
     *
     * @param string $string String to unescape
     * @return string
     */
    protected function unescapeControlCharacters($string)
    {
        $unescapedString = $string;

        foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) {
            // only unescape characters that don't contain the escaped escape character for now
            $unescapedString = preg_replace("/(?<!_x005F)($escapedCharValue)/", $charValue, $unescapedString);
        }

        return $this->unescapeEscapeCharacter($unescapedString);
    }

    /**
     * Unescapes the escape character: "_x005F_x0000_" => "_x0000_"
     *
     * Each "_x005F_" that is directly followed by "xHHHH_" is turned back into a
     * single underscore. The lookahead leaves the following characters available
     * for the next match, so adjacent escaped sequences are all restored:
     * "_x005F_x0000_x005F_x0001_" => "_x0000_x0001_".
     *
     * @param string $string String to unescape
     * @return string The unescaped string
     */
    protected function unescapeEscapeCharacter($string)
    {
        return preg_replace('/_x005F_(?=x[\dA-F]{4}_)/', '_', $string);
    }
}
191