Strings::softEscape()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 2
dl 0
loc 3
rs 10
1
<?php
2
3
/*
4
 * This file is part of Strings
5
 *     (c) Fabrice de Stefanis / https://github.com/fab2s/Strings
6
 * This source file is licensed under the MIT license which you will
7
 * find in the LICENSE file or at https://opensource.org/licenses/MIT
8
 */
9
10
namespace fab2s\Strings;
11
12
use fab2s\Bom\Bom;
13
use fab2s\Utf8\Utf8;
14
15
/**
 * Static helpers to filter, normalize, escape, convert and compare strings,
 * with UTF-8 as the canonical encoding and LF as the canonical end of line.
 *
 * Most normalizers are thin wrappers around preg_replace() using the `u`
 * modifier. preg_replace() returns null when PCRE fails (for instance on
 * malformed UTF-8 input); this is not caught here, so the declared string
 * types turn such a failure into a TypeError.
 */
class Strings
{
    /**
     * The canonical EOL for normalization
     */
    public const EOL = "\n";

    /**
     * The canonical encoding
     */
    public const ENCODING = 'UTF-8';

    /**
     * PCRE character class content (to be used between square brackets
     * with the `u` modifier) matching zero width white spaces:
     *
     * U+200B zero width space
     * U+FEFF zero width no-break space
     */
    public const ZERO_WIDTH_WS_CLASS = '\x{200B}\x{FEFF}';

    /**
     * PCRE character class content (to be used between square brackets
     * with the `u` modifier) matching non standard white spaces:
     *
     * U+00A0  no-break space
     * U+2000  en quad
     * U+2001  em quad
     * U+2002  en space
     * U+2003  em space
     * U+2004  three-per-em space
     * U+2005  four-per-em space
     * U+2006  six-per-em space
     * U+2007  figure space
     * U+2008  punctuation space
     * U+2009  thin space
     * U+200A  hair space
     * U+202F  narrow no-break space
     * U+3000  ideographic space
     */
    public const NON_STANDARD_WS_CLASS = '\x{00A0}\x{2000}-\x{200A}\x{202F}\x{3000}';

    /**
     * Strip null bytes and zero width white spaces, normalize EOL to LF
     * and pass the result through Utf8::normalize()
     *
     * @param string $string
     *
     * @return string
     */
    public static function filter(string $string): string
    {
        // U+0000 and zero width ws are dropped in a single pass
        $stripped = preg_replace('`[\x{00}' . static::ZERO_WIDTH_WS_CLASS . ']+`u', '', $string);

        // then EOLs are normalized before the final utf8 normalization
        return Utf8::normalize(static::normalizeEol($stripped));
    }

    /**
     * Collapse runs of horizontal white spaces
     *
     * Without $normalize, only runs of the very same horizontal white space
     * character are collapsed to one occurrence of that character. With
     * $normalize, the string is passed to normalizeWs() instead.
     *
     * @param string $string
     * @param bool   $normalize   true to delegate to normalizeWs()
     * @param bool   $includeTabs forwarded to normalizeWs() when $normalize is true
     *
     * @return string
     */
    public static function singleWsIze(string $string, bool $normalize = false, bool $includeTabs = true): string
    {
        if (!$normalize) {
            // the back reference only matches repetitions of the captured char
            return preg_replace('`(\h)(?:\1+)`u', '$1', $string);
        }

        // non standard horizontal ws to a regular ws (eg ' ')
        return static::normalizeWs($string, $includeTabs);
    }

    /**
     * Replace every run of line breaks, together with the white spaces
     * preceding it, with a single space
     *
     * @param string $string
     *
     * @return string
     */
    public static function singleLineIze(string $string): string
    {
        return preg_replace('`\s*\R+`u', ' ', $string);
    }

    /**
     * Remove zero width white spaces (see ZERO_WIDTH_WS_CLASS)
     *
     * @param string $string
     *
     * @return string
     */
    public static function dropZwWs(string $string): string
    {
        return preg_replace('`[' . static::ZERO_WIDTH_WS_CLASS . ']+`u', '', $string);
    }

    /**
     * Replace non standard white spaces (see NON_STANDARD_WS_CLASS), and
     * optionally tabs, with regular spaces
     *
     * When $maxConsecutive is set, runs of spaces in the normalized string
     * that are at least $maxConsecutive long are additionally reduced to
     * exactly $maxConsecutive spaces.
     *
     * @param string   $string
     * @param bool     $includeTabs    true to also replace tabs (\t) with ws ( )
     * @param int|null $maxConsecutive null to leave consecutive white spaces alone
     *
     * @return string
     */
    public static function normalizeWs(string $string, bool $includeTabs = true, ?int $maxConsecutive = null): string
    {
        $wsClass   = ($includeTabs ? "\t" : '') . static::NON_STANDARD_WS_CLASS;
        $normalize = '`[' . $wsClass . ']`u';

        if ($maxConsecutive === null) {
            return preg_replace($normalize, ' ', $string);
        }

        // Normalize every matching char first, and only then cap the runs:
        // capping directly on the mixed class would leave tabs and non
        // standard ws untouched in runs shorter than $maxConsecutive.
        return preg_replace([
            $normalize,
            '` {' . $maxConsecutive . ',}`u',
        ], [
            ' ',
            str_repeat(' ', $maxConsecutive),
        ], $string);
    }

    /**
     * Normalize every line break (as matched by \R) to $eol, dropping
     * the white spaces that precede it
     *
     * - $maxConsecutive null: each line break is replaced individually
     * - $maxConsecutive 1: each run of line breaks becomes a single $eol
     * - otherwise: runs of at least $maxConsecutive line breaks are reduced
     *   to exactly $maxConsecutive times $eol
     *
     * @param string   $string
     * @param int|null $maxConsecutive
     * @param string   $eol
     *
     * @return string
     */
    public static function normalizeEol(string $string, ?int $maxConsecutive = null, string $eol = self::EOL): string
    {
        if ($maxConsecutive === null) {
            return preg_replace('`\s*?\R`u', $eol, $string);
        }

        if ($maxConsecutive === 1) {
            return preg_replace('`\s*\R`u', $eol, $string);
        }

        $normalized = preg_replace([
            // start with normalizing with LF (faster than CRLF)
            '`\s*?\R`u',
            // then remove high dupes, still working on LF only
            "`\n{" . $maxConsecutive . ',}`u',
        ], [
            "\n",
            str_repeat("\n", $maxConsecutive),
        ], $string);

        // Restore the requested EOL on every line break, not only on the
        // capped runs, otherwise shorter runs would be left as bare LF.
        // A null (PCRE failure) is passed through untouched.
        if ($normalized !== null && $eol !== "\n") {
            $normalized = str_replace("\n", $eol, $normalized);
        }

        return $normalized;
    }

    /**
     * Normalizes a text document: filter() then trim()
     *
     * @param string $text
     *
     * @return string
     */
    public static function normalizeText(string $text): string
    {
        return trim(static::filter($text));
    }

    /**
     * Normalizes a title: normalized text on a single line, with every run
     * of white spaces reduced to one space, passed through Utf8::ucfirst()
     *
     * @param string $title
     *
     * @return string
     */
    public static function normalizeTitle(string $title): string
    {
        $singleLine = static::singleLineIze(static::normalizeText($title));

        return Utf8::ucfirst(static::normalizeWs($singleLine, true, 1));
    }

    /**
     * Normalizes a name: normalizeTitle() passed through Utf8::ucwords()
     *
     * @param string $name
     *
     * @return string
     */
    public static function normalizeName(string $name): string
    {
        return Utf8::ucwords(static::normalizeTitle($name));
    }

    /**
     * wrapper for htmlspecialchars with utf-8 and ENT_COMPAT set as default
     *
     * @param string $string
     * @param int    $flag
     * @param bool   $hardEscape true to also encode existing html entities
     *                           (htmlspecialchars double_encode argument)
     *
     * @return string
     */
    public static function escape(string $string, int $flag = ENT_COMPAT, bool $hardEscape = true)
    {
        return htmlspecialchars($string, $flag, static::ENCODING, $hardEscape);
    }

    /**
     * wrapper for htmlspecialchars with utf-8 and ENT_COMPAT set as default
     * which prevents double encoding of existing html entities
     *
     * @param string $string
     * @param int    $flag
     *
     * @return string
     */
    public static function softEscape(string $string, int $flag = ENT_COMPAT): string
    {
        return static::escape($string, $flag, false);
    }

    /**
     * wrapper for htmlspecialchars_decode with ENT_COMPAT set as default
     *
     * @param string $string
     * @param int    $quoteStyle
     *
     * @return string
     */
    public static function unEscape(string $string, int $quoteStyle = ENT_COMPAT): string
    {
        return htmlspecialchars_decode($string, $quoteStyle);
    }

    /**
     * Convert $string from one encoding to another using mb_convert_encoding()
     *
     * @param string      $string
     * @param string|null $from   source encoding, detectEncoding() is used when empty
     * @param string      $to     target encoding
     *
     * @return string
     */
    public static function convert(string $string, ?string $from = null, string $to = self::ENCODING): string
    {
        $source = $from ?: static::detectEncoding($string);

        return mb_convert_encoding($string, $to, $source);
    }

    /**
     * Try to detect the encoding of $string
     *
     * Checks are made in this order: Utf8::isUtf8(), then a BOM as found by
     * Bom::extract(), then mb_detect_encoding() in strict mode against
     * ISO-8859-1 and Windows-1252.
     *
     * @param string $string
     *
     * @return string|null null when no encoding could be detected
     */
    public static function detectEncoding(string $string): ?string
    {
        if (Utf8::isUtf8($string)) {
            return static::ENCODING;
        }

        $bom = Bom::extract($string);
        if ($bom) {
            return Bom::getBomEncoding($bom);
        }

        return mb_detect_encoding($string, 'ISO-8859-1,Windows-1252', true) ?: null;
    }

    /**
     * Truly constant time string comparison for Timing Attack protection
     *
     * Many implementations will stop after length comparison which can
     * leak length (not much I agree, but what topic is this?), or try to
     * be smart at failing to compare portion of the $reference which again
     * could leak $reference length
     *
     * This method just goes through exactly the same number of operations
     * in every case: the loop always runs over strlen($reference) bytes
     *
     * @param string $userInput
     * @param string $reference
     *
     * @return bool true when both strings are identical
     */
    public static function secureCompare(string $userInput, string $reference): bool
    {
        if (strlen($userInput) === strlen($reference)) {
            // every differing byte leaves at least one bit set
            $comparison = $userInput ^ $reference;
            $result     = 0;
        } else {
            // preserve full comparison loop (all zero bytes)
            $comparison = $reference ^ $reference;
            // and make sure false is returned
            $result = 1;
        }

        $len = strlen($comparison);
        for ($i = $len - 1; $i >= 0; --$i) {
            $result |= ord($comparison[$i]);
        }

        return !$result;
    }

    /**
     * Generate a pretty reliable hash to identify strings
     * Adding the length reduces collisions by quite a lot
     *
     * @param string $content
     *
     * @return string the byte length and the sha256 hex digest joined by '_'
     */
    public static function contentHash(string $content): string
    {
        return strlen($content) . '_' . hash('sha256', $content);
    }
}
315