Passed
Pull Request — master (#21)
by Yuri
03:59
created

Formatter::replaceHtmlEntitiesWithPlaceholders()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 11
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 8
c 0
b 0
f 0
nc 1
nop 1
dl 0
loc 11
ccs 9
cts 9
cp 1
crap 1
rs 10
1
<?php namespace Tamtamchik\NameCase;
2
3
/**
 * Class Formatter.
 *
 * Re-cases personal names: capitalizes each word, then applies a series of
 * regular-expression corrections (Irish Mac/Mc prefixes, lower-case particles
 * such as "van" or "de", Roman numerals, Spanish conjunctions, post-nominal
 * letters). Behavior is controlled by a set of class-wide static options.
 */
class Formatter
{
    // Irish exceptions.
    private const EXCEPTIONS = [
        '\bMacEdo' => 'Macedo',
        '\bMacEvicius' => 'Macevicius',
        '\bMacHado' => 'Machado',
        '\bMacHar' => 'Machar',
        '\bMacHin' => 'Machin',
        '\bMacHlin' => 'Machlin',
        '\bMacIas' => 'Macias',
        '\bMacIulis' => 'Maciulis',
        '\bMacKie' => 'Mackie',
        '\bMacKle' => 'Mackle',
        '\bMacKlin' => 'Macklin',
        '\bMacKmin' => 'Mackmin',
        '\bMacQuarie' => 'Macquarie',
        '\bMacOmber' => 'Macomber',
        '\bMacIn' => 'Macin',
        '\bMacKintosh' => 'Mackintosh',
        '\bMacKen' => 'Macken',
        '\bMacHen' => 'Machen',
        '\bMacisaac' => 'MacIsaac',
        '\bMacHiel' => 'Machiel',
        '\bMacIol' => 'Maciol',
        '\bMacKell' => 'Mackell',
        '\bMacKlem' => 'Macklem',
        '\bMacKrell' => 'Mackrell',
        '\bMacLin' => 'Maclin',
        '\bMacKey' => 'Mackey',
        '\bMacKley' => 'Mackley',
        '\bMacHell' => 'Machell',
        '\bMacHon' => 'Machon',
    ];

    // General replacements.
    private const REPLACEMENTS = [
        '\bAl(?=\s+\w)' => 'al',        // al Arabic or forename Al.
        '\bAp\b' => 'ap',        // ap Welsh.
        '\b(Bin|Binti|Binte)\b' => 'bin',       // bin, binti, binte Arabic.
        '\bDell([ae])\b' => 'dell\1',    // della and delle Italian.
        '\bD([aeiou])\b' => 'd\1',       // da, de, di Italian; du French; do Brasil.
        '\bD([ao]s)\b' => 'd\1',       // das, dos Brasileiros.
        '\bDe([lrn])\b' => 'de\1',      // del Italian; der/den Dutch/Flemish.
        '\bL([eo])\b' => 'l\1',       // lo Italian; le French.
        '\bTe([rn])\b' => 'te\1',      // ten, ter Dutch/Flemish.
        '\bVan(?=\s+\w)' => 'van',       // van German or forename Van.
        '\bVon\b' => 'von',       // von Dutch/Flemish.
    ];

    // Applied only when the 'spanish' option is off (see getReplacements()).
    private const SPANISH = [
        '\bEl\b' => 'el',        // el Greek or El Spanish.
        '\bLa\b' => 'la',        // la French or La Spanish.
    ];

    // Applied only when the 'hebrew' option is on (see getReplacements()).
    private const HEBREW = [
        '\bBen(?=\s+\w)' => 'ben', // ben Hebrew or forename Ben.
        '\bBat(?=\s+\w)' => 'bat', // bat Hebrew or forename Bat.
    ];

    // Spanish conjunctions.
    private const CONJUNCTIONS = ['Y', 'E', 'I'];

    // Roman letters regexp.
    private const ROMAN_REGEX = '\b((?:[Xx]{1,3}|[Xx][Ll]|[Ll][Xx]{0,3})?(?:[Ii]{1,3}|[Ii][VvXx]|[Vv][Ii]{0,3})?)\b';

    // Post nominal values.
    private const POST_NOMINALS = [
        'ACILEx', 'ACSM', 'ADC', 'AEPC', 'AFC', 'AFM', 'AICSM', 'AKC', 'AM', 'ARBRIBA', 'ARCS', 'ARRC', 'ARSM', 'AUH',
        'AUS',
        'BA', 'BArch', 'BCh', 'BChir', 'BCL', 'BDS', 'BEd', 'BEM', 'BEng', 'BM', 'BS', 'BSc', 'BSW', 'BVM&S',
        'BVScBVetMed',
        'CB', 'CBE', 'CEng', 'CertHE', 'CGC', 'CGM', 'CH', 'CIE', 'CMarEngCMarSci', 'CMarTech', 'CMG', 'CMILT',
        'CML', 'CPhT', 'CPLCTP', 'CPM', 'CQSW', 'CSciTeach', 'CSI', 'CTL', 'CVO',
        'DBE', 'DBEnv', 'DC', 'DCB', 'DCM', 'DCMG', 'DConstMgt', 'DCVO', 'DD', 'DEM', 'DFC', 'DFM', 'DIC', 'Dip',
        'DipHE', 'DipLP', 'DipSW', 'DL', 'DLitt', 'DLP', 'DPhil', 'DProf', 'DPT', 'DREst', 'DSC', 'DSM', 'DSO',
        'DSocSci',
        'ED', 'EdD', 'EJLog', 'EMLog', 'EN', 'EngD', 'EngTech', 'ERD', 'ESLog',
        'FADO', 'FAWM', 'FBDOFCOptom', 'FCEM', 'FCILEx', 'FCILT', 'FCSP.', 'FdAFdSc', 'FdEng', 'FFHOM', 'FFPM',
        'FRCAFFPMRCA', 'FRCGP', 'FRCOG', 'FRCP', 'FRCPsych', 'FRCS', 'FRCVS', 'FSCR.',
        'GBE', 'GC', 'GCB', 'GCIE', 'GCILEx', 'GCMG', 'GCSI', 'GCVO', 'GM',
        'HNC', 'HNCert', 'HND', 'HNDip',
        'ICTTech', 'IDSM', 'IEng', 'IMarEng', 'IOMCPM', 'ISO',
        'J', 'JP', 'JrLog',
        'KBE', 'KC', 'KCB', 'KCIE', 'KCMG', 'KCSI', 'KCVO', 'KG', 'KP', 'KT',
        'LFHOM', 'LG', 'LJ', 'LLB', 'LLD', 'LLM', 'Log', 'LPE', /* 'LT', - excluded, see initial names */
        'LVO',
        'MA', 'MAcc', 'MAnth', 'MArch', 'MarEngTech', 'MB', 'MBA', 'MBChB', 'MBE', 'MBEIOM', 'MBiochem', 'MC', 'MCEM',
        'MCGI', 'MCh.', 'MChem', 'MChiro', 'MClinRes', 'MComp', 'MCOptom', 'MCSM', 'MCSP', 'MD', 'MEarthSc',
        'MEng', 'MEnt', 'MEP', 'MFHOM', 'MFin', 'MFPM', 'MGeol', 'MILT', 'MJur', 'MLA', 'MLitt', 'MM', 'MMath',
        'MMathStat', 'MMORSE', 'MMus', 'MOst', 'MP', 'MPAMEd', 'MPharm', 'MPhil', 'MPhys', 'MRCGP', 'MRCOG',
        'MRCP', 'MRCPath', 'MRCPCHFRCPCH', 'MRCPsych', 'MRCS', 'MRCVS', 'MRes',
        /* 'MS', - excluded, see initial names */
        'MSc', 'MScChiro', 'MSci',
        'MSCR', 'MSM', 'MSocSc', 'MSP', 'MSt', 'MSW', 'MSYP', 'MVO',
        'NPQH',
        'OBE', 'OBI', 'OM', 'OND',
        'PgC', 'PGCAP', 'PGCE', 'PgCert', 'PGCHE', 'PgCLTHE', 'PgD', 'PGDE', 'PgDip', 'PhD', 'PLog', 'PLS',
        'QAM', 'QC', 'QFSM', 'QGM', 'QHC', 'QHDS', 'QHNS', 'QHP', 'QHS', 'QPM', 'QS', 'QTSCSci',
        'RD', 'RFHN', 'RGN', 'RHV', 'RIAI', 'RIAS', 'RM', 'RMN', 'RN', 'RN1RNA', 'RN2', 'RN3', 'RN4', 'RN5', 'RN6', 'RN7', 'RN8', 'RN9', 'RNC', 'RNLD', 'RNMH', 'ROH', 'RRC', 'RSAW', 'RSci', 'RSciTech', 'RSCN', 'RSN', 'RVM', 'RVN',
        'SCHM', 'SCJ', 'SCLD', 'SEN', 'SGM', 'SL', 'SPANSPMH', 'SPCC', 'SPCN', 'SPDN', 'SPHP', 'SPLD', 'SrLog', 'SRN', 'SROT',
        'TD',
        'UD',
        'V100', 'V200', 'V300', 'VC', 'VD', 'VetMB', 'VN', 'VRD'
    ];

    // Most two-letter words with no vowels should be kept in all caps as initials.
    private const INITIAL_NAME_REGEX = '\b(Aj|[bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ]{2})\s';

    // Two-letter words matched by INITIAL_NAME_REGEX that must NOT be upper-cased.
    private const INITIAL_NAME_EXCEPTIONS = [
        'Mr',
        'Ms', // Replaces Member of the Senedd post nominal.
        'Dr',
        'St',
        'Jr',
        'Sr',
        'Lt', // Replaces Lady of the Order of the Thistle post nominal.
    ];

    // Lowercase words.
    private const LOWER_CASE_WORDS = ['The', 'Of', 'And'];

    // Prefix of the temporary tokens that stand in for HTML entities while casing runs.
    private const PLACEHOLDER_PREFIX = 'html_entity_placeholder_';

    // Excluded post-nominals (accumulated through excludePostNominals()).
    private static $postNominalsExcluded = [];

    // Default options.
    private static $options = [
        'lazy' => true,
        'irish' => true,
        'spanish' => false,
        'roman' => true,
        'hebrew' => true,
        'postnominal' => true,
    ];

    /**
     * Formatter constructor.
     *
     * The options are merged into the class-wide static option set, so they
     * affect every later call, not just this instance.
     *
     * @param array $options
     */
    public function __construct(array $options = [])
    {
        self::setOptions($options);
    }

    /**
     * Global options setter.
     *
     * Merges the given options over the current static options; keys that are
     * not supplied keep their present value.
     *
     * @param array $options
     */
    public static function setOptions(array $options): void
    {
        self::$options = array_merge(self::$options, $options);
    }

    /**
     * Global post-nominals exclusions setter.
     *
     * A single string is treated as a one-element list. Values are appended to
     * the static exclusion list and are skipped by fixPostNominal().
     *
     * @param array|string|null $values
     * @return boolean|void false when $values is neither a string nor an array; nothing otherwise.
     */
    public static function excludePostNominals($values)
    {
        if (is_string($values)) {
            $values = [$values];
        }

        if ( ! is_array($values)) {
            return false;
        }

        self::$postNominalsExcluded = array_merge(self::$postNominalsExcluded, $values);
    }

    /**
     * Main function for NameCase.
     *
     * @param string|null $name    Name to re-case; null is treated as an empty string.
     * @param array|null  $options Option overrides merged into the static options; null means none.
     *
     * @return string The re-cased name, or the input unchanged when it is empty,
     *                skipped by the 'lazy' mixed-case rule, or a replacement fails.
     */
    public static function nameCase(?string $name = '', ?array $options = []): string
    {
        $name = $name ?? '';

        // The signature allows null, but setOptions() requires an array.
        self::setOptions($options ?? []);

        // Remember the untouched input so that every bail-out path returns it
        // without leftover entity placeholders.
        $original = $name;

        // Temporarily replace HTML encoded entities with placeholders.
        $placeholders = self::replaceHtmlEntitiesWithPlaceholders($name);

        // Do not do anything if string is mixed and lazy option is true.
        if ( ! self::canBeProcessed($name)) {
            return $original;
        }

        // Capitalize.
        $name = self::capitalize($name);
        foreach (self::getReplacements() as $pattern => $replacement) {
            $name = mb_ereg_replace($pattern, $replacement, $name);

            // Very difficult to write a test in modern environments
            // @codeCoverageIgnoreStart
            if ( ! is_string($name)) {
                return $original;
            }
            // @codeCoverageIgnoreEnd
        }

        $name = self::correctInitialNames($name);
        $name = self::correctLowerCaseWords($name);

        $name = self::processOptions($name);

        // After name casing operations, restore HTML encoded entities.
        self::restoreHtmlEntitiesFromPlaceholders($name, $placeholders);

        return $name;
    }

    /**
     * Check if string can be processed.
     *
     * @param string $name
     *
     * @return bool false for an empty string, or for a mixed-case string while the 'lazy' option is on.
     */
    private static function canBeProcessed(string $name): bool
    {
        if ($name === '') {
            return false;
        }

        return ! (self::$options['lazy'] && self::skipMixed($name));
    }

    /**
     * Skip if string is mixed case.
     *
     * A string counts as mixed when its first character is not lower-case and
     * the whole string is neither all lower-case nor all upper-case.
     *
     * @param string $name Non-empty string.
     *
     * @return bool
     */
    private static function skipMixed(string $name): bool
    {
        // Take the first character, not the first byte, so multibyte initials compare correctly.
        $firstLetter = mb_substr($name, 0, 1);
        $firstLetterLower = $firstLetter === mb_strtolower($firstLetter);
        $allLowerOrUpper = (mb_strtolower($name) === $name || mb_strtoupper($name) === $name);

        return ! ($firstLetterLower || $allLowerOrUpper);
    }

    /**
     * Capitalize first letters.
     *
     * Lower-cases the whole string, upper-cases the first character of every
     * word, lower-cases a single letter that follows an apostrophe at the end
     * of a word, then applies the Irish corrections.
     *
     * @param string $name
     *
     * @return string
     */
    private static function capitalize(string $name): string
    {
        $name = mb_strtolower($name);

        $name = self::replaceCallbackOrKeep('\b\w', static function (array $matches): string {
            return mb_strtoupper($matches[0]);
        }, $name);

        // Lowercase 's
        $name = self::replaceCallbackOrKeep('\'\w\b', static function (array $matches): string {
            return mb_strtolower($matches[0]);
        }, $name);

        return self::updateIrish($name);
    }

    /**
     * Update for Irish names.
     *
     * Does nothing when the 'irish' option is off.
     *
     * @param string $name
     *
     * @return string
     */
    private static function updateIrish(string $name): string
    {
        if ( ! self::$options['irish']) {
            return $name;
        }

        if (
            mb_ereg_match('.*?\bMac[A-Za-z]{2,}[^aciozj]\b', $name) ||
            mb_ereg_match('.*?\bMc', $name)
        ) {
            $name = self::updateMac($name);
        }

        return self::replaceOrKeep('Macmurdo', 'MacMurdo', $name);
    }

    /**
     * Updates irish Mac & Mc.
     *
     * Upper-cases the letter that follows a leading "Mac"/"Mc", then reverts
     * the names listed in EXCEPTIONS.
     *
     * @param string $name
     *
     * @return string
     */
    private static function updateMac(string $name): string
    {
        $name = self::replaceCallbackOrKeep('\b(Ma?c)([A-Za-z]+)', static function (array $matches): string {
            return $matches[1] . mb_strtoupper(mb_substr($matches[2], 0, 1)) . mb_substr($matches[2], 1);
        }, $name);

        // Now fix "Mac" exceptions
        foreach (self::EXCEPTIONS as $pattern => $replacement) {
            $name = self::replaceOrKeep($pattern, $replacement, $name);
        }

        return $name;
    }

    /**
     * Define required replacements.
     *
     * @return array Map of regex pattern => replacement: REPLACEMENTS, plus SPANISH
     *               when the 'spanish' option is off, plus HEBREW when 'hebrew' is on.
     */
    private static function getReplacements(): array
    {
        // General fixes
        $replacements = self::REPLACEMENTS;
        if ( ! self::$options['spanish']) {
            $replacements = array_merge($replacements, self::SPANISH);
        }

        if (self::$options['hebrew']) {
            $replacements = array_merge($replacements, self::HEBREW);
        }

        return $replacements;
    }

    /**
     * Correct capitalization of initial names like JJ and TJ.
     *
     * @param string $name
     *
     * @return string
     */
    private static function correctInitialNames(string $name): string
    {
        return self::replaceCallbackOrKeep(self::INITIAL_NAME_REGEX, static function (array $matches): string {
            // Titles such as "Mr" or "Dr" match the pattern but keep their casing.
            if (in_array($matches[1], self::INITIAL_NAME_EXCEPTIONS, true)) {
                return $matches[0];
            }

            return mb_strtoupper($matches[0]);
        }, $name);
    }

    /**
     * Correct lower-case words of titles.
     *
     * @param string $name
     *
     * @return string
     */
    private static function correctLowerCaseWords(string $name): string
    {
        foreach (self::LOWER_CASE_WORDS as $lowercase) {
            $name = self::replaceOrKeep('\b' . $lowercase . '\b', mb_strtolower($lowercase), $name);
        }

        return $name;
    }

    /**
     * Process options with given name
     *
     * Applies, in order, the Roman numeral, Spanish conjunction and
     * post-nominal corrections whose option is switched on.
     *
     * @param string $name
     *
     * @return string
     */
    private static function processOptions(string $name): string
    {
        if (self::$options['roman']) {
            $name = self::updateRoman($name);
        }

        if (self::$options['spanish']) {
            $name = self::fixConjunction($name);
        }

        if (self::$options['postnominal']) {
            $name = self::fixPostNominal($name);
        }

        return $name;
    }

    /**
     * Fix roman numeral names.
     *
     * @param string $name
     *
     * @return string
     */
    private static function updateRoman(string $name): string
    {
        return self::replaceCallbackOrKeep(self::ROMAN_REGEX, static function (array $matches): string {
            return mb_strtoupper($matches[0]);
        }, $name);
    }

    /**
     * Fix Spanish conjunctions.
     *
     * @param string $name
     *
     * @return string
     */
    private static function fixConjunction(string $name): string
    {
        foreach (self::CONJUNCTIONS as $conjunction) {
            $name = self::replaceOrKeep('\b' . $conjunction . '\b', mb_strtolower($conjunction), $name);
        }

        return $name;
    }

    /**
     * Fix post-nominal letter cases.
     *
     * Every entry of POST_NOMINALS that has not been excluded is matched
     * case-insensitively as a whole word and rewritten with its canonical casing.
     *
     * @param string $name
     * @return string
     */
    private static function fixPostNominal(string $name): string
    {
        $postNominals = array_diff(self::POST_NOMINALS, self::$postNominalsExcluded);
        foreach ($postNominals as $postNominal) {
            $name = self::replaceOrKeep('\b' . $postNominal . '\b', $postNominal, $name, 'ix');
        }

        return $name;
    }

    /**
     * Replace HTML entities with placeholders.
     *
     * Each entity in $name is swapped in place for a unique lower-case token
     * followed by a space, so the casing rules do not alter the entity.
     *
     * @param string $name Modified by reference.
     * @return array Map of placeholder => original entity; empty when nothing was replaced.
     */
    private static function replaceHtmlEntitiesWithPlaceholders(string &$name): array
    {
        $placeholders = [];
        $counter = 0;

        $replaced = preg_replace_callback(
            '/&[a-zA-Z0-9#]+;/i',
            static function (array $matches) use (&$placeholders, &$counter): string {
                // Note the space at the end, to avoid merging with the next word.
                $placeholder = self::PLACEHOLDER_PREFIX . $counter++ . ' ';
                $placeholders[$placeholder] = $matches[0];

                return $placeholder;
            },
            $name
        );

        // On a PCRE failure leave $name untouched rather than overwrite it with null.
        if ( ! is_string($replaced)) {
            return [];
        }

        $name = $replaced;

        return $placeholders;
    }

    /**
     * Restore HTML entities.
     *
     * Placeholders are matched case-insensitively because the casing steps
     * may have changed their letters.
     *
     * @param string $name Modified by reference.
     * @param array $placeholders Map of placeholder => original entity.
     * @return void
     */
    private static function restoreHtmlEntitiesFromPlaceholders(string &$name, array $placeholders): void
    {
        foreach ($placeholders as $placeholder => $entity) {
            $name = str_ireplace($placeholder, $entity, $name);
        }
    }

    /**
     * Run mb_ereg_replace() and fall back to the unchanged subject on failure.
     *
     * @param string $pattern
     * @param string $replacement
     * @param string $subject
     * @param string $options Regex options; an empty string means the function's default.
     *
     * @return string
     */
    private static function replaceOrKeep(
        string $pattern,
        string $replacement,
        string $subject,
        string $options = ''
    ): string {
        $result = $options === ''
            ? mb_ereg_replace($pattern, $replacement, $subject)
            : mb_ereg_replace($pattern, $replacement, $subject, $options);

        return is_string($result) ? $result : $subject;
    }

    /**
     * Run mb_ereg_replace_callback() and fall back to the unchanged subject on failure.
     *
     * @param string $pattern
     * @param callable $callback
     * @param string $subject
     *
     * @return string
     */
    private static function replaceCallbackOrKeep(string $pattern, callable $callback, string $subject): string
    {
        $result = mb_ereg_replace_callback($pattern, $callback, $subject);

        return is_string($result) ? $result : $subject;
    }
}
479