Passed
Pull Request — master (#21)
by Yuri
04:06
created

Formatter::replaceHtmlEntitiesWithPlaceholders()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 11
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 8
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 11
ccs 9
cts 9
cp 1
crap 1
rs 10
1
<?php namespace Tamtamchik\NameCase;
2
3
/**
 * Class Formatter.
 *
 * Re-cases personal names ("KEITH MACDONALD" => "Keith MacDonald") using a
 * fixed pipeline: capitalise every word, apply Mac/Mc handling, lowercase
 * name particles, then apply optional roman-numeral, Spanish-conjunction and
 * post-nominal fixes.
 *
 * Options and post-nominal exclusions are stored in static properties, so
 * they are shared by every call made through this class.
 */
class Formatter
{
    // Irish exceptions: "Mac" names that must NOT get an inner capital
    // (and "MacIsaac", which must). Keys are mb_ereg patterns.
    private const EXCEPTIONS = [
        '\bMacEdo' => 'Macedo',
        '\bMacEvicius' => 'Macevicius',
        '\bMacHado' => 'Machado',
        '\bMacHar' => 'Machar',
        '\bMacHin' => 'Machin',
        '\bMacHlin' => 'Machlin',
        '\bMacIas' => 'Macias',
        '\bMacIulis' => 'Maciulis',
        '\bMacKie' => 'Mackie',
        '\bMacKle' => 'Mackle',
        '\bMacKlin' => 'Macklin',
        '\bMacKmin' => 'Mackmin',
        '\bMacQuarie' => 'Macquarie',
        '\bMacOmber' => 'Macomber',
        '\bMacIn' => 'Macin',
        '\bMacKintosh' => 'Mackintosh',
        '\bMacKen' => 'Macken',
        '\bMacHen' => 'Machen',
        '\bMacisaac' => 'MacIsaac',
        '\bMacHiel' => 'Machiel',
        '\bMacIol' => 'Maciol',
        '\bMacKell' => 'Mackell',
        '\bMacKlem' => 'Macklem',
        '\bMacKrell' => 'Mackrell',
        '\bMacLin' => 'Maclin',
        '\bMacKey' => 'Mackey',
        '\bMacKley' => 'Mackley',
        '\bMacHell' => 'Machell',
        '\bMacHon' => 'Machon',
    ];

    // General replacements (mb_ereg pattern => replacement), always applied.
    private const REPLACEMENTS = [
        '\bAl(?=\s+\w)' => 'al',        // al Arabic or forename Al.
        '\bAp\b' => 'ap',        // ap Welsh.
        '\b(Bin|Binti|Binte)\b' => 'bin',       // bin, binti, binte Arabic.
        '\bDell([ae])\b' => 'dell\1',    // della and delle Italian.
        '\bD([aeiou])\b' => 'd\1',       // da, de, di Italian; du French; do Brasil.
        '\bD([ao]s)\b' => 'd\1',       // das, dos Brasileiros.
        '\bDe([lrn])\b' => 'de\1',      // del Italian; der/den Dutch/Flemish.
        '\bL([eo])\b' => 'l\1',       // lo Italian; le French.
        '\bTe([rn])\b' => 'te\1',      // ten, ter Dutch/Flemish.
        '\bVan(?=\s+\w)' => 'van',       // van German or forename Van.
        '\bVon\b' => 'von',       // von Dutch/Flemish.
    ];

    // Applied only when the 'spanish' option is OFF (see getReplacements()).
    private const SPANISH = [
        '\bEl\b' => 'el',        // el Greek or El Spanish.
        '\bLa\b' => 'la',        // la French or La Spanish.
    ];

    // Applied only when the 'hebrew' option is ON.
    private const HEBREW = [
        '\bBen(?=\s+\w)' => 'ben', // ben Hebrew or forename Ben.
        '\bBat(?=\s+\w)' => 'bat', // bat Hebrew or forename Bat.
    ];

    // Spanish conjunctions.
    private const CONJUNCTIONS = ['Y', 'E', 'I'];

    // Roman letters regexp.
    private const ROMAN_REGEX = '\b((?:[Xx]{1,3}|[Xx][Ll]|[Ll][Xx]{0,3})?(?:[Ii]{1,3}|[Ii][VvXx]|[Vv][Ii]{0,3})?)\b';

    // Post nominal values.
    // NOTE(review): entries are concatenated into a regex unescaped in
    // fixPostNominal(), so the '.' in 'FCSP.', 'FSCR.' and 'MCh.' matches any
    // character. Left as-is because escaping would change what matches.
    private const POST_NOMINALS = [
        'ACILEx', 'ACSM', 'ADC', 'AEPC', 'AFC', 'AFM', 'AICSM', 'AKC', 'AM', 'ARBRIBA', 'ARCS', 'ARRC', 'ARSM', 'AUH',
        'AUS',
        'BA', 'BArch', 'BCh', 'BChir', 'BCL', 'BDS', 'BEd', 'BEM', 'BEng', 'BM', 'BS', 'BSc', 'BSW', 'BVM&S',
        'BVScBVetMed',
        'CB', 'CBE', 'CEng', 'CertHE', 'CGC', 'CGM', 'CH', 'CIE', 'CMarEngCMarSci', 'CMarTech', 'CMG', 'CMILT',
        'CML', 'CPhT', 'CPLCTP', 'CPM', 'CQSW', 'CSciTeach', 'CSI', 'CTL', 'CVO',
        'DBE', 'DBEnv', 'DC', 'DCB', 'DCM', 'DCMG', 'DConstMgt', 'DCVO', 'DD', 'DEM', 'DFC', 'DFM', 'DIC', 'Dip',
        'DipHE', 'DipLP', 'DipSW', 'DL', 'DLitt', 'DLP', 'DPhil', 'DProf', 'DPT', 'DREst', 'DSC', 'DSM', 'DSO',
        'DSocSci',
        'ED', 'EdD', 'EJLog', 'EMLog', 'EN', 'EngD', 'EngTech', 'ERD', 'ESLog',
        'FADO', 'FAWM', 'FBDOFCOptom', 'FCEM', 'FCILEx', 'FCILT', 'FCSP.', 'FdAFdSc', 'FdEng', 'FFHOM', 'FFPM',
        'FRCAFFPMRCA', 'FRCGP', 'FRCOG', 'FRCP', 'FRCPsych', 'FRCS', 'FRCVS', 'FSCR.',
        'GBE', 'GC', 'GCB', 'GCIE', 'GCILEx', 'GCMG', 'GCSI', 'GCVO', 'GM',
        'HNC', 'HNCert', 'HND', 'HNDip',
        'ICTTech', 'IDSM', 'IEng', 'IMarEng', 'IOMCPM', 'ISO',
        'J', 'JP', 'JrLog',
        'KBE', 'KC', 'KCB', 'KCIE', 'KCMG', 'KCSI', 'KCVO', 'KG', 'KP', 'KT',
        'LFHOM', 'LG', 'LJ', 'LLB', 'LLD', 'LLM', 'Log', 'LPE', /* 'LT', - excluded, see initial names */
        'LVO',
        'MA', 'MAcc', 'MAnth', 'MArch', 'MarEngTech', 'MB', 'MBA', 'MBChB', 'MBE', 'MBEIOM', 'MBiochem', 'MC', 'MCEM',
        'MCGI', 'MCh.', 'MChem', 'MChiro', 'MClinRes', 'MComp', 'MCOptom', 'MCSM', 'MCSP', 'MD', 'MEarthSc',
        'MEng', 'MEnt', 'MEP', 'MFHOM', 'MFin', 'MFPM', 'MGeol', 'MILT', 'MJur', 'MLA', 'MLitt', 'MM', 'MMath',
        'MMathStat', 'MMORSE', 'MMus', 'MOst', 'MP', 'MPAMEd', 'MPharm', 'MPhil', 'MPhys', 'MRCGP', 'MRCOG',
        'MRCP', 'MRCPath', 'MRCPCHFRCPCH', 'MRCPsych', 'MRCS', 'MRCVS', 'MRes',
        /* 'MS', - excluded, see initial names */
        'MSc', 'MScChiro', 'MSci',
        'MSCR', 'MSM', 'MSocSc', 'MSP', 'MSt', 'MSW', 'MSYP', 'MVO',
        'NPQH',
        'OBE', 'OBI', 'OM', 'OND',
        'PgC', 'PGCAP', 'PGCE', 'PgCert', 'PGCHE', 'PgCLTHE', 'PgD', 'PGDE', 'PgDip', 'PhD', 'PLog', 'PLS',
        'QAM', 'QC', 'QFSM', 'QGM', 'QHC', 'QHDS', 'QHNS', 'QHP', 'QHS', 'QPM', 'QS', 'QTSCSci',
        'RD', 'RFHN', 'RGN', 'RHV', 'RIAI', 'RIAS', 'RM', 'RMN', 'RN', 'RN1RNA', 'RN2', 'RN3', 'RN4', 'RN5', 'RN6', 'RN7', 'RN8', 'RN9', 'RNC', 'RNLD', 'RNMH', 'ROH', 'RRC', 'RSAW', 'RSci', 'RSciTech', 'RSCN', 'RSN', 'RVM', 'RVN',
        'SCHM', 'SCJ', 'SCLD', 'SEN', 'SGM', 'SL', 'SPANSPMH', 'SPCC', 'SPCN', 'SPDN', 'SPHP', 'SPLD', 'SrLog', 'SRN', 'SROT',
        'TD',
        'UD',
        'V100', 'V200', 'V300', 'VC', 'VD', 'VetMB', 'VN', 'VRD'
    ];

    // Most two-letter words with no vowels should be kept in all caps as
    // initials: "Aj" or any two consonants, followed by whitespace.
    private const INITIAL_NAME_REGEX = '\b(Aj|[bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ]{2})\s';

    // Two-consonant words that are NOT initials and keep their normal casing.
    private const INITIAL_NAME_EXCEPTIONS = [
        'Mr',
        'Ms', // Replaces Member of the Senedd post nominal.
        'Dr',
        'St',
        'Jr',
        'Sr',
        'Lt', // Replaces Lady of the Order of the Thistle post nominal.
    ];

    // Lowercase words.
    private const LOWER_CASE_WORDS = ['The', 'Of', 'And'];

    /**
     * Post-nominals excluded via excludePostNominals(); shared by all calls.
     *
     * @var array
     */
    private static $postNominalsExcluded = [];

    /**
     * Default options; overwritten cumulatively by setOptions().
     *
     * @var array
     */
    private static $options = [
        'lazy' => true,
        'irish' => true,
        'spanish' => false,
        'roman' => true,
        'hebrew' => true,
        'postnominal' => true,
    ];

    /**
     * Formatter constructor.
     *
     * Merges the given options into the shared static option set.
     *
     * @param array $options
     */
    public function __construct(array $options = [])
    {
        self::setOptions($options);
    }

    /**
     * Global options setter.
     *
     * Merges $options over the current static options; keys not supplied keep
     * their current value.
     *
     * @param array $options
     */
    public static function setOptions(array $options): void
    {
        self::$options = array_merge(self::$options, $options);
    }

    /**
     * Global post-nominals exclusions setter.
     *
     * A single string is treated as a one-element list. The values are
     * appended to the exclusions already registered.
     *
     * @param array|string|null $values
     * @return boolean|void false when $values is neither a string nor an
     *                      array; nothing otherwise.
     */
    public static function excludePostNominals($values)
    {
        if (is_string($values)) {
            $values = [$values];
        }

        if ( ! is_array($values)) {
            return false;
        }

        self::$postNominalsExcluded = array_merge(self::$postNominalsExcluded, $values);
    }

    /**
     * Main function for NameCase.
     *
     * @param string|null $name    Name to re-case; null is treated as ''.
     * @param array|null  $options Option overrides merged into the shared
     *                             static options; null means "no overrides".
     *
     * @return string The re-cased name, or the input unchanged when it is
     *                empty, when the 'lazy' option skips it as mixed case, or
     *                when a replacement pattern fails.
     */
    public static function nameCase(?string $name = '', ?array $options = []): string
    {
        $name = $name ?? '';

        // setOptions() only accepts an array, so a null must not be forwarded.
        self::setOptions($options ?? []);

        // Do not do anything if string is mixed and lazy option is true.
        // This is decided on the caller's text, before placeholders are
        // injected: the placeholders are lowercase and would make an all-caps
        // name look mixed, and returning after substitution would leak them.
        if ( ! self::canBeProcessed($name)) {
            return $name;
        }

        $original = $name;

        // Temporarily replace HTML encoded entities with placeholders
        $placeholders = self::replaceHtmlEntitiesWithPlaceholders($name);

        // Capitalize
        self::capitalize($name);

        foreach (self::getReplacements() as $pattern => $replacement) {
            $name = mb_ereg_replace($pattern, $replacement, $name);

            // Very difficult to write a test in modern environments
            // @codeCoverageIgnoreStart
            if ( ! is_string($name)) {
                return $original;
            }
            // @codeCoverageIgnoreEnd
        }

        self::correctInitialNames($name);
        self::correctLowerCaseWords($name);

        self::processOptions($name);

        // After name casing operations, restore HTML encoded entities
        self::restoreHtmlEntitiesFromPlaceholders($name, $placeholders);

        return $name;
    }

    /**
     * Check if string can be processed.
     *
     * @param string $name
     *
     * @return bool false for an empty string, or for a mixed-case string
     *              while the 'lazy' option is on.
     */
    private static function canBeProcessed(string $name): bool
    {
        if ($name === '') {
            return false;
        }

        return ! (self::$options['lazy'] && self::skipMixed($name));
    }

    /**
     * Skip if string is mixed case.
     *
     * A string counts as mixed when its first character is not lowercase and
     * it is neither entirely lowercase nor entirely uppercase.
     *
     * @param string $name Non-empty string.
     *
     * @return bool
     */
    private static function skipMixed(string $name): bool
    {
        // Take the first CHARACTER, not the first byte: $name[0] would split
        // a multibyte letter such as "é" and never compare as lowercase.
        $firstLetter = mb_substr($name, 0, 1);
        $firstLetterLower = $firstLetter === mb_strtolower($firstLetter);
        $allLowerOrUpper = (mb_strtolower($name) === $name || mb_strtoupper($name) === $name);

        return ! ($firstLetterLower || $allLowerOrUpper);
    }

    /**
     * Capitalize first letters.
     *
     * Lowercases everything, uppercases the first character of every word,
     * re-lowercases a single letter following an apostrophe ('s), then applies
     * the Irish Mac/Mc rules.
     *
     * @param string $name
     */
    private static function capitalize(string &$name): void
    {
        $name = mb_strtolower($name);

        $name = self::safeReplaceCallback('\b\w', function ($matches) {
            return mb_strtoupper($matches[0]);
        }, $name);

        // Lowercase 's
        $name = self::safeReplaceCallback('\'\w\b', function ($matches) {
            return mb_strtolower($matches[0]);
        }, $name);

        self::updateIrish($name);
    }

    /**
     * Update for Irish names.
     *
     * Does nothing when the 'irish' option is off.
     *
     * @param string $name
     */
    private static function updateIrish(string &$name): void
    {
        if ( ! self::$options['irish']) {
            return;
        }

        // mb_ereg_match() anchors at the start of the string, hence the
        // leading ".*?" to find the prefix anywhere in the name.
        if (
            mb_ereg_match('.*?\bMac[A-Za-z]{2,}[^aciozj]\b', $name) ||
            mb_ereg_match('.*?\bMc', $name)
        ) {
            self::updateMac($name);
        }

        $name = self::safeReplace('Macmurdo', 'MacMurdo', $name);
    }

    /**
     * Updates irish Mac & Mc.
     *
     * Uppercases the letter following "Mac"/"Mc", then undoes that for the
     * names listed in EXCEPTIONS.
     *
     * @param string $name
     */
    private static function updateMac(string &$name): void
    {
        $name = self::safeReplaceCallback('\b(Ma?c)([A-Za-z]+)', function ($matches) {
            return $matches[1] . mb_strtoupper(mb_substr($matches[2], 0, 1)) . mb_substr($matches[2], 1);
        }, $name);

        // Now fix "Mac" exceptions
        foreach (self::EXCEPTIONS as $pattern => $replacement) {
            $name = self::safeReplace($pattern, $replacement, $name);
        }
    }

    /**
     * Define required replacements.
     *
     * @return array mb_ereg pattern => replacement, built from REPLACEMENTS
     *               plus SPANISH (when 'spanish' is off) and HEBREW (when
     *               'hebrew' is on).
     */
    private static function getReplacements(): array
    {
        // General fixes
        $replacements = self::REPLACEMENTS;

        if ( ! self::$options['spanish']) {
            $replacements = array_merge($replacements, self::SPANISH);
        }

        if (self::$options['hebrew']) {
            $replacements = array_merge($replacements, self::HEBREW);
        }

        return $replacements;
    }

    /**
     * Correct capitalization of initial names like JJ and TJ.
     *
     * @param string $name
     */
    private static function correctInitialNames(string &$name): void
    {
        $name = self::safeReplaceCallback(self::INITIAL_NAME_REGEX, function ($matches) {
            // $matches[0] includes the trailing whitespace; $matches[1] is
            // the two-letter word itself.
            if (in_array($matches[1], self::INITIAL_NAME_EXCEPTIONS, true)) {
                return $matches[0];
            }

            return mb_strtoupper($matches[0]);
        }, $name);
    }

    /**
     * Correct lower-case words of titles.
     *
     * @param string $name
     */
    private static function correctLowerCaseWords(string &$name): void
    {
        foreach (self::LOWER_CASE_WORDS as $lowercase) {
            $name = self::safeReplace('\b' . $lowercase . '\b', mb_strtolower($lowercase), $name);
        }
    }

    /**
     * Process options with given name
     *
     * Runs the roman-numeral, Spanish-conjunction and post-nominal fixes, in
     * that order, for each option that is enabled.
     *
     * @param string $name
     */
    private static function processOptions(string &$name): void
    {
        if (self::$options['roman']) {
            self::updateRoman($name);
        }

        if (self::$options['spanish']) {
            self::fixConjunction($name);
        }

        if (self::$options['postnominal']) {
            self::fixPostNominal($name);
        }
    }

    /**
     * Fix roman numeral names.
     *
     * @param string $name
     */
    private static function updateRoman(string &$name): void
    {
        $name = self::safeReplaceCallback(self::ROMAN_REGEX, function ($matches) {
            return mb_strtoupper($matches[0]);
        }, $name);
    }

    /**
     * Fix Spanish conjunctions.
     *
     * @param string $name
     */
    private static function fixConjunction(string &$name): void
    {
        foreach (self::CONJUNCTIONS as $conjunction) {
            $name = self::safeReplace('\b' . $conjunction . '\b', mb_strtolower($conjunction), $name);
        }
    }

    /**
     * Fix post-nominal letter cases.
     *
     * Every post-nominal not excluded through excludePostNominals() is
     * matched case-insensitively and rewritten with its canonical casing.
     *
     * @param string $name
     */
    private static function fixPostNominal(string &$name): void
    {
        $postNominals = array_diff(self::POST_NOMINALS, self::$postNominalsExcluded);

        foreach ($postNominals as $postNominal) {
            $name = self::safeReplace('\b' . $postNominal . '\b', $postNominal, $name, 'ix');
        }
    }

    /**
     * Replace HTML entities with placeholders.
     *
     * @param string $name Modified in place.
     * @return array placeholder => original entity text, for
     *               restoreHtmlEntitiesFromPlaceholders(). Empty (and $name
     *               untouched) if the substitution fails.
     */
    private static function replaceHtmlEntitiesWithPlaceholders(string &$name): array
    {
        $placeholders = [];
        $counter = 0;

        $replaced = preg_replace_callback('/&[a-zA-Z0-9#]+;/i', function ($matches) use (&$placeholders, &$counter) {
            // note space at the end, to avoid merging with the next word
            $placeholder = mb_strtolower('HTML_ENTITY_PLACEHOLDER_' . $counter++ . ' ');
            $placeholders[$placeholder] = $matches[0];

            return $placeholder;
        }, $name);

        // preg_replace_callback() returns null on a PCRE error.
        if ( ! is_string($replaced)) {
            return [];
        }

        $name = $replaced;

        return $placeholders;
    }

    /**
     * Restore HTML entities.
     *
     * Matching is case-insensitive because the casing steps capitalise the
     * placeholder text.
     *
     * @param string $name Modified in place.
     * @param array $placeholders placeholder => original entity text.
     * @return void
     */
    private static function restoreHtmlEntitiesFromPlaceholders(string &$name, array $placeholders): void
    {
        foreach ($placeholders as $placeholder => $entity) {
            $restored = preg_replace('/' . preg_quote($placeholder, '/') . '/i', $entity, $name);

            // Keep the current text if PCRE reports an error (null).
            if (is_string($restored)) {
                $name = $restored;
            }
        }
    }

    /**
     * mb_ereg_replace() that never yields a non-string.
     *
     * @param string      $pattern
     * @param string      $replacement
     * @param string      $subject
     * @param string|null $options mb_ereg option letters, or null for the default.
     *
     * @return string The replaced text, or $subject unchanged on failure.
     */
    private static function safeReplace(string $pattern, string $replacement, string $subject, ?string $options = null): string
    {
        $result = $options === null
            ? mb_ereg_replace($pattern, $replacement, $subject)
            : mb_ereg_replace($pattern, $replacement, $subject, $options);

        return is_string($result) ? $result : $subject;
    }

    /**
     * mb_ereg_replace_callback() that never yields a non-string.
     *
     * @param string   $pattern
     * @param callable $callback Receives the match array, returns the replacement.
     * @param string   $subject
     *
     * @return string The replaced text, or $subject unchanged on failure.
     */
    private static function safeReplaceCallback(string $pattern, callable $callback, string $subject): string
    {
        $result = mb_ereg_replace_callback($pattern, $callback, $subject);

        return is_string($result) ? $result : $subject;
    }
}
456