1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace TheIconic\NameParser; |
4
|
|
|
|
5
|
|
|
use TheIconic\NameParser\Language\English; |
6
|
|
|
use TheIconic\NameParser\Mapper\NicknameMapper; |
7
|
|
|
use TheIconic\NameParser\Mapper\SalutationMapper; |
8
|
|
|
use TheIconic\NameParser\Mapper\SuffixMapper; |
9
|
|
|
use TheIconic\NameParser\Mapper\InitialMapper; |
10
|
|
|
use TheIconic\NameParser\Mapper\LastnameMapper; |
11
|
|
|
use TheIconic\NameParser\Mapper\FirstnameMapper; |
12
|
|
|
use TheIconic\NameParser\Mapper\MiddlenameMapper; |
13
|
|
|
use TheIconic\NameParser\Mapper\CompanyMapper; |
14
|
|
|
use TheIconic\NameParser\Mapper\ExtensionMapper; |
15
|
|
|
use TheIconic\NameParser\Mapper\MultipartMapper; |
16
|
|
|
|
17
|
|
|
/**
 * Parses a full name string into a Name object.
 *
 * The input is normalized, optionally split at commas, and its space-separated
 * parts are then passed through a chain of mapper objects, each of which
 * receives the current list of parts and returns an updated list.
 */
class Parser
{
    /**
     * Characters that normalize() collapses into a single space.
     *
     * @var string
     */
    protected $whitespace = " \r\n\t";

    /**
     * Mapper chain applied by parse(). While empty, getMappers() installs the default chain.
     *
     * @var array
     */
    protected $mappers = [];

    /**
     * Language definitions queried for salutations, suffixes, prefixes, titles,
     * extensions and company identifiers.
     *
     * @var LanguageInterface[]
     */
    protected $languages = [];

    /**
     * Delimiters handed to the NicknameMapper.
     *
     * @var array
     */
    protected $nicknameDelimiters = [];

    /**
     * Value handed to the SalutationMapper as its second constructor argument.
     *
     * @var int
     */
    protected $maxSalutationIndex = 0;

    /**
     * Value handed to the InitialMapper as its first constructor argument.
     *
     * @var int
     */
    protected $maxCombinedInitials = 2;

    /**
     * True while $mappers holds the default chain built by getMappers().
     *
     * The default chain captures the configuration values at build time, so the
     * configuration setters use this flag to know when the cached chain is stale.
     *
     * @var bool
     */
    private $defaultMappersInstalled = false;

    /**
     * @param array $languages language definitions to use; English is used when the list is empty
     */
    public function __construct(array $languages = [])
    {
        $this->languages = [] === $languages ? [new English()] : $languages;
    }

    /**
     * split full names into the following parts:
     * - prefix / salutation  (Mr., Mrs., etc)
     * - given name / first name
     * - middle initials
     * - surname / last name
     * - suffix (II, Phd, Jr, etc)
     * - extension (Germany: nobility predicate is part of lastname)
     * - title (Germany: academic titles are usually used as name parts between salutation and given name)
     * - company (the string contains typical characteristics for a company name and is returned identically)
     *
     * A name containing at least one comma is parsed segment by segment; only
     * the first three comma-separated segments are considered, any further
     * segments are ignored.
     *
     * @param string $name
     * @return Name
     */
    public function parse($name): Name
    {
        $name = $this->normalize($name);

        $segments = explode(',', $name);

        if (count($segments) > 1) {
            return $this->parseSplitName($segments[0], $segments[1], $segments[2] ?? '');
        }

        // A non-empty result means the whole string was recognised as a company name.
        $company = $this->getCompany($name);

        if ([] !== $company) {
            return new Name($company);
        }

        $parts = explode(' ', $name);

        foreach ($this->getMappers() as $mapper) {
            $parts = $mapper->map($parts);
        }

        return new Name($parts);
    }

    /**
     * handles split-parsing of comma-separated name parts
     *
     * Each segment is parsed by its own dedicated sub-parser and the resulting
     * parts are concatenated in segment order.
     *
     * @param string $first - the name part left of the first comma
     * @param string $second - the name part right of the first comma
     * @param string $third - the name part right of the second comma ('' when absent)
     * @return Name
     */
    protected function parseSplitName($first, $second, $third): Name
    {
        $parts = array_merge(
            $this->getFirstSegmentParser()->parse($first)->getParts(),
            $this->getSecondSegmentParser()->parse($second)->getParts(),
            $this->getThirdSegmentParser()->parse($third)->getParts()
        );

        return new Name($parts);
    }

    /**
     * build the parser used for the segment left of the first comma
     *
     * @return Parser
     */
    protected function getFirstSegmentParser(): Parser
    {
        return $this->createSegmentParser([
            new ExtensionMapper($this->getExtensions()),
            new MultipartMapper($this->getTitles(), 'title'),
            new MultipartMapper($this->getPrefixes(), 'prefix'),
            new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()),
            new SuffixMapper($this->getSuffixes(), false, 2),
            new LastnameMapper($this->getPrefixes(), true),
            new FirstnameMapper(),
            new MiddlenameMapper(),
        ]);
    }

    /**
     * build the parser used for the segment between the first and second comma
     *
     * @return Parser
     */
    protected function getSecondSegmentParser(): Parser
    {
        return $this->createSegmentParser([
            new ExtensionMapper($this->getExtensions()),
            new MultipartMapper($this->getTitles(), 'title'),
            new MultipartMapper($this->getPrefixes(), 'prefix'),
            new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()),
            new SuffixMapper($this->getSuffixes(), true, 1),
            new NicknameMapper($this->getNicknameDelimiters()),
            new InitialMapper($this->getMaxCombinedInitials(), true),
            new FirstnameMapper(),
            new MiddlenameMapper(true),
        ]);
    }

    /**
     * build the parser used for the segment right of the second comma
     *
     * Only a SuffixMapper is applied to this segment.
     *
     * @return Parser
     */
    protected function getThirdSegmentParser(): Parser
    {
        return $this->createSegmentParser([
            new SuffixMapper($this->getSuffixes(), true, 0),
        ]);
    }

    /**
     * create a sub-parser that applies exactly the given mapper chain
     *
     * The sub-parser receives this parser's languages so that the company
     * detection it performs inside parse() uses the same language data as the
     * mappers in its chain, rather than falling back to the English default.
     *
     * @param array $mappers
     * @return Parser
     */
    private function createSegmentParser(array $mappers): Parser
    {
        $parser = new self($this->languages);
        $parser->setMappers($mappers);

        return $parser;
    }

    /**
     * get the mappers for this parser
     *
     * When no mappers are set, the default chain is built from the current
     * configuration and cached. The cached default chain is discarded again
     * when one of the configuration setters is called afterwards.
     *
     * @return array
     */
    public function getMappers(): array
    {
        if (empty($this->mappers)) {
            $this->setMappers([
                new ExtensionMapper($this->getExtensions()),
                new MultipartMapper($this->getTitles(), 'title'),
                new MultipartMapper($this->getPrefixes(), 'prefix'),
                new NicknameMapper($this->getNicknameDelimiters()),
                new SalutationMapper($this->getSalutations(), $this->getMaxSalutationIndex()),
                new SuffixMapper($this->getSuffixes()),
                new InitialMapper($this->getMaxCombinedInitials()),
                new LastnameMapper($this->getPrefixes()),
                new FirstnameMapper(),
                new MiddlenameMapper(),
            ]);

            // Set after setMappers(), which marks every chain it stores as custom.
            $this->defaultMappersInstalled = true;
        }

        return $this->mappers;
    }

    /**
     * get name as company if parts matches company identifiers
     *
     * @param string $name
     * @return array the result of CompanyMapper::map() for the single-element
     *               list [$name]; parse() treats an empty array as "not a company"
     */
    protected function getCompany(string $name): array
    {
        $mapper = new CompanyMapper($this->getCompanies());

        return $mapper->map([$name]);
    }

    /**
     * set the mappers for this parser
     *
     * Passing an empty array makes getMappers() fall back to the default chain.
     *
     * @param array $mappers
     * @return Parser
     */
    public function setMappers(array $mappers): Parser
    {
        $this->mappers = $mappers;
        $this->defaultMappersInstalled = false;

        return $this;
    }

    /**
     * normalize the name
     *
     * Every run of configured whitespace characters is collapsed into a single
     * space, then the result is trimmed. Collapsing happens before trimming so
     * that custom whitespace characters at either end of the name do not leave
     * a stray leading or trailing space behind.
     *
     * Matching is byte-wise (the pattern carries no `u` modifier).
     *
     * @param string $name
     * @return string
     */
    protected function normalize(string $name): string
    {
        $whitespace = $this->getWhitespace();

        // An empty set would yield the invalid pattern '/[]+/'; nothing to collapse.
        if ('' === $whitespace) {
            return trim($name);
        }

        // The delimiter must be passed to preg_quote() so that a '/' inside the
        // whitespace set does not terminate the pattern prematurely.
        $pattern = '/[' . preg_quote($whitespace, '/') . ']+/';

        $collapsed = preg_replace($pattern, ' ', $name);

        // preg_replace() returns null on failure; fall back to the raw input.
        return trim($collapsed ?? $name);
    }

    /**
     * get a string of characters that are supposed to be treated as whitespace
     *
     * @return string
     */
    public function getWhitespace(): string
    {
        return $this->whitespace;
    }

    /**
     * set the string of characters that are supposed to be treated as whitespace
     *
     * @param string $whitespace
     * @return Parser
     */
    public function setWhitespace($whitespace): Parser
    {
        $this->whitespace = $whitespace;

        return $this;
    }

    /**
     * get the last name prefixes of all configured languages
     *
     * @return array
     */
    protected function getPrefixes()
    {
        return $this->mergeFromLanguages(static function ($language) {
            return $language->getLastnamePrefixes();
        });
    }

    /**
     * get the suffixes of all configured languages
     *
     * @return array
     */
    protected function getSuffixes()
    {
        return $this->mergeFromLanguages(static function ($language) {
            return $language->getSuffixes();
        });
    }

    /**
     * get the salutations of all configured languages
     *
     * @return array
     */
    protected function getSalutations()
    {
        return $this->mergeFromLanguages(static function ($language) {
            return $language->getSalutations();
        });
    }

    /**
     * get the extensions of all configured languages
     *
     * @return array
     */
    protected function getExtensions()
    {
        return $this->mergeFromLanguages(static function ($language) {
            return $language->getExtensions();
        });
    }

    /**
     * get the titles of all configured languages
     *
     * @return array
     */
    protected function getTitles()
    {
        return $this->mergeFromLanguages(static function ($language) {
            return $language->getTitles();
        });
    }

    /**
     * get the company identifiers of all configured languages
     *
     * @return array
     */
    protected function getCompanies()
    {
        return $this->mergeFromLanguages(static function ($language) {
            return $language->getCompanies();
        });
    }

    /**
     * merge one kind of definition array across all configured languages
     *
     * @param callable $extract receives a language and returns its definition array
     * @return array
     */
    private function mergeFromLanguages(callable $extract): array
    {
        $merged = [];

        /** @var LanguageInterface $language */
        foreach ($this->languages as $language) {
            // Array union: on a key collision the entry of the earlier language wins.
            $merged += $extract($language);
        }

        return $merged;
    }

    /**
     * drop the cached default mapper chain so that it is rebuilt with the
     * current configuration on next use; a custom chain is left untouched
     */
    private function discardDefaultMappers()
    {
        if ($this->defaultMappersInstalled) {
            $this->mappers = [];
            $this->defaultMappersInstalled = false;
        }
    }

    /**
     * @return array
     */
    public function getNicknameDelimiters(): array
    {
        return $this->nicknameDelimiters;
    }

    /**
     * @param array $nicknameDelimiters
     * @return Parser
     */
    public function setNicknameDelimiters(array $nicknameDelimiters): Parser
    {
        $this->nicknameDelimiters = $nicknameDelimiters;
        $this->discardDefaultMappers();

        return $this;
    }

    /**
     * @return int
     */
    public function getMaxSalutationIndex(): int
    {
        return $this->maxSalutationIndex;
    }

    /**
     * @param int $maxSalutationIndex
     * @return Parser
     */
    public function setMaxSalutationIndex(int $maxSalutationIndex): Parser
    {
        $this->maxSalutationIndex = $maxSalutationIndex;
        $this->discardDefaultMappers();

        return $this;
    }

    /**
     * @return int
     */
    public function getMaxCombinedInitials(): int
    {
        return $this->maxCombinedInitials;
    }

    /**
     * @param int $maxCombinedInitials
     * @return Parser
     */
    public function setMaxCombinedInitials(int $maxCombinedInitials): Parser
    {
        $this->maxCombinedInitials = $maxCombinedInitials;
        $this->discardDefaultMappers();

        return $this;
    }
}
405
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.