1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Copyright (c) 2016 Martin Dilling-Hansen <[email protected]> |
4
|
|
|
* https://github.com/scripturadesign/tokenizer |
5
|
|
|
*/ |
6
|
|
|
|
7
|
|
|
namespace Scriptura\Tokenizer\Tokenizers; |
8
|
|
|
|
9
|
|
|
use Scriptura\Tokenizer\Tokenizer; |
10
|
|
|
|
11
|
|
|
class Simple implements Tokenizer
{
    /**
     * Get the token sequence from a character sequence.
     *
     * The string is padded with spaces, contractions are split from their
     * stem, runs of spaces are collapsed, the padding is trimmed off again
     * and the result is split on single spaces.
     *
     * @param string $string
     *
     * @return array List of tokens; an empty array when nothing but
     *               surrounding whitespace is left after normalisation.
     */
    public function tokenize($string)
    {
        $string = $this->wrapInSpaces($string);
        $string = $this->spaceBeforeContractions($string);
        $string = $this->concatenateDoubleOrMoreSpaces($string);
        $string = $this->removeStartingAndEndingSpaces($string);

        // explode() on an empty string returns [''], which is not a token.
        if ($string === '') {
            return [];
        }

        return explode(' ', $string);
    }

    /**
     * Pad the string with one space on each side.
     *
     * The contraction patterns require a trailing space, so the padding lets
     * them match a contraction that ends the input.
     *
     * @param string $string
     *
     * @return string
     */
    protected function wrapInSpaces($string)
    {
        return ' ' . $string . ' ';
    }

    /**
     * Insert a space between a word stem and its contraction suffix.
     *
     * Only contractions that are directly followed by a space are handled.
     *
     * @param string $string
     *
     * @return string
     */
    protected function spaceBeforeContractions($string)
    {
        // Special cases: "ain't" becomes "am n't" and "can't" becomes "can n't".
        //
        // The word boundary in front is a zero-width lookbehind rather than a
        // consumed character, so whatever precedes the contraction (an opening
        // quote, a byte of a multibyte character, ...) is left in place, and
        // two contractions separated by a single space are both matched.
        $string = preg_replace('/(?<!\w)(AI)(N\'T) /', 'AM ${2} ', $string);
        $string = preg_replace('/(?<!\w)(ai)(n\'t) /i', 'am ${2} ', $string);
        $string = preg_replace('/(?<!\w)(ca(n))(\'t) /i', '${1} ${2}${3} ', $string);

        // The rest: 's 'm 'd 'll 're 've n't (lower or upper case).
        // Any double spaces produced here are collapsed by the caller.
        $string = preg_replace('/(\'[sSmMdD]) /', ' ${1} ', $string);
        $string = preg_replace('/(\'ll|\'LL|\'re|\'RE|\'ve|\'VE|n\'t|N\'T) /', ' ${1} ', $string);

        return $string;
    }

    /**
     * Collapse every run of one or more spaces into a single space.
     *
     * Only the space character is affected; tabs and newlines are kept.
     *
     * @param string $string
     *
     * @return string
     */
    protected function concatenateDoubleOrMoreSpaces($string)
    {
        return preg_replace('/ +/', ' ', $string);
    }

    /**
     * Strip leading and trailing whitespace using trim()'s default
     * character list.
     *
     * @param string $string
     *
     * @return string
     */
    protected function removeStartingAndEndingSpaces($string)
    {
        return trim($string);
    }
}
60
|
|
|
|