1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Onoi\Tesa\Tokenizer; |
4
|
|
|
|
5
|
|
|
use IntlRuleBasedBreakIterator; |
6
|
|
|
|
7
|
|
|
/** |
8
|
|
|
* @license GNU GPL v2+ |
9
|
|
|
* @since 0.1 |
10
|
|
|
* |
11
|
|
|
* @author mwjames |
12
|
|
|
*/ |
13
|
|
|
class IcuWordBoundaryTokenizer implements Tokenizer {

	/**
	 * Optional tokenizer that is run on the input before the ICU word
	 * segmentation; null when no tokenizer was injected.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Locale handed to the ICU word break iterator.
	 *
	 * @var string
	 */
	private $locale = 'en';

	/**
	 * Flag reported by isWordTokenizer().
	 *
	 * @var boolean
	 */
	private $isWordTokenizer = true;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the wrapped tokenizer, if there is one. This
	 * class keeps no options of its own.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return $this->isWordTokenizer;
	}

	/**
	 * Stores the flag that isWordTokenizer() reports and returns the value
	 * that was assigned.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setWordTokenizerAttribute( $usesWordBoundaries ) {
		return $this->isWordTokenizer = $usesWordBoundaries;
	}

	/**
	 * Whether the ICU break iterator class is loaded.
	 *
	 * @since 0.1
	 *
	 * @return boolean
	 */
	public function isAvailable() {
		return class_exists( 'IntlRuleBasedBreakIterator' );
	}

	/**
	 * @since 0.1
	 *
	 * @param string $locale
	 */
	public function setLocale( $locale ) {
		$this->locale = $locale;
	}

	/**
	 * Splits the string into tokens using the ICU word break iterator.
	 *
	 * When a tokenizer was injected, it is applied first and its tokens are
	 * joined with a single space before the ICU segmentation. When the ICU
	 * class is not available, the result of the injected tokenizer is
	 * returned as-is, or the unmodified string as a single token when no
	 * tokenizer was injected.
	 *
	 * @since 0.1
	 *
	 * @param string $string
	 *
	 * @return array|false
	 */
	public function tokenize( $string ) {

		$preTokens = null;

		// Run the wrapped tokenizer exactly once; its result is used either
		// as the fallback return value or as the input of the ICU pass.
		if ( $this->tokenizer !== null ) {
			$preTokens = $this->tokenizer->tokenize( $string );
		}

		if ( !$this->isAvailable() ) {
			return $this->tokenizer !== null ? $preTokens : array( $string );
		}

		// Only join an actual token list; any other result leaves the
		// original string in place instead of failing inside implode().
		if ( is_array( $preTokens ) ) {
			$string = implode( " ", $preTokens );
		}

		return $this->createTokens( $string );
	}

	/**
	 * Cuts the string at the boundaries reported by the word break iterator
	 * and collects every segment that is not empty and does not consist
	 * solely of spaces.
	 *
	 * @param string $string
	 *
	 * @return array
	 */
	private function createTokens( $string ) {

		$tokens = array();
		$iterator = IntlRuleBasedBreakIterator::createWordInstance( $this->locale );

		// No iterator could be created for the locale, nothing to segment.
		if ( !$iterator ) {
			return $tokens;
		}

		$iterator->setText( $string );
		$prev = 0;

		foreach ( $iterator as $boundary ) {

			// The leading boundary at offset 0 does not close a segment.
			if ( $boundary === 0 ) {
				continue;
			}

			// Boundaries are used as byte positions into $string.
			$segment = substr( $string, $prev, $boundary - $prev );
			$prev = $boundary;

			// Skip separators: a run of consecutive spaces can be reported
			// as one segment, so test for "spaces only" rather than for
			// exactly one space.
			if ( trim( $segment, ' ' ) !== '' ) {
				$tokens[] = $segment;
			}
		}

		return $tokens;
	}

}
134
|
|
|
|
If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or an abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object-oriented design.
Let’s take a look at an example:
Our function `my_function` expects a `Post` object, and outputs the author of the post. The base class `Post` returns a simple string, and outputting a simple string will work just fine. However, the child class `BlogPost`, which is a sub-type of `Post`, instead decided to return an `object`, and is therefore violating the SOLID principles. If a `BlogPost` were passed to `my_function`, PHP would not complain, but would ultimately fail when executing the `strtoupper` call in its body.