PunctuationTokenizer::tokenize()   A
last analyzed

Complexity

Conditions 4
Paths 4

Size

Total Lines 18

Duplication

Lines 18
Ratio 100 %

Importance

Changes 0
Metric Value
dl 18
loc 18
rs 9.6666
c 0
b 0
f 0
cc 4
nc 4
nop 1
1
<?php
2
/**
3
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a copy
6
 * of this software and associated documentation files (the "Software"), to deal
7
 * in the Software without restriction, including without limitation the rights
8
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
 * copies of the Software, and to permit persons to whom the Software is
10
 * furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be included in
13
 * all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
 * THE SOFTWARE.
22
 *
23
 * @category   Hyphenation
24
 * @package    Org_Heigl_Hyphenator
25
 * @subpackage Tokenizer
26
 * @author     Andreas Heigl <[email protected]>
27
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
28
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
29
 * @version    2.0.1
30
 * @link       http://github.com/heiglandreas/Hyphenator
31
 * @since      11.11.2011
32
 */
33
34
namespace Org\Heigl\Hyphenator\Tokenizer;
35
36
/**
37
 * Use Punctuation to split any input into tokens
38
 *
39
 * @category   Hyphenation
40
 * @package    Org_Heigl_Hyphenator
41
 * @subpackage Tokenizer
42
 * @author     Andreas Heigl <[email protected]>
43
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
44
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
45
 * @version    2.0.1
46
 * @link       http://github.com/heiglandreas/Hyphenator
47
 * @since      04.11.2011
48
 */
49
class PunctuationTokenizer implements Tokenizer
{
    /**
     * The punctuation marks this tokenizer splits on.
     *
     * Every entry is expected to be a single UTF-8 character: tokenize()
     * decides whether a fragment is punctuation by looking up the fragment's
     * first character in this list.
     *
     * @var string[] $tokens
     */
    protected $tokens = [
        '.',
        '?',
        '!',
        ':',
        ';',
        ',',
        '#',
        '"',
        '$',
        '§',
        '%',
        '&',
        '/',
        '(',
        ')',
        '=',
        '[',
        ']',
        '|',
        '{',
        '}',
        '\\',
        '<',
        '>',
        '«',
        '»',
        '“',
        '”',
        '^',
        '°',
        '≤',
        '≥',
        '¥',
        '©',
        '€',
        "'",
        '-',
        '_',
    ];

    /**
     * Split the given input into tokens using punctuation marks as splitter
     *
     * The input can be a string or a TokenRegistry. If the input is a
     * TokenRegistry, every WordToken it contains is tokenized again and the
     * result is written to a clone of the registry; all other tokens are left
     * untouched.
     *
     * @param string|TokenRegistry $input The input to be tokenized
     *
     * @return TokenRegistry
     */
    public function run($input)
    {
        if ($input instanceof TokenRegistry) {
            // Replacements are applied to a clone while the passed registry
            // is the one being iterated.
            $result = clone $input;
            foreach ($input as $token) {
                if (! $token instanceof WordToken) {
                    continue;
                }
                $newTokens = $this->tokenize($token->get());
                // Loose comparison on purpose: the freshly created tokens are
                // never the same instances as $token, so only an equality
                // check can detect "nothing was split off".
                if ($newTokens == [$token]) {
                    continue;
                }
                $result->replace($token, $newTokens);
            }

            return $result;
        }

        // Tokenize a simple string.
        $registry = new TokenRegistry();
        foreach ($this->tokenize($input) as $item) {
            $registry->add($item);
        }

        return $registry;
    }

    /**
     * Split the given string into tokens using the punctuation marks.
     *
     * Each run of consecutive punctuation marks is placed in a NonWordToken
     * and everything between such runs is placed in a WordToken. Empty
     * fragments are dropped.
     *
     * When the string can not be split (preg_split() reports a failure, e.g.
     * because the input is not valid UTF-8) the complete input is returned as
     * one single WordToken instead of being silently discarded.
     *
     * @param string $input The String to tokenize
     *
     * @return Token[]
     */
    private function tokenize($input)
    {
        // Quote every sign so that none of them has a special meaning inside
        // the character class or collides with the "/" delimiter.
        $quoted = array_map(function ($sign) {
            return preg_quote($sign, '/');
        }, $this->tokens);
        $pattern = '/([' . implode('', $quoted) . ']+)/u';

        $splits = preg_split(
            $pattern,
            $input,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if (false === $splits) {
            // Keep the text instead of losing it.
            return '' === (string) $input ? [] : [new WordToken($input)];
        }

        $tokens = [];
        foreach ($splits as $split) {
            // The pattern is matched as UTF-8 ("u" modifier), so the first
            // character has to be extracted as UTF-8 as well, independent of
            // the configured mbstring internal encoding.
            $firstChar = mb_substr($split, 0, 1, 'UTF-8');
            if (in_array($firstChar, $this->tokens, true)) {
                $tokens[] = new NonWordToken($split);
                continue;
            }
            $tokens[] = new WordToken($split);
        }

        return $tokens;
    }
}
167