WhitespaceTokenizer::tokenize()   A
last analyzed

Complexity

Conditions 4
Paths 4

Size

Total Lines 19

Duplication

Lines 19
Ratio 100 %

Importance

Changes 0
Metric Value
dl 19
loc 19
rs 9.6333
c 0
b 0
f 0
cc 4
nc 4
nop 1
1
<?php
2
/**
3
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a copy
6
 * of this software and associated documentation files (the "Software"), to deal
7
 * in the Software without restriction, including without limitation the rights
8
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
 * copies of the Software, and to permit persons to whom the Software is
10
 * furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be included in
13
 * all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
 * THE SOFTWARE.
22
 *
23
 * @category   Hyphenation
24
 * @package    Org_Heigl_Hyphenator
25
 * @subpackage Tokenizer
26
 * @author     Andreas Heigl <[email protected]>
27
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
28
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
29
 * @version    2.0.1
30
 * @link       http://github.com/heiglandreas/Hyphenator
31
 * @since      11.11.2011
32
 */
33
34
namespace Org\Heigl\Hyphenator\Tokenizer;
35
36
/**
37
 * Use Whitespace to split any input into tokens
38
 *
39
 * @category   Hyphenation
40
 * @package    Org_Heigl_Hyphenator
41
 * @subpackage Tokenizer
42
 * @author     Andreas Heigl <[email protected]>
43
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
44
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
45
 * @version    2.0.1
46
 * @link       http://github.com/heiglandreas/Hyphenator
47
 * @since      04.11.2011
48
 */
49
class WhitespaceTokenizer implements Tokenizer
{
    /**
     * Pattern fragments that are treated as whitespace.
     *
     * The fragments are concatenated verbatim into one PCRE character class
     * (see tokenize()), so each entry has to be valid inside "[...]".
     *
     * @var string[]
     */
    protected $whitespaces = array(
        '\s',           // white space
        "\xE2\x80\xAF", // non-breaking thin white space (UTF-8 bytes of U+202F)
        "\xC2\xA0",     // non-breaking space (UTF-8 bytes of U+00A0)
    );

    /**
     * Split the given input into tokens using whitespace as splitter
     *
     * The input can be a string or a TokenRegistry. If the input is a
     * TokenRegistry, each WordToken it contains is tokenized again and
     * replaced by the resulting tokens; tokens of any other type are left
     * untouched. The same registry instance is returned in that case.
     *
     * @param string|TokenRegistry $input The input to be tokenized
     *
     * @return TokenRegistry
     */
    public function run($input)
    {
        if ($input instanceof TokenRegistry) {
            // Tokenize a TokenRegistry
            // NOTE(review): the registry is modified via replace() while it is
            // being iterated; this relies on TokenRegistry's iterator
            // tolerating that - confirm against its implementation.
            foreach ($input as $token) {
                if (! $token instanceof WordToken) {
                    continue;
                }
                $newTokens = $this->tokenize($token->get());
                // Loose comparison on purpose: objects compare equal when
                // they have the same class and equal properties, so this
                // detects "tokenizing changed nothing" and skips the replace.
                if ($newTokens == array($token)) {
                    continue;
                }
                $input->replace($token, $newTokens);
            }

            return $input;
        }

        // Tokenize a simple string.
        $registry = new TokenRegistry();
        foreach ($this->tokenize($input) as $item) {
            $registry->add($item);
        }

        return $registry;
    }

    /**
     * Split the given string into tokens using whitespace.
     *
     * Each run of whitespace is placed in a WhitespaceToken, each empty
     * chunk in an EmptyToken and everything else in a WordToken. The order
     * of the tokens matches the order of the chunks in the input.
     *
     * If the input cannot be split (preg_split() reports a failure, e.g.
     * because the string is not valid UTF-8), the whole input is kept as one
     * single chunk instead of being dropped.
     *
     * @param string $input The String to tokenize
     *
     * @return Token[]
     */
    private function tokenize($input)
    {
        // Build the character class once; it is the same for the split and
        // for the classification of every chunk below.
        $whitespaceRun = '[' . implode('', $this->whitespaces) . ']+';

        // PREG_SPLIT_DELIM_CAPTURE keeps the whitespace runs in the result so
        // that they can be turned into tokens as well.
        $splits = preg_split('/(' . $whitespaceRun . ')/u', $input, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($splits === false) {
            // preg_split() returns false on failure. Iterating over false
            // would only raise a warning and silently lose the text, so fall
            // back to treating the input as one unsplit chunk.
            $splits = array((string) $input);
        }

        $tokens = array();
        foreach ($splits as $split) {
            if ($split === '') {
                $tokens[] = new EmptyToken($split);
                continue;
            }
            if (preg_match('/^' . $whitespaceRun . '$/um', $split)) {
                $tokens[] = new WhitespaceToken($split);
                continue;
            }
            $tokens[] = new WordToken($split);
        }

        return $tokens;
    }
}
129