Completed
Pull Request — master (#37)
by Andreas
03:50 queued 02:16
created

WhitespaceTokenizer   A

Complexity

Total Complexity 9

Size/Duplication

Total Lines 73
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 3

Importance

Changes 2
Bugs 0 Features 0
Metric Value
wmc 9
c 2
b 0
f 0
lcom 1
cbo 3
dl 0
loc 73
rs 10

2 Methods

Rating   Name   Duplication   Size   Complexity  
B run() 0 27 6
A tokenize() 0 15 3
1
<?php
2
/**
3
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a copy
6
 * of this software and associated documentation files (the "Software"), to deal
7
 * in the Software without restriction, including without limitation the rights
8
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
 * copies of the Software, and to permit persons to whom the Software is
10
 * furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be included in
13
 * all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
 * THE SOFTWARE.
22
 *
23
 * @category   Hyphenation
24
 * @package    Org_Heigl_Hyphenator
25
 * @subpackage Tokenizer
26
 * @author     Andreas Heigl <[email protected]>
27
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
28
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
29
 * @version    2.0.1
30
 * @link       http://github.com/heiglandreas/Hyphenator
31
 * @since      11.11.2011
32
 */
33
34
namespace Org\Heigl\Hyphenator\Tokenizer;
35
36
/**
37
 * Use Whitespace to split any input into tokens
38
 *
39
 * @category   Hyphenation
40
 * @package    Org_Heigl_Hyphenator
41
 * @subpackage Tokenizer
42
 * @author     Andreas Heigl <[email protected]>
43
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
44
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
45
 * @version    2.0.1
46
 * @link       http://github.com/heiglandreas/Hyphenator
47
 * @since      04.11.2011
48
 */
49
class WhitespaceTokenizer implements Tokenizer
{
    /**
     * Fragments of a PCRE character class describing what counts as whitespace
     *
     * The entries are concatenated verbatim between "[" and "]" and matched
     * with the "u" modifier, so each multi-byte entry is treated as one UTF-8
     * encoded character.
     *
     * @var string[]
     */
    protected $whitespaces = array(
      '\s',           // white space
      "\xE2\x80\xAF", // non-breaking thin white space
      "\xC2\xA0",     // non-breaking space
    );

    /**
     * Split the given input into tokens using whitespace as splitter
     *
     * The input can be a string or a TokenRegistry. If the input is a
     * TokenRegistry, every WordToken it contains is tokenized and replaced by
     * the resulting tokens; all other tokens are left untouched and the very
     * same registry instance is returned. If the input is a string, a new
     * TokenRegistry holding the resulting tokens is returned.
     *
     * @param string|\Org\Heigl\Hyphenator\Tokenizer\TokenRegistry $input The
     * input to be tokenized
     *
     * @return \Org\Heigl\Hyphenator\Tokenizer\TokenRegistry
     */
    public function run($input)
    {
        if ($input instanceof TokenRegistry) {
            // Tokenize a TokenRegistry
            foreach ($input as $token) {
                if (! $token instanceof WordToken) {
                    continue;
                }
                $newTokens = $this->tokenize($token->get());
                // Loose comparison on purpose: tokenize() always creates new
                // objects, so an identity check could never match here.
                if ($newTokens == array($token)) {
                    continue;
                }
                $input->replace($token, $newTokens);
            }

            return $input;
        }

        // Tokenize a simple string.
        $registry = new TokenRegistry();
        foreach ($this->tokenize($input) as $item) {
            $registry->add($item);
        }

        return $registry;
    }

    /**
     * Split the given string into tokens using whitespace.
     *
     * Each run of whitespace is placed in a WhitespaceToken and everything
     * else is placed in a WordToken-Object. The tokens are returned in the
     * order in which they appear in the input. Input that starts or ends with
     * whitespace yields an empty WordToken at that position, as the split does
     * not discard empty parts.
     *
     * When the input can not be split (preg_split() reports a failure, which
     * happens for example when the input is not valid UTF-8) the complete
     * input is returned as one single WordToken so that no text gets lost.
     *
     * @param string $input The String to tokenize
     *
     * @return Token[]
     */
    private function tokenize($input)
    {
        // Build the character class once instead of once per split.
        $whitespace = '[' . implode('', $this->whitespaces) . ']+';

        $splits = preg_split('/(' . $whitespace . ')/u', $input, -1, PREG_SPLIT_DELIM_CAPTURE);
        if (false === $splits) {
            // Iterating over FALSE would raise a warning and produce no tokens
            // at all, silently dropping the text. Keep the input as it is.
            // NOTE(review): run() detects "nothing changed" through a loose
            // comparison, which presumably holds for a WordToken with the same
            // content - confirm against WordToken.
            return array(new WordToken($input));
        }

        $whitespaceOnly = '/^' . $whitespace . '$/um';

        $tokens = array();
        foreach ($splits as $split) {
            if (preg_match($whitespaceOnly, $split)) {
                $tokens[] = new WhitespaceToken($split);
                continue;
            }
            $tokens[] = new WordToken($split);
        }

        return $tokens;
    }
}
122