CustomHyphenationTokenizer::run()   A
last analyzed

Complexity

Conditions 5
Paths 5

Size

Total Lines 25

Duplication

Lines 25
Ratio 100 %

Importance

Changes 0
Metric Value
dl 25
loc 25
rs 9.2088
c 0
b 0
f 0
cc 5
nc 5
nop 1
1
<?php
2
/**
3
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a copy
6
 * of this software and associated documentation files (the "Software"), to deal
7
 * in the Software without restriction, including without limitation the rights
8
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
 * copies of the Software, and to permit persons to whom the Software is
10
 * furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be included in
13
 * all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
 * THE SOFTWARE.
22
 *
23
 * @category   Hyphenation
24
 * @package    Org_Heigl_Hyphenator
25
 * @subpackage Tokenizer
26
 * @author     Andreas Heigl <[email protected]>
27
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
28
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
29
 * @version    2.0.1
30
 * @link       http://github.com/heiglandreas/Hyphenator
31
 * @since      11.11.2011
32
 */
33
34
namespace Org\Heigl\Hyphenator\Tokenizer;
35
36
use Org\Heigl\Hyphenator\Options;
37
38
/**
39
 * Split input into tokens around custom-hyphen and no-hyphenate markers
40
 *
41
 * @category   Hyphenation
42
 * @package    Org_Heigl_Hyphenator
43
 * @subpackage Tokenizer
44
 * @author     Andreas Heigl <[email protected]>
45
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
46
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
47
 * @version    2.0.1
48
 * @link       http://github.com/heiglandreas/Hyphenator
49
 * @since      04.11.2011
50
 */
51
class CustomHyphenationTokenizer implements Tokenizer
{
    /**
     * Source of the no-hyphenate marker, the custom hyphen and the hyphen.
     *
     * @var Options
     */
    private $options;

    /**
     * Create the tokenizer.
     *
     * @param Options $options Options providing getNoHyphenateString(),
     *                         getCustomHyphen() and getHyphen()
     */
    public function __construct(Options $options)
    {
        $this->options = $options;
    }

    /**
     * Split the given input into tokens around words that carry a
     * no-hyphenate marker or a custom hyphen.
     *
     * The input can be a string or a TokenRegistry. For a TokenRegistry a
     * clone is returned in which every WordToken has been replaced by the
     * tokens created from its content; all other tokens are left untouched.
     * For a string a new TokenRegistry containing the created tokens is
     * returned.
     *
     * @param string|TokenRegistry $input The input to be tokenized
     *
     * @return TokenRegistry
     */
    public function run($input)
    {
        if ($input instanceof TokenRegistry) {
            // Work on a clone so the registry that is being iterated is not
            // modified while looping over it.
            $f = clone $input;
            foreach ($input as $token) {
                if (! $token instanceof WordToken) {
                    continue;
                }
                $newTokens = $this->tokenize($token->get());
                $f->replace($token, $newTokens);
            }

            return $f;
        }

        // Tokenize a simple string.
        $registry = new TokenRegistry();
        foreach ($this->tokenize($input) as $item) {
            $registry->add($item);
        }

        return $registry;
    }

    /**
     * Split the given string at words that are prefixed with the
     * no-hyphenate marker or that contain the custom hyphen.
     *
     * - A word starting with the no-hyphenate marker becomes an
     *   ExcludedWordToken with the marker removed.
     * - A word containing the custom hyphen becomes an ExcludedWordToken in
     *   which the custom hyphen is replaced by the configured hyphen.
     * - Every other non-empty fragment becomes a WordToken.
     *
     * Both markers are treated as literal text: they are escaped before
     * being embedded into the regular expression, and an empty (or null)
     * marker simply disables the corresponding rule.
     *
     * @param string $input The String to tokenize
     *
     * @return Token[]
     */
    private function tokenize($input)
    {
        $noHyphenate  = (string) $this->options->getNoHyphenateString();
        $customHyphen = (string) $this->options->getCustomHyphen();

        // Only build alternatives for markers that are actually set. An
        // empty marker would degenerate its alternative into "match any
        // word" instead of "match a marked word".
        $alternatives = [];
        if ('' !== $noHyphenate) {
            // NOTE(review): the look-behind needs a preceding non-word
            // character, so a marker at the very start of $input is not
            // matched. Kept as in the original; confirm this is intended.
            $alternatives[] = '(?<=\W)' . preg_quote($noHyphenate, '/');
        }
        if ('' !== $customHyphen) {
            $alternatives[] = '\b\w+' . preg_quote($customHyphen, '/');
        }

        $splits = [$input];
        if ([] !== $alternatives) {
            $pattern = '/((?:' . implode('|', $alternatives) . ')\w+?\b)/u';
            $result  = preg_split($pattern, $input, -1, PREG_SPLIT_DELIM_CAPTURE);
            // preg_split() returns false on failure (e.g. invalid UTF-8 with
            // the "u" modifier). Keep the text as one fragment in that case
            // instead of silently dropping it.
            if (false !== $result) {
                $splits = $result;
            }
        }

        $tokens = [];
        foreach ($splits as $split) {
            if ('' === (string) $split) {
                continue;
            }

            // The explicit empty-marker guards matter: mb_strpos() accepts
            // an empty needle on PHP >= 8.0 and reports position 0, which
            // would turn every fragment into an ExcludedWordToken.
            if ('' !== $noHyphenate && 0 === mb_strpos($split, $noHyphenate)) {
                $tokens[] = new ExcludedWordToken(str_replace(
                    $noHyphenate,
                    '',
                    $split
                ));
                continue;
            }

            if ('' !== $customHyphen && false !== mb_strpos($split, $customHyphen)) {
                $tokens[] = new ExcludedWordToken(str_replace(
                    $customHyphen,
                    $this->options->getHyphen(),
                    $split
                ));
                continue;
            }

            $tokens[] = new WordToken($split);
        }

        return $tokens;
    }
}
145