PunctuationTokenizer::tokenize() - Code Metrics - heiglandreas/Org_Heigl_Hyphenator - Measure and Improve Code Quality continuously with Scrutinizer

PunctuationTokenizer::tokenize() A
last analyzed 2021-04-29 19:35 UTC

↳ Parent: PunctuationTokenizer

Complexity

Conditions	4
Paths	4

Size

Total Lines

Duplication

Lines	18
Ratio	100 %

Importance

Changes

Metric	Value
dl	18
loc	18
rs	9.6666
c	0
b	0
f	0
cc	4
nc	4
nop	1

<?php
/**
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 * @category   Hyphenation
 * @package    Org_Heigl_Hyphenator
 * @subpackage Tokenizer
 * @author     Andreas Heigl <[email protected]>
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
 * @version    2.0.1
 * @link       http://github.com/heiglandreas/Hyphenator
 * @since      11.11.2011
 */

namespace Org\Heigl\Hyphenator\Tokenizer;

/**
 * Use Punctuation to split any input into tokens
 *
 * @category   Hyphenation
 * @package    Org_Heigl_Hyphenator
 * @subpackage Tokenizer
 * @author     Andreas Heigl <[email protected]>
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
 * @version    2.0.1
 * @link       http://github.com/heiglandreas/Hyphenator
 * @since      04.11.2011
 */
class PunctuationTokenizer implements Tokenizer
{

    /**
     * The punctuation characters this tokenizer splits on.
     *
     * Every entry is used twice: as a member of the regular-expression
     * character class that performs the split, and as the lookup list that
     * decides whether a fragment becomes a NonWordToken.
     *
     * @var string[] $tokens
     */
    protected $tokens = [
        '.',
        '?',
        '!',
        ':',
        ';',
        ',',
        '#',
        '"',
        '$',
        '§',
        '%',
        '&',
        '/',
        '(',
        ')',
        '=',
        '[',
        ']',
        '|',
        '{',
        '}',
        '\\',
        '<',
        '>',
        '«',
        '»',
        '“',
        '”',
        '^',
        '°',
        '≤',
        '≥',
        '¥',
        '©',
        '€',
        "'",
        '-',
        '_',
    ];

    /**
     * Split the given input into tokens using punctuation marks as splitter
     *
     * The input can be a string or a TokenRegistry. A string is tokenized
     * into a newly created TokenRegistry. For a TokenRegistry a clone is
     * returned in which every WordToken that contains punctuation has been
     * replaced by the tokens it was split into; all other tokens are left
     * untouched.
     *
     * @param string|TokenRegistry $input The
     * input to be tokenized
     *
     * @return TokenRegistry
     */
    public function run($input)
    {
        if ($input instanceof TokenRegistry) {
            // Iterate over the original registry while the replacements are
            // applied to the clone that is handed back to the caller.
            $f = clone $input;
            foreach ($input as $token) {
                if (! $token instanceof WordToken) {
                    continue;
                }
                $newTokens = $this->tokenize($token->get());
                // Loose comparison on purpose: tokenize() always creates new
                // objects, so the check has to compare class and property
                // values instead of object identity. Equal means "nothing was
                // split", so the token can stay as it is.
                if ($newTokens == [$token]) {
                    continue;
                }
                $f->replace($token, $newTokens);
            }

            return $f;
        }

        // Tokenize a simple string.
        $registry = new TokenRegistry();
        foreach ($this->tokenize($input) as $item) {
            $registry->add($item);
        }

        return $registry;
    }

    /**
     * Split the given string into tokens using the punctuation marks.
     *
     * Every run of consecutive punctuation characters is placed in a
     * NonWordToken and everything in between is placed in a WordToken.
     * Empty fragments are dropped.
     *
     * When the string can not be split (preg_split() reports a failure, for
     * example because the input is not valid UTF-8) the complete input is
     * returned as one single WordToken so that no content gets lost.
     *
     * @param string $input The String to tokenize
     *
     * @return Token[]
     */
    private function tokenize($input)
    {
        // preg_quote() escapes every character that is special inside the
        // character class (including "]", "\", "^", "-" and the delimiter)
        // without turning ordinary characters into escape sequences.
        $signs  = preg_quote(implode('', $this->tokens), '/');
        $splits = preg_split('/([' . $signs . ']+)/u', $input, -1, PREG_SPLIT_DELIM_CAPTURE);

        if (false === $splits) {
            // Splitting failed; keep the text instead of silently dropping it.
            return [new WordToken($input)];
        }

        $tokens = [];
        foreach ($splits as $split) {
            if ('' === $split) {
                continue;
            }
            // A fragment is either completely punctuation or completely
            // non-punctuation, so checking its first character is sufficient.
            // The encoding is passed explicitly to match the "u" modifier of
            // the pattern regardless of the configured internal encoding.
            if (in_array(mb_substr($split, 0, 1, 'UTF-8'), $this->tokens, true)) {
                $tokens[] = new NonWordToken($split);
                continue;
            }
            $tokens[] = new WordToken($split);
        }

        return $tokens;
    }
}


1		<?php
2		/**
3		* Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4		*
5		* Permission is hereby granted, free of charge, to any person obtaining a copy
6		* of this software and associated documentation files (the "Software"), to deal
7		* in the Software without restriction, including without limitation the rights
8		* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9		* copies of the Software, and to permit persons to whom the Software is
10		* furnished to do so, subject to the following conditions:
11		*
12		* The above copyright notice and this permission notice shall be included in
13		* all copies or substantial portions of the Software.
14		*
15		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21		* THE SOFTWARE.
22		*
23		* @category Hyphenation
24		* @package Org_Heigl_Hyphenator
25		* @subpackage Tokenizer
26		* @author Andreas Heigl <[email protected]>
27		* @copyright 2008-2011 Andreas Heigl<[email protected]>
28		* @license http://www.opensource.org/licenses/mit-license.php MIT-License
29		* @version 2.0.1
30		* @link http://github.com/heiglandreas/Hyphenator
31		* @since 11.11.2011
32		*/
33
34		namespace Org\Heigl\Hyphenator\Tokenizer;
35
36		/**
37		* Use Punctuation to split any input into tokens
38		*
39		* @category Hyphenation
40		* @package Org_Heigl_Hyphenator
41		* @subpackage Tokenizer
42		* @author Andreas Heigl <[email protected]>
43		* @copyright 2008-2011 Andreas Heigl<[email protected]>
44		* @license http://www.opensource.org/licenses/mit-license.php MIT-License
45		* @version 2.0.1
46		* @link http://github.com/heiglandreas/Hyphenator
47		* @since 04.11.2011
48		*/
49		class PunctuationTokenizer implements Tokenizer
50		{
51
52		/**
53		* The tokens to be handled by this tokenizer as an array.
54		*
55		* @var string[] $tokens
56		*/
57		protected $tokens = [
58		'.',
59		'?',
60		'!',
61		':',
62		';',
63		',',
64		'#',
65		'"',
66		'$',
67		'§',
68		'%',
69		'&',
70		'/',
71		'(',
72		')',
73		'=',
74		'[',
75		']',
76		'\|',
77		'{',
78		'}',
79		'\\',
80		'<',
81		'>',
82		'«',
83		'»',
84		'“',
85		'”',
86		'^',
87		'°',
88		'≤',
89		'≥',
90		'¥',
91		'©',
92		'€',
93		"'",
94		'-',
95		'_',
96		];
97
98		/**
99		* Split the given input into tokens using punktuation marks as splitter
100		*
101		* The input can be a string or a tokenRegistry. If the input is a
102		* TokenRegistry, each item will be tokenized.
103		*
104		* @param string\|TokenRegistry $input The
105		* input to be tokenized
106		*
107		* @return TokenRegistry
108		*/
109	View Code Duplication	public function run($input)
110		{
111		if ($input instanceof TokenRegistry) {
112		// Tokenize a TokenRegistry
113		$f = clone($input);
114		foreach ($input as $token) {
115		if (! $token instanceof WordToken) {
116		continue;
117		}
118		$newTokens = $this->tokenize($token->get());
119		if ($newTokens == array($token)) {
120		continue;
121		}
122		$f->replace($token, $newTokens);
123		}
124
125		return $f ;
126		}
127
128		// Tokenize a simple string.
129		$array = $this->tokenize($input);
130		$registry = new TokenRegistry();
131		foreach ($array as $item) {
132		$registry->add($item);
133		}
134
135		return $registry;
136		}
137
138		/**
139		* Split the given string into tokens using whitespace.
140		*
141		* Each whitespace is placed in a WhitespaceToken and everything else is
142		* placed in a WordToken-Object
143		*
144		* @param string $input The String to tokenize
145		*
146		* @return Token[]
147		*/
148	View Code Duplication	private function tokenize($input)
149		{
150		$tokens = array();
151		$signs = '\\' . implode('\\', $this->tokens);
152		$splits = preg_split('/([' . $signs . ']+)/u', $input, -1, PREG_SPLIT_DELIM_CAPTURE);
153		foreach ($splits as $split) {
154		if ('' == $split) {
155		continue;
156		}
157		if (in_array(mb_substr($split, 0, 1), $this->tokens)) {
158		$tokens[] = new NonWordToken($split);
159		continue;
160		}
161		$tokens[] = new WordToken($split);
162		}
163
164		return $tokens;
165		}
166		}
167

heiglandreas / Org_Heigl_Hyphenator

PunctuationTokenizer::tokenize() A last analyzed 2021-04-29 19:35 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

PunctuationTokenizer::tokenize() A
last analyzed 2021-04-29 19:35 UTC