WhitespaceTokenizer::tokenize() - Code Metrics - heiglandreas/Org_Heigl_Hyphenator - Measure and Improve Code Quality continuously with Scrutinizer

WhitespaceTokenizer::tokenize() A
last analyzed 2021-04-29 19:35 UTC

↳ Parent: WhitespaceTokenizer

Complexity

Conditions	4
Paths	4

Size

Total Lines

Duplication

Lines	19
Ratio	100 %

Importance

Changes

Metric	Value
dl	19
loc	19
rs	9.6333
c	0
b	0
f	0
cc	4
nc	4
nop	1

<?php
/**
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 * @category   Hyphenation
 * @package    Org_Heigl_Hyphenator
 * @subpackage Tokenizer
 * @author     Andreas Heigl <[email protected]>
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
 * @version    2.0.1
 * @link       http://github.com/heiglandreas/Hyphenator
 * @since      11.11.2011
 */

namespace Org\Heigl\Hyphenator\Tokenizer;

/**
 * Use Whitespace to split any input into tokens
 *
 * @category   Hyphenation
 * @package    Org_Heigl_Hyphenator
 * @subpackage Tokenizer
 * @author     Andreas Heigl <[email protected]>
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
 * @version    2.0.1
 * @link       http://github.com/heiglandreas/Hyphenator
 * @since      04.11.2011
 */
class WhitespaceTokenizer implements Tokenizer
{
    /**
     * Characters (or character-class escapes) that are treated as whitespace.
     *
     * The entries are concatenated verbatim into a regular-expression
     * character class, so every entry has to be valid inside "[...]".
     *
     * @var string[]
     */
    protected $whitespaces = array(
        '\s',           // white space
        "\xE2\x80\xAF", // non-breaking thin white space
        "\xC2\xA0",     // non-breaking space
    );

    /**
     * Split the given input into tokens using whitespace as splitter
     *
     * The input can be a string or a TokenRegistry. If the input is a
     * TokenRegistry, every WordToken it contains is tokenized and replaced
     * by the resulting tokens; all other tokens are left untouched and the
     * very same registry instance is returned.
     *
     * @param string|TokenRegistry $input The input to be tokenized
     *
     * @return TokenRegistry
     */
    public function run($input)
    {
        if ($input instanceof TokenRegistry) {
            // Tokenize a TokenRegistry
            // NOTE(review): the registry is modified while it is being
            // iterated - confirm that TokenRegistry::replace() supports this.
            foreach ($input as $token) {
                if (! $token instanceof WordToken) {
                    continue;
                }
                $newTokens = $this->tokenize($token->get());
                // Loose comparison on purpose: tokenize() always creates new
                // token instances, so only a comparison by value can detect
                // that the token did not change.
                if ($newTokens == array($token)) {
                    continue;
                }
                $input->replace($token, $newTokens);
            }

            return $input;
        }

        // Tokenize a simple string.
        $registry = new TokenRegistry();
        foreach ($this->tokenize($input) as $item) {
            $registry->add($item);
        }

        return $registry;
    }

    /**
     * Split the given string into tokens using whitespace.
     *
     * Each run of whitespace is placed in a WhitespaceToken, each empty
     * fragment (e.g. before leading or after trailing whitespace) in an
     * EmptyToken and everything else in a WordToken.
     *
     * When the input can not be split (preg_split() reports a failure, for
     * instance because the input is not valid UTF-8) the complete input is
     * returned as one single WordToken so that no content gets lost.
     *
     * @param string $input The String to tokenize
     *
     * @return Token[]
     */
    private function tokenize($input)
    {
        // Build the character class once; it is used for splitting as well
        // as for classifying the resulting fragments.
        $charClass = '[' . implode('', $this->whitespaces) . ']';

        $splits = preg_split('/(' . $charClass . '+)/u', $input, -1, PREG_SPLIT_DELIM_CAPTURE);

        if ($splits === false) {
            // preg_split() returns false on failure. Iterating over that
            // value would raise a warning and silently drop the whole input.
            return array(new WordToken($input));
        }

        $tokens = array();
        foreach ($splits as $split) {
            if ($split === '') {
                $tokens[] = new EmptyToken($split);
                continue;
            }
            if (preg_match('/^' . $charClass . '+$/um', $split)) {
                $tokens[] = new WhitespaceToken($split);
                continue;
            }
            $tokens[] = new WordToken($split);
        }

        return $tokens;
    }
}


1		<?php
2		/**
3		* Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4		*
5		* Permission is hereby granted, free of charge, to any person obtaining a copy
6		* of this software and associated documentation files (the "Software"), to deal
7		* in the Software without restriction, including without limitation the rights
8		* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9		* copies of the Software, and to permit persons to whom the Software is
10		* furnished to do so, subject to the following conditions:
11		*
12		* The above copyright notice and this permission notice shall be included in
13		* all copies or substantial portions of the Software.
14		*
15		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21		* THE SOFTWARE.
22		*
23		* @category Hyphenation
24		* @package Org_Heigl_Hyphenator
25		* @subpackage Tokenizer
26		* @author Andreas Heigl <[email protected]>
27		* @copyright 2008-2011 Andreas Heigl<[email protected]>
28		* @license http://www.opensource.org/licenses/mit-license.php MIT-License
29		* @version 2.0.1
30		* @link http://github.com/heiglandreas/Hyphenator
31		* @since 11.11.2011
32		*/
33
34		namespace Org\Heigl\Hyphenator\Tokenizer;
35
36		/**
37		* Use Whitespace to split any input into tokens
38		*
39		* @category Hyphenation
40		* @package Org_Heigl_Hyphenator
41		* @subpackage Tokenizer
42		* @author Andreas Heigl <[email protected]>
43		* @copyright 2008-2011 Andreas Heigl<[email protected]>
44		* @license http://www.opensource.org/licenses/mit-license.php MIT-License
45		* @version 2.0.1
46		* @link http://github.com/heiglandreas/Hyphenator
47		* @since 04.11.2011
48		*/
49		class WhitespaceTokenizer implements Tokenizer
50		{
51		/**
52		* @var string[]
53		*/
54		protected $whitespaces = array(
55		'\s', // white space
56		"\xE2\x80\xAF", // non-breaking thin white space
57		"\xC2\xA0", // non-breaking space
58		);
59
60		/**
61		* Split the given input into tokens using whitespace as splitter
62		*
63		* The input can be a string or a tokenRegistry. If the input is a
64		* TokenRegistry, each item will be tokenized.
65		*
66		* @param string\|TokenRegistry $input The
67		* input to be tokenized
68		*
69		* @return TokenRegistry
70		*/
71	View Code Duplication	public function run($input)
72		{
73		if ($input instanceof TokenRegistry) {
74		// Tokenize a TokenRegistry
75		foreach ($input as $token) {
76		if (! $token instanceof WordToken) {
77		continue;
78		}
79		$newTokens = $this->tokenize($token->get());
80		if ($newTokens == array($token)) {
81		continue;
82		}
83		$input->replace($token, $newTokens);
84		}
85
86		return $input ;
87		}
88
89		// Tokenize a simple string.
90		$array = $this->tokenize($input);
91		$registry = new TokenRegistry();
92		foreach ($array as $item) {
93		$registry->add($item);
94		}
95
96		return $registry;
97		}
98
99		/**
100		* Split the given string into tokens using whitespace.
101		*
102		* Each whitespace is placed in a WhitespaceToken and everything else is
103		* placed in a WordToken-Object
104		*
105		* @param string $input The String to tokenize
106		*
107		* @return Token[]
108		*/
109	View Code Duplication	private function tokenize($input)
110		{
111		$tokens = array();
112		$splits = preg_split("/([".implode("", $this->whitespaces)."]+)/u", $input, -1, PREG_SPLIT_DELIM_CAPTURE);
113
114		foreach ($splits as $split) {
115		if ($split === '') {
116		$tokens[] = new EmptyToken($split);
117		continue;
118		}
119		if (preg_match("/^[".implode("", $this->whitespaces)."]+$/um", $split)) {
120		$tokens[] = new WhitespaceToken($split);
121		continue;
122		}
123		$tokens[] = new WordToken($split);
124		}
125
126		return $tokens;
127		}
128		}
129

heiglandreas / Org_Heigl_Hyphenator

WhitespaceTokenizer::tokenize() A last analyzed 2021-04-29 19:35 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

WhitespaceTokenizer::tokenize() A
last analyzed 2021-04-29 19:35 UTC