WhitespaceTokenizer - Code Metrics - Inspection of "Set PHP 7.2 as default version" - heiglandreas/Org_Heigl_Hyphenator - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#37)

by Andreas

created 2018-02-09 17:09 UTC

WhitespaceTokenizer A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	73
Duplicated Lines	0 %

Coupling/Cohesion

Components	1
Dependencies	3

Importance

Changes	2
Bugs	0	Features	0

Metric	Value
wmc	9
c	2
b	0
f	0
lcom	1
cbo	3
dl	0
loc	73
rs	10

2 Methods

Rating	Name	Duplication	Size	Complexity
B	run()	0	27	6
A	tokenize()	0	15	3

<?php
/**
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 * @category   Hyphenation
 * @package    Org_Heigl_Hyphenator
 * @subpackage Tokenizer
 * @author     Andreas Heigl <[email protected]>
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
 * @version    2.0.1
 * @link       http://github.com/heiglandreas/Hyphenator
 * @since      11.11.2011
 */

namespace Org\Heigl\Hyphenator\Tokenizer;

/**
 * Use Whitespace to split any input into tokens
 *
 * @category   Hyphenation
 * @package    Org_Heigl_Hyphenator
 * @subpackage Tokenizer
 * @author     Andreas Heigl <[email protected]>
 * @copyright  2008-2011 Andreas Heigl<[email protected]>
 * @license    http://www.opensource.org/licenses/mit-license.php MIT-License
 * @version    2.0.1
 * @link       http://github.com/heiglandreas/Hyphenator
 * @since      04.11.2011
 */
class WhitespaceTokenizer implements Tokenizer
{
    /**
     * Characters (or PCRE escapes) that are treated as whitespace.
     *
     * tokenize() concatenates all entries and places them inside a regular
     * expression character class, so every entry has to be valid within
     * "[...]". The byte sequences below are the UTF-8 encodings of U+202F
     * and U+00A0.
     *
     * @var string[]
     */
    protected $whitespaces = array(
      '\s',           // white space
      "\xE2\x80\xAF", // non-breaking thin white space
      "\xC2\xA0",     // non-breaking space
    );

    /**
     * Split the given input into tokens using whitespace as splitter
     *
     * The input can be a string or a TokenRegistry. If the input is a
     * TokenRegistry, every WordToken it contains is tokenized and replaced
     * in place by the resulting tokens; all other tokens are left untouched
     * and the same registry instance is returned. A string is tokenized
     * into a newly created TokenRegistry.
     *
     * @param string|\Org\Heigl\Hyphenator\Tokenizer\TokenRegistry $input The
     * input to be tokenized
     *
     * @return \Org\Heigl\Hyphenator\Tokenizer\TokenRegistry
     */
    public function run($input)
    {
        if ($input instanceof TokenRegistry) {
            // Tokenize a TokenRegistry
            // NOTE(review): replace() is called while the registry is being
            // iterated - this relies on TokenRegistry tolerating that; confirm.
            foreach ($input as $token) {
                if (! $token instanceof WordToken) {
                    continue;
                }
                $newTokens = $this->tokenize($token->get());
                // Loose comparison on purpose: tokenize() always creates new
                // objects, so a strict (identity based) comparison of the two
                // arrays could never be true.
                if ($newTokens == array($token)) {
                    continue;
                }
                $input->replace($token, $newTokens);
            }

            return $input ;
        }

        // Tokenize a simple string.
        $array =  $this->tokenize($input);
        $registry = new TokenRegistry();
        foreach ($array as $item) {
            $registry->add($item);
        }

        return $registry;
    }

    /**
     * Split the given string into tokens using whitespace.
     *
     * Each run of whitespace is placed in a WhitespaceToken and everything
     * else is placed in a WordToken-Object. Leading or trailing whitespace
     * yields an empty WordToken at the respective end, as the split keeps
     * empty parts.
     *
     * When the string can not be split (preg_split() reports a failure, e.g.
     * because the input is not valid UTF-8) the input is returned unchanged
     * as one single WordToken, so that no content gets lost.
     *
     * @param string $input The String to tokenize
     *
     * @return Token[]
     */
    private function tokenize($input)
    {
        // Build the character class once instead of once per split part.
        $class = implode("", $this->whitespaces);

        $splits = preg_split("/([" . $class . "]+)/u", $input, -1, PREG_SPLIT_DELIM_CAPTURE);
        if (false === $splits) {
            // Without this guard the foreach below would raise a warning and
            // an empty array would be returned - which makes run() replace
            // the original token with nothing.
            return array(new WordToken($input));
        }

        $tokens = array();
        foreach ($splits as $split) {
            if (preg_match("/^[" . $class . "]+$/um", $split)) {
                $tokens[] = new WhitespaceToken($split);
                continue;
            }
            $tokens[] = new WordToken($split);
        }

        return $tokens;
    }
}


1			<?php
2			/**
3			* Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4			*
5			* Permission is hereby granted, free of charge, to any person obtaining a copy
6			* of this software and associated documentation files (the "Software"), to deal
7			* in the Software without restriction, including without limitation the rights
8			* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9			* copies of the Software, and to permit persons to whom the Software is
10			* furnished to do so, subject to the following conditions:
11			*
12			* The above copyright notice and this permission notice shall be included in
13			* all copies or substantial portions of the Software.
14			*
15			* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16			* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17			* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18			* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19			* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20			* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21			* THE SOFTWARE.
22			*
23			* @category Hyphenation
24			* @package Org_Heigl_Hyphenator
25			* @subpackage Tokenizer
26			* @author Andreas Heigl <[email protected]>
27			* @copyright 2008-2011 Andreas Heigl<[email protected]>
28			* @license http://www.opensource.org/licenses/mit-license.php MIT-License
29			* @version 2.0.1
30			* @link http://github.com/heiglandreas/Hyphenator
31			* @since 11.11.2011
32			*/
33
34			namespace Org\Heigl\Hyphenator\Tokenizer;
35
36			/**
37			* Use Whitespace to split any input into tokens
38			*
39			* @category Hyphenation
40			* @package Org_Heigl_Hyphenator
41			* @subpackage Tokenizer
42			* @author Andreas Heigl <[email protected]>
43			* @copyright 2008-2011 Andreas Heigl<[email protected]>
44			* @license http://www.opensource.org/licenses/mit-license.php MIT-License
45			* @version 2.0.1
46			* @link http://github.com/heiglandreas/Hyphenator
47			* @since 04.11.2011
48			*/
49			class WhitespaceTokenizer implements Tokenizer
50			{
51			protected $whitespaces = array(
52			'\s', // white space
53			"\xE2\x80\xAF", // non-breaking thin white space
54			"\xC2\xA0", // non-breaking space
55			);
56
57			/**
58			* Split the given input into tokens using whitespace as splitter
59			*
60			* The input can be a string or a tokenRegistry. If the input is a
61			* TokenRegistry, each item will be tokenized.
62			*
63			* @param string|\Org\Heigl\Hyphenator\Tokenizer\TokenRegistry $input The
64			* input to be tokenized
65			*
66			* @return \Org\Heigl\Hyphenator\Tokenizer\TokenRegistry
67			*/
68			public function run($input)
69			{
70			if ($input instanceof TokenRegistry) {
71			// Tokenize a TokenRegistry
72			foreach ($input as $token) {
73			if (! $token instanceof WordToken) {
74			continue;
75			}
76			$newTokens = $this->tokenize($token->get());
77			if ($newTokens == array($token)) {
78			continue;
79			}
80			$input->replace($token, $newTokens);
81			}
82
83			return $input ;
84			}
85
86			// Tokenize a simple string.
87			$array = $this->tokenize($input);
88			$registry = new TokenRegistry();
89			foreach ($array as $item) {
90			$registry->add($item);
91			}
92
93			return $registry;
94			}
95
96			/**
97			* Split the given string into tokens using whitespace.
98			*
99			* Each whitespace is placed in a WhitespaceToken and everything else is
100			* placed in a WordToken-Object
101			*
102			* @param string $input The String to tokenize
103			*
104			* @return Token
105			*/
106			private function tokenize($input)
107			{
108			$tokens = array();
109			$splits = preg_split("/([".implode("", $this->whitespaces)."]+)/u", $input, -1, PREG_SPLIT_DELIM_CAPTURE);
110
111			foreach ($splits as $split) {
112			if (preg_match("/^[".implode("", $this->whitespaces)."]+$/um", $split)) {
113			$tokens[] = new WhitespaceToken($split);
114			continue;
115			}
116			$tokens[] = new WordToken($split);
117			}
118
119			return $tokens;
120			}
121			}
122

heiglandreas / Org_Heigl_Hyphenator

Pull Request — master (#37)

WhitespaceTokenizer A

Complexity

Size/Duplication

Coupling/Cohesion

Importance

2 Methods

Duplication Side-by-Side

Filter issues like