TypoTokenizer - Code Metrics - Inspection of "refac OuvrageMix" - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Branch — master (fd6b1a)

by Dispositif

created 2023-04-28 22:00 UTC

TypoTokenizer A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	147
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	15
eloc	70
c	0
b	0
f	0
dl	0
loc	147
rs	10

2 Methods

Rating	Name	Duplication	Size	Complexity
A	preprocessTypoPattern()	0	40	1
C	typoPatternFromAuthor()	0	79	14

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);

namespace App\Domain\Predict;

use App\Domain\Utils\TextUtil;
use App\Domain\Utils\WikiTextUtil;

/**
 * Tokenizing string in predefined typographic categories.
 * Used for typographic pattern analysis & recognition.
 * Class TypoTokenizer.
 */
class TypoTokenizer
{
    /**
     * Original text fragments captured by preprocessTypoPattern(), grouped by
     * category ('URL', 'BIBABREV', 'AND', 'INITIAL') in capture order.
     * Reset at the start of every typoPatternFromAuthor() call.
     *
     * @var array<string, string[]>
     */
    private $tokenValue = [];

    /**
     * Tokenize into typographic pattern.
     * See studies from CLÉO : http://bilbo.hypotheses.org/193 and http://bilbo.hypotheses.org/133 /111
     * ALLUPPER, FIRSTUPPER, ALLLOWER, MIXED, INITIAL, ALLNUMBER, WITHNUMBER, DASHNUMBER, URL, ITALIC, BIBABREV, AND,
     * COMMA, PUNCTUATION,
     * Example of the returned array :
     * string => 'Penaud, Jean-Pierre'
     * pattern => 'FIRSTUPPER COMMA MIXED'
     * value => [ 0 => 'Penaud', 1 => '*', 2 => 'Jean-Pierre'].
     *
     * 'value' holds one entry per word of 'pattern' : the token itself, the
     * captured original text for URL/BIBABREV/AND/INITIAL, or '*' for COMMA
     * and PUNCTUATION. It is an empty array when the text yields no token.
     *
     * @param string $text raw author string (may contain wikicode)
     *
     * @return array{string: string, pattern: string, value: array} (see example)
     */
    public function typoPatternFromAuthor(string $text): array
    {
        $res = [];
        $res['string'] = $text;
        $modText = TextUtil::replaceNonBreakingSpaces($text);

        // unWikify or not ? remove wikilinks and bold/italic wikicode
        $modText = WikiTextUtil::unWikify($modText);

        // Pre-process : replace relevant typographic items with space-delimited
        // PATTERN* markers and remember the replaced text in $this->tokenValue.
        $this->tokenValue = [];
        $modText = $this->preprocessTypoPattern($modText);

        // PUNCTUATION conversion
        $punctuationColl = array_filter(
            TextUtil::ALL_PUNCTUATION,
            static function ($value): bool {
                // skip punctuation chars from mixed names (example : "Pierre-Marie L'Anglois")
                // NOTE(review): '-' is listed twice; the second entry may originally
                // have been another dash character — confirm against the repository.
                return !in_array($value, ["'", '-', '-'], true);
            }
        );
        // don't use str_split() which cuts on 1 byte length (≠ multibytes chars)
        $modText = str_replace($punctuationColl, ' PATTERNPUNCTUATION ', $modText);

        // Split on spaces. PREG_SPLIT_NO_EMPTY drops only empty strings, so a
        // literal "0" token is kept (a plain empty() test would discard it).
        $tokens = preg_split('#[ ]+#', $modText, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        $res['pattern'] = '';
        $res['value'] = [];
        $patternWords = [];
        $cursor = []; // next unread index in $this->tokenValue, per category

        foreach ($tokens as $tok) {
            if (preg_match('#^(PATTERNINITIAL|PATTERNURL|PATTERNAND|PATTERNCOMMA|PATTERNBIBABREV|PATTERNPUNCTUATION)$#', $tok, $matches) > 0) {
                $shortpattern = str_replace('PATTERN', '', $tok); // PATTERNAND -> AND
                $patternWords[] = $shortpattern;

                if (in_array($matches[1], ['PATTERNCOMMA', 'PATTERNPUNCTUATION'], true)) {
                    // no original text is stored for these two categories
                    $res['value'][] = '*';
                    continue;
                }

                // Consume the captured fragments in order. false is stored when no
                // fragment is left (e.g. the input itself contained a literal marker).
                $position = $cursor[$shortpattern] ?? 0;
                $res['value'][] = $this->tokenValue[$shortpattern][$position] ?? false;
                $cursor[$shortpattern] = $position + 1;
                // TODO : "J. R . R." => INITIAL (only once) ? i.e. collapse 'INITIAL INITIAL' into 'INITIAL'
                continue;
            }

            $patternWords[] = $this->classifyToken($tok);
            $res['value'][] = $tok;
        }

        $res['pattern'] = implode(' ', $patternWords);

        return $res;
    }

    /**
     * Typographic category of a plain token (not a PATTERN* marker).
     * The tests are ordered : the first matching category wins.
     *
     * @param string $tok non-empty token without spaces
     *
     * @return string ALLNUMBER, DASHNUMBER, WITHNUMBER, ALLLOWER, ALLUPPER, FIRSTUPPER, MIXED or UNKNOW
     */
    private function classifyToken(string $tok): string
    {
        if (preg_match('#^\d+$#', $tok) > 0) {
            return 'ALLNUMBER';
        }
        if (preg_match('#^[0-9\-]+$#', $tok) > 0) {
            return 'DASHNUMBER';
        }
        if (preg_match('#\d#', $tok) > 0) {
            return 'WITHNUMBER';
        }
        if (mb_strtolower($tok, 'UTF-8') === $tok) {
            return 'ALLLOWER';
        }
        if (mb_strtoupper($tok, 'UTF-8') === $tok) {
            return 'ALLUPPER';
        }

        // Slice by character, not by byte : with substr() a multibyte first
        // letter ("Émile") is cut in half and never recognised as upper-case.
        $firstChar = mb_substr($tok, 0, 1, 'UTF-8');
        $remainder = mb_substr($tok, 1, null, 'UTF-8');
        if (mb_strtoupper($firstChar, 'UTF-8') === $firstChar
            && mb_strtolower($remainder, 'UTF-8') === $remainder
        ) {
            return 'FIRSTUPPER';
        }

        if (preg_match('#[a-zA-Zàéù]#', $tok) > 0) {
            return 'MIXED';
        }

        // sic : historical spelling, kept because it is part of the returned pattern
        return 'UNKNOW';
    }

    /**
     * Pre-process text : add spaces between relevant typographic items.
     * Each recognised item is replaced by a space-delimited PATTERN* marker and
     * its original text is saved by type in $tokenValue (except for commas).
     * Patterns are applied in the order listed, with at most 40 replacements
     * per pattern.
     *
     * @param string $modText
     *
     * @return string
     */
    private function preprocessTypoPattern(string $modText): string
    {
        return preg_replace_callback_array(
            [
                // URL
                '#\bhttps?://[^ \]]+#i' => function ($match): string {
                    // stricter alternative, not used :
                    // '#https?\:\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+\#]*[\w\-\@?^=%&/~\+#])?#'
                    $this->tokenValue['URL'][] = $match[0];

                    return ' PATTERNURL ';
                },
                // BIBABREV : "dir.", "trad.", "(dir.)", "[dir.]", etc.
                // TODO: regex flaw : "(" not evaluated in BIBABREV. Example : "(dir.)"
                '#\b[(\[]?(collectif|coll\.|dir\.|trad\.|coord\.|ill\.)[)\]]?#i' => function ($match): string {
                    $this->tokenValue['BIBABREV'][] = $match[0]; // [1] = dir

                    return ' PATTERNBIBABREV ';
                },
                // AND (the stored value includes the surrounding spaces)
                '# (et|and|&|with|avec|e) #i' => function ($match): string {
                    $this->tokenValue['AND'][] = $match[0];

                    return ' PATTERNAND ';
                },
                // COMMA : nothing to store
                '#,#' => static function (): string {
                    return ' PATTERNCOMMA ';
                },
                // INITIAL : 2) convert letter ("A.") or junior ("Jr.") or senior ("Sr.")
                // extract initial before "." converted in PUNCTUATION
                // Note : \b word boundary match between "L" and "'Amour" in "L'Amour"  (for [A-Z]\b)
                // \b([A-Z]\. |[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)+ for grouping "A. B." in same INITIAL ?
                '#\b([A-Z]\.|[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)#' => function ($match): string {
                    $this->tokenValue['INITIAL'][] = $match[0];

                    return ' PATTERNINITIAL ';
                },
            ],
            $modText,
            40
        );
    }
}


1			<?php
2			/*
3			* This file is part of dispositif/wikibot application (@github)
4			* 2019-2023 © Philippe M./Irønie <[email protected]>
5			* For the full copyright and MIT license information, view the license file.
6			*/
7
8			declare(strict_types=1);
9
10			namespace App\Domain\Predict;
11
12			use App\Domain\Utils\TextUtil;
13			use App\Domain\Utils\WikiTextUtil;
14
15			/**
16			* Tokenizing string in predefined typographic categories.
17			* Used for typographic pattern analysis & recognition.
18			* Class TypoTokenizer.
19			*/
20			class TypoTokenizer
21			{
22			private $tokenValue = [];
23
24			/**
25			* Tokenize into typographic pattern.
26			* See studies from CLÉO : http://bilbo.hypotheses.org/193 et http://bilbo.hypotheses.org/133 /111
27			* ALLUPPER, FIRSTUPPER, ALLLOWER, MIXED, INITIAL, ALLNUMBER, WITHNUMBER, DASHNUMBER, URL, ITALIC, BIBABREV, AND,
28			* COMMA, PUNCTUATION,
29			* Example of the returned array :
30			* string => 'Penaud, Jean-Pierre'
31			* pattern => 'FIRSTUPPER COMMA MIXED'
32			* tokens => [ 0 => 'Penaud', 1 => ',', 2 => 'Jean-Pierre'].
33			*
34			* @param string $text
35			*
36			* @return array (see example)
37			*/
38			public function typoPatternFromAuthor(string $text): array
39			{
40			$res = [];
41			$res['string'] = $text;
42			$modText = TextUtil::replaceNonBreakingSpaces($text);
43
44			// unWikify or not ? remove wikilinks and bold/italic wikicode
45			$modText = WikiTextUtil::unWikify($modText);
46
47			/*
48			* Pre-process : add spaces between relevant typographic items
49			*/
50			$this->tokenValue = [];
51			$modText = $this->preprocessTypoPattern($modText);
52
53			// PUNCTUATION conversion
54			$punctuationColl = array_filter(
55			TextUtil::ALL_PUNCTUATION,
56			function ($value) {
57			// skip punctuation chars from mixed names (example : "Pierre-Marie L'Anglois")
58			return !in_array($value, ["'", '-', '-']);
59			}
60			);
61			// don't use str_split() which cuts on 1 byte length (≠ multibytes chars)
62			$modText = str_replace($punctuationColl, ' PATTERNPUNCTUATION ', $modText);
63
64			// "BUBBLES COMMA DROPS COMMA AND PARTICLES"
65
66			// Split the string
67			$tokens = preg_split('#[ ]+#', $modText);
68			$res['pattern'] = '';
69			foreach ($tokens as $tok) {
70			if (empty($tok)) {
71			continue;
72			}
73			if (preg_match('#^(PATTERNINITIAL|PATTERNURL|PATTERNAND|PATTERNCOMMA|PATTERNBIBABREV|PATTERNPUNCTUATION)$#', $tok, $matches) > 0) {
74
75			$shortpattern = str_replace('PATTERN','', $tok);
76			$res['pattern'] .= ' '.$shortpattern; // PATTERNAND -> AND
77			if (in_array($matches[1], ['PATTERNCOMMA', 'PATTERNPUNCTUATION']) || empty($matches[1])) {
78			$res['value'][] = '*';
79			} else {
80			$res['value'][] = current($this->tokenValue[$shortpattern]);
81			next($this->tokenValue[$shortpattern]);
82			}
83			//"J. R . R." => INITIAL (1 seule fois)
84			// $res = str_replace('INITIAL INITIAL', 'INITIAL', $res);
85			} elseif (preg_match('#^\d+$#', $tok) > 0) {
86			$res['pattern'] .= ' ALLNUMBER';
87			$res['value'][] = $tok;
88			} elseif (preg_match('#^[0-9\-]+$#', $tok) > 0) {
89			$res['pattern'] .= ' DASHNUMBER';
90			$res['value'][] = $tok;
91			} elseif (preg_match('#\d#', $tok) > 0) {
92			$res['pattern'] .= ' WITHNUMBER';
93			$res['value'][] = $tok;
94			} elseif (mb_strtolower($tok, 'UTF-8') === $tok) {
95			$res['pattern'] .= ' ALLLOWER';
96			$res['value'][] = $tok;
97			} elseif (mb_strtoupper($tok, 'UTF-8') === $tok) {
98			$res['pattern'] .= ' ALLUPPER';
99			$res['value'][] = $tok;
100			} elseif (mb_strtoupper(substr($tok, 0, 1), 'UTF-8') === substr($tok, 0, 1)
101			&& mb_strtolower(substr($tok, 1), 'UTF-8') === substr($tok, 1)
102			) {
103			$res['pattern'] .= ' FIRSTUPPER';
104			$res['value'][] = $tok;
105			} elseif (preg_match('#[a-zA-Zàéù]#', $tok) > 0) {
106			$res['pattern'] .= ' MIXED';
107			$res['value'][] = $tok;
108			} else {
109			$res['pattern'] .= ' UNKNOW';
110			$res['value'][] = $tok;
111			}
112			}
113
114			$res['pattern'] = trim($res['pattern']);
115
116			return $res;
117			}
118
119			/**
120			* Pre-process text : add spaces between relevant typographic items.
121			* Save values by types in $tokenValue.
122			*
123			* @param string $modText
124			*
125			* @return string
126			*/
127			private function preprocessTypoPattern(string $modText): string
128			{
129			return preg_replace_callback_array(
130			[
131			// URL
132			'#\bhttps?://[^ \]]+#i' => function ($match): string {
133			// '#https?\:\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+\#]*[\w\-\@?^=%&/~\+#])?#'
134			$this->tokenValue['URL'][] = $match[0];
135
136			return ' PATTERNURL ';
137			},
138			// BIBABREV : "dir.", "trad.", "(dir.)", "[dir.]", etc.
139			// TODO: regex flaw : "(" not evaluated in BIBABREV. Example : "(dir.)"
140			'#\b[(\[]?(collectif|coll\.|dir\.|trad\.|coord\.|ill\.)[)\]]?#i' => function ($match): string {
141			$this->tokenValue['BIBABREV'][] = $match[0]; // [1] = dir
142
143			return ' PATTERNBIBABREV ';
144			},
145			// AND
146			'# (et|and|&|with|avec|e) #i' => function ($match): string {
147			$this->tokenValue['AND'][] = $match[0];
148
149			return ' PATTERNAND ';
150			},
151			// COMMA
152			'#,#' => function (): string {
153			return ' PATTERNCOMMA ';
154			},
155			// INITIAL : 2) convert letter ("A.") or junior ("Jr.") or senior ("Sr.")
156			// extract initial before "." converted in PUNCTUATION
157			// Note : \b word boundary match between "L" and "'Amour" in "L'Amour" (for [A-Z]\b)
158			// \b([A-Z]\. |[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)+ for grouping "A. B." in same INITIAL ?
159			"#\b([A-Z]\.|[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)#" => function ($match): string {
160			$this->tokenValue['INITIAL'][] = $match[0];
161
162			return ' PATTERNINITIAL ';
163			},
164			],
165			$modText,
166			40
167			);
168			}
169			}
170

Dispositif / Wikibot

Branch — master (fd6b1a)

TypoTokenizer A

Complexity

Size/Duplication

Importance

2 Methods

Duplication Side-by-Side

Filter issues like