TypoTokenizer::typoPatternFromAuthor() - Code Metrics - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

TypoTokenizer::typoPatternFromAuthor() C
last analyzed 2024-01-18 16:03 UTC

↳ Parent: TypoTokenizer

Complexity

Conditions	14
Paths	12

Size

Total Lines	79
Code Lines	51

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	51
c	0
b	0
f	0
dl	0
loc	79
rs	6.2666
cc	14
nc	12
nop	1

How to fix Long Method Complexity

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);

namespace App\Domain\Predict;

use App\Domain\Utils\TextUtil;
use App\Domain\Utils\WikiTextUtil;

/**
 * Tokenizing string in predefined typographic categories.
 * Used for typographic pattern analysis & recognition.
 * Class TypoTokenizer.
 */
class TypoTokenizer
{
    /**
     * Matches a whole token that is one of the placeholder markers inserted during pre-processing.
     * Group 1 holds the category name without the "PATTERN" prefix.
     */
    private const MARKER_REGEX = '#^PATTERN(INITIAL|URL|AND|COMMA|BIBABREV|PUNCTUATION)$#';

    /** Marker categories for which no source text is saved; their value is reported as '*'. */
    private const VALUELESS_CATEGORIES = ['COMMA', 'PUNCTUATION'];

    /**
     * Source texts saved by preprocessTypoPattern(), grouped by category (URL, BIBABREV, AND, INITIAL)
     * and kept in the order in which each category was matched.
     *
     * @var array<string, string[]>
     */
    private array $tokenValue = [];

    /**
     * Tokenize an author string into a typographic pattern.
     * See studies from CLÉO : http://bilbo.hypotheses.org/193 et http://bilbo.hypotheses.org/133 /111
     *
     * Categories emitted by this method :
     * INITIAL, URL, AND, COMMA, BIBABREV, PUNCTUATION, ALLNUMBER, DASHNUMBER, WITHNUMBER,
     * ALLLOWER, ALLUPPER, FIRSTUPPER, MIXED, UNKNOW.
     *
     * Example of the returned array (assuming the TextUtil/WikiTextUtil helpers leave this text unchanged) :
     * string => 'Penaud, Jean-Pierre'
     * pattern => 'FIRSTUPPER COMMA MIXED'
     * value => [ 0 => 'Penaud', 1 => '*', 2 => 'Jean-Pierre'].
     *
     * Consecutive initials ("J. R. R.") are reported as separate INITIAL tokens; they are not merged.
     *
     * @param string $text raw author string
     *
     * @return array 'string' (the untouched input), 'pattern' (space-separated categories) and
     *               'value' (one entry per category : the token text, the saved source text for
     *               URL/BIBABREV/AND/INITIAL, or '*' for COMMA/PUNCTUATION)
     */
    public function typoPatternFromAuthor(string $text): array
    {
        $modText = TextUtil::replaceNonBreakingSpaces($text);

        // unWikify or not ? remove wikilinks and bold/italic wikicode
        $modText = WikiTextUtil::unWikify($modText);

        // Pre-process : surround relevant typographic items with spaces and remember their source text.
        $this->tokenValue = [];
        $modText = $this->preprocessTypoPattern($modText);

        // PUNCTUATION conversion.
        // str_replace() works on whole strings, so multibyte punctuation is safe (unlike str_split()).
        $modText = str_replace($this->punctuationToConvert(), ' PATTERNPUNCTUATION ', $modText);

        // Split on runs of spaces. PREG_SPLIT_NO_EMPTY drops empty chunks only, so a lone "0" token
        // is kept (an empty() test would discard it because empty("0") is true).
        $tokens = preg_split('#[ ]+#', $modText, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        $categories = [];
        $values = [];
        foreach ($tokens as $tok) {
            if (preg_match(self::MARKER_REGEX, $tok, $matches) === 1) {
                $category = $matches[1]; // PATTERNAND -> AND
                $categories[] = $category;
                $values[] = in_array($category, self::VALUELESS_CATEGORIES, true)
                    ? '*'
                    : $this->nextSavedValue($category);

                continue;
            }

            $categories[] = $this->classifyToken($tok);
            $values[] = $tok;
        }

        return [
            'string' => $text,
            'pattern' => implode(' ', $categories),
            'value' => $values,
        ];
    }

    /**
     * Punctuation strings that must be converted to the PUNCTUATION marker.
     * Apostrophe and hyphen are excluded because they belong to mixed names
     * (example : "Pierre-Marie L'Anglois").
     * NOTE(review): the previous exclusion list named '-' twice; if the second entry was meant to be
     * another dash character, add it here.
     */
    private function punctuationToConvert(): array
    {
        return array_filter(
            TextUtil::ALL_PUNCTUATION,
            static function ($value): bool {
                return !in_array($value, ["'", '-'], true);
            }
        );
    }

    /**
     * Return (and consume) the next source text saved for the given marker category.
     * Falls back to '*' when nothing is left for that category, which happens when the marker word
     * was literally present in the input instead of being produced by preprocessTypoPattern().
     */
    private function nextSavedValue(string $category): string
    {
        if (empty($this->tokenValue[$category])) {
            return '*';
        }

        return (string) array_shift($this->tokenValue[$category]);
    }

    /**
     * Classify a token that is not a placeholder marker. Checks run from most to least specific.
     */
    private function classifyToken(string $tok): string
    {
        if (preg_match('#^\d+$#', $tok) === 1) {
            return 'ALLNUMBER';
        }
        if (preg_match('#^[0-9\-]+$#', $tok) === 1) {
            return 'DASHNUMBER';
        }
        if (preg_match('#\d#', $tok) === 1) {
            return 'WITHNUMBER';
        }
        if (mb_strtolower($tok, 'UTF-8') === $tok) {
            return 'ALLLOWER';
        }
        if (mb_strtoupper($tok, 'UTF-8') === $tok) {
            return 'ALLUPPER';
        }

        // mb_substr() cuts on characters ; byte-wise substr() would split a leading multibyte
        // letter ("Émile") and make this test fail.
        $first = mb_substr($tok, 0, 1, 'UTF-8');
        $rest = mb_substr($tok, 1, null, 'UTF-8');
        if (mb_strtoupper($first, 'UTF-8') === $first && mb_strtolower($rest, 'UTF-8') === $rest) {
            return 'FIRSTUPPER';
        }

        // "u" modifier : match letters as UTF-8 characters, not as raw bytes.
        if (preg_match('#\p{L}#u', $tok) === 1) {
            return 'MIXED';
        }

        return 'UNKNOW';
    }

    /**
     * Pre-process text : replace relevant typographic items with space-padded PATTERN* markers.
     * The matched source text is saved by category in $tokenValue (nothing is saved for commas).
     * Patterns are applied in the listed order, each over the whole text, with at most
     * 40 replacements per pattern.
     */
    private function preprocessTypoPattern(string $modText): string
    {
        return preg_replace_callback_array(
            [
                // URL
                '#\bhttps?://[^ \]]+#i' => function ($match): string {
                    // '#https?\:\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+\#]*[\w\-\@?^=%&amp;/~\+#])?#'
                    $this->tokenValue['URL'][] = $match[0];

                    return ' PATTERNURL ';
                },
                // BIBABREV : "dir.", "trad.", "(dir.)", "[dir.]", etc.
                // TODO: regex flaw : "(" not evaluated in BIBABREV. Example : "(dir.)"
                '#\b[(\[]?(collectif|coll\.|dir\.|trad\.|coord\.|ill\.)[)\]]?#i' => function ($match): string {
                    $this->tokenValue['BIBABREV'][] = $match[0]; // [1] = dir

                    return ' PATTERNBIBABREV ';
                },
                // AND (the saved text includes the surrounding spaces)
                '# (et|and|&|with|avec|e) #i' => function ($match): string {
                    $this->tokenValue['AND'][] = $match[0];

                    return ' PATTERNAND ';
                },
                // COMMA
                '#,#' => function (): string {
                    return ' PATTERNCOMMA ';
                },
                // INITIAL : 2) convert letter ("A.") or junior ("Jr.") or senior ("Sr.")
                // extract initial before "." converted in PUNCTUATION
                // Note : \b word boundary match between "L" and "'Amour" in "L'Amour"  (for [A-Z]\b)
                // \b([A-Z]\. |[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)+ for grouping "A. B." in same INITIAL ?
                "#\b([A-Z]\.|[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)#" => function ($match): string {
                    $this->tokenValue['INITIAL'][] = $match[0];

                    return ' PATTERNINITIAL ';
                },
            ],
            $modText,
            40
        );
    }
}


1			<?php
2			/*
3			* This file is part of dispositif/wikibot application (@github)
4			* 2019-2023 © Philippe M./Irønie <[email protected]>
5			* For the full copyright and MIT license information, view the license file.
6			*/
7
8			declare(strict_types=1);
9
10			namespace App\Domain\Predict;
11
12			use App\Domain\Utils\TextUtil;
13			use App\Domain\Utils\WikiTextUtil;
14
15			/**
16			* Tokenizing string in predefined typographic categories.
17			* Used for typographic pattern analysis & recognition.
18			* Class TypoTokenizer.
19			*/
20			class TypoTokenizer
21			{
22			private array $tokenValue = [];
23
24			/**
25			* Tokenize into typographic pattern.
26			* See studies from CLÉO : http://bilbo.hypotheses.org/193 et http://bilbo.hypotheses.org/133 /111
27			* ALLUPPER, FIRSTUPPER, ALLLOWER, MIXED, INITIAL, ALLNUMBER, WITHNUMBER, DASHNUMBER, URL, ITALIC, BIBABREV, AND,
28			* COMMA, PUNCTUATION,
29			* Example of the returned array :
30			* string => 'Penaud, Jean-Pierre'
31			* pattern => 'FIRSTUPPER COMMA MIXED'
32			* tokens => [ 0 => 'Penaud', 1 => ',', 2 => 'Jean-Pierre'].
33			*/
34			public function typoPatternFromAuthor(string $text): array
35			{
36			$res = [];
37			$res['string'] = $text;
38			$modText = TextUtil::replaceNonBreakingSpaces($text);
39
40			// unWikify or not ? remove wikilinks and bold/italic wikicode
41			$modText = WikiTextUtil::unWikify($modText);
42
43			/*
44			* Pre-process : add spaces between relevant typographic items
45			*/
46			$this->tokenValue = [];
47			$modText = $this->preprocessTypoPattern($modText);
48
49			// PUNCTUATION conversion
50			$punctuationColl = array_filter(
51			TextUtil::ALL_PUNCTUATION,
52			function ($value) {
53			// skip punctuation chars from mixed names (example : "Pierre-Marie L'Anglois")
54			return !in_array($value, ["'", '-', '-']);
55			}
56			);
57			// don't use str_split() which cuts on 1 byte length (≠ multibytes chars)
58			$modText = str_replace($punctuationColl, ' PATTERNPUNCTUATION ', $modText);
59
60			// "BUBBLES COMMA DROPS COMMA AND PARTICLES"
61
62			// Split the string
63			$tokens = preg_split('#[ ]+#', $modText);
64			$res['pattern'] = '';
65			foreach ($tokens as $tok) {
66			if (empty($tok)) {
67			continue;
68			}
69			if (preg_match('#^(PATTERNINITIAL|PATTERNURL|PATTERNAND|PATTERNCOMMA|PATTERNBIBABREV|PATTERNPUNCTUATION)$#', (string) $tok, $matches) > 0) {
70
71			$shortpattern = str_replace('PATTERN','', (string) $tok);
72			$res['pattern'] .= ' '.$shortpattern; // PATTERNAND -> AND
73			if (in_array($matches[1], ['PATTERNCOMMA', 'PATTERNPUNCTUATION']) || empty($matches[1])) {
74			$res['value'][] = '*';
75			} else {
76			$res['value'][] = current($this->tokenValue[$shortpattern]);
77			next($this->tokenValue[$shortpattern]);
78			}
79			//"J. R . R." => INITIAL (1 seule fois)
80			// $res = str_replace('INITIAL INITIAL', 'INITIAL', $res);
81			} elseif (preg_match('#^\d+$#', (string) $tok) > 0) {
82			$res['pattern'] .= ' ALLNUMBER';
83			$res['value'][] = $tok;
84			} elseif (preg_match('#^[0-9\-]+$#', (string) $tok) > 0) {
85			$res['pattern'] .= ' DASHNUMBER';
86			$res['value'][] = $tok;
87			} elseif (preg_match('#\d#', (string) $tok) > 0) {
88			$res['pattern'] .= ' WITHNUMBER';
89			$res['value'][] = $tok;
90			} elseif (mb_strtolower((string) $tok, 'UTF-8') === $tok) {
91			$res['pattern'] .= ' ALLLOWER';
92			$res['value'][] = $tok;
93			} elseif (mb_strtoupper((string) $tok, 'UTF-8') === $tok) {
94			$res['pattern'] .= ' ALLUPPER';
95			$res['value'][] = $tok;
96			} elseif (mb_strtoupper(substr((string) $tok, 0, 1), 'UTF-8') === substr((string) $tok, 0, 1)
97			&& mb_strtolower(substr((string) $tok, 1), 'UTF-8') === substr((string) $tok, 1)
98			) {
99			$res['pattern'] .= ' FIRSTUPPER';
100			$res['value'][] = $tok;
101			} elseif (preg_match('#[a-zA-Zàéù]#', (string) $tok) > 0) {
102			$res['pattern'] .= ' MIXED';
103			$res['value'][] = $tok;
104			} else {
105			$res['pattern'] .= ' UNKNOW';
106			$res['value'][] = $tok;
107			}
108			}
109
110			$res['pattern'] = trim($res['pattern']);
111
112			return $res;
113			}
114
115			/**
116			* Pre-process text : add spaces between relevant typographic items.
117			* Save values by types in $tokenValue.
118			*
119			*
120			*/
121			private function preprocessTypoPattern(string $modText): string
122			{
123			return preg_replace_callback_array(
124			[
125			// URL
126			'#\bhttps?://[^ \]]+#i' => function ($match): string {
127			// '#https?\:\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+\#]*[\w\-\@?^=%&/~\+#])?#'
128			$this->tokenValue['URL'][] = $match[0];
129
130			return ' PATTERNURL ';
131			},
132			// BIBABREV : "dir.", "trad.", "(dir.)", "[dir.]", etc.
133			// TODO: regex flaw : "(" not evaluated in BIBABREV. Example : "(dir.)"
134			'#\b[(\[]?(collectif|coll\.|dir\.|trad\.|coord\.|ill\.)[)\]]?#i' => function ($match): string {
135			$this->tokenValue['BIBABREV'][] = $match[0]; // [1] = dir
136
137			return ' PATTERNBIBABREV ';
138			},
139			// AND
140			'# (et|and|&|with|avec|e) #i' => function ($match): string {
141			$this->tokenValue['AND'][] = $match[0];
142
143			return ' PATTERNAND ';
144			},
145			// COMMA
146			'#,#' => function (): string {
147			return ' PATTERNCOMMA ';
148			},
149			// INITIAL : 2) convert letter ("A.") or junior ("Jr.") or senior ("Sr.")
150			// extract initial before "." converted in PUNCTUATION
151			// Note : \b word boundary match between "L" and "'Amour" in "L'Amour" (for [A-Z]\b)
152			// \b([A-Z]\. |[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)+ for grouping "A. B." in same INITIAL ?
153			"#\b([A-Z]\.|[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)#" => function ($match): string {
154			$this->tokenValue['INITIAL'][] = $match[0];
155
156			return ' PATTERNINITIAL ';
157			},
158			],
159			$modText,
160			40
161			);
162			}
163			}
164

Dispositif / Wikibot

TypoTokenizer::typoPatternFromAuthor() C last analyzed 2024-01-18 16:03 UTC

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Duplication Side-by-Side

Filter issues like

TypoTokenizer::typoPatternFromAuthor() C
last analyzed 2024-01-18 16:03 UTC