TypoTokenizer::typoPatternFromAuthor()   C
last analyzed

Complexity

Conditions 14
Paths 12

Size

Total Lines 79
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 51
c 0
b 0
f 0
dl 0
loc 79
rs 6.2666
cc 14
nc 12
nop 1

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Predict;
11
12
use App\Domain\Utils\TextUtil;
13
use App\Domain\Utils\WikiTextUtil;
14
15
/**
16
 * Tokenizing string in predefined typographic categories.
17
 * Used for typographic pattern analysis & recognition.
18
 * Class TypoTokenizer.
19
 */
20
class TypoTokenizer
{
    /**
     * Matches the placeholder words inserted during pre-processing and
     * captures the category name without its "PATTERN" prefix.
     */
    private const PLACEHOLDER_REGEX = '#^PATTERN(INITIAL|URL|AND|COMMA|BIBABREV|PUNCTUATION)$#';

    /**
     * Source fragments saved during pre-processing, keyed by category
     * (URL, BIBABREV, AND, INITIAL), in order of appearance per category.
     *
     * @var array<string, list<string>>
     */
    private array $tokenValue = [];

    /**
     * Tokenize into typographic pattern.
     * See studies from CLÉO : http://bilbo.hypotheses.org/193 et http://bilbo.hypotheses.org/133 /111
     * ALLUPPER, FIRSTUPPER, ALLLOWER, MIXED, INITIAL, ALLNUMBER, WITHNUMBER, DASHNUMBER, URL, ITALIC, BIBABREV, AND,
     * COMMA, PUNCTUATION,
     * Example of the returned array :
     * string => 'Penaud, Jean-Pierre'
     * pattern => 'FIRSTUPPER COMMA MIXED'
     * value => [ 0 => 'Penaud', 1 => '*', 2 => 'Jean-Pierre'].
     *
     * COMMA and PUNCTUATION tokens always carry the value '*'. The 'value' key
     * is always present (empty list when the text yields no token).
     *
     * @param string $text raw author string (may contain wikicode)
     *
     * @return array{string: string, pattern: string, value: list<string>}
     */
    public function typoPatternFromAuthor(string $text): array
    {
        $res = ['string' => $text, 'pattern' => '', 'value' => []];

        $modText = TextUtil::replaceNonBreakingSpaces($text);
        // unWikify or not ? remove wikilinks and bold/italic wikicode
        $modText = WikiTextUtil::unWikify($modText);

        // Pre-process : isolate relevant typographic items as placeholder words.
        $this->tokenValue = [];
        $modText = $this->preprocessTypoPattern($modText);
        $modText = $this->replacePunctuation($modText);

        // PREG_SPLIT_NO_EMPTY drops empty pieces without discarding the token "0",
        // which an empty() test would wrongly treat as empty.
        $tokens = preg_split('#[ ]+#', $modText, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        $categories = [];
        foreach ($tokens as $tok) {
            if (preg_match(self::PLACEHOLDER_REGEX, $tok, $matches) === 1) {
                // Each placeholder is reported on its own : consecutive INITIAL are not merged.
                $categories[] = $matches[1]; // PATTERNAND -> AND
                $res['value'][] = $this->consumeSavedValue($matches[1], $tok);

                continue;
            }

            $categories[] = $this->classifyToken($tok);
            $res['value'][] = $tok;
        }

        $res['pattern'] = implode(' ', $categories);

        return $res;
    }

    /**
     * Replace punctuation chars with the PUNCTUATION placeholder word.
     */
    private function replacePunctuation(string $text): string
    {
        // Skip punctuation chars from mixed names (example : "Pierre-Marie L'Anglois").
        // NOTE(review): the exclusion list previously contained '-' twice; the second
        // entry was possibly another dash character lost in transcoding — confirm.
        $punctuationColl = array_filter(
            TextUtil::ALL_PUNCTUATION,
            static fn ($char): bool => !in_array($char, ["'", '-'], true)
        );

        // don't use str_split() which cuts on 1 byte length (≠ multibytes chars)
        return str_replace($punctuationColl, ' PATTERNPUNCTUATION ', $text);
    }

    /**
     * Give the value associated with a placeholder of the given category.
     *
     * COMMA and PUNCTUATION have no saved value and yield '*'. Other categories
     * consume the next fragment saved by preprocessTypoPattern(). When nothing
     * was saved (e.g. the input text literally contained the placeholder word),
     * the raw token is returned instead of raising an error.
     */
    private function consumeSavedValue(string $category, string $rawToken): string
    {
        if ('COMMA' === $category || 'PUNCTUATION' === $category) {
            return '*';
        }
        if (empty($this->tokenValue[$category])) {
            return $rawToken;
        }

        return (string) array_shift($this->tokenValue[$category]);
    }

    /**
     * Typographic category of a plain (non placeholder) token.
     * The order of the tests matters : the first matching category wins.
     */
    private function classifyToken(string $tok): string
    {
        if (preg_match('#^\d+$#', $tok) > 0) {
            return 'ALLNUMBER';
        }
        if (preg_match('#^[0-9\-]+$#', $tok) > 0) {
            return 'DASHNUMBER';
        }
        if (preg_match('#\d#', $tok) > 0) {
            return 'WITHNUMBER';
        }
        if (mb_strtolower($tok, 'UTF-8') === $tok) {
            return 'ALLLOWER';
        }
        if (mb_strtoupper($tok, 'UTF-8') === $tok) {
            return 'ALLUPPER';
        }

        // mb_substr() : a byte-wise substr() would cut a multibyte first letter
        // ("Émile") in half and miss FIRSTUPPER.
        $firstChar = mb_substr($tok, 0, 1, 'UTF-8');
        $remainder = mb_substr($tok, 1, null, 'UTF-8');
        if (mb_strtoupper($firstChar, 'UTF-8') === $firstChar
            && mb_strtolower($remainder, 'UTF-8') === $remainder
        ) {
            return 'FIRSTUPPER';
        }

        // \p{L} with the "u" modifier matches any Unicode letter instead of raw bytes.
        if (preg_match('#\p{L}#u', $tok) > 0) {
            return 'MIXED';
        }

        return 'UNKNOW';
    }

    /**
     * Pre-process text : add spaces between relevant typographic items.
     * Save values by types in $tokenValue.
     *
     * Patterns are applied one after the other on the whole text, each one
     * limited to 40 replacements. If the regex engine fails, the text is
     * returned unchanged.
     */
    private function preprocessTypoPattern(string $modText): string
    {
        $replaced = preg_replace_callback_array(
            [
                // URL
                '#\bhttps?://[^ \]]+#i' => function (array $match): string {
                    $this->tokenValue['URL'][] = $match[0];

                    return ' PATTERNURL ';
                },
                // BIBABREV : "dir.", "trad.", "(dir.)", "[dir.]", etc.
                // TODO: regex flaw : "(" not evaluated in BIBABREV. Example : "(dir.)"
                '#\b[(\[]?(collectif|coll\.|dir\.|trad\.|coord\.|ill\.)[)\]]?#i' => function (array $match): string {
                    $this->tokenValue['BIBABREV'][] = $match[0]; // [1] = dir

                    return ' PATTERNBIBABREV ';
                },
                // AND
                '# (et|and|&|with|avec|e) #i' => function (array $match): string {
                    $this->tokenValue['AND'][] = $match[0];

                    return ' PATTERNAND ';
                },
                // COMMA (no value saved)
                '#,#' => static function (): string {
                    return ' PATTERNCOMMA ';
                },
                // INITIAL : 2) convert letter ("A.") or junior ("Jr.") or senior ("Sr.")
                // extract initial before "." converted in PUNCTUATION
                // Note : \b word boundary match between "L" and "'Amour" in "L'Amour"  (for [A-Z]\b)
                // \b([A-Z]\. |[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)+ for grouping "A. B." in same INITIAL ?
                "#\b([A-Z]\.|[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)#" => function (array $match): string {
                    $this->tokenValue['INITIAL'][] = $match[0];

                    return ' PATTERNINITIAL ';
                },
            ],
            $modText,
            40
        );

        // preg_replace_callback_array() returns null on PCRE error.
        return $replaced ?? $modText;
    }
}
164