Passed
Branch master (fd6b1a)
by Dispositif
03:51 queued 01:13
created

TypoTokenizer   A

Complexity

Total Complexity 15

Size/Duplication

Total Lines 147
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 15
eloc 70
c 0
b 0
f 0
dl 0
loc 147
rs 10

2 Methods

Rating   Name   Duplication   Size   Complexity  
A preprocessTypoPattern() 0 40 1
C typoPatternFromAuthor() 0 79 14
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Predict;
11
12
use App\Domain\Utils\TextUtil;
13
use App\Domain\Utils\WikiTextUtil;
14
15
/**
 * Tokenizing string in predefined typographic categories.
 * Used for typographic pattern analysis & recognition.
 * Class TypoTokenizer.
 */
class TypoTokenizer
{
    /**
     * Original text fragments captured by preprocessTypoPattern(), grouped by
     * category name ('URL', 'BIBABREV', 'AND', 'INITIAL') in order of appearance.
     * Reset at the start of every typoPatternFromAuthor() call.
     *
     * @var array<string, string[]>
     */
    private $tokenValue = [];

    /**
     * Tokenize into typographic pattern.
     * See studies from CLÉO : http://bilbo.hypotheses.org/193 et http://bilbo.hypotheses.org/133 /111
     * ALLUPPER, FIRSTUPPER, ALLLOWER, MIXED, INITIAL, ALLNUMBER, WITHNUMBER, DASHNUMBER, URL, ITALIC, BIBABREV, AND,
     * COMMA, PUNCTUATION,
     * Example of the returned array :
     * string => 'Penaud, Jean-Pierre'
     * pattern => 'FIRSTUPPER COMMA MIXED'
     * value => [ 0 => 'Penaud', 1 => '*', 2 => 'Jean-Pierre'].
     *
     * COMMA and PUNCTUATION tokens always get the value '*'. URL, BIBABREV, AND and INITIAL
     * tokens get the original text matched during pre-processing.
     *
     * @param string $text Raw author string (may contain wikicode and non-breaking spaces)
     *
     * @return array{string: string, pattern: string, value: array<int, string>}
     */
    public function typoPatternFromAuthor(string $text): array
    {
        $res = [];
        $res['string'] = $text;
        $modText = TextUtil::replaceNonBreakingSpaces($text);

        // unWikify or not ? remove wikilinks and bold/italic wikicode
        $modText = WikiTextUtil::unWikify($modText);

        /*
         * Pre-process : add spaces between relevant typographic items
         */
        $this->tokenValue = [];
        $modText = $this->preprocessTypoPattern($modText);

        // PUNCTUATION conversion
        $punctuationColl = array_filter(
            TextUtil::ALL_PUNCTUATION,
            static function ($value) {
                // skip punctuation chars from mixed names (example : "Pierre-Marie L'Anglois")
                return !in_array($value, ["'", '-', '-'], true);
            }
        );
        // don't use str_split() which cuts on 1 byte length (≠ multibytes chars)
        $modText = str_replace($punctuationColl, ' PATTERNPUNCTUATION ', $modText);

        // "BUBBLES COMMA  DROPS COMMA  AND PARTICLES"

        // Split on spaces. PREG_SPLIT_NO_EMPTY drops only truly empty chunks, so a
        // legitimate "0" token is kept (empty("0") is true in PHP).
        $tokens = preg_split('#[ ]+#', $modText, -1, PREG_SPLIT_NO_EMPTY);
        if (false === $tokens) {
            $tokens = [];
        }

        $res['pattern'] = '';
        // Always expose the 'value' key, even when the text yields no token.
        $res['value'] = [];

        foreach ($tokens as $tok) {
            $isPlaceholder = preg_match(
                '#^(PATTERNINITIAL|PATTERNURL|PATTERNAND|PATTERNCOMMA|PATTERNBIBABREV|PATTERNPUNCTUATION)$#',
                $tok
            ) > 0;

            if ($isPlaceholder) {
                $shortPattern = str_replace('PATTERN', '', $tok); // PATTERNAND -> AND
                $res['pattern'] .= ' '.$shortPattern;
                $res['value'][] = $this->placeholderValue($shortPattern, $tok);

                continue;
            }

            $res['pattern'] .= ' '.$this->wordCategory($tok);
            $res['value'][] = $tok;
        }

        $res['pattern'] = trim($res['pattern']);

        return $res;
    }

    /**
     * Value reported for a placeholder token produced by the pre-processing step.
     *
     * @param string $shortPattern Category name without the "PATTERN" prefix (ex : "AND")
     * @param string $tok          The placeholder token itself (ex : "PATTERNAND")
     *
     * @return string '*' for COMMA/PUNCTUATION, otherwise the next stored original text
     */
    private function placeholderValue(string $shortPattern, string $tok): string
    {
        if (in_array($shortPattern, ['COMMA', 'PUNCTUATION'], true)) {
            return '*';
        }

        // Stored values are in order of appearance for each category : consume them as a queue.
        if (!empty($this->tokenValue[$shortPattern])) {
            return (string) array_shift($this->tokenValue[$shortPattern]);
        }

        // Nothing stored : the input text literally contained the placeholder word.
        return $tok;
    }

    /**
     * Typographic category of a plain (non-placeholder) token.
     * Checks are ordered : the first matching category wins.
     *
     * @param string $tok Non-empty token
     *
     * @return string ALLNUMBER, DASHNUMBER, WITHNUMBER, ALLLOWER, ALLUPPER, FIRSTUPPER, MIXED or UNKNOW
     */
    private function wordCategory(string $tok): string
    {
        if (preg_match('#^\d+$#', $tok) > 0) {
            return 'ALLNUMBER';
        }
        if (preg_match('#^[0-9\-]+$#', $tok) > 0) {
            return 'DASHNUMBER';
        }
        if (preg_match('#\d#', $tok) > 0) {
            return 'WITHNUMBER';
        }
        if (mb_strtolower($tok, 'UTF-8') === $tok) {
            return 'ALLLOWER';
        }
        if (mb_strtoupper($tok, 'UTF-8') === $tok) {
            return 'ALLUPPER';
        }

        // Multibyte-safe split : a byte-wise substr() would cut "É" in half and
        // misclassify names such as "Édouard".
        $firstChar = mb_substr($tok, 0, 1, 'UTF-8');
        $remainder = mb_substr($tok, 1, null, 'UTF-8');
        if (mb_strtoupper($firstChar, 'UTF-8') === $firstChar
            && mb_strtolower($remainder, 'UTF-8') === $remainder
        ) {
            return 'FIRSTUPPER';
        }

        // Any Unicode letter ("u" modifier required to match characters instead of bytes).
        if (preg_match('#\p{L}#u', $tok) > 0) {
            return 'MIXED';
        }

        return 'UNKNOW';
    }

    /**
     * Pre-process text : add spaces between relevant typographic items.
     * Each recognized item is replaced by a space-padded placeholder (PATTERNURL, PATTERNBIBABREV,
     * PATTERNAND, PATTERNCOMMA, PATTERNINITIAL). Matched text of URL, BIBABREV, AND and INITIAL
     * items is saved by type in $tokenValue (for AND, the surrounding spaces are part of the match).
     *
     * @param string $modText
     *
     * @return string Text with placeholders, or the unchanged text if the PCRE engine fails
     */
    private function preprocessTypoPattern(string $modText): string
    {
        $processed = preg_replace_callback_array(
            [
                // URL
                '#\bhttps?://[^ \]]+#i' => function ($match): string {
                    $this->tokenValue['URL'][] = $match[0];

                    return ' PATTERNURL ';
                },
                // BIBABREV : "dir.", "trad.", "(dir.)", "[dir.]", etc.
                // TODO: regex flaw : "(" not evaluated in BIBABREV. Example : "(dir.)"
                '#\b[(\[]?(collectif|coll\.|dir\.|trad\.|coord\.|ill\.)[)\]]?#i' => function ($match): string {
                    $this->tokenValue['BIBABREV'][] = $match[0]; // [1] = dir

                    return ' PATTERNBIBABREV ';
                },
                // AND
                '# (et|and|&|with|avec|e) #i' => function ($match): string {
                    $this->tokenValue['AND'][] = $match[0];

                    return ' PATTERNAND ';
                },
                // COMMA
                '#,#' => static function (): string {
                    return ' PATTERNCOMMA ';
                },
                // INITIAL : 2) convert letter ("A.") or junior ("Jr.") or senior ("Sr.")
                // extract initial before "." converted in PUNCTUATION
                // Note : \b word boundary match between "L" and "'Amour" in "L'Amour"  (for [A-Z]\b)
                // \b([A-Z]\. |[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)+ for grouping "A. B." in same INITIAL ?
                "#\b([A-Z]\.|[A-Z] |JR|Jr\.|Jr\b|Sr\.|Sr\b)#" => function ($match): string {
                    $this->tokenValue['INITIAL'][] = $match[0];

                    return ' PATTERNINITIAL ';
                },
            ],
            $modText,
            40 // at most 40 replacements for each pattern
        );

        // preg_replace_callback_array() returns null on PCRE error : keep the text as is
        // rather than violating the string return type.
        return $processed ?? $modText;
    }
}
170