Issues (106)

src/Domain/Utils/TemplateParser.php (1 issue)

1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Utils;
11
12
use App\Domain\Models\Wiki\AbstractWikiTemplate;
13
use App\Domain\WikiTemplateFactory;
14
use Exception;
15
use LogicException;
16
use Throwable;
17
18
/**
 * Legacy static helpers to locate {{template}} calls in wikitext and parse their parameters.
 * todo legacy.
 */
22
abstract class TemplateParser extends WikiTextUtil
23
{
24
    /**
     * Collect every occurrence of the template $tplName found in $text.
     *
     * Each occurrence gets an entry holding its wikitext under the key 'raw'.
     * When a template model can be created for $tplName, the model is hydrated
     * with the parsed parameters and stored under the key 'model'.
     *
     * todo : simplify array if only one occurrence ?
     * todo refac extract/logic.
     *
     * @return array [] when nothing is found, otherwise
     *               [ $tplName => [ 0 => ['raw' => ..., 'model' => ...], ... ] ]
     * @throws Exception anything raised while parsing an occurrence is propagated
     */
    public static function parseAllTemplateByName(string $tplName, string $text): array
    {
        // Wikitext of each occurrence of the template.
        $occurrences = self::findAllTemplatesByName($tplName, $text);
        if ([] === $occurrences || empty($occurrences[0])) {
            return [];
        }

        $parsed = [];
        foreach (array_values($occurrences) as $index => $rawTemplate) {
            // The raw text is kept even when no model can be built below.
            $parsed[$index] = ['raw' => $rawTemplate];

            try {
                $model = WikiTemplateFactory::create($tplName);
            } catch (Throwable $ignored) {
                // Best effort: without a model this occurrence stays "raw only".
                continue;
            }

            $isTemplateModel = is_object($model) && is_subclass_of($model, AbstractWikiTemplate::class);
            if (!$isTemplateModel) {
                continue;
            }

            $model->hydrate(self::parseDataFromTemplate($tplName, $rawTemplate));
            $model->detectUserSeparator($rawTemplate);

            $parsed[$index]['model'] = $model;
        }

        return [$tplName => $parsed];
    }
70
71
    /**
     * Find all the occurrences of a wiki template in a text.
     * Compatible with one level of sub-template inclusion.
     * Example :
     * {{Infobox |pays={{pays|France}} }}
     *
     * The template name is trimmed and matched case-insensitively. The "u"
     * modifier makes the case folding work for non-ASCII letters as well
     * (e.g. "Écrit" vs "écrit").
     *
     * @return array [ 0=>{{bla|...}}, 1=>{{bla|...}} ], or [] when nothing matches
     *               or when the regex engine fails (e.g. $text is not valid UTF-8)
     */
    public static function findAllTemplatesByName(string $templateName, string $text): array
    {
        // TODO check {{fr}}
        $pattern = '#{{[ \n]*'
            .preg_quote(trim($templateName), '#')
            // blanks (non-breaking space included) allowed between the name and the first pipe
            .'[ \t\x{00A0}\n\r]*\|'
            // parameters, with at most one level of nested {{sub-template}}
            .'[^{}]*(?:{{[^{}]+}}[^{}]*)*}}#iu';

        if (false === preg_match_all($pattern, $text, $matches)) {
            return [];
        }

        return $matches[0];
    }
96 2
97
    /**
     * Parse the parameters of a wiki template from text and template name,
     * using the first {{template}} definition found in the text.
     * Line breaks are removed from the text before parsing.
     *
     * todo refactor + check if @notused
     * todo legacy.
     *
     * @return array parameter name => value, as built by explodeParameterValue()
     *
     * @throws LogicException when the template is not found or its parameters can't be parsed
     */
    public static function parseDataFromTemplate(string $tplName, string $text): array
    {
        $text = str_replace("\n", '', $text); // todo WTF ?

        // [0] => {{template|...}} ; [1] => url=blabla|titre=Popo
        $tplFounded = self::findFirstTemplateInText($tplName, $text);
        if (empty($tplFounded)) {
            throw new LogicException("Template $tplName not found in text");
        }
        // A regex capture is a string: comparing it to false could never detect
        // a missing parameter group, so check the type instead.
        if (!is_string($tplFounded[1] ?? null)) {
            throw new LogicException("No parameters found in $tplName");
        }

        // sub-template pipe | encoding, so that the split below only sees top-level pipes
        $parameters = self::encodeTemplatePipes($tplFounded[1]);

        // x flag => "\ " for space
        $res = preg_match_all(
            "/
            (
                [^|=]*=?                          # parameter name (or nothing)
                (
                    [^|{}\[\]<>]*               # reject <i>,<ref>
                    (?:\[[^\[\]]+])?              # [url text] or [text]
                    (?:<!--[^<>]+-->)?    # commentary <!-- -->
                    (?:{{[^}{]+}})?          # {{template}} but KO with {{tmp|...}}
                                                   # test : {{bla@PIPE@bla}}
                    (?:\[\[[^]]+]])?            # [[fu|bar]]
                    [^|{}\[\]]*                 # accept <i>,<ref>
                )*
            )\|?
        /x",
            $parameters,
            $wikiParams
        );

        if (false === $res || 0 === $res || empty($wikiParams[1])) {
            throw new LogicException("Parameters from template '$tplName' can't be parsed");
        }

        return self::explodeParameterValue($wikiParams[1]);
    }
150 27
151
    /**
     * Find the first occurrence of a template in a text.
     * For multiple occurrences see findAllTemplatesByName().
     *
     * Known limitation: only one level of nested sub-template is supported.
     * This one is NOT found:
     * {{Ouvrage|auteur1 = Clément|titre = Les Borgia {{nobr|Alexandre {{VI}}}}}}
     *
     * todo: replace <!-- --> by encode char and memorize in var
     * TODO: implement better regex :(
     *
     * @return array|null [ 0 => {{template|...}}, 1 => text after the first pipe ],
     *                    or null when the template is not found
     */
    private static function findFirstTemplateInText(string $templateName, string $text): ?array
    {
        // hack : hide solitary { and } behind placeholders, so the main pattern only
        // has to deal with {{ and }}. Lookarounds do not consume the neighbouring
        // characters, so two lone braces separated by one character ("a{b{c") are both encoded.
        $encoded = preg_replace(
            ['#(?<!\{)\{(?!\{)#', '#(?<!\})\}(?!\})#'],
            ['CURLYBRACKETO', 'CURLYBRACKETC'],
            $text
        );
        if (null === $encoded) {
            return null;
        }

        // Same tolerance as findAllTemplatesByName(): blanks after "{{", trimmed name,
        // blanks (non-breaking space included) before the first pipe, unicode-aware "i".
        $pattern = '~{{[ \n]*'
            .preg_quote(trim($templateName), '~')
            .'[ \t\x{00A0}\n\r]*\|([^{}]*(?:{{[^{}]+}}[^{}]*)*)}}~iu';

        if (1 !== preg_match($pattern, $encoded, $matches)) {
            return null;
        }

        // Put the solitary braces back in every captured string.
        return array_map(
            static function (string $value): string {
                return str_replace(['CURLYBRACKETO', 'CURLYBRACKETC'], ['{', '}'], $value);
            },
            $matches
        );
    }
190 27
191
    /**
     * Replace by @PIPE@ every pipe | located inside a {{sub-template}} of the text.
     * Only templates without inner braces are handled; pipes outside of them are left untouched.
     */
    protected static function encodeTemplatePipes(string $text): string
    {
        $encoded = preg_replace_callback(
            '#{{(?:[^{}]+)}}#m',
            static function (array $subTemplate): string {
                return str_replace('|', '@PIPE@', (string) $subTemplate[0]);
            },
            $text
        );

        // On a regex failure the text is returned unchanged.
        return $encoded ?? $text;
    }
205
206
    /**
     * From ['fr', 'url=blabla', 'titre=popo']
     * To [ '1'=> 'fr', 'url' => 'blabla', 'titre' => 'popo' ].
     *
     * Lines without "=" are positional: they are numbered 1, 2, 3... in order of appearance.
     * Parameter names are lower-cased and trimmed, values are trimmed, and the
     * sub-template pipe encoding (@PIPE@) is reversed in the values.
     * Empty lines and parameters with a blank value are skipped.
     *
     * @param array $wikiLines ['url=blabla', 'titre=popo']
     *
     * @return array parameter name => value
     *
     * @throws LogicException when a line starts with "=" (empty parameter name)
     */
    protected static function explodeParameterValue(array $wikiLines): array
    {
        $data = [];
        $keyNum = 1;
        foreach ($wikiLines as $line) {
            // Strict test instead of empty(): "0" is a legitimate positional value.
            if (!is_scalar($line) || '' === (string) $line) {
                continue;
            }

            // Cosmetic loss: tabs and line breaks are dropped, and the non-breaking
            // space (written as an escape because the literal is invisible) becomes a plain space.
            $line = str_replace(
                ["\t", "\n", "\r", "\u{00A0}"],
                ['', '', '', ' '],
                (string) $line
            );

            // $line : fu = bar (OK : fu=bar=coco) => only the first "=" splits name and value
            $pos = strpos($line, '=');
            if (false === $pos) {
                // No param name => take $keyNum as param name
                $param = (string) $keyNum;
                $value = $line;
                ++$keyNum;
            } else {
                $param = mb_strtolower(substr($line, 0, $pos), 'UTF-8');
                $value = substr($line, $pos + 1);
            }

            // Strict test instead of empty(): a parameter named "0" is valid.
            if ('' === $param) {
                throw new LogicException('param/value variable not defined');
            }

            // TODO : accept empty value ?
            if ('' === trim($value)) {
                continue;
            }

            // reverse the sub-template pipe encoding
            $data[trim($param)] = trim(str_replace('@PIPE@', '|', $value));
        }

        return $data;
    }
257
258
    /**
     * Find the parameter separator style used in a template text:
     * only pipe, space-pipe-space, pipe-space, return-pipe, etc.
     * Falls back to ' |' when no pattern matches.
     */
    public static function findUserStyleSeparator(string $tplText): string
    {
        // Indented style {{fu\n    | bar}} : normalized to a line break, one space and the pipe
        if (1 === preg_match('#{{[^}|]+\n +\|( ?)[^}]+}}#i', $tplText, $found)) {
            return "\n |".$found[1];
        }

        $separatorPatterns = [
            // {{fu | bar}} (duplicate : because [^}|\n]+ allows final space...)
            '#{{[^}|\n]+([ \n]\|[ \n]?)[^}]+}}#i',
            // others : {{fu|bar}} ; {{fu\n|bar}} ; {{fu |bar}} ...
            '#{{[^}|\n]+([ \n]?\|[ \n]?)[^}]+}}#i',
        ];
        foreach ($separatorPatterns as $pattern) {
            if (1 === preg_match($pattern, $tplText, $found)) {
                return $found[1];
            }
        }

        return ' |';
    }
278
279 31
    /**
     * Tell whether the template text is written like "param     = bla",
     * i.e. contains a run of at least 4 spaces between its braces.
     */
    public static function isMultispacedTemplate(string $tplText): bool
    {
        $fourSpacesInsideTemplate = '#{{[^}]+ {4}[^}]+}}#i';

        return 1 === preg_match($fourSpacesInsideTemplate, $tplText);
    }
287 6
288
    /**
     * Extract the page numbers given with the first {{p.}} template of a text.
     * https://fr.wikipedia.org/wiki/Mod%C3%A8le:P.
     * Examples:
     * 'bla {{p.|125-133}} bla' => ['{{p.|125-133}}', '125-133']
     * 'bla {{p.}}10, 20, 35-36 bla' => ['{{p.}}10, 20, 35-36', '10, 20, 35-36']
     *
     * The pattern is unicode-aware ("u" modifier): otherwise the em dash of the
     * character class is handled as separate bytes, and the match can stop in the
     * middle of another multibyte character, returning broken UTF-8.
     *
     * @return array|null [matched text, page numbers], both trimmed, or null when
     *                    no {{p.}} template with page numbers is found
     */
    public static function extractPageTemplateContent(string $text): ?array
    {
        $pattern = '#\{\{p\.(?:\|([0-9,\-—\/ ]+))?\}\}([0-9,\-—\/ ]+)?#iu';
        if (1 !== preg_match($pattern, $text, $matches)) {
            return null;
        }

        if (!empty($matches[1]) && trim($matches[1]) !== '') { // {{p.|125}}
            return [trim($matches[0]), trim($matches[1])];
        }
        if (!empty($matches[2]) && trim($matches[2]) !== '') { // {{p.}}125
            return [trim($matches[0]), trim($matches[2])];
        }

        return null;
    }
307
}
308