WikiTextUtil   A
last analyzed

Complexity

Total Complexity 34

Size/Duplication

Total Lines 260
Duplicated Lines 0 %

Test Coverage

Coverage 79.69%

Importance

Changes 4
Bugs 0 Features 0
Metric Value
wmc 34
eloc 84
dl 0
loc 260
rs 9.68
c 4
b 0
f 0
ccs 51
cts 64
cp 0.7969

15 Methods

Rating   Name   Duplication   Size   Complexity  
A extractRefsAndListOfLinks() 0 16 3
A unWikify() 0 23 2
B removeHTMLcomments() 0 33 8
A isCommented() 0 6 1
A getWikilinkPages() 0 7 2
A containsWikiTag() 0 7 2
A extractCommentedText() 0 7 2
A hasFilteredComment() 0 3 1
A isWikify() 0 3 1
A normalizeUrlForTemplate() 0 17 1
A stripExternalLink() 0 5 1
A filterSensitiveCommentsInText() 0 12 3
A str2WikiTitle() 0 3 1
A stripFinalPoint() 0 7 2
A wikilink() 0 15 4
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe/Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Utils;
11
12
class WikiTextUtil extends TextUtil
13
{
14
    protected const FILTERED_COMMENT = '#FILTERED_COMMENT#';
15
16
    /**
     * Collect every <ref>…</ref> occurrence of a wiki text, followed by every
     * bullet-list line that consists of a bare external URL.
     *
     * Each entry is a PREG_SET_ORDER match: index 0 is the full matched string,
     * index 1 the captured content (ref body, or URL without its final dot).
     *
     * todo: {{ref}} templates are not handled.
     *
     * NOTE(review): when the text contains no <ref> at all, an empty array is
     * returned and list links are not looked up; a list link on the very last
     * line is only matched if that line ends with "\n". Confirm this is intended.
     *
     * @return array [0=>['<ref>fu</ref>', 'fu'], 1=> ...]
     */
    public static function extractRefsAndListOfLinks(string $text): array
    {
        // Flags: "s" lets "." match "\n", "m" makes ^ and $ work per line.
        // The negative lookahead stops at the first </ref>, so nested refs are excluded.
        $refMatches = [];
        $refCount = preg_match_all(
            '#<ref[^>/]*>((?:(?!</ref>).)*)</ref>#ism',
            $text,
            $refMatches,
            PREG_SET_ORDER
        );
        if (!$refCount) {
            return [];
        }

        // External links written as a list item: "* http://example.org."
        $linkMatches = [];
        $linkCount = preg_match_all(
            '#^\* *(https?://[^ \n]+[^ \n.])\.? *\n#im',
            $text,
            $linkMatches,
            PREG_SET_ORDER
        );
        if (!$linkCount) {
            return $refMatches;
        }

        return [...$refMatches, ...$linkMatches];
    }
37
38
    /**
     * Tell whether the text carries wiki markup, i.e. whether self::unWikify()
     * would alter it.
     */
    public static function isWikify(string $text): bool
    {
        $plainText = self::unWikify($text);

        return $text !== $plainText;
    }
42 60
43
    /**
     * Remove wiki encoding from a string.
     *
     * - [[fu|bar]] => bar, then any remaining "[" and "]" are dropped
     * - bold (''') and italic ('') quote markers are dropped
     * - non-breaking spaces (U+00A0) become ordinary spaces
     * - {{lang|en|fubar}} / {{langue|en|fubar}} => fubar
     * - {{Lien|Jeffrey Robinson}} => Jeffrey Robinson
     * - HTML entities such as "&amp;" are removed (not decoded)
     * - HTML tags other than <sup> and <sub> are stripped
     *
     * @param string    $text         wiki text to clean
     * @param bool|null $stripcomment when exactly true (default), HTML comments
     *                                are removed first
     *
     * @return string
     */
    public static function unWikify(string $text, ?bool $stripcomment = true): string
    {
        if (true === $stripcomment) {
            $text = self::removeHTMLcomments($text);
        }

        // Regex pass: piped links, lang templates, HTML entities.
        $text = preg_replace(
            [
                "#\[\[[^|\]]*\|([^]]*)]]#",
                '#{{ ?(?:lang|langue) ?\|[^|]+\| ?(?:texte=)?([^{}=]+)(?:\|dir=rtl)?}}#i',
                "#&[\w\d]{2,7};#",
            ],
            ['$1', '$1', ''],
            $text
        );

        // Literal pass. The non-breaking space is written as an explicit escape:
        // a raw U+00A0 character is indistinguishable from a plain space in the
        // source and silently becomes a no-op if an editor normalises it.
        $text = str_replace(
            ['[', ']', "'''", "''", "\u{00A0}"],
            ['', '', '', '', ' '],
            $text
        );

        // {{Lien|Jeffrey Robinson}} => Jeffrey Robinson
        $text = preg_replace('#{{ ?lien ?\| ?([^|}]+) ?}}#i', '${1}', $text);

        return strip_tags($text, '<sup><sub>');
    }
77
78
    /**
     * Remove '<!--', '-->', and everything between.
     * To avoid leaving blank lines, when a comment is both preceded
     * and followed by a newline (ignoring spaces), trim leading and
     * trailing spaces and one of the newlines.
     * An unterminated comment (no closing '-->') is left untouched.
     * See also self::filterSensitiveCommentsInText().
     * (c) WikiMedia /includes/parser/Sanitizer.php.
     */
    public static function removeHTMLcomments(string $text): string
    {
        // All offsets here are BYTE offsets: strpos() must be used (not mb_strpos())
        // because substr() and substr_replace() below are byte-based. Mixing
        // character and byte offsets corrupts text containing multibyte characters.
        while (false !== ($start = strpos($text, '<!--'))) {
            $end = strpos($text, '-->', $start + 4);
            if (false === $end) {
                // Unterminated comment; bail out
                break;
            }
            $end += 3;

            // Trim space and newline if the comment is both
            // preceded and followed by a newline
            $spaceStart = max($start - 1, 0);
            $spaceLen = $end - $spaceStart;
            while (' ' === substr($text, $spaceStart, 1) && $spaceStart > 0) {
                --$spaceStart;
                ++$spaceLen;
            }
            while (' ' === substr($text, $spaceStart + $spaceLen, 1)) {
                ++$spaceLen;
            }

            if ("\n" === substr($text, $spaceStart, 1)
                && "\n" === substr($text, $spaceStart + $spaceLen, 1)
            ) {
                // Remove the comment, leading and trailing
                // spaces, and leave only one newline.
                $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
            } else {
                // Remove just the comment.
                $text = substr_replace($text, '', $start, $end - $start);
            }
        }

        return $text;
    }
120 1
121
    /**
     * Generate a wikilink from a label and an optional target page.
     *
     * Both arguments are un-wikified and trimmed; underscores in the label
     * become spaces. The piped form is used only when the page differs from
     * the label once both are normalized as wiki titles.
     *
     * @param string      $label displayed text
     * @param string|null $page  target page title; null or blank means "same as label"
     *
     * @return string "[[label]]" or "[[Page|label]]"
     */
    public static function wikilink(string $label, ?string $page = null): string
    {
        $label = trim(str_replace('_', ' ', self::unWikify($label)));

        // Explicit null / '' checks: a truthiness test would discard the
        // perfectly valid page title "0".
        $page = (null === $page) ? null : trim(self::unWikify($page));

        // "fu_bar" => [[fu bar]] ; page "Fu" with label "fu" => [[fu]]
        if (null === $page
            || '' === $page
            || self::str2WikiTitle($label) === self::str2WikiTitle($page)
        ) {
            return '[[' . $label . ']]';
        }

        // label "fu", page "bar" => [[Bar|fu]]
        return sprintf(
            '[[%s|%s]]',
            self::str2WikiTitle($page),
            $label
        );
    }
142 2
143
    /**
     * Normalize a string as a wiki page title: underscores become spaces,
     * surrounding whitespace is trimmed, then TextUtil::mb_ucfirst() is applied.
     * "fu_bar_ " => "Fu bar".
     *
     * @return string
     */
    private static function str2WikiTitle(string $str): string
    {
        $spaced = str_replace('_', ' ', $str);
        $trimmed = trim($spaced);

        return TextUtil::mb_ucfirst($trimmed);
    }
151
152 24
    /**
     * Get page titles from wiki encoded links such as [[fu]] or [[fu|bar]].
     * Links whose target contains ":" (other projects or namespaces, like
     * [[wikt:bla]]) are ignored.
     *
     * @return array|null list of page titles, or null when no link is found
     */
    public static function getWikilinkPages(string $text): ?array
    {
        $found = [];
        $count = preg_match_all('#\[\[([^:|\]]+)(?:\|[^|\]]*)?]]#', $text, $found);

        // Index 1 holds the captured targets of every match.
        return ($count > 0) ? $found[1] : null;
    }
165
166
    /**
     * Strip bracketed external links (http:// or https://) from wiki text,
     * keeping only their label if any; the result is trimmed.
     * "[http://google.fr Google]" => "Google"
     * "bla [http://google.fr]" => "bla"
     *
     * @return string
     */
    public static function stripExternalLink(string $text): string
    {
        // Group 2 is the label: it only captures text when preceded by a space,
        // otherwise it is empty and the whole link disappears.
        return trim(
            preg_replace('#\[(https?://[^][<>\s"]+) *((?<= )[^\n\]]*|)\]#i', '${2}', $text)
        );
    }
178
179
    /**
     * Tell whether the text contains an HTML comment, ignoring the standard
     * "<!-- Paramètre obligatoire -->" placeholder.
     *
     * The detection pattern is deliberately lightweight: a comment whose body
     * contains ">" is not detected.
     *
     * @return bool
     */
    public static function isCommented(string $text): bool
    {
        $withoutPlaceholder = str_replace('<!-- Paramètre obligatoire -->', '', $text);

        // A lookahead-based pattern matching any comment body would be more
        // precise, but heavier.
        return 1 === preg_match('#<!--[^>]*-->#', $withoutPlaceholder);
    }
189 2
190 2
    /**
     * Strip one final point (".") from the string, as found at the end of a <ref>.
     * Only a single trailing point is removed; other strings are returned unchanged.
     *
     * @return string
     */
    public static function stripFinalPoint(string $str): string
    {
        if (!str_ends_with($str, '.')) {
            return $str;
        }

        // "." is a single byte, so dropping the last byte is safe.
        return substr($str, 0, -1);
    }
202
203
    /**
     * Normalize a URL for inclusion as a wiki-template value, by percent-encoding
     * the characters that would break template or wiki syntax.
     * https://en.wikipedia.org/wiki/Template:Citation_Style_documentation/url
     *
     * @return string
     */
    public static function normalizeUrlForTemplate(string $url): string
    {
        // Parallel lists, applied in order: the triple quote must come before
        // the double quote so that bold markers are encoded as a whole.
        $needles = [' ', '"', "'''", "''", '<', '>', '[', ']', '{{', '|', '}}'];
        $encoded = [
            '%20',
            '%22',
            '%27%27%27',
            '%27%27',
            '%3c',
            '%3e',
            '%5b',
            '%5d',
            '%7b%7b',
            '%7c',
            '%7d%7d',
        ];

        return str_replace($needles, $encoded, $url);
    }
226
227
    /**
     * Detect whether the text contains an HTML or wiki tag,
     * like </ref>, <ref>, <nowiki>, <ref name="bla" />.
     * Tag names are matched in lowercase only.
     */
    public static function containsWikiTag(string $text): bool
    {
        // Attribute-less tag: <ref>, </ref>, <br/>, <br />
        if (preg_match('#<\/?[a-z]+ ?\/?>#', $text)) {
            return true;
        }

        // Named reference: <ref name="dfs" />
        return (bool) preg_match('#<ref name=[^>]+>#', $text);
    }
238
239
    /**
     * Extract every HTML comment of the text, delimiters included,
     * like "<!-- fu -->". Comments may span several lines.
     *
     * @return string[] like ['<!-- fu -->', '<!-- bar -->'], empty when none is found
     */
    public static function extractCommentedText(string $text): array
    {
        $found = [];
        $count = preg_match_all('#<!--((?:(?!-->).)*)-->#is', $text, $found, PREG_PATTERN_ORDER);

        // Index 0 holds the full matches (with the <!-- --> delimiters).
        return $count ? $found[0] : [];
    }
251
252
    /**
     * Replace every HTML comment containing '<ref', '</ref>', an http(s) URL
     * or '{{' by the self::FILTERED_COMMENT placeholder.
     * Other comments are left in place.
     */
    public static function filterSensitiveCommentsInText(string $text): string
    {
        // Each $comment is a full comment string, like '<!-- blabla -->'.
        foreach (self::extractCommentedText($text) as $comment) {
            $isSensitive = preg_match('#<ref|</ref>|https?\:\/\/|\{\{#i', $comment);
            if (!$isSensitive) {
                continue;
            }

            $text = str_replace($comment, self::FILTERED_COMMENT, $text);
        }

        return $text;
    }
268
269
    /**
     * Tell whether the text contains the self::FILTERED_COMMENT placeholder.
     */
    public static function hasFilteredComment(string $text): bool
    {
        $placeholder = self::FILTERED_COMMENT;

        return str_contains($text, $placeholder);
    }
273
}
274