WikiTextUtil - Code Metrics - Dispositif/Wikibot - Measure and Improve Code Quality continuously with Scrutinizer

WikiTextUtil A
last analyzed 2024-01-18 16:03 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	260
Duplicated Lines	0 %

Test Coverage

Coverage

79.69%

Importance

Changes	4
Bugs	0	Features	0

Metric	Value
wmc	34
eloc	84
dl	0
loc	260
rs	9.68
c	4
b	0
f	0
ccs	51
cts	64
cp	0.7969

15 Methods

Rating	Name	Size	Complexity
A	extractRefsAndListOfLinks()	16	3
A	unWikify()	23	2
B	removeHTMLcomments()	33	8
A	isCommented()	6	1
A	getWikilinkPages()	7	2
A	containsWikiTag()	7	2
A	extractCommentedText()	7	2
A	hasFilteredComment()	3	1
A	isWikify()	3	1
A	normalizeUrlForTemplate()	17	1
A	stripExternalLink()	5	1
A	filterSensitiveCommentsInText()	12	3
A	str2WikiTitle()	3	1
A	stripFinalPoint()	7	2
A	wikilink()	15	4

<?php
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019/2020 © Philippe/Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */

declare(strict_types=1);

namespace App\Domain\Utils;

class WikiTextUtil extends TextUtil
{
    protected const FILTERED_COMMENT = '#FILTERED_COMMENT#';

    /**
     * Collect every <ref>…</ref> element of a wikitext, followed by the bare
     * external links written as bullet-list lines ("* https://…").
     *
     * Each entry is a PREG_SET_ORDER match: index 0 holds the full match and
     * index 1 the captured content (ref body or URL).
     * When no <ref> element is matched an empty array is returned and the
     * bullet-list links are not searched at all.
     *
     * todo {{ref}}
     *
     * @return array [0=>['<ref>fu</ref>', 'fu'], 1=> ...]
     */
    public static function extractRefsAndListOfLinks(string $text): array
    {
        // Flags: "s" lets "." match newlines, "m" makes ^ and $ work per line.
        // The negative lookahead stops a match from running over a closing </ref>.
        $refPattern = '#<ref[^>/]*>((?:(?!</ref>).)*)</ref>#ism';
        // Bullet line made of a single URL; an optional final dot stays out of the capture.
        $linkPattern = '#^\* *(https?://[^ \n]+[^ \n.])\.? *\n#im';

        $refCount = preg_match_all($refPattern, $text, $refMatches, PREG_SET_ORDER);
        if (!$refCount) {
            return [];
        }

        $linkCount = preg_match_all($linkPattern, $text, $linkMatches, PREG_SET_ORDER);
        if (!$linkCount) {
            return $refMatches;
        }

        return [...$refMatches, ...$linkMatches];
    }

    /**
     * Tell whether unWikify() would alter the given text, i.e. whether the
     * text holds anything that unWikify() strips or rewrites.
     */
    public static function isWikify(string $text): bool
    {
        $plain = self::unWikify($text);

        return $plain !== $text;
    }

    /**
     * Remove wiki encoding from a text:
     *  - piped links "[[fu|bar]]" => "bar", then any remaining "[" and "]",
     *  - bold/italic quote markers (''' and ''),
     *  - "{{lang|en|fubar}}" / "{{langue|…}}" => "fubar",
     *  - "{{Lien|Jeffrey Robinson}}" => "Jeffrey Robinson",
     *  - HTML entities such as "&nbsp;" (deleted),
     *  - non-breaking spaces (U+00A0), replaced by a plain space,
     *  - HTML tags other than <sup> and <sub>.
     *
     * @param string    $text         wikitext to clean
     * @param bool|null $stripcomment HTML comments are removed first only when strictly true
     *
     * @return string
     */
    public static function unWikify(string $text, ?bool $stripcomment = true): string
    {
        if (true === $stripcomment) {
            $text = self::removeHTMLcomments($text);
        }

        // preg_replace() yields null on a PCRE error: keep the untouched text
        // in that case rather than passing null on (TypeError under strict_types).
        $unwrapped = preg_replace(
            [
                "#\[\[[^|\]]*\|([^]]*)]]#",
                '#{{ ?(?:lang|langue) ?\|[^|]+\| ?(?:texte=)?([^{}=]+)(?:\|dir=rtl)?}}#i',
                "#&[\w\d]{2,7};#",
            ],
            ['$1', '$1', ''],
            $text
        ) ?? $text;

        // The non-breaking space is written as an explicit escape so that it
        // cannot be mistaken for (or silently converted to) a plain space.
        $text = str_replace(
            ['[', ']', "'''", "''", "\u{00A0}"],
            ['', '', '', '', ' '],
            $unwrapped
        );

        // {{Lien|Jeffrey Robinson}} => Jeffrey Robinson
        $text = preg_replace('#{{ ?lien ?\| ?([^|}]+) ?}}#i', '${1}', $text) ?? $text;

        return strip_tags($text, '<sup><sub>');
    }

    /**
     * Remove '<!--', '-->', and everything between.
     * To avoid leaving blank lines, when a comment is both preceded
     * and followed by a newline (ignoring spaces), trim leading and
     * trailing spaces and one of the newlines.
     * An unterminated comment is left in place.
     * See also self::filterSensitiveCommentsInText().
     * (c) WikiMedia /includes/parser/Sanitizer.php.
     *
     * All offsets are byte offsets: substr() and substr_replace() are
     * byte-based, so the delimiters are located with strpos() as well.
     * Mixing in character offsets would cut at the wrong place as soon as a
     * multibyte character precedes the comment. Searching ASCII delimiters
     * byte-wise is safe on UTF-8 text.
     */
    public static function removeHTMLcomments(string $text): string
    {
        while (false !== ($start = strpos($text, '<!--'))) {
            $end = strpos($text, '-->', $start + 4);
            if (false === $end) {
                // Unterminated comment; bail out
                break;
            }
            $end += 3;
            // Trim space and newline if the comment is both
            // preceded and followed by a newline
            $spaceStart = max($start - 1, 0);
            $spaceLen = $end - $spaceStart;
            while (' ' === substr($text, $spaceStart, 1) && $spaceStart > 0) {
                --$spaceStart;
                ++$spaceLen;
            }
            while (' ' === substr($text, $spaceStart + $spaceLen, 1)) {
                ++$spaceLen;
            }
            if ("\n" === substr($text, $spaceStart, 1)
                && "\n" === substr($text, $spaceStart + $spaceLen, 1)
            ) {
                // Remove the comment, leading and trailing
                // spaces, and leave only one newline.
                $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
            } else {
                // Remove just the comment.
                $text = substr_replace($text, '', $start, $end - $start);
            }
        }

        return $text;
    }

    /**
     * Generate a wikilink from a label and an optional target page.
     *
     * Both values go through unWikify() and are trimmed; underscores of the
     * label are turned into spaces. The piped form is produced only when a
     * non-empty page is given and its title form differs from the label's.
     *
     * @param string      $label displayed text
     * @param string|null $page  target page; null or empty means "same as label"
     *
     * @return string "[[label]]" or "[[Page|label]]"
     */
    public static function wikilink(string $label, ?string $page = null): string
    {
        $label = trim(str_replace('_', ' ', self::unWikify($label)));
        // Explicit null/empty checks: a truthiness test would discard the
        // valid page title "0".
        $page = (null === $page) ? '' : trim(self::unWikify($page));

        // fu_bar => [[fu bar]] / Fu, fu => [[fu]]
        if ('' === $page || self::str2WikiTitle($label) === self::str2WikiTitle($page)) {
            return '[[' . $label . ']]';
        }

        // fu, bar => [[Bar|fu]]
        return sprintf(
            '[[%s|%s]]',
            self::str2WikiTitle($page),
            $label
        );
    }

    /**
     * Put a raw string in page-title form: underscores become spaces,
     * surrounding whitespace is trimmed, then TextUtil::mb_ucfirst() is applied.
     * "fu_bar_ " => "Fu bar".
     * @return string
     */
    private static function str2WikiTitle(string $str): string
    {
        $spaced = str_replace('_', ' ', $str);
        $trimmed = trim($spaced);

        return TextUtil::mb_ucfirst($trimmed);
    }

    /**
     * List the target page titles of the wikilinks found in a text.
     * Links whose target contains a colon (other projects or namespaces,
     * like [[wikt:bla]]) are not matched.
     *
     * @return array|null page titles in order of appearance, or null when no link matched
     */
    public static function getWikilinkPages(string $text): ?array
    {
        // Capture group 1 is the target: everything before an optional "|label".
        $found = preg_match_all('#\[\[([^:|\]]+)(?:\|[^|\]]*)?]]#', $text, $matches);

        return ($found > 0) ? $matches[1] : null;
    }

    /**
     * Replace bracketed external links (http:// or https://) by their label,
     * then trim the result.
     * "[http://google.fr Google]" => "Google"
     * "bla [http://google.fr]" => "bla"
     * @return string
     */
    public static function stripExternalLink(string $text): string
    {
        // Group 1 is the URL, group 2 the label (empty when the link has none).
        $linkPattern = '#\[(https?://[^][<>\s"]+) *((?<= )[^\n\]]*|)\]#i';
        $withoutLinks = preg_replace($linkPattern, '${2}', $text);

        return trim($withoutLinks);
    }

    /**
     * Tell whether a text holds an HTML comment, the standard
     * "<!-- Paramètre obligatoire -->" note being ignored.
     * A comment whose body contains ">" is not detected by this pattern.
     * @return bool
     */
    public static function isCommented(string $text): bool
    {
        $withoutMandatoryNote = str_replace('<!-- Paramètre obligatoire -->', '', $text);

        // Alternative: preg_match('#<\!--(?!-->).*-->#s', '', $text); // heavier but precise
        $hits = preg_match('#<!--[^>]*-->#', $withoutMandatoryNote);

        return $hits > 0;
    }

    /**
     * Drop a single trailing point (".") from a string, as found at the end
     * of a <ref>. Strings not ending with a point are returned unchanged.
     * @return string
     */
    public static function stripFinalPoint(string $str): string
    {
        if (!str_ends_with($str, '.')) {
            return $str;
        }

        // Only the last character goes: "fu.." becomes "fu.".
        return substr($str, 0, -1);
    }

    /**
     * Percent-encode the character sequences of a URL that would clash with
     * wiki syntax when the URL is used as a wiki-template value.
     * https://en.wikipedia.org/wiki/Template:Citation_Style_documentation/url
     *
     * Only the listed sequences are encoded: a lone apostrophe or a lone
     * curly brace is left as is.
     * @return string
     */
    public static function normalizeUrlForTemplate(string $url): string
    {
        // Applied in this order: the triple apostrophe must be handled
        // before the double one.
        $needles = [' ', '"', "'''", "''", '<', '>', '[', ']', '{{', '|', '}}'];
        $encoded = [
            '%20',
            '%22',
            '%27%27%27',
            '%27%27',
            '%3c',
            '%3e',
            '%5b',
            '%5d',
            '%7b%7b',
            '%7c',
            '%7d%7d',
        ];

        return str_replace($needles, $encoded, $url);
    }

    /**
     * Detect whether a text contains an HTML or wiki tag,
     * like </ref>, <ref>, <nowiki>, <ref name="bla" />.
     * Bare tags are matched in lowercase only.
     */
    public static function containsWikiTag(string $text): bool
    {
        // find </ref> or <ref>
        if (preg_match('#<\/?[a-z]+ ?\/?>#', $text)) {
            return true;
        }

        // find <ref name="dfs" />
        return (bool) preg_match('#<ref name=[^>]+>#', $text);
    }

    /**
     * Extract every HTML comment of a text, delimiters included,
     * like "<!-- fu -->". Comments may span several lines.
     * @return string[] like ['<!-- fu -->', '<!-- bar -->'], empty when none is found
     */
    public static function extractCommentedText(string $text): array
    {
        // The lookahead keeps each match from running past the first "-->".
        $found = preg_match_all('#<!--((?:(?!-->).)*)-->#is', $text, $matches, PREG_PATTERN_ORDER);

        // Index 0 of a PATTERN_ORDER result lists the full matches.
        return $found ? $matches[0] : [];
    }

    /**
     * Replace each HTML comment containing '<ref', '</ref>', an http(s) URL
     * or '{{' by the self::FILTERED_COMMENT placeholder.
     * Other comments are left untouched.
     */
    public static function filterSensitiveCommentsInText(string $text): string
    {
        // Comments are collected once, e.g. ['<!-- blabla -->'].
        foreach (self::extractCommentedText($text) as $commentedChunk) {
            $isSensitive = preg_match('#<ref|</ref>|https?\:\/\/|\{\{#i', $commentedChunk);
            if (!$isSensitive) {
                continue;
            }

            $text = str_replace($commentedChunk, self::FILTERED_COMMENT, $text);
        }

        return $text;
    }

    /**
     * Tell whether a text holds the self::FILTERED_COMMENT placeholder.
     */
    public static function hasFilteredComment(string $text): bool
    {
        $placeholder = self::FILTERED_COMMENT;

        return str_contains($text, $placeholder);
    }
}


1		<?php
2		/*
3		* This file is part of dispositif/wikibot application (@github)
4		* 2019/2020 © Philippe/Irønie <[email protected]>
5		* For the full copyright and MIT license information, view the license file.
6		*/
7
8		declare(strict_types=1);
9
10		namespace App\Domain\Utils;
11
12		class WikiTextUtil extends TextUtil
13		{
14		protected const FILTERED_COMMENT = '#FILTERED_COMMENT#';
15
16		/**
17		* todo {{ref}}
18		* @return array [0=>['<ref>fu</ref>', 'fu'], 1=> ...]
19		*/
20		public static function extractRefsAndListOfLinks(string $text): array
21		{
22		// s = "\n" include in "." // m = ^multiline$
23		// Exclusion des imbrications
24		if (!preg_match_all('#<ref[^>/]>((?:(?!</ref>).))</ref>#ism', $text, $refs, PREG_SET_ORDER)) {
25		return [];
26		}
27		$result = $refs;
28
29		// extraction des liens externes
30		// ^\* *(https?:\/\/[^ ]+[^ .])$
31		if (preg_match_all('#^\* (https?://[^ \n]+[^ \n.])\.? \n#im', $text, $liensExternes, PREG_SET_ORDER)) {
32		$result = [...$result, ...$liensExternes];
33		}
34
35		return $result;
36		}
37
38		public static function isWikify(string $text): bool
39		{
40		return self::unWikify($text) !== $text;
41		}
42	60
43		/**
44	60	* remove wiki encoding : italic, bold, links [ ] and [[fu\|bar]] => bar
45	60	* replace non-breaking spaces
46		* replace {{lang\|en\|fubar}} => fubar.
47		*
48	60	* @param $text
49	60	* @param bool $stripcomment
50	60	*
51	60	* @return string
52		*/
53	60	public static function unWikify(string $text, ?bool $stripcomment = true): string
54		{
55		if (true === $stripcomment) {
56		$text = self::removeHTMLcomments($text);
57	60	}
58	60
59		$text = str_replace(
60		['[', ']', "'''", "''", ' '],
61		['', '', '', '', ' '],
62	60	preg_replace(
63		[
64	60	"#\[\[[^\|\]]\\|([^]])]]#",
65		'#{{ ?(?:lang\|langue) ?\\|[^\|]+\\| ?(?:texte=)?([^{}=]+)(?:\\|dir=rtl)?}}#i',
66	60	"#&[\w\d]{2,7};#",
67		],
68		['$1', '$1', ''],
69	24	$text
70		)
71	24	);
72	3	// {{Lien\|Jeffrey Robinson}} => Jeffrey Robinson
73		$text = preg_replace('#{{ ?lien ?\\| ?([^\|}]+) ?}}#i', '${1}', $text);
74
75	21	return strip_tags($text, '<sup><sub>');
76		}
77
78		/**
79		* Remove '<!--', '-->', and everything between.
80		* To avoid leaving blank lines, when a comment is both preceded
81		* and followed by a newline (ignoring spaces), trim leading and
82		* trailing spaces and one of the newlines.
83		* See also self::filterSensitiveCommentInText().
84		* (c) WikiMedia /includes/parser/Sanitizer.php.
85		*/
86	4	public static function removeHTMLcomments(string $text): string
87		{
88	4	while (false !== ($start = mb_strpos($text, '<!--'))) {
89	4	$end = mb_strpos($text, '-->', $start + 4);
90		if (false === $end) {
91		// Unterminated comment; bail out
92	4	break;
93	3	}
94		$end += 3;
95		// Trim space and newline if the comment is both
96		// preceded and followed by a newline
97	2	$spaceStart = max($start - 1, 0);
98	2	$spaceLen = $end - $spaceStart;
99	2	while (' ' === substr($text, $spaceStart, 1) && $spaceStart > 0) {
100	2	--$spaceStart;
101		++$spaceLen;
102		}
103		while (' ' === substr($text, $spaceStart + $spaceLen, 1)) {
104		++$spaceLen;
105		}
106		if ("\n" === substr($text, $spaceStart, 1)
107	4	&& "\n" === substr($text, $spaceStart + $spaceLen, 1)
108		) {
109	4	// Remove the comment, leading and trailing
110		// spaces, and leave only one newline.
111		$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
112		} else {
113		// Remove just the comment.
114		$text = substr_replace($text, '', $start, $end - $start);
115		}
116		}
117
118		return (string)$text;
119		}
120	1
121		/**
122	1	* Generate wikilink from string.
123	1	* @return string
124		*/
125		public static function wikilink(string $label, ?string $page = null): string
126		{
127		$label = trim(str_replace('_', ' ', self::unWikify($label)));
128		$page = ($page) ? trim(self::unWikify($page)) : null;
129
130		// fu_bar => [[fu_bar]] / Fu, fu => [[fu]]
131		if (empty($page) \|\| self::str2WikiTitle($label) === self::str2WikiTitle($page)) {
132		return '[[' . $label . ']]';
133		}
134
135		// fu, bar => [[Bar\|fu]]
136		return sprintf(
137		'[[%s\|%s]]',
138	2	self::str2WikiTitle($page),
139		$label
140	2	);
141		}
142	2
143		/**
144		* "fu_bar_ " => "Fu bar".
145		* @return string
146		*/
147		private static function str2WikiTitle(string $str): string
148		{
149		return TextUtil::mb_ucfirst(trim(str_replace('_', ' ', $str)));
150	24	}
151
152	24	/**
153		* Get page titles from wiki encoded links.
154		* (but not others projects links like [[wikt:bla]].
155	24	* @return array\|null
156		*/
157		public static function getWikilinkPages(string $text): ?array
158		{
159		if (preg_match_all('#\[\[([^:\|\]]+)(?:\\|[^\|\]]*)?]]#', $text, $matches) > 0) {
160		return $matches[1];
161		}
162
163		return null;
164		}
165
166		/**
167		* Strip external links (http://) from wiki text.
168		* "[http://google.fr Google]" => "Google"
169	61	* "bla [http://google.fr]" => "bla"
170		* @return string
171	61	*/
172	2	public static function stripExternalLink(string $text): string
173	2	{
174		$text = preg_replace('#\[(https?://[^][<>\s"]+) ((?<= )[^\n\]]\|)\]#i', '${2}', $text);
175
176		return trim($text);
177	2	}
178
179		/**
180	2	* @return bool
181	2	*/
182	2	public static function isCommented(string $text): bool
183		{
184		$text = str_replace('<!-- Paramètre obligatoire -->', '', $text);
185
186	2	//ou preg_match('#<\!--(?!-->).*-->#s', '', $text); // plus lourd mais précis
187	1	return preg_match('#<!--[^>]*-->#', $text) > 0;
188		}
189	2
190	2	/**
191		* Strip the final point (".") as in <ref> ending.
192		* @return string
193		*/
194		public static function stripFinalPoint(string $str): string
195		{
196		if (str_ends_with($str, '.')) {
197	2	return substr($str, 0, strlen($str) - 1);
198		}
199
200		return $str;
201	61	}
202
203		/**
204		* Normalize URL for inclusion as a wiki-template value.
205		* https://en.wikipedia.org/wiki/Template:Citation_Style_documentation/url
206		* @return string
207		*/
208		public static function normalizeUrlForTemplate(string $url): string
209		{
210		$searchReplace = [
211		' ' => '%20',
212		'"' => '%22',
213		"'''" => '%27%27%27',
214		"''" => '%27%27',
215		'<' => '%3c',
216		'>' => '%3e',
217		'[' => '%5b',
218		']' => '%5d',
219		'{{' => '%7b%7b',
220		'\|' => '%7c',
221		'}}' => '%7d%7d',
222		];
223
224		return str_replace(array_keys($searchReplace), array_values($searchReplace), $url);
225		}
226
227		/**
228		* Detect if contains HTML or WIKI tag, like </ref>, <ref>, <nowiki>, <ref name="bla" />
229		*/
230		public static function containsWikiTag(string $text): bool
231		{
232		return
233		// find </ref> or <ref>
234		preg_match('#<\/?[a-z]+ ?\/?>#', $text)
235		// find <ref name="dfs" />
236		\|\| preg_match('#<ref name=[^>]+>#', $text);
237		}
238
239		/**
240		* Extract all HTML commented string like "<!-- fu -->".
241		* @return string[] like ['<!-- fu -->', '<!-- bar -->']
242		*/
243		public static function extractCommentedText(string $text): array
244		{
245		if (!preg_match_all('#<!--((?:(?!-->).)*)-->#is', $text, $matches, PREG_PATTERN_ORDER)) {
246		return [];
247		}
248
249		return $matches[0];
250		}
251
252		/**
253		* Replace HTML comment containing '<ref' or 'http' or '{{' by '#COMMENT#'.
254		*/
255		public static function filterSensitiveCommentsInText(string $text): string
256		{
257		$comments = self::extractCommentedText($text); // ['<!-- blabla -->']
258
259		// filtering commented string containing <ref> or http
260		foreach ($comments as $comment) {
261		if (preg_match('#<ref\|</ref>\|https?\:\/\/\|\{\{#i', $comment)) {
262		$text = str_replace($comment, self::FILTERED_COMMENT, $text);
263		}
264		}
265
266		return $text;
267		}
268
269		public static function hasFilteredComment(string $text): bool
270		{
271		return str_contains($text, self::FILTERED_COMMENT);
272		}
273		}
274

Dispositif / Wikibot

WikiTextUtil A last analyzed 2024-01-18 16:03 UTC

Complexity

Size/Duplication

Test Coverage

Importance

15 Methods

Duplication Side-by-Side

Filter issues like

WikiTextUtil A
last analyzed 2024-01-18 16:03 UTC