1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace PhpSpellcheck\TextProcessor; |
6
|
|
|
|
7
|
|
|
use PhpSpellcheck\TextInterface; |
8
|
|
|
|
9
|
|
|
/** |
10
|
|
|
* @experimental |
11
|
|
|
* |
12
|
|
|
* Removes markdown while trying to keep original lines and offset position of |
13
|
|
|
* characters in order to make spellchecking relevant. |
14
|
|
|
*/ |
15
|
|
|
class MarkdownRemover implements TextProcessorInterface
{
    /**
     * Strips Markdown syntax from a text using a fixed sequence of regex passes.
     *
     * Each pass works on the output of the previous one, so the order of the
     * statements below matters. Most passes replace the markup with an empty
     * string or with one of their capture groups.
     *
     * NOTE(review): every pass goes through \Safe\preg_replace; presumably that
     * wrapper throws on a PCRE failure instead of returning null — confirm
     * against the thecodingmachine/safe documentation.
     *
     * @param TextInterface $text text whose content (read through getContent()) is treated as Markdown
     *
     * @return TextInterface whatever $text->replaceContent() returns for the stripped content
     */
    public function process(TextInterface $text): TextInterface
    {
        // Horizontal rules: a line made of three or more "-", "*" or "_", optionally
        // separated by whitespace. Runs first because it conflicts with the list-item
        // pass further down ("* * *" starts exactly like a "* " bullet).
        // NOTE(review): the rule is replaced by PHP_EOL plus the captured trailing
        // whitespace, so a line break is inserted here — confirm this is intended
        // given the goal of keeping original line positions.
        $output = \Safe\preg_replace('/^(-\s*?|\*\s*?|_\s*?){3,}(\s*)$/m', PHP_EOL . '$2', $text->getContent());

        // Github Flavored Markdown

        // Header underline: a newline followed by two or more "=".
        // The replacement is double-quoted on purpose so that it is a real newline;
        // in single quotes the two characters "\" and "n" would be inserted literally.
        $output = \Safe\preg_replace('/\n={2,}/', "\n", $output);

        /**
         * Fenced codeblocks.
         *
         * Removes everything from a "~~~" to the end of that line, newline included.
         * Lines between two fences are left in place.
         *
         * @TODO parse programming language comments from codeblock instead of removing whole block
         */
        $output = \Safe\preg_replace('/~{3}.*\n/', '', $output);

        // Strikethrough: drop the "~~" markers, keep the text between them
        $output = \Safe\preg_replace('/~~/', '', $output);

        // Common Markdown

        // Remove HTML tags ("<", anything that is not ">", then ">")
        $output = \Safe\preg_replace('/<[^>]*>/', '', $output);

        // Remove setext-style header underlines: a line made only of two or more "="
        // or "-". The "m" modifier lets "^" and "$" match at every line, and the trailing
        // whitespace is limited to horizontal whitespace so no newline is consumed.
        $output = \Safe\preg_replace('/^[=\-]{2,}[^\S\r\n]*$/m', '', $output);

        // Remove footnotes?
        // NOTE(review): neither pattern uses the "m" modifier, so "$" only matches at
        // the end of the whole content; the "rest of the line" part therefore only
        // applies to the last line. Left unchanged because enabling multiline would
        // remove trailing text on every line — confirm the intended scope first.
        $output = \Safe\preg_replace('/\[\^.+?\](\: .*?$)?/', '', $output);
        $output = \Safe\preg_replace('/\s{0,2}\[.*?\]: .*?$/', '', $output);

        // Remove images, keeping the alt text: ![alt](src) or ![alt][ref] becomes "alt"
        $output = \Safe\preg_replace('/\!\[(.*?)\][\[\(].*?[\]\)]/', '$1', $output);

        // Remove inline links, keeping the label: [label](url) or [label][ref] becomes "label"
        $output = \Safe\preg_replace('/\[(.*?)\][\[\(].*?[\]\)]/', '$1', $output);

        // Remove blockquote markers: up to three leading blanks, ">" and one optional
        // blank, at the start of any line ("m" modifier). Only horizontal whitespace is
        // matched so that empty lines around a quote are preserved.
        $output = \Safe\preg_replace('/^[^\S\r\n]{0,3}>[^\S\r\n]?/m', '', $output);

        // Remove reference-style links?
        // NOTE(review): anchored with "^" and "$" but without the "m" modifier, so this
        // only matches when the whole content is a single reference definition. Left
        // unchanged pending confirmation of the intended behaviour.
        $output = \Safe\preg_replace('/^\s{1,2}\[(.*?)\]: (\S+)( ".*?")?\s*$/', '', $output);

        /**
         * Remove atx-style headers.
         *
         * @TODO find a way to merge the two regex below
         */
        // "## Heading ##": drop the leading hashes and the closing hashes
        $output = \Safe\preg_replace('/^#{1,6}\s+(.*)(\s+#{1,6})$/m', '$1', $output);
        // "## Heading": drop the leading hashes
        $output = \Safe\preg_replace('/^#{1,6}\s+(.*)$/m', '$1', $output);

        // Remove emphasis: one to three "*" or "_" around a text, closed by the same
        // marker. Applied twice so that doubled emphasis loses both layers.
        $output = \Safe\preg_replace('/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2', $output);
        $output = \Safe\preg_replace('/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2', $output);

        // Remove "*" list bullets while keeping the indentation in front of them
        $output = \Safe\preg_replace('/^([^\S\r\n]*)\*\s/m', '$1', $output);

        // Remove backtick code fences: a line starting with three or more backticks is
        // emptied (the line break itself stays). ".*" matches the same span as the former
        // "(.*)*" without the nested quantifier.
        $output = \Safe\preg_replace('/^`{3,}.*$/m', '', $output);

        // Remove inline code markers, keeping the code text
        $output = \Safe\preg_replace('/`(.+?)`/', '$1', $output);

        return $text->replaceContent($output);
    }
}
71
|
|
|
|