|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
declare(strict_types=1); |
|
4
|
|
|
|
|
5
|
|
|
namespace PhpSpellcheck\TextProcessor; |
|
6
|
|
|
|
|
7
|
|
|
use PhpSpellcheck\TextInterface; |
|
8
|
|
|
|
|
9
|
|
|
/** |
|
10
|
|
|
* @experimental |
|
11
|
|
|
* |
|
12
|
|
|
* Removes markdown while trying to keep original lines and offset position of |
|
13
|
|
|
* characters in order to make spellchecking relevant. |
|
14
|
|
|
*/ |
|
15
|
|
|
class MarkdownRemover implements TextProcessorInterface
{
    /**
     * Strips Markdown syntax from the given text through a fixed sequence of
     * regular-expression replacements and returns the result of
     * `$text->replaceContent()` called with the cleaned content.
     *
     * The order of the replacements matters: each step works on the output of
     * the previous one (for instance horizontal rules are handled first because
     * they would otherwise be caught by the list-item rule).
     *
     * @param TextInterface $text text whose content (`getContent()`) contains Markdown
     *
     * @return TextInterface whatever `$text->replaceContent()` returns for the stripped content
     */
    public function process(TextInterface $text): TextInterface
    {
        // Horizontal rules (stripListHeaders conflict with this rule, which is why it has been moved to the top).
        // The rule line is replaced by PHP_EOL followed by the trailing whitespace captured in group 2.
        $output = \Safe\preg_replace('/^(-\s*?|\*\s*?|_\s*?){3,}(\s*)$/m', PHP_EOL . '$2', $text->getContent());

        // Github Flavored Markdown

        // Header underline ("\n===="): drop the "=" run but keep the line break.
        // The replacement must be double-quoted: a single-quoted '\n' is a literal
        // backslash followed by "n", which would remove the real line feed and
        // inject those two characters into the text.
        $output = \Safe\preg_replace('/\n={2,}/', "\n", $output);

        /**
         * Fenced codeblocks.
         *
         * Only the text from a run of three tildes through the end of that line
         * (newline included) is removed; the code between the fences is kept.
         *
         * @TODO parse programming language comments from codeblock instead of removing whole block
         */
        $output = \Safe\preg_replace('/~{3}.*\n/', '', $output);

        // Strikethrough: remove the "~~" markers, keep the struck text.
        $output = \Safe\preg_replace('/~~/', '', $output);

        // Common Markdown

        // Remove HTML tags (everything from "<" up to the next ">").
        $output = \Safe\preg_replace('/<[^>]*>/', '', $output);

        // Remove setext-style headers.
        // NOTE(review): no "m" modifier here, so "^" / "$" anchor to the whole text
        // rather than to each line - confirm whether that is intended.
        $output = \Safe\preg_replace('/^[=\-]{2,}\s*$/', '', $output);

        // Remove footnotes?
        // NOTE(review): "$" is used without the "m" modifier in both patterns below,
        // so the optional ": ..." tail only matches when it runs to the end of the text.
        $output = \Safe\preg_replace('/\[\^.+?\](\: .*?$)?/', '', $output);
        $output = \Safe\preg_replace('/\s{0,2}\[.*?\]: .*?$/', '', $output);

        // Remove images: keep only the alt text captured between "![" and "]".
        $output = \Safe\preg_replace('/\!\[(.*?)\][\[\(].*?[\]\)]/', '$1', $output);

        // Remove inline links: keep only the link label captured between "[" and "]".
        $output = \Safe\preg_replace('/\[(.*?)\][\[\(].*?[\]\)]/', '$1', $output);

        // Remove blockquotes.
        // NOTE(review): without the "m" modifier only a ">" at the very start of the
        // text is stripped - confirm whether per-line matching was intended.
        $output = \Safe\preg_replace('/^\s{0,3}>\s?/', '', $output);

        // Remove reference-style links?
        // NOTE(review): anchored to the whole text (no "m" modifier) - confirm intent.
        $output = \Safe\preg_replace('/^\s{1,2}\[(.*?)\]: (\S+)( ".*?")?\s*$/', '', $output);

        /**
         * Remove atx-style headers.
         *
         * @TODO find a way to merge the two regex below
         * remove ## Heading ##
         */
        $output = \Safe\preg_replace('/^#{1,6}\s+(.*)(\s+#{1,6})$/m', '$1', $output);

        // remove ## Heading
        $output = \Safe\preg_replace('/^#{1,6}\s+(.*)$/m', '$1', $output);

        // Remove emphasis (the same replacement is applied twice to remove double emphasis).
        $output = \Safe\preg_replace('/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2', $output);
        $output = \Safe\preg_replace('/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2', $output);

        // Remove list items: drop the "* " bullet but keep the leading indentation (group 1).
        $output = \Safe\preg_replace('/^([^\S\r\n]*)\*\s/m', '$1', $output);

        // Remove code blocks: blank out every line that starts with three or more backticks.
        $output = \Safe\preg_replace('/^`{3,}(.*)*$/m', '', $output);

        // Remove inline code: keep the text between the backticks.
        $output = \Safe\preg_replace('/`(.+?)`/', '$1', $output);

        return $text->replaceContent($output);
    }
}
|
71
|
|
|
|