1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace PhpSpellcheck\TextProcessor; |
6
|
|
|
|
7
|
|
|
use PhpSpellcheck\Text; |
8
|
|
|
use PhpSpellcheck\TextInterface; |
9
|
|
|
|
10
|
|
|
/**
 * Strips Markdown markup from a text so that only the prose is left for
 * spellchecking.
 *
 * Markup is deleted, not padded, so characters may move left within their
 * line. The number of lines is preserved by every rule except the HTML tag
 * rule (see RULES), so a line number computed on the processed text still
 * designates the same line of the original document.
 */
class MarkdownRemover implements TextProcessorInterface
{
    /**
     * Ordered list of [pattern, replacement] pairs, applied one after the
     * other; each rule works on the output of the previous one, so the order
     * matters.
     *
     * Conventions used by the patterns:
     *  - `[^\S\r\n]` is "whitespace that is not a line break"; it is used
     *    instead of `\s` so that a rule can never swallow a newline and merge
     *    two lines.
     *  - `(?=\r?$)` asserts the end of the line without consuming it, which
     *    keeps both LF and CRLF line endings untouched.
     *  - Line-anchored rules carry the `m` modifier so they apply to every
     *    line and not only to the first/last line of the document.
     */
    private const RULES = [
        // Horizontal rules (three or more of - * _, optionally spaced).
        // Must run first: the list item rule below would otherwise eat the
        // leading "* " of a "* * *" rule. Only the rule characters are
        // removed; the line break is left where it is.
        ['/^(?:[-*_][^\S\r\n]*){3,}(?=\r?$)/m', ''],

        // Github Flavored Markdown
        // Header underline ("===="). The replacement is a real newline: a
        // single-quoted '\n' would be inserted as a backslash followed by "n".
        ['/\n={2,}/', "\n"],
        // Fenced code blocks delimited by "~~~": drop the fence (and anything
        // after it on the same line) but keep the line break.
        //@TODO parse programming language comments from codeblock instead of removing the fence only
        ['/~{3}[^\r\n]*/', ''],
        // Strikethrough
        ['/~~/', ''],

        // Common Markdown
        // HTML tags. NOTE: `[^>]` also matches newlines, so a tag written
        // across several lines is removed together with its line breaks.
        ['/<[^>]*>/', ''],
        // Setext-style header underlines ("==" or "--" alone on a line)
        ['/^[=\-]{2,}[^\S\r\n]*(?=\r?$)/m', ''],
        // Footnote references ("[^1]") and, when followed by ": ", the rest
        // of the line holding the footnote definition
        ['/\[\^.+?\](?:: [^\r\n]*)?/', ''],
        // Link reference definitions ("[id]: target ..."), up to end of line
        ['/[^\S\r\n]{0,2}\[.*?\]: [^\r\n]*/', ''],
        // Images: keep the alternative text
        ['/\!\[(.*?)\][\[\(].*?[\]\)]/', '$1'],
        // Inline and reference links: keep the link text
        ['/\[(.*?)\][\[\(].*?[\]\)]/', '$1'],
        // Blockquote marker at the start of a line
        ['/^[^\S\r\n]{0,3}>[^\S\r\n]?/m', ''],
        // Indented reference-style link definitions. Largely redundant with
        // the link reference definition rule above, kept as a safety net.
        ['/^[^\S\r\n]{1,2}\[.*?\]: \S+(?: ".*?")?[^\S\r\n]*(?=\r?$)/m', ''],

        // Atx-style headers
        //@TODO find a way to merge the two regex below
        // "## Heading ##": drop the opening and the closing hashes
        ['/^#{1,6}[^\S\r\n]+(.*)[^\S\r\n]+#{1,6}(?=\r?$)/m', '$1'],
        // "## Heading": drop the opening hashes
        ['/^#{1,6}[^\S\r\n]+/m', ''],

        // Emphasis; listed twice on purpose so that nested emphasis
        // (e.g. bold inside italic) is unwrapped as well
        ['/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2'],
        ['/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2'],

        // "*" list item bullets; the indentation in front of them is kept
        ['/^([^\S\r\n]*)\*[^\S\r\n]/m', '$1'],
        // Backtick code fences: drop the fence line content, keep the line
        ['/^`{3,}[^\r\n]*/m', ''],
        // Inline code: keep the code, drop the backticks
        ['/`(.+?)`/', '$1'],
    ];

    /**
     * Returns a copy of the given text with the Markdown markup removed.
     *
     * @param TextInterface $text text whose content is Markdown
     *
     * @return TextInterface new text built from the stripped content and the
     *                       context of the given text
     */
    public function process(TextInterface $text): TextInterface
    {
        $output = $text->getContent();

        foreach (self::RULES as [$pattern, $replacement]) {
            $output = \Safe\preg_replace($pattern, $replacement, $output);
        }

        return Text::utf8($output, $text->getContext());
    }
}
64
|
|
|
|