Failed Conditions
Push — master ( 01d6ae...9ca6d9 )
by Philippe
534:14 queued 469:10
created

MarkdownRemover::process()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 46
Code Lines 20

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 21
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 20
nc 1
nop 1
dl 0
loc 46
rs 9.6
c 0
b 0
f 0
ccs 21
cts 21
cp 1
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace PhpSpellcheck\TextProcessor;
6
7
use PhpSpellcheck\Text;
8
use PhpSpellcheck\TextInterface;
9
10
/**
 * Removes markdown while keeping original lines of the text
 * to make spellchecking relevant.
 *
 * The text is passed through an ordered list of regular-expression
 * substitutions. The order matters: some rules would otherwise match
 * the leftovers of, or conflict with, another rule.
 */
class MarkdownRemover implements TextProcessorInterface
{
    /**
     * Strips Markdown (and inline HTML) syntax from the content of $text.
     *
     * @param TextInterface $text text whose content may contain Markdown
     *
     * @return TextInterface a new text built with Text::utf8() from the
     *                       stripped content and the context of $text
     */
    public function process(TextInterface $text): TextInterface
    {
        // Horizontal rules (stripListHeaders conflict with this rule, which is why it has been moved to the top)
        $output = \Safe\preg_replace('/^(-\s*?|\*\s*?|_\s*?){3,}(\s*)$/m', PHP_EOL . '$2', $text->getContent());

        // Github Flavored Markdown
        // Header: drop the "====" underline but keep the line break.
        // The replacement must be double-quoted: '\n' in single quotes is a
        // literal backslash followed by "n", not a line feed.
        $output = \Safe\preg_replace('/\n={2,}/', "\n", $output);
        // Fenced codeblocks
        //@TODO parse programming language comments from codeblock instead of removing whole block
        $output = \Safe\preg_replace('/~{3}.*\n/', '', $output);
        // Strikethrough
        $output = \Safe\preg_replace('/~~/', '', $output);

        // Common Markdown
        // Remove HTML tags
        $output = \Safe\preg_replace('/<[^>]*>/', '', $output);

        // The line-anchored rules below need the "m" modifier so that ^ and $
        // match at every line, not only at the start/end of the whole text.
        // They use [^\S\r\n] (whitespace except line breaks) next to the
        // anchors so that a match never swallows a line break.

        // Remove setext-style headers
        $output = \Safe\preg_replace('/^[=\-]{2,}[^\S\r\n]*$/m', '', $output);
        // Remove footnotes?
        $output = \Safe\preg_replace('/\[\^.+?\](\: .*?$)?/m', '', $output);
        $output = \Safe\preg_replace('/[^\S\r\n]{0,2}\[.*?\]: .*?$/m', '', $output);
        // Remove images (keep the alternative text)
        $output = \Safe\preg_replace('/\!\[(.*?)\][\[\(].*?[\]\)]/', '$1', $output);
        // Remove inline links (keep the link text)
        $output = \Safe\preg_replace('/\[(.*?)\][\[\(].*?[\]\)]/', '$1', $output);
        // Remove blockquotes
        $output = \Safe\preg_replace('/^[^\S\r\n]{0,3}>[^\S\r\n]?/m', '', $output);
        // Remove reference-style links?
        $output = \Safe\preg_replace('/^[^\S\r\n]{1,2}\[(.*?)\]: (\S+)( ".*?")?[^\S\r\n]*$/m', '', $output);

        // Remove atx-style headers
        //@TODO find a way to merge the two regex below
        // remove ## Heading ##
        $output = \Safe\preg_replace('/^#{1,6}\s+(.*)(\s+#{1,6})$/m', '$1', $output);
        // remove ## Heading
        $output = \Safe\preg_replace('/^#{1,6}\s+(.*)$/m', '$1', $output);

        // Remove emphasis (repeat the line to remove double emphasis)
        $output = \Safe\preg_replace('/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2', $output);
        $output = \Safe\preg_replace('/([\*_]{1,3})(\S.*?\S{0,1})\1/', '$2', $output);

        // Remove list items (only the "* " bullet, leading indentation is kept)
        $output = \Safe\preg_replace('/^([^\S\r\n]*)\*\s/m', '$1', $output);
        // Remove code blocks (the backtick fence lines)
        $output = \Safe\preg_replace('/^`{3,}(.*)*$/m', '', $output);
        // Remove inline code (keep the code text)
        $output = \Safe\preg_replace('/`(.+?)`/', '$1', $output);

        return Text::utf8($output, $text->getContext());
    }
}
64