Completed
Push — master ( f3d072...cf814b )
by Peter
08:38
created

HtmlToMarkdownTransformer   A

Complexity

Total Complexity 8

Size/Duplication

Total Lines 97
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 3

Test Coverage

Coverage 87.5%

Importance

Changes 6
Bugs 3 Features 2
Metric Value
wmc 8
c 6
b 3
f 2
lcom 1
cbo 3
dl 0
loc 97
ccs 35
cts 40
cp 0.875
rs 10

2 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 5 1
C transform() 0 71 7
1
<?php
2
3
namespace TreeHouse\IoBundle\Item\Modifier\Data\Transformer;
4
5
use Markdownify\Converter;
6
use TreeHouse\Feeder\Exception\TransformationFailedException;
7
use TreeHouse\Feeder\Modifier\Data\Transformer\TransformerInterface;
8
9
/**
 * Transforms an HTML fragment into Markdown.
 *
 * The value is purified first, then a series of regular-expression cleanups
 * is applied before and after the actual conversion to keep the resulting
 * Markdown free of stray whitespace, duplicate bullets and excessive newlines.
 */
class HtmlToMarkdownTransformer implements TransformerInterface
{
    /**
     * Converts the cleaned-up HTML to Markdown.
     *
     * @var Converter
     */
    protected $converter;

    /**
     * Purifies the incoming HTML before any replacements are applied.
     *
     * @var \HTMLPurifier
     */
    protected $purifier;

    /**
     * @param Converter     $converter
     * @param \HTMLPurifier $purifier
     */
    public function __construct(Converter $converter, \HTMLPurifier $purifier)
    {
        $this->converter = $converter;
        $this->purifier = $purifier;
    }

    /**
     * Converts the given HTML to Markdown.
     *
     * Null is passed through untouched and scalars are cast to string first.
     *
     * @param mixed $value
     *
     * @throws TransformationFailedException When the value is neither null nor scalar,
     *                                       or when one of the regular expressions fails
     *
     * @return string|null The trimmed Markdown, or null when null was given
     */
    public function transform($value)
    {
        if (is_null($value)) {
            return $value;
        }

        if (is_scalar($value)) {
            $value = (string) $value;
        }

        if (!is_string($value)) {
            throw new TransformationFailedException(
                sprintf('Expected a string to transform, got %s instead', json_encode($value))
            );
        }

        // replace non-breaking spaces, somehow this results in a question mark when markdownifying
        $value = str_replace(['&nbsp;', "\xC2\xA0"], ' ', $value);

        // remove leading spaces/tabs
        $value = $this->pregReplace('/^[ \t]+/m', '', $value);

        // purify the html first
        $value = $this->purifier->purify($value);

        // perform some replacements, the order is significant
        $replacements = [
            // remove whitespace/newlines between tags: this can cause
            // trailing whitespace after markdownifying
            [['/>\s+</', '/\s+<\//'], ['><', '</']],
            // also remove whitespace/newlines around <br> tags; accepts "<br>", "<br/>" and
            // "<br />" so that it agrees with the paragraph patterns below
            [['/\s+<br\s?\/?>/i', '/<br\s?\/?>\s+/i'], '<br>'],
            // replace newlines with <br> if the newline is not between 2 tags
            ['/([^>])\n([^<])/', '\\1<br>\\2'],
            // remove <br>'s at the beginning of a paragraph
            ['/(<(p|li)>)<br\s?\/?>/i', '\\1'],
            // remove <br>'s at the end of a paragraph
            ['/<br\s?\/?>(<\/(p|li)>)/i', '\\1'],
        ];

        foreach ($replacements as list($search, $replace)) {
            $value = $this->pregReplace($search, $replace, $value);
        }

        // strip tags in headings
        foreach (range(1, 6) as $headingSize) {
            $pattern = '/(<h' . $headingSize . '>)(.*)(<\/h' . $headingSize . '>)/iU';

            $value = $this->pregReplaceCallback($pattern, function ($matches) {
                if (count($matches) !== 4) {
                    return $matches[0];
                }

                // turn every flavour of line break into a space before stripping the
                // remaining tags, otherwise the words on either side are glued together
                $text = str_ireplace(['<br>', '<br/>', '<br />'], ' ', $matches[2]);

                return $matches[1] . trim(strip_tags($text)) . $matches[3];
            }, $value);
        }

        // remove any double bullets: a list item that starts with a literal "*" or "-"
        $value = $this->pregReplace('/(<li>\s*)[*\-]/i', '\\1', $value);

        // convert to markdown
        // NOTE(review): errors raised by the converter are suppressed here; the reason is
        // not visible in this class, confirm before removing the suppression.
        $value = @$this->converter->parseString($value);

        // Fix different types of bullets. What this does is check each line if it starts with any of "-ו○",
        // not followed by another bullet, and normalizes it to "* text".
        $value = $this->pregReplace('/^[\-ו○]\s*([^\-ו○])/mu', '* $1', $value);

        // Now make sure there's a newline before 2 consecutive lines that start with a bullet.
        // This could lead to superfluous newlines, but they will be corrected later on.
        $value = $this->pregReplace('/(\n\* [^\n]+){2,}/', "\n$0", "\n" . $value);

        // remove trailing spaces/tabs
        $value = $this->pregReplace('/[ \t]+$/m', '', $value);

        // remove excessive newlines
        $value = $this->pregReplace('/\n{3,}/m', "\n\n", $value);

        return trim($value);
    }

    /**
     * Wraps preg_replace() and turns a PCRE failure into an exception.
     *
     * preg_replace() returns null when the engine errors out; without this
     * check that null would silently end up as an empty result.
     *
     * @param string|string[] $pattern
     * @param string|string[] $replacement
     * @param string          $subject
     *
     * @throws TransformationFailedException
     *
     * @return string
     */
    private function pregReplace($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);

        if (null === $result) {
            throw new TransformationFailedException(
                sprintf('Regular expression replacement failed with PCRE error code %d', preg_last_error())
            );
        }

        return $result;
    }

    /**
     * Wraps preg_replace_callback() and turns a PCRE failure into an exception.
     *
     * @param string   $pattern
     * @param callable $callback
     * @param string   $subject
     *
     * @throws TransformationFailedException
     *
     * @return string
     */
    private function pregReplaceCallback($pattern, callable $callback, $subject)
    {
        $result = preg_replace_callback($pattern, $callback, $subject);

        if (null === $result) {
            throw new TransformationFailedException(
                sprintf('Regular expression replacement failed with PCRE error code %d', preg_last_error())
            );
        }

        return $result;
    }
}
106