HtmlToMarkdownTransformer::transform()   B
last analyzed

Complexity

Conditions 7
Paths 11

Size

Total Lines 71

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 23
CRAP Score 7.1591

Importance

Changes 0
Metric Value
dl 0
loc 71
ccs 23
cts 27
cp 0.8519
rs 7.6993
c 0
b 0
f 0
cc 7
nc 11
nop 1
crap 7.1591

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

- Extract Method

1
<?php
2
3
namespace TreeHouse\IoBundle\Item\Modifier\Data\Transformer;
4
5
use Markdownify\Converter;
6
use TreeHouse\Feeder\Exception\TransformationFailedException;
7
use TreeHouse\Feeder\Modifier\Data\Transformer\TransformerInterface;
8
9
/**
 * Transforms an HTML fragment into Markdown.
 *
 * The value is cleaned up (non-breaking spaces, leading whitespace), run
 * through HTMLPurifier, normalized with a series of regular expressions,
 * converted with Markdownify and finally tidied up (bullets, trailing
 * whitespace, excessive newlines).
 */
class HtmlToMarkdownTransformer implements TransformerInterface
{
    /**
     * @var Converter
     */
    protected $converter;

    /**
     * @var \HTMLPurifier
     */
    protected $purifier;

    /**
     * @param Converter     $converter Converts the prepared HTML to Markdown
     * @param \HTMLPurifier $purifier  Purifies the HTML before it is converted
     */
    public function __construct(Converter $converter, \HTMLPurifier $purifier)
    {
        $this->converter = $converter;
        $this->purifier = $purifier;
    }

    /**
     * Converts the given HTML to Markdown.
     *
     * `null` is returned untouched, other scalars are cast to string first.
     *
     * @param mixed $value
     *
     * @throws TransformationFailedException When the value is neither null nor scalar,
     *                                       or when one of the regular expressions fails
     *
     * @return string|null
     */
    public function transform($value)
    {
        if (is_null($value)) {
            return $value;
        }

        if (is_scalar($value)) {
            $value = (string) $value;
        }

        if (!is_string($value)) {
            throw new TransformationFailedException(
                sprintf('Expected a string to transform, got %s instead', json_encode($value))
            );
        }

        $html = $this->prepareHtml($value);

        // Errors raised by the converter are deliberately suppressed (best effort):
        // the original implementation did the same, so this is kept as-is.
        $markdown = @$this->converter->parseString($html);

        return $this->tidyMarkdown($markdown);
    }

    /**
     * Cleans and normalizes the HTML so it converts to tidy Markdown.
     *
     * @param string $html
     *
     * @throws TransformationFailedException
     *
     * @return string
     */
    private function prepareHtml($html)
    {
        // replace non-breaking spaces, somehow this results in a question mark when markdownifying
        $html = str_replace(['&nbsp;', "\xC2\xA0"], ' ', $html);

        // remove leading spaces/tabs
        $html = $this->replace('/^[ \t]+/m', '', $html);

        // purify the html first
        $html = $this->purifier->purify($html);

        // perform some replacements, the order matters here
        $replacements = [
            // remove whitespace/newlines between tags: this can cause
            // trailing whitespace after markdownifying
            [['/>\s+</', '/\s+<\//'], ['><', '</']],
            // also remove whitespace/newlines around <br> tags
            [['/\s+<br\/?>/', '/<br\/?>\s+/'], '<br>'],
            // replace newlines with <br> if the newline is not between 2 tags
            ['/([^>])\n([^<])/', '\\1<br>\\2'],
            // remove <br>'s at the beginning of a paragraph
            ['/(<(p|li)>)<br\s?\/?>/i', '\\1'],
            // remove <br>'s at the end of a paragraph
            ['/<br\s?\/?>(<\/(p|li)>)/i', '\\1'],
        ];

        foreach ($replacements as list($search, $replace)) {
            $html = $this->replace($search, $replace, $html);
        }

        $html = $this->stripTagsInHeadings($html);

        // Remove any double bullets: a literal "*" or "-" directly after <li>.
        // The character class intentionally holds only these two characters; a
        // "|" inside a character class is a literal pipe, not an alternation.
        return $this->replace('/(<li>\s*)[*\-]/i', '\\1', $html);
    }

    /**
     * Strips all markup inside <h1>..<h6> elements, turning <br> into a space.
     *
     * @param string $html
     *
     * @throws TransformationFailedException
     *
     * @return string
     */
    private function stripTagsInHeadings($html)
    {
        foreach (range(1, 6) as $headingSize) {
            $pattern = '/(<h' . $headingSize . '>)(.*)(<\/h' . $headingSize . '>)/iU';

            // All three groups always take part in a match, so $matches
            // always contains the full match plus exactly three groups.
            $result = preg_replace_callback($pattern, static function ($matches) {
                return $matches[1] . trim(strip_tags(str_replace('<br>', ' ', $matches[2]))) . $matches[3];
            }, $html);

            if (null === $result) {
                throw new TransformationFailedException(
                    sprintf('Regular expression %s failed with PCRE error code %d', json_encode($pattern), preg_last_error())
                );
            }

            $html = $result;
        }

        return $html;
    }

    /**
     * Normalizes bullets and whitespace in the converted Markdown.
     *
     * @param string $markdown
     *
     * @throws TransformationFailedException
     *
     * @return string
     */
    private function tidyMarkdown($markdown)
    {
        // Fix different types of bullets. What this does is check each line if it starts with any of "-ו○",
        // not followed by another bullet, and normalizes it to "* text".
        $markdown = $this->replace('/^[\-ו○]\s*([^\-ו○])/mu', '* $1', $markdown);

        // Now make sure there's a newline before 2 consecutive lines that start with a bullet.
        // This could lead to superfluous newlines, but they will be corrected later on.
        $markdown = $this->replace('/(\n\* [^\n]+){2,}/', "\n$0", "\n" . $markdown);

        // remove trailing spaces/tabs
        $markdown = $this->replace('/[ \t]+$/m', '', $markdown);

        // remove excessive newlines
        $markdown = $this->replace('/\n{3,}/m', "\n\n", $markdown);

        return trim($markdown);
    }

    /**
     * Wrapper around preg_replace() that fails loudly.
     *
     * preg_replace() returns null when the PCRE engine errors (for example on
     * malformed UTF-8 with the "u" modifier, or when the backtrack limit is hit).
     * Passing that null along would silently produce an empty result.
     *
     * @param string|string[] $pattern
     * @param string|string[] $replacement
     * @param string          $subject
     *
     * @throws TransformationFailedException When the regular expression could not be applied
     *
     * @return string
     */
    private function replace($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);

        if (null === $result) {
            throw new TransformationFailedException(
                sprintf('Regular expression %s failed with PCRE error code %d', json_encode($pattern), preg_last_error())
            );
        }

        return $result;
    }
}
106