HtmlToMarkdownTransformer::transform() - Code Metrics - treehouselabs/io-bundle - Measure and Improve Code Quality continuously with Scrutinizer

HtmlToMarkdownTransformer::transform() B
last analyzed 2020-05-04 12:22 UTC

↳ Parent: HtmlToMarkdownTransformer

Complexity

Conditions	7
Paths	11

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	23
CRAP Score	7.1591

Importance

Changes

Metric	Value
dl	0
loc	71
ccs	23
cts	27
cp	0.8519
rs	7.6993
c	0
b	0
f	0
cc	7
nc	11
nop	1
crap	7.1591

How to fix Long Method

<?php

namespace TreeHouse\IoBundle\Item\Modifier\Data\Transformer;

use Markdownify\Converter;
use TreeHouse\Feeder\Exception\TransformationFailedException;
use TreeHouse\Feeder\Modifier\Data\Transformer\TransformerInterface;

/**
 * Transforms an HTML string into normalized markdown.
 *
 * The value is purified, stripped of whitespace that would otherwise leak into the
 * markdown, run through the converter, and finally has its bullets and blank lines
 * normalized.
 */
class HtmlToMarkdownTransformer implements TransformerInterface
{
    /**
     * @var Converter
     */
    protected $converter;

    /**
     * @var \HTMLPurifier
     */
    protected $purifier;

    /**
     * @param Converter     $converter
     * @param \HTMLPurifier $purifier
     */
    public function __construct(Converter $converter, \HTMLPurifier $purifier)
    {
        $this->converter = $converter;
        $this->purifier = $purifier;
    }

    /**
     * @inheritdoc
     *
     * Null is passed through untouched and scalars are cast to string.
     *
     * @throws TransformationFailedException When the value is neither null nor scalar, or when one of the
     *                                       regular expressions fails (e.g. on invalid UTF-8 input)
     */
    public function transform($value)
    {
        if (is_null($value)) {
            return $value;
        }

        if (is_scalar($value)) {
            $value = (string) $value;
        }

        if (!is_string($value)) {
            throw new TransformationFailedException(
                sprintf('Expected a string to transform, got %s instead', json_encode($value))
            );
        }

        $html = $this->prepareHtml($value);
        $html = $this->stripTagsInHeadings($html);

        // remove any double bullets: a literal "*" or "-" right after <li> would end up as a second bullet.
        // Only those two characters are matched; a "|" is content, not a bullet, and is left alone.
        $html = $this->replace('/(<li>\s*)[*\-]/i', '\\1', $html);

        // convert to markdown
        // NOTE(review): errors are suppressed here as in the original code; presumably the converter
        // raises notices on some input -- confirm before removing the "@".
        $markdown = @$this->converter->parseString($html);

        return $this->cleanMarkdown($markdown);
    }

    /**
     * Purifies the html and removes whitespace/newlines that would end up in the markdown.
     *
     * @param string $html
     *
     * @return string
     */
    private function prepareHtml($html)
    {
        // replace non-breaking spaces, somehow this results in a question mark when markdownifying
        $html = str_replace(['&nbsp;', "\xC2\xA0"], ' ', $html);

        // remove leading spaces/tabs
        $html = $this->replace('/^[ \t]+/m', '', $html);

        // purify the html first
        $html = $this->purifier->purify($html);

        // the order of these replacements matters: each one works on the result of the previous
        $replacements = [
            // remove whitespace/newlines between tags: this can cause trailing whitespace after markdownifying
            [['/>\s+</', '/\s+<\//'], ['><', '</']],
            // also remove whitespace/newlines around <br> tags
            [['/\s+<br\/?>/', '/<br\/?>\s+/'], '<br>'],
            // replace newlines with <br> if the newline is not between 2 tags
            ['/([^>])\n([^<])/', '\\1<br>\\2'],
            // remove <br>'s at the beginning of a paragraph
            ['/(<(p|li)>)<br\s?\/?>/i', '\\1'],
            // remove <br>'s at the end of a paragraph
            ['/<br\s?\/?>(<\/(p|li)>)/i', '\\1'],
        ];

        foreach ($replacements as list($search, $replace)) {
            $html = $this->replace($search, $replace, $html);
        }

        return $html;
    }

    /**
     * Reduces the contents of <h1> to <h6> elements to plain text, turning <br> tags into spaces.
     *
     * @param string $html
     *
     * @throws TransformationFailedException When a regular expression fails
     *
     * @return string
     */
    private function stripTagsInHeadings($html)
    {
        // $matches always holds the full match plus the 3 groups: opening tag, contents, closing tag
        $toPlainText = static function (array $matches) {
            return $matches[1] . trim(strip_tags(str_replace('<br>', ' ', $matches[2]))) . $matches[3];
        };

        foreach (range(1, 6) as $headingSize) {
            $pattern = '/(<h' . $headingSize . '>)(.*)(<\/h' . $headingSize . '>)/iU';
            $result = preg_replace_callback($pattern, $toPlainText, $html);

            if (null === $result) {
                throw $this->createRegexException();
            }

            $html = $result;
        }

        return $html;
    }

    /**
     * Normalizes bullets and whitespace in the converted markdown.
     *
     * @param string $markdown
     *
     * @return string
     */
    private function cleanMarkdown($markdown)
    {
        // Fix different types of bullets. What this does is check each line if it starts with any of "-×•○",
        // not followed by another bullet, and normalizes it to "* text".
        $markdown = $this->replace('/^[\-×•○]\s*([^\-×•○])/mu', '* $1', $markdown);

        // Now make sure there's a newline before 2 consecutive lines that start with a bullet.
        // This could lead to superfluous newlines, but they will be corrected later on.
        $markdown = $this->replace('/(\n\* [^\n]+){2,}/', "\n$0", "\n" . $markdown);

        // remove trailing spaces/tabs
        $markdown = $this->replace('/[ \t]+$/m', '', $markdown);

        // remove excessive newlines
        $markdown = $this->replace('/\n{3,}/m', "\n\n", $markdown);

        return trim($markdown);
    }

    /**
     * Wrapper around preg_replace() that fails loudly: preg_replace() returns null on error, which would
     * otherwise silently turn the whole value into an empty string.
     *
     * @param string|string[] $pattern
     * @param string|string[] $replacement
     * @param string          $subject
     *
     * @throws TransformationFailedException When the regular expression fails
     *
     * @return string
     */
    private function replace($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);

        if (null === $result) {
            throw $this->createRegexException();
        }

        return $result;
    }

    /**
     * Builds the exception for a failed regular expression, including the PCRE error code.
     *
     * @return TransformationFailedException
     */
    private function createRegexException()
    {
        return new TransformationFailedException(
            sprintf('Failed to transform html to markdown: PCRE error code %d', preg_last_error())
        );
    }
}


1		<?php
2
3		namespace TreeHouse\IoBundle\Item\Modifier\Data\Transformer;
4
5		use Markdownify\Converter;
6		use TreeHouse\Feeder\Exception\TransformationFailedException;
7		use TreeHouse\Feeder\Modifier\Data\Transformer\TransformerInterface;
8
9		class HtmlToMarkdownTransformer implements TransformerInterface
10		{
11		/**
12		* @var Converter
13		*/
14		protected $converter;
15
16		/**
17		* @var \HTMLPurifier
18		*/
19		protected $purifier;
20
21		/**
22		* @param Converter $converter
23		* @param \HTMLPurifier $purifier
24		*/
25	16	public function __construct(Converter $converter, \HTMLPurifier $purifier)
26		{
27	16	$this->converter = $converter;
28	16	$this->purifier = $purifier;
29	16	}
30
31		/**
32		* @inheritdoc
33		*/
34	16	public function transform($value)
35		{
36	16	if (is_null($value)) {
37		return $value;
38		}
39
40	16	if (is_scalar($value)) {
41	16	$value = (string) $value;
42		}
43
44	16	if (!is_string($value)) {
45		throw new TransformationFailedException(
46		sprintf('Expected a string to transform, got %s instead', json_encode($value))
47		);
48		}
49
50		// replace non-breaking spaces, somehow this results in a question mark when markdownifying
51	16	$value = str_replace([' ', "\xC2\xA0"], ' ', $value);
52
53		// remove leading spaces/tabs
54	16	$value = preg_replace('/^[ \t]+/m', '', $value);
55
56		// purify the html first
57	16	$value = $this->purifier->purify($value);
58
59		// perform some replacements...
60		$replacements = [
61	16	[['/>\s+</', '/\s+<\//'], ['><', '</']], # remove whitespace/newlines between tags: this can cause
62		# trailing whitespace after markdownifying
63		[['/\s+<br\/?>/', '/<br\/?>\s+/'], '<br>'], # also remove whitespace/newlines around <br> tags
64		['/([^>])\n([^<])/', '\\1<br>\\2'], # replace newlines with <br> if the newline is not between 2 tags
65		['/(<(p|li)>)<br\s?\/?>/i', '\\1'], # remove <br>'s at the beginning of a paragraph
66		['/<br\s?\/?>(<\/(p|li)>)/i', '\\1'], # remove <br>'s at the end of a paragraph
67		];
68
69	16	foreach ($replacements as list($search, $replace)) {
70	16	$value = preg_replace($search, $replace, $value);
71		}
72
73		// strip tags in headings
74	16	foreach(range(1, 6) as $headingSize) {
75	16	$value = preg_replace_callback('/(<h'.$headingSize.'>)(.*)(<\/h'.$headingSize.'>)/iU', function ($matches) {
76	4	if (count($matches) !== 4) {
77		return $matches[0];
78		}
79	4	return $matches[1] . trim(strip_tags(str_replace('<br>', ' ', $matches[2]))) . $matches[3];
80	16	}, $value);
81		}
82
83		// remove any double bullets
84	16	$value = preg_replace('/(<li>\s*)[\*|\-]{1}/im', '\\1', $value);
85
86		// convert to markdown
87	16	$value = @$this->converter->parseString($value);
88
89		// Fix different types of bullets. What this does is check each line if it starts with any of "-×•○",
90		// not followed by another bullet, and normalizes it to "* text".
91	16	$value = preg_replace('/^[\-×•○]\s*([^\-×•○])/mu',  '* $1', $value);
92
93		// Now make sure there's a newline before 2 consecutive lines that start with a bullet.
94		// This could lead to superfluous newlines, but they will be corrected later on.
95	16	$value = preg_replace('/(\n\* [^\n]+){2,}/', "\n$0", "\n" . $value);
96
97		// remove trailing spaces/tabs
98	16	$value = preg_replace('/[ \t]+$/m', '', $value);
99
100		// remove excessive newlines
101	16	$value = preg_replace('/\n{3,}/m', "\n\n", $value);
102
103	16	return trim($value);
104		}
105		}
106

treehouselabs / io-bundle

HtmlToMarkdownTransformer::transform() B last analyzed 2020-05-04 12:22 UTC

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like

HtmlToMarkdownTransformer::transform() B
last analyzed 2020-05-04 12:22 UTC