HTMLTextExtractor::getContent() - Code Metrics - Inspection of "fix(PDFTextExtractor): Added support for Windows,..." - silverstripe/silverstripe-textextraction - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#28)

unknown

created 2016-05-13 05:17 UTC

HTMLTextExtractor::getContent() B

↳ Parent: HTMLTextExtractor

Complexity

Conditions	1
Paths	1

Size

Total Lines	33
Code Lines	24

Duplication

Lines	0
Ratio	0 %

Importance

Changes	5
Bugs	0	Features	1

Metric	Value
c	5
b	0
f	1
dl	0
loc	33
rs	8.8571
cc	1
eloc	24
nc	1
nop	1

<?php

/**
 * Text extractor for HTML documents built on PHP's strip_tags().
 * OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because it's not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var int
     */
    private static $priority = 10;

    /**
     * This extractor only calls PHP built-in functions, so it reports itself as always usable.
     *
     * @return bool
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * Checks (case-insensitively) whether the file extension is one of html, htm or xhtml.
     *
     * @param string $extension File extension without the leading dot
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array("html", "htm", "xhtml"),
            true
        );
    }

    /**
     * Checks (case-insensitively) whether the mime type is exactly "text/html".
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts the text content of an HTML file by using strip_tags()
     * combined with regular expressions that remove non-content elements like <style> or <script>
     * and insert a line break in front of every opening and closing block-level tag,
     * so that text from neighbouring blocks does not run together once the tags are gone.
     *
     * @param string $path Path of the HTML file to read
     * @return string The extracted text; an empty string if the file could not be read or processed
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false) {
            // Unreadable file: there is nothing to extract.
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Elements whose content is never visible text. Each match is replaced by a single space.
        $invisiblePatterns = array(
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?.*?</script>@siu',
            '@<object[^>]*?.*?</object>@siu',
            '@<embed[^>]*?.*?</embed>@siu',
            '@<applet[^>]*?.*?</applet>@siu',
            '@<noframes[^>]*?.*?</noframes>@siu',
            '@<noscript[^>]*?.*?</noscript>@siu',
            '@<noembed[^>]*?.*?</noembed>@siu',
        );

        // Start of an opening or closing block-level tag. A line break is put in front of each match.
        $blockPatterns = array(
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        );

        $patterns = array_merge($invisiblePatterns, $blockPatterns);

        // Build exactly one replacement per pattern. "\n$0" re-emits the matched tag start
        // behind a newline; a bare "$0" would replace the match with itself and add nothing.
        $replacements = array_merge(
            array_fill(0, count($invisiblePatterns), ' '),
            array_fill(0, count($blockPatterns), "\n\$0")
        );

        $stripped = preg_replace($patterns, $replacements, $content);

        if ($stripped === null) {
            // preg_replace() returns null on error. With the "u" modifier that includes
            // subjects that are not valid UTF-8, which would discard the whole document.
            // All patterns are plain ASCII, so retry in byte mode without the trailing "u".
            $bytePatterns = array();
            foreach ($patterns as $pattern) {
                $bytePatterns[] = rtrim($pattern, 'u');
            }
            $stripped = preg_replace($bytePatterns, $replacements, $content);
        }

        // If the retry failed as well, the cast yields an empty string.
        return strip_tags((string) $stripped);
    }
}


1			<?php
2
3			/**
4			* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
5			* @author mstephens
6			*
7			*/
8			class HTMLTextExtractor extends FileTextExtractor
			0 ignored issues – show Coding Style Compatibility introduced 2015-11-20 23:53 UTC by Report Bug Copy Issue Report PSR1 recommends that each class must be in a namespace of at least one level to avoid collisions. You can fix this by adding a namespace to your class: namespace YourVendor; class YourClass { } When choosing a vendor namespace, try to pick something that is not too generic to avoid conflicts with other libraries. Loading history...
9			{
10			public function isAvailable()
11			{
12			return true;
13			}
14
15			public function supportsExtension($extension)
16			{
17			return in_array(
18			strtolower($extension),
19			array("html", "htm", "xhtml")
20			);
21			}
22
23			public function supportsMime($mime)
24			{
25			return strtolower($mime) === 'text/html';
26			}
27
28			/**
29			* Lower priority because its not the most clever HTML extraction. If there is something better, use it
30			*
31			* @config
32			* @var integer
33			*/
34			private static $priority = 10;
			0 ignored issues – show Comprehensibility introduced 2015-11-20 23:53 UTC by Report Bug Copy Issue Report Consider using a different property name as you override a private property of the parent class. Loading history... Unused Code introduced 2015-11-20 23:53 UTC by Report Bug Copy Issue Report The property `$priority` is not used and could be removed. This check marks private properties in classes that are never used. Those properties can be removed. Loading history...
35
36			/**
37			* Extracts content from regex, by using strip_tags()
38			* combined with regular expressions to remove non-content tags like <style> or <script>,
39			* as well as adding line breaks after block tags.
40			*
41			* @param string $path
42			* @return string
43			*/
44			public function getContent($path)
45			{
46			$content = file_get_contents($path);
47			// Yes, yes, regex'ing HTML is evil.
48			// Since we don't care about well-formedness or markup here, it does the job.
49			$content = preg_replace(
50			array(
51			// Remove invisible content
52			'@<head[^>]*?>.*?</head>@siu',
53			'@<style[^>]*?>.*?</style>@siu',
54			'@<script[^>]*?.*?</script>@siu',
55			'@<object[^>]*?.*?</object>@siu',
56			'@<embed[^>]*?.*?</embed>@siu',
57			'@<applet[^>]*?.*?</applet>@siu',
58			'@<noframes[^>]*?.*?</noframes>@siu',
59			'@<noscript[^>]*?.*?</noscript>@siu',
60			'@<noembed[^>]*?.*?</noembed>@siu',
61			// Add line breaks before and after blocks
62			'@</?((address)|(blockquote)|(center)|(del))@iu',
63			'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
64			'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
65			'@</?((table)|(th)|(td)|(caption))@iu',
66			'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
67			'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
68			'@</?((frameset)|(frame)|(iframe))@iu',
69			),
70			array(
71			' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
72			),
73			$content
74			);
75			return strip_tags($content);
76			}
77			}
78

silverstripe / silverstripe-textextraction

Pull Request — master (#28)

HTMLTextExtractor::getContent() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like