HTMLTextExtractor::getContent()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 32
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 21
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 32
rs 9.584
1
<?php
2
3
namespace SilverStripe\TextExtraction\Extractor;
4
5
use SilverStripe\Assets\File;
6
7
/**
 * Text extractor for HTML documents that uses the PHP function strip_tags() to get just the text.
 * OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
     *
     * Declared as a config value (see the @config tag); it is not read directly anywhere in this class.
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * Always reports this extractor as usable; the class only calls built-in PHP functions.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * Tells whether the given file extension is one this extractor handles.
     *
     * @param  string $extension Extension without a leading dot; compared case-insensitively
     * @return boolean True for "html", "htm" and "xhtml", false otherwise
     */
    public function supportsExtension($extension)
    {
        // strtolower() always yields a string, so the strict comparison gives the same answers
        // as a loose one while ruling out any type juggling.
        return in_array(strtolower($extension), ['html', 'htm', 'xhtml'], true);
    }

    /**
     * Tells whether the given MIME type is one this extractor handles.
     *
     * @param  string $mime MIME type; compared case-insensitively
     * @return boolean True only for "text/html"
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts the text content of an HTML document.
     *
     * Non-content elements such as <head>, <style> or <script> are replaced (together with their
     * contents) by a single space, a line break is inserted in front of block-level tags so that the
     * text of adjacent blocks does not run together, and the remaining markup is removed with
     * strip_tags().
     *
     * @param  File|string $file A File object, or any value accepted by file_get_contents() (e.g. a path)
     * @return string
     */
    public function getContent($file)
    {
        $content = $file instanceof File ? $file->getString() : file_get_contents($file);
        // file_get_contents() returns false on failure; normalise to a string before any matching.
        $content = (string) $content;

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.
        $cleaned = $this->replaceMarkup($content, 'u');

        if ($cleaned === null) {
            // With the "u" modifier preg_replace() fails (returns null) on input that is not valid
            // UTF-8, e.g. HTML saved in a legacy encoding. The patterns only contain ASCII literals,
            // so they can be retried in byte mode instead of throwing the whole document away.
            $cleaned = $this->replaceMarkup($content, '');
        }

        if ($cleaned === null) {
            // Still failing (some other PCRE error): fall back to plain tag stripping.
            $cleaned = $content;
        }

        return strip_tags($cleaned);
    }

    /**
     * Applies the clean-up patterns used by getContent() to an HTML string.
     *
     * @param  string $html           Markup to process
     * @param  string $extraModifiers PCRE modifiers appended to every pattern ("u" or an empty string)
     * @return string|null            The processed markup, or null when preg_replace() reports an error
     */
    private function replaceMarkup($html, $extraModifiers)
    {
        // Elements whose content is never visible text
        $invisible = [
            '@<head[^>]*?>.*?</head>@si',
            '@<style[^>]*?>.*?</style>@si',
            '@<script[^>]*?.*?</script>@si',
            '@<object[^>]*?.*?</object>@si',
            '@<embed[^>]*?.*?</embed>@si',
            '@<applet[^>]*?.*?</applet>@si',
            '@<noframes[^>]*?.*?</noframes>@si',
            '@<noscript[^>]*?.*?</noscript>@si',
            '@<noembed[^>]*?.*?</noembed>@si',
        ];

        // Start of an opening or closing block-level tag
        $blocks = [
            '@</?((address)|(blockquote)|(center)|(del))@i',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@i',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@i',
            '@</?((table)|(th)|(td)|(caption))@i',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@i',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@i',
            '@</?((frameset)|(frame)|(iframe))@i',
        ];

        $withModifiers = function ($pattern) use ($extraModifiers) {
            return $pattern . $extraModifiers;
        };

        // A single replacement string is applied to every pattern in the list, which keeps the
        // patterns and their replacement from drifting out of step.
        $html = preg_replace(array_map($withModifiers, $invisible), ' ', $html);
        if ($html === null) {
            return null;
        }

        // Put a line break in front of each block tag ("$0" is the matched tag start itself), so
        // that "<p>foo</p><p>bar</p>" does not collapse to "foobar" once the tags are stripped.
        return preg_replace(array_map($withModifiers, $blocks), "\n\$0", $html);
    }
}
91