Completed
Pull Request — feat/html-splitter (#180)
by Nuno
20:06 queued 15:50
created

HtmlSplitter::importanceWeight()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 16
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 4.0218

Importance

Changes 0
Metric Value
cc 4
eloc 9
nc 4
nop 2
dl 0
loc 16
ccs 8
cts 9
cp 0.8889
crap 4.0218
rs 9.9666
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
/**
6
 * This file is part of Scout Extended.
7
 *
8
 * (c) Algolia Team <[email protected]>
9
 *
10
 *  For the full copyright and license information, please view the LICENSE
11
 *  file that was distributed with this source code.
12
 */
13
14
namespace Algolia\ScoutExtended\Splitters;
15
16
use DOMXPath;
17
use DOMDocument;
18
use Algolia\ScoutExtended\Contracts\SplitterContract;
19
20
/**
21
 * Class HtmlSplitter.
22
 */
23
class HtmlSplitter implements SplitterContract
24
{
25
    /**
26
     * The list of HTML tags to split on; a tag's position in the list is its weight.
27
     *
28
     * @var string[]
29
     */
30
    protected $nodes = [
31
        'h1',
32
        'h2',
33
        'h3',
34
        'h4',
35
        'h5',
36
        'h6',
37
        'p',
38
    ];
39
40
    /**
41
     * Key under which the importance score is stored in a record.
42
     *
43
     * @var string
44
     */
45
    private const IMPORTANCE = 'importance';
46
47
    /**
48
     * Tag name of paragraphs, whose importance is computed differently from other tags.
49
     *
50
     * @var string
51
     */
52
    private const PARAGRAPH = 'p';
53
54
    /**
     * Creates a new instance of the class.
     *
     * @param string[]|null $nodes Tags to split on, in ranking order. Null keeps the default list.
     */
    public function __construct(?array $nodes = null)
    {
        if ($nodes !== null) {
            // Weights are derived from each tag's position, which is cast to int.
            // Re-index so that string or sparse keys cannot distort them.
            $this->nodes = array_values($nodes);
        }
    }
67
68
    /**
     * Returns the weight of an object, i.e. the position of its tag in the list of tags.
     *
     * The tag is the current key of the given array. A tag that is not in
     * the list yields 0, because the failed lookup is cast to int.
     *
     * @param array<string, string> $object
     *
     * @return int
     */
    public function findWeight(array $object): int
    {
        $tag = key($object);
        $position = array_search($tag, $this->nodes, true);

        return (int) $position;
    }
79
80
    /**
     * Appends an object to the queue.
     *
     * Entries at the end of the queue whose weight is greater than or equal
     * to the weight of the new object are removed first, so the new object
     * always ends up right after an entry that weighs less (or alone).
     *
     * @param array<string, string> $object
     * @param array<int, array<string, string>> $queue
     *
     * @return array<int, array<string, string>>
     */
    public function addObjectToQueue(array $object, array $queue): array
    {
        // Unwind the queue until it is empty or its last entry weighs
        // strictly less than the incoming object.
        while (count($queue) !== 0 && $this->findWeight($object) <= $this->findWeight(end($queue))) {
            array_pop($queue);
        }

        $queue[] = $object;

        return $queue;
    }
106
107
    /**
     * Importance formula.
     * Give integer from tags ranking.
     *
     * Any tag other than a paragraph scores its position in the list of tags.
     * A paragraph scores (number of tags - 1) plus the position of the tag of
     * the last queue entry, or of the entry before it when the last entry is
     * itself a paragraph. A paragraph with no such entry (empty queue, or a
     * queue holding nothing but a paragraph) scores 0.
     *
     * @param \DOMElement $node
     * @param array<int, array<string, string>> $queue
     *
     * @return int
     */
    public function importanceWeight(\DOMElement $node, array $queue): int
    {
        if ($node->nodeName !== self::PARAGRAPH) {
            return (int) array_search($node->nodeName, $this->nodes, true);
        }

        $parent = end($queue);

        if (empty($parent)) {
            return 0;
        }

        if (key($parent) === self::PARAGRAPH) {
            // The queue ends with the previous paragraph: look one entry back.
            $parent = prev($queue);

            // prev() returns false when that paragraph was the only entry, and
            // key() rejects a non-array argument. Score it like an empty queue.
            if (! is_array($parent)) {
                return 0;
            }
        }

        return (count($this->nodes) - 1) + (int) array_search(key($parent), $this->nodes, true);
    }
133
134
    /**
     * Flattens the collected objects into records.
     *
     * Every key/value pair is copied into the record being built. Reaching
     * the importance key closes that record and starts a new one.
     *
     * @param array<int, array<int, array<string, string|int>>> $objects
     *
     * @return array<int, array<string, string|int>>
     */
    public function cleanRecords(array $objects): array
    {
        $records = [];
        $current = [];

        foreach ($objects as $chain) {
            foreach ($chain as $fragment) {
                foreach ($fragment as $name => $content) {
                    $current[$name] = $content;

                    if ($name !== self::IMPORTANCE) {
                        continue;
                    }

                    // The importance entry marks the end of a record.
                    $records[] = $current;
                    $current = [];
                }
            }
        }

        return $records;
    }
159
160
    /**
     * Clean Content from Html tag.
     * Collapses every run of whitespace (spaces, tabs, line breaks) into a
     * single space and trims both ends.
     *
     * @param string $content
     *
     * @return string
     */
    public function cleanContent(string $content): string
    {
        // The pattern already covers line breaks. The former
        // str_replace('\n', ...) used single quotes, so it removed the literal
        // two-character text "\n" from the content rather than newlines.
        $collapsed = preg_replace('/\s+/', ' ', $content);

        // preg_replace() returns null on failure; keep the original text then.
        return trim($collapsed ?? $content);
    }
172
173
    /**
     * Static factory: builds a splitter for the given tag or tags.
     *
     * A single tag name is wrapped into a one-element list.
     *
     * @param string|array<string> $tags
     *
     * @return static
     */
    public static function by($tags)
    {
        $nodes = (array) $tags;

        return new static($nodes);
    }
184
185
    /**
     * Splits the given value.
     *
     * Parses the value as HTML and builds one record per element matching
     * one of the configured tags. Each record holds the current queue of
     * tags and their text, plus the importance score of the element.
     *
     * @param object $searchable Not used by this splitter.
     * @param string|null $value HTML markup, expected to be UTF-8 encoded.
     *
     * @return array
     */
    public function split($searchable, $value): array
    {
        // Nothing to parse, or no tag to look for ('//' alone is not a valid
        // XPath expression).
        if ($value === null || $value === '' || count($this->nodes) === 0) {
            return [];
        }

        $dom = new DOMDocument();

        // Let libxml collect parse errors (unknown tags, sloppy markup)
        // internally rather than raising warnings. This replaces the former
        // empty catch block around loadHTML().
        $previous = libxml_use_internal_errors(true);

        try {
            // Without an encoding hint loadHTML() reads the markup as
            // ISO-8859-1 and garbles multi-byte UTF-8 characters.
            $dom->loadHTML('<?xml encoding="UTF-8">'.$value);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $xpath = new DOMXPath($dom);
        $queue = [];
        $objects = [];
        $xpathQuery = '//'.implode(' | //', $this->nodes);
        $nodes = $xpath->query($xpathQuery);

        foreach ($nodes as $node) {
            $content = $this->cleanContent($node->textContent);
            $object = [$node->nodeName => $content];

            // The importance is computed against the queue as it was before
            // the current element is pushed onto it.
            $importance = $this->importanceWeight($node, $queue);
            $queue = $this->addObjectToQueue($object, $queue);

            $objects[] = array_merge($queue, [[self::IMPORTANCE => $importance]]);
        }

        return $this->cleanRecords($objects);
    }
219
}
220