Completed
Push — master ( 641102...2ac5a6 )
by Nicola
20:54
created

Tocifier::prependAnchor()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 7
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 5
nc 1
nop 3
1
<?php
2
3
namespace eNTiDi\Autotoc;
4
5
use DOMDocument;
6
use DOMElement;
7
use DOMXPath;
8
9
/*
10
 * Tocifier is intentionally decoupled from SilverStripe to be able to
11
 * test it without needing to put all the test infrastructure up.
12
 */
13
class Tocifier
{
    /**
     * Prefix prepended to every generated URL fragment (e.g. "TOC-1").
     *
     * @var string
     */
    public static $prefix = 'TOC-';

    /**
     * The original HTML, exactly as passed to the constructor.
     *
     * @var mixed
     */
    private $raw_html = '';

    /**
     * $raw_html augmented for proper navigation (filled by process()).
     *
     * @var string
     */
    private $html = '';

    /**
     * The most recently generated TOC tree (root node), or null if
     * process() has not been successfully run yet.
     *
     * @var array|null
     */
    private $tree;

    /**
     * References to the potential parents, indexed by nesting level.
     * Keys are always kept in ascending order by newNode().
     *
     * @var array
     */
    private $dangling = [];

    /**
     * Callback used for augmenting a single DOMElement.
     *
     * @var callable
     */
    private $augment_callback;


    /**
     * Get the TOC node closest to a given nesting level.
     *
     * Walks from $level - 1 down to 0 and returns (by reference) the
     * first dangling parent found. The root node lives at level 0, so
     * a parent is always found for any heading level >= 1.
     *
     * @param  int $level  The requested nesting level.
     * @return array
     * @throws \LogicException if no parent exists (the root is missing).
     */
    private function &getParent($level)
    {
        while (--$level >= 0) {
            if (isset($this->dangling[$level])) {
                return $this->dangling[$level];
            }
        }

        // This should never be reached: a by-reference function must
        // not fall off its end, so fail loudly instead.
        throw new \LogicException('No dangling parent found: the TOC root node is missing');
    }

    /**
     * Get the plain text content from a DOM element.
     *
     * Any <small> descendant is dropped before the text is extracted.
     * The original element is left untouched.
     *
     * @param  DOMElement $tag  The DOM element to inspect.
     * @return string
     */
    private function getPlainText(DOMElement $tag)
    {
        // Work on a copy
        $clone = $tag->cloneNode(true);

        // Strip unneeded tags (<small>): re-query after every removal
        // so nested <small> elements are handled too
        while (($smalls = $clone->getElementsByTagName('small')) && $smalls->length) {
            $small = $smalls->item(0);
            $small->parentNode->removeChild($small);
        }

        return $clone->textContent;
    }

    /**
     * Create a new TOC node.
     *
     * The node is also registered as the dangling parent for $level,
     * and every dangling parent deeper than $level is forgotten.
     *
     * @param  string $id     Node id, used for anchoring
     * @param  string $text   Title text
     * @param  int    $level  The nesting level of the node
     * @return array
     */
    private function &newNode($id, $text, $level)
    {
        $node = [
            'id'    => $id,
            'title' => $text,
        ];

        // Clear the trailing dangling parents after level, if any.
        // Keys are kept in ascending order, so the last key is the
        // deepest level currently registered.
        end($this->dangling);
        $last = key($this->dangling);
        for ($n = $level + 1; $n <= $last; ++$n) {
            unset($this->dangling[$n]);
        }

        // Consider this node a potential dangling parent
        $this->dangling[$level] = & $node;

        return $node;
    }

    /**
     * Process the specific document.
     *
     * Builds the TOC tree from every <h1>..<h6> element that has no
     * "data-hide-from-toc" attribute, calls the augment callback on
     * each of them and stores the resulting HTML.
     *
     * @param  DOMDocument $doc  The document to process.
     */
    private function processDocument($doc)
    {
        $this->tree = & $this->newNode(self::$prefix, '', 0);
        $n = 1;

        $xpath = new DOMXPath($doc);
        // translate() maps the digits 1-6 to ".", so only h1..h6 match
        $query = '//*[translate(name(), "123456", "......") = "h."][not(@data-hide-from-toc)]';

        foreach ($xpath->query($query) as $h) {
            $text = $this->getPlainText($h);
            $level = (int) substr($h->tagName, 1);
            $id = self::$prefix.$n;
            ++$n;

            // Build the tree: the parent must be looked up before
            // newNode() alters the dangling parents
            $parent = & $this->getParent($level);
            $node = & $this->newNode($id, $text, $level);
            if (!isset($parent['children'])) {
                $parent['children'] = [];
            }
            $parent['children'][] = & $node;

            call_user_func($this->augment_callback, $doc, $h, $id);
        }

        // Serialize the <body> and strip its wrapping tags, so only
        // the (augmented) chunk is kept
        $body = $doc->getElementsByTagName('body')->item(0);
        $this->html = str_replace(["<body>\n", '<body>', '</body>'], '', $doc->saveHTML($body));
    }

    /**
     * Debug function for dumping a TOC node and its children.
     *
     * @param  array  $node    The TOC node to dump
     * @param  string $indent  Indentation string.
     */
    private function dumpBranch($node, $indent = '')
    {
        echo $indent.$node['title']."\n";
        if (isset($node['children'])) {
            foreach ($node['children'] as $child) {
                $this->dumpBranch($child, "$indent\t");
            }
        }
    }


    /**
     * Create a new Tocifier instance.
     *
     * A string containing the HTML to parse for TOC must be passed
     * in. The real processing will be triggered by the process()
     * method.
     *
     * Parsing a file can be easily performed by using
     * file_get_contents():
     *
     * <code>
     * $tocifier = new Tocifier(@file_get_contents($file));
     * </code>
     *
     * @param string $html A chunk of valid HTML (UTF-8 encoded).
     */
    public function __construct($html)
    {
        $this->raw_html = $html;
        $this->setAugmentCallback([static::class, 'setId']);
    }

    /**
     * Change the augment method used by this Tocifier instance.
     *
     * By default the HTML is augmented with Tocifier::setId(), which
     * directly sets the ID on the destination elements. This behavior
     * can be changed by using Tocifier::prependAnchor() (that prepends
     * an anchor before every valid destination) or by providing your
     * own callback.
     *
     * The signature of the callback to pass in should be compatible
     * with:
     *
     *     function callback(DOMDocument $dom, DOMElement $element, $id)
     *
     * @param callable $callback  The new function to call for
     *                            augmenting DOMElement
     */
    public function setAugmentCallback($callback)
    {
        $this->augment_callback = $callback;
    }

    /**
     * Parse and process the HTML chunk.
     *
     * The parsing phase involves picking up all the HTML header
     * elements (from <h1> to <h6>). If the HTML is not a non-empty
     * string or the parser rejects it, this function will fail.
     *
     * @return boolean true on success, false on errors.
     */
    public function process()
    {
        // Check if $this->raw_html is valid. Do not use empty() here:
        // it would wrongly reject the perfectly valid chunk "0".
        if (!is_string($this->raw_html) || $this->raw_html === '') {
            return false;
        }

        // DOMDocument::loadHTML() does not assume UTF-8 when no
        // charset is declared, so every non-ASCII character is turned
        // into a numeric entity before parsing. This replaces the
        // 'HTML-ENTITIES' mbstring conversion, deprecated in PHP 8.2.
        $html = mb_encode_numericentity(
            $this->raw_html,
            [0x80, 0x10FFFF, 0, 0x1FFFFF],
            'UTF-8'
        );

        // Parse the HTML into a DOMDocument tree, collecting libxml
        // diagnostics internally instead of silencing them with "@"
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            return false;
        }

        // Process the doc
        $this->processDocument($doc);
        return true;
    }

    /**
     * Get the TOC (Table Of Contents) from the provided HTML.
     *
     * The HTML must be provided through the constructor and process()
     * must have been called.
     *
     * The TOC is represented in the form of:
     *
     * <code>
     * array(
     *     array('id'       => 'TOC-1',
     *           'title'    => 'Item 1',
     *           'children' => array(
     *               array('id'       => 'TOC-2',
     *                     'title'    => 'Subitem 1.1'
     *               ),
     *               array('id'       => 'TOC-3',
     *                     'title'    => 'Subitem 1.2',
     *                     'children' => array(
     *                         array('id'       => 'TOC-4',
     *                               'title'    => 'Subsubitem 1.2.1'
     *     ))))),
     *     array('id'       => 'TOC-5',
     *           'title'    => 'Item 2',
     *           'children' => array(
     *               array('id'       => 'TOC-6',
     *                     'title'    => 'Subitem 2.1'
     *               ),
     *               array('id'       => 'TOC-7',
     *                     'title'    => 'Subitem 2.2'
     * ))));
     * </code>
     *
     * The TOC is cached, so subsequent calls will return the same tree.
     *
     * @return array An array representing the TOC. A valid array is
     *               always returned (empty if nothing was processed
     *               or no heading was found).
     */
    public function getTOC()
    {
        return isset($this->tree['children']) ? $this->tree['children'] : [];
    }

    /**
     * Get the HTML augmented for proper navigation.
     *
     * The HTML must be provided through the constructor and process()
     * must have been called, otherwise an empty string is returned.
     * The returned string is cached, so subsequent calls will return
     * the same string without further processing.
     *
     * @return string The augmented HTML.
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Dump the TOC to stdout for debugging purpose.
     *
     * Nothing is printed if no TOC tree has been generated yet.
     */
    public function dumpTOC()
    {
        if (!is_array($this->tree)) {
            return;
        }
        $this->dumpBranch($this->tree);
    }

    /**
     * Augment a DOMElement by prepending an anchor.
     *
     * An HTML fragment such as:
     *
     *     <h1>First</h1>
     *     <h2>Second</h2>
     *
     * will become:
     *
     *     <a id="TOC-1" class="anchor"></a><h1>First</h1>
     *     <a id="TOC-2" class="anchor"></a><h2>Second</h2>
     *
     * @param DOMDocument $dom      The DOM owning $element
     * @param DOMElement  $element  The element to augment
     * @param string      $id       The destination ID
     */
    public static function prependAnchor(DOMDocument $dom, DOMElement $element, $id)
    {
        $anchor = $dom->createElement('a');
        $anchor->setAttribute('id', $id);
        $anchor->setAttribute('class', 'anchor');
        $element->parentNode->insertBefore($anchor, $element);
    }

    /**
     * Augment a DOMElement by setting its ID.
     *
     * An HTML fragment such as:
     *
     *     <h1>First</h1>
     *     <h2 class="big">Second</h2>
     *
     * will become:
     *
     *     <h1 id="TOC-1" class="anchor">First</h1>
     *     <h2 class="big anchor" id="TOC-2">Second</h2>
     *
     * Classes already present on the element are preserved: "anchor"
     * is appended only if not already there.
     *
     * @param DOMDocument $dom      The DOM owning $element (unused,
     *                              required by the callback signature)
     * @param DOMElement  $element  The element to augment
     * @param string      $id       The destination ID
     */
    public static function setId(DOMDocument $dom, DOMElement $element, $id)
    {
        $element->setAttribute('id', $id);

        // getAttribute() returns '' when the attribute is missing
        $classes = preg_split('/\s+/', $element->getAttribute('class'), -1, PREG_SPLIT_NO_EMPTY);
        if (!in_array('anchor', $classes, true)) {
            $classes[] = 'anchor';
        }
        $element->setAttribute('class', implode(' ', $classes));
    }
}
336