Completed
Push — master ( 382be2...94943b )
by Nicola
06:00
created

code/Tocifier.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
/*
4
 * Tocifier is intentionally decoupled from SilverStripe to be able to
5
 * test it without needing to put all the test infrastructure up.
6
 */
7
class Tocifier
{
    // Prefix to prepend to every URL fragment
    public static $prefix = 'TOC-';

    // The original HTML, exactly as passed to the constructor
    private $_raw_html = '';

    // $_raw_html augmented with anchor ids for proper navigation
    private $_html = '';

    // The most recently generated TOC tree (null until process() succeeds)
    private $_tree;

    // Array of references to the potential parents, keyed by nesting level
    private $_dangling = array();


    /**
     * Get the TOC node closest to a given nesting level.
     *
     * Walks the dangling parents from $level - 1 down to 0 and returns
     * (by reference) the first one found. Level 0 always holds the root
     * node once _processDocument() has started, so the search is expected
     * to succeed for any level >= 1.
     *
     * @param  int $level  The requested nesting level.
     * @return array
     */
    private function &_getParent($level)
    {
        while (--$level >= 0) {
            if (isset($this->_dangling[$level])) {
                return $this->_dangling[$level];
            }
        }

        // This should never be reached: fall back to the root node so a
        // by-reference function never ends without returning a variable
        return $this->_tree;
    }

    /**
     * Get the plain text content from a DOM element.
     *
     * Any <small> descendant is dropped before extracting the text. The
     * original element is left untouched because the work is done on a
     * deep clone.
     *
     * @param  DOMElement $tag  The DOM element to inspect.
     * @return string
     */
    private function _getPlainText(DOMElement $tag)
    {
        // Work on a copy
        $clone = $tag->cloneNode(true);

        // Strip unneeded tags (<small>): the node list is live, so it
        // shrinks by itself whenever one of its items is removed
        $smalls = $clone->getElementsByTagName('small');
        while ($smalls->length > 0) {
            $small = $smalls->item(0);
            $small->parentNode->removeChild($small);
        }

        return $clone->textContent;
    }

    /**
     * Create a new TOC node.
     *
     * The node is returned by reference and is also registered as the
     * dangling parent for its level, after discarding every dangling
     * parent registered at a deeper level.
     *
     * @param  string $id     Node id, used for anchoring
     * @param  string $text   Title text
     * @param  int    $level  The nesting level of the node
     * @return array
     */
    private function &_newNode($id, $text, $level)
    {
        $node = array(
            'id'    => $id,
            'title' => $text
        );

        // Clear the trailing dangling parents after level, if any
        foreach (array_keys($this->_dangling) as $key) {
            if ($key > $level) {
                unset($this->_dangling[$key]);
            }
        }

        // Consider this node a potential dangling parent
        $this->_dangling[$level] =& $node;

        return $node;
    }

    /**
     * Remove the <body> wrapper from a serialized body element.
     *
     * The serializer emits a newline after <body> only in some cases,
     * so the newline is treated as optional.
     *
     * @param  string $html  The serialized <body> element.
     * @return string        The inner HTML.
     */
    private function _stripBodyWrapper($html)
    {
        if (strncmp($html, '<body>', 6) === 0) {
            $html = (string) substr($html, 6);
            if (strncmp($html, "\n", 1) === 0) {
                $html = (string) substr($html, 1);
            }
        }

        // Drop the closing tag, keeping whatever surrounds it
        $end = strrpos($html, '</body>');
        if ($end !== false) {
            $html = substr($html, 0, $end) . substr($html, $end + 7);
        }

        return (string) $html;
    }

    /**
     * Process the specific document.
     *
     * Builds the TOC tree from every heading (<h1> to <h6>) that does not
     * carry a data-hide-from-toc attribute, inserts an <a> anchor before
     * each of them and stores the resulting HTML.
     *
     * @param  DOMDocument $doc  The document to process.
     */
    private function _processDocument($doc)
    {
        // Start from a clean state, so process() can be called repeatedly
        $this->_dangling = array();
        $this->_tree =& $this->_newNode(self::$prefix, '', 0);
        $n = 1;

        $xpath = new DOMXPath($doc);
        // Digits 1-6 are mapped to "." so a single comparison matches
        // every heading tag name
        $query = '//*[translate(name(), "123456", "......") = "h."][not(@data-hide-from-toc)]';

        foreach ($xpath->query($query) as $h) {
            $text = $this->_getPlainText($h);
            $level = (int) substr($h->tagName, 1);
            $id = self::$prefix . $n;
            ++$n;

            // Build the tree: the references must be kept all the way
            // down, otherwise children added later would land on a copy
            $parent =& $this->_getParent($level);
            $node =& $this->_newNode($id, $text, $level);
            if (! isset($parent['children'])) {
                $parent['children'] = array();
            }
            $parent['children'][] =& $node;

            // Prepend the anchor
            $anchor = $doc->createElement('a');
            $anchor->setAttribute('id', $id);
            $anchor->setAttribute('class', 'anchor');
            $h->parentNode->insertBefore($anchor, $h);
        }

        $body = $doc->getElementsByTagName('body')->item(0);
        if ($body === null) {
            // No body means no headings and hence nothing to augment:
            // avoid dumping the whole document (DOCTYPE included)
            $this->_html = $this->_raw_html;
            return;
        }

        $this->_html = $this->_stripBodyWrapper($doc->saveHTML($body));
    }

    /**
     * Debug function for dumping a TOC node and its children.
     *
     * Each title is echoed on its own line, indented with one tab per
     * nesting level. A node without title (e.g. when nothing has been
     * processed yet) is dumped as an empty line.
     *
     * @param  array  $node    The TOC node to dump
     * @param  string $indent  Indentation string.
     */
    private function _dumpBranch($node, $indent = '')
    {
        $title = isset($node['title']) ? $node['title'] : '';
        echo $indent . $title . "\n";

        if (isset($node['children'])) {
            foreach ($node['children'] as $child) {
                $this->_dumpBranch($child, "$indent\t");
            }
        }
    }


    /**
     * Create a new TOCifier instance.
     *
     * A string containing the HTML to parse for TOC must be passed
     * in. The real processing will be triggered by the process()
     * method.
     *
     * Parsing a file can be easily performed by using
     * file_get_contents():
     *
     * <code>
     * $tocifier = new Tocifier(file_get_contents($file));
     * </code>
     *
     * @param string $html A chunk of valid HTML (UTF-8 encoded).
     */
    public function __construct($html)
    {
        $this->_raw_html = $html;
    }

    /**
     * Parse and process the HTML chunk.
     *
     * The parsing phase involves picking up all the HTML header
     * elements (from <h1> to <h6>), so if the HTML cannot be loaded
     * or any other error is encountered this function will fail.
     *
     * @return boolean true on success, false on errors.
     */
    public function process()
    {
        // Check if $this->_raw_html is valid. empty() is avoided on
        // purpose because it would reject the legitimate chunk '0'
        if (! is_string($this->_raw_html) || $this->_raw_html === '') {
            return false;
        }

        // loadHTML() does not assume UTF-8 for a bare chunk of text, so
        // every non-ASCII character is turned into a numeric entity
        // before parsing
        $html = mb_encode_numericentity(
            $this->_raw_html,
            array(0x80, 0x10FFFF, 0, 0x1FFFFF),
            'UTF-8'
        );

        // Parse the HTML into a DOMDocument tree, collecting the libxml
        // diagnostics internally instead of raising PHP warnings
        $doc = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        $loaded = $doc->loadHTML($html);
        if (! $previous) {
            // The buffer was enabled only for this call: leave it clean
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        if (! $loaded) {
            return false;
        }

        // Process the doc
        $this->_processDocument($doc);
        return true;
    }

    /**
     * Get the TOC (Table Of Contents) from the provided HTML.
     *
     * The HTML must be provided through the constructor and process()
     * must have been called.
     *
     * The TOC is represented in the form of:
     *
     * <code>
     * array(
     *     array('id'       => 'TOC-1',
     *           'title'    => 'Item 1',
     *           'children' => array(
     *               array('id'       => 'TOC-2',
     *                     'title'    => 'Subitem 1.1'
     *               ),
     *               array('id'       => 'TOC-3',
     *                     'title'    => 'Subitem 1.2',
     *                     'children' => array(
     *                         array('id'       => 'TOC-4',
     *                               'title'    => 'Subsubitem 1.2.1'
     *     ))))),
     *     array('id'       => 'TOC-5',
     *           'title'    => 'Item 2',
     *           'children' => array(
     *               array('id'       => 'TOC-6',
     *                     'title'    => 'Subitem 2.1'
     *               ),
     *               array('id'       => 'TOC-7',
     *                     'title'    => 'Subitem 2.2'
     * ))));
     * </code>
     *
     * The TOC is cached, so subsequent calls will return the same tree.
     *
     * @return array An array representing the TOC. An array is always
     *               returned: it is empty when no heading has been found
     *               or when nothing has been processed yet.
     */
    public function getTOC()
    {
        return isset($this->_tree['children']) ? $this->_tree['children'] : array();
    }

    /**
     * Get the HTML augmented with anchors for proper navigation.
     *
     * The HTML must be provided through the constructor and process()
     * must have been called, otherwise an empty string is returned.
     * The returned string is cached, so subsequent calls will return
     * the same string without further processing.
     *
     * @return string The augmented HTML.
     */
    public function getHtml()
    {
        return $this->_html;
    }

    /**
     * Dump the TOC to stdout for debugging purpose.
     */
    public function dumpTOC()
    {
        $this->_dumpBranch($this->_tree);
    }
}
262