These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more
<?php

/*
 * Tocifier is intentionally decoupled from SilverStripe to be able to
 * test it without needing to put all the test infrastructure up.
 */
class Tocifier
{
    // Prefix to prepend to every URL fragment
    public static $prefix = 'TOC-';

    // The original HTML
    private $_raw_html = '';

    // $_raw_html augmented with anchor ids for proper navigation
    private $_html = '';

    // The most recently generated TOC tree (the root node), or null
    // when process() has not successfully run yet.
    private $_tree;

    // Array of references to the potential parents, indexed by
    // nesting level (0 is the root node, 1..6 map to <h1>..<h6>)
    private $_dangling = array();


    /**
     * Get the TOC node closest to a given nesting level.
     *
     * Walks the dangling parents from $level - 1 down to 0 and returns
     * (by reference) the first one found, so a heading is attached to
     * the nearest shallower heading or, failing that, to the root.
     *
     * @param  int $level The requested nesting level.
     * @return array      Reference to the parent node.
     * @throws LogicException If no parent exists, i.e. the root node
     *                        has not been created or $level is < 1.
     */
    private function &_getParent($level)
    {
        for ($candidate = $level - 1; $candidate >= 0; --$candidate) {
            if (isset($this->_dangling[$candidate])) {
                return $this->_dangling[$candidate];
            }
        }

        // A function returning by reference must not fall off its end:
        // fail loudly instead of handing back an implicit null.
        throw new LogicException(
            "No TOC parent available for nesting level $level"
        );
    }

    /**
     * Get the plain text content from a DOM element.
     *
     * Any <small> descendant is dropped before the text is extracted.
     * The element passed in is left untouched.
     *
     * @param  DOMElement $tag The DOM element to inspect.
     * @return string
     */
    private function _getPlainText(DOMElement $tag)
    {
        // Work on a copy
        $clone = $tag->cloneNode(true);

        // Strip unneeded tags (<small>). The node list is live, so it
        // shrinks as elements are detached from the clone.
        $smalls = $clone->getElementsByTagName('small');
        while ($smalls->length > 0) {
            $small = $smalls->item(0);
            $small->parentNode->removeChild($small);
        }

        return $clone->textContent;
    }

    /**
     * Create a new TOC node.
     *
     * The node is also registered as the dangling parent for its own
     * level, and every dangling parent deeper than $level is dropped.
     *
     * @param  string $id    Node id, used for anchoring
     * @param  string $text  Title text
     * @param  int    $level The nesting level of the node
     * @return array         Reference to the newly created node.
     */
    private function &_newNode($id, $text, $level)
    {
        $node = array(
            'id'    => $id,
            'title' => $text
        );

        // Clear the dangling parents deeper than $level, if any
        foreach (array_keys($this->_dangling) as $key) {
            if ($key > $level) {
                unset($this->_dangling[$key]);
            }
        }

        // Consider this node a potential dangling parent
        $this->_dangling[$level] =& $node;

        return $node;
    }

    /**
     * Process the specific document.
     *
     * Builds the TOC tree from the <h1>..<h6> elements not carrying a
     * data-hide-from-toc attribute, inserts an empty anchor before
     * each of them and stores the resulting markup in $_html.
     *
     * @param DOMDocument $doc The document to process.
     */
    private function _processDocument($doc)
    {
        // Start from a clean state, so that a previous run cannot
        // leave stale parents around
        $this->_dangling = array();
        $this->_tree =& $this->_newNode(self::$prefix, '', 0);
        $n = 1;

        // Explicit name tests: only real heading elements can match,
        // hence the nesting level is always between 1 and 6
        $xpath = new DOMXPath($doc);
        $query = '//*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6]'
               . '[not(@data-hide-from-toc)]';

        foreach ($xpath->query($query) as $h) {
            $text = $this->_getPlainText($h);
            $level = (int) substr($h->tagName, 1);
            $id = self::$prefix . $n;
            ++$n;

            // Build the tree. The parent must be looked up before the
            // new node is created, because _newNode() alters the
            // dangling parents.
            $parent =& $this->_getParent($level);
            $node =& $this->_newNode($id, $text, $level);
            if (! isset($parent['children'])) {
                $parent['children'] = array();
            }
            $parent['children'][] =& $node;

            // Prepend the anchor
            $anchor = $doc->createElement('a');
            $anchor->setAttribute('id', $id);
            $anchor->setAttribute('class', 'anchor');
            $h->parentNode->insertBefore($anchor, $h);
        }

        // Serialize the <body> element and strip its own tags, leaving
        // only the augmented chunk
        $body = $doc->getElementsByTagName('body')->item(0);
        $this->_html = str_replace(
            array("<body>\n", '</body>'),
            '',
            $doc->saveHTML($body)
        );
    }

    /**
     * Debug function for dumping a TOC node and its children.
     *
     * Every node is echoed on its own line; children are indented
     * with one more tab than their parent.
     *
     * @param array  $node   The TOC node to dump
     * @param string $indent Indentation string.
     */
    private function _dumpBranch($node, $indent = '')
    {
        echo $indent . $node['title'] . "\n";
        if (isset($node['children'])) {
            foreach ($node['children'] as $child) {
                $this->_dumpBranch($child, "$indent\t");
            }
        }
    }


    /**
     * Create a new TOCifier instance.
     *
     * A string containing the HTML to parse for TOC must be passed
     * in. The real processing will be triggered by the process()
     * method.
     *
     * Parsing a file can be easily performed by using
     * file_get_contents():
     *
     * <code>
     * $tocifier = new Tocifier(@file_get_contents($file));
     * </code>
     *
     * @param string $html A chunk of valid HTML (UTF-8 encoded).
     */
    public function __construct($html)
    {
        $this->_raw_html = $html;
    }

    /**
     * Parse and process the HTML chunk.
     *
     * The parsing phase involves picking up all the HTML header
     * elements (from <h1> to <h6>), so if the HTML is not well formed
     * or any other error is encountered this function will fail.
     *
     * @return bool true on success, false on errors.
     */
    public function process()
    {
        // Check if $this->_raw_html is valid. empty() is avoided on
        // purpose: it would also reject the legitimate chunk "0".
        if (! is_string($this->_raw_html) || $this->_raw_html === '') {
            return false;
        }

        // loadHTML() does not assume UTF-8 for a bare chunk, so every
        // non-ASCII character is converted to a numeric entity first.
        // This replaces the 'HTML-ENTITIES' pseudo-encoding of
        // mb_convert_encoding(), deprecated since PHP 8.2.
        $html = mb_encode_numericentity(
            $this->_raw_html,
            array(0x80, 0x10FFFF, 0, 0x1FFFFF),
            'UTF-8'
        );

        // Parse the HTML into a DOMDocument tree, collecting the
        // parser complaints internally instead of silencing with @
        $doc = new DOMDocument();
        $previous = libxml_use_internal_errors(true);
        $loaded = $doc->loadHTML($html);
        if (! $previous) {
            // The buffer is ours: do not let it grow across calls
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        if (! $loaded) {
            return false;
        }

        // Process the doc
        $this->_processDocument($doc);
        return true;
    }

    /**
     * Get the TOC (Table Of Contents) from the provided HTML.
     *
     * The HTML must be provided through the constructor and process()
     * must have been called.
     *
     * The TOC is represented in the form of:
     *
     * <code>
     * array(
     *     array('id'       => 'TOC-1',
     *           'title'    => 'Item 1',
     *           'children' => array(
     *               array('id'    => 'TOC-2',
     *                     'title' => 'Subitem 1.1'
     *               ),
     *               array('id'       => 'TOC-3',
     *                     'title'    => 'Subitem 1.2',
     *                     'children' => array(
     *                         array('id'    => 'TOC-4',
     *                               'title' => 'Subsubitem 1.2.1'
     *     ))))),
     *     array('id'       => 'TOC-5',
     *           'title'    => 'Item 2',
     *           'children' => array(
     *               array('id'    => 'TOC-6',
     *                     'title' => 'Subitem 2.1'
     *               ),
     *               array('id'    => 'TOC-7',
     *                     'title' => 'Subitem 2.2'
     *     ))));
     * </code>
     *
     * The TOC is cached, so subsequent calls will return the same tree.
     *
     * @return array An array representing the TOC. A valid array is
     *               always returned.
     */
    public function getTOC()
    {
        return isset($this->_tree['children']) ? $this->_tree['children'] : array();
    }

    /**
     * Get the HTML augmented with anchors for proper navigation.
     *
     * The HTML must be provided through the constructor and process()
     * must have been called, otherwise an empty string is returned.
     * The returned string is cached, so subsequent calls will return
     * the same string without further processing.
     *
     * @return string The augmented HTML.
     */
    public function getHtml()
    {
        return $this->_html;
    }

    /**
     * Dump the TOC to stdout for debugging purpose.
     *
     * Nothing is printed when no TOC tree has been generated yet.
     */
    public function dumpTOC()
    {
        if (is_array($this->_tree)) {
            $this->_dumpBranch($this->_tree);
        }
    }
}
||
262 |
This check compares the return type specified in the `@return` annotation of a function or method doc comment with the types returned by the function and raises an issue if they mismatch.