Total Complexity | 113 |
Total Lines | 563 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like selector often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use selector, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
23 | class selector |
||
24 | { |
||
25 | /** |
||
26 | * 版本号 |
||
27 | * @var string |
||
28 | */ |
||
29 | const VERSION = '1.0.2'; |
||
30 | public static $dom = null; |
||
31 | public static $dom_auth = ''; |
||
32 | public static $xpath = null; |
||
33 | public static $error = null; |
||
34 | |||
/**
 * Extract content from markup with the requested kind of selector.
 *
 * @param string $html          markup to search
 * @param string $selector      XPath expression, regular expression or CSS selector
 * @param string $selector_type 'xpath' (default), 'regex' or 'css'; compared case-insensitively
 * @return mixed false when $html or $selector is empty, null when the
 *               selector type is not one of the three above, otherwise
 *               whatever the type-specific helper returns
 */
public static function select($html, $selector, $selector_type = 'xpath')
{
    if (empty($html) || empty($selector))
    {
        return false;
    }

    // Dispatch on the normalised selector type.
    switch (strtolower($selector_type))
    {
        case 'xpath':
            return self::_xpath_select($html, $selector);

        case 'regex':
            return self::_regex_select($html, $selector);

        case 'css':
            return self::_css_select($html, $selector);
    }

    // Unrecognised selector type.
    return null;
}
||
56 | |||
/**
 * Strip the content matched by a selector out of the markup.
 *
 * The matched content is obtained from the type-specific helper (called
 * with its $remove flag set) and then deleted with a literal
 * str_replace(), so only occurrences that are byte-identical to what the
 * helper returned are removed.
 *
 * @param string $html          markup to clean
 * @param string $selector      XPath expression, regular expression or CSS selector
 * @param string $selector_type 'xpath' (default), 'regex' or 'css'; compared case-insensitively
 * @return mixed false when $html or $selector is empty, otherwise $html
 *               with the matched content removed (unchanged when nothing
 *               matched or the selector type is not recognised)
 */
public static function remove($html, $selector, $selector_type = 'xpath')
{
    if (empty($html) || empty($selector))
    {
        return false;
    }

    $remove_html = "";
    switch (strtolower($selector_type))
    {
        case 'xpath':
            $remove_html = self::_xpath_select($html, $selector, true);
            break;

        case 'regex':
            $remove_html = self::_regex_select($html, $selector, true);
            break;

        case 'css':
            $remove_html = self::_css_select($html, $selector, true);
            break;
    }

    // The helpers return null when nothing matched or the selector is
    // invalid. Passing null as the search argument of str_replace() is
    // deprecated since PHP 8.1, so fall back to an empty search, which
    // leaves the subject untouched.
    if ($remove_html === null)
    {
        $remove_html = "";
    }

    return str_replace($remove_html, "", $html);
}
||
81 | |||
/**
 * XPath selector.
 *
 * Parses $html into the shared DOMDocument (re-parsing only when the
 * markup differs from the previously loaded one) and evaluates $selector
 * against it.
 *
 * Content produced per matched node:
 *  - $remove is true: the node serialised with saveXML();
 *  - <img> element: the value of its src attribute;
 *  - attribute, text or CDATA node: its nodeValue;
 *  - anything else: the serialised node with its own opening and closing
 *    tag stripped, so inner markup stays available for a second pass.
 *
 * @param mixed $html     markup to query
 * @param mixed $selector XPath expression
 * @param bool  $remove   return whole serialised nodes instead of their content
 * @return mixed null when the expression is invalid (self::$error is set)
 *               or nothing matched, a single string when exactly one node
 *               matched, otherwise an array of strings
 * @author seatle <[email protected]>
 * @created time :2016-10-26 12:53
 */
private static function _xpath_select($html, $selector, $remove = false)
{
    if (!is_object(self::$dom))
    {
        self::$dom = new DOMDocument();
    }

    // Only re-parse when the markup is not the one already loaded.
    // The digests are compared strictly: under a loose comparison two
    // different hex digests of the form "0e<digits>" are treated as equal
    // numbers, which would make the cache serve the wrong document.
    $html_hash = md5($html);
    if (self::$dom_auth !== $html_hash)
    {
        self::$dom_auth = $html_hash;
        @self::$dom->loadHTML('<?xml encoding="UTF-8">'.$html);
        self::$xpath = new DOMXpath(self::$dom);
    }

    $elements = @self::$xpath->query($selector);
    if ($elements === false)
    {
        self::$error = "the selector in the xpath(\"{$selector}\") syntax errors";
        // Return null rather than false: isset(false) is true, and callers
        // cannot test with !$values either because !0 is also true.
        return null;
    }

    $result = array();
    if (!is_null($elements))
    {
        foreach ($elements as $element)
        {
            // For a removal the whole serialised node is needed.
            if ($remove)
            {
                $result[] = self::$dom->saveXML($element);
                continue;
            }

            $nodeName = $element->nodeName;
            $nodeType = $element->nodeType;

            if ($nodeType === XML_ELEMENT_NODE && $nodeName === 'img')
            {
                // For an img tag take the src value directly.
                $content = $element->getAttribute('src');
            }
            elseif ($nodeType === XML_ATTRIBUTE_NODE
                || $nodeType === XML_TEXT_NODE
                || $nodeType === XML_CDATA_SECTION_NODE)
            {
                // Attributes and text carry their content in nodeValue.
                $content = $element->nodeValue;
            }
            else
            {
                // Keep the markup inside the node so children can be
                // extracted in a second pass; only the node's own opening
                // and closing tag are stripped. The node name is quoted
                // because names such as "#comment" contain the pattern
                // delimiter and would otherwise break the expression.
                $quotedName = preg_quote($nodeName, '#');
                $content = self::$dom->saveXML($element);
                $content = preg_replace(
                    array("#^<{$quotedName}.*>#isU", "#</{$quotedName}>$#isU"),
                    array('', ''),
                    $content
                );
            }

            $result[] = $content;
        }
    }

    if (empty($result))
    {
        return null;
    }

    // A single match is returned as a string, several as an array.
    return count($result) > 1 ? $result : $result[0];
}
||
169 | |||
170 | /** |
||
171 | * css选择器 |
||
172 | * |
||
173 | * @param mixed $html |
||
174 | * @param mixed $selector |
||
175 | * @return void |
||
176 | * @author seatle <[email protected]> |
||
177 | * @created time :2016-10-26 12:53 |
||
178 | */ |
||
179 | private static function _css_select($html, $selector, $remove = false) |
||
185 | // 如果加载的不是之前的HTML内容,替换一下验证标识 |
||
186 | //if (self::$dom_auth['css'] != md5($html)) |
||
187 | //{ |
||
188 | //self::$dom_auth['css'] = md5($html); |
||
189 | //phpQuery::loadDocumentHTML($html); |
||
190 | //} |
||
191 | //if ($remove) |
||
192 | //{ |
||
193 | //return phpQuery::pq($selector)->remove(); |
||
194 | //} |
||
195 | //else |
||
196 | //{ |
||
197 | //return phpQuery::pq($selector)->html(); |
||
198 | //} |
||
199 | } |
||
200 | |||
/**
 * Regular-expression selector.
 *
 * Runs preg_match_all() with its default result layout, where $out[0]
 * holds the full matches and $out[1..n] the capture groups.
 *
 * @param mixed $html     text to search
 * @param mixed $selector complete PCRE pattern including delimiters
 * @param bool  $remove   return the full matched text instead of the capture groups
 * @return mixed null when the pattern is invalid (self::$error is set),
 *               when nothing matched, or when $remove is false and the
 *               pattern has no capture group. Otherwise:
 *               - $remove true: the full matches;
 *               - one capture group: that group's matches;
 *               - several groups: one entry per group (a string when the
 *                 group matched once, an array when it matched more often).
 *               A result holding a single entry is unwrapped to that entry.
 * @author seatle <[email protected]>
 * @created time :2016-10-26 12:53
 */
private static function _regex_select($html, $selector, $remove = false)
{
    if (@preg_match_all($selector, $html, $out) === false)
    {
        self::$error = "the selector in the regex(\"{$selector}\") syntax errors";
        return null;
    }

    // Nothing matched at all. Checking this up front also keeps the
    // multi-group branch below from reading $out[$i][0] on empty groups.
    if (empty($out[0]))
    {
        return null;
    }

    $group_count = count($out) - 1;

    if ($remove)
    {
        // A removal always needs the complete matched text, however many
        // capture groups the pattern has (including none).
        $result = $out[0];
    }
    elseif ($group_count == 0)
    {
        // No capture group, so there is nothing to extract.
        return null;
    }
    elseif ($group_count == 1)
    {
        // Exactly one pair of parentheses.
        $result = $out[1];
    }
    else
    {
        $result = array();
        for ($i = 1; $i <= $group_count; $i++)
        {
            // A group that matched only once is returned as a plain string.
            $result[] = count($out[$i]) > 1 ? $out[$i] : $out[$i][0];
        }
    }

    return count($result) > 1 ? $result : $result[0];
}
||
252 | |||
253 | public static function find_all($html, $selector) |
||
255 | } |
||
256 | |||
257 | |||
/**
 * Translate a CSS selector into an XPath query fragment.
 *
 * The selector is tokenised by self::parse_selector() and each token is
 * appended to the query:
 *  - tag name or "*": copied as is;
 *  - "#id": exact match on @id;
 *  - ".class": substring match on @class;
 *  - "[attr]" / "[attr=value]": attribute presence / exact value;
 *  - ">": "/", and " ": "//";
 *  - "~", "+" and ":" tokens are recognised but contribute nothing.
 * An id, class or attribute token that directly follows a combinator is
 * prefixed with "*" so the predicate has a node test to attach to.
 *
 * Any other token terminates the script via exit().
 *
 * @param mixed $selectors CSS selector, passed unchanged to parse_selector()
 * @return string the assembled XPath fragment
 */
public static function css_to_xpath($selectors)
{
    $queries = self::parse_selector($selectors);
    $delimiter_before = false;
    $xquery = '';
    foreach ($queries as $s)
    {
        // Remember whether the previous token was a combinator, then reset
        // the flag; only the combinator branches below raise it again.
        $needs_wildcard = $delimiter_before;
        $delimiter_before = false;

        // TAG
        $is_tag = preg_match('@^[\w|\||-]+$@', $s) || $s == '*';
        if ($is_tag)
        {
            $xquery .= $s;
        }
        // ID
        else if ($s[0] == '#')
        {
            if ($needs_wildcard)
            {
                $xquery .= '*';
            }
            // IDs are matched exactly.
            $xquery .= "[@id='".substr($s, 1)."']";
        }
        // CLASSES
        else if ($s[0] == '.')
        {
            if ($needs_wildcard)
            {
                $xquery .= '*';
            }
            // Classes are matched by substring.
            $xquery .= "[contains(@class,'".substr($s, 1)."')]";
        }
        // ATTRIBUTES
        else if ($s[0] == '[')
        {
            if ($needs_wildcard)
            {
                $xquery .= '*';
            }
            // Strip the surrounding brackets.
            $attr = trim($s, '][');
            // Attribute with a specified value.
            if (mb_strpos($s, '=') !== false)
            {
                // Split on the first "=" only, so a value that itself
                // contains "=" (e.g. a URL query string) stays intact.
                list($attr, $value) = explode('=', $attr, 2);
                $value = trim($value, "'\"");
                if (self::is_regexp($attr))
                {
                    // Cut the trailing operator character off the name;
                    // only the attribute's presence is tested here.
                    $attr = substr($attr, 0, -1);
                    $xquery .= "[@{$attr}]";
                }
                else
                {
                    $xquery .= "[@{$attr}='{$value}']";
                }
            }
            // Attribute without a specified value.
            else
            {
                $xquery .= "[@{$attr}]";
            }
        }
        // ~ general sibling selector: recognised, not translated.
        else if ($s[0] == '~')
        {
        }
        // + adjacent sibling selector: recognised, not translated.
        else if ($s[0] == '+')
        {
        }
        // Pseudo classes: recognised, not translated.
        else if ($s[0] == ':')
        {
        }
        // Direct descendants.
        else if ($s == '>')
        {
            $xquery .= '/';
            $delimiter_before = true;
        }
        // All descendants.
        else if ($s == ' ')
        {
            $xquery .= '//';
            $delimiter_before = true;
        }
        // Errors.
        else
        {
            exit("Unrecognized token '$s'");
        }
    }
    return $xquery;
}
||
356 | |||
357 | /** |
||
358 | * @access private |
||
359 | */ |
||
360 | public static function parse_selector($query) |
||
567 | } |
||
568 | |||
/**
 * Report whether the given string contains at least one word character
 * (anything matched by \w).
 *
 * @param string $char text to inspect
 * @return int|false the preg_match() result: 1 when a word character is
 *                   present, 0 when not, false on a PCRE failure
 */
public static function is_char($char)
{
    $has_word_char = preg_match('@\w@', $char);

    return $has_word_char;
}
||
573 | |||
574 | /** |
||
575 | * 模糊匹配 |
||
576 | * ^ 前缀字符串 |
||
577 | * * 包含字符串 |
||
578 | * $ 后缀字符串 |
||
579 | * @access private |
||
580 | */ |
||
581 | protected static function is_regexp($pattern) |
||
586 | ); |
||
587 | } |
||
588 | } |
||
589 |
This check looks for function or method calls that always return null and whose return value is used.
The method getObject() can return nothing but null, so it makes no sense to use the return value. The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.