<?php

namespace simplehtmldom;

/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/).
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *
 * Version Rev. 2.0-RC2 (415)
 */

// HtmlWeb wraps every successfully downloaded page in an HtmlDocument.
include_once __DIR__ . '/HtmlDocument.php';
||
26 | |||
class HtmlWeb
{
    /**
     * Downloads a web page and parses it into a DOM.
     *
     * Only absolute "http" and "https" URLs are accepted. The download uses
     * the cURL extension when it is loaded and falls back to the fopen URL
     * wrappers (allow_url_fopen) otherwise. When neither transport is
     * available a message is written to the error log.
     *
     * @param string $url URL of the page to load.
     *
     * @return HtmlDocument|null The DOM for the web page, or null if the URL
     * is invalid, uses an unsupported scheme, no transport is available, the
     * download fails, the final status is not 200, or the body is larger
     * than MAX_FILE_SIZE.
     */
    public function load($url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return null;
        }

        $scheme = parse_url($url, PHP_URL_SCHEME);

        // FILTER_VALIDATE_URL also accepts schemes such as ftp:// or
        // mailto:, so the scheme has to be restricted explicitly.
        if (!$scheme || !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return null;
        }

        if (extension_loaded('curl')) {
            return $this->load_curl($url);
        }

        if (ini_get('allow_url_fopen')) {
            return $this->load_fopen($url);
        }

        error_log(__FUNCTION__ . ' requires either the cURL extension or allow_url_fopen=On in php.ini');

        return null;
    }

    /**
     * cURL implementation of load.
     *
     * Follows redirects and only accepts a final status of 200.
     *
     * @param string $url URL of the page to load (already validated by load).
     *
     * @return HtmlDocument|null The parsed document, or null if the handle
     * cannot be created, the transfer fails, the status is not 200, or the
     * body is larger than MAX_FILE_SIZE.
     */
    private function load_curl($url)
    {
        $ch = curl_init();

        if (false === $ch) {
            return null;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // There is no guarantee this request will be fulfilled
        // -- https://www.php.net/manual/en/function.curl-setopt.php
        curl_setopt($ch, CURLOPT_BUFFERSIZE, MAX_FILE_SIZE);

        // There is no guarantee this request will be fulfilled
        $header = [
            'Accept: text/html', // Prefer HTML format
            'Accept-Charset: utf-8', // Prefer UTF-8 encoding
        ];
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

        $doc = curl_exec($ch);
        $status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

        // Release the handle before any return so that failed requests do
        // not leak it.
        curl_close($ch);

        // curl_exec() returns false when the transfer fails, which can
        // happen even after a 200 status line has been received.
        if (!is_string($doc) || 200 !== $status) {
            return null;
        }

        if (strlen($doc) > MAX_FILE_SIZE) {
            return null;
        }

        return new HtmlDocument($doc);
    }

    /**
     * fopen implementation of load.
     *
     * Reads at most MAX_FILE_SIZE + 1 bytes so that oversized bodies can be
     * detected without downloading them completely.
     *
     * @param string $url URL of the page to load (already validated by load).
     *
     * @return HtmlDocument|null The parsed document, or null if the request
     * fails, the final status is not 200, or the body is larger than
     * MAX_FILE_SIZE.
     */
    private function load_fopen($url)
    {
        // There is no guarantee this request will be fulfilled
        $context = stream_context_create(['http' => [
            'header' => [
                'Accept: text/html', // Prefer HTML format
                'Accept-Charset: utf-8', // Prefer UTF-8 encoding
            ],
            'ignore_errors' => true, // Always fetch content
        ]]);

        $doc = file_get_contents($url, false, $context, 0, MAX_FILE_SIZE + 1);

        // A failed request (DNS error, refused connection, ...) yields false
        // and leaves $http_response_header unset; there is nothing to parse.
        if (false === $doc) {
            return null;
        }

        if (isset($http_response_header)) {
            foreach ($http_response_header as $rh) {
                // https://stackoverflow.com/a/1442526
                $parts = explode(' ', $rh, 3);

                if (isset($parts[1]) && preg_match('/HTTP\/\d\.\d/', $parts[0])) {
                    $code = $parts[1];
                }
            } // Last code is final status

            if (!isset($code) || '200' !== $code) {
                return null;
            }
        }

        if (strlen($doc) > MAX_FILE_SIZE) {
            return null;
        }

        return new HtmlDocument($doc);
    }
}
||
134 |