This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include
, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | namespace Pforret\PfPageparser; |
||
4 | |||
5 | use GuzzleHttp\Client; |
||
6 | use GuzzleHttp\Exception\GuzzleException; |
||
7 | use Psr\Log\LoggerInterface; |
||
8 | |||
/**
 * Small scraping helper with a fluent interface.
 *
 * Typical flow: load raw content (URL, file or string), optionally clean or
 * trim it, split it into chunks, filter those chunks, and finally extract
 * values from each chunk with a regular expression.
 *
 * All mutating methods return $this so calls can be chained.
 */
class PfPageparser
{
    /** @var array Effective configuration: built-in defaults overridden by the constructor argument. */
    private $config;

    /** @var string Raw content as loaded (and possibly cleaned/trimmed). */
    private $content = '';

    /** @var array Pieces of $content produced by split_chunks() / filter_chunks(). */
    private $chunks = [];

    /** @var array Values extracted by parse_fom_chunks(). */
    private $parsed = [];

    /** @var LoggerInterface|null Optional PSR-3 logger; when null, log() is a no-op. */
    private $logger;

    /**
     * @param array                $config Overrides for the default configuration (shallow merge).
     * @param LoggerInterface|null $logger Optional logger used to report load errors.
     */
    public function __construct(array $config = [], ?LoggerInterface $logger = null)
    {
        $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36';
        $defaults = [
            'userAgent' => $userAgent,
            'cacheTtl' => 3600,
            'timeout' => 5, // Guzzle timeout
            'method' => 'GET',
            'headers' => [
                'User-Agent' => $userAgent,
            ],
        ];

        if ($logger) {
            $this->logger = $logger;
        }
        $this->config = array_merge($defaults, $config);
    }

    /**
     * @return array The effective configuration.
     */
    public function get_config()
    {
        return $this->config;
    }

    /* ------------------------------------------
     * LOADING THE CONTENT FROM A URL/FILE/STRING
     */

    /**
     * Reset content, chunks and parsed results before loading new input.
     */
    private function initialize(): void
    {
        $this->content = '';
        $this->chunks = [];
        $this->parsed = [];
    }

    /**
     * Load the content from a URL using Guzzle.
     *
     * A failed request is logged at level 'error' and leaves the content empty;
     * no exception is propagated to the caller.
     *
     * @param string $url
     * @param array  $options Per-call overrides of the configuration (method, headers, timeout).
     * @return $this
     */
    public function load_from_url(string $url, array $options = []): PfPageparser
    {
        // TODO: load with caching
        $this->initialize();
        $options = array_merge($this->config, $options);
        $client = new Client([
            'headers' => $options['headers'],
            // The configured timeout was previously never handed to Guzzle.
            'timeout' => $options['timeout'],
        ]);
        try {
            $response = $client->request($options['method'], $url);
            $this->content = $response->getBody()->getContents();
        } catch (GuzzleException $error) {
            $this->log($error->getMessage(), 'error');
        }

        return $this;
    }

    /**
     * Load the content from a local file.
     *
     * A missing or unreadable file is logged and leaves the content empty.
     *
     * @param string $filename
     * @return $this
     */
    public function load_from_file(string $filename): PfPageparser
    {
        $this->initialize();
        if (!is_file($filename) || !is_readable($filename)) {
            $this->log("Cannot read file [$filename]", 'warning');

            return $this;
        }
        $data = file_get_contents($filename);
        if ($data === false) {
            // Never store false: raw()/get_content() are declared to return string.
            $this->log("Cannot read file [$filename]", 'warning');

            return $this;
        }
        $this->content = $data;

        return $this;
    }

    /**
     * Use the given string as content.
     *
     * @param string $string
     * @return $this
     */
    public function load_from_string(string $string): PfPageparser
    {
        $this->initialize();
        $this->content = $string;

        return $this;
    }

    /**
     * Misspelled original name of load_from_string(), kept for backward compatibility.
     *
     * @param string $string
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        return $this->load_from_string($string);
    }

    /**
     * Normalise whitespace in the content.
     *
     * @param bool $remove_linefeeds Replace every run of "\n" with a single space.
     * @param bool $shrink_spaces    Replace every run of two or more whitespace characters with one space.
     * @return $this
     */
    public function cleanup_html($remove_linefeeds = true, $shrink_spaces = true): PfPageparser
    {
        // preg_replace() returns null on a PCRE error; keep the previous content in that case.
        if ($remove_linefeeds) {
            $this->content = preg_replace("|\n+|", " ", $this->content) ?? $this->content;
        }
        if ($shrink_spaces) {
            $this->content = preg_replace("|\s\s+|", " ", $this->content) ?? $this->content;
        }

        return $this;
    }

    /* ------------------------------------------
     * GET THE RAW CONTENT
     */

    /**
     * Alias of raw(), kept for backward compatibility.
     */
    public function get_content(): string
    {
        return $this->content;
    }

    /**
     * @return string The current (possibly cleaned/trimmed) content.
     */
    public function raw(): string
    {
        return $this->content;
    }

    /* ------------------------------------------
     * MODIFY THE RAW CONTENT
     */

    /**
     * Locate the first occurrence of $pattern in the content.
     *
     * @param string $pattern  Literal text, or a PCRE pattern when $is_regex is true.
     * @param bool   $is_regex
     * @return int|null Byte offset of the match, or null when there is no match.
     */
    private function find_offset(string $pattern, bool $is_regex): ?int
    {
        if ($pattern === '') {
            return null;
        }
        if (!$is_regex) {
            $position = strpos($this->content, $pattern);

            return $position === false ? null : $position;
        }
        // preg_match() returns a match count, not a position: ask for the offset explicitly.
        if (preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE) === 1) {
            return $matches[0][1];
        }

        return null;
    }

    /**
     * Drop everything that precedes the first match of $pattern (the match itself is kept).
     * The content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_before(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        if ($offset !== null) {
            $this->content = substr($this->content, $offset);
        }

        return $this;
    }

    /**
     * Drop the first match of $pattern and everything that follows it.
     * The content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_after(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        if ($offset !== null) {
            // Offset 0 is a valid match and yields empty content.
            $this->content = substr($this->content, 0, $offset);
        }

        return $this;
    }

    /**
     * Apply trim_before($before) followed by trim_after($after).
     *
     * @param string $before
     * @param string $after
     * @param bool   $is_regex Applies to both patterns.
     * @return $this
     */
    public function trim(string $before = "<body", string $after = "</body", bool $is_regex = false): PfPageparser
    {
        $this->trim_before($before, $is_regex);
        $this->trim_after($after, $is_regex);

        return $this;
    }

    /* ------------------------------------------
     * RAW CONTENT => CHUNKS
     */

    /**
     * Split the content into chunks on a literal or regex separator.
     *
     * Both modes behave like explode(): separators are removed and the text
     * after the last separator is included. When the separator does not occur
     * (or is empty/invalid), the whole content becomes the single chunk.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function split_chunks(string $pattern, bool $is_regex = false): PfPageparser
    {
        if ($pattern === '') {
            // explode() rejects an empty delimiter; treat it as "no separator".
            $this->chunks = [$this->content];

            return $this;
        }
        if (!$is_regex) {
            $this->chunks = explode($pattern, $this->content);

            return $this;
        }
        $pieces = preg_split($pattern, $this->content);
        $this->chunks = ($pieces === false) ? [$this->content] : $pieces;

        return $this;
    }

    /**
     * Make sure there is something to work on: when no chunks exist yet,
     * the whole content is used as a single chunk.
     *
     * @return bool False when there are neither chunks nor content.
     */
    private function ensure_chunks(): bool
    {
        if (!empty($this->chunks)) {
            return true;
        }
        if ($this->content === '') {
            return false;
        }
        $this->chunks = [$this->content];

        return true;
    }

    /**
     * Tell whether at least one of $patterns occurs in $chunk.
     *
     * @param string $chunk
     * @param array  $patterns Literal texts, or PCRE patterns when $is_regex is true.
     * @param bool   $is_regex
     * @return bool
     */
    private function matches_any(string $chunk, array $patterns, bool $is_regex): bool
    {
        foreach ($patterns as $pattern) {
            $found = $is_regex
                ? preg_match($pattern, $chunk) === 1
                : strpos($chunk, $pattern) !== false;
            if ($found) {
                return true;
            }
        }

        return false;
    }

    /**
     * Keep only the chunks that match the given criteria.
     *
     * A chunk survives when it matches at least one $pattern_keep entry (if any
     * are given) and none of the $pattern_remove entries. The array keys of the
     * surviving chunks are preserved, so the result may have gaps.
     *
     * @param array $pattern_keep
     * @param array $pattern_remove
     * @param bool  $is_regex
     * @return $this
     */
    public function filter_chunks(array $pattern_keep = [], array $pattern_remove = [], bool $is_regex = false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }
        foreach ($this->chunks as $id => $chunk) {
            $keep_chunk = empty($pattern_keep) || $this->matches_any($chunk, $pattern_keep, $is_regex);
            if ($keep_chunk && !empty($pattern_remove)) {
                $keep_chunk = !$this->matches_any($chunk, $pattern_remove, $is_regex);
            }
            if (!$keep_chunk) {
                unset($this->chunks[$id]);
            }
        }

        return $this;
    }

    /**
     * Extract values from each chunk with a regular expression.
     *
     * For every item that matches, one entry is appended to the parsed results:
     * the list of all first-capture-group values, or, when $only_one is true,
     * the value of the last match only. Patterns without a capture group yield
     * the full match. Items without a match produce no entry.
     *
     * On the first call (or when $restart is true) the chunks are parsed.
     * On later calls the string items of the previous results are parsed and
     * the new entries are appended after the existing ones.
     * NOTE(review): appending rather than replacing looks unintended; confirm with callers.
     *
     * @param string $pattern  PCRE pattern, normally with one capture group.
     * @param bool   $only_one
     * @param bool   $restart
     * @return PfPageparser
     */
    public function parse_from_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }
        if ($restart || empty($this->parsed)) {
            $items = $this->chunks;
            $this->parsed = [];
        } else {
            $items = $this->parsed;
        }
        foreach ($items as $item) {
            if (!is_string($item)) {
                // Earlier passes may have stored arrays; those cannot be matched against.
                continue;
            }
            $matches = [];
            if (!preg_match_all($pattern, $item, $matches, PREG_SET_ORDER)) {
                continue;
            }
            $chunk_results = [];
            foreach ($matches as $match) {
                $value = $match[1] ?? $match[0];
                if ($only_one) {
                    $chunk_results = $value;
                } else {
                    $chunk_results[] = $value;
                }
            }
            $this->parsed[] = $chunk_results;
        }

        return $this;
    }

    /**
     * Misspelled original name of parse_from_chunks(), kept for backward compatibility.
     *
     * @param string $pattern
     * @param bool   $only_one
     * @param bool   $restart
     * @return PfPageparser
     */
    public function parse_fom_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
    {
        return $this->parse_from_chunks($pattern, $only_one, $restart);
    }

    /**
     * @return array The current chunks.
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing Return the chunks even when parsed results exist.
     * @return array The parsed results, or the chunks when nothing has been parsed yet.
     */
    public function results(bool $before_parsing = false): array
    {
        if ($before_parsing || empty($this->parsed)) {
            return $this->chunks;
        }

        return $this->parsed;
    }

    /**
     * Forward a message to the logger, if one was supplied.
     */
    private function log(string $text, string $level = 'info'): void
    {
        if ($this->logger) {
            $this->logger->log($level, $text);
        }
    }
}
||
287 |
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.