1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Pforret\PfPageparser; |
4
|
|
|
|
5
|
|
|
|
6
|
|
|
/**
 * Small fluent helper for scraping a page.
 *
 * Typical flow: load content (URL, file or string), trim it down to the
 * interesting region, split it into chunks, filter the chunks, and finally
 * extract values from every chunk with a regular expression.
 *
 * All offsets and lengths are byte based (strpos/substr/PCRE offsets).
 */
class PfPageparser
{
    /** @var array merged configuration (defaults overridden by constructor input) */
    private $config;

    /** @var string raw content currently being processed */
    private $content = "";

    /** @var array chunks produced by split_chunks() (keys may be sparse after filtering) */
    private $chunks = [];

    /** @var array results of the last parse: one list of extracted strings per item */
    private $parsed = [];

    /**
     * @param array $config overrides for the defaults 'userAgent', 'cacheTtl' and 'timeOut'
     *                      ('cacheTtl' is stored but not used yet: caching is still a TODO)
     */
    public function __construct($config = [])
    {
        $defaults = [
            'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'cacheTtl' => 3600,
            'timeOut' => 10,
        ];

        $this->config = array_merge($defaults, $config);
    }

    /**
     * @return array the effective configuration
     */
    public function get_config()
    {
        return $this->config;
    }

    /**
     * Download a URL with cURL and use the response body as content.
     *
     * When the transfer fails the content becomes an empty string, so the
     * rest of the fluent chain keeps working on a string instead of `false`.
     *
     * @param string $url
     * @param array  $options per-call overrides of the configuration ('userAgent', 'timeOut')
     * @return $this
     */
    public function load_from_url(string $url, array $options = []): PfPageparser
    {
        // TODO: load with guzzle & caching
        $options = array_merge($this->config, $options);

        $ch = curl_init();
        if ($ch === false) {
            $this->content = "";

            return $this;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $options['userAgent']);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, $options['timeOut']);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $options['timeOut']);
        $response = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; never store a non-string as content
        $this->content = is_string($response) ? $response : "";

        return $this;
    }

    /**
     * Load the content of a local file.
     *
     * If the file does not exist or cannot be read, the current content is
     * left untouched.
     *
     * @param string $filename
     * @return $this
     */
    public function load_from_file(string $filename): PfPageparser
    {
        if (is_file($filename)) {
            $data = file_get_contents($filename);
            if ($data !== false) {
                $this->content = $data;
            }
        }

        return $this;
    }

    /**
     * Use the given string as content.
     *
     * @param string $string
     * @return $this
     */
    public function load_from_string(string $string): PfPageparser
    {
        $this->content = $string;

        return $this;
    }

    /**
     * Misspelled alias of load_from_string(), kept for backward compatibility.
     *
     * @deprecated use load_from_string()
     * @param string $string
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        return $this->load_from_string($string);
    }

    /**
     * @return string the current (possibly trimmed) content
     */
    public function get_content()
    {
        return $this->content;
    }

    /**
     * Same as get_content().
     *
     * @return string
     */
    public function raw()
    {
        return $this->content;
    }

    /**
     * Drop everything that precedes the first occurrence of $pattern.
     * The matched text itself is kept. Nothing changes when there is no match.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_before($pattern, $is_regex = false): PfPageparser
    {
        $span = $this->find_first($pattern, (bool) $is_regex);
        if (!empty($span)) {
            $this->content = substr($this->content, $span[0]);
        }

        return $this;
    }

    /**
     * Drop everything that follows the first occurrence of $pattern.
     * The matched text itself is kept. Nothing changes when there is no match.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_after($pattern, $is_regex = false): PfPageparser
    {
        $span = $this->find_first($pattern, (bool) $is_regex);
        if (!empty($span)) {
            $this->content = substr($this->content, 0, $span[0] + $span[1]);
        }

        return $this;
    }

    /**
     * Apply trim_before($before) followed by trim_after($after).
     *
     * @param string $before
     * @param string $after
     * @param bool   $is_regex
     * @return $this
     */
    public function trim($before = "<body", $after = "</body", $is_regex = false): PfPageparser
    {
        $this->trim_before($before, $is_regex);
        $this->trim_after($after, $is_regex);

        return $this;
    }

    /**
     * Split the content into chunks on a literal or regex separator.
     *
     * Both modes behave alike: separators are removed, the text after the
     * last separator is kept, and content without any separator yields a
     * single chunk. An empty or invalid pattern also yields a single chunk.
     *
     * @param string $pattern  literal separator, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return $this
     */
    public function split_chunks($pattern, $is_regex = false): PfPageparser
    {
        $text = (string) $this->content;

        if ((string) $pattern === '') {
            // explode() rejects an empty separator
            $this->chunks = [$text];
        } elseif (!$is_regex) {
            $this->chunks = explode($pattern, $text);
        } else {
            $pieces = preg_split($pattern, $text);
            // preg_split() returns false when the pattern is invalid
            $this->chunks = ($pieces === false) ? [$text] : $pieces;
        }

        return $this;
    }

    /**
     * Keep only the chunks that match at least one "keep" pattern (when any
     * are given) and none of the "remove" patterns. Keys of the surviving
     * chunks are preserved.
     *
     * @param array|string $pattern_keep   patterns that should be found (combined with OR)
     * @param array|string $pattern_remove patterns that should not be found (combined with OR)
     * @param bool         $is_regex       whether patterns are regex or just strings
     * @return $this
     */
    public function filter_chunks($pattern_keep = [], $pattern_remove = [], bool $is_regex = false): PfPageparser
    {
        if (empty($this->chunks)) {
            // not split in chunks yet: nothing to filter
            return $this;
        }

        $keep = $this->as_pattern_list($pattern_keep);
        $remove = $this->as_pattern_list($pattern_remove);

        foreach ($this->chunks as $id => $chunk) {
            $wanted = empty($keep) || $this->matches_any($chunk, $keep, $is_regex);
            $unwanted = !empty($remove) && $this->matches_any($chunk, $remove, $is_regex);
            if (!$wanted || $unwanted) {
                unset($this->chunks[$id]);
            }
        }

        return $this;
    }

    /**
     * Extract values from every chunk with a regex and store them as the
     * parsed results (one list of strings per item that had a match).
     *
     * The first capture group of each match is collected; when the pattern
     * has no capture group the whole match is used. By default a second call
     * refines the previous results; pass $restart = true to start again from
     * the chunks. The stored results are always replaced, never appended to.
     *
     * @param string $pattern PCRE pattern
     * @param bool   $restart parse the chunks again instead of the previous results
     * @return $this
     */
    public function parse_from_chunks(string $pattern, bool $restart = false): PfPageparser
    {
        if (empty($this->chunks)) {
            return $this;
        }

        $source = ($restart || empty($this->parsed)) ? $this->chunks : $this->parsed;
        $results = [];
        foreach ($source as $item) {
            $found = [];
            // a chunk is a string, a previous result is a list of strings
            foreach ((array) $item as $text) {
                $matches = [];
                if (preg_match_all($pattern, (string) $text, $matches, PREG_SET_ORDER)) {
                    foreach ($matches as $match) {
                        $found[] = $match[1] ?? $match[0];
                    }
                }
            }
            if (!empty($found)) {
                $results[] = $found;
            }
        }
        $this->parsed = $results;

        return $this;
    }

    /**
     * Misspelled, array-returning variant of parse_from_chunks(), kept for
     * backward compatibility with its declared signature.
     *
     * @deprecated use parse_from_chunks() followed by results()
     * @param string $pattern PCRE pattern
     * @param bool   $restart parse the chunks again instead of the previous results
     * @return array the parsed results (empty when nothing matched or nothing was split)
     */
    public function parse_fom_chunks(string $pattern, bool $restart = false): array
    {
        $this->parse_from_chunks($pattern, $restart);

        return $this->parsed;
    }

    /**
     * @return array the current chunks
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing force returning the chunks even when parsed results exist
     * @return array the parsed results, or the chunks when nothing was parsed
     */
    public function results(bool $before_parsing = false): array
    {
        if ($before_parsing || empty($this->parsed)) {
            return $this->chunks;
        }

        return $this->parsed;
    }

    /**
     * @param string $pattern  PCRE pattern
     * @param string $haystack
     * @return string the full text of the first match, or "" when there is none
     */
    public function preg_get($pattern, $haystack)
    {
        $matches = [];
        if (preg_match($pattern, $haystack, $matches)) {
            return $matches[0];
        }

        return "";
    }

    /**
     * PRIVATE HELPERS
     */

    /**
     * Locate the first occurrence of $pattern in the content.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return array [byte offset, byte length] of the match, or [] when not found
     */
    private function find_first($pattern, bool $is_regex): array
    {
        $text = (string) $this->content;
        if ((string) $pattern === '') {
            return [];
        }

        if ($is_regex) {
            $matches = [];
            // preg_match() returns the match count; the position comes from PREG_OFFSET_CAPTURE
            if (preg_match($pattern, $text, $matches, PREG_OFFSET_CAPTURE) === 1) {
                return [$matches[0][1], strlen($matches[0][0])];
            }

            return [];
        }

        $offset = strpos($text, (string) $pattern);
        if ($offset === false) {
            return [];
        }

        return [$offset, strlen((string) $pattern)];
    }

    /**
     * Normalise a single pattern or a list of patterns to a list.
     *
     * @param array|string|null $patterns
     * @return array
     */
    private function as_pattern_list($patterns): array
    {
        if (is_array($patterns)) {
            return $patterns;
        }
        if ($patterns === null || $patterns === false || $patterns === '') {
            return [];
        }

        return [$patterns];
    }

    /**
     * Tell whether $chunk matches at least one of $patterns.
     *
     * @param string $chunk
     * @param array  $patterns
     * @param bool   $is_regex
     * @return bool
     */
    private function matches_any($chunk, array $patterns, bool $is_regex): bool
    {
        foreach ($patterns as $pattern) {
            if ($is_regex) {
                if (preg_match($pattern, (string) $chunk) === 1) {
                    return true;
                }
            } elseif (strpos((string) $chunk, (string) $pattern) !== false) {
                return true;
            }
        }

        return false;
    }
}
271
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.