1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Pforret\PfPageparser; |
4
|
|
|
|
5
|
|
|
|
6
|
|
|
/**
 * Small fluent helper for scraping text out of an HTML page.
 *
 * Typical flow: load content (URL / file / string), optionally trim it to the
 * interesting region, split it into chunks on a separator, filter the chunks,
 * and finally extract values from each chunk with a regex.
 *
 * All "load", "trim", "split", "filter" and "parse" methods return $this so
 * they can be chained.
 */
class PfPageparser
{
    /** @var array effective settings: built-in defaults overridden by the constructor argument */
    private $config;

    /** @var string the raw content currently being worked on */
    private $content = "";

    /** @var array pieces of $content produced by split_chunks() (keys may be sparse after filter_chunks()) */
    private $chunks = [];

    /** @var array one list of captured values per chunk, filled by parse_fom_chunks() */
    private $parsed = [];

    /**
     * @param array $config overrides for the defaults below; keys are
     *                      'userAgent' and 'timeOut' (both used by load_from_url())
     *                      and 'cacheTtl' (stored, but not read anywhere in this class)
     */
    public function __construct($config=[])
    {
        $defaults = [
            'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'cacheTtl' => 3600,
            'timeOut' => 10,
        ];

        $this->config = array_merge($defaults, $config);
    }

    /**
     * @return array the merged configuration (defaults + constructor overrides)
     */
    public function get_config()
    {
        return $this->config;
    }

    /* ------------------------------------------
     * LOADING THE CONTENT FROM A URL/FILE/STRING
     */

    /**
     * Fetch a URL with cURL (following redirects) and store the response body.
     *
     * @param string $url     address to download
     * @param array  $options per-call overrides for 'userAgent' / 'timeOut'
     * @return $this content is set to "" when the transfer fails
     */
    public function load_from_url(string $url, array $options = []): PfPageparser
    {
        // TODO: load with guzzle & caching
        $options = array_merge($this->config, $options);

        $ch = curl_init();
        if ($ch === false) {
            // could not create a cURL handle: behave like a failed transfer
            $this->content = "";
            return $this;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $options['userAgent']);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        // the same limit is applied to the connect phase and to the whole transfer
        curl_setopt($ch, CURLOPT_TIMEOUT, $options['timeOut']);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $options['timeOut']);
        $response = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; keep $content a real string
        $this->content = is_string($response) ? $response : "";
        return $this;
    }

    /**
     * Load the content of a local file.
     *
     * @param string $filename path of the file to read
     * @return $this the current content is left untouched when the file
     *               does not exist or cannot be read
     */
    public function load_from_file(string $filename): PfPageparser
    {
        if (file_exists($filename)) {
            $data = file_get_contents($filename);
            // file_get_contents() returns false on failure; never store that
            if ($data !== false) {
                $this->content = $data;
            }
        }
        return $this;
    }

    /**
     * Use the given string as content.
     *
     * The method name is misspelled ("fom"); it is kept because it is public API.
     *
     * @param string $string HTML (or any text) to work on
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        $this->content = $string;
        return $this;
    }

    /* ------------------------------------------
     * GET RAW CONTENT BACK
     */

    /**
     * Alias of raw(), kept for backward compatibility.
     *
     * @return string
     */
    public function get_content():string
    {
        return $this->raw();
    }

    /**
     * @return string the current (possibly trimmed) content
     */
    public function raw(): string
    {
        return $this->content;
    }

    /* ------------------------------------------
     * MODIFY THE RAW CONTENT
     */

    /**
     * Drop everything that precedes the first occurrence of $pattern.
     * The matched text itself is kept. Nothing happens when there is no match.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex interpret $pattern as a regular expression
     * @return $this
     */
    public function trim_before(string $pattern, bool $is_regex = false): PfPageparser
    {
        $position = $this->find_position($pattern, $is_regex);
        // a match at offset 0 means there is nothing in front of it to remove
        if ($position !== false && $position > 0) {
            $this->content = (string) substr($this->content, $position);
        }
        return $this;
    }

    /**
     * Drop the first occurrence of $pattern and everything that follows it.
     * Nothing happens when there is no match.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex interpret $pattern as a regular expression
     * @return $this
     */
    public function trim_after(string $pattern, bool $is_regex = false): PfPageparser
    {
        $position = $this->find_position($pattern, $is_regex);
        // compare with false explicitly: offset 0 is a valid match (result is "")
        if ($position !== false) {
            $this->content = (string) substr($this->content, 0, $position);
        }
        return $this;
    }

    /**
     * Keep only the part of the content from $before up to (not including) $after.
     * Equivalent to trim_before($before) followed by trim_after($after).
     *
     * @param string $before   start marker (kept)
     * @param string $after    end marker (removed)
     * @param bool   $is_regex interpret both markers as regular expressions
     * @return $this
     */
    public function trim(string $before="<body",string $after="</body",bool $is_regex=false): PfPageparser
    {
        $this->trim_before($before, $is_regex);
        $this->trim_after($after, $is_regex);
        return $this;
    }

    /* ------------------------------------------
     * RAW CONTENT => CHUNKS
     */

    /**
     * Split the content into chunks on a text or regex separator.
     * The separators themselves are not part of the chunks. When the separator
     * does not occur (or is empty / an invalid regex) the whole content becomes
     * the single chunk.
     *
     * @param string $pattern  literal separator, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex interpret $pattern as a regular expression
     * @return $this
     */
    public function split_chunks(string $pattern, bool $is_regex = false): PfPageparser
    {
        if ($pattern === "") {
            // explode() rejects an empty delimiter; there is nothing to split on
            $this->chunks = [$this->content];
            return $this;
        }

        if (!$is_regex) {
            $this->chunks = explode($pattern, $this->content);
            return $this;
        }

        // preg_split() gives the same "text between separators" result as
        // explode(), including the tail after the last separator
        $parts = preg_split($pattern, $this->content);
        $this->chunks = ($parts === false) ? [$this->content] : $parts;
        return $this;
    }

    /**
     * Remove chunks that do not satisfy the keep/remove rules.
     * Original chunk keys are preserved, so the list can become sparse.
     *
     * @param array|string $pattern_keep   patterns of which at least one must be found (OR); empty = no restriction
     * @param array|string $pattern_remove patterns of which none may be found (OR); empty = no restriction
     * @param bool         $is_regex       whether patterns are regex or just strings
     * @return $this
     */
    public function filter_chunks($pattern_keep=[],$pattern_remove=[],bool $is_regex=false): PfPageparser
    {
        if (empty($this->chunks)) {
            // not split in chunks yet: nothing to filter
            return $this;
        }

        $keep_list = $this->as_pattern_list($pattern_keep);
        $remove_list = $this->as_pattern_list($pattern_remove);

        foreach ($this->chunks as $id => $chunk) {
            $keep_chunk = true;
            if (!empty($keep_list) && !$this->matches_any($chunk, $keep_list, $is_regex)) {
                $keep_chunk = false;
            }
            if ($keep_chunk && !empty($remove_list) && $this->matches_any($chunk, $remove_list, $is_regex)) {
                $keep_chunk = false;
            }
            if (!$keep_chunk) {
                unset($this->chunks[$id]);
            }
        }
        return $this;
    }

    /**
     * Extract values from each chunk with a regex and append them to the parsed results.
     *
     * For every item in which $pattern matches, one array is appended to the
     * results; it holds the first capture group of each match (or the whole
     * match when that group is absent).
     *
     * The method name is misspelled ("fom"); it is kept because it is public API.
     *
     * @param string $pattern PCRE pattern, normally with one capture group
     * @param bool   $restart read from the chunks even when parsed results already exist;
     *                        existing results are not cleared, new ones are appended
     * @return $this
     */
    public function parse_fom_chunks(string $pattern, bool $restart = false): PfPageparser
    {
        if (empty($this->chunks)) {
            return $this;
        }

        // Work on a copy: $this->parsed is appended to inside the loop
        $items = ($restart or empty($this->parsed)) ? $this->chunks : $this->parsed;

        foreach ($items as $item) {
            if (!is_string($item)) {
                // earlier parsed results are arrays; preg_match_all() only accepts strings
                // NOTE(review): a second pass without $restart therefore adds nothing — confirm intended behaviour
                continue;
            }
            $matches = [];
            if (preg_match_all($pattern, $item, $matches, PREG_SET_ORDER)) {
                $chunk_results = [];
                foreach ($matches as $match) {
                    // fall back to the full match when group 1 is missing
                    $chunk_results[] = $match[1] ?? $match[0];
                }
                $this->parsed[] = $chunk_results;
            }
        }
        return $this;
    }

    /**
     * @return array the chunks as they are after splitting/filtering
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing force returning the chunks instead of the parsed values
     * @return array the parsed values, or the chunks when nothing was parsed yet
     */
    public function results(bool $before_parsing = false): array
    {
        if ($before_parsing or empty($this->parsed)) {
            return $this->chunks;
        }
        return $this->parsed;
    }

    /**
     * Return the text matched by $pattern in $haystack.
     *
     * @param string $pattern  PCRE pattern
     * @param string $haystack text to search
     * @return string the whole first match, or "" when there is none
     */
    public function preg_get($pattern,$haystack)
    {
        $matches = [];
        if (preg_match($pattern, $haystack, $matches)) {
            return $matches[0];
        }
        return "";
    }

    /* ------------------------------------------
     * PRIVATE HELPERS
     */

    /**
     * Byte offset of the first occurrence of $pattern in the content.
     *
     * @param string $pattern  literal text or PCRE pattern
     * @param bool   $is_regex interpret $pattern as a regular expression
     * @return int|false offset of the match, false when not found
     */
    private function find_position(string $pattern, bool $is_regex)
    {
        if (!$is_regex) {
            return strpos($this->content, $pattern);
        }
        // preg_match() itself only returns 0/1; the offset needs PREG_OFFSET_CAPTURE
        if (preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE) === 1) {
            return $matches[0][1];
        }
        return false;
    }

    /**
     * Normalise a "pattern or list of patterns" argument to a list.
     *
     * @param mixed $patterns array of patterns, a single pattern, or null/false/"" for none
     * @return array
     */
    private function as_pattern_list($patterns): array
    {
        if (is_array($patterns)) {
            return $patterns;
        }
        if ($patterns === null || $patterns === false || $patterns === '') {
            return [];
        }
        return [$patterns];
    }

    /**
     * Tell whether at least one of the patterns occurs in $text.
     *
     * @param string $text     text to inspect
     * @param array  $patterns literal strings or PCRE patterns
     * @param bool   $is_regex interpret the patterns as regular expressions
     * @return bool
     */
    private function matches_any(string $text, array $patterns, bool $is_regex): bool
    {
        foreach ($patterns as $pattern) {
            $found = $is_regex
                ? preg_match($pattern, $text) === 1
                : strpos($text, (string) $pattern) !== false;
            if ($found) {
                return true;
            }
        }
        return false;
    }
}
263
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.