1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Pforret\PfPageparser; |
4
|
|
|
|
5
|
|
|
use GuzzleHttp\Client; |
6
|
|
|
use GuzzleHttp\Exception\GuzzleException; |
7
|
|
|
use Psr\Log\LoggerInterface; |
8
|
|
|
|
9
|
|
|
/**
 * Fluent helper for scraping text out of a document.
 *
 * Typical flow: load content (URL, file or string), optionally clean/trim it,
 * split it into chunks, filter the chunks, then extract values from each chunk
 * with a regular expression and read them back with results().
 *
 * Every mutating method returns $this so calls can be chained.
 */
class PfPageparser
{
    /** @var array defaults merged with the array given to the constructor */
    private $config;

    /** @var string raw document currently loaded */
    private $content = "";

    /** @var array pieces of $content produced by split_chunks() / filter_chunks() */
    private $chunks = [];

    /** @var array values extracted from the chunks by parse_fom_chunks() */
    private $parsed = [];

    /** @var LoggerInterface|null optional logger; nothing is logged when null */
    private $logger = null;

    /**
     * @var bool true once $chunks has been populated for the current content.
     *           Needed to tell "never split" apart from "everything filtered out",
     *           which both leave $chunks empty.
     */
    private $chunks_ready = false;

    /**
     * @param array                $config overrides for the default settings
     *                                     (userAgent, cacheTtl, timeout, method, headers)
     * @param LoggerInterface|null $logger optional PSR-3 logger for error reporting
     */
    public function __construct(array $config=[], LoggerInterface $logger=null)
    {
        $defaults = [
            'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'cacheTtl' => 3600,
            'timeout' => 5, // Guzzle timeout
            'method' => 'GET',
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            ],
        ];

        if ($logger) {
            $this->logger = $logger;
        }
        // Shallow merge: a caller-supplied 'headers' array replaces the default one entirely.
        $this->config = array_merge($defaults, $config);
    }

    /**
     * @return array the merged configuration
     */
    public function get_config()
    {
        return $this->config;
    }

    /* ------------------------------------------
     * LOADING THE CONTENT FROM A URL/FILE/STRING
     */

    /**
     * Reset all per-document state before new content is loaded.
     */
    private function initialize()
    {
        $this->content = "";
        $this->chunks = [];
        $this->parsed = [];
        $this->chunks_ready = false;
    }

    /**
     * Fetch a URL with Guzzle and use the response body as content.
     *
     * Transport and HTTP errors raised as GuzzleException are logged at
     * 'error' level and leave the content empty; they are not rethrown.
     *
     * @param string $url
     * @param array  $options per-call overrides of the constructor config
     * @return $this
     */
    public function load_from_url(string $url, array $options=[]): PfPageparser
    {
        // TODO: load with caching
        $this->initialize();
        $options = array_merge($this->config, $options);

        $client_settings = ['headers' => $options['headers']];
        if (isset($options['timeout'])) {
            // The configured timeout was previously never handed to Guzzle.
            $client_settings['timeout'] = $options['timeout'];
        }
        $client = new Client($client_settings);

        try {
            $response = $client->request($options['method'], $url);
            // Casting the stream reads the complete body regardless of the current read position.
            $this->content = (string) $response->getBody();
        } catch (GuzzleException $error) {
            $this->log($error->getMessage(), 'error');
        }

        return $this;
    }

    /**
     * Use the contents of a local file as content.
     *
     * A missing or unreadable file is logged and leaves the content empty.
     *
     * @param string $filename
     * @return $this
     */
    public function load_from_file(string $filename): PfPageparser
    {
        $this->initialize();

        if (!file_exists($filename) || is_dir($filename)) {
            $this->log("Cannot load [$filename]: not a file", 'error');

            return $this;
        }

        $data = file_get_contents($filename);
        if ($data === false) {
            // Never store false in $content: the string-typed getters rely on it being a string.
            $this->log("Cannot load [$filename]: read failed", 'error');

            return $this;
        }
        $this->content = $data;

        return $this;
    }

    /**
     * Use the given string as content.
     *
     * @param string $string
     * @return $this
     */
    public function load_from_string(string $string): PfPageparser
    {
        $this->initialize();
        $this->content = $string;

        return $this;
    }

    /**
     * Misspelled original name of load_from_string(), kept for backward compatibility.
     *
     * @param string $string
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        return $this->load_from_string($string);
    }

    /**
     * Normalise whitespace in the raw content.
     *
     * @param bool $remove_linefeeds replace each run of "\n" with one space
     * @param bool $shrink_spaces    replace each run of 2+ whitespace characters with one space
     * @return $this
     */
    public function cleanup_html($remove_linefeeds=true,$shrink_spaces=true ): PfPageparser
    {
        $rules = [];
        if ($remove_linefeeds) {
            $rules[] = "|\n+|"; // remove line feeds
        }
        if ($shrink_spaces) {
            $rules[] = "|\s\s+|"; // remove multiple spaces
        }

        foreach ($rules as $rule) {
            $cleaned = preg_replace($rule, " ", $this->content);
            // preg_replace() yields null on a PCRE failure; keep the previous content then.
            if (is_string($cleaned)) {
                $this->content = $cleaned;
            }
        }

        return $this;
    }

    /* ------------------------------------------
     * GET THE RAW CONTENT
     */

    /**
     * @return string the raw content (kept for backward compatibility, same as raw())
     */
    public function get_content(): string
    {
        return $this->content;
    }

    /**
     * @return string the raw content
     */
    public function raw(): string
    {
        return $this->content;
    }

    /* ------------------------------------------
     * MODIFY THE RAW CONTENT
     */

    /**
     * Byte offset of the first occurrence of $pattern in the raw content.
     *
     * @param string $pattern  literal text, or a full PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return int|false offset, or false when the pattern is empty, invalid or absent
     */
    private function find_offset(string $pattern, bool $is_regex)
    {
        if ($pattern === '') {
            return false;
        }
        if (!$is_regex) {
            return strpos($this->content, $pattern);
        }

        // preg_match() itself only returns 0/1; the position comes from PREG_OFFSET_CAPTURE.
        $found = preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE);
        if ($found === false) {
            $this->log("Regex [$pattern] could not be evaluated", 'error');

            return false;
        }

        return $found === 1 ? $matches[0][1] : false;
    }

    /**
     * Drop everything before the first occurrence of $pattern (the match itself is kept).
     * Content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_before(string $pattern,bool $is_regex=false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        if ($offset !== false) {
            $this->content = (string) substr($this->content, $offset);
        }

        return $this;
    }

    /**
     * Drop the first occurrence of $pattern and everything after it.
     * Content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_after(string $pattern,bool $is_regex=false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        // Strict comparison: a match at offset 0 is a real match and empties the content.
        if ($offset !== false) {
            $this->content = (string) substr($this->content, 0, $offset);
        }

        return $this;
    }

    /**
     * Apply trim_before($before) followed by trim_after($after).
     *
     * @param string $before
     * @param string $after
     * @param bool   $is_regex applies to both patterns
     * @return $this
     */
    public function trim(string $before="<body",string $after="</body",bool $is_regex=false): PfPageparser
    {
        $this->trim_before($before, $is_regex);
        $this->trim_after($after, $is_regex);

        return $this;
    }

    /* ------------------------------------------
     * RAW CONTENT => CHUNKS
     */

    /**
     * Split the raw content into chunks on a literal or regex separator.
     *
     * Separators are not part of the chunks. Both modes return every piece,
     * including the text before the first and after the last separator.
     * An empty or invalid separator yields a single chunk holding the whole content.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function split_chunks(string $pattern,bool $is_regex=false): PfPageparser
    {
        $this->chunks_ready = true;

        if ($pattern === '') {
            // explode() rejects an empty delimiter (ValueError on PHP 8).
            $this->chunks = [$this->content];

            return $this;
        }

        if (!$is_regex) {
            $this->chunks = explode($pattern, $this->content);

            return $this;
        }

        $pieces = preg_split($pattern, $this->content);
        if (!is_array($pieces)) {
            $this->log("Regex [$pattern] could not be evaluated", 'error');
            $pieces = [$this->content];
        }
        $this->chunks = $pieces;

        return $this;
    }

    /**
     * Make sure $chunks is populated: when the content was never split,
     * the whole content becomes the single chunk.
     *
     * @return bool false when there is nothing to work on (no chunks and no content)
     */
    private function ensure_chunks(): bool
    {
        if ($this->chunks_ready) {
            return true;
        }
        if ((string) $this->content === '') {
            return false;
        }
        $this->chunks = [$this->content];
        $this->chunks_ready = true;

        return true;
    }

    /**
     * Tell whether at least one of the patterns occurs in the chunk.
     *
     * @param string $chunk
     * @param array  $patterns literal strings, or PCRE patterns when $is_regex is true
     * @param bool   $is_regex
     * @return bool
     */
    private function matches_any(string $chunk, array $patterns, bool $is_regex): bool
    {
        foreach ($patterns as $pattern) {
            if ($is_regex) {
                $hit = preg_match($pattern, $chunk) === 1;
            } else {
                // strpos() !== false instead of strstr(): a remainder of "0" is falsy but still a match.
                $hit = strpos($chunk, (string) $pattern) !== false;
            }
            if ($hit) {
                return true;
            }
        }

        return false;
    }

    /**
     * Keep only the chunks that contain at least one $pattern_keep entry
     * (when given) and none of the $pattern_remove entries (when given).
     *
     * Surviving chunks keep their original array keys.
     *
     * @param array $pattern_keep
     * @param array $pattern_remove
     * @param bool $is_regex
     * @return $this
     */
    public function filter_chunks(array $pattern_keep=[], array $pattern_remove=[], bool $is_regex=false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }

        foreach ($this->chunks as $id => $chunk) {
            $wanted = empty($pattern_keep) || $this->matches_any($chunk, $pattern_keep, $is_regex);
            $banned = !empty($pattern_remove) && $this->matches_any($chunk, $pattern_remove, $is_regex);

            if (!$wanted || $banned) {
                unset($this->chunks[$id]);
            }
        }

        return $this;
    }

    /**
     * Extract values from every chunk with a regular expression.
     *
     * For each item that matches, one entry is added to the parsed results:
     * the list of all captures, or a single capture when $only_one is true
     * (the last match of the item, as in the original implementation).
     * The captured value is group 1, or the whole match when group 1 is absent.
     *
     * On the first call (or when $restart is true) the chunks are the input.
     * On later calls the previous string results are the input and are
     * replaced by the refined results; non-string results are skipped.
     *
     * @param string $pattern
     * @param bool $only_one
     * @param bool $restart
     * @return PfPageparser
     */
    public function parse_from_chunks(string $pattern,bool $only_one=false, bool $restart=false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }

        // Snapshot the input first: the result list is rebuilt from scratch below.
        $sources = ($restart or empty($this->parsed)) ? $this->chunks : $this->parsed;
        $this->parsed = [];

        foreach ($sources as $source) {
            if (!is_string($source)) {
                continue;
            }
            $matches = [];
            if (!preg_match_all($pattern, $source, $matches, PREG_SET_ORDER)) {
                continue;
            }

            $captures = [];
            foreach ($matches as $match) {
                $captures[] = isset($match[1]) ? $match[1] : $match[0];
            }
            $this->parsed[] = $only_one ? end($captures) : $captures;
        }

        return $this;
    }

    /**
     * Misspelled original name of parse_from_chunks(), kept for backward compatibility.
     *
     * @param string $pattern
     * @param bool $only_one
     * @param bool $restart
     * @return PfPageparser
     */
    public function parse_fom_chunks(string $pattern,bool $only_one=false, bool $restart=false): PfPageparser
    {
        return $this->parse_from_chunks($pattern, $only_one, $restart);
    }

    /**
     * @return array the current chunks
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing force returning the chunks instead of the parsed values
     * @return array the parsed values, or the chunks when nothing was parsed
     */
    public function results(bool $before_parsing=false): array
    {
        if ($before_parsing or empty($this->parsed)) {
            return $this->chunks;
        }

        return $this->parsed;
    }

    /**
     * Forward a message to the injected logger, if any.
     *
     * @param string $text
     * @param string $level PSR-3 log level name
     */
    private function log(string $text,string $level = 'info')
    {
        if ($this->logger) {
            $this->logger->log($level, $text);
        }
    }
}
272
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.