Completed
Push — master ( 4ab27a...55c8be )
by Peter
01:04
created

PfPageparser::load_from_url()   A

Complexity

Conditions 2
Paths 3

Size

Total Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 17
rs 9.7
c 0
b 0
f 0
cc 2
nc 3
nop 2
1
<?php
2
3
namespace Pforret\PfPageparser;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Exception\GuzzleException;
7
use Psr\Log\LoggerInterface;
8
9
/**
 * Small fluent helper for scraping text/HTML.
 *
 * Typical flow: load content (URL, file or string), optionally trim it,
 * split it into chunks, filter those chunks, then extract values from them
 * with a regular expression. Every step returns $this so calls can be chained.
 *
 * All offsets and lengths are byte-based (substr/strpos/PCRE offsets).
 */
class PfPageparser
{
    /** @var array effective configuration: built-in defaults overlaid with the caller's values */
    private $config;

    /** @var string raw content currently loaded */
    private $content = "";

    /** @var array pieces of $content produced by split_chunks()/filter_chunks() */
    private $chunks = [];

    /** @var array values extracted by parse_fom_chunks(); one entry per item that matched */
    private $parsed = [];

    /** @var LoggerInterface|null optional PSR-3 logger; when null nothing is logged */
    private $logger = null;

    /**
     * @param array                $config overrides for the defaults below (merged with array_merge, so a
     *                                     supplied 'headers' array replaces the default headers wholesale)
     * @param LoggerInterface|null $logger optional logger used to report load failures
     */
    public function __construct(array $config = [], ?LoggerInterface $logger = null)
    {
        $user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36';
        $defaults = [
            'userAgent' => $user_agent,
            'cacheTtl'  => 3600,
            'timeout'   => 5,      // Guzzle timeout
            'method'    => 'GET',
            'headers'   => [
                'User-Agent' => $user_agent,
            ],
        ];

        $this->logger = $logger;
        $this->config = $this->with_user_agent(array_merge($defaults, $config), $config);
    }

    /**
     * @return array the effective configuration (defaults + constructor overrides)
     */
    public function get_config()
    {
        return $this->config;
    }

    /* ------------------------------------------
     * LOADING THE CONTENT FROM A URL/FILE/STRING
     */

    /**
     * Reset content, chunks and parsed results before loading something new.
     */
    private function initialize()
    {
        $this->content = "";
        $this->chunks = [];
        $this->parsed = [];
    }

    /**
     * Copy an explicitly supplied 'userAgent' override into the User-Agent header,
     * unless the same override set also specifies that header itself.
     *
     * @param array $settings  already-merged settings (must contain 'headers')
     * @param array $overrides the caller-supplied values that were merged in
     * @return array $settings, possibly with headers['User-Agent'] replaced
     */
    private function with_user_agent(array $settings, array $overrides): array
    {
        if (
            isset($overrides['userAgent'])
            && !isset($overrides['headers']['User-Agent'])
            && is_array($settings['headers'])
        ) {
            $settings['headers']['User-Agent'] = $overrides['userAgent'];
        }

        return $settings;
    }

    /**
     * Fetch $url and keep the response body as the current content.
     *
     * Request failures (GuzzleException) are logged at 'error' level and otherwise
     * swallowed: the content is then left empty.
     *
     * @param string $url
     * @param array  $options per-call overrides of the configuration ('method', 'headers', 'timeout', 'userAgent')
     * @return $this
     */
    public function load_from_url(string $url, array $options = []): PfPageparser
    {
        // TODO: load with caching
        $this->initialize();
        $settings = $this->with_user_agent(array_merge($this->config, $options), $options);
        $client = new Client([
            'headers' => $settings['headers'],
            // the configured timeout used to be ignored, so requests could hang indefinitely
            'timeout' => $settings['timeout'],
        ]);
        try {
            $response = $client->request($settings['method'], $url);
            $this->content = $response->getBody()->getContents();
        } catch (GuzzleException $error) {
            $this->log($error->getMessage(), 'error');
        }

        return $this;
    }

    /**
     * Load the content from a local file. A missing or unreadable file (or a
     * directory) is logged at 'error' level and leaves the content empty.
     *
     * @param string $filename
     * @return $this
     */
    public function load_from_file(string $filename): PfPageparser
    {
        $this->initialize();
        if (!is_file($filename) || !is_readable($filename)) {
            $this->log("Cannot read file [$filename]", 'error');

            return $this;
        }
        $data = file_get_contents($filename);
        if ($data === false) {
            // never store false in a property that is returned as string
            $this->log("Cannot read file [$filename]", 'error');

            return $this;
        }
        $this->content = $data;

        return $this;
    }

    /**
     * Use $string as the content.
     *
     * The method name is misspelled but kept for backward compatibility;
     * load_from_string() does the same.
     *
     * @param string $string
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        $this->initialize();
        $this->content = $string;

        return $this;
    }

    /**
     * Correctly spelled alias of load_fom_string().
     *
     * @param string $string
     * @return $this
     */
    public function load_from_string(string $string): PfPageparser
    {
        return $this->load_fom_string($string);
    }

    /* ------------------------------------------
     * GET RAW CONTENT BACK
     */

    /**
     * @return string the current raw content (kept for backward compatibility, same as raw())
     */
    public function get_content(): string
    {
        return $this->raw();
    }

    /**
     * @return string the current raw content
     */
    public function raw(): string
    {
        return $this->content;
    }

    /* ------------------------------------------
     * MODIFY THE RAW CONTENT
     */

    /**
     * Byte offset of the first occurrence of $pattern in the content.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return int|null offset of the match start, or null when the pattern is empty,
     *                  invalid or not found
     */
    private function find_offset(string $pattern, bool $is_regex): ?int
    {
        if ($pattern === '') {
            return null;
        }
        if ($is_regex) {
            $matches = [];
            // preg_match() returns a match COUNT; the position is only available via PREG_OFFSET_CAPTURE
            if (preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE) === 1) {
                return $matches[0][1];
            }

            return null;
        }
        $position = strpos($this->content, $pattern);

        return $position === false ? null : $position;
    }

    /**
     * Drop everything before the first occurrence of $pattern (the match itself is kept).
     * The content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_before(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        if ($offset !== null) {
            $this->content = substr($this->content, $offset);
        }

        return $this;
    }

    /**
     * Drop the first occurrence of $pattern and everything after it.
     * The content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_after(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        // explicit null check: a match at offset 0 must empty the content, not be ignored
        if ($offset !== null) {
            $this->content = substr($this->content, 0, $offset);
        }

        return $this;
    }

    /**
     * trim_before($before) followed by trim_after($after).
     *
     * @param string $before
     * @param string $after
     * @param bool   $is_regex applies to both patterns
     * @return $this
     */
    public function trim(string $before = "<body", string $after = "</body", bool $is_regex = false): PfPageparser
    {
        $this->trim_before($before, $is_regex);
        $this->trim_after($after, $is_regex);

        return $this;
    }

    /* ------------------------------------------
     * RAW CONTENT => CHUNKS
     */

    /**
     * Make sure there is something to work on: when no chunks exist yet,
     * the whole content becomes the single chunk.
     *
     * @return bool false when there are no chunks and the content is empty
     */
    private function ensure_chunks(): bool
    {
        if (empty($this->chunks)) {
            if ($this->content === '') {
                return false;
            }
            $this->chunks = [$this->content];
        }

        return true;
    }

    /**
     * Split the content into chunks on a text or regex separator.
     * The separators themselves are not part of the chunks; in both modes the
     * text before the first and after the last separator is included.
     * An empty or invalid pattern yields the whole content as one chunk.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function split_chunks(string $pattern, bool $is_regex = false): PfPageparser
    {
        if ($pattern === '') {
            $this->chunks = [$this->content];

            return $this;
        }
        if ($is_regex) {
            $pieces = preg_split($pattern, $this->content);
            $this->chunks = ($pieces === false) ? [$this->content] : $pieces;
        } else {
            $this->chunks = explode($pattern, $this->content);
        }

        return $this;
    }

    /**
     * Tell whether $chunk matches at least one of $patterns.
     * Empty literal patterns are ignored.
     *
     * @param string $chunk
     * @param array  $patterns literal strings, or PCRE patterns when $is_regex is true
     * @param bool   $is_regex
     * @return bool
     */
    private function matches_any(string $chunk, array $patterns, bool $is_regex): bool
    {
        foreach ($patterns as $pattern) {
            if ($pattern === '') {
                continue;
            }
            if ($is_regex) {
                if (preg_match($pattern, $chunk) === 1) {
                    return true;
                }
            } elseif (strpos($chunk, $pattern) !== false) {
                // strpos() !== false instead of strstr() truthiness, which fails when the tail is "0"
                return true;
            }
        }

        return false;
    }

    /**
     * Keep only the chunks that match at least one $pattern_keep entry (when given)
     * and none of the $pattern_remove entries (when given).
     * Removed chunks are unset, so the remaining chunks keep their original keys.
     *
     * @param array $pattern_keep
     * @param array $pattern_remove
     * @param bool  $is_regex
     * @return $this
     */
    public function filter_chunks(array $pattern_keep = [], array $pattern_remove = [], bool $is_regex = false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }
        foreach ($this->chunks as $id => $chunk) {
            $keep_chunk = empty($pattern_keep) || $this->matches_any($chunk, $pattern_keep, $is_regex);
            if ($keep_chunk && !empty($pattern_remove) && $this->matches_any($chunk, $pattern_remove, $is_regex)) {
                $keep_chunk = false;
            }
            if (!$keep_chunk) {
                unset($this->chunks[$id]);
            }
        }

        return $this;
    }

    /**
     * Extract values from the chunks with a regular expression.
     *
     * The first call (or any call with $restart) works on the chunks; later calls
     * refine the previously parsed results, which are then REPLACED by the new
     * results. Items without a match produce no entry. The extracted value is
     * capture group 1, or the whole match when the match has no group 1.
     *
     * The method name is misspelled but kept for backward compatibility;
     * parse_from_chunks() does the same.
     *
     * @param string $pattern  PCRE pattern
     * @param bool   $only_one true: each entry is a single string (the last match of the item);
     *                         false: each entry is the list of all matches of the item
     * @param bool   $restart  true: start again from the chunks instead of the previous results
     * @return PfPageparser
     */
    public function parse_fom_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }
        // Work on a snapshot: iterating $this->parsed while appending to it mixed old and new results.
        $items = ($restart || empty($this->parsed)) ? $this->chunks : $this->parsed;
        $this->parsed = [];

        foreach ($items as $item) {
            // a previous pass without $only_one leaves lists of strings; search each of them
            $subjects = is_array($item) ? $item : [$item];
            $chunk_results = [];
            $found = false;
            foreach ($subjects as $subject) {
                if (!is_string($subject)) {
                    continue;
                }
                $matches = [];
                if (!preg_match_all($pattern, $subject, $matches, PREG_SET_ORDER)) {
                    continue;
                }
                foreach ($matches as $match) {
                    $value = $match[1] ?? $match[0];
                    if ($only_one) {
                        $chunk_results = $value;
                    } else {
                        $chunk_results[] = $value;
                    }
                    $found = true;
                }
            }
            if ($found) {
                $this->parsed[] = $chunk_results;
            }
        }

        return $this;
    }

    /**
     * Correctly spelled alias of parse_fom_chunks().
     *
     * @param string $pattern
     * @param bool   $only_one
     * @param bool   $restart
     * @return PfPageparser
     */
    public function parse_from_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
    {
        return $this->parse_fom_chunks($pattern, $only_one, $restart);
    }

    /**
     * @return array the current chunks
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing true to get the chunks even when parsed results exist
     * @return array the parsed results, or the chunks when nothing was parsed (or $before_parsing is set)
     */
    public function results(bool $before_parsing = false): array
    {
        if ($before_parsing || empty($this->parsed)) {
            return $this->chunks;
        }

        return $this->parsed;
    }

    /**
     * Forward a message to the logger, if one was supplied.
     *
     * @param string $text
     * @param string $level PSR-3 log level name
     */
    private function log(string $text, string $level = 'info')
    {
        if ($this->logger) {
            $this->logger->log($level, $text);
        }
    }
}
277