Completed
Push — master ( e41d4d...8e258f )
by Peter
01:53
created

PfPageparser::log()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 2
nc 2
nop 2
1
<?php
2
3
namespace Pforret\PfPageparser;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Exception\GuzzleException;
7
use Psr\Log;
8
use Psr\Log\AbstractLogger;
9
10
/**
 * Small fluent helper to load a page (URL, file or string), cut it down to the
 * interesting part, split it into chunks and extract values with regexes.
 *
 * Typical use: load_from_*() -> trim() -> split_chunks() -> filter_chunks()
 * -> parse_fom_chunks() -> results().
 */
class PfPageparser
{
    /** @var array merged configuration (defaults overridden by the constructor argument) */
    private $config;

    /** @var string raw content currently being worked on */
    private $content = "";

    /** @var array pieces of the content, as produced by split_chunks() */
    private $chunks = [];

    /** @var array values extracted by parse_fom_chunks() */
    private $parsed = [];

    /** @var Log\LoggerInterface|null optional PSR-3 logger */
    private $logger = null;

    /**
     * @param array                    $config overrides for the default settings
     *                                         (userAgent, cacheTtl, timeOut, method)
     * @param Log\LoggerInterface|null $logger optional PSR-3 logger; any implementation is
     *                                         accepted, not only AbstractLogger subclasses
     */
    public function __construct(array $config = [], Log\LoggerInterface $logger = null)
    {
        $defaults = [
            'userAgent' =>  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'cacheTtl'  =>  3600,
            'timeOut'   =>  10,
            'method'    =>  'GET',
        ];

        if ($logger) {
            $this->logger = $logger;
        }
        $this->config = array_merge($defaults, $config);
    }

    /**
     * @return array the effective configuration
     */
    public function get_config()
    {
        return $this->config;
    }

    /* ------------------------------------------
     * LOADING THE CONTENT FROM A URL/FILE/STRING
     */

    /**
     * Fetch a URL and use the response body as content.
     *
     * When the request fails the error is logged and the current content is
     * left untouched, so the fluent chain keeps working.
     *
     * @param string $url     address to fetch
     * @param array  $options per-call overrides of the configuration (only 'method' is used for now)
     * @return PfPageparser
     */
    public function load_from_url(string $url, array $options = []): PfPageparser
    {
        // TODO: load with caching
        $options = array_merge($this->config, $options);
        $client = new Client();
        try {
            $res = $client->request($options['method'], $url);
        } catch (GuzzleException $e) {
            $this->log("Could not load [$url]: " . $e->getMessage(), Log\LogLevel::ERROR);
            // there is no response to read the body from
            return $this;
        }
        // getBody() returns a stream object; the content property holds a plain string
        $this->content = (string) $res->getBody();

        return $this;
    }

    /**
     * Read a local file and use its contents as content.
     *
     * A missing or unreadable file is logged and leaves the current content untouched.
     *
     * @param string $filename path of the file to read
     * @return PfPageparser
     */
    public function load_from_file(string $filename): PfPageparser
    {
        if (!is_file($filename) || !is_readable($filename)) {
            $this->log("Cannot read file [$filename]", Log\LogLevel::WARNING);
            return $this;
        }
        $data = file_get_contents($filename);
        if ($data === false) {
            // never store false in the string property
            $this->log("Reading file [$filename] failed", Log\LogLevel::WARNING);
            return $this;
        }
        $this->content = $data;
        return $this;
    }

    /**
     * Use the given (HTML) string as content.
     *
     * @param string $string
     * @return PfPageparser
     */
    public function load_fom_string(string $string): PfPageparser
    {
        $this->content = $string;
        return $this;
    }

    /* ------------------------------------------
    * GET RAW CONTENT BACK
    */

    /**
     * Alias of raw(), kept for backward compatibility.
     *
     * @return string
     */
    public function get_content(): string
    {
        return $this->raw();
    }

    /**
     * @return string the content as it currently is (after any trimming)
     */
    public function raw(): string
    {
        return $this->content;
    }

    /* ------------------------------------------
    * MODIFY THE RAW CONTENT
    */

    /**
     * Drop everything that precedes the first occurrence of $pattern.
     * The matched text itself is kept. Nothing happens when there is no match.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return PfPageparser
     */
    public function trim_before(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        if ($offset !== false) {
            // the cast guards against substr() returning false on older PHP versions
            $this->content = (string) substr($this->content, $offset);
        }
        return $this;
    }

    /**
     * Drop the first occurrence of $pattern and everything that follows it.
     * Nothing happens when there is no match.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return PfPageparser
     */
    public function trim_after(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        // offset 0 is a valid match: the content then becomes empty
        if ($offset !== false) {
            $this->content = (string) substr($this->content, 0, $offset);
        }
        return $this;
    }

    /**
     * Keep only the part starting at $before and ending just in front of $after.
     *
     * @param string $before
     * @param string $after
     * @param bool   $is_regex whether both patterns are PCRE patterns
     * @return PfPageparser
     */
    public function trim(string $before = "<body", string $after = "</body", bool $is_regex = false): PfPageparser
    {
        $this->trim_before($before, $is_regex);
        $this->trim_after($after, $is_regex);
        return $this;
    }

    /* ------------------------------------------
    * RAW CONTENT => CHUNKS
    */

    /**
     * Split the content into chunks on a literal or regex separator.
     * The separators themselves are not part of the chunks.
     *
     * @param string $pattern  literal separator, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return PfPageparser
     */
    public function split_chunks(string $pattern, bool $is_regex = false): PfPageparser
    {
        if (!$is_regex) {
            $this->chunks = explode($pattern, $this->content);
            return $this;
        }
        // preg_split is the regex counterpart of explode: it keeps the text after
        // the last separator and returns the whole content when nothing matches
        $parts = preg_split($pattern, $this->content);
        if ($parts === false) {
            $this->log("Invalid split pattern [$pattern]", Log\LogLevel::WARNING);
            $parts = [$this->content];
        }
        $this->chunks = $parts;
        return $this;
    }

    /**
     * Remove the chunks that do not qualify.
     *
     * A chunk is kept when it matches at least one of $pattern_keep (if given)
     * and none of $pattern_remove (if given). Array keys of the remaining
     * chunks are preserved.
     *
     * @param array $pattern_keep
     * @param array $pattern_remove
     * @param bool  $is_regex whether the patterns are PCRE patterns
     * @return PfPageparser
     */
    public function filter_chunks(array $pattern_keep = [], array $pattern_remove = [], bool $is_regex = false): PfPageparser
    {
        if (empty($this->chunks)) {
            // not split in chunks yet: nothing to filter
            return $this;
        }
        foreach ($this->chunks as $id => $chunk) {
            $keep_chunk = true;
            if (!empty($pattern_keep) && !$this->matches_any($chunk, $pattern_keep, $is_regex)) {
                $keep_chunk = false;
            }
            if ($keep_chunk && !empty($pattern_remove) && $this->matches_any($chunk, $pattern_remove, $is_regex)) {
                $keep_chunk = false;
            }
            if (!$keep_chunk) {
                unset($this->chunks[$id]);
            }
        }
        return $this;
    }

    /**
     * Extract values from the chunks (or from earlier results) with a regex.
     *
     * For every item that matches, one entry is appended to the parsed results:
     * the list of captured values, or a single value when $only_one is true
     * (the last match wins). The captured value is group 1, or the full match
     * when the pattern has no capture group.
     *
     * NOTE(review): new entries are appended to whatever is already in the parsed
     * results, also when $restart is true — confirm whether a reset is intended.
     *
     * @param string $pattern  PCRE pattern
     * @param bool   $only_one store one value per item instead of a list
     * @param bool   $restart  read from the chunks even if parsed results already exist
     * @return PfPageparser
     */
    public function parse_fom_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
    {
        if (empty($this->chunks)) {
            return $this;
        }
        // work on a snapshot, because $this->parsed grows inside the loop
        $items = ($restart || empty($this->parsed)) ? $this->chunks : $this->parsed;
        foreach ($items as $item) {
            if (!is_string($item)) {
                // earlier results can be lists; those cannot be matched against
                continue;
            }
            $matches = [];
            if (!preg_match_all($pattern, $item, $matches, PREG_SET_ORDER)) {
                continue;
            }
            $chunk_results = [];
            foreach ($matches as $match) {
                $value = $match[1] ?? $match[0];
                if ($only_one) {
                    $chunk_results = $value;
                } else {
                    $chunk_results[] = $value;
                }
            }
            $this->parsed[] = $chunk_results;
        }
        return $this;
    }

    /**
     * @return array the chunks as they currently are
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing force returning the chunks instead of the parsed results
     * @return array the parsed results, or the chunks when nothing was parsed yet
     */
    public function results(bool $before_parsing = false): array
    {
        if ($before_parsing || empty($this->parsed)) {
            return $this->chunks;
        }
        return $this->parsed;
    }

    /**
     * Find the byte offset of the first occurrence of $pattern in the content.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return int|false offset of the match, false when there is none
     */
    private function find_offset(string $pattern, bool $is_regex)
    {
        if (!$is_regex) {
            return strpos($this->content, $pattern);
        }
        $matches = [];
        // preg_match only tells whether there is a match; the offset comes from the capture
        if (preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE) === 1) {
            return $matches[0][1];
        }
        return false;
    }

    /**
     * Tell whether $chunk matches at least one of the given patterns.
     *
     * @param string $chunk
     * @param array  $patterns literal texts, or PCRE patterns when $is_regex is true
     * @param bool   $is_regex
     * @return bool
     */
    private function matches_any(string $chunk, array $patterns, bool $is_regex): bool
    {
        foreach ($patterns as $pattern) {
            if ($is_regex) {
                if (preg_match($pattern, $chunk) === 1) {
                    return true;
                }
            } elseif (strpos($chunk, $pattern) !== false) {
                // strpos instead of strstr: a found tail of "0" must still count as a match
                return true;
            }
        }
        return false;
    }

    /**
     * Send a message to the logger, if one was given to the constructor.
     *
     * @param string     $text  message
     * @param string|int $level PSR-3 level (Log\LogLevel::*), or whatever the logger accepts
     */
    private function log(string $text, $level)
    {
        if ($this->logger) {
            $this->logger->log($level, $text);
        }
    }

}
256