Completed
Push — master ( aaef62...ebd934 )
by Peter
01:48
created

PfPageparser::preg_get()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 8
rs 10
c 0
b 0
f 0
cc 2
nc 2
nop 2
1
<?php
2
3
namespace Pforret\PfPageparser;
4
5
6
/**
 * Loads an HTML page (from a URL, a file or a string), trims it, splits it
 * into chunks and extracts data from those chunks with regular expressions.
 *
 * Most methods return $this so calls can be chained; get_content(),
 * get_chunks() and results() read back the current state.
 */
class PfPageparser
{
    /** @var array effective settings: built-in defaults overridden by the constructor argument */
    private $config;
    /** @var string the page content currently being worked on */
    private $content="";
    /** @var array pieces of $content produced by split_chunks(); keys can be sparse after filter_chunks() */
    private $chunks=[];
    /** @var array one list of extracted strings per item that matched in parse_fom_chunks() */
    private $parsed=[];

    /**
     * @param array $config overrides for the defaults below:
     *                      'userAgent' - User-Agent header sent by load_from_url()
     *                      'cacheTtl'  - stored only; nothing in this class reads it yet
     *                      'timeOut'   - passed to CURLOPT_TIMEOUT and CURLOPT_CONNECTTIMEOUT
     */
    public function __construct($config=[]){
        $defaults=[
            'userAgent' =>  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'cacheTtl'  =>  3600,
            'timeOut'   =>  10,
        ];

        $this->config=array_merge($defaults,$config);
    }

    /**
     * @return array the merged configuration built by the constructor
     */
    public function get_config(){
        return $this->config;
    }

    /**
     * Fetch a URL with cURL (following redirects) and use the response body as content.
     *
     * If the request fails the content becomes an empty string, so that the
     * string-based methods of this class keep working on a string.
     *
     * @param string $url
     * @param array $options per-call overrides for the constructor configuration
     * @return $this
     */
    public function load_from_url(string $url,array $options=[]): PfPageparser
    {
        // TODO: load with guzzle & caching
        $options=array_merge($this->config,$options);

        $ch = curl_init();
        if($ch === false){
            $this->content="";
            return $this;
        }
        curl_setopt ($ch, CURLOPT_URL, $url);
        curl_setopt ($ch, CURLOPT_USERAGENT, $options['userAgent']);
        curl_setopt ($ch, CURLOPT_HEADER, 0);
        curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt ($ch, CURLOPT_TIMEOUT,$options['timeOut']);
        curl_setopt ($ch, CURLOPT_CONNECTTIMEOUT,$options['timeOut']);
        $response = curl_exec ($ch);
        curl_close ($ch);
        // curl_exec() returns false on failure; never store a boolean as content
        $this->content = is_string($response) ? $response : "";
        return $this;
    }

    /**
     * Use the contents of a local file as content.
     *
     * The current content is left untouched when the path is not a readable
     * regular file or when reading it fails.
     *
     * @param string $filename
     * @return $this
     */
    public function load_from_file(string $filename): PfPageparser
    {
        if(is_file($filename) && is_readable($filename)){
            $data=file_get_contents($filename);
            if($data !== false){
                $this->content=$data;
            }
        }
        return $this;
    }

    /**
     * Use the given HTML string as content.
     * (The method name is spelled "fom"; it is kept as-is because it is public API.)
     *
     * @param string $string
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        $this->content=$string;
        return $this;
    }

    /**
     * @return string the current (possibly trimmed) content
     */
    public function get_content(){
        return $this->content;
    }

    /**
     * @return string same value as get_content()
     */
    public function raw(){
        return $this->content;
    }

    /**
     * Drop everything that precedes the first occurrence of $pattern.
     * The matched text itself is kept. Nothing changes when there is no match.
     *
     * @param string $pattern literal text, or a PCRE pattern when $is_regex is true
     * @param bool $is_regex
     * @return $this
     */
    public function trim_before($pattern,$is_regex=false): PfPageparser
    {
        $hit=$this->locate($pattern,$is_regex);
        if($hit !== null){
            $this->content=substr($this->content,$hit[0]);
        }
        return $this;
    }

    /**
     * Drop everything that follows the first occurrence of $pattern.
     * The matched text itself is kept. Nothing changes when there is no match.
     *
     * @param string $pattern literal text, or a PCRE pattern when $is_regex is true
     * @param bool $is_regex
     * @return $this
     */
    public function trim_after($pattern,$is_regex=false): PfPageparser
    {
        $hit=$this->locate($pattern,$is_regex);
        if($hit !== null){
            // keep up to and including the matched text
            $this->content=substr($this->content,0,$hit[0] + $hit[1]);
        }
        return $this;
    }

    /**
     * Apply trim_before($before) followed by trim_after($after).
     *
     * @param string $before
     * @param string $after
     * @param bool $is_regex
     * @return $this
     */
    public function trim($before="<body",$after="</body",$is_regex=false): PfPageparser
    {
        $this->trim_before($before,$is_regex);
        $this->trim_after($after,$is_regex);
        return $this;
    }

    /**
     * Split the content into chunks on a literal or regex separator.
     * The separators themselves are not part of the chunks. When the separator
     * is empty, never occurs, or the regex fails, the whole content becomes the
     * single chunk.
     *
     * @param string $pattern literal separator, or a PCRE pattern when $is_regex is true
     * @param bool $is_regex
     * @return $this
     */
    public function split_chunks($pattern,$is_regex=false): PfPageparser
    {
        $content=(string)$this->content;
        if(!$is_regex){
            // explode() rejects an empty separator, so treat it as "no split"
            $this->chunks=((string)$pattern === '') ? [$content] : explode($pattern,$content);
            return $this;
        }
        $pieces=preg_split($pattern,$content);
        // preg_split() returns false when the pattern cannot be used
        $this->chunks=($pieces === false) ? [$content] : $pieces;
        return $this;
    }

    /**
     * Remove chunks that do not qualify. Surviving chunks keep their original keys.
     *
     * @param array|string $pattern_keep   - patterns of which at least one must be found (ignored when empty)
     * @param array|string $pattern_remove - patterns of which none may be found (ignored when empty)
     * @param bool $is_regex               - whether patterns are regex or just strings
     * @return $this
     */
    public function filter_chunks($pattern_keep=[],$pattern_remove=[],bool $is_regex=false): PfPageparser
    {
        if(empty($this->chunks)){
            // not split in chunks yet: nothing to filter
            return $this;
        }
        $keep=$this->as_pattern_list($pattern_keep);
        $remove=$this->as_pattern_list($pattern_remove);

        foreach($this->chunks as $id => $chunk){
            $wanted=empty($keep) || $this->matches_any($chunk,$keep,$is_regex);
            $unwanted=!empty($remove) && $this->matches_any($chunk,$remove,$is_regex);
            if(!$wanted || $unwanted){
                unset($this->chunks[$id]);
            }
        }
        return $this;
    }

    /**
     * Run $pattern over every item and append, per item that matches, the list
     * of extracted strings to the parsed results. The extracted string is the
     * first capture group, or the whole match when that group is absent.
     *
     * Items are the chunks on the first call or when $restart is true, and the
     * previously parsed results otherwise. Existing parsed results are never
     * cleared, new ones are appended.
     * (The method name is spelled "fom"; it is kept as-is because it is public API.)
     *
     * NOTE(review): parsed results are lists, not strings, so a follow-up call
     * without $restart currently finds nothing to match; such items are skipped
     * rather than passed to preg_match_all(). Confirm what a second pass should do.
     *
     * @param string $pattern PCRE pattern
     * @param bool $restart parse the chunks again instead of the parsed results
     * @return array the parsed results accumulated so far (same as results() once non-empty)
     */
    public function parse_fom_chunks(string $pattern,bool $restart=false): array
    {
        if(empty($this->chunks)){
            return $this->parsed;
        }
        // work on a snapshot: results appended below must not be re-scanned
        $items=($restart || empty($this->parsed)) ? $this->chunks : $this->parsed;
        foreach($items as $item){
            if(!is_string($item)){
                continue;
            }
            $matches=[];
            if(preg_match_all($pattern,$item,$matches,PREG_SET_ORDER)){
                $chunk_results=[];
                foreach($matches as $match){
                    $chunk_results[]=$match[1] ?? $match[0];
                }
                $this->parsed[]=$chunk_results;
            }
        }
        return $this->parsed;
    }

    /**
     * @return array the chunks as left by split_chunks() / filter_chunks()
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing force returning the chunks even when parsed results exist
     * @return array the parsed results, or the chunks when nothing was parsed yet
     */
    public function results(bool $before_parsing=false): array
    {
        if($before_parsing or empty($this->parsed)){
            return $this->chunks;
        }
        return $this->parsed;
    }

    /**
     * @param string $pattern PCRE pattern
     * @param string $haystack
     * @return string the text of the first match, or "" when there is none
     */
    public function preg_get($pattern,$haystack){
        $matches=[];
        if(preg_match($pattern,$haystack,$matches)){
            return $matches[0];
        }
        return "";
    }

    /**
     * INTERNAL HELPERS
     */

    /**
     * Find the first occurrence of $pattern in the content.
     *
     * @param string $pattern literal text, or a PCRE pattern when $is_regex is true
     * @param bool $is_regex
     * @return array|null [byte offset of the match, byte length of the match], or null when not found
     */
    private function locate($pattern,$is_regex)
    {
        $content=(string)$this->content;
        if($is_regex){
            $matches=[];
            // preg_match() only reports 1/0; the position comes from PREG_OFFSET_CAPTURE
            if(preg_match($pattern,$content,$matches,PREG_OFFSET_CAPTURE) === 1){
                return [$matches[0][1],strlen($matches[0][0])];
            }
            return null;
        }
        $needle=(string)$pattern;
        if($needle === ''){
            return null;
        }
        $start=strpos($content,$needle);
        // offset 0 is a valid hit, so compare against false explicitly
        return ($start === false) ? null : [$start,strlen($needle)];
    }

    /**
     * Normalise a filter argument to a list of patterns.
     *
     * @param mixed $patterns an array of patterns, a single pattern, or null/false/"" for "none"
     * @return array
     */
    private function as_pattern_list($patterns): array
    {
        if(is_array($patterns)){
            return $patterns;
        }
        if($patterns === null || $patterns === false || $patterns === ''){
            return [];
        }
        return [$patterns];
    }

    /**
     * Tell whether at least one of the patterns occurs in $chunk.
     *
     * @param string $chunk
     * @param array $patterns
     * @param bool $is_regex
     * @return bool
     */
    private function matches_any($chunk,array $patterns,bool $is_regex): bool
    {
        foreach($patterns as $pattern){
            if($is_regex){
                if(preg_match($pattern,$chunk)){
                    return true;
                }
            } elseif(strpos($chunk,$pattern) !== false){
                // strpos() instead of strstr(): a matched tail of "0" must still count as found
                return true;
            }
        }
        return false;
    }

}
271