Completed
Push — master ( ebd934...e41d4d )
by Peter
01:18
created

PfPageparser::split_chunks()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 21
rs 9.584
c 0
b 0
f 0
cc 4
nc 3
nop 2
1
<?php
2
3
namespace Pforret\PfPageparser;
4
5
6
/**
 * Small fluent helper for scraping: load a page (URL, file or string),
 * optionally trim it, split it into chunks, filter those chunks and
 * extract values from them with regular expressions.
 *
 * All "step" methods return $this so calls can be chained.
 */
class PfPageparser
{
    /** @var array merged configuration (defaults overridden by constructor argument) */
    private $config;
    /** @var string raw content currently loaded */
    private $content="";
    /** @var array chunks produced by split_chunks(); keys are preserved by filter_chunks() */
    private $chunks=[];
    /** @var array list of capture lists produced by parse_fom_chunks() */
    private $parsed=[];

    /**
     * @param array $config overrides for the defaults 'userAgent', 'cacheTtl' and 'timeOut'
     */
    public function __construct(array $config=[])
    {
        $defaults=[
            'userAgent' =>  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'cacheTtl'  =>  3600,
            'timeOut'   =>  10,
        ];

        $this->config=array_merge($defaults,$config);
    }

    /**
     * @return array the effective configuration
     */
    public function get_config(): array
    {
        return $this->config;
    }

    /* ------------------------------------------
     * LOADING THE CONTENT FROM A URL/FILE/STRING
     */

    /**
     * Fetch a URL with cURL and use the response body as content.
     *
     * On any cURL failure the content becomes an empty string (the original
     * stored `false`, which raw() then coerced to "" anyway).
     *
     * @param string $url
     * @param array  $options per-call overrides of the configuration ('userAgent', 'timeOut')
     * @return $this
     */
    public function load_from_url(string $url,array $options=[]): PfPageparser
    {
        // TODO: load with guzzle & caching
        $options=array_merge($this->config,$options);

        $ch = curl_init();
        if($ch === false){
            $this->content="";
            return $this;
        }
        curl_setopt ($ch, CURLOPT_URL, $url);
        curl_setopt ($ch, CURLOPT_USERAGENT, $options['userAgent']);
        curl_setopt ($ch, CURLOPT_HEADER, 0);
        curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt ($ch, CURLOPT_TIMEOUT,$options['timeOut']);
        curl_setopt ($ch, CURLOPT_CONNECTTIMEOUT,$options['timeOut']);
        $body = curl_exec ($ch);
        curl_close ($ch);

        // curl_exec() returns false on failure; never let that leak into $content
        $this->content = is_string($body) ? $body : "";
        return $this;
    }

    /**
     * Load the content from a local file.
     *
     * If the path is not a readable regular file, or reading fails, the
     * current content is left untouched.
     *
     * @param string $filename
     * @return $this
     */
    public function load_from_file(string $filename): PfPageparser
    {
        if(is_file($filename) && is_readable($filename)){
            $data=file_get_contents($filename);
            if($data !== false){
                $this->content=$data;
            }
        }
        return $this;
    }

    /**
     * Use the given (HTML) string as content.
     *
     * The method name contains a historical typo; it is kept for backward
     * compatibility. Prefer load_from_string().
     *
     * @param string $string
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        $this->content=$string;
        return $this;
    }

    /**
     * Correctly spelled alias of load_fom_string().
     *
     * @param string $string
     * @return $this
     */
    public function load_from_string(string $string): PfPageparser
    {
        return $this->load_fom_string($string);
    }

    /* ------------------------------------------
    * GET RAW CONTENT BACK
    */

    /**
     * Alias of raw(), kept for backward compatibility.
     *
     * @return string
     */
    public function get_content():string
    {
        return $this->raw();
    }

    /**
     * @return string the content as currently loaded/trimmed
     */
    public function raw(): string
    {
        return $this->content;
    }

    /* ------------------------------------------
    * MODIFY THE RAW CONTENT
    */

    /**
     * Drop everything before the first occurrence of $pattern.
     * The matched text itself is kept. No match: content is unchanged.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_before(string $pattern,bool $is_regex=false): PfPageparser
    {
        $offset=$this->find_offset($pattern,$is_regex);
        if($offset !== false && $offset > 0){
            $this->content=substr($this->content,$offset);
        }
        return $this;
    }

    /**
     * Drop the first occurrence of $pattern and everything after it.
     * No match: content is unchanged. A match at offset 0 empties the content.
     *
     * @param string $pattern  literal text, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_after(string $pattern,bool $is_regex=false): PfPageparser
    {
        $offset=$this->find_offset($pattern,$is_regex);
        // strict check: offset 0 is a valid match, only false means "not found"
        if($offset !== false){
            $this->content=substr($this->content,0,$offset);
        }
        return $this;
    }

    /**
     * Convenience wrapper: trim_before($before) followed by trim_after($after).
     *
     * @param string $before
     * @param string $after
     * @param bool   $is_regex applies to both patterns
     * @return $this
     */
    public function trim(string $before="<body",string $after="</body",bool $is_regex=false): PfPageparser
    {
        $this->trim_before($before,$is_regex);
        $this->trim_after($after,$is_regex);
        return $this;
    }

    /* ------------------------------------------
    * RAW CONTENT => CHUNKS
    */

    /**
     * Split the content into chunks on a literal or regex separator.
     *
     * Both modes behave like explode(): separators are removed, empty chunks
     * are kept, and content without any separator yields a single chunk.
     * If the regex is invalid, the whole content becomes the only chunk.
     *
     * @param string $pattern  literal separator, or a PCRE pattern when $is_regex is true
     * @param bool   $is_regex
     * @return $this
     */
    public function split_chunks(string $pattern,bool $is_regex=false): PfPageparser
    {
        if(!$is_regex){
            $this->chunks=explode($pattern,$this->content);
            return $this;
        }
        $parts=preg_split($pattern,$this->content);
        $this->chunks = ($parts === false) ? [$this->content] : $parts;
        return $this;
    }

    /**
     * Keep only the chunks that match the given patterns; array keys of the
     * surviving chunks are preserved.
     *
     * @param array|string $pattern_keep   patterns of which at least one must be found (OR); empty = keep all
     * @param array|string $pattern_remove patterns of which none may be found (OR); empty = remove none
     * @param bool         $is_regex       whether patterns are PCRE patterns or literal strings
     * @return $this
     */
    public function filter_chunks($pattern_keep=[],$pattern_remove=[],bool $is_regex=false): PfPageparser
    {
        if(empty($this->chunks)){
            // not split in chunks yet: nothing to filter
            return $this;
        }
        $keep=$this->to_pattern_list($pattern_keep);
        $remove=$this->to_pattern_list($pattern_remove);

        foreach($this->chunks as $id => $chunk){
            $wanted=empty($keep) || $this->chunk_matches_any($chunk,$keep,$is_regex);
            $unwanted=!empty($remove) && $this->chunk_matches_any($chunk,$remove,$is_regex);
            if(!$wanted || $unwanted){
                unset($this->chunks[$id]);
            }
        }
        return $this;
    }

    /**
     * Extract values from the chunks with a regex; every chunk that matches
     * contributes one list holding capture group 1 of each match (or the
     * whole match when the pattern has no capture group).
     *
     * - First call, or $restart = true: the chunks are parsed and the
     *   resulting lists are appended to the parsed results.
     * - Otherwise: the pattern is applied to the strings of the previous
     *   results, and the parsed results are replaced by that refinement.
     *   NOTE(review): the original code could not handle this path (it fed
     *   arrays to preg_match_all); refine-and-replace is the assumed intent.
     *
     * The method name contains a historical typo; prefer parse_from_chunks().
     *
     * @param string $pattern PCRE pattern
     * @param bool   $restart parse from the chunks again instead of from earlier results
     * @return $this
     */
    public function parse_fom_chunks(string $pattern,bool $restart=false): PfPageparser
    {
        if(empty($this->chunks)){
            return $this;
        }
        $refine=(!$restart && !empty($this->parsed));
        $items=$refine ? $this->parsed : $this->chunks;

        $results=[];
        foreach($items as $item){
            $captures=[];
            // a chunk is one string, an earlier result is a list of strings
            foreach((array)$item as $subject){
                $matches=[];
                if(preg_match_all($pattern,(string)$subject,$matches,PREG_SET_ORDER)){
                    foreach($matches as $match){
                        $captures[]=$match[1] ?? $match[0];
                    }
                }
            }
            if(!empty($captures)){
                $results[]=$captures;
            }
        }
        $this->parsed=$refine ? $results : array_merge($this->parsed,$results);
        return $this;
    }

    /**
     * Correctly spelled alias of parse_fom_chunks().
     *
     * @param string $pattern
     * @param bool   $restart
     * @return $this
     */
    public function parse_from_chunks(string $pattern,bool $restart=false): PfPageparser
    {
        return $this->parse_fom_chunks($pattern,$restart);
    }

    /**
     * @return array the current chunks (keys may have gaps after filter_chunks())
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing force returning the chunks even when parsed results exist
     * @return array the parsed results, or the chunks when nothing was parsed yet
     */
    public function results(bool $before_parsing=false): array
    {
        if($before_parsing or empty($this->parsed)){
            return $this->chunks;
        }
        return $this->parsed;
    }

    /**
     * Return the first full match of $pattern in $haystack.
     *
     * @param string $pattern  PCRE pattern
     * @param string $haystack
     * @return string the matched text, or "" when there is no match
     */
    public function preg_get($pattern,$haystack){
        $matches=[];
        if(preg_match($pattern,$haystack,$matches)){
            return $matches[0];
        }
        return "";
    }

    /* ------------------------------------------
     * PRIVATE HELPERS
     */

    /**
     * Byte offset of the first occurrence of $pattern in the content.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return int|false offset, or false when not found or the pattern is empty
     */
    private function find_offset(string $pattern,bool $is_regex)
    {
        if($pattern === ""){
            return false;
        }
        if(!$is_regex){
            return strpos($this->content,$pattern);
        }
        $matches=[];
        // preg_match() itself only returns 0/1; the offset comes from PREG_OFFSET_CAPTURE
        if(preg_match($pattern,$this->content,$matches,PREG_OFFSET_CAPTURE) === 1){
            return $matches[0][1];
        }
        return false;
    }

    /**
     * Normalise a "pattern or list of patterns" argument to a list.
     *
     * @param mixed $patterns
     * @return array
     */
    private function to_pattern_list($patterns): array
    {
        if(is_array($patterns)){
            return $patterns;
        }
        if($patterns === null || $patterns === false || $patterns === ""){
            return [];
        }
        return [$patterns];
    }

    /**
     * Whether at least one of the patterns occurs in the chunk.
     *
     * @param string $chunk
     * @param array  $patterns
     * @param bool   $is_regex
     * @return bool
     */
    private function chunk_matches_any(string $chunk,array $patterns,bool $is_regex): bool
    {
        foreach($patterns as $pattern){
            $pattern=(string)$pattern;
            if($is_regex){
                $found=(preg_match($pattern,$chunk) === 1);
            } else {
                // strpos !== false instead of strstr(): a matched tail of "0" is still a match
                $found=($pattern !== "" && strpos($chunk,$pattern) !== false);
            }
            if($found){
                return true;
            }
        }
        return false;
    }
}
263