Completed
Push — master ( 55c8be...946567 )
by Peter
02:35
created

PfPageparser::cleanup_html()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 3
nc 4
nop 2
1
<?php
2
3
namespace Pforret\PfPageparser;
4
5
use GuzzleHttp\Client;
6
use GuzzleHttp\Exception\GuzzleException;
7
use Psr\Log\LoggerInterface;
8
9
/**
 * Small fluent helper to load a page (URL, file or string), trim it,
 * split it into chunks, filter those chunks and extract values from them
 * with regular expressions.
 *
 * Typical flow: load_from_*() -> trim() -> split_chunks() -> filter_chunks()
 * -> parse_fom_chunks() -> results().
 */
class PfPageparser
{
    /** @var array Effective configuration: built-in defaults overlaid with the constructor's $config. */
    private $config;

    /** @var string Raw content as loaded (and possibly trimmed / cleaned up since). */
    private $content = "";

    /** @var array Pieces of the content, filled by split_chunks() or seeded from the whole content. */
    private $chunks = [];

    /** @var array Values extracted from the chunks by parse_fom_chunks(). */
    private $parsed = [];

    /** @var LoggerInterface|null Optional logger; when null, log() is a no-op. */
    private $logger = null;

    /**
     * @param array                $config Overrides for the defaults below (shallow array_merge, so a
     *                                     'headers' override replaces the whole default headers array).
     * @param LoggerInterface|null $logger Optional PSR-3 logger that receives load errors.
     */
    public function __construct(array $config = [], ?LoggerInterface $logger = null)
    {
        $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36';
        $defaults = [
            // NOTE: 'userAgent' and 'cacheTtl' are stored but not read anywhere in this class;
            // the User-Agent actually sent by load_from_url() comes from 'headers'.
            'userAgent' => $userAgent,
            'cacheTtl'  => 3600,
            'timeout'   => 5,      // Guzzle timeout
            'method'    => 'GET',
            'headers'   => [
                'User-Agent' => $userAgent,
            ],
        ];

        $this->logger = $logger;
        $this->config = array_merge($defaults, $config);
    }

    /**
     * @return array The effective configuration (defaults merged with constructor overrides).
     */
    public function get_config()
    {
        return $this->config;
    }

    /* ------------------------------------------
     * LOADING THE CONTENT FROM A URL/FILE/STRING
     */

    /**
     * Reset content, chunks and parsed results before a new load.
     */
    private function initialize()
    {
        $this->content = "";
        $this->chunks = [];
        $this->parsed = [];
    }

    /**
     * Fetch a URL with Guzzle and keep the response body as content.
     *
     * On a Guzzle error the message is logged at 'error' level and the content stays empty.
     *
     * @param string $url
     * @param array  $options Per-call overrides of the configuration ('method', 'headers', 'timeout').
     * @return $this
     */
    public function load_from_url(string $url, array $options = []): PfPageparser
    {
        // TODO: load with caching
        $this->initialize();
        $options = array_merge($this->config, $options);
        $client = new Client([
            'headers' => $options['headers'],
            // the configured timeout was previously never handed to Guzzle
            'timeout' => $options['timeout'],
        ]);
        try {
            $res = $client->request($options['method'], $url);
            $this->content = $res->getBody()->getContents();
        } catch (GuzzleException $error) {
            $message = $error->getMessage();
            $this->log($message, 'error');
        }

        return $this;
    }

    /**
     * Load the content from a file.
     *
     * If the file does not exist or cannot be read, an error is logged and the content stays empty.
     *
     * @param string $filename
     * @return $this
     */
    public function load_from_file(string $filename): PfPageparser
    {
        $this->initialize();
        if (!file_exists($filename)) {
            $this->log("File not found: $filename", 'error');

            return $this;
        }
        $data = file_get_contents($filename);
        if ($data === false) {
            // keep $content a string so raw()/get_content() honour their return type
            $this->log("File could not be read: $filename", 'error');

            return $this;
        }
        $this->content = $data;

        return $this;
    }

    /**
     * Use the given string as content.
     *
     * The method name carries a historical typo; it is kept for backward compatibility.
     * load_from_string() is the correctly spelled alias.
     *
     * @param string $string
     * @return $this
     */
    public function load_fom_string(string $string): PfPageparser
    {
        $this->initialize();
        $this->content = $string;

        return $this;
    }

    /**
     * Correctly spelled alias of load_fom_string().
     *
     * @param string $string
     * @return $this
     */
    public function load_from_string(string $string): PfPageparser
    {
        return $this->load_fom_string($string);
    }

    /**
     * Normalise whitespace in the content.
     *
     * @param bool $remove_linefeeds Replace every run of "\n" by a single space.
     * @param bool $shrink_spaces    Replace every run of two or more whitespace characters by a single space.
     * @return $this
     */
    public function cleanup_html($remove_linefeeds = true, $shrink_spaces = true): PfPageparser
    {
        // preg_replace() returns null on a PCRE error; keep the previous content in that case
        if ($remove_linefeeds) {
            $this->content = preg_replace("|\n+|", " ", $this->content) ?? $this->content;
        }
        if ($shrink_spaces) {
            $this->content = preg_replace("|\s\s+|", " ", $this->content) ?? $this->content;
        }

        return $this;
    }

    /* ------------------------------------------
     * GET THE RAW CONTENT
     */

    /**
     * @return string The current content (kept for backward compatibility; same as raw()).
     */
    public function get_content(): string
    {
        return $this->content;
    }

    /**
     * @return string The current content.
     */
    public function raw(): string
    {
        return $this->content;
    }

    /* ------------------------------------------
     * MODIFY THE RAW CONTENT
     */

    /**
     * Byte offset of the first occurrence of $pattern in the content.
     *
     * @param string $pattern  Literal text, or a PCRE pattern when $is_regex is true.
     * @param bool   $is_regex
     * @return int|false Offset of the match, or false when there is none (or the literal is empty).
     */
    private function find_offset(string $pattern, bool $is_regex)
    {
        if ($is_regex) {
            // preg_match() itself only returns 0/1; the position comes from PREG_OFFSET_CAPTURE
            if (preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE) === 1) {
                return $matches[0][1];
            }

            return false;
        }
        if ($pattern === '') {
            return false;
        }

        return strpos($this->content, $pattern);
    }

    /**
     * Drop everything before the first occurrence of $pattern; the match itself is kept.
     * The content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_before(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        if ($offset !== false) {
            $this->content = (string) substr($this->content, $offset);
        }

        return $this;
    }

    /**
     * Drop the first occurrence of $pattern and everything after it.
     * The content is left untouched when the pattern is not found.
     *
     * @param string $pattern
     * @param bool   $is_regex
     * @return $this
     */
    public function trim_after(string $pattern, bool $is_regex = false): PfPageparser
    {
        $offset = $this->find_offset($pattern, $is_regex);
        // strict check: a match at offset 0 is a real match and empties the content
        if ($offset !== false) {
            $this->content = (string) substr($this->content, 0, $offset);
        }

        return $this;
    }

    /**
     * Apply trim_before($before) followed by trim_after($after).
     *
     * @param string $before
     * @param string $after
     * @param bool   $is_regex Applies to both patterns.
     * @return $this
     */
    public function trim(string $before = "<body", string $after = "</body", bool $is_regex = false): PfPageparser
    {
        $this->trim_before($before, $is_regex);
        $this->trim_after($after, $is_regex);

        return $this;
    }

    /* ------------------------------------------
     * RAW CONTENT => CHUNKS
     */

    /**
     * Split the content into chunks on a text or regex separator.
     *
     * Both modes behave like explode(): separators are not part of the chunks, the text after the
     * last separator is a chunk too, and content without any separator yields one single chunk.
     *
     * @param string $pattern  Literal separator, or a PCRE pattern when $is_regex is true.
     * @param bool   $is_regex
     * @return $this
     */
    public function split_chunks(string $pattern, bool $is_regex = false): PfPageparser
    {
        if ($is_regex) {
            $pieces = preg_split($pattern, $this->content);
            if ($pieces === false) {
                $this->log("split_chunks: regex split failed for pattern $pattern", 'error');
                $pieces = [$this->content];
            }
            $this->chunks = $pieces;
        } elseif ($pattern === '') {
            // explode() rejects an empty separator; nothing to split on
            $this->chunks = [$this->content];
        } else {
            $this->chunks = explode($pattern, $this->content);
        }

        return $this;
    }

    /**
     * Make sure there is at least one chunk, seeding from the whole content when
     * split_chunks() was never called.
     *
     * @return bool False when there are no chunks and no content to work on.
     */
    private function ensure_chunks(): bool
    {
        if (!empty($this->chunks)) {
            return true;
        }
        if ($this->content === '') {
            return false;
        }
        $this->chunks = [$this->content];

        return true;
    }

    /**
     * Tell whether at least one of the patterns occurs in the chunk.
     *
     * @param string $chunk
     * @param array  $patterns Literal strings, or PCRE patterns when $is_regex is true.
     * @param bool   $is_regex
     * @return bool
     */
    private function matches_any(string $chunk, array $patterns, bool $is_regex): bool
    {
        foreach ($patterns as $pattern) {
            $found = $is_regex
                ? preg_match($pattern, $chunk) === 1
                : strpos($chunk, $pattern) !== false;
            if ($found) {
                return true;
            }
        }

        return false;
    }

    /**
     * Keep only the chunks that match at least one $pattern_keep entry (when given) and
     * none of the $pattern_remove entries (when given).
     *
     * Original array keys of the surviving chunks are preserved.
     *
     * @param array $pattern_keep
     * @param array $pattern_remove
     * @param bool  $is_regex
     * @return $this
     */
    public function filter_chunks(array $pattern_keep = [], array $pattern_remove = [], bool $is_regex = false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }
        foreach ($this->chunks as $id => $chunk) {
            $keep_chunk = empty($pattern_keep) || $this->matches_any($chunk, $pattern_keep, $is_regex);
            if ($keep_chunk && !empty($pattern_remove)) {
                $keep_chunk = !$this->matches_any($chunk, $pattern_remove, $is_regex);
            }
            if (!$keep_chunk) {
                unset($this->chunks[$id]);
            }
        }

        return $this;
    }

    /**
     * Extract values with a regex, one result entry per item that matches.
     *
     * The items are the chunks on the first call (or when $restart is true); otherwise they are the
     * results of the previous call, which are then replaced by the new results. Items without a match
     * (and non-string items, e.g. the arrays produced by a previous call with $only_one = false) are skipped.
     *
     * The extracted value is capture group 1, or the whole match when the pattern has no such group.
     *
     * The method name carries a historical typo; it is kept for backward compatibility.
     * parse_from_chunks() is the correctly spelled alias.
     *
     * @param string $pattern  PCRE pattern.
     * @param bool   $only_one When true each entry is a single string (the last match in the item);
     *                         when false each entry is the list of all matches in the item.
     * @param bool   $restart  Parse the chunks again instead of the previous results.
     * @return PfPageparser
     */
    public function parse_fom_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
    {
        if (!$this->ensure_chunks()) {
            return $this;
        }
        $items = ($restart || empty($this->parsed)) ? $this->chunks : $this->parsed;

        // collect into a fresh array so that the list being read is never the one being written
        $results = [];
        foreach ($items as $item) {
            if (!is_string($item)) {
                continue;
            }
            $matches = [];
            if (!preg_match_all($pattern, $item, $matches, PREG_SET_ORDER)) {
                continue;
            }
            $values = [];
            foreach ($matches as $match) {
                $values[] = $match[1] ?? $match[0];
            }
            $results[] = $only_one ? end($values) : $values;
        }
        $this->parsed = $results;

        return $this;
    }

    /**
     * Correctly spelled alias of parse_fom_chunks().
     *
     * @param string $pattern
     * @param bool   $only_one
     * @param bool   $restart
     * @return PfPageparser
     */
    public function parse_from_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
    {
        return $this->parse_fom_chunks($pattern, $only_one, $restart);
    }

    /**
     * @return array The current chunks.
     */
    public function get_chunks(): array
    {
        return $this->chunks;
    }

    /**
     * @param bool $before_parsing Force returning the chunks even when parsed results exist.
     * @return array The parsed results, or the chunks when nothing was parsed (or $before_parsing is true).
     */
    public function results(bool $before_parsing = false): array
    {
        if ($before_parsing || empty($this->parsed)) {
            return $this->chunks;
        }

        return $this->parsed;
    }

    /**
     * Forward a message to the logger, if one was injected.
     *
     * @param string $text
     * @param string $level PSR-3 log level name.
     */
    private function log(string $text, string $level = 'info')
    {
        if ($this->logger) {
            $this->logger->log($level, $text);
        }
    }
}
272