Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like PfPageparser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use PfPageparser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
10 | class PfPageparser |
||
11 | { |
||
12 | // Build your next great package. |
||
13 | private $config; |
||
14 | private $content=""; |
||
15 | private $chunks=[]; |
||
16 | private $parsed=[]; |
||
17 | private $logger=null; |
||
18 | |||
/**
 * Create a parser, layering caller-supplied settings over the built-in defaults.
 *
 * @param array               $config Overrides for the default settings
 *                                    (userAgent, cacheTtl, timeOut, method).
 * @param AbstractLogger|null $logger Optional logger; stored as-is, null means "no logging".
 */
public function __construct(array $config=[], ?AbstractLogger $logger=null)
{
    $defaults=[
        'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'cacheTtl' => 3600,
        'timeOut' => 10,
        'method' => 'GET',
    ];

    // Caller values come last in array_merge(), so they win over the defaults.
    $this->config=array_merge($defaults,$config);

    // The parameter is explicitly nullable (implicit nullability via "= null" is
    // deprecated since PHP 8.4); null simply keeps the property's initial value.
    $this->logger=$logger;
}
||
32 | |||
/**
 * Expose the configuration array currently held by this instance.
 *
 * @return array
 */
public function get_config()
{
    $current_config=$this->config;

    return $current_config;
}
||
36 | |||
37 | /* ------------------------------------------ |
||
38 | * LOADING THE CONTENT FROM A URL/FILE/STRING |
||
39 | */ |
||
40 | |||
/**
 * Fetch a URL over HTTP and store the response body as the raw content.
 *
 * @param string $url     Address to request.
 * @param array  $options Per-call overrides of the instance configuration
 *                        (method, timeOut, userAgent).
 * @return PfPageparser   This instance; the stored content is left untouched
 *                        when the request fails.
 */
public function load_from_url(string $url,array $options=[]): PfPageparser
{
    // TODO: load with caching
    $options=array_merge($this->config,$options);
    $client = new Client();

    // Forward the configured timeout and user agent; previously they were
    // merged into $options but never handed to the HTTP client.
    $request_options=[];
    if(isset($options['timeOut'])){
        $request_options['timeout']=$options['timeOut'];
    }
    if(isset($options['userAgent'])){
        $request_options['headers']=['User-Agent' => $options['userAgent']];
    }

    try {
        $res = $client->request($options['method'], $url, $request_options);
    } catch (GuzzleException $e) {
        // log() needs both a message and an int level; calling it bare threw
        // an ArgumentCountError.
        // NOTE(review): LOG_ERR is used as the level - confirm it matches the
        // level convention of the injected logger.
        $this->log("load_from_url: request to {$url} failed: ".$e->getMessage(), LOG_ERR);

        // Bail out here: $res was never assigned, so there is no body to read.
        return $this;
    }

    // getBody() yields a stream object; cast so the content is a real string.
    $this->content=(string)$res->getBody();

    return $this;
}
||
55 | |||
/**
 * Read a local file and store its contents as the raw content.
 *
 * @param string $filename Path of the file to read.
 * @return PfPageparser    This instance; the stored content is left untouched
 *                         when the path is not a readable regular file or the
 *                         read fails.
 */
public function load_from_file(string $filename): PfPageparser
{
    // file_exists() is also true for directories, which cannot be read as text.
    if(!is_file($filename) || !is_readable($filename)){
        return $this;
    }

    $loaded=file_get_contents($filename);

    // file_get_contents() signals failure with false; never store that as content.
    if($loaded!==false){
        $this->content=$loaded;
    }

    return $this;
}
||
65 | |||
/**
 * Use an in-memory HTML string as the raw content.
 *
 * @param string $string Markup to work on.
 * @return PfPageparser  This instance, for chaining.
 */
public function load_fom_string(string $string): PfPageparser
{
    $html=$string;
    $this->content=$html;

    return $this;
}
||
72 | |||
73 | /* ------------------------------------------ |
||
74 | * GET RAW CONTENT BACK |
||
75 | */ |
||
76 | |||
/**
 * Alias of raw(), kept for backward compatibility.
 *
 * @return string Whatever raw() returns.
 */
public function get_content():string
{
    $raw_content=$this->raw();

    return $raw_content;
}
||
85 | |||
/**
 * Hand back the raw content currently held by the parser.
 *
 * @return string
 */
public function raw(): string
{
    $current=$this->content;

    return $current;
}
||
93 | |||
94 | /* ------------------------------------------ |
||
95 | * MODIFY THE RAW CONTENT |
||
96 | */ |
||
97 | |||
/**
 * Drop everything that precedes the first occurrence of a pattern.
 *
 * The content is kept from the start of the match onwards. When the pattern
 * is not found the content is left unchanged.
 *
 * @param string $pattern  Literal text, or a PCRE pattern when $is_regex is true.
 * @param bool   $is_regex Treat $pattern as a regular expression.
 * @return PfPageparser    This instance, for chaining.
 */
public function trim_before(string $pattern,bool $is_regex=false): PfPageparser
{
    $offset=false;
    if($is_regex){
        // preg_match() returns 0/1, not a position: ask for the byte offset
        // of the match explicitly.
        if(preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE)===1){
            $offset=$matches[0][1];
        }
    } else {
        $offset=strpos($this->content, $pattern);
    }

    // Offset 0 means the content already starts at the match: nothing to cut.
    if($offset!==false && $offset>0){
        $this->content = substr($this->content, $offset);
    }

    return $this;
}
||
104 | |||
/**
 * Drop the first occurrence of a pattern and everything that follows it.
 *
 * The content is kept up to (not including) the start of the match. When the
 * pattern is not found, or a literal pattern is empty, the content is left
 * unchanged.
 *
 * @param string $pattern  Literal text, or a PCRE pattern when $is_regex is true.
 * @param bool   $is_regex Treat $pattern as a regular expression.
 * @return PfPageparser    This instance, for chaining.
 */
public function trim_after(string $pattern,bool $is_regex=false): PfPageparser
{
    $offset=false;
    if($is_regex){
        // preg_match() returns 0/1, not a position: ask for the byte offset
        // of the match explicitly.
        if(preg_match($pattern, $this->content, $matches, PREG_OFFSET_CAPTURE)===1){
            $offset=$matches[0][1];
        }
    } elseif($pattern!==''){
        // An empty literal needle would "match" at 0 and wipe the content.
        $offset=strpos($this->content, $pattern);
    }

    // Compare against false explicitly: a match at offset 0 is a real match.
    if($offset!==false){
        $this->content=substr($this->content,0,$offset);
    }

    return $this;
}
||
112 | |||
/**
 * Narrow the content to the region between two markers.
 *
 * Runs trim_before() with $before, then trim_after() with $after, passing the
 * same $is_regex flag to both.
 *
 * @param string $before   Marker handed to trim_before().
 * @param string $after    Marker handed to trim_after().
 * @param bool   $is_regex Treat both markers as regular expressions.
 * @return PfPageparser    This instance, for chaining.
 */
public function trim(string $before="<body",string $after="</body",bool $is_regex=false): PfPageparser
{
    foreach([['trim_before',$before],['trim_after',$after]] as $step){
        $this->{$step[0]}($step[1],$is_regex);
    }

    return $this;
}
||
119 | |||
120 | /* ------------------------------------------ |
||
121 | * RAW CONTENT => CHUNKS |
||
122 | */ |
||
123 | |||
/**
 * Split the raw content into chunks on a text or regex separator.
 *
 * The separators themselves are not part of the chunks. Both modes behave
 * alike: content without any separator yields a single chunk holding the
 * whole content, and the text after the last separator is the last chunk.
 *
 * @param string $pattern  Literal separator, or a PCRE pattern when $is_regex is true.
 * @param bool   $is_regex Treat $pattern as a regular expression.
 * @return PfPageparser    This instance, for chaining.
 */
public function split_chunks(string $pattern,bool $is_regex=false): PfPageparser
{
    // An empty separator cannot split anything (explode() rejects it and
    // PCRE rejects an empty expression): keep the content as one chunk.
    if($pattern===''){
        $this->chunks=[$this->content];
        return $this;
    }

    if(!$is_regex){
        $this->chunks=explode($pattern,$this->content);
        return $this;
    }

    // preg_split() is the regex counterpart of explode(): it keeps every
    // character of each piece and includes the trailing piece.
    $pieces=preg_split($pattern,$this->content);

    // false signals a PCRE failure (e.g. an invalid pattern); fall back to a
    // single chunk so later steps still have something to work on.
    $this->chunks=($pieces===false) ? [$this->content] : $pieces;

    return $this;
}
||
152 | |||
/**
 * Discard chunks that do not satisfy the keep/remove pattern lists.
 *
 * A chunk survives when it matches at least one pattern of $pattern_keep
 * (if that list is non-empty) and matches none of $pattern_remove (if that
 * list is non-empty). Removed chunks are unset, so the keys of the remaining
 * chunks are preserved and may have gaps.
 *
 * @param array $pattern_keep   Patterns of which at least one must occur in a chunk.
 * @param array $pattern_remove Patterns of which none may occur in a chunk.
 * @param bool  $is_regex       Treat every pattern as a regular expression.
 * @return $this
 */
public function filter_chunks(array $pattern_keep=[], array $pattern_remove=[], bool $is_regex=false): PfPageparser
{
    if(empty($this->chunks)){
        // not split in chunks yet: nothing to filter
        return $this;
    }

    foreach($this->chunks as $id => $chunk){
        $keep_chunk=true;

        if(!empty($pattern_keep) && !$this->chunk_matches_any($chunk,$pattern_keep,$is_regex)){
            $keep_chunk=false;
        }
        // Only test the remove list while the chunk is still a candidate.
        if($keep_chunk && !empty($pattern_remove) && $this->chunk_matches_any($chunk,$pattern_remove,$is_regex)){
            $keep_chunk=false;
        }

        if(!$keep_chunk){
            unset($this->chunks[$id]);
        }
    }

    return $this;
}

/**
 * Tell whether a chunk contains at least one of the given patterns.
 *
 * @param string $chunk    Text to inspect.
 * @param array  $patterns Literal strings, or PCRE patterns when $is_regex is true.
 * @param bool   $is_regex Treat every pattern as a regular expression.
 * @return bool            True as soon as one pattern is found.
 */
private function chunk_matches_any(string $chunk,array $patterns,bool $is_regex): bool
{
    foreach($patterns as $pattern){
        $pattern=(string)$pattern;
        if($is_regex){
            $found=(preg_match($pattern,$chunk)===1);
        } else {
            // strpos() !== false instead of strstr() truthiness: strstr()
            // returns the matched tail, which is falsy when that tail is "0".
            $found=(strpos($chunk,$pattern)!==false);
        }
        if($found){
            return true;
        }
    }

    return false;
}
||
201 | |||
/**
 * Run a regex over the chunks (or over earlier results) and collect the
 * first capture group of every match.
 *
 * The input is the chunk list when $restart is true or nothing has been
 * parsed yet; otherwise it is the list of previously parsed results. For
 * every input item with at least one match, one entry is appended to the
 * parsed results: a list of the first capture groups, or - when $only_one is
 * true - just the first capture group of the last match.
 *
 * @param string $pattern  PCRE pattern; its first capture group is what gets collected.
 * @param bool   $only_one Store a single value per item instead of a list.
 * @param bool   $restart  Read from the chunks even if parsed results exist.
 * @return PfPageparser    This instance, for chaining.
 */
public function parse_fom_chunks(string $pattern,bool $only_one=false, bool $restart=false): PfPageparser
{
    if(empty($this->chunks)){
        return $this;
    }

    // Snapshot of the input list; entries appended below are not re-read.
    $source=($restart || empty($this->parsed)) ? $this->chunks : $this->parsed;

    foreach($source as $subject){
        $hits=[];
        if(!preg_match_all($pattern,$subject,$hits,PREG_SET_ORDER)){
            // no match in this item: it contributes nothing
            continue;
        }

        $collected=[];
        foreach($hits as $hit){
            if($only_one){
                // each match overwrites the previous one, so the last wins
                $collected=$hit[1];
            } else {
                $collected[]=$hit[1];
            }
        }
        $this->parsed[]=$collected;
    }

    return $this;
}
||
233 | |||
234 | public function get_chunks(): array |
||
238 | |||
239 | public function results(bool $before_parsing=false): array |
||
247 | |||
/**
 * Forward a message to the logger, if one was supplied.
 *
 * @param string $text  Message to record.
 * @param int    $level Level passed through unchanged as the logger's first argument.
 */
private function log(string $text,int $level )
{
    // Without a logger there is nowhere to send the message.
    if(!$this->logger){
        return;
    }

    $this->logger->log($level,$text);
}
||
254 | |||
255 | } |
||
256 |