Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like PfPageparser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster to apply.
While breaking up the class, it is a good idea to analyze how other classes use PfPageparser, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 10 | class PfPageparser |
||
| 11 | { |
||
| 12 | // Build your next great package. |
||
| 13 | private $config; |
||
| 14 | private $content=""; |
||
| 15 | private $chunks=[]; |
||
| 16 | private $parsed=[]; |
||
| 17 | private $logger=null; |
||
| 18 | |||
/**
 * Create a parser with optional configuration overrides and an optional logger.
 *
 * @param array               $config Values merged on top of the built-in defaults
 *                                    (keys: userAgent, cacheTtl, timeOut, method).
 * @param AbstractLogger|null $logger Logger that log() forwards to; when null, nothing is logged.
 */
public function __construct(array $config = [], ?AbstractLogger $logger = null)
{
    $defaults = [
        'userAgent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'cacheTtl' => 3600,
        'timeOut' => 10,
        'method' => 'GET',
    ];

    // The logger property already defaults to null, so a plain assignment
    // covers both the "logger given" and "no logger" cases.
    $this->logger = $logger;

    // Caller-supplied keys win over the defaults.
    $this->config = array_merge($defaults, $config);
}
||
| 32 | |||
/**
 * Expose the configuration this parser is currently working with.
 *
 * @return array the value of the $config property
 */
public function get_config()
{
    $effective_config = $this->config;

    return $effective_config;
}
||
| 36 | |||
| 37 | /* ------------------------------------------ |
||
| 38 | * LOADING THE CONTENT FROM A URL/FILE/STRING |
||
| 39 | */ |
||
| 40 | |||
/**
 * Fetch a URL and store the response body as the raw content.
 *
 * @param string $url     Address to request.
 * @param array  $options Per-call overrides merged on top of the instance configuration
 *                        (keys read here: method, userAgent, timeOut).
 *
 * @return PfPageparser $this, for chaining. When the request fails, the failure is
 *                      logged and the previously loaded content is left untouched.
 */
public function load_from_url(string $url, array $options = []): PfPageparser
{
    // TODO: load with caching
    $options = array_merge($this->config, $options);

    // Forward the configured user agent and timeout instead of silently ignoring them.
    // NOTE(review): assumes Client is GuzzleHttp\Client (the catch type suggests so) and
    // that 'timeOut' is expressed in seconds, as Guzzle's 'timeout' option expects - confirm.
    $request_options = [];
    if (!empty($options['userAgent'])) {
        $request_options['headers'] = ['User-Agent' => $options['userAgent']];
    }
    if (!empty($options['timeOut'])) {
        $request_options['timeout'] = $options['timeOut'];
    }

    $client = new Client();
    try {
        $res = $client->request($options['method'], $url, $request_options);
    } catch (GuzzleException $e) {
        // log() requires a message and an int level; the original call passed neither.
        // NOTE(review): LOG_ERR (syslog severity) is used as the int level - confirm it
        // matches what the injected logger expects.
        $this->log(sprintf('Could not load %s: %s', $url, $e->getMessage()), LOG_ERR);

        // Without a response there is nothing to read; bail out rather than
        // dereferencing an undefined $res.
        return $this;
    }

    // getBody() yields a stream object; cast so the content is a plain string
    // for the string functions used by the trim/split methods.
    $this->content = (string) $res->getBody();

    return $this;
}
||
| 55 | |||
/**
 * Load the raw content directly from a local file.
 *
 * A missing, unreadable or non-regular file is ignored: the current content
 * stays as it was and no error is raised.
 *
 * @param string $filename Path of the file to read.
 *
 * @return PfPageparser $this, for chaining.
 */
public function load_from_file(string $filename): PfPageparser
{
    // is_file() also rejects directories, which file_exists() would let through.
    if (!is_file($filename) || !is_readable($filename)) {
        return $this;
    }

    $data = file_get_contents($filename);

    // file_get_contents() returns false on failure; never store that as content.
    if ($data !== false) {
        $this->content = $data;
    }

    return $this;
}
||
| 65 | |||
/**
 * Use an in-memory HTML string as the raw content.
 *
 * @param string $string Markup to work on.
 *
 * @return PfPageparser $this, for chaining.
 */
public function load_fom_string(string $string): PfPageparser
{
    $html = $string;
    $this->content = $html;

    return $this;
}
||
| 72 | |||
| 73 | /* ------------------------------------------ |
||
| 74 | * GET RAW CONTENT BACK |
||
| 75 | */ |
||
| 76 | |||
/**
 * Alias of raw(), kept for backward compatibility.
 *
 * @return string whatever raw() returns
 */
public function get_content(): string
{
    $current = $this->raw();

    return $current;
}
||
| 85 | |||
/**
 * Hand back the raw content as it currently stands.
 *
 * @return string the value of the $content property
 */
public function raw(): string
{
    $current = $this->content;

    return $current;
}
||
| 93 | |||
| 94 | /* ------------------------------------------ |
||
| 95 | * MODIFY THE RAW CONTENT |
||
| 96 | */ |
||
| 97 | |||
/**
 * Drop everything that precedes the first occurrence of a marker.
 *
 * The marker itself is kept. When the marker is not found (or already sits at
 * the very start), the content is left unchanged.
 *
 * @param string $pattern  Literal text, or a PCRE pattern when $is_regex is true.
 * @param bool   $is_regex Whether $pattern is a regular expression.
 *
 * @return PfPageparser $this, for chaining.
 */
public function trim_before(string $pattern, bool $is_regex = false): PfPageparser
{
    if ($pattern === '') {
        return $this;
    }

    $content = (string) $this->content;

    if ($is_regex) {
        // preg_match() only reports whether there was a match (0/1); the byte
        // position has to come from the captured offset.
        $offset = preg_match($pattern, $content, $matches, PREG_OFFSET_CAPTURE) === 1
            ? $matches[0][1]
            : false;
    } else {
        $offset = strpos($content, $pattern);
    }

    if ($offset !== false && $offset > 0) {
        $this->content = substr($content, $offset);
    }

    return $this;
}
||
| 104 | |||
/**
 * Drop the first occurrence of a marker and everything that follows it.
 *
 * When the marker is not found, the content is left unchanged. A marker found
 * at the very start leaves an empty string.
 *
 * @param string $pattern  Literal text, or a PCRE pattern when $is_regex is true.
 * @param bool   $is_regex Whether $pattern is a regular expression.
 *
 * @return PfPageparser $this, for chaining.
 */
public function trim_after(string $pattern, bool $is_regex = false): PfPageparser
{
    if ($pattern === '') {
        return $this;
    }

    $content = (string) $this->content;

    if ($is_regex) {
        // preg_match() only reports whether there was a match (0/1); the byte
        // position has to come from the captured offset.
        $offset = preg_match($pattern, $content, $matches, PREG_OFFSET_CAPTURE) === 1
            ? $matches[0][1]
            : false;
    } else {
        $offset = strpos($content, $pattern);
    }

    // Compare against false explicitly: offset 0 is a valid hit.
    if ($offset !== false) {
        $this->content = substr($content, 0, $offset);
    }

    return $this;
}
||
| 112 | |||
/**
 * Apply trim_before() and then trim_after() in one call.
 *
 * @param string $before   Marker handed to trim_before().
 * @param string $after    Marker handed to trim_after().
 * @param bool   $is_regex Whether both markers are regular expressions.
 *
 * @return PfPageparser $this, for chaining.
 */
public function trim(string $before="<body",string $after="</body",bool $is_regex=false): PfPageparser
{
    // Both helpers return $this, so the two steps can be chained.
    return $this
        ->trim_before($before, $is_regex)
        ->trim_after($after, $is_regex);
}
||
| 119 | |||
| 120 | /* ------------------------------------------ |
||
| 121 | * RAW CONTENT => CHUNKS |
||
| 122 | */ |
||
| 123 | |||
/**
 * Split the raw content into chunks on a literal or regex separator.
 *
 * The separators themselves are not part of the chunks. Both modes return the
 * same shape: every piece between separators, including the text before the
 * first and after the last one. Content without any separator yields a single
 * chunk holding the whole content.
 *
 * @param string $pattern  Literal separator, or a PCRE pattern when $is_regex is true.
 * @param bool   $is_regex Whether $pattern is a regular expression.
 *
 * @return PfPageparser $this, for chaining.
 */
public function split_chunks(string $pattern, bool $is_regex = false): PfPageparser
{
    $content = (string) $this->content;

    // An empty separator cannot split anything (explode() would fail on it).
    if ($pattern === '') {
        $this->chunks = [$content];

        return $this;
    }

    if (!$is_regex) {
        $this->chunks = explode($pattern, $content);

        return $this;
    }

    // preg_split() mirrors explode() for regex separators: no characters are
    // lost before a separator and the tail after the last one is kept.
    $pieces = preg_split($pattern, $content);

    // false signals a broken pattern; fall back to one chunk with everything.
    $this->chunks = ($pieces === false) ? [$content] : $pieces;

    return $this;
}
||
| 152 | |||
/**
 * Discard chunks that do not pass the keep/remove filters.
 *
 * A chunk survives when it matches at least one $pattern_keep entry (or that
 * list is empty) and matches none of the $pattern_remove entries. Rejected
 * chunks are unset, so the keys of the remaining chunks are not renumbered.
 * Nothing happens when the content has not been split into chunks yet.
 *
 * @param array $pattern_keep   Patterns of which at least one must occur in a chunk.
 * @param array $pattern_remove Patterns of which none may occur in a chunk.
 * @param bool  $is_regex       Whether the patterns are PCRE patterns instead of literal text.
 *
 * @return PfPageparser $this, for chaining.
 */
public function filter_chunks(array $pattern_keep = [], array $pattern_remove = [], bool $is_regex = false): PfPageparser
{
    if (empty($this->chunks)) {
        // not split in chunks yet: do nothing
        return $this;
    }

    foreach ($this->chunks as $id => $chunk) {
        $wanted = empty($pattern_keep)
            || $this->chunk_matches_any($chunk, $pattern_keep, $is_regex);
        $unwanted = !empty($pattern_remove)
            && $this->chunk_matches_any($chunk, $pattern_remove, $is_regex);

        if (!$wanted || $unwanted) {
            unset($this->chunks[$id]);
        }
    }

    return $this;
}

/**
 * Tell whether a chunk contains at least one of the given patterns.
 *
 * @param mixed $chunk    Chunk text to inspect (cast to string).
 * @param array $patterns Literal strings or PCRE patterns.
 * @param bool  $is_regex Whether $patterns are PCRE patterns.
 *
 * @return bool true as soon as one pattern is found.
 */
private function chunk_matches_any($chunk, array $patterns, bool $is_regex): bool
{
    $chunk = (string) $chunk;

    foreach ($patterns as $pattern) {
        // strpos() !== false instead of strstr() truthiness: a match whose
        // remainder is "0" would otherwise count as "not found".
        $found = $is_regex
            ? preg_match($pattern, $chunk) === 1
            : strpos($chunk, (string) $pattern) !== false;

        if ($found) {
            return true;
        }
    }

    return false;
}
||
| 201 | |||
/**
 * Extract the first capture group of a pattern from every chunk (or from the
 * results of an earlier parse) and append the findings to the parsed results.
 *
 * The source items are the chunks when $restart is true or nothing has been
 * parsed yet; otherwise they are the existing parsed results. Items without a
 * match contribute nothing. Nothing happens when there are no chunks.
 *
 * NOTE(review): new results are appended to the parsed list; earlier entries
 * are not cleared, even when $restart is true - confirm this is intended.
 *
 * @param string $pattern  PCRE pattern; it should contain a capture group, since
 *                         group 1 is what gets stored (null when it is absent).
 * @param bool   $only_one When true, store a single value per item (the capture of
 *                         the last match); otherwise store the list of all captures.
 * @param bool   $restart  When true, always parse from the chunks.
 *
 * @return PfPageparser $this, for chaining.
 */
public function parse_fom_chunks(string $pattern, bool $only_one = false, bool $restart = false): PfPageparser
{
    if (empty($this->chunks)) {
        return $this;
    }

    // A plain copy is enough: the items are only read, and the loop must not
    // see the entries it appends to the parsed list while running.
    $items = ($restart || empty($this->parsed)) ? $this->chunks : $this->parsed;

    foreach ($items as $item) {
        // Earlier passes may have stored lists of captures; only text can be matched.
        if (!is_string($item)) {
            continue;
        }

        $matches = [];
        if (!preg_match_all($pattern, $item, $matches, PREG_SET_ORDER)) {
            continue;
        }

        $captures = [];
        foreach ($matches as $match) {
            // Group 1 is missing when the pattern has no (matching) capture group.
            $captures[] = $match[1] ?? null;
        }

        $this->parsed[] = $only_one ? end($captures) : $captures;
    }

    return $this;
}
||
| 233 | |||
| 234 | public function get_chunks(): array |
||
| 238 | |||
| 239 | public function results(bool $before_parsing=false): array |
||
| 247 | |||
/**
 * Forward a message to the injected logger; a no-op when no logger was set.
 *
 * @param string $text  Message to record.
 * @param int    $level Level passed as the first argument to the logger's log() method.
 */
private function log(string $text,int $level )
{
    if (!$this->logger) {
        return;
    }

    $this->logger->log($level, $text);
}
||
| 254 | |||
| 255 | } |
||
| 256 |