| 
                    1
                 | 
                                    
                                                     | 
                
                 | 
                <?php  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    2
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    3
                 | 
                                    
                                                     | 
                
                 | 
                namespace PiedWeb\SeoPocketCrawler;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    4
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    5
                 | 
                                    
                                                     | 
                
                 | 
                use PiedWeb\UrlHarvester\Harvest;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    6
                 | 
                                    
                                                     | 
                
                 | 
                use PiedWeb\UrlHarvester\Indexable;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    7
                 | 
                                    
                                                     | 
                
                 | 
                use Spatie\Robots\RobotsTxt;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    8
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                class Crawler  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    11
                 | 
                                    
                                                     | 
                
                 | 
                    protected $userAgent;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    12
                 | 
                                    
                                                     | 
                
                 | 
                    protected $project;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    13
                 | 
                                    
                                                     | 
                
                 | 
                    protected $ignore;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    14
                 | 
                                    
                                                     | 
                
                 | 
                    protected $limit;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    15
                 | 
                                    
                                                     | 
                
                 | 
                    protected $recorder;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    16
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                    protected $currentClick = 0;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    18
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    19
                 | 
                                    
                                                     | 
                
                 | 
                    protected $counter = 0;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    21
                 | 
                                    
                                                     | 
                
                 | 
                    protected $base;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                    protected $urls = [];  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    23
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    24
                 | 
                                    
                             3                          | 
                
                 | 
                    /**
                     * Prepare a crawl session for the given start URL.
                     *
                     * @param string $startUrl  First URL of the crawl; it seeds the URL list and is
                     *                          used to derive the base (scheme + domain) and the
                     *                          project folder name.
                     * @param string $ignore    Content handed to Spatie\Robots\RobotsTxt
                     *                          (presumably robots.txt-style rules; confirm with caller).
                     * @param int    $limit     Stored as-is; its use is outside this constructor.
                     * @param string $userAgent Stored as-is; its use is outside this constructor.
                     */
                    public function __construct(string $startUrl, string $ignore, int $limit, string $userAgent)
                    {
                        // Plain scalar settings: no computation involved.
                        $this->limit = $limit;
                        $this->userAgent = $userAgent;

                        // Register the start URL as known but not yet harvested (null value).
                        $this->urls[$startUrl] = null;

                        $base = Harvest::getDomainAndSchemeFrom($startUrl);
                        $this->base = $base;

                        // Strip every character outside the allowed set to get a folder-friendly name.
                        $projectName = preg_replace("([^\w\s\d\-_~,;\[\]\(\).])", '', $startUrl);
                        $this->project = $projectName;

                        $this->ignore = new RobotsTxt($ignore);

                        // Must run last: the folder paths depend on $this->project.
                        $this->initRecorderAndCache();
                    }
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    35
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    36
                 | 
                                    
                             3                          | 
                
                 | 
                    /**
                     * Build the path of the data folder for the current project.
                     *
                     * @return string Path made of this file's directory, '/../data/' and the project name
                     */
                    public function getDataFolder()
                    {
                        $dataRoot = __DIR__.'/../data/';

                        return $dataRoot.$this->project;
                    }
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    41
                 | 
                                    
                             3                          | 
                
                 | 
                    /**
                     * Build the path of the cache folder for the current project.
                     *
                     * @return string Path made of this file's directory, '/../cache/' and the project name
                     */
                    public function getCacheFolder()
                    {
                        $cacheRoot = __DIR__.'/../cache/';

                        return $cacheRoot.$this->project;
                    }
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    45
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    46
                 | 
                                    
                             3                          | 
                
                 | 
                    /**
                     * Create the recorder and reset the project's data and cache folders.
                     *
                     * Any existing data/cache folder for the project is removed, then the data
                     * folder, its 'links' sub-folder and the cache folder are created again.
                     */
                    protected function initRecorderAndCache()
                    {
                        $dataFolder = $this->getDataFolder();
                        $cacheFolder = $this->getCacheFolder();

                        $this->recorder = new Recorder($dataFolder);

                        // The project name is derived from the start URL and may still contain
                        // whitespace or shell metacharacters (';', '(', ')', '~'), so each path
                        // must be quoted before it reaches the shell.
                        exec('rm -rf '.escapeshellarg($dataFolder));
                        exec('rm -rf '.escapeshellarg($cacheFolder));

                        // Only recreate the tree when the removal above actually left nothing behind.
                        if (!file_exists($dataFolder)) {
                            mkdir($dataFolder);
                            mkdir($dataFolder.'/links');
                            mkdir($cacheFolder);
                        }
                    }
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    59
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    60
                 | 
                                    
                             3                          | 
                
                 | 
                    public function crawl(bool $debug = false)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    61
                 | 
                                    
                                                     | 
                
                 | 
                    { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    62
                 | 
                                    
                             3                          | 
                
                 | 
                        $nothingUpdated = true;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    63
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    64
                 | 
                                    
                             3                          | 
                
                 | 
                        if ($debug) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    65
                 | 
                                    
                             3                          | 
                
                 | 
                            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    66
                 | 
                                    
                             3                          | 
                
                 | 
                                        .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    67
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    68
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    69
                 | 
                                    
                             3                          | 
                
                 | 
                        foreach ($this->urls as $urlToParse => $url) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    70
                 | 
                                    
                             3                          | 
                
                 | 
                            if (null !== $url && (false === $url->can_be_crawled || true === $url->can_be_crawled)) { // déjà crawlé | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    71
                 | 
                                    
                                                     | 
                
                 | 
                                continue;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    72
                 | 
                                    
                                                     | 
                
                 | 
                            }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    73
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    74
                 | 
                                    
                             3                          | 
                
                 | 
                            if ($debug) { | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    75
                 | 
                                    
                             3                          | 
                
                 | 
                                echo '    '.$urlToParse.PHP_EOL;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    76
                 | 
                                    
                                                     | 
                
                 | 
                            }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    77
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    78
                 | 
                                    
                             3                          | 
                
                 | 
                            $nothingUpdated = false;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    79
                 | 
                                    
                             3                          | 
                
                 | 
                            ++$this->counter;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    80
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    81
                 | 
                                    
                             3                          | 
                
                 | 
                            $this->harvest($urlToParse);  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    82
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    83
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    84
                 | 
                                    
                             3                          | 
                
                 | 
                        ++$this->currentClick;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    85
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    86
                 | 
                                    
                             3                          | 
                
                 | 
                        $record = $nothingUpdated || $this->currentClick >= $this->limit;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    87
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    88
                 | 
                                    
                             3                          | 
                
                 | 
                        return $record ? $this->recorder->record($this->urls) : $this->crawl($debug);  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    89
                 | 
                                    
                                                     | 
                
                 | 
                    }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    90
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    91
                 | 
                                    
                             2                          | 
                
                 | 
                    /**
                     * Store the body of a harvested HTML page on disk, mirroring the URL path
                     * below the cache folder.
                     *
                     * Responses whose content type does not contain "text/html" are ignored.
                     * Every path segment except the last becomes a sub-directory (created when
                     * missing); the last segment is the file name, with "index.html" used when
                     * that segment is empty (URL ending with a slash, or the root).
                     *
                     * @param mixed $harvest Harvest result exposing getResponse() and
                     *                       getAbsoluteInternalLink(), as used below
                     *
                     * @return void
                     */
                    protected function cache($harvest)
                    {
                        if (false === strpos($harvest->getResponse()->getContentType(), 'text/html')) {
                            return;
                        }

                        $url = ltrim($harvest->getAbsoluteInternalLink($harvest->getResponse()->getEffectiveUrl()), '/');
                        $urlPart = explode('/', $url);
                        $folder = $this->getCacheFolder();

                        // explode() always yields at least one element, so the last segment
                        // (the file name) can be taken off before walking the directories.
                        $filename = array_pop($urlPart);

                        foreach ($urlPart as $directory) {
                            $folder .= '/'.$directory;
                            if (!file_exists($folder)) {
                                mkdir($folder);
                            }
                        }

                        // Strict comparison instead of empty(): empty('0') is true, which made a
                        // page named "0" be written to (and overwrite) index.html.
                        if ('' === $filename) {
                            $filename = 'index.html';
                        }

                        file_put_contents($folder.'/'.$filename, $harvest->getResponse()->getContent());
                    }
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    114
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    115
                 | 
                                    
                             3                          | 
                
                 | 
                    protected function harvest(string $urlToParse)  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    116
                 | 
                                    
                                                     | 
                
                 | 
                    { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    117
                 | 
                                    
                             3                          | 
                
                 | 
                        $url = $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($urlToParse, $this->currentClick);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    118
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    119
                 | 
                                    
                             3                          | 
                
                 | 
                        $url->updated_at = date('Ymd'); | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    120
                 | 
                                    
                             3                          | 
                
                 | 
                        $url->can_be_crawled = $this->ignore->allows($urlToParse, $this->userAgent);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    121
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    122
                 | 
                                    
                             3                          | 
                
                 | 
                        if (false === $url->can_be_crawled) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    123
                 | 
                                    
                                                     | 
                
                 | 
                            return;  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    124
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    125
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    126
                 | 
                                    
                             3                          | 
                
                 | 
                        $harvest = Harvest::fromUrl($urlToParse, $this->userAgent);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    127
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    128
                 | 
                                    
                             3                          | 
                
                 | 
                        if (!$harvest instanceof Harvest) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    129
                 | 
                                    
                             1                          | 
                
                 | 
                            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    130
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    131
                 | 
                                    
                             1                          | 
                
                 | 
                            return;  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    132
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    133
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    134
                 | 
                                    
                             2                          | 
                
                 | 
                        $url->indexable = $harvest->isIndexable();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    135
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    136
                 | 
                                    
                             2                          | 
                
                 | 
                        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    137
                 | 
                                    
                                                     | 
                
                 | 
                            $redir = $harvest->getRedirection();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    138
                 | 
                                    
                                                     | 
                
                 | 
                            if (false !== $redir) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    139
                 | 
                                    
                                                     | 
                
                 | 
                                $links = Harvest::LINK_INTERNAL === $harvest->getType($redir) ? [$redir] : [];  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                                                                        
                        
                         
                                                                                        
                                                                                            
                                                                                     
                     | 
                
            
                                                                        
                            
            
                                    
            
            
                | 
                    140
                 | 
                                    
                                                     | 
                
                 | 
                            }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    141
                 | 
                                    
                                                     | 
                
                 | 
                        } else { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    142
                 | 
                                    
                             2                          | 
                
                 | 
                            $this->cache($harvest);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    143
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    144
                 | 
                                    
                             2                          | 
                
                 | 
                            $this->recorder->recordOutboundLink($url, $harvest->getLinks());  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    145
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    146
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->links = count($harvest->getLinks());  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    147
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->links_duplicate = $harvest->getNbrDuplicateLinks();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    148
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    149
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    150
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    151
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    152
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    153
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->ratio_text_code = $harvest->getRatioTxtCode();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    154
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->load_time = $harvest->getResponse()->getInfo('total_time'); | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    155
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->size = $harvest->getResponse()->getInfo('size_download'); | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    156
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    157
                 | 
                                    
                             2                          | 
                
                 | 
                            $breadcrumb = $harvest->getBreadCrumb();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    158
                 | 
                                    
                             2                          | 
                
                 | 
                            if (is_array($breadcrumb)) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    159
                 | 
                                    
                                                     | 
                
                 | 
                                $url->breadcrumb_level = count($breadcrumb);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    160
                 | 
                                    
                                                     | 
                
                 | 
                                $url->breadcrumb_fisrt = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';  | 
            
                            
                    | 
                        
                     | 
                     | 
                     | 
                    
                                                                                                    
                        
                         
                                                                                        
                                                                                     
                     | 
                
            
                                                                        
                            
            
                                    
            
            
                | 
                    161
                 | 
                                    
                                                     | 
                
                 | 
                                $url->breadcrumb_text = $harvest->getBreadCrumb('//'); | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    162
                 | 
                                    
                                                     | 
                
                 | 
                            }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    163
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    164
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->title = $harvest->getUniqueTag('head title') ?? ''; | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    165
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->kws = ','.implode(',', $harvest->getKws()).','; | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    166
                 | 
                                    
                             2                          | 
                
                 | 
                            $url->h1 = $harvest->getUniqueTag('h1') ?? ''; | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    167
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    168
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    169
                 | 
                                    
                             2                          | 
                
                 | 
                        foreach ($harvest->getLinks(Harvest::LINK_INTERNAL) as $link) { | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    170
                 | 
                                    
                             2                          | 
                
                 | 
                            $linkUrl = $link->getPageUrl();  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    171
                 | 
                                    
                             2                          | 
                
                 | 
                            $this->urls[$linkUrl] = $this->urls[$linkUrl] ?? new Url($linkUrl, ($this->currentClick + 1));  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    172
                 | 
                                    
                             2                          | 
                
                 | 
                            $this->recorder->recordInboundLink($url, $this->urls[$linkUrl]);  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    173
                 | 
                                    
                             2                          | 
                
                 | 
                            ++$this->urls[$linkUrl]->inboundlinks;  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    174
                 | 
                                    
                                                     | 
                
                 | 
                        }  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    175
                 | 
                                    
                             2                          | 
                
                 | 
                    }  | 
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    176
                 | 
                                    
                                                     | 
                
                 | 
                }  | 
            
            
                                                        
            
                                    
            
            
                | 
                    177
                 | 
                                    
                                                     | 
                
                 | 
                 |