| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | namespace PiedWeb\SeoPocketCrawler; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | use PiedWeb\UrlHarvester\Harvest; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | class Recorder | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |     const LINKS_DIR = '/links'; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |     const CACHE_DIR = '/cache'; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |     const CACHE_NONE = 0; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |     const CACHE_ID = 2; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |     const CACHE_URI = 1; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |     protected $folder; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |     protected $cacheMethod; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Stores the recorder settings and prepares the storage layout on disk.
                 *
                 * The base folder and its links/cache sub-directories are created when they
                 * are missing. Creation is recursive, so a nested base path works, and the
                 * sub-directories are also created when the base folder already exists.
                 * The links index is initialised only when the base folder did not exist
                 * before this call.
                 *
                 * @param string $folder      Base directory the recorder writes into
                 * @param int    $cacheMethod One of the CACHE_* constants
                 *
                 * @throws \RuntimeException When a required directory cannot be created
                 */
                public function __construct($folder, $cacheMethod = self::CACHE_ID)
                {
                    $this->folder = $folder;
                    $this->cacheMethod = $cacheMethod;

                    // Remember whether the base folder is new before anything is created.
                    $isNewFolder = !file_exists($folder);

                    foreach ([$folder, $folder.self::LINKS_DIR, $folder.self::CACHE_DIR] as $directory) {
                        // The trailing is_dir() covers a directory created concurrently by another process.
                        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
                            throw new \RuntimeException(sprintf('Unable to create directory "%s".', $directory));
                        }
                    }

                    if ($isNewFolder) {
                        $this->initLinksIndex();
                    }
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 31 | 6 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Writes the harvested response to the cache, unless it is already there.
                 *
                 * Nothing is written when caching is disabled, when mustWeCache() rejects
                 * the harvest, or when the cache file already exists. Otherwise two files
                 * are written: the headers and content (separated by two line breaks) at
                 * the cache path, and the JSON-encoded response info at the same path
                 * suffixed with "---info".
                 *
                 * @param Harvest $harvest Harvest whose response is stored
                 * @param Url     $url     Url used to compute the cache file path
                 *
                 * @return int|false|null Result of writing the info file, or null when nothing was written
                 */
                public function cache(Harvest $harvest, Url $url)
                {
                    if (self::CACHE_NONE === $this->cacheMethod) {
                        return;
                    }

                    if (!$this->mustWeCache($harvest)) {
                        return;
                    }

                    $target = $this->getCacheFilePath($url);
                    if (file_exists($target)) {
                        return;
                    }

                    $rawResponse = $harvest->getResponse()->getHeaders(false)
                        .PHP_EOL.PHP_EOL
                        .$harvest->getResponse()->getContent();
                    file_put_contents($target, $rawResponse);

                    $encodedInfo = json_encode($harvest->getResponse()->getInfo());

                    return file_put_contents($target.'---info', $encodedInfo);
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 48 | 7 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Returns the cache file path for a url, according to the cache method.
                 *
                 * CACHE_URI builds the path from the url's uri; any other cache method
                 * builds it from the url's id.
                 *
                 * @param Url $url Url to locate in the cache
                 *
                 * @return string Path of the cache file
                 */
                public function getCacheFilePath(Url $url)
                {
                    return self::CACHE_URI === $this->cacheMethod
                        ? $this->getCacheFilePathWithUrlAsFilename($url)
                        : $this->getCacheFilePathWithIdAsFilename($url);
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 57 | 3 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Builds a cache path that mirrors the url's uri as nested directories.
                 *
                 * Every uri segment except the last becomes a directory below the cache
                 * directory, created on the fly when missing. The last segment is used as
                 * the file name, with "index.html" as the fallback when it is empty.
                 *
                 * @param Url $url Url whose uri drives the directory layout
                 *
                 * @return string Path of the cache file
                 */
                protected function getCacheFilePathWithUrlAsFilename(Url $url)
                {
                    // The uri is normalised to end with a slash, so the last segment is always empty.
                    $segments = explode('/', trim($url->uri, '/').'/');
                    $fileName = array_pop($segments);

                    $path = $this->folder.self::CACHE_DIR;
                    foreach ($segments as $segment) {
                        $path .= '/'.$segment;
                        if (!(file_exists($path) && is_dir($path))) {
                            mkdir($path);
                        }
                    }

                    return $path.'/'.(empty($fileName) ? 'index.html' : $fileName);
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 76 | 4 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Builds a cache path that uses the url's id as the file name.
                 *
                 * @param Url $url Url whose id names the cache file
                 *
                 * @return string Path of the cache file inside the cache directory
                 */
                protected function getCacheFilePathWithIdAsFilename(Url $url)
                {
                    $cacheDirectory = $this->folder.self::CACHE_DIR;
                    $fileName = (string) $url->id;

                    return $cacheDirectory.'/'.$fileName;
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 81 | 6 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Tells whether the harvested response is an HTML document worth caching.
                 *
                 * The content type is matched case-insensitively, because media types are
                 * case-insensitive in HTTP. It is cast to string first, so a missing
                 * content type simply yields false.
                 *
                 * @param Harvest $harvest Harvest whose response content type is inspected
                 *
                 * @return bool True when the content type contains "text/html"
                 */
                protected function mustWeCache(Harvest $harvest)
                {
                    $contentType = (string) $harvest->getResponse()->getContentType();

                    return false !== stripos($contentType, 'text/html');
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 86 | 9 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Dumps the given urls into data.csv and index.csv inside the base folder.
                 *
                 * data.csv gets a header built from the object vars of the first url,
                 * followed by one row of object vars per url. index.csv gets an "id,uri"
                 * header followed by one id/uri row per url. Both files are truncated on
                 * each call. An empty list produces an empty data.csv and an index.csv
                 * that holds only its header.
                 *
                 * @param array $urls Objects exposing at least `id` and `uri`
                 *
                 * @return bool True when both files were written, false when either could not be opened
                 */
                public function record(array $urls)
                {
                    $dataCsv = fopen($this->folder.'/data.csv', 'w');
                    $indexCsv = fopen($this->folder.'/index.csv', 'w');

                    if (false === $dataCsv || false === $indexCsv) {
                        // Do not leak the handle that did open when its sibling failed.
                        foreach ([$dataCsv, $indexCsv] as $handle) {
                            if (false !== $handle) {
                                fclose($handle);
                            }
                        }

                        return false;
                    }

                    // The data header comes from the first url, so skip it for an empty list.
                    if ([] !== $urls) {
                        fputcsv($dataCsv, array_keys(get_object_vars(reset($urls))));
                    }
                    fputcsv($indexCsv, ['id', 'uri']);

                    foreach ($urls as $url) {
                        fputcsv($dataCsv, get_object_vars($url));
                        fputcsv($indexCsv, [$url->id, $url->uri]);
                    }

                    fclose($dataCsv);
                    fclose($indexCsv);

                    return true;
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 109 | 6 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Appends the uri of a linking url to the inbound-links file of the target.
                 *
                 * The file lives in the links directory and is named "To_<target id>_<type>".
                 * Each call adds one line holding the uri of the source url.
                 *
                 * @param Url $from Url the link comes from
                 * @param Url $to   Url the link points to
                 * @param int $type Link type, used as the file name suffix
                 */
                public function recordInboundLink(Url $from, Url $to, int $type)
                {
                    $linksFile = $this->folder.self::LINKS_DIR.'/To_'.(string) $to->id.'_'.$type;
                    $line = $from->uri.PHP_EOL;

                    file_put_contents($linksFile, $line, FILE_APPEND);
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 118 | 6 |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Write the URLs of all links found on `$from` to its outbound-links file.
                 *
                 * The file lives in the links directory and is named `From_<id of $from>`; it is
                 * overwritten on each call and holds one URL per line (no trailing newline).
                 *
                 * @param Url   $from  page whose `id` property names the file
                 * @param array $links link objects exposing getUrl()
                 */
                public function recordOutboundLink(Url $from, array $links)
                {
                    $linkUrls = [];
                    foreach ($links as $link) {
                        $linkUrls[] = $link->getUrl();
                    }

                    $outboundFile = $this->folder.self::LINKS_DIR.'/From_'.(string) $from->id;

                    file_put_contents($outboundFile, implode(PHP_EOL, $linkUrls));
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Create the links index (`Index.csv` in the links directory) with its
                 * `From,To` header line, unless the file already exists.
                 */
                protected function initLinksIndex()
                {
                    $indexFile = $this->folder.self::LINKS_DIR.'/Index.csv';

                    // An existing index is left untouched.
                    if (file_exists($indexFile)) {
                        return;
                    }

                    file_put_contents($indexFile, 'From,To'.PHP_EOL);
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Strip `$base` from the beginning of `$url`.
                 *
                 * @param string $base prefix to remove
                 * @param string $url  string the prefix is removed from
                 *
                 * @return string|null `$url` without its leading `$base` (possibly an empty string),
                 *                     or null when `$url` does not start with `$base`
                 */
                public static function removeBase(string $base, string $url)
                {
                    $baseLength = strlen($base);

                    // strncmp() only inspects the first $baseLength bytes and, unlike strpos(),
                    // treats an empty $base as a prefix on every PHP version (strpos() warns and
                    // returns false for an empty needle before PHP 8).
                    if (0 !== strncmp($url, $base, $baseLength)) {
                        return null;
                    }

                    return substr($url, $baseLength);
                }
            
                                                                                                            
                                                                
            
                                    
            
            
                | 138 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                /**
                 * Append one `From,To` row per link of `$from` to the links index (`Index.csv`).
                 *
                 * The `From` column is `$from->getId()`. The `To` column is:
                 *  - the id of the matching entry of `$urls` when the link's page URL, once `$base`
                 *    is removed, is a key of `$urls`;
                 *  - 0 when there is no such entry (the link is treated as external);
                 *  - -1 when the same link URL was already listed for this page: like Google, a
                 *    duplicate link is not counted again, so its juice is considered lost.
                 *
                 * @param string $base  prefix removed from each link's page URL before the lookup in `$urls`
                 * @param Url    $from  page the links belong to
                 * @param mixed  $urls  collection indexed by base-relative URL whose entries expose getId()
                 * @param array  $links link objects exposing getUrl() and getPageUrl()
                 */
                public function recordLinksIndex(string $base, Url $from, $urls, array $links)
                {
                    $fromId = $from->getId();
                    $seen = []; // link URL => true, for constant-time and exact duplicate detection
                    $rows = '';

                    foreach ($links as $link) {
                        $linkUrl = (string) $link->getUrl();

                        if (isset($seen[$linkUrl])) {
                            $rows .= $fromId.',-1'.PHP_EOL;

                            continue;
                        }
                        $seen[$linkUrl] = true;

                        $relative = self::removeBase($base, $link->getPageUrl());

                        // removeBase() returns null for URLs outside $base; a null offset would be
                        // coerced to '' and could match an unrelated entry, so it is ruled out first.
                        $targetId = (null !== $relative && isset($urls[$relative])) ? $urls[$relative]->getId() : 0; // 0 = external

                        $rows .= $fromId.','.$targetId.PHP_EOL;
                    }

                    file_put_contents($this->folder.self::LINKS_DIR.'/Index.csv', $rows, FILE_APPEND);
                }
            
                                                                                                            
                                                                
            
                                    
            
            
                | 157 |  |  | } | 
            
                                                        
            
                                    
            
            
                | 158 |  |  |  |