PiedWeb /
SeoPocketCrawler
| 1 | <?php |
||||
| 2 | |||||
| 3 | namespace PiedWeb\SeoPocketCrawler; |
||||
| 4 | |||||
| 5 | use PiedWeb\UrlHarvester\Harvest; |
||||
| 6 | use PiedWeb\UrlHarvester\Link; |
||||
| 7 | |||||
/**
 * Persists crawl results on disk under a single working folder.
 *
 * Layout produced inside the folder:
 *  - `/cache`          one file per crawled URL (raw headers + body, or a curl error code),
 *  - `/links`          per-URL inbound (`To_<id>_<follow>`) and outbound (`From_<id>`) link
 *                      lists, plus `Index.csv` holding `From,To` id pairs,
 *  - `/data.csv`       every public property of every recorded URL,
 *  - `/index.csv`      the `id,uri` lookup table.
 */
class Recorder
{
    const LINKS_DIR = '/links';
    const CACHE_DIR = '/cache';

    /** Do not cache responses at all. */
    const CACHE_NONE = 0;
    /** Name cache files after the URL id. */
    const CACHE_ID = 2;
    /** Mirror the URI as a directory tree inside the cache folder. */
    const CACHE_URI = 1;

    /** @var string Root folder every file is written under. */
    protected $folder;

    /** @var int One of the CACHE_* constants. */
    protected $cacheMethod;

    /**
     * Prepares the working folder and its `/links` and `/cache` sub-folders.
     *
     * @param string $folder      Root folder; created (with missing parents) when absent.
     * @param int    $cacheMethod One of the CACHE_* constants.
     */
    public function __construct($folder, $cacheMethod = self::CACHE_ID)
    {
        $this->folder = $folder;
        $this->cacheMethod = $cacheMethod;

        if (! file_exists($folder)) {
            // Recursive so that a nested output path does not require its parents to pre-exist.
            mkdir($folder, 0777, true);
        }

        if (! file_exists($folder.self::LINKS_DIR)) {
            mkdir($folder.self::LINKS_DIR);
            // The links index header is only written when the links folder is freshly created.
            $this->initLinksIndex();
        }

        if (! file_exists($folder.self::CACHE_DIR)) {
            mkdir($folder.self::CACHE_DIR);
        }
    }

    /**
     * Writes the response for `$url` into the cache, unless caching is disabled
     * or a cache file already exists for that URL.
     *
     * @param Harvest|mixed $harvest A Harvest, or any other value (stored as a curl error code).
     * @param Url           $url     The URL the harvest belongs to.
     *
     * @return int|false|null Result of the last `file_put_contents()` call,
     *                        or null when nothing was written.
     */
    public function cache($harvest, Url $url)
    {
        if (self::CACHE_NONE === $this->cacheMethod || ! $this->mustWeCache($harvest)) {
            return;
        }

        $filePath = $this->getCacheFilePath($url);
        if (file_exists($filePath)) {
            // Never overwrite an existing cache entry.
            return;
        }

        if (! $harvest instanceof Harvest) {
            return file_put_contents($filePath, 'curl_error_code:'.$harvest); // cache curl error code
        }

        $response = $harvest->getResponse();

        // Headers and body are stored in one file, separated by a blank line.
        file_put_contents(
            $filePath,
            $response->getHeaders(false).PHP_EOL.PHP_EOL.$response->getContent()
        );

        // Transfer info goes to a JSON sidecar file next to the cached response.
        return file_put_contents($filePath.'---info', json_encode($response->getInfo()));
    }

    /**
     * Resolves the cache file path for `$url` according to the configured cache method.
     *
     * @param Url $url
     *
     * @return string
     */
    public function getCacheFilePath(Url $url)
    {
        if (self::CACHE_URI === $this->cacheMethod) {
            return $this->getCacheFilePathWithUrlAsFilename($url);
        }

        return $this->getCacheFilePathWithIdAsFilename($url);
    }

    /**
     * Builds a cache path that mirrors the URI as nested directories, creating
     * each missing directory along the way.
     *
     * The URI is trimmed of slashes and then suffixed with one, so the final
     * segment is empty and the file name falls back to `index.html`.
     *
     * NOTE(review): URI segments are used verbatim as directory names; confirm
     * that callers only pass normalised URIs (no `..` segments).
     *
     * @param Url $url
     *
     * @return string
     */
    protected function getCacheFilePathWithUrlAsFilename(Url $url)
    {
        $segments = explode('/', trim($url->uri, '/').'/');
        // explode() always yields at least one element, so there is always a file name to pop.
        $fileName = array_pop($segments);

        $folder = $this->folder.self::CACHE_DIR;
        foreach ($segments as $segment) {
            $folder .= '/'.$segment;
            if (! is_dir($folder)) {
                mkdir($folder);
            }
        }

        return $folder.'/'.(empty($fileName) ? 'index.html' : $fileName);
    }

    /**
     * Builds a flat cache path named after the URL id.
     *
     * @param Url $url
     *
     * @return string
     */
    protected function getCacheFilePathWithIdAsFilename(Url $url)
    {
        return $this->folder.self::CACHE_DIR.'/'.(string) $url->id;
    }

    /**
     * Tells whether a harvest should be cached. Currently always true;
     * kept as an extension point.
     *
     * @param mixed $harvest
     *
     * @return bool
     */
    protected function mustWeCache($harvest)
    {
        return true;
    }

    /**
     * Rewrites `data.csv` (all public properties of every URL) and `index.csv`
     * (`id,uri` pairs) from scratch.
     *
     * The `data.csv` header is taken from the first URL; an empty `$urls`
     * therefore produces an empty `data.csv` and a header-only `index.csv`.
     *
     * @param array $urls Objects exposing at least public `id` and `uri` properties.
     *
     * @return bool False when either file could not be opened for writing.
     */
    public function record(array $urls)
    {
        $dataCsv = fopen($this->folder.'/data.csv', 'w');
        $indexCsv = fopen($this->folder.'/index.csv', 'w');

        if (false === $dataCsv || false === $indexCsv) {
            // Release whichever handle did open so a partial failure does not leak it.
            if (false !== $dataCsv) {
                fclose($dataCsv);
            }
            if (false !== $indexCsv) {
                fclose($indexCsv);
            }

            return false;
        }

        if ([] !== $urls) {
            fputcsv($dataCsv, array_keys(get_object_vars(reset($urls))));
        }
        fputcsv($indexCsv, ['id', 'uri']);

        foreach ($urls as $url) {
            fputcsv($dataCsv, get_object_vars($url));
            fputcsv($indexCsv, [$url->id, $url->uri]);
        }

        fclose($dataCsv);
        fclose($indexCsv);

        return true;
    }

    /**
     * Appends one inbound link line to the `To_<id>_<mayFollow>` file of the target URL.
     *
     * @param Link $link The link being recorded.
     * @param Url  $from Source URL (not used here; kept for the public signature).
     * @param Url  $to   Target URL whose id names the file.
     */
    public function recordInboundLink(Link $link, Url $from, Url $to)
    {
        $file = $this->folder.self::LINKS_DIR.'/To_'.(string) $to->id.'_'.((int) $link->mayFollow());

        // can use ->relativize to get only /uri
        file_put_contents($file, $this->inboundLinkToStr($link).PHP_EOL, FILE_APPEND);
    }

    /**
     * Serialises an inbound link as `parentUrl;anchor;mayFollow;type`.
     *
     * @param Link $link
     *
     * @return string
     */
    protected function inboundLinkToStr(Link $link)
    {
        return implode(';', [
            $link->getParentUrl(),
            $link->getAnchor(),
            (int) $link->mayFollow(),
            $link->getType(),
        ]);
    }

    /**
     * Overwrites the `From_<id>` file with one `url;anchor;mayFollow;type` line per outbound link.
     *
     * @param Url    $from  URL the links were found on.
     * @param Link[] $links
     */
    public function recordOutboundLink(Url $from, array $links)
    {
        $lines = array_map(static function (Link $link) {
            return $link->getUrl().';'.$link->getAnchor().';'.((int) $link->mayFollow()).';'.$link->getType();
        }, $links);

        file_put_contents($this->folder.self::LINKS_DIR.'/From_'.(string) $from->id, implode(PHP_EOL, $lines));
    }

    /**
     * Creates the links `Index.csv` with its `From,To` header when the file does not exist yet.
     */
    protected function initLinksIndex()
    {
        $indexFile = $this->folder.self::LINKS_DIR.'/Index.csv';

        if (! file_exists($indexFile)) {
            file_put_contents($indexFile, 'From,To'.PHP_EOL);
        }
    }

    /**
     * Strips `$base` from the start of `$url`.
     *
     * @param string $base
     * @param string $url
     *
     * @return string|null The remainder of `$url`, or null when `$url` does not start with `$base`.
     */
    public static function removeBase(string $base, string $url)
    {
        if (0 !== strpos($url, $base)) {
            return null;
        }

        return substr_replace($url, '', 0, strlen($base));
    }

    /**
     * Appends one `fromId,toId` line per link to the links `Index.csv`.
     *
     * The target id is `-1` for a link whose URL was already seen in this call
     * (like Google, a duplicate link should not be added, so its juice is
     * counted as lost), `0` for a link that is external or unknown to `$urls`,
     * and the known URL's id otherwise.
     *
     * @param string            $base  Base URL stripped from each link's page URL before lookup.
     * @param Url               $from  URL the links were found on.
     * @param array|\ArrayAccess $urls  Known URLs indexed by their base-relative URI.
     * @param Link[]            $links Links found on `$from`.
     */
    public function recordLinksIndex(string $base, Url $from, $urls, array $links)
    {
        $seen = [];
        $content = '';
        $fromId = $from->getId();

        foreach ($links as $link) {
            $target = (string) $link->getUrl();

            // Keyed lookup keeps duplicate detection O(1) per link and strictly string-based.
            if (isset($seen[$target])) {
                $content .= $fromId.',-1'.PHP_EOL;

                continue;
            }
            $seen[$target] = true;

            $relative = self::removeBase($base, $link->getPageUrl());
            // A null $relative means the link leaves $base: never use it as an array offset.
            $toId = (null !== $relative && isset($urls[$relative])) ? $urls[$relative]->getId() : 0; // 0 = external

            $content .= $fromId.','.$toId.PHP_EOL;
        }

        file_put_contents($this->folder.self::LINKS_DIR.'/Index.csv', $content, FILE_APPEND);
    }
}
||||
| 174 |