PiedWeb /
SeoPocketCrawler
| 1 | <?php |
||
| 2 | |||
| 3 | namespace PiedWeb\SeoPocketCrawler; |
||
| 4 | |||
| 5 | use PiedWeb\UrlHarvester\Harvest; |
||
| 6 | use PiedWeb\UrlHarvester\Indexable; |
||
| 7 | use Spatie\Robots\RobotsTxt; |
||
| 8 | |||
/**
 * Crawls a site starting from a single URL.
 *
 * Each call to crawl() harvests every known URL that has not been processed
 * yet, then recurses for the next pass until a pass finds nothing to do or the
 * pass limit is reached. Per-URL data is collected on Url objects and handed
 * to a Recorder; HTML responses are additionally mirrored in a cache folder.
 */
class Crawler
{
    /** @var string User agent given to Harvest::fromUrl() and to the ignore rules. */
    protected $userAgent;

    /** @var string Start URL stripped of disallowed characters; names the data and cache folders. */
    protected $project;

    /** @var RobotsTxt Rules built from the `$ignore` constructor argument; decides which URLs may be crawled. */
    protected $ignore;

    /** @var int Maximum number of crawl passes. */
    protected $limit;

    /** @var Recorder Receives inbound/outbound links and the final URL list. */
    protected $recorder;

    /** @var int Index of the current pass; passed to Url as its second constructor argument. */
    protected $currentClick = 0;

    /** @var int Number of URLs handed to harvest() so far. */
    protected $counter = 0;

    /** @var mixed Result of Harvest::getDomainAndSchemeFrom() for the start URL. */
    protected $base;

    /** @var array<string, Url|null> Known URLs keyed by URL string; null until first harvested. */
    protected $urls = [];

    /**
     * @param string $startUrl  First URL to crawl
     * @param string $ignore    Rules handed to RobotsTxt to exclude URLs
     * @param int    $limit     Maximum number of crawl passes
     * @param string $userAgent User agent used for requests and rule matching
     */
    public function __construct(string $startUrl, string $ignore, int $limit, string $userAgent)
    {
        $this->urls[$startUrl] = null;
        $this->base = Harvest::getDomainAndSchemeFrom($startUrl);
        // Keep only word characters, whitespace and a few punctuation marks so
        // the start URL can be used as a folder name (slashes are removed).
        $this->project = preg_replace("([^\w\s\d\-_~,;\[\]\(\).])", '', $startUrl);
        $this->ignore = new RobotsTxt($ignore);
        $this->userAgent = $userAgent;
        $this->limit = $limit;

        $this->initRecorderAndCache();
    }

    /**
     * @return string Folder in which the crawl data of this project is stored
     */
    public function getDataFolder()
    {
        return __DIR__.'/../data/'.$this->project;
    }

    /**
     * @return string Folder in which the cached responses of this project are stored
     */
    public function getCacheFolder()
    {
        return __DIR__.'/../cache/'.$this->project;
    }

    /**
     * Creates the recorder and resets the data and cache folders of the project.
     *
     * Anything left in those folders by a previous run is deleted.
     */
    protected function initRecorderAndCache()
    {
        $dataFolder = $this->getDataFolder();
        $cacheFolder = $this->getCacheFolder();

        $this->recorder = new Recorder($dataFolder);

        // The project name may still contain whitespace, ';', '(', ')' or '~':
        // quote the paths so the shell always sees exactly one argument.
        exec('rm -rf '.escapeshellarg($dataFolder));
        exec('rm -rf '.escapeshellarg($cacheFolder));

        // Recursive creation also covers a missing parent folder and, through
        // `links`, the data folder itself.
        foreach ([$dataFolder.'/links', $cacheFolder] as $folder) {
            if (!file_exists($folder)) {
                mkdir($folder, 0777, true);
            }
        }
    }

    /**
     * Runs one crawl pass, then recurses until a pass harvests nothing or the
     * pass limit is reached.
     *
     * @param bool $debug Print progress to standard output
     *
     * @return mixed Whatever Recorder::record() returns for the collected URLs
     */
    public function crawl(bool $debug = false)
    {
        $nothingUpdated = true;

        if ($debug) {
            echo PHP_EOL.PHP_EOL.'// -----'.PHP_EOL.'// '.$this->counter.' crawled / '
                .count($this->urls).' found '.PHP_EOL.'// -----'.PHP_EOL;
        }

        // foreach by value walks the array as it was when the loop started, so
        // URLs queued by harvest() during this pass wait for the next one.
        foreach ($this->urls as $urlToParse => $url) {
            // harvest() always sets `can_be_crawled` to a boolean: already crawled.
            if (null !== $url && is_bool($url->can_be_crawled)) {
                continue;
            }

            if ($debug) {
                echo ' '.$urlToParse.PHP_EOL;
            }

            $nothingUpdated = false;
            ++$this->counter;

            $this->harvest($urlToParse);
        }

        ++$this->currentClick;

        $record = $nothingUpdated || $this->currentClick >= $this->limit;

        return $record ? $this->recorder->record($this->urls) : $this->crawl($debug);
    }

    /**
     * Stores the body of an HTML response in the cache folder, reproducing the
     * URL path as a folder hierarchy. Non-HTML responses are ignored.
     *
     * A URL whose last path segment is empty is stored as `index.html`.
     *
     * @param Harvest $harvest
     */
    protected function cache($harvest)
    {
        $response = $harvest->getResponse();

        if (false === strpos((string) $response->getContentType(), 'text/html')) {
            return;
        }

        $url = ltrim($harvest->getAbsoluteInternalLink($response->getEffectiveUrl()), '/');
        $segments = explode('/', $url);
        $filename = array_pop($segments);
        $folder = $this->getCacheFolder();

        foreach ($segments as $segment) {
            // '' and '.' would not change the folder; '..' (possible through a
            // query string containing slashes) would leave the cache folder.
            if ('' === $segment || '.' === $segment || '..' === $segment) {
                continue;
            }

            $folder .= '/'.$segment;
            // NOTE: if an earlier URL was cached as a *file* with this name, no
            // folder is created and the write below fails with a warning.
            if (!file_exists($folder)) {
                mkdir($folder);
            }
        }

        // Strict comparison: empty() would also rename a segment equal to "0".
        if ('' === $filename) {
            $filename = 'index.html';
        }

        file_put_contents($folder.'/'.$filename, $response->getContent());
    }

    /**
     * Fetches one URL, fills its Url object and queues the internal URLs it
     * leads to for the next pass.
     *
     * Nothing is requested when the ignore rules disallow the URL. A failed
     * request is flagged as a network error. A 3XX response only queues its
     * redirect target (when internal); any other response is cached, measured
     * and has its internal links queued.
     *
     * @param string $urlToParse URL to fetch; must be a key of `$this->urls`
     */
    protected function harvest(string $urlToParse)
    {
        $url = $this->urls[$urlToParse] = $this->urls[$urlToParse] ?? new Url($urlToParse, $this->currentClick);

        $url->updated_at = date('Ymd');
        $url->can_be_crawled = $this->ignore->allows($urlToParse, $this->userAgent);

        if (false === $url->can_be_crawled) {
            return;
        }

        $harvest = Harvest::fromUrl($urlToParse, $this->userAgent);

        if (!$harvest instanceof Harvest) {
            $url->indexable = Indexable::NOT_INDEXABLE_NETWORK_ERROR;

            return;
        }

        $url->indexable = $harvest->isIndexable();

        // URLs reached from this page, one entry per link occurrence.
        $targets = [];

        if (Indexable::NOT_INDEXABLE_3XX === $url->indexable) {
            $redir = $harvest->getRedirection();
            // Follow the redirection when it stays on the crawled site.
            // NOTE(review): assumes getRedirection() returns the target in the
            // same form as Link::getPageUrl() — confirm against UrlHarvester.
            if (false !== $redir && Harvest::LINK_INTERNAL === $harvest->getType($redir)) {
                $targets[] = (string) $redir;
            }
        } else {
            $this->cache($harvest);

            $this->recorder->recordOutboundLink($url, $harvest->getLinks());

            $url->links = count($harvest->getLinks());
            $url->links_duplicate = $harvest->getNbrDuplicateLinks();
            $url->links_internal = count($harvest->getLinks(Harvest::LINK_INTERNAL));
            $url->links_self = count($harvest->getLinks(Harvest::LINK_SELF));
            $url->links_sub = count($harvest->getLinks(Harvest::LINK_SUB));
            $url->links_external = count($harvest->getLinks(Harvest::LINK_EXTERNAL));

            $url->ratio_text_code = $harvest->getRatioTxtCode();
            $url->load_time = $harvest->getResponse()->getInfo('total_time');
            $url->size = $harvest->getResponse()->getInfo('size_download');

            $breadcrumb = $harvest->getBreadCrumb();
            if (is_array($breadcrumb)) {
                $url->breadcrumb_level = count($breadcrumb);
                // NOTE(review): `breadcrumb_fisrt` looks like a typo for
                // `breadcrumb_first`; kept because the Url class is not visible
                // here — confirm which name Url and Recorder expect.
                $url->breadcrumb_fisrt = isset($breadcrumb[1]) ? $breadcrumb[1]->getCleanName() : '';
                $url->breadcrumb_text = $harvest->getBreadCrumb('//');
            }

            $url->title = $harvest->getUniqueTag('head title') ?? '';
            $url->kws = ','.implode(',', $harvest->getKws()).',';
            $url->h1 = $harvest->getUniqueTag('h1') ?? '';

            foreach ($harvest->getLinks(Harvest::LINK_INTERNAL) as $link) {
                $targets[] = $link->getPageUrl();
            }
        }

        foreach ($targets as $linkUrl) {
            // Newly discovered URLs are one pass deeper than the current page.
            $this->urls[$linkUrl] = $this->urls[$linkUrl] ?? new Url($linkUrl, ($this->currentClick + 1));
            $this->recorder->recordInboundLink($url, $this->urls[$linkUrl]);
            ++$this->urls[$linkUrl]->inboundlinks;
        }
    }
}
||
| 177 |