cimmwolf /
sitemap-parser
<?php

use DenisBeliaev\SitemapParser\Link;
use DenisBeliaev\SitemapParser\Page;

require_once __DIR__ . '/config.php';

/*
 * Worker script: fetches every page URL stored for one scope, records the
 * resulting HTTP status in the `pages` table and, for the `parse-links`
 * task, stores links that are not yet known pages in the `links` table.
 *
 * Command-line options:
 *   -s <scope>  (required) value matched against pages.scope
 *   -p <pid>    (optional) PID that must stay alive; once isRunning() reports
 *               it gone the script exits with status 1
 *   -t <task>   (optional) 'check' (default, HEAD-style request) or 'parse-links'
 */

$opts = getopt('s:p:t:');

if (!isset($opts['s'])) {
    fwrite(STDERR, 'Missing required option: -s <scope>' . PHP_EOL);
    exit(1);
}

$scope = $opts['s'];
$pPID = $opts['p'] ?? null;
$task = $opts['t'] ?? 'check';

// Maximum rows per INSERT statement. Each row binds three parameters, so 300
// rows stay below a 999 bound-parameter limit.
// NOTE(review): 999 is the default of older SQLite builds; `INSERT OR IGNORE`
// suggests DSN points at SQLite, but DSN is defined in config.php - confirm.
$insertBatchSize = 300;

$pdo = new PDO(DSN);

$stm = $pdo->prepare('SELECT url FROM pages WHERE scope=:scope');
$stm->execute([':scope' => $scope]);
$items = $stm->fetchAll(PDO::FETCH_COLUMN);
unset($stm);

// Applied once here; the value never changes between iterations.
$pdo->setAttribute(PDO::ATTR_TIMEOUT, 100);

$ch = curl_init();
if ($ch === false) {
    fwrite(STDERR, 'Unable to initialise cURL' . PHP_EOL);
    exit(1);
}

// Always capture the response instead of letting cURL echo it: the 501
// fallback below downloads a body even in 'check' mode.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
if ($task === 'check') {
    curl_setopt($ch, CURLOPT_NOBODY, true);
}

foreach ($items as $item) {
    curl_setopt($ch, CURLOPT_URL, $item);
    $httpCode = 'ERR';

    // Compare against false explicitly: with CURLOPT_RETURNTRANSFER an empty
    // body is returned as '' and must not be mistaken for a failed transfer.
    $content = curl_exec($ch);
    if ($content !== false) {
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($httpCode === 501 && $task === 'check') {
            // The body-less request was answered with 501; retry once as a
            // regular GET, then restore the body-less mode for the next URL.
            curl_setopt($ch, CURLOPT_NOBODY, false);
            curl_setopt($ch, CURLOPT_HTTPGET, true);
            if (curl_exec($ch) !== false) {
                $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            }
            curl_setopt($ch, CURLOPT_NOBODY, true);
        }
    }

    // Statements are prepared and released per use, as in the original flow.
    // NOTE(review): presumably this keeps locks short on a database shared
    // with other workers - confirm before hoisting the prepare calls.
    $stm = $pdo->prepare('UPDATE pages SET status=:status WHERE url=:url');
    $stm->execute([':status' => $httpCode, ':url' => $item]);
    unset($stm);

    // Only parse when a non-empty body was actually downloaded.
    if ($task === 'parse-links' && is_string($content) && $content !== '') {
        $page = new Page($content, $item);
        $rawLinks = $page->links;

        // Normalise by value, keeping the original keys so each normalised
        // URL can be paired with its raw link. No by-reference loops are used,
        // which avoids a dangling reference corrupting the last element.
        $normalized = [];
        foreach ($rawLinks as $key => $rawLink) {
            $normalized[$key] = Link::normalize($rawLink, $page->base);
        }
        $normalized = array_unique(array_filter($normalized));

        // Keep only URLs that are not already present in `pages`.
        $unknown = array_filter($normalized, function ($url) use ($pdo) {
            $stm = $pdo->prepare('SELECT COUNT(url) FROM pages WHERE url=:url');
            $stm->execute([':url' => $url]);

            return (int) $stm->fetchColumn() === 0;
        });

        // One row per link: [source page, link as found, normalised URL].
        $rows = [];
        foreach ($unknown as $key => $url) {
            $rows[] = [$item, $rawLinks[$key], $url];
        }

        foreach (array_chunk($rows, $insertBatchSize) as $batch) {
            $placeholders = implode(',', array_fill(0, count($batch), '(?,?,?)'));
            $stm = $pdo->prepare('INSERT OR IGNORE INTO links (`page`, link, url) VALUES ' . $placeholders);
            if ($stm === false) {
                fwrite(STDERR, $pdo->errorInfo()[2] . PHP_EOL);
                // Non-zero status so the failure is visible to the caller.
                exit(1);
            }
            $stm->execute(array_merge(...$batch));
            unset($stm);
        }
    }

    // Stop working once the process named by -p is no longer alive.
    if ($pPID && !isRunning($pPID)) {
        exit(1);
    }
}

unset($pdo);
curl_close($ch);
||||
|
0 ignored issues
–
show
It seems like
$ch can also be of type false; however, parameter $ch of curl_close() does only seem to accept resource, maybe add an additional type check?
(
Ignorable by Annotation
)
If this is a false-positive, you can also ignore this issue in your code via the
Loading history...
|
|||||
| 89 | |||||
/**
 * Report whether a process with the given PID currently exists.
 *
 * Uses posix_kill() with signal 0 when the posix extension is loaded;
 * otherwise falls back to counting the output lines of `ps -W -p <pid>`
 * (more than one line is taken to mean the process was listed).
 *
 * The PID is validated as a positive integer first. This keeps 0 and negative
 * values from being interpreted by posix_kill() as "process group" / "every
 * process", and guarantees that only digits reach the shell command.
 *
 * NOTE(review): posix_kill() also returns false when the signal cannot be
 * delivered for lack of permission, so a live process owned by another user
 * would be reported as not running - confirm this cannot happen for callers.
 *
 * @param int|string $pid Process ID to probe; numeric strings are accepted.
 *
 * @return bool True when the process appears to be alive; false otherwise,
 *              including when $pid is not a valid positive integer.
 */
function isRunning($pid)
{
    $pid = filter_var($pid, FILTER_VALIDATE_INT, ['options' => ['min_range' => 1]]);
    if ($pid === false) {
        return false;
    }

    if (function_exists('posix_kill')) {
        // Signal 0 performs the existence/permission check without sending anything.
        return posix_kill($pid, 0);
    }

    $out = [];
    exec('ps -W -p ' . $pid, $out);

    return count($out) > 1;
}
||||
| 98 |