1
|
|
|
<?php |
|
|
|
|
2
|
|
|
require_once __DIR__ . '/config.php'; |
3
|
|
|
|
4
|
|
|
use Symfony\Component\Console\Helper\ProgressBar; |
5
|
|
|
use Symfony\Component\Console\Output\OutputInterface; |
6
|
|
|
|
7
|
|
|
// Process id of this launcher script; runProcesses() forwards it to every worker via "-p".
define(
    'PID',
    getmypid()
);
8
|
|
|
/**
 * Distributes pages over PROCESSES background workers and shows their combined progress.
 *
 * The `pages` table is recreated, every page is stored with the index of the worker that
 * owns it (`scope`), one `_process.php` worker is launched per slice, and the table is then
 * polled twice a second until every row has a non-NULL `status`.
 *
 * @param array           $pages  URLs to hand out to the workers
 * @param string          $task   task name forwarded to _process.php through its -t option
 * @param OutputInterface $output console output the progress bar is drawn on
 *
 * @throws RuntimeException when an INSERT statement cannot be prepared
 */
function runProcesses($pages, $task, &$output)
{
    $pdo = new PDO(DSN);
    $pdo->query('DROP TABLE IF EXISTS pages');
    $pdo->query('CREATE TABLE pages (url TEXT NOT NULL, status TEXT, scope INTEGER NOT NULL)');

    $total = count($pages);
    // Same for every slice, so compute it once; ceil() returns a float but array_slice() wants ints.
    $sliceSize = (int) ceil($total / PROCESSES);
    // Escaped once: the script path may contain spaces or shell metacharacters.
    $workerScript = escapeshellarg(__DIR__ . '/_process.php');

    for ($i = 0; $i < PROCESSES; $i++) {
        $slice = array_slice($pages, $i * $sliceSize, $sliceSize);

        // One bound parameter per row, at most 999 rows per statement
        // (presumably the driver's bound-parameter limit -- confirm against DSN).
        foreach (array_chunk($slice, 999) as $part) {
            $values = implode(',', array_fill(0, count($part), "(?,$i)"));
            $stmt = $pdo->prepare("INSERT INTO pages (url, scope) VALUES $values");
            if (!$stmt) {
                // Stop here: carrying on would call execute() on false, and rows that were
                // never inserted would keep the wait loop below from ever finishing.
                throw new RuntimeException('Cannot prepare INSERT INTO pages: ' . (string) $pdo->errorInfo()[2]);
            }
            $stmt->execute($part);
            unset($stmt);
        }

        execInBackground(sprintf(
            'php %s -s %d -p %d -t %s',
            $workerScript,
            $i,
            PID,
            escapeshellarg($task)
        ));
    }

    $done = 0;
    $progress = new ProgressBar($output, $total);
    $progress->setRedrawFrequency(10);
    $progress->setFormatDefinition('custom', ' %current%/%max% %bar% %message%');
    $progress->setFormat('custom');
    $progress->setMessage('');
    $progress->start();

    // NOTE(review): the loop ends only when every row has a status; if a worker dies before
    // finishing its slice this waits forever. Consider a timeout or a worker liveness check.
    while ($done < $total) {
        $done = (int) $pdo->query('SELECT COUNT(status) FROM pages WHERE status IS NOT NULL')->fetchColumn();
        $codes = $pdo->query('SELECT status, COUNT(status) AS count FROM pages WHERE status IS NOT NULL GROUP BY status')->fetchAll(PDO::FETCH_ASSOC);

        // Per-status summary shown next to the bar, with "200" displayed as "ok".
        $message = [];
        foreach ($codes as $code) {
            $message[] = str_replace('200', 'ok', $code['status']) . ': ' . $code['count'];
        }
        $progress->setMessage(implode(' ', $message));
        $progress->setProgress($done);
        usleep(500000);
    }
    $progress->finish();
}
58
|
|
|
|
59
|
|
|
/**
 * Loads <website_url>/sitemap.xml and returns the URL-decoded <loc> value of every entry.
 *
 * When the sitemap cannot be loaded, an error line is written to $output and the script
 * terminates with exit status 1.
 *
 * @param string          $website_url site base URL, with or without a trailing slash
 * @param OutputInterface $output      console output used for the status and error lines
 * @return array decoded page URLs, in sitemap order
 */
function getPages($website_url, &$output)
{
    $output->writeln(" Getting sitemap of $website_url ...");

    // Normalise the base so a trailing slash does not produce "//sitemap.xml".
    $sitemapUrl = rtrim($website_url, '/') . '/sitemap.xml';
    $sitemap = simplexml_load_file($sitemapUrl);

    if ($sitemap === false) {
        $output->writeln("Can't get $sitemapUrl");
        // Non-zero status so shells and CI can tell the run failed.
        exit(1);
    }

    $pages = [];
    foreach ($sitemap as $url) {
        // $url->loc is a SimpleXMLElement; cast explicitly instead of relying on coercion.
        $pages[] = urldecode((string) $url->loc);
    }
    return $pages;
}
80
|
|
|
|
81
|
|
|
/**
 * Launches a shell command without waiting for it to finish.
 *
 * On Windows the command is started through `start /B`; on every other system it is run with
 * a trailing `&` and both output streams redirected to /dev/null.
 *
 * @param string $cmd complete command line; it reaches the shell unchanged, so the caller is
 *                    responsible for escaping its arguments
 * @throws RuntimeException when the Windows launcher process cannot be opened
 */
function execInBackground($cmd)
{
    $isWindows = strncmp(php_uname(), 'Windows', 7) === 0;

    if (!$isWindows) {
        exec($cmd . " > /dev/null 2>/dev/null &");
        return;
    }

    $handle = popen("start /B " . $cmd, "r");
    if ($handle === false) {
        // pclose(false) is a TypeError on PHP 8; report the real problem instead.
        throw new RuntimeException("Unable to start background command: $cmd");
    }
    pclose($handle);
}
92
|
|
|
|
93
|
|
|
|
94
|
|
|
// Console application that the commands below are registered on.
$app = new Silly\Application();

// check <website_url>: read the site's sitemap and run the "check" task over every listed page.
$app->command('check website_url', function ($website_url, OutputInterface $output) {
    runProcesses(
        getPages($website_url, $output),
        'check',
        $output
    );
});
100
|
|
|
|
101
|
|
|
// links <website_url>: run the "parse-links" task over the sitemap pages, then run the
// "check" task over every URL found in the links table afterwards.
$app->command('links website_url', function ($website_url, OutputInterface $output) {
    $sitemapPages = getPages($website_url, $output);

    // Start from an empty links table in which every URL may appear only once.
    $schema = [
        'DROP TABLE IF EXISTS links',
        'CREATE TABLE links (url TEXT NOT NULL, status TEXT)',
        'CREATE UNIQUE INDEX links_url_uindex ON links (url);',
    ];
    $db = new PDO(DSN);
    foreach ($schema as $sql) {
        $db->query($sql);
    }
    // Release the connection before the workers are started.
    $db = null;

    runProcesses($sitemapPages, 'parse-links', $output);

    // Read back whatever is in the links table now (presumably filled by the workers).
    $db = new PDO(DSN);
    // $db->query("DELETE FROM links WHERE url NOT LIKE 'http%'");
    $links = $db->query('SELECT url FROM links')->fetchAll(PDO::FETCH_COLUMN);
    $db = null;

    foreach (['', ' Checking founded links...'] as $line) {
        $output->writeln($line);
    }

    runProcesses($links, 'check', $output);
});
122
|
|
|
|
123
|
|
|
// metadata <website_url>: write the title, keywords and description of every sitemap page to
// runtime/<host>-metadata.csv, visiting pages in natural path order, parents before children.
$app->command('metadata website_url', function ($website_url, OutputInterface $output) {
    /**
     * Depth-first walk over the path tree. A branch's own page (stored under '_self') is
     * reported before the branch's children.
     *
     * Kept as a closure rather than a nested named function, so running the command twice in
     * one process cannot fail with "cannot redeclare function".
     *
     * @param array    $tree     branch name => branch
     * @param callable $function invoked as $function($self, $level, $path) for every page
     * @param int      $level    depth of $tree below the root
     * @param string   $path     slash-joined branch names leading to $tree
     */
    $walker = function ($tree, $function, $level = 0, $path = '') use (&$walker) {
        foreach ($tree as $branchName => $branch) {
            if (isset($branch['_self'])) {
                $function($branch['_self'], $level, $path);
                unset($branch['_self']);
            }
            if (count($branch) > 0) {
                $walker($branch, $function, $level + 1, "$path/$branchName");
            }
        }
    };

    $output->writeln(" Getting sitemap of $website_url ...");
    $sitemap = simplexml_load_file($website_url . '/sitemap.xml');
    if ($sitemap === false) {
        // Without this check the loop below would iterate over false and produce an empty report.
        throw new RuntimeException("Can't get $website_url/sitemap.xml");
    }

    $resultFile = __DIR__ . '/runtime/' . parse_url($website_url, PHP_URL_HOST) . '-metadata.csv';

    $paths = [];
    foreach ($sitemap as $url) {
        // parse_url() yields null for a URL without a path; treat that as the home page.
        $paths[] = (string) parse_url((string) $url->loc, PHP_URL_PATH);
    }
    natsort($paths);

    // Build a tree keyed by path segment; a node's '_self' entry marks a real page.
    $tree = [];
    $pageCount = 0;
    foreach ($paths as $path) {
        // Compare against '' instead of using empty(): a segment "0" is a legitimate name.
        $segments = array_values(array_filter(explode('/', $path), function ($segment) {
            return $segment !== '';
        }));
        // The home page has no segments at all; file it under a synthetic '/' branch.
        if (count($segments) === 0) {
            $segments = ['/'];
        }

        $node = &$tree;
        foreach ($segments as $segment) {
            if (!isset($node[$segment])) {
                $node[$segment] = [];
            }
            $node = &$node[$segment];
        }
        // Mark the deepest node as a page. Doing this after the descent (rather than on the
        // last exploded element) keeps pages whose path ends in "/". When "/a" and "/a/" are
        // both listed they share a node and the first one in sort order is kept.
        if (!isset($node['_self'])) {
            $node['_self'] = ['path' => $path];
            $pageCount++;
        }
        unset($node);
    }

    file_put_contents($resultFile, 'URL, Title, Keywords, Description, "Build Time: ' . date('r') . '"' . PHP_EOL);

    $patterns = [
        '~<title>(.*?)</title>~',
        '~<meta name="keywords" content="(.*?)">~',
        '~<meta name="description" content="(.*?)">~',
    ];
    $previous = null;
    // Total is the number of pages that will actually be visited, so the bar can complete.
    $progress = new \cli\progress\Bar(' Getting meta data', $pageCount, 1000);

    $walker($tree, function ($self) use ($website_url, &$previous, $resultFile, $progress, $patterns) {
        $pageUrl = $website_url . $self['path'];
        $page = file_get_contents($pageUrl);
        if ($page === false) {
            // Unreachable page: still emit a row, with empty metadata.
            $page = '';
        }

        $data = [];
        foreach ($patterns as $pattern) {
            $data[] = preg_match($pattern, $page, $match) === 1 ? $match[1] : '';
        }

        // Metadata identical to the last distinct page is replaced by a ditto marker.
        if ($data === $previous) {
            $data = array_fill(0, count($data), '--//--');
        } else {
            $previous = $data;
        }

        // Quote every field and double embedded quotes so the row stays valid CSV.
        $fields = array_map(function ($field) {
            return '"' . str_replace('"', '""', $field) . '"';
        }, array_merge([$pageUrl], $data));

        file_put_contents($resultFile, implode(',', $fields) . PHP_EOL, FILE_APPEND);
        $progress->tick();
    });

    $progress->finish();
});

/** @noinspection PhpUnhandledExceptionInspection */
$app->run();
PSR-1 (the Basic Coding Standard) recommends that a file either declare new symbols (classes, functions, constants and the like) or cause side effects, but not both. A side effect is anything that executes logic not directly related to declaring symbols, for example printing output, changing ini settings or writing to a file.
The idea behind this recommendation is that merely auto-loading a class should not change the state of an application. It also promotes a cleaner style of programming and makes your code less prone to errors, because the logic is not spread out all over the place.
To learn more about PSR-1, see the PSR-1 page on the PHP-FIG site.