Issues (9)

sp.php (5 issues)

1
<?php
2
require_once __DIR__ . '/config.php';
3
4
use Symfony\Component\Console\Helper\ProgressBar;
5
use Symfony\Component\Console\Output\OutputInterface;
6
7
// Process ID of this script; runProcesses() passes it to every background worker through the -p option.
define('PID', getmypid());
8
/**
 * Distributes pages between background workers and shows their progress.
 *
 * Recreates the `pages` table in the DSN database, splits $pages into
 * PROCESSES contiguous slices, stores each slice tagged with its slice index
 * in the `scope` column and starts one `_process.php` worker per slice.
 * Afterwards it polls the table every half a second and redraws a progress bar
 * with a per-status breakdown until the number of rows with a non-NULL status
 * reaches count($pages).
 *
 * NOTE(review): the polling loop has no timeout; it presumably relies on the
 * workers writing a status for every row - confirm against _process.php.
 *
 * @param array $pages URLs to process
 * @param string $task task name handed to every worker through the -t option
 * @param OutputInterface $output console output the progress bar is drawn on
 * @throws RuntimeException when the INSERT statement cannot be prepared
 */
function runProcesses($pages, $task, &$output)
{
    $pdo = new PDO(DSN);
    $pdo->query('DROP TABLE IF EXISTS pages');
    $pdo->query('CREATE TABLE pages (url TEXT NOT NULL, status TEXT, scope INTEGER NOT NULL)');

    $total = count($pages);
    // ceil() returns a float, while array_slice() expects integer offset and length.
    $sliceSize = (int) ceil($total / PROCESSES);
    $cPath = __DIR__ . '/_process.php';

    for ($i = 0; $i < PROCESSES; $i++) {
        $slice = array_slice($pages, $i * $sliceSize, $sliceSize);

        // Insert at most 999 rows per statement; each row binds exactly one parameter.
        // Presumably 999 matches SQLite's default bound-parameter limit - confirm.
        foreach (array_chunk($slice, 999) as $part) {
            $values = implode(',', array_fill(0, count($part), "(?,$i)"));
            $stmt = $pdo->prepare("INSERT INTO pages (url, scope) VALUES $values");
            if (!$stmt) {
                // Continuing would call execute() on false and, were that survived,
                // leave the polling loop below waiting for rows that were never stored.
                throw new RuntimeException("Can't prepare INSERT INTO pages: " . $pdo->errorInfo()[2]);
            }
            $stmt->execute($part);
            unset($stmt);
        }

        execInBackground("php $cPath -s $i -p " . PID . " -t $task");
    }

    $done = 0;
    $progress = new ProgressBar($output, $total);
    $progress->setRedrawFrequency(10);
    $progress->setFormatDefinition('custom', ' %current%/%max% %bar% %message%');
    $progress->setFormat('custom');
    $progress->setMessage('');
    $progress->start();
    while ($done < $total) {
        // fetchColumn() yields a string; the progress bar works with integers.
        $done = (int) $pdo->query('SELECT COUNT(status) FROM pages WHERE status IS NOT NULL')->fetchColumn();
        $codes = $pdo->query('SELECT status, COUNT(status) AS count FROM pages WHERE status IS NOT NULL GROUP BY status')->fetchAll(PDO::FETCH_ASSOC);
        $message = [];
        foreach ($codes as $code) {
            // Show "200" as "ok"; every other status is printed as stored.
            $message[] = str_replace('200', 'ok', $code['status']) . ': ' . $code['count'];
        }
        $progress->setMessage(implode(' ', $message));
        $progress->setProgress($done);
        usleep(500000);
    }
    $progress->finish();
    $output->writeln('');
}
59
60
/**
 * Loads <website_url>/sitemap.xml and returns the URLs it lists.
 *
 * @param string $website_url base URL of the site; trailing slashes are ignored
 * @param OutputInterface $output console output that receives the status line
 * @return string[] URL-decoded text of the <loc> child of every top-level sitemap entry
 * @throws Exception when the sitemap cannot be loaded or parsed
 */
function getPages($website_url, &$output)
{
    $output->writeln("  Getting sitemap of $website_url ...");
    // Avoid "//sitemap.xml" when the base URL already ends with a slash.
    $sitemapUrl = rtrim($website_url, '/') . '/sitemap.xml';

    // Keep libxml's own diagnostics out of the output; a failure is reported
    // through the exception below instead.
    $previousSetting = libxml_use_internal_errors(true);
    $sitemap = simplexml_load_file($sitemapUrl);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if ($sitemap === false) {
        throw new Exception("Can't get $sitemapUrl");
    }

    $pages = [];
    foreach ($sitemap as $url) {
        // $url->loc is a SimpleXMLElement; urldecode() needs its text content.
        $pages[] = urldecode((string) $url->loc);
    }
    return $pages;
}
81
82
/**
 * Starts a shell command without waiting for it to finish.
 *
 * On Windows the command is launched through `start /B`; on other systems it
 * is run with a trailing `&` and both output streams redirected to /dev/null.
 *
 * @param string $cmd complete command line; it is handed to the shell as is,
 *                    so the caller is responsible for quoting its arguments
 * @throws RuntimeException when the Windows launcher process cannot be opened
 */
function execInBackground($cmd)
{
    if (substr(php_uname(), 0, 7) === 'Windows') {
        $handle = popen('start /B ' . $cmd, 'r');
        if ($handle === false) {
            // popen() returns false on failure, which pclose() does not accept.
            throw new RuntimeException("Can't start background command: $cmd");
        }
        pclose($handle);
        return;
    }

    exec($cmd . ' > /dev/null 2>/dev/null &');
}
93
94
/**
 * Copies the url/status pairs of the `pages` table into the per-domain report.
 *
 * The report is the SQLite file reports/<domain>.sqlite3 next to this script;
 * its `check` table is dropped and recreated on every call.
 *
 * @param string $domain used as the report file name
 * @param OutputInterface $output console output for the status messages
 */
function saveCheckReport($domain, &$output)
{
    $output->write('  Saving report...');

    $source = new PDO(DSN);
    $reportFile = __DIR__ . '/reports/' . $domain . '.sqlite3';
    $target = new PDO('sqlite:' . $reportFile);

    $schema = [
        'DROP TABLE IF EXISTS `check`',
        'CREATE TABLE `check` (url TEXT NOT NULL, status TEXT NOT NULL)',
    ];
    foreach ($schema as $ddl) {
        $target->query($ddl);
    }

    $insert = $target->prepare('INSERT INTO `check` (url, status) VALUES (:url, :status)');
    $rows = $source->query('SELECT url, status FROM pages')->fetchAll(PDO::FETCH_ASSOC);
    for ($n = 0, $rowCount = count($rows); $n < $rowCount; $n++) {
        // The row keys (url, status) match the named placeholders of the INSERT.
        $insert->execute($rows[$n]);
    }

    $output->writeln('  ok');
}
112
113
/**
 * Writes the link-check results into the per-domain report.
 *
 * For every url/status pair in the `pages` table the first matching row of the
 * `links` table (same DSN database) supplies `page` and `link`; the combined
 * record goes into the `links` table of reports/<domain>.sqlite3, which is
 * dropped and recreated on every call.
 *
 * @param string $domain used as the report file name
 * @param OutputInterface $output console output for the status messages
 */
function saveLinksReport($domain, &$output)
{
    $output->write('  Saving report...');
    $result = new PDO(DSN);
    $report = new PDO('sqlite:' . __DIR__ . "/reports/{$domain}.sqlite3");
    $report->query('DROP TABLE IF EXISTS links');
    $report->query('CREATE TABLE links (page TEXT NOT NULL, link TEXT NOT NULL, url TEXT NOT NULL, status TEXT NOT NULL )');
    $insert = $report->prepare('INSERT INTO links (page, link, url, status) VALUES (:page, :link, :url, :status)');
    // The lookup text does not depend on the row, so it is prepared once.
    $lookup = $result->prepare('SELECT page, link FROM links WHERE url=:url');

    // FETCH_ASSOC keeps numeric column indexes out of the named-parameter array built below.
    $checks = $result->query('SELECT url, status FROM pages')->fetchAll(PDO::FETCH_ASSOC);
    foreach ($checks as $check) {
        $lookup->execute([':url' => $check['url']]);
        $origin = $lookup->fetch(PDO::FETCH_ASSOC);
        $lookup->closeCursor();
        if ($origin === false) {
            // No links row for this URL: page and link are NOT NULL in the report, so skip it.
            continue;
        }
        $insert->execute(array_merge($check, $origin));
    }
    $output->writeln('  ok');
}
134
135
// Make sure the directory the report files are written to exists.
$reportsDir = __DIR__ . '/reports';
if (file_exists($reportsDir) === false) {
    mkdir($reportsDir);
}

// Console application the commands below are registered on.
$app = new Silly\Application();
140
141
// `check <website_url>`: read the sitemap, let the workers run the "check" task
// on every page and store the result in the report named after the URL's host.
$app->command('check website_url', function ($website_url, OutputInterface $output) {
    try {
        $sitemapUrls = getPages($website_url, $output);
    } catch (Exception $e) {
        // Print the failure and leave without touching any report.
        $output->writeln($e->getMessage());
        return;
    }

    runProcesses($sitemapUrls, 'check', $output);

    $host = parse_url($website_url, PHP_URL_HOST);
    saveCheckReport($host, $output);
});
152
153
// `links <website_url>`: run the "parse-links" task on every sitemap page, save
// the page report, then run the "check" task on every URL found in the `links`
// table and save the links report.
$app->command('links website_url', function ($website_url, OutputInterface $output) {
    try {
        $sitemapUrls = getPages($website_url, $output);
    } catch (Exception $e) {
        $output->writeln($e->getMessage());
        return;
    }

    // Start with an empty links table that holds each url only once.
    $db = new PDO(DSN);
    $schema = [
        'DROP TABLE IF EXISTS links',
        'CREATE TABLE links (page TEXT NOT NULL, link TEXT NOT NULL, url TEXT NOT NULL)',
        'CREATE UNIQUE INDEX links_url_uindex ON links (url);',
    ];
    foreach ($schema as $sql) {
        $db->query($sql);
    }
    // Drop the connection before the workers are started.
    $db = null;

    runProcesses($sitemapUrls, 'parse-links', $output);

    $host = parse_url($website_url, PHP_URL_HOST);
    saveCheckReport($host, $output);

    $db = new PDO(DSN);
    $foundUrls = $db->query('SELECT url FROM links')->fetchAll(PDO::FETCH_COLUMN);
    $db = null;

    $output->writeln('  Checking founded links...');

    runProcesses($foundUrls, 'check', $output);

    saveLinksReport($host, $output);
});
182
183
// `metadata <website_url>`: download every page listed in the sitemap and write
// its title, keywords and description to runtime/<host>-metadata.csv.
$app->command('metadata website_url', function ($website_url, OutputInterface $output) {
    // Guarded so that a second invocation of this closure cannot redeclare the function.
    if (!function_exists('walker')) {
        /**
         * Walks the path tree depth-first and calls $function for every node
         * that carries a '_self' entry.
         *
         * @param array $tree nested arrays keyed by path segment
         * @param callable $function called as $function($self, $level, $path)
         * @param int $level depth of $tree below the root
         * @param string $path slash-joined segment names leading to $tree
         */
        function walker($tree, $function, $level = 0, $path = '')
        {
            foreach ($tree as $branchName => $branch) {
                if (isset($branch['_self'])) {
                    $function($branch['_self'], $level, $path);
                    unset($branch['_self']);
                }
                if (count($branch) > 0) {
                    walker($branch, $function, $level + 1, "$path/$branchName");
                }
            }
        }
    }

    $output->writeln("  Getting sitemap of $website_url ...");
    $sitemap = simplexml_load_file($website_url . '/sitemap.xml');
    if ($sitemap === false) {
        // Same outcome as in the other commands: report the failure and stop.
        $output->writeln("Can't get $website_url/sitemap.xml");
        return;
    }

    // The result file lives in ./runtime, which nothing else in this script creates.
    $runtimeDir = __DIR__ . '/runtime';
    if (!is_dir($runtimeDir)) {
        mkdir($runtimeDir);
    }
    $resultFile = $runtimeDir . '/' . parse_url($website_url, PHP_URL_HOST) . '-metadata.csv';

    $paths = [];
    foreach ($sitemap as $url) {
        // parse_url() yields null for a URL without a path (the home page); keep it as ''.
        $paths[] = (string) parse_url(rtrim((string) $url->loc, '/'), PHP_URL_PATH);
    }
    natsort($paths);

    // Build a tree keyed by path segment; the node of a page's last segment gets a '_self' entry.
    $tree = [];
    foreach ($paths as $path) {
        $segments = explode('/', $path);
        $lastKey = count($segments) - 1;
        // The home page has no segments at all; it is stored under the key '/'.
        $isHomePage = implode('', $segments) === '';
        $node = &$tree;
        foreach ($segments as $key => $segment) {
            if ($segment === '') {
                if (!$isHomePage) {
                    continue;
                }
                $segment = '/';
            }
            // Compared with '' rather than empty(), so that a segment "0" is kept.
            if (!isset($node[$segment])) {
                $node[$segment] = [];
            }

            if ($key === $lastKey) {
                $node[$segment]['_self'] = ['path' => $path];
            }

            $node = &$node[$segment];
        }
    }
    unset($node);

    file_put_contents($resultFile, 'URL, Title, Keywords, Description, "Build Time: ' . date('r') . '"' . PHP_EOL);
    $previous = null;
    $progress = new \cli\progress\Bar(' Getting meta data', count($paths), 1000);
    walker($tree, function ($self) use ($website_url, &$previous, $resultFile, $progress) {
        $pageUrl = $website_url . $self['path'];
        $page = file_get_contents($pageUrl);
        if ($page === false) {
            // An unreachable page still gets a row, with empty metadata.
            $page = '';
        }

        // NOTE(review): the patterns are case-sensitive, single-line and expect this exact attribute order.
        $patterns = [
            '~<title>(.*?)</title>~',
            '~<meta name="keywords" content="(.*?)">~',
            '~<meta name="description" content="(.*?)">~',
        ];
        $data = [];
        foreach ($patterns as $pattern) {
            $data[] = preg_match($pattern, $page, $match) ? $match[1] : '';
        }

        // Metadata identical to the previous page is replaced with ditto marks.
        if ($data === $previous) {
            $data = array_fill(0, count($data), '--//--');
        } else {
            $previous = $data;
        }

        $cells = array_map(function ($cell) {
            // Double embedded quotes so the cell stays a single CSV field.
            return '"' . str_replace('"', '""', $cell) . '"';
        }, array_merge([$pageUrl], $data));

        file_put_contents($resultFile, implode(',', $cells) . PHP_EOL, FILE_APPEND);
        $progress->tick();
    });

    $progress->finish();
});
271
272
/** @noinspection PhpUnhandledExceptionInspection */
273
// Hand control to the console application; presumably it dispatches to the command named on the command line.
$app->run();
274