Test Failed
Push — main ( 9f2520...b3e801 )
by Will
13:09
created

crawlers   A

Complexity

Total Complexity 11

Size/Duplication

Total Lines 387
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 4
Bugs 0 Features 0
Metric Value
eloc 301
c 4
b 0
f 0
dl 0
loc 387
ccs 366
cts 366
cp 1
rs 10
wmc 11

2 Methods

Rating   Name   Duplication   Size   Complexity  
B get() 0 179 2
C getApp() 0 192 9
1
<?php
2
declare(strict_types = 1);
3
namespace hexydec\agentzero;
4
5
class crawlers {

	/**
	 * Per-application overrides, keyed by the lowercased application name (the part of the token before the first "/").
	 * The matching entry is merged last in getApp(), so its values win over both the generated defaults and the caller's $data.
	 *
	 * NOTE(review): the "[email protected]" portion of the NCSC key looks like an email-obfuscation placeholder rather
	 * than a real address - confirm against the upstream source before relying on it.
	 *
	 * @var array<string,array<string,string>>
	 */
	private const OVERRIDES = [
		'yacybot' => ['category' => 'search', 'app' => 'YacyBot'],
		'googlebot' => ['category' => 'search', 'app' => 'GoogleBot'],
		'googlebot-mobile' => ['category' => 'search', 'app' => 'GoogleBot'],
		'googlebot-image' => ['category' => 'search', 'app' => 'GoogleBot'],
		'googlebot-video' => ['category' => 'search', 'app' => 'GoogleBot'],
		'googlebot-news' => ['category' => 'search', 'app' => 'GoogleBot'],
		'storebot-google' => ['category' => 'search', 'app' => 'GoogleBot'],
		'adsbot-google' => ['category' => 'ads', 'app' => 'GoogleBot'],
		'adsbot-google-mobile' => ['category' => 'ads', 'app' => 'GoogleBot'],
		'mediapartners-google' => ['category' => 'ads', 'app' => 'GoogleBot'],
		'bingbot' => ['category' => 'search', 'app' => 'BingBot'],
		'adidxbot' => ['category' => 'ads', 'app' => 'AdidxBot'],
		'duckduckbot' => ['category' => 'search', 'app' => 'DuckDuckBot'],
		'duckduckgo-favicons-bot' => ['category' => 'search', 'app' => 'DuckDuckBot'],
		'coccocbot-image' => ['category' => 'search', 'app' => 'CoccocBot'],
		'coccocbot-web' => ['category' => 'search', 'app' => 'CoccocBot'],
		'applebot' => ['category' => 'search', 'app' => 'AppleBot'],
		'yandexbot' => ['category' => 'search'],
		'mj12bot' => ['category' => 'search', 'app' => 'Majestic 12 Bot'],
		'mail.ru_bot' => ['category' => 'search', 'app' => 'Mail.ru Bot'],
		'exabot' => ['category' => 'search', 'app' => 'ExaBot'],
		'uptimerobot' => ['category' => 'monitor'],
		'petalbot' => ['category' => 'search'],
		'twitterbot' => ['category' => 'feed', 'app' => 'TwitterBot'],
		'xbot' => ['category' => 'feed'],
		'discordbot' => ['category' => 'feed', 'app' => 'DiscordBot'],
		'sematextsyntheticsrobot' => ['category' => 'monitor', 'app' => 'Sematext Synthetics Robot'],
		'linkedinbot' => ['category' => 'feed'],
		'paperlibot' => ['category' => 'feed'],
		'bitlybot' => ['category' => 'feed', 'app' => 'Bit.ly Bot'],
		'tineye-bot' => ['category' => 'search', 'app' => 'TinEye Bot'],
		'pinterestbot' => ['category' => 'feed', 'app' => 'PinterestBot'],
		'webcrawler' => ['category' => 'crawler'],
		'webprosbot' => ['category' => 'crawler', 'app' => 'WebprosBot'],
		'guzzlehttp' => ['category' => 'scraper'],
		'telegrambot' => ['category' => 'feed'],
		'semrushbot' => ['category' => 'crawler'],
		'mediatoolkitbot' => ['category' => 'crawler', 'app' => 'MediaToolkitBot'],
		'iploggerbot' => ['category' => 'monitor'],
		'cfnetwork' => ['category' => 'feed', 'app' => 'Apple Core Foundation Network'],
		'ncsc web check [email protected]' => ['category' => 'monitor', 'app' => 'NCSC Web Check'],
		'google-site-verification' => ['category' => 'validator', 'app' => 'Google Site Verification'],
		'google-inspectiontool' => ['category' => 'validator', 'app' => 'Google Inspection Tool'],
		'pingdomtms' => ['category' => 'monitor', 'app' => 'Pingdom.com'],
		'facebookexternalhit' => ['category' => 'feed', 'app' => 'Facebook URL Preview'],
		'phxbot' => ['app' => 'ProtonMail Bot']
	];

	/**
	 * Extracts application and version information from a token
	 *
	 * The token is split on its first "/" into a name and a version. The version has any leading "v" characters
	 * removed and is then cut down to its leading run of digits and dots. The result is built from generated
	 * defaults, then $data, then any matching OVERRIDES entry, with later sources replacing earlier ones.
	 *
	 * @param string $value The token to be processed
	 * @param array<string|null> $data An array containing existing data to merge
	 * @return array<string|int|float|null> The $data array with the processed application and version added, or an empty array if the token is skipped
	 */
	public static function getApp(string $value, array $data = []) : array {

		// skip tokens containing "://" (the bot will be in the URL), tokens starting "Chrome/", and the literal names "Cubot" and "Power bot" in any letter case
		if (\str_contains($value, '://') || \str_starts_with($value, 'Chrome/') || \strcasecmp('Cubot', $value) === 0 || \strcasecmp('Power bot', $value) === 0) {
			return [];
		}
		$parts = \explode('/', $value, 2);
		$name = $parts[0];

		// process version
		$version = null;
		if (!empty($parts[1])) {
			$version = \ltrim($parts[1], 'v');
			$version = \substr($version, 0, \strspn($version, '0123456789.'));
		}

		// default category is derived from the whole token: "crawler" if it mentions "crawler" or "bot", otherwise "scraper"
		$isCrawler = \mb_stripos($value, 'crawler') !== false || \mb_stripos($value, 'bot') !== false;
		return \array_merge([
			'type' => 'robot',
			'category' => $isCrawler ? 'crawler' : 'scraper',
			'app' => $name,
			'appname' => $name,
			'appversion' => empty($version) ? null : $version // empty() also turns a version of "0" into null
		], $data, self::OVERRIDES[\mb_strtolower($name)] ?? []);
	}

	/**
	 * Generates a configuration array for matching crawlers
	 *
	 * Most entries delegate to getApp() through a small set of shared closures that preset the category;
	 * a few entries build their result array directly.
	 *
	 * @return array<string,props> An array with keys representing the string to match, and values a props object defining how to generate the match and which properties to set
	 */
	public static function get() : array {

		// shared callbacks, keyed by the category they preset ("map" presets nothing and relies on getApp()'s defaults and overrides)
		$fn = [
			'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
			'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
			'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),
			'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
				\str_contains($value, 'WhatsApp/') ? [
					'app' => 'WhatsApp'
				] : [],
				[
					'category' => 'feed'
				]
			)),
			'crawler' => function (string $value) : array {

				// these spiders are categorised as search engines, any other name stays a generic crawler
				$parts = \explode('/', $value, 2);
				$map = [
					'baiduspider' => 'search',
					'haosouspider' => 'search',
					'yisouspider' => 'search',
					'360spider' => 'search',
					'sogou web spider' => 'search',
					'bytespider' => 'search',
				];
				return self::getApp($value, [
					'category' => $map[\mb_strtolower($parts[0])] ?? 'crawler'
				]);
			},
			'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
			'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
			'map' => fn (string $value) : array => self::getApp($value) // getApp() never returns null, so the return type matches the other callbacks
		];
		return [
			'Yahoo! Slurp' => new props('exact', $fn['search']),
			'facebookexternalhit/' => new props('start', $fn['map']),
			'Google-Site-Verification/' => new props('start', $fn['map']),
			'Google-InspectionTool/' => new props('start', $fn['map']),
			'Mediapartners-Google' => new props('start', $fn['search']),
			'FeedFetcher-Google' => new props('exact', $fn['feed']),
			'GoogleProducer' => new props('exact', $fn['feed']),
			'Google-adstxt' => new props('exact', $fn['ads']),
			'CFNetwork/' => new props('start', $fn['map']),
			'Siteimprove.com' => new props('any', $fn['crawler']),
			'CyotekWebCopy' => new props('start', $fn['scraper']),
			'Google Page Speed Insights' => new props('exact', $fn['validator']),
			'Qwantify' => new props('start', function (string $value) : array {
				$parts = \explode('/', $value, 3);
				return [
					'type' => 'robot',
					'category' => 'search',
					'app' => 'Qwant Web Crawler',
					'appname' => $parts[0],
					'appversion' => $parts[1] ?? null
				];
			}),
			'okhttp' => new props('start', $fn['scraper']),
			'python' => new props('start', $fn['scraper']),
			'jsdom/' => new props('start', $fn['scraper']),
			'Nessus' => new props('start', $fn['monitor']),
			'Chrome-Lighthouse' => new props('start', $fn['validator']),
			'Siege/' => new props('start', $fn['validator']),
			'Microsoft Profiling/' => new props('any', $fn['validator']),
			'Bidtellect' => new props('start', $fn['crawler']),
			'magpie-crawler/' => new props('start', $fn['crawler']),
			'PingdomTMS/' => new props('start', $fn['map']),
			'DynGate' => new props('exact', $fn['monitor']),
			'Datadog/Synthetics' => new props('exact', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Datadog/Synthetics'
			]),
			'RuxitSynthetic/' => new props('start', $fn['monitor']),
			'Checkly/' => new props('start', $fn['monitor']),
			'Uptime/' => new props('start', $fn['monitor']),
			'HostTracker/' => new props('start', $fn['monitor']),
			// NOTE(review): "[email protected]" looks like an email-obfuscation placeholder - confirm the real string upstream
			'NCSC Web Check [email protected]' => new props('exact', $fn['map']),
			'Pingdom.com' => new props('start', function (string $value) : array {

				// the version is whatever follows the last underscore once surrounding underscores are trimmed
				$version = \explode('_', \trim($value, '_'));
				return [
					'type' => 'robot',
					'category' => 'monitor',
					'app' => 'Pingdom.com',
					'appname' => \trim($value, '_'),
					'appversion' => \end($version)
				];
			}),
			'proximic' => new props('exact', $fn['ads']),
			'WordPress' => new props('start', $fn['monitor']),
			'PRTG Network Monitor' => new props('exact', $fn['monitor']),
			'PRTGCloudBot/' => new props('start', $fn['monitor']),
			'Site24x7' => new props('exact', $fn['monitor']),
			'StatusCake' => new props('exact', $fn['monitor']),
			'adbeat.com' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ads',
				'app' => 'Adbeat',
				'appname' => 'Adbeat',
				'url' => 'https://'.$value
			]),
			'MicrosoftPreview/' => new props('start', $fn['feed']),
			'YahooMailProxy' => new props('exact', $fn['feed']),
			'PhxBot/' => new props('start', $fn['feed']), // proton mail
			'Pleroma' => new props('start', fn (string $value) : array => [ // mastodon
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Mastodon',
				'appname' => 'Pleroma',
				'appversion' => \mb_substr($value, 8) // skips the 7 character name plus one separator character
			]),
			'Outlook-Android/' => new props('start', fn (string $value) : array => [ // Outlook on Android
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-Android',
				'platform' => 'Android',
				'appversion' => \mb_substr($value, 16) // text after the 16 character "Outlook-Android/" prefix
			]),
			'Outlook-iOS/' => new props('start', fn (string $value, int $i, array $tokens) : array => [ // Outlook on iOS
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'Outlook-iOS',
				'platform' => 'iOS',
				'appversion' => $tokens[$i+1] ?? \mb_substr($value, 12) // prefer the next token, else the text after the 12 character prefix
			]),
			'OutlookMobileCloudService-Autodetect/' => new props('start', fn (string $value) : array => [ // Outlook mobile autodetect service
				'type' => 'robot',
				'category' => 'feed',
				'app' => 'Outlook',
				'appname' => 'OutlookMobileCloudService-Autodetect',
				'appversion' => \mb_substr($value, 37) // text after the 37 character prefix
			]),
			'Chrome Privacy Preserving Prefetch Proxy' => new props('exact', $fn['feed']),
			'ViberUrlDownloader' => new props('exact', $fn['feed']),
			'Google-Lens' => new props('exact', $fn['feed']),
			'ManicTime/' => new props('start', $fn['feed']),
			'Yik Yak/' => new props('start', $fn['feed']),
			'HubSpot-Link-Resolver' => new props('exact', $fn['feed']),
			'W3C-checklink/' => new props('start', $fn['validator']),
			'CSSCheck/' => new props('start', $fn['validator']),
			'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),
			// NOTE(review): this is the only slash-terminated key that uses 'exact' - confirm 'start' was not intended
			'SEO-Macroscope/' => new props('exact', $fn['validator']),
			'Electronic Frontier Foundation\'s Do Not Track Verifier' => new props('exact', $fn['validator']),
			'Expanse' => new props('start', $fn['crawler']),
			'eCairn-Grabber/' => new props('start', $fn['scraper']),
			'SEOkicks' => new props('exact', $fn['crawler']),
			'PostmanRuntime/' => new props('start', $fn['scraper']),
			'axios/' => new props('start', $fn['scraper']),
			'Rogerbot/' => new props('start', $fn['crawler']),
			'DashLinkPreviews/' => new props('start', $fn['feed']),
			'Snapchat/' => new props('start', $fn['feed']),
			'HTTPClient/' => new props('start', $fn['scraper']),
			'WhatsApp/' => new props('any', $fn['feed']),
			'Hootsuite-Authoring/' => new props('start', $fn['feed']),
			'Snap URL Preview Service' => new props('exact', $fn['feed']),
			'ApacheBench/' => new props('start', $fn['validator']),
			'Asana/' => new props('start', $fn['feed']),
			'Java/' => new props('start', $fn['scraper']),
			'curl/' => new props('any', $fn['scraper']),
			'Wget/' => new props('start', $fn['scraper']),
			'rest-client/' => new props('start', $fn['scraper']),
			'ruby/' => new props('start', $fn['scraper']),
			'Bun/' => new props('start', $fn['scraper']),
			'CakePHP' => new props('start', $fn['scraper']),
			'cpp-httplib/' => new props('start', $fn['scraper']),
			'Dart/' => new props('start', $fn['scraper']),
			'Deno/' => new props('start', $fn['scraper']),
			'libwww-perl/' => new props('start', $fn['scraper']),
			'http/' => new props('start', $fn['scraper']),
			'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
			'http-client/' => new props('any', $fn['scraper']),
			'HttpClient/' => new props('any', $fn['scraper']),
			'Validator' => new props('any', $fn['validator']),
			'feed' => new props('any', $fn['feed']),
			'spider' => new props('any', $fn['crawler']),
			'crawler' => new props('any', $fn['map']),
			'bot/' => new props('any', $fn['map']),
			'bot-' => new props('any', $fn['map']),
			' bot ' => new props('any', $fn['map']),
			'bot' => new props('end', $fn['map'])
		];
	}
}