Passed
Push — main ( 0b3ef0...fa30e1 )
by Will
02:47
created

crawlers   A

Complexity

Total Complexity 11

Size/Duplication

Total Lines 345
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 3
Bugs 0 Features 0
Metric Value
wmc 11
eloc 263
c 3
b 0
f 0
dl 0
loc 345
ccs 324
cts 324
cp 1
rs 10

2 Methods

Rating   Name   Duplication   Size   Complexity  
B get() 0 312 6
A getApp() 0 17 5
1
<?php
2
declare(strict_types = 1);
3
namespace hexydec\agentzero;
4
5
/**
 * Crawler/robot detection rules: maps user-agent tokens to robot metadata
 * (type, category, application name and version).
 *
 * @phpstan-import-type props from config
 */
class crawlers {

	/**
	 * Extracts application and version information from a token
	 *
	 * The token is split on the first "/" into a name and an optional version. A leading
	 * "v" is stripped from the version and only the leading run of digits and dots is kept.
	 * Tokens containing "://" or starting with "Chrome/" are rejected with an empty array.
	 *
	 * @param string $value The token to be processed
	 * @param array<string|null> $data An array containing existing data to merge, its keys take precedence over the generated ones
	 * @return array<string|int|float|null> The $data array with the processed application and version added, or an empty array if the token is not an application token
	 */
	public static function getApp(string $value, array $data = []) : array {

		// bot will be in the URL, or the token is a browser engine rather than an app
		if (\str_contains($value, '://') || \str_starts_with($value, 'Chrome/')) {
			return [];
		}
		$parts = \explode('/', $value, 2);

		// process version - compare against '' rather than using empty(), so that a version of "0" is retained
		$version = null;
		if (isset($parts[1]) && $parts[1] !== '') {
			$trimmed = \ltrim($parts[1], 'v');
			$numeric = \substr($trimmed, 0, \strspn($trimmed, '0123456789.'));
			if ($numeric !== '') {
				$version = $numeric;
			}
		}

		// values in $data override the defaults generated from the token
		return \array_merge([
			'type' => 'robot',
			'app' => $parts[0],
			'appname' => $parts[0],
			'appversion' => $version
		], $data);
	}

	/**
	 * Generates a configuration array for matching crawlers
	 *
	 * @return array<string,\hexydec\agentzero\props> An array with keys representing the string to match, and a value of a props object containing parsing and output settings
	 */
	public static function get() : array {

		// reusable callbacks, each turns a matched token into robot metadata with a fixed or looked-up category
		$fn = [
			'search' => fn (string $value) : array => self::getApp($value, ['category' => 'search']),
			'ads' => fn (string $value) : array => self::getApp($value, ['category' => 'ads']),
			'validator' => fn (string $value) : array => self::getApp($value, ['category' => 'validator']),
			'feed' => fn (string $value) : array => self::getApp($value, \array_merge(
				\str_contains($value, 'WhatsApp/') ? [
					'app' => 'WhatsApp'
				] : [],
				[
					'category' => 'feed'
				]
			)),
			'crawler' => function (string $value) : array {
				$parts = \explode('/', $value, 2);

				// spiders that are search engines, keyed by lowercased name, anything else is a generic crawler
				$map = [
					'baiduspider' => 'search',
					'haosouspider' => 'search',
					'yisouspider' => 'search',
					'360spider' => 'search',
					'sogou web spider' => 'search',
					'bytespider' => 'search',
				];
				return self::getApp($value, [
					'category' => $map[\mb_strtolower($parts[0])] ?? 'crawler'
				]);
			},
			'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
			'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
			'map' => function (string $value) : ?array {
				if (!\str_contains($value, '://') && \strcasecmp('Cubot', $value) !== 0 && \strcasecmp('Power bot', $value) !== 0) { // bot will be in the URL
					$parts = \explode('/', $value, 2);

					// known bots keyed by their exact (case-sensitive) token name, with category and optional display name
					$category = [
						'yacybot' => [
							'category' => 'search',
							'app' => 'YacyBot'
						],
						'Googlebot' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-Mobile' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-Image' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-Video' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Googlebot-News' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'Storebot-Google' => [
							'category' => 'search',
							'app' => 'GoogleBot'
						],
						'AdsBot-Google' => [
							'category' => 'ads',
							'app' => 'GoogleBot'
						],
						'AdsBot-Google-Mobile' => [
							'category' => 'ads',
							'app' => 'GoogleBot'
						],
						'Bingbot' => [
							'category' => 'search',
							'app' => 'BingBot'
						],
						'bingbot' => [
							'category' => 'search',
							'app' => 'BingBot'
						],
						'adidxbot' => [
							'category' => 'ads',
							'app' => 'AdidxBot'
						],
						'DuckDuckBot' => [
							'category' => 'search',
							'app' => 'DuckDuckBot'
						],
						'DuckDuckGo-Favicons-Bot' => [
							'category' => 'search',
							'app' => 'DuckDuckBot'
						],
						'coccocbot-image' => [
							'category' => 'search',
							'app' => 'CoccocBot'
						],
						'coccocbot-web' => [
							'category' => 'search',
							'app' => 'CoccocBot'
						],
						'Applebot' => [
							'category' => 'search',
							'app' => 'AppleBot'
						],
						'YandexBot' => [
							'category' => 'search'
						],
						'MJ12bot' => [
							'category' => 'search',
							'app' => 'MJ12 Bot'
						],
						'Mail.RU_Bot' => [
							'category' => 'search',
							'app' => 'Mail.ru Bot'
						],
						'Exabot' => [
							'category' => 'search',
							'app' => 'ExaBot'
						],
						'UptimeRobot' => [
							'category' => 'monitor'
						],
						'PetalBot' => [
							'category' => 'search'
						],
						'Twitterbot' => [
							'category' => 'feed',
							'app' => 'TwitterBot'
						],
						'Xbot' => [
							'category' => 'feed'
						],
						'Discordbot' => [
							'category' => 'feed',
							'app' => 'DiscordBot'
						],
						'SematextSyntheticsRobot' => [
							'category' => 'monitor',
							'app' => 'Sematext Synthetics Robot'
						],
						'LinkedInBot' => [
							'category' => 'feed'
						],
						'PaperLiBot' => [
							'category' => 'feed'
						],
						'bitlybot' => [
							'category' => 'feed',
							'app' => 'Bit.ly Bot'
						],
						'TinEye-bot' => [
							'category' => 'search',
							'app' => 'TinEye Bot'
						],
						'Pinterestbot' => [
							'category' => 'feed',
							'app' => 'PinterestBot'
						],
						'WebCrawler' => [
							'category' => 'crawler'
						],
						'webprosbot' => [
							'category' => 'crawler',
							'app' => 'WebprosBot'
						],
						'GuzzleHttp' => [
							'category' => 'scraper'
						],
						'TelegramBot' => [
							'category' => 'feed'
						],
						'Ruby' => [
							'category' => 'scraper'
						],
						'SEMrushBot' => [
							'category' => 'crawler'
						],
						'Mediatoolkitbot' => [
							'category' => 'crawler',
							'app' => 'MediaToolkitBot'
						],
						'IPLoggerBot' => [
							'category' => 'monitor'
						],
						'CFNetwork' => [
							'category' => 'feed',
							'app' => 'Apple Core Foundation Network'
						],
						// NOTE(review): "[email protected]" looks like an obfuscated email placeholder rather than a real token - verify this key against the real user agent string
						'NCSC Web Check [email protected]' => [
							'category' => 'monitor',
							'app' => 'NCSC Web Check'
						],
						'Google-Site-Verification' => [
							'category' => 'validator',
							'app' => 'Google Site Verification'
						],
						'Google-InspectionTool' => [
							'category' => 'validator',
							'app' => 'Google Inspection Tool'
						],
						'PingdomTMS' => [
							'category' => 'monitor',
							'app' => 'Pingdom.com'
						],
						'facebookexternalhit' => [
							'category' => 'feed',
							'app' => 'Facebook URL Preview'
						]
					];

					// defaults derived from the token, overridden by any entry from the lookup table above
					return self::getApp($value, \array_merge([
						'category' => \mb_stripos($value, 'crawler') !== false ? 'crawler' : null,
						'app' => $parts[0],
						'appname' => $parts[0],
						], $category[$parts[0]] ?? []));
				}
				return null;
			}
		];

		// keys are the strings to match; the first props argument ('exact', 'start', 'any' or 'end') selects how the key is matched
		return [
			'Yahoo! Slurp' => new props('exact', $fn['search']),
			'facebookexternalhit/' => new props('start', $fn['map']),
			'Google-Site-Verification/' => new props('start', $fn['map']),
			'Google-InspectionTool/' => new props('start', $fn['map']),
			'Mediapartners-Google' => new props('start', $fn['search']),
			'FeedFetcher-Google' => new props('exact', $fn['feed']),
			'GoogleProducer' => new props('exact', $fn['feed']),
			'Google-adstxt' => new props('exact', $fn['ads']),
			'CFNetwork/' => new props('start', $fn['map']),
			'Siteimprove.com' => new props('any', $fn['crawler']),
			'CyotekWebCopy' => new props('start', $fn['scraper']),
			'Google Page Speed Insights' => new props('exact', $fn['validator']),
			'Qwantify' => new props('start', $fn['search']),
			'okhttp' => new props('start', $fn['scraper']),
			'python' => new props('start', $fn['scraper']),
			'jsdom/' => new props('start', $fn['scraper']),
			'Nessus' => new props('start', $fn['monitor']),
			'Chrome-Lighthouse' => new props('start', $fn['validator']),
			'Siege/' => new props('start', $fn['validator']),
			'PingdomTMS/' => new props('start', $fn['map']),
			'DynGate' => new props('exact', $fn['monitor']),
			'Datadog/Synthetics' => new props('exact', [
				'type' => 'robot',
				'category' => 'monitor',
				'app' => 'Datadog/Synthetics'
			]),
			'RuxitSynthetic/' => new props('start', $fn['monitor']),
			'Checkly/' => new props('start', $fn['monitor']),
			'Uptime/' => new props('start', $fn['monitor']),
			'HostTracker/' => new props('start', $fn['monitor']),
			// NOTE(review): "[email protected]" looks like an obfuscated email placeholder rather than a real token - verify this key against the real user agent string
			'NCSC Web Check [email protected]' => new props('exact', $fn['map']),
			'Pingdom.com' => new props('start', function (string $value) : array {

				// the version is taken as the last underscore-delimited segment of the token
				$version = \explode('_', \trim($value, '_'));
				return [
					'type' => 'robot',
					'category' => 'monitor',
					'app' => 'Pingdom.com',
					'appname' => \trim($value, '_'),
					'appversion' => \end($version)
				];
			}),
			'proximic' => new props('exact', $fn['ads']),
			'WordPress' => new props('start', $fn['monitor']),
			'PRTG Network Monitor' => new props('exact', $fn['monitor']),
			'PRTGCloudBot/' => new props('start', $fn['monitor']),
			'Site24x7' => new props('exact', $fn['monitor']),
			'StatusCake' => new props('exact', $fn['monitor']),
			'adbeat.com' => new props('start', fn (string $value) : array => [
				'type' => 'robot',
				'category' => 'ads',
				'app' => 'Adbeat',
				'appname' => 'Adbeat',
				'url' => 'https://'.$value
			]),
			'MicrosoftPreview/' => new props('start', $fn['feed']),
			'Let\'s Encrypt validation server' => new props('exact', $fn['validator']),
			'Expanse' => new props('start', $fn['crawler']),
			'Apache-HttpClient/' => new props('start', $fn['scraper']),
			'eCairn-Grabber/' => new props('start', $fn['scraper']),
			'SEOkicks' => new props('exact', $fn['crawler']),
			'PostmanRuntime/' => new props('start', $fn['scraper']),
			'axios/' => new props('start', $fn['scraper']),
			'Rogerbot/' => new props('start', $fn['crawler']),
			'Go-http-client/' => new props('start', $fn['scraper']),
			'DashLinkPreviews/' => new props('start', $fn['feed']),
			'PycURL/' => new props('start', $fn['scraper']),
			'lua-resty-http/' => new props('start', $fn['scraper']),
			'Snapchat/' => new props('start', $fn['feed']),
			'HTTPClient/' => new props('start', $fn['scraper']),
			'WhatsApp/' => new props('any', $fn['feed']),
			'Hootsuite-Authoring/' => new props('start', $fn['feed']),
			'Snap URL Preview Service' => new props('exact', $fn['feed']),
			'ApacheBench/' => new props('start', $fn['validator']),
			'Asana/' => new props('start', $fn['feed']),
			'Java/' => new props('start', $fn['scraper']),
			'curl/' => new props('start', $fn['scraper']),
			'Wget/' => new props('start', $fn['scraper']),
			'rest-client/' => new props('start', $fn['scraper']),
			'ruby/' => new props('start', $fn['scraper']),
			'libcurl/' => new props('start', $fn['scraper']),
			'Bun/' => new props('start', $fn['scraper']),
			'CakePHP' => new props('start', $fn['scraper']),
			'cpp-httplib/' => new props('start', $fn['scraper']),
			'Dart/' => new props('start', $fn['scraper']),
			'Deno/' => new props('start', $fn['scraper']),
			'libwww-perl/' => new props('start', $fn['scraper']),
			'GuzzleHttp/' => new props('start', $fn['scraper']),
			'Cpanel-HTTP-Client/' => new props('start', $fn['scraper']),
			'akka-http/' => new props('start', $fn['scraper']),
			'feed' => new props('any', $fn['feed']),
			'spider' => new props('any', $fn['crawler']),
			'crawler' => new props('any', $fn['map']),
			'bot/' => new props('any', $fn['map']),
			'bot-' => new props('any', $fn['map']),
			' bot ' => new props('any', $fn['map']),
			'bot' => new props('end', $fn['map']),
		];
	}
}