|
1
|
|
|
<?php

declare(strict_types=1);

namespace Bee4\RobotsTxt;

use Bee4\RobotsTxt\Exception\InvalidArgumentException;
use Bee4\RobotsTxt\Exception\RuntimeException;

/**
 * Class ContentFactory
 * Take an URL, try to load the robots.txt file and return content
 *
 * @copyright Bee4 2015
 * @author Stephane HULARD <[email protected]>
 */
class ContentFactory
{
    /**
     * Build a Content instance from a string.
     *
     * If $item validates as a URL, it must point at the website root
     * (empty path or "/") or directly at "/robots.txt"; the robots.txt
     * file is then downloaded from that host. Any other string is treated
     * as raw robots.txt file content.
     *
     * @param string $item Can be an URL or a file content
     * @return Content The built instance
     * @throws InvalidArgumentException When the URL path is not the website root
     * @throws RuntimeException         When the robots.txt file can't be downloaded
     */
    public static function build(string $item): Content
    {
        if (filter_var($item, FILTER_VALIDATE_URL) !== false) {
            $parsed = parse_url($item);
            // Accept the site root ("" or "/") or an explicit "/robots.txt"
            // path; any other path can't be the robots.txt location.
            if (isset($parsed['path'])
                && $parsed['path'] !== '/'
                && $parsed['path'] !== '/robots.txt'
            ) {
                throw new InvalidArgumentException(sprintf(
                    'The robots.txt file can\'t be found at: %s this file must be hosted at website root',
                    $item
                ));
            }

            $parsed['path'] = '/robots.txt';
            // Drop credentials, query and fragment: only these URL parts
            // are used to rebuild the canonical robots.txt URL.
            $parsed = array_intersect_key(
                $parsed,
                array_flip(['scheme', 'host', 'port', 'path'])
            );
            $port = isset($parsed['port']) ? ':' . $parsed['port'] : '';
            $url = $parsed['scheme'] . '://' . $parsed['host'] . $port . $parsed['path'];

            $item = self::download($url);
        }

        return new Content($item);
    }

    /**
     * Extract the content at URL.
     *
     * @param string $url The robots.txt URL
     * @return string The robots file content
     * @throws RuntimeException When the transfer fails or the HTTP status is not 200
     */
    protected static function download(string $url): string
    {
        $handle = curl_init();
        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        $item = curl_exec($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        // curl_exec() returns false on transport errors (DNS failure,
        // timeout, ...); treat that the same as a non-200 response.
        if ($item === false || $status !== 200) {
            throw new RuntimeException(sprintf(
                'Can\'t access the robots.txt file at: %s',
                $url
            ));
        }

        return $item;
    }
}