1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Bee4\RobotsTxt; |
4
|
|
|
|
5
|
|
|
use Bee4\RobotsTxt\Exception\InvalidArgumentException; |
6
|
|
|
use Bee4\RobotsTxt\Exception\RuntimeException; |
7
|
|
|
|
8
|
|
|
/**
 * Class ContentFactory
 * Take an URL, try to load the robots.txt file and return content
 *
 * @copyright Bee4 2015
 * @author Stephane HULARD <[email protected]>
 */
class ContentFactory
{
    /**
     * Build a parser instance from a string.
     *
     * If $item is a valid URL it is normalised to the site-root
     * `/robots.txt` location, downloaded, and the body is wrapped in a
     * Content instance; otherwise $item is treated as raw file content.
     *
     * @param string $item Can be an URL or a file content
     * @return Content The built instance
     * @throws InvalidArgumentException When the URL points somewhere other than the website root
     * @throws RuntimeException When the robots.txt file can't be downloaded
     */
    public static function build($item)
    {
        if (filter_var($item, FILTER_VALIDATE_URL) !== false) {
            $parsed = parse_url($item);
            // Accept the site root ("/" or no path at all) as well as an
            // explicit "/robots.txt" path; anything else is not a valid
            // robots.txt location. Strict comparison avoids loose-typing
            // surprises.
            if (isset($parsed['path'])
                && !in_array($parsed['path'], ['/', '/robots.txt'], true)
            ) {
                throw new InvalidArgumentException(
                    sprintf(
                        'The robots.txt file can\'t be found at: %s this file must be hosted at website root',
                        $item
                    )
                );
            }

            // Rebuild a canonical URL pointing at the root robots.txt,
            // dropping user-info, query string and fragment components.
            $parsed['path'] = '/robots.txt';
            $parsed = array_intersect_key(
                $parsed,
                array_flip(['scheme', 'host', 'port', 'path'])
            );
            $port = isset($parsed['port']) ? ':'.$parsed['port'] : '';
            $url = $parsed['scheme'].'://'.$parsed['host'].$port.$parsed['path'];

            $item = self::download($url);
        }

        return new Content($item);
    }

    /**
     * Extract the content at URL.
     *
     * @param string $url The robots.txt URL
     * @return string The robots file content
     * @throws RuntimeException When the transfer fails or the HTTP status is not 200
     */
    protected static function download($url)
    {
        $handle = curl_init();
        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        // Follow redirects: an http -> https 301 on robots.txt is common
        // and would otherwise surface as an access error.
        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($handle, CURLOPT_MAXREDIRS, 5);
        $item = curl_exec($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        // curl_exec() returns false on transport errors (DNS failure,
        // timeout, SSL problems, ...) — treat that the same as a non-200
        // response instead of silently passing `false` to the caller.
        if ($item === false || $status !== 200) {
            throw new RuntimeException(sprintf(
                'Can\'t access the robots.txt file at: %s',
                $url
            ));
        }

        return $item;
    }
}