ContentFactory::build() (rating: B)

Complexity
    Conditions: 5
    Paths: 4

Size
    Total Lines: 27
    Code Lines: 17

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 2
    Bugs: 0
    Features: 0
Metric    Value
c         2
b         0
f         0
dl        0
loc       27
rs        8.439
cc        5
eloc      17
nc        4
nop       1
<?php

namespace Bee4\RobotsTxt;

use Bee4\RobotsTxt\Exception\InvalidUrlException;
use RuntimeException;

/**
 * Class ContentFactory
 * Takes a URL, tries to load the robots.txt file, and returns its content
 *
 * @copyright Bee4 2015
 * @author    Stephane HULARD <[email protected]>
 */
class ContentFactory
{
    /**
     * Build a Content instance from a string
     * @param  string $item     Can be a URL or raw robots.txt file content
     * @return Content          The built instance
     */
    public static function build($item)
    {
        if (filter_var($item, FILTER_VALIDATE_URL) !== false) {
            $parsed = parse_url($item);
            // Reject URLs whose path points at anything other than /robots.txt
            if (isset($parsed['path']) && $parsed['path'] != '/robots.txt') {
                throw (new InvalidUrlException(
                    sprintf(
                        'The robots.txt file can\'t be found at: %s',
                        $item
                    )
                ))->setUrl($item);
            }

            // Rebuild a canonical URL, keeping only scheme, host, port and path
            $parsed['path'] = '/robots.txt';
            $parsed = array_intersect_key(
                $parsed,
                array_flip(['scheme', 'host', 'port', 'path'])
            );
            $port = isset($parsed['port']) ? ':'.$parsed['port'] : '';
            $url = $parsed['scheme'].'://'.$parsed['host'].$port.$parsed['path'];

            $item = self::download($url);
        }

        return new Content($item);
    }

    /**
     * Extract the content at the given URL
     * @param  string $url The robots.txt URL
     * @return string      The robots.txt file content
     */
    protected static function download($url)
    {
        $handle = curl_init();
        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        $item = curl_exec($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        if ($status !== 200) {
            throw new RuntimeException(sprintf(
                'Can\'t access the robots.txt file at: %s',
                $url
            ));
        }

        return $item;
    }
}
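For context, here is how the factory is typically called. This is a minimal sketch based on the build() docblock above; the example URL and rule strings are illustrative, and it assumes the Content class from the same library wraps the returned body:

<?php

use Bee4\RobotsTxt\ContentFactory;

// Build from a robots.txt URL: the factory validates and normalizes the URL,
// downloads the file over HTTP, and wraps the body in a Content instance.
$content = ContentFactory::build('https://example.com/robots.txt');

// Build from raw file content: anything that does not validate as a URL
// is treated as the robots.txt body itself.
$content = ContentFactory::build("User-agent: *\nDisallow: /private/");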
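Note that the throw in build() only works because setUrl() returns the exception instance (throw requires a Throwable). The library's actual InvalidUrlException is not shown here; the following is a hypothetical sketch of a compatible class, included only to illustrate the fluent-setter pattern the factory relies on:

<?php

namespace Bee4\RobotsTxt\Exception;

class InvalidUrlException extends \RuntimeException
{
    /** @var string The URL that failed validation */
    protected $url = '';

    /**
     * Fluent setter: returns $this so the result can be thrown directly.
     * @param  string $url
     * @return self
     */
    public function setUrl($url)
    {
        $this->url = $url;
        return $this;
    }

    /**
     * @return string The offending URL, for error reporting
     */
    public function getUrl()
    {
        return $this->url;
    }
}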