Passed
Push — dev ( 043eb4...bf3609 )
by Dispositif
06:23
created

PublisherAction::getUrl()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application
4
 * 2019 : Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the LICENSE file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Infrastructure\TagParser;
13
use Exception;
14
use GuzzleHttp\Client;
15
16
/**
 * Downloads a web page and extracts the metadata embedded in its HTML
 * (JSON-LD script and <meta> tags).
 *
 * todo Move Infra?
 * Class PublisherAction
 *
 * @package App\Application
 */
class PublisherAction
{
    /**
     * @var string URL of the page to import
     */
    private $url;

    /**
     * @param string $url URL of the page to import
     */
    public function __construct(string $url)
    {
        $this->url = $url;
    }

    /**
     * @return string|null the URL given to the constructor
     */
    public function getUrl(): ?string
    {
        return $this->url;
    }

    /**
     * Import the HTML source from the URL with Guzzle.
     *
     * The request uses a 5 second timeout and, when available, the
     * User-Agent defined by the USER_AGENT environment variable.
     *
     * @return string
     * @throws Exception when the response status code is not 200
     */
    public function getHTMLSource(): string
    {
        $options = ['timeout' => 5];

        // getenv() returns false when USER_AGENT is not defined: only set the
        // header when there is a real value rather than passing false through.
        $userAgent = getenv('USER_AGENT');
        if (is_string($userAgent) && '' !== $userAgent) {
            $options['headers'] = ['User-Agent' => $userAgent];
        }

        $client = new Client($options);
        $response = $client->get($this->url);

        if (200 !== $response->getStatusCode()) {
            throw new Exception('response error '.$response->getStatusCode().' '.$response->getReasonPhrase());
        }

        // Casting the PSR-7 body to string reads the stream from its beginning,
        // whereas getContents() only returns what remains after the current position.
        return (string) $response->getBody();
    }

    /**
     * Extract both the JSON-LD data and the meta tags from an HTML document.
     *
     * @param string $html
     *
     * @return array ['JSON-LD' => array, 'meta' => array]
     * @throws Exception
     */
    public function extractWebData(string $html): array
    {
        $ld = $this->extractLdJson($html);
        $meta = $this->getMetaTags($html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract LD-JSON metadata from <script type="application/ld+json">.
     *
     * Returns the first script content that decodes to an array and is not a
     * BreadcrumbList, or an empty array when there is none.
     *
     * @param string $html
     *
     * @return array
     * @throws Exception
     */
    private function extractLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty values (todo?)
            if ('' === $json) {
                continue;
            }

            $data = json_decode($json, true);
            // A script that does not decode to an array must not hide the
            // following ones: try the next candidate instead of giving up.
            if (!is_array($data)) {
                continue;
            }

            // skip @type => BreadcrumbList (seen on lemonde)
            // TODO: clarify why this filter is needed
            if (isset($data['@type']) && 'BreadcrumbList' === $data['@type']) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map: name/property/http-equiv attribute value
     * => content attribute value. When the same key appears several times,
     * the last occurrence wins (array_combine() behavior).
     *
     * todo move/refac/delete?
     *
     * @param string $str HTML source
     *
     * @return array
     */
    private function getMetaTags(string $str): array
    {
        $pattern = '
  ~<\s*meta\s
  # using lookahead to capture type to $1
    (?=[^>]*?
    \b(?:name|property|http-equiv)\s*=\s*
    (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
    ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
  )
  # capture content to $2
  [^>]*?\bcontent\s*=\s*
    (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
    ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
  [^>]*>
  ~ix';

        // preg_match_all() yields 0 (no match) or false (PCRE error): no tags either way.
        if (!preg_match_all($pattern, $str, $out)) {
            return [];
        }

        // array_combine() may return false on some PHP versions; never let
        // that leak through the array return type.
        $tags = array_combine($out[1], $out[2]);

        return is_array($tags) ? $tags : [];
    }
}
142