<?php

namespace Afsy\Component;

use GuzzleHttp\Client as GuzzleClient;
use Symfony\Component\DomCrawler\Crawler;
use OldSound\RabbitMqBundle\RabbitMq\Producer;

/**
 * Downloads an HTML page to disk and queues every image it references for download.
 *
 * The page is fetched through the injected Guzzle client, saved in the download
 * folder, and each <img> source found in it is published as a message on the
 * injected producer.
 */
class PageHoover
{
    /**
     * HTTP client used to fetch the page.
     *
     * @var GuzzleClient
     */
    protected $client = null;

    /**
     * Raw options as passed to the constructor.
     *
     * @var array
     */
    protected $options = [];

    /**
     * Prefix prepended to every saved file name.
     *
     * It is concatenated as-is with the file name, so it has to end with a
     * directory separator.
     *
     * @var string
     */
    protected $downloadFolder = null;

    /**
     * Producer receiving one message per image to download.
     *
     * @var Producer
     */
    protected $downloadImageProducer = null;

    /**
     * Main constructor.
     *
     * @param GuzzleClient $client                Guzzle client
     * @param Producer     $downloadImageProducer Download image producer
     * @param array        $options               Options list; the "downloadFolder" key is required
     *
     * @throws \InvalidArgumentException When the "downloadFolder" option is missing
     */
    public function __construct(GuzzleClient $client, Producer $downloadImageProducer, array $options)
    {
        // Fail early instead of silently ending up with a null download folder.
        if (!isset($options['downloadFolder'])) {
            throw new \InvalidArgumentException('The "downloadFolder" option is required.');
        }

        $this->client = $client;
        $this->options = $options;
        $this->downloadImageProducer = $downloadImageProducer;
        $this->downloadFolder = $options['downloadFolder'];
    }

    /**
     * Download page method.
     *
     * Saves the page (prefixed with a UTF-8 BOM) in the download folder, then
     * publishes one serialized message per <img> source found in it. Each
     * message is an array with the keys "url", "savePath" and "savedHtmlFile".
     *
     * Exceptions raised by the HTTP client are not caught here and propagate
     * to the caller.
     *
     * @param string $page Page to download (url)
     *
     * @return bool False when the response status is not 200, true otherwise
     *
     * @throws \RuntimeException When the page cannot be written to disk
     */
    public function downloadPage($page)
    {
        // Target file: <downloadFolder><timestamp>-<page file name>.htm
        $pageParts = pathinfo($page);
        $saveFile = $this->downloadFolder.date('Ymd-His').'-'.$pageParts['filename'].'.htm';

        // Download page
        $res = $this->client->get($page);
        if ($res->getStatusCode() !== 200) {
            return false;
        }

        $pageContent = $res->getBody()->getContents();

        // The BOM is written first so the saved copy is identified as UTF-8.
        if (file_put_contents($saveFile, "\xEF\xBB\xBF".$pageContent) === false) {
            throw new \RuntimeException('Error saving file', 1);
        }

        // Collect the src attribute of every <img> tag.
        $crawler = new Crawler($pageContent);
        $sources = $crawler->filter('img')->each(function (Crawler $node) {
            return $node->attr('src');
        });

        foreach ($sources as $src) {
            // attr() yields null when the attribute is absent.
            $src = str_replace(' ', '', (string) $src);

            // Nothing to download for missing sources or inline data URIs.
            if ($src === '' || stripos($src, 'data:') === 0) {
                continue;
            }

            $imageUrl = $this->resolveImageUrl($src, $page);

            // Derive the file name from the URL path only, so that a query
            // string or fragment cannot leak into the name or the extension.
            $imagePath = parse_url($imageUrl, PHP_URL_PATH);
            if (!is_string($imagePath) || $imagePath === '') {
                $imagePath = $imageUrl;
            }

            $imgExt = strtolower(pathinfo($imagePath, PATHINFO_EXTENSION));
            if (!in_array($imgExt, ['png', 'jpg', 'jpeg', 'gif'], true)) {
                // Unknown or missing extension: default to png.
                $imgExt = 'png';
            }

            $imgToPublish = [
                'url' => $imageUrl,
                'savePath' => $this->downloadFolder.pathinfo($imagePath, PATHINFO_FILENAME).'.'.$imgExt,
                'savedHtmlFile' => $saveFile,
            ];

            // NOTE(review): the payload is PHP-serialized; whoever consumes it
            // presumably calls unserialize(), which is only safe as long as the
            // queue is trusted. The format is kept unchanged for compatibility.
            $this->downloadImageProducer->publish(serialize($imgToPublish));
        }

        return true;
    }

    /**
     * Turns an image source found in a page into an absolute URL.
     *
     * Handles absolute, protocol-relative ("//host/x"), root-relative ("/x")
     * and document-relative ("x", "../x") sources. Dot segments are left
     * untouched.
     *
     * @param string $src     Image source, without spaces
     * @param string $pageUrl URL of the page the source was found in
     *
     * @return string Absolute image URL
     */
    private function resolveImageUrl($src, $pageUrl)
    {
        // Already absolute (scheme://...): nothing to resolve.
        if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $src) === 1) {
            return $src;
        }

        $base = parse_url($pageUrl);
        if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
            // The page URL cannot be decomposed: join on its directory part.
            return rtrim(pathinfo($pageUrl, PATHINFO_DIRNAME), '/').'/'.ltrim($src, '/');
        }

        // Protocol-relative source: reuse the scheme of the page.
        if (strpos($src, '//') === 0) {
            return $base['scheme'].':'.$src;
        }

        // Rebuild scheme://[user[:pass]@]host[:port]
        $authority = $base['host'];
        if (isset($base['user'])) {
            $userInfo = $base['user'].(isset($base['pass']) ? ':'.$base['pass'] : '');
            $authority = $userInfo.'@'.$authority;
        }
        if (isset($base['port'])) {
            $authority .= ':'.$base['port'];
        }
        $origin = $base['scheme'].'://'.$authority;

        // Root-relative source: resolved against the site root.
        if (strpos($src, '/') === 0) {
            return $origin.$src;
        }

        // Document-relative source: resolved against the directory of the page.
        $pagePath = isset($base['path']) ? $base['path'] : '/';
        $lastSlash = strrpos($pagePath, '/');
        $directory = $lastSlash === false ? '/' : substr($pagePath, 0, $lastSlash + 1);

        return $origin.$directory.$src;
    }
}