Passed
Push — master ( 9883fa...3a4fb6 )
by Kosuha
03:02
created

HtmlUniParser::parseSearch()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 2
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 4
rs 10
1
<?php
2
3
namespace kosuha606\HtmlUniParser;
4
5
use kosuha606\HtmlUniParser\exceptions\ParserInvalidConfigException;
6
7
/**
8
 * Class HtmlUniParser
9
 * @package kosuha606\HtmlUniParser
10
 */
11
class HtmlUniParser extends BaseObject
12
{
13
    /**
     * Catalog scenario: the URL handed to the parser by parseUrl().
     * parseSearch() overwrites it with searchUrl followed by the query text.
     *
     * @var string|null presumably a URL string; the default is unset
     */
    protected $catalogUrl;

    /**
     * Site-search scenario: prefix to which parseSearch() appends the query text.
     *
     * @var string|null
     */
    protected $searchUrl;

    /**
     * Single-page scenario: the URL handed to the parser by parseCard().
     * It is also passed as the second argument to every callback.
     *
     * @var string|null
     */
    protected $pageUrl;

    /**
     * When true, getHtml() returns the node's outer HTML instead of the
     * concatenation of its children.
     *
     * @var bool
     */
    protected $forceOuterHtml = false;

    /**
     * Generator scenario: invoked without arguments by parseGenerator();
     * the value it returns is iterated as a sequence of page URLs.
     *
     * @var callable|null
     */
    protected $urlGenerator;

    /**
     * Encoding of the parsed site. Values are converted from this encoding
     * to UTF-8 unless it is exactly 'UTF-8'.
     *
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Prefix prepended to item links that do not start with http:// or https://.
     *
     * @var string
     */
    protected $siteBaseUrl = '/';

    /**
     * Maximum number of items parseUrl() collects; false disables the limit.
     *
     * @var int|false
     */
    protected $resultLimit = false;

    /**
     * Value forwarded to the parser's setSleepAfterRequest() by parseUrl().
     *
     * @var int
     */
    protected $sleepAfterRequest = 0;

    /**
     * When true, parseUrl() opens each item's link and extracts the
     * xpathOnCard parameters from it.
     *
     * @var bool
     */
    protected $goIntoCard = false;

    /**
     * XPath expression selecting the items on the catalog page.
     *
     * @var string
     */
    protected $xpathItem;

    /**
     * XPath expression evaluated against a single item's HTML to find its link.
     *
     * @var string
     */
    protected $xpathLink;

    /**
     * Map of parameter name => XPath expression evaluated on a card page.
     * The constructor strips the markers "MANY" and "HTML" from the expressions.
     *
     * @var array
     */
    protected $xpathOnCard = [];

    /**
     * Value passed as the second argument to the parser's dom() call;
     * getTypeMech() falls back to 'curl' when it is empty.
     *
     * @var string|null
     */
    protected $typeMech;

    /**
     * Callbacks invoked by handleCallbacks() for every parsed item.
     *
     * @var array
     */
    protected $callbacks = [];

    /**
     * Names of the card parameters whose result must be multiple
     * (their XPath expression contained the "MANY" marker).
     *
     * @var array
     */
    private $xpathOnCardMany = [];

    /**
     * Names of the card parameters whose result is returned as HTML
     * (their XPath expression contained the "HTML" marker).
     *
     * @var array
     */
    private $xpathOnCardHtml = [];

    /**
     * Parser used to load pages and run the XPath queries.
     *
     * @var ZendBasedParser
     */
    private $zendParser;
106
107
    /**
     * HtmlUniParser constructor.
     *
     * Checks that the dom and iconv extensions are loaded, passes $config to
     * the parent constructor, stores the parser and then pre-processes the
     * card XPath expressions: the markers "MANY" and "HTML" are removed from
     * each expression and the parameter name is recorded in the matching list.
     *
     * @param mixed           $config Configuration forwarded to the parent constructor.
     * @param ZendBasedParser $parser Parser used to load pages and run XPath queries.
     *
     * @throws ParserInvalidConfigException When the dom or iconv extension is not loaded.
     */
    public function __construct($config, ZendBasedParser $parser)
    {
        // extension_loaded() answers the question directly and is
        // case-insensitive, unlike searching get_loaded_extensions().
        foreach (['dom', 'iconv'] as $extension) {
            if (!\extension_loaded($extension)) {
                throw new ParserInvalidConfigException('The '.$extension.' extension in not loaded in system');
            }
        }

        parent::__construct($config);
        $this->zendParser = $parser;

        foreach ($this->xpathOnCard as $param => &$xpath) {
            if (\strpos($xpath, 'MANY') !== false) {
                $this->xpathOnCardMany[] = $param;
                $xpath = \str_replace('MANY', '', $xpath);
            }
            if (\strpos($xpath, 'HTML') !== false) {
                $this->xpathOnCardHtml[] = $param;
                $xpath = \str_replace('HTML', '', $xpath);
            }
        }
        // Break the reference so a later write to $xpath cannot alter the
        // last element of xpathOnCard.
        unset($xpath);
    }
134
135
    /**
     * Convenience factory: builds an instance with a freshly created
     * ZendBasedParser, so callers do not have to construct the parser
     * themselves. Uses late static binding, so subclasses get their own type.
     *
     * @param mixed $config Configuration forwarded to the constructor.
     * @return HtmlUniParser
     */
    public static function create($config)
    {
        $parser = new ZendBasedParser();

        return new static($config, $parser);
    }
146
147
    /**
     * Appends a callback to the list run by handleCallbacks().
     *
     * @param mixed $callbacs Callable invoked with the parsed item and the page URL.
     */
    public function addCallback($callbacs)
    {
        $this->callbacks[] = $callbacs;
    }
154
155
    /**
     * Serializes a node to markup.
     *
     * With forceOuterHtml enabled the node itself is serialized; otherwise
     * only its children are serialized and concatenated (inner markup).
     *
     * @param \DOMNode $node
     * @return string
     */
    public function getHtml($node)
    {
        if (!$this->forceOuterHtml) {
            $markup = '';
            foreach ($node->childNodes as $childNode) {
                $markup .= $childNode->ownerDocument->saveXML($childNode);
            }

            return $markup;
        }

        return $this->getOuterHtml($node);
    }
171
172
    /**
     * Serializes the node itself (including its own tag) through the
     * saveXML() method of its owner document.
     *
     * @param \DOMNode $node
     * @return string
     */
    public function getOuterHtml($node)
    {
        $document = $node->ownerDocument;

        return $document->saveXML($node);
    }
176
177
    /**
     * Safely reads a property from a value that may not be an object.
     *
     * @param mixed  $object Value to read from.
     * @param string $method Name of the property to read.
     * @return mixed|null The property value, or null when $object is not an
     *                    object or does not have that property.
     */
    public function valueStub($object, $method)
    {
        if (\is_object($object) && \property_exists($object, $method)) {
            return $object->{$method};
        }

        return null;
    }
192
193
    /**
     * Returns the first element produced by iterating $results.
     *
     * @param iterable $results Query results to take the first element from.
     * @return mixed The first element, or an empty array when there is
     *               nothing to iterate.
     */
    public function getFirstMatch($results)
    {
        foreach ($results as $firstResult) {
            // Return on the first iteration; letting the loop run to the end
            // would yield the last match instead of the first one.
            return $firstResult;
        }

        return [];
    }
205
206
    /**
     * Extracts the text value of the node selected by getFirstMatch().
     *
     * @param iterable $nodes Query results.
     * @return string
     */
    public function getFirstValue($nodes)
    {
        return $this->getValue($this->getFirstMatch($nodes));
    }
215
216
    /**
     * Returns the HTML of the node selected by getFirstMatch(), with <img>
     * tags removed and the encoding conversion of proccessValue() applied.
     *
     * @param iterable $nodes Query results.
     * @return string|string[]|null The processed HTML; the processed empty
     *                              string when no node matched.
     */
    public function getFirstValueHtml($nodes)
    {
        $val = $this->getFirstMatch($nodes);
        if (!$val instanceof \DOMNode) {
            // getFirstMatch() yields an empty array when nothing matched;
            // dereferencing ownerDocument on it would be a fatal error.
            return $this->proccessValue('');
        }

        $html = $val->ownerDocument->saveHTML($val);
        // Remove images from the parsed text.
        $html = preg_replace("/<img[^>]+\>/i", "", $html);

        return $this->proccessValue($html);
    }
229
230
    /**
     * Extracts the trimmed text value of a DOM node.
     *
     * Attributes are read through their "value" property and elements through
     * "nodeValue"; any other input yields an empty value. The result goes
     * through proccessValue() for encoding conversion.
     *
     * @param mixed $val Node to read, normally a \DOMAttr or \DOMElement.
     * @return string
     */
    public function getValue($val)
    {
        $result = null;
        if ($val instanceof \DOMAttr) {
            $result = $this->valueStub($val, 'value');
        } elseif ($val instanceof \DOMElement) {
            $result = $this->valueStub($val, 'nodeValue');
        }

        // Cast first: passing null to trim() is deprecated since PHP 8.1.
        return $this->proccessValue(trim((string) $result));
    }
245
246
    /**
     * Extracts the value of every node in the given result set.
     *
     * @param iterable $nodes Query results.
     * @return array List with one getValue() result per node, in iteration order.
     */
    public function getAllValues($nodes)
    {
        $values = [];
        foreach ($nodes as $currentNode) {
            $values[] = $this->getValue($currentNode);
        }

        return $values;
    }
258
259
    /**
     * Parses the page at catalogUrl (typically a catalog listing).
     *
     * Every node matched by xpathItem becomes one result item with a "link"
     * key. When goIntoCard is enabled the link is opened and each xpathOnCard
     * parameter is extracted from it. Callbacks run for every item, and the
     * loop stops once resultLimit items have been collected.
     *
     * @return array List of parsed items.
     */
    public function parseUrl()
    {
        $this->zendParser->setSleepAfterRequest($this->sleepAfterRequest);
        $this->zendParser->setUrl($this->catalogUrl);
        $items = $this->zendParser
            ->dom($this->getEncoding(), $this->getTypeMech())
            ->queryXpath($this->xpathItem);

        $result = [];
        foreach ($items as $item) {
            $newItem = [];

            // Look for the link inside this item's own markup only.
            $this->zendParser->setRawHtml($this->getHtml($item));
            $linkNodes = $this->zendParser
                ->dom($this->getEncoding(), $this->getTypeMech())
                ->queryXpath($this->xpathLink);
            $link = $this->getFirstValue($linkNodes);

            // Absolute links are kept as they are; everything else is
            // prefixed with the site base URL.
            if (preg_match('/^http(s)?:\/\/.*$/i', $link)) {
                $newItem['link'] = $link;
            } else {
                $newItem['link'] = $this->siteBaseUrl.$link;
            }

            // NOTE(review): an item without a link still receives siteBaseUrl
            // as its link, so this emptiness check only fails when
            // siteBaseUrl is empty as well - confirm that this is intended.
            if ($this->goIntoCard && $newItem['link']) {
                $this->zendParser->setUrl($newItem['link']);
                foreach ($this->xpathOnCard as $param => $xpath) {
                    $temParam = $this->zendParser
                        ->dom($this->getEncoding(), $this->getTypeMech())
                        ->queryXpath($xpath);
                    // Strict comparison: the recorded names are the very same
                    // array keys, and loose matching could confuse numeric keys.
                    if (\in_array($param, $this->xpathOnCardMany, true)) {
                        $newItem[$param] = $this->getAllValues($temParam);
                    } elseif (\in_array($param, $this->xpathOnCardHtml, true)) {
                        $newItem[$param] = $this->getFirstValueHtml($temParam);
                    } else {
                        $newItem[$param] = $this->getFirstValue($temParam);
                    }
                }
            }

            $this->handleCallbacks($newItem);
            $result[] = $newItem;

            // Do not parse more than the limit, if one is set. Counting the
            // collected items avoids relying on the iterator's keys.
            if ($this->resultLimit && \count($result) >= $this->resultLimit) {
                break;
            }
        }

        return $result;
    }
301
302
    /**
     * Parses the site's search results for the given query.
     *
     * The query text is appended to searchUrl as-is, stored as catalogUrl,
     * and the page is then parsed by parseUrl().
     *
     * @param string $textQuery Text appended to searchUrl.
     * @return array List of parsed items, as returned by parseUrl().
     */
    public function parseSearch($textQuery)
    {
        $searchPageUrl = $this->searchUrl.$textQuery;
        $this->catalogUrl = $searchPageUrl;

        return $this->parseUrl();
    }
310
311
    /**
     * Parses a single card: the page at pageUrl.
     *
     * Each xpathOnCard parameter is extracted as a list of values, as HTML or
     * as a single value, depending on the markers recorded by the
     * constructor. Callbacks run once on the finished item.
     *
     * @return array Map of parameter name => extracted value; empty when no
     *               card parameters are configured.
     */
    public function parseCard()
    {
        // Start from an empty item so the method also works when
        // xpathOnCard is empty (the variable would otherwise be undefined).
        $newItem = [];

        $this->zendParser->setUrl($this->pageUrl);
        foreach ($this->xpathOnCard as $param => $xpath) {
            $temParam = $this->zendParser
                ->dom($this->getEncoding(), $this->getTypeMech())
                ->queryXpath($xpath);
            // Strict comparison: the recorded names are the very same array
            // keys, and loose matching could confuse numeric keys.
            if (\in_array($param, $this->xpathOnCardMany, true)) {
                $newItem[$param] = $this->getAllValues($temParam);
            } elseif (\in_array($param, $this->xpathOnCardHtml, true)) {
                $newItem[$param] = $this->getFirstValueHtml($temParam);
            } else {
                $newItem[$param] = $this->getFirstValue($temParam);
            }
        }
        $this->handleCallbacks($newItem);

        return $newItem;
    }
330
331
    /**
     * Parses every URL produced by the configured URL generator.
     *
     * The generator is called once without arguments; each URL it yields is
     * stored as pageUrl and parsed with parseCard().
     *
     * @return array List with one parseCard() result per URL.
     */
    public function parseGenerator()
    {
        $produceUrls = $this->urlGenerator;
        $pageUrls = $produceUrls();

        $cards = [];
        foreach ($pageUrls as $pageUrl) {
            $this->pageUrl = $pageUrl;
            $cards[] = $this->parseCard();
        }

        return $cards;
    }
345
346
    /**
     * Runs every registered callback on a parsed item.
     *
     * Each callback is called with the item and the current pageUrl. The item
     * is taken by reference, so a callback that declares its first parameter
     * by reference can modify it in place.
     *
     * @param mixed $lastItem Item that has just been parsed.
     * @return HtmlUniParser
     */
    public function handleCallbacks(&$lastItem)
    {
        // Iterate by value: invoking a callback does not need a reference,
        // and by-reference iteration would turn the stored callbacks into
        // references and leave a dangling one behind.
        foreach ($this->callbacks as $callback) {
            $callback($lastItem, $this->pageUrl);
        }

        return $this;
    }
357
358
    /**
     * Returns the configured site encoding.
     *
     * @return string
     */
    public function getEncoding(): string
    {
        return $this->encoding;
    }
365
366
    /**
     * Sets the site encoding and returns the parser for chaining.
     *
     * @param string $encoding Encoding name, e.g. 'UTF-8'.
     * @return HtmlUniParser
     */
    public function setEncoding(string $encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }
375
376
    /**
     * Converts a value from the site encoding to UTF-8.
     *
     * The value is returned untouched when the configured encoding is
     * exactly 'UTF-8'; otherwise it is passed through iconv().
     *
     * @param mixed $value Value to convert.
     * @return false|string The converted value; iconv() returns false on failure.
     */
    private function proccessValue($value)
    {
        $sourceEncoding = $this->getEncoding();
        if ($sourceEncoding !== 'UTF-8') {
            return \iconv($sourceEncoding, 'UTF-8', $value);
        }

        return $value;
    }
388
389
    /**
     * Returns the configured typeMech value, falling back to 'curl' when
     * none is set (or the configured value is empty).
     *
     * @return string
     */
    public function getTypeMech(): string
    {
        if ($this->typeMech) {
            return $this->typeMech;
        }

        return 'curl';
    }
396
397
    /**
     * Sets the typeMech value that is passed to the parser's dom() call.
     *
     * @param string $typeMech
     */
    public function setTypeMech(string $typeMech)
    {
        $this->typeMech = $typeMech;
    }
404
}
405