Completed
Push — master ( 3a4fb6...67ff38 )
by Kosuha
02:42
created

HtmlUniParser::setSleepAfterRequest()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 2
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 4
rs 10
1
<?php
2
3
namespace kosuha606\HtmlUniParser;
4
5
use kosuha606\HtmlUniParser\exceptions\ParserInvalidConfigException;
6
7
/**
 * Class HtmlUniParser
 *
 * XPath-driven HTML parser. Depending on which URL source is configured it can
 * parse a catalog page (parseUrl), a site search result (parseSearch), a single
 * page (parseCard) or a list of generated URLs (parseGenerator). All document
 * loading and XPath querying is delegated to the injected ZendBasedParser.
 *
 * @package kosuha606\HtmlUniParser
 */
class HtmlUniParser extends BaseObject
{
    /**
     * URL used by the catalog scenario (see parseUrl()).
     * @var string
     */
    protected $catalogUrl;

    /**
     * URL prefix used by the site-search scenario (see parseSearch()).
     * @var string
     */
    protected $searchUrl;

    /**
     * URL used by the single-page scenario (see parseCard()).
     * @var string
     */
    protected $pageUrl;

    /**
     * Force getHtml() to return the outer HTML of a node instead of its inner HTML.
     * @var boolean
     */
    protected $forceOuterHtml = false;

    /**
     * Callable producing the URLs for the generator scenario (see parseGenerator()).
     * @var callable
     */
    protected $urlGenerator;

    /**
     * Encoding of the parsed site.
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Prefix prepended to links that are not absolute http(s) URLs.
     * @var string
     */
    protected $siteBaseUrl = '/';

    /**
     * Maximum number of catalog items to parse; false disables the limit.
     * @var int|bool
     */
    protected $resultLimit = false;

    /**
     * Value handed to the underlying parser via setSleepAfterRequest().
     * @var int
     */
    protected $sleepAfterRequest = 0;

    /**
     * Whether parseUrl() should open every item link and extract card fields.
     * @var bool
     */
    protected $goIntoCard = false;

    /**
     * XPath selecting item nodes on the catalog page.
     * @var string
     */
    protected $xpathItem;

    /**
     * XPath selecting the link inside a single item.
     * @var string
     */
    protected $xpathLink;

    /**
     * Map of result field name => XPath evaluated on the card page.
     * @var array
     */
    protected $xpathOnCard = [];

    /** @var string */
    protected $typeMech;

    /**
     * Callables invoked for every parsed item (see handleCallbacks()).
     * @var array
     */
    protected $callbacks = [];

    /**
     * Field names whose result must contain all matched values.
     * @var array
     */
    private $xpathOnCardMany = [];

    /**
     * Field names whose result must be returned as HTML.
     * @var array
     */
    private $xpathOnCardHtml = [];

    /** @var ZendBasedParser */
    private $zendParser;

    /**
     * HtmlUniParser constructor.
     *
     * After the parent constructor has processed $config, every xpathOnCard
     * expression is scanned for the markers "MANY" and "HTML": the field name is
     * recorded in the matching list and the marker is removed from the XPath.
     *
     * @param mixed $config passed unchanged to the parent constructor
     * @param ZendBasedParser $parser parser used for all requests and queries
     * @throws ParserInvalidConfigException when the dom or iconv extension is missing
     */
    public function __construct($config, ZendBasedParser $parser)
    {
        if (!\extension_loaded('dom')) {
            throw new ParserInvalidConfigException('The dom extension in not loaded in system');
        }
        if (!\extension_loaded('iconv')) {
            throw new ParserInvalidConfigException('The iconv extension in not loaded in system');
        }
        parent::__construct($config);
        $this->zendParser = $parser;
        foreach ($this->xpathOnCard as $param => &$xpath) {
            if (\strpos($xpath, 'MANY') !== false) {
                $this->xpathOnCardMany[] = $param;
                $xpath = \str_replace('MANY', '', $xpath);
            }
            if (\strpos($xpath, 'HTML') !== false) {
                $this->xpathOnCardHtml[] = $param;
                $xpath = \str_replace('HTML', '', $xpath);
            }
        }
        // Break the reference so a later write to $xpath cannot corrupt the last element.
        unset($xpath);
    }

    /**
     * You can create instances of this class
     * by yourself or you can use this method
     *
     * @param mixed $config
     * @return HtmlUniParser
     */
    public static function create($config)
    {
        return new static($config, new ZendBasedParser());
    }

    /**
     * Registers one more callback to be run by handleCallbacks().
     *
     * @param mixed $callbacs callable to append
     */
    public function addCallback($callbacs)
    {
        $this->callbacks[] = $callbacs;
    }

    /**
     * Returns the inner markup of a node (serialized children), or its outer
     * markup when forceOuterHtml is enabled.
     *
     * @param \DOMNode $node
     * @return string
     */
    public function getHtml($node)
    {
        if ($this->forceOuterHtml) {
            return $this->getOuterHtml($node);
        }
        $innerHTML = '';
        foreach ($node->childNodes as $child) {
            $innerHTML .= $child->ownerDocument->saveXML($child);
        }
        return $innerHTML;
    }

    /**
     * Serializes the node itself, including its own tag.
     *
     * @param \DOMNode $node
     * @return mixed
     */
    public function getOuterHtml($node)
    {
        return $node->ownerDocument->saveXML($node);
    }

    /**
     * Safely reads a property: returns null when $object is not an object or
     * does not have the requested property.
     *
     * @param mixed $object
     * @param string $method property name to read
     * @return mixed|null
     */
    public function valueStub($object, $method)
    {
        if (!\is_object($object)) {
            return null;
        }
        if (\property_exists($object, $method)) {
            return $object->{$method};
        }
        return null;
    }

    /**
     * Returns the first element of an iterable result set.
     *
     * @param iterable $results
     * @return mixed first element, or an empty array when there is none
     */
    public function getFirstMatch($results)
    {
        foreach ($results as $r) {
            // Stop at the first element; looping on would yield the last one.
            return $r;
        }
        return [];
    }

    /**
     * Returns the processed text value of the first matched node.
     *
     * @param iterable $nodes
     * @return string
     */
    public function getFirstValue($nodes)
    {
        $val = $this->getFirstMatch($nodes);
        return $this->getValue($val);
    }

    /**
     * Returns the HTML of the first matched node with <img> tags stripped.
     *
     * @param iterable $nodes
     * @return string|string[]|null empty string when nothing matched
     */
    public function getFirstValueHtml($nodes)
    {
        $val = $this->getFirstMatch($nodes);
        if (!$val instanceof \DOMNode) {
            // Nothing matched: getFirstMatch() gave us an empty array, not a node.
            return '';
        }
        $html = $val->ownerDocument->saveHTML($val);
        // Remove images from the parsed text
        $html = preg_replace("/<img[^>]+\>/i", "", $html);
        return $this->proccessValue($html);
    }

    /**
     * Extracts the trimmed, encoding-processed text of a DOM attribute or element.
     * Any other input yields the processed empty string.
     *
     * @param mixed $val
     * @return string
     */
    public function getValue($val)
    {
        $result = null;
        if ($val instanceof \DOMAttr) {
            $result = $this->valueStub($val, 'value');
        }
        if ($val instanceof \DOMElement) {
            $result = $this->valueStub($val, 'nodeValue');
        }
        // Cast first: trim(null) is deprecated since PHP 8.1.
        return $this->proccessValue(trim((string) $result));
    }

    /**
     * Returns the processed text value of every node in the set.
     *
     * @param iterable $nodes
     * @return array
     */
    public function getAllValues($nodes)
    {
        $result = [];
        foreach ($nodes as $node) {
            $result[] = $this->getValue($node);
        }
        return $result;
    }

    /**
     * Parse a link, possibly a catalog.
     *
     * Loads catalogUrl, selects items with xpathItem and resolves each item's
     * link with xpathLink (non-absolute links get siteBaseUrl prepended). When
     * goIntoCard is enabled the link is opened and the xpathOnCard fields are
     * added to the item. Callbacks run for every item.
     *
     * @return array list of parsed items
     */
    public function parseUrl()
    {
        $this->zendParser->setSleepAfterRequest($this->sleepAfterRequest);
        $this->zendParser->setUrl($this->catalogUrl);
        $items = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($this->xpathItem);
        $result = [];
        foreach ($items as $index => $item) {
            $newItem = [];
            $html = $this->getHtml($item);
            $this->zendParser->setRawHtml($html);
            $link = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($this->xpathLink);
            $link = $this->getFirstValue($link);
            if (preg_match('/^http(s)?:\/\/.*$/i', $link)) {
                $newItem['link'] = $link;
            } else {
                $newItem['link'] = $this->siteBaseUrl.$link;
            }
            if ($this->goIntoCard && $newItem['link']) {
                $this->zendParser->setUrl($newItem['link']);
                $newItem = $this->extractCardFields($newItem);
            }
            $this->handleCallbacks($newItem);
            $result[] = $newItem;
            // Do not parse more than the limit, if a limit is set
            if ($this->resultLimit && $this->resultLimit <= ($index + 1)) {
                break;
            }
        }
        return $result;
    }

    /**
     * Parse search results.
     *
     * The query is appended to searchUrl as-is (no URL-encoding is applied
     * here) and the resulting URL is parsed as a catalog.
     *
     * @param string $textQuery
     * @return array
     */
    public function parseSearch($textQuery)
    {
        $this->catalogUrl = $this->searchUrl.$textQuery;
        return $this->parseUrl();
    }

    /**
     * Parse a single card located at pageUrl.
     *
     * @return array map of field name => extracted value
     */
    public function parseCard()
    {
        // Apply the configured delay here too, otherwise it is only honoured by parseUrl().
        $this->zendParser->setSleepAfterRequest($this->sleepAfterRequest);
        $this->zendParser->setUrl($this->pageUrl);
        $newItem = $this->extractCardFields([]);
        $this->handleCallbacks($newItem);
        return $newItem;
    }

    /**
     * Calls the configured URL generator and parses every returned URL as a card.
     *
     * @return array list of parseCard() results
     */
    public function parseGenerator()
    {
        $generator = $this->urlGenerator;
        $urls = $generator();
        $results = [];
        foreach ($urls as $url) {
            $this->pageUrl = $url;
            $results[] = $this->parseCard();
        }
        return $results;
    }

    /**
     * Runs every registered callback with the item and the current pageUrl.
     * The item is handed over by reference, so a callback that declares its
     * first parameter by reference can modify it.
     *
     * @param mixed $lastItem
     * @return HtmlUniParser
     */
    public function handleCallbacks(&$lastItem)
    {
        foreach ($this->callbacks as $callbac) {
            $callbac($lastItem, $this->pageUrl);
        }
        return $this;
    }

    /**
     * @return string
     */
    public function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * @param string $encoding
     * @return HtmlUniParser
     */
    public function setEncoding(string $encoding)
    {
        $this->encoding = $encoding;
        return $this;
    }

    /**
     * Evaluates every xpathOnCard expression against the document currently
     * loaded in the parser and stores the values in $item.
     *
     * @param array $item item to extend
     * @return array the item with one entry per xpathOnCard field
     */
    private function extractCardFields(array $item)
    {
        foreach ($this->xpathOnCard as $param => $xpath) {
            $nodes = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($xpath);
            if (in_array($param, $this->xpathOnCardMany)) {
                $item[$param] = $this->getAllValues($nodes);
            } elseif (in_array($param, $this->xpathOnCardHtml)) {
                $item[$param] = $this->getFirstValueHtml($nodes);
            } else {
                $item[$param] = $this->getFirstValue($nodes);
            }
        }
        return $item;
    }

    /**
     * Converts a value from the configured encoding to UTF-8; values are
     * returned untouched when the configured encoding already is 'UTF-8'.
     *
     * @param mixed $value
     * @return false|string false when iconv fails
     */
    private function proccessValue($value)
    {
        if ($this->getEncoding() === 'UTF-8') {
            return $value;
        }
        return \iconv($this->getEncoding(), 'UTF-8', $value);
    }

    /**
     * @return string configured mechanism, 'curl' when none is set
     */
    public function getTypeMech(): string
    {
        return $this->typeMech ?: 'curl';
    }

    /**
     * @param string $typeMech
     * @return HtmlUniParser
     */
    public function setTypeMech(string $typeMech)
    {
        $this->typeMech = $typeMech;
        return $this;
    }

    /**
     * @param mixed $catalogUrl
     * @return HtmlUniParser
     */
    public function setCatalogUrl($catalogUrl)
    {
        $this->catalogUrl = $catalogUrl;
        return $this;
    }

    /**
     * @param mixed $searchUrl
     * @return HtmlUniParser
     */
    public function setSearchUrl($searchUrl)
    {
        $this->searchUrl = $searchUrl;
        return $this;
    }

    /**
     * @param mixed $pageUrl
     * @return HtmlUniParser
     */
    public function setPageUrl($pageUrl)
    {
        $this->pageUrl = $pageUrl;
        return $this;
    }

    /**
     * @param bool $forceOuterHtml
     * @return HtmlUniParser
     */
    public function setForceOuterHtml(bool $forceOuterHtml)
    {
        $this->forceOuterHtml = $forceOuterHtml;
        return $this;
    }

    /**
     * @param mixed $urlGenerator
     * @return HtmlUniParser
     */
    public function setUrlGenerator($urlGenerator)
    {
        $this->urlGenerator = $urlGenerator;
        return $this;
    }

    /**
     * @param string $siteBaseUrl
     * @return HtmlUniParser
     */
    public function setSiteBaseUrl(string $siteBaseUrl)
    {
        $this->siteBaseUrl = $siteBaseUrl;
        return $this;
    }

    /**
     * Sets the maximum number of items parseUrl() returns.
     *
     * The parameter is intentionally not declared as bool: a bool declaration
     * coerces every integer limit to true, which makes parseUrl() stop after
     * the first item.
     *
     * @param int|bool $resultLimit number of items, or false for no limit
     * @return HtmlUniParser
     */
    public function setResultLimit($resultLimit)
    {
        $this->resultLimit = $resultLimit;
        return $this;
    }

    /**
     * @param int $sleepAfterRequest
     * @return HtmlUniParser
     */
    public function setSleepAfterRequest(int $sleepAfterRequest)
    {
        $this->sleepAfterRequest = $sleepAfterRequest;
        return $this;
    }

    /**
     * @param bool $goIntoCard
     * @return HtmlUniParser
     */
    public function setGoIntoCard(bool $goIntoCard)
    {
        $this->goIntoCard = $goIntoCard;
        return $this;
    }

    /**
     * @param string $xpathItem
     * @return HtmlUniParser
     */
    public function setXpathItem(string $xpathItem)
    {
        $this->xpathItem = $xpathItem;
        return $this;
    }

    /**
     * @param string $xpathLink
     * @return HtmlUniParser
     */
    public function setXpathLink(string $xpathLink)
    {
        $this->xpathLink = $xpathLink;
        return $this;
    }

    /**
     * Replaces the field => XPath map. The "MANY"/"HTML" markers are only
     * interpreted by the constructor, not by this setter.
     *
     * @param array $xpathOnCard
     * @return HtmlUniParser
     */
    public function setXpathOnCard(array $xpathOnCard)
    {
        $this->xpathOnCard = $xpathOnCard;
        return $this;
    }

    /**
     * @param array $callbacks
     * @return HtmlUniParser
     */
    public function setCallbacks(array $callbacks)
    {
        $this->callbacks = $callbacks;
        return $this;
    }

    /**
     * @param array $xpathOnCardMany
     * @return HtmlUniParser
     */
    public function setXpathOnCardMany(array $xpathOnCardMany)
    {
        $this->xpathOnCardMany = $xpathOnCardMany;
        return $this;
    }

    /**
     * @param array $xpathOnCardHtml
     * @return HtmlUniParser
     */
    public function setXpathOnCardHtml(array $xpathOnCardHtml)
    {
        $this->xpathOnCardHtml = $xpathOnCardHtml;
        return $this;
    }

    /**
     * @param ZendBasedParser $zendParser
     * @return HtmlUniParser
     */
    public function setZendParser(ZendBasedParser $zendParser)
    {
        $this->zendParser = $zendParser;
        return $this;
    }
}
569