Passed
Push — master ( 69dd71...2c4486 )
by Kosuha
02:31
created

HtmlUniParser::onBeforeDom()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 4
nc 2
nop 0
dl 0
loc 7
rs 10
c 1
b 0
f 0
1
<?php
2
3
namespace kosuha606\HtmlUniParser;
4
5
use kosuha606\HtmlUniParser\exceptions\ParserInvalidConfigException;
6
7
/**
 * XPath-driven HTML scraper.
 *
 * Supports four entry points: a catalog/listing page ({@see parseUrl()}),
 * a site search ({@see parseSearch()}), a single page ({@see parseCard()})
 * and a list of pages produced by a generator ({@see parseGenerator()}).
 * All HTTP/DOM work is delegated to the injected ZendBasedParser.
 *
 * @package kosuha606\HtmlUniParser
 */
class HtmlUniParser extends BaseObject
{
    /**
     * URL used by the catalog scenario ({@see parseUrl()}).
     * @var string|null
     */
    protected $catalogUrl;

    /**
     * URL prefix used by the site-search scenario; the query text is appended to it.
     * @var string|null
     */
    protected $searchUrl;

    /**
     * URL used by the single-page scenario ({@see parseCard()}).
     * @var string|null
     */
    protected $pageUrl;

    /**
     * Force getHtml() to return the node's outer HTML instead of its inner HTML.
     * @var bool
     */
    protected $forceOuterHtml = false;

    /**
     * Callable that produces the URLs for the generator scenario ({@see parseGenerator()}).
     * @var callable|null
     */
    protected $urlGenerator;

    /**
     * Optional hook invoked with this parser right before each DOM is built.
     * @var callable|null
     */
    protected $beforeDomCallback;

    /**
     * Encoding of the scraped site.
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Prefix prepended to links that do not start with http(s)://.
     * @var string
     */
    protected $siteBaseUrl = '/';

    /**
     * Maximum number of items parseUrl() returns; false (or 0) means no limit.
     * @var int|false
     */
    protected $resultLimit = false;

    /**
     * Value forwarded to ZendBasedParser::setSleepAfterRequest().
     * @var int
     */
    protected $sleepAfterRequest = 0;

    /**
     * Whether parseUrl() should follow each item's link and extract the card fields.
     * @var bool
     */
    protected $goIntoCard = false;

    /**
     * XPath selecting the items on a catalog page.
     * @var string
     */
    protected $xpathItem;

    /**
     * XPath selecting the link inside an item.
     * @var string
     */
    protected $xpathLink;

    /**
     * XPath selecting the title (evaluated against the whole catalog page).
     * @var string
     */
    protected $xpathTitle;

    /**
     * Map of result key => XPath evaluated on a card page.
     * @var array
     */
    protected $xpathOnCard = [];

    /**
     * Fetch mechanism name passed to ZendBasedParser::dom(); defaults to 'curl' when empty.
     * @var string|null
     */
    protected $typeMech;

    /**
     * Callables invoked for every parsed item ({@see handleCallbacks()}).
     * @var array
     */
    protected $callbacks = [];

    /**
     * Keys of $xpathOnCard whose result must contain every matched value.
     * @var array
     */
    private $xpathOnCardMany = [];

    /**
     * Keys of $xpathOnCard whose result must be returned as HTML.
     * @var array
     */
    private $xpathOnCardHtml = [];

    /** @var ZendBasedParser */
    private $zendParser;

    /**
     * HtmlUniParser constructor.
     *
     * Verifies the required PHP extensions, hands $config to the parent
     * constructor and then strips the "MANY" / "HTML" markers from the
     * configured card XPaths, remembering which keys carried them.
     *
     * @param mixed $config configuration passed to BaseObject (presumably property values - confirm in BaseObject)
     * @param ZendBasedParser $parser low-level fetcher/DOM wrapper
     * @throws ParserInvalidConfigException when the dom or iconv extension is missing
     */
    public function __construct($config, ZendBasedParser $parser)
    {
        if (!\extension_loaded('dom')) {
            throw new ParserInvalidConfigException('The dom extension in not loaded in system');
        }
        if (!\extension_loaded('iconv')) {
            throw new ParserInvalidConfigException('The iconv extension in not loaded in system');
        }
        parent::__construct($config);
        $this->zendParser = $parser;
        foreach ($this->xpathOnCard as $param => &$xpath) {
            if (\strpos($xpath, 'MANY') !== false) {
                $this->xpathOnCardMany[] = $param;
                $xpath = \str_replace('MANY', '', $xpath);
            }
            if (\strpos($xpath, 'HTML') !== false) {
                $this->xpathOnCardHtml[] = $param;
                $xpath = \str_replace('HTML', '', $xpath);
            }
        }
        // Drop the reference so the last array element is no longer aliased by $xpath.
        unset($xpath);
    }

    /**
     * Convenience factory: builds an instance with a fresh ZendBasedParser.
     *
     * @param mixed $config
     * @return static
     */
    public static function create($config)
    {
        return new static($config, new ZendBasedParser());
    }

    /**
     * Registers one more per-item callback.
     *
     * @param mixed $callbacs callable invoked as $callback($item, $pageUrl)
     */
    public function addCallback($callbacs)
    {
        $this->callbacks[] = $callbacs;
    }

    /**
     * Serializes a node: its children (inner HTML) by default, or the node
     * itself when $forceOuterHtml is enabled.
     *
     * @param \DOMNode $node
     * @return string
     */
    public function getHtml($node)
    {
        if ($this->forceOuterHtml) {
            return $this->getOuterHtml($node);
        }
        $innerHTML = '';
        foreach ($node->childNodes as $child) {
            $innerHTML .= $child->ownerDocument->saveXML($child);
        }
        return $innerHTML;
    }

    /**
     * Serializes the node itself (including its own tag) via saveXML().
     *
     * @param \DOMNode $node
     * @return mixed
     */
    public function getOuterHtml($node)
    {
        return $node->ownerDocument->saveXML($node);
    }

    /**
     * Safely reads a property: returns null unless $object is an object
     * that has a property named $method.
     *
     * @param mixed $object
     * @param string $method property name (despite the parameter name)
     * @return mixed|null
     */
    public function valueStub($object, $method)
    {
        if (!\is_object($object)) {
            return null;
        }
        if (\property_exists($object, $method)) {
            return $object->{$method};
        }
        return null;
    }

    /**
     * Returns the first element of an iterable result set.
     *
     * The previous implementation kept overwriting the result while looping
     * and therefore returned the LAST element, contradicting the method name.
     *
     * @param iterable $results
     * @return mixed the first element, or an empty array when there is none
     */
    public function getFirstMatch($results)
    {
        foreach ($results as $match) {
            return $match;
        }
        return [];
    }

    /**
     * Returns the processed text value of the first matched node.
     *
     * @param iterable $nodes
     * @return string
     */
    public function getFirstValue($nodes)
    {
        return $this->getValue($this->getFirstMatch($nodes));
    }

    /**
     * Returns the HTML of the first matched node with <img> tags removed.
     *
     * @param iterable $nodes
     * @return string|string[]|null empty string when nothing matched
     */
    public function getFirstValueHtml($nodes)
    {
        $val = $this->getFirstMatch($nodes);
        if (!$val instanceof \DOMNode) {
            // Nothing matched: mirror getFirstValue(), which yields an empty
            // string, instead of failing on a property access of a non-node.
            return '';
        }
        $html = $val->ownerDocument->saveHTML($val);
        // Strip images from the parsed markup.
        $html = preg_replace("/<img[^>]+\>/i", "", $html);
        return $this->proccessValue($html);
    }

    /**
     * Extracts the trimmed, encoding-converted value of an attribute or element.
     * Any other input yields an empty string.
     *
     * @param mixed $val
     * @return string
     */
    public function getValue($val)
    {
        $result = null;
        if ($val instanceof \DOMAttr) {
            $result = $this->valueStub($val, 'value');
        }
        if ($val instanceof \DOMElement) {
            $result = $this->valueStub($val, 'nodeValue');
        }
        // Cast first: passing null to trim() is deprecated since PHP 8.1.
        return $this->proccessValue(\trim((string) $result));
    }

    /**
     * Returns the processed value of every node in the set.
     *
     * @param iterable $nodes
     * @return array
     */
    public function getAllValues($nodes)
    {
        $result = [];
        foreach ($nodes as $node) {
            $result[] = $this->getValue($node);
        }
        return $result;
    }

    /**
     * Parses the page at $catalogUrl (typically a catalog/listing).
     *
     * For every node matched by $xpathItem it resolves the link, optionally
     * the title, optionally follows the link to extract the card fields, runs
     * the callbacks and collects the item. Stops early once $resultLimit
     * items have been collected.
     *
     * @return array list of parsed items
     */
    public function parseUrl()
    {
        $this->zendParser->setSleepAfterRequest($this->sleepAfterRequest);
        $this->zendParser->setUrl($this->catalogUrl);
        $this->onBeforeDom();
        $items = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($this->xpathItem);
        $pageHtml = $this->zendParser->getHtmlBuffer();
        $result = [];
        foreach ($items as $item) {
            $newItem = [];
            $html = $this->getHtml($item);
            $this->zendParser->setRawHtml($html);
            $link = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($this->xpathLink);
            $link = $this->getFirstValue($link);
            if (preg_match('/^http(s)?:\/\/.*$/i', $link)) {
                $newItem['link'] = $link;
            } else {
                // Relative link: prefix it with the configured site base URL.
                $newItem['link'] = $this->siteBaseUrl.$link;
            }
            if ($this->xpathTitle) {
                // NOTE(review): the title is queried against the whole page,
                // not the current item - confirm this is intended.
                $this->zendParser->setRawHtml($pageHtml);
                $title = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($this->xpathTitle);
                $newItem['title'] = $this->getFirstValue($title);
            }
            if ($this->goIntoCard && $newItem['link']) {
                $this->zendParser->setUrl($newItem['link']);
                // array_replace keeps keys as-is and lets card fields override
                // 'link'/'title', exactly like direct assignment would.
                $newItem = \array_replace($newItem, $this->extractCardFields());
            }
            $this->handleCallbacks($newItem);
            $result[] = $newItem;
            // Do not parse more items than the configured limit, if any.
            if ($this->resultLimit && \count($result) >= $this->resultLimit) {
                break;
            }
        }
        return $result;
    }

    /**
     * Parses site search results: appends the query to $searchUrl and runs
     * the catalog scenario on the resulting URL.
     *
     * @param string $textQuery
     * @return array
     */
    public function parseSearch($textQuery)
    {
        $this->catalogUrl = $this->searchUrl.$textQuery;
        return $this->parseUrl();
    }

    /**
     * Parses the single page at $pageUrl using the $xpathOnCard map.
     *
     * @return array extracted fields keyed like $xpathOnCard (empty when no XPaths are configured)
     */
    public function parseCard()
    {
        $this->zendParser->setUrl($this->pageUrl);
        // Always an array, so callbacks and callers never receive an undefined value.
        $newItem = $this->extractCardFields();
        $this->handleCallbacks($newItem);
        return $newItem;
    }

    /**
     * Evaluates every $xpathOnCard expression against the parser's current
     * source and returns the extracted values keyed by parameter name.
     *
     * @return array
     */
    private function extractCardFields(): array
    {
        $fields = [];
        foreach ($this->xpathOnCard as $param => $xpath) {
            $this->onBeforeDom();
            $nodes = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($xpath);
            if (\in_array($param, $this->xpathOnCardMany, true)) {
                $fields[$param] = $this->getAllValues($nodes);
            } elseif (\in_array($param, $this->xpathOnCardHtml, true)) {
                $fields[$param] = $this->getFirstValueHtml($nodes);
            } else {
                $fields[$param] = $this->getFirstValue($nodes);
            }
        }
        return $fields;
    }

    /**
     * Runs parseCard() for every URL produced by the configured generator.
     *
     * @return array list of parseCard() results, one per URL
     * @throws ParserInvalidConfigException when no callable generator is configured
     */
    public function parseGenerator()
    {
        $generator = $this->urlGenerator;
        if (!\is_callable($generator)) {
            throw new ParserInvalidConfigException('The urlGenerator must be a callable returning an iterable of URLs');
        }
        $results = [];
        foreach ($generator() as $url) {
            $this->pageUrl = $url;
            $results[] = $this->parseCard();
        }
        return $results;
    }

    /**
     * Invokes every registered callback with the item and the current $pageUrl.
     * The item is taken by reference; presumably so that callbacks declaring a
     * by-reference parameter can modify it.
     *
     * @param mixed $lastItem
     * @return $this
     */
    public function handleCallbacks(&$lastItem)
    {
        foreach ($this->callbacks as $callbac) {
            $callbac($lastItem, $this->pageUrl);
        }
        return $this;
    }

    /**
     * Invokes the optional before-DOM hook with this parser instance.
     *
     * @return $this
     */
    public function onBeforeDom()
    {
        $callback = $this->beforeDomCallback;
        if ($callback) {
            $callback($this);
        }
        return $this;
    }

    /**
     * @return string
     */
    public function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * @param string $encoding
     * @return $this
     */
    public function setEncoding(string $encoding)
    {
        $this->encoding = $encoding;
        return $this;
    }

    /**
     * Converts a value from the site encoding to UTF-8 (no-op for UTF-8 sites).
     *
     * @param string $value
     * @return false|string false when iconv() fails
     */
    private function proccessValue($value)
    {
        if ($this->getEncoding() === 'UTF-8') {
            return $value;
        }
        return \iconv($this->getEncoding(), 'UTF-8', $value);
    }

    /**
     * @return string configured mechanism, or 'curl' when none is set
     */
    public function getTypeMech(): string
    {
        return $this->typeMech ?: 'curl';
    }

    /**
     * @return ZendBasedParser
     */
    public function getZendParser(): ZendBasedParser
    {
        return $this->zendParser;
    }

    /**
     * @param string $typeMech
     * @return $this
     */
    public function setTypeMech(string $typeMech)
    {
        $this->typeMech = $typeMech;
        return $this;
    }

    /**
     * @param mixed $catalogUrl
     * @return $this
     */
    public function setCatalogUrl($catalogUrl)
    {
        $this->catalogUrl = $catalogUrl;
        return $this;
    }

    /**
     * @param mixed $searchUrl
     * @return $this
     */
    public function setSearchUrl($searchUrl)
    {
        $this->searchUrl = $searchUrl;
        return $this;
    }

    /**
     * @param mixed $pageUrl
     * @return $this
     */
    public function setPageUrl($pageUrl)
    {
        $this->pageUrl = $pageUrl;
        return $this;
    }

    /**
     * @param bool $forceOuterHtml
     * @return $this
     */
    public function setForceOuterHtml(bool $forceOuterHtml)
    {
        $this->forceOuterHtml = $forceOuterHtml;
        return $this;
    }

    /**
     * @param mixed $urlGenerator callable returning an iterable of URLs
     * @return $this
     */
    public function setUrlGenerator($urlGenerator)
    {
        $this->urlGenerator = $urlGenerator;
        return $this;
    }

    /**
     * @param string $siteBaseUrl
     * @return $this
     */
    public function setSiteBaseUrl(string $siteBaseUrl)
    {
        $this->siteBaseUrl = $siteBaseUrl;
        return $this;
    }

    /**
     * @param int|false $resultLimit maximum number of items, or false for no limit
     * @return $this
     */
    public function setResultLimit($resultLimit)
    {
        $this->resultLimit = $resultLimit;
        return $this;
    }

    /**
     * @param int $sleepAfterRequest
     * @return $this
     */
    public function setSleepAfterRequest(int $sleepAfterRequest)
    {
        $this->sleepAfterRequest = $sleepAfterRequest;
        return $this;
    }

    /**
     * @param bool $goIntoCard
     * @return $this
     */
    public function setGoIntoCard(bool $goIntoCard)
    {
        $this->goIntoCard = $goIntoCard;
        return $this;
    }

    /**
     * @param string $xpathItem
     * @return $this
     */
    public function setXpathItem(string $xpathItem)
    {
        $this->xpathItem = $xpathItem;
        return $this;
    }

    /**
     * @param string $xpathLink
     * @return $this
     */
    public function setXpathLink(string $xpathLink)
    {
        $this->xpathLink = $xpathLink;
        return $this;
    }

    /**
     * Replaces the card XPath map as-is. Unlike the constructor, this does
     * not strip "MANY" / "HTML" markers from the expressions.
     *
     * @param array $xpathOnCard
     * @return $this
     */
    public function setXpathOnCard(array $xpathOnCard)
    {
        $this->xpathOnCard = $xpathOnCard;
        return $this;
    }

    /**
     * @param array $callbacks
     * @return $this
     */
    public function setCallbacks(array $callbacks)
    {
        $this->callbacks = $callbacks;
        return $this;
    }

    /**
     * @param array $xpathOnCardMany keys of $xpathOnCard that must return all matched values
     * @return $this
     */
    public function setXpathOnCardMany(array $xpathOnCardMany)
    {
        $this->xpathOnCardMany = $xpathOnCardMany;
        return $this;
    }

    /**
     * @param array $xpathOnCardHtml keys of $xpathOnCard that must return HTML
     * @return $this
     */
    public function setXpathOnCardHtml(array $xpathOnCardHtml)
    {
        $this->xpathOnCardHtml = $xpathOnCardHtml;
        return $this;
    }

    /**
     * @param ZendBasedParser $zendParser
     * @return $this
     */
    public function setZendParser(ZendBasedParser $zendParser)
    {
        $this->zendParser = $zendParser;
        return $this;
    }

    /**
     * @param mixed $beforeDomCallback callable invoked as $callback($parser)
     * @return $this
     */
    public function setBeforeDomCallback($beforeDomCallback)
    {
        $this->beforeDomCallback = $beforeDomCallback;
        return $this;
    }

    /**
     * @return string
     */
    public function getXpathTitle()
    {
        return $this->xpathTitle;
    }

    /**
     * @param string $xpathTitle
     * @return $this
     */
    public function setXpathTitle($xpathTitle)
    {
        $this->xpathTitle = $xpathTitle;
        return $this;
    }
}
636