Completed
Push — master ( 2c4486...36c3d6 )
by Kosuha
04:23
created

HtmlUniParser::getFirstValue()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 2
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 5
rs 10
1
<?php
2
declare(strict_types=1);
3
4
namespace kosuha606\HtmlUniParser;
5
6
use Assert\Assertion;
7
use kosuha606\HtmlUniParser\action\ComposeHtmlAction;
8
use kosuha606\HtmlUniParser\action\GetFirstMatchAction;
9
use kosuha606\HtmlUniParser\action\InitializeHtmlUniParserAction;
10
11
/**
 * The main entrance point for work with the package.
 *
 * Holds the parsing configuration (URLs, XPath expressions, callbacks) and
 * drives a ZendBasedParser instance through one of the supported scenarios:
 * catalog ({@see parseUrl()}), search ({@see parseSearch()}), single page
 * ({@see parseCard()}) or generated URLs ({@see parseGenerator()}).
 *
 * @package kosuha606\HtmlUniParser
 */
class HtmlUniParser extends BaseObject
{
    /**
     * URL used by the catalog parsing scenario.
     * @var string|null
     */
    protected $catalogUrl;

    /**
     * URL prefix used by the site search scenario; the text query is appended to it.
     * @var string|null
     */
    protected $searchUrl;

    /**
     * URL used by the single page (card) parsing scenario.
     * @var string|null
     */
    protected $pageUrl;

    /**
     * Force the parser to take the outer HTML of nodes.
     * @var boolean
     */
    protected $forceOuterHtml = false;

    /**
     * Callable that returns the URLs to parse in the generator scenario.
     * @var callable|null
     */
    protected $urlGenerator;

    /**
     * Callable invoked with this parser instance before the DOM is queried.
     * @var callable|null
     */
    protected $beforeDomCallback;

    /**
     * Encoding of the parsed site.
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Prefix prepended to links that do not start with http:// or https://.
     * @var string
     */
    protected $siteBaseUrl = '/';

    /**
     * Maximum number of catalog items to parse, or false for no limit.
     * @var int|false
     */
    protected $resultLimit = false;

    /**
     * Value forwarded to ZendBasedParser::setSleepAfterRequest().
     * @var int
     */
    protected $sleepAfterRequest = 0;

    /**
     * Whether the catalog scenario should also open every item link
     * and extract the card parameters from it.
     * @var bool
     */
    protected $goIntoCard = false;

    /**
     * XPath selecting the catalog items.
     * @var string
     */
    protected $xpathItem;

    /**
     * XPath selecting the link inside a catalog item.
     * @var string
     */
    protected $xpathLink;

    /**
     * XPath selecting the title.
     * @var string
     */
    protected $xpathTitle;

    /**
     * Second argument passed to ZendBasedParser::dom(); see getTypeMech() for the default.
     * @var string
     */
    protected $typeMech;

    /**
     * Map of result parameter name => XPath evaluated on a card page.
     * @var array
     */
    protected $xpathOnCard = [];

    /**
     * Callables invoked for every parsed item.
     * @var array
     */
    protected $callbacks = [];

    /**
     * Names of card parameters whose result must be multiple (all matched values).
     * @var array
     */
    private $xpathOnCardMany = [];

    /**
     * Names of card parameters whose result must be returned as HTML.
     * @var array
     */
    private $xpathOnCardHtml = [];

    /** @var ZendBasedParser */
    private $zendParser;

    /**
     * HtmlUniParser constructor.
     *
     * @param mixed $config configuration forwarded to the parent constructor
     * @param ZendBasedParser $parser parser handed to InitializeHtmlUniParserAction
     * @throws \Assert\AssertionFailedException
     */
    public function __construct($config, ZendBasedParser $parser)
    {
        parent::__construct($config);
        $this->checkPhpExtensions();
        InitializeHtmlUniParserAction::do($this, $parser);
    }

    /**
     * Makes sure the PHP extensions the parser relies on are loaded.
     *
     * @throws \Assert\AssertionFailedException when dom or iconv is missing
     */
    private function checkPhpExtensions()
    {
        // get_loaded_extensions() is a numerically indexed list, so a check for
        // a string key on it can never fail; ask PHP about each extension instead.
        Assertion::true(
            \extension_loaded('dom'),
            'The dom extension in not loaded in system. HtmlUniParser cant work'
        );
        Assertion::true(
            \extension_loaded('iconv'),
            'The iconv extension in not loaded in system. HtmlUniParser cant work'
        );
    }

    /**
     * @return array map of parameter name => XPath used on card pages
     */
    public function getXpathOnCard()
    {
        return $this->xpathOnCard;
    }

    /**
     * You can create instances of this class
     * by yourself or you can use this method.
     *
     * Builds the instance through Factory::createObject(), passing the config
     * and a factory-created ZendBasedParser as constructor arguments.
     *
     * @param mixed $config
     * @return HtmlUniParser
     * @throws \Assert\AssertionFailedException
     * @throws \ReflectionException
     */
    public static function create($config)
    {
        $parser = Factory::createObject(
            [
                'class' => ZendBasedParser::class,
            ]
        );

        return Factory::createObject(
            [
                'class' => static::class,
            ],
            [
                $config,
                $parser,
            ]
        );
    }

    /**
     * Appends one callback to the list run by handleCallbacks().
     *
     * @param mixed $callbacs callable invoked as ($item, $pageUrl)
     */
    public function addCallback($callbacs)
    {
        $this->callbacks[] = $callbacs;
    }

    /**
     * Delegates to ComposeHtmlAction to build the HTML of a node.
     *
     * @param mixed $node
     * @return string
     * @throws \Assert\AssertionFailedException
     */
    public function composeHtml($node): string
    {
        return ComposeHtmlAction::do($this, $node);
    }

    /**
     * Serializes the node itself (not only its children) through its owner document.
     *
     * @param mixed $node presumably a \DOMNode attached to a document
     * @return mixed result of DOMDocument::saveXML()
     */
    public function queryOuterHtml($node)
    {
        return $node->ownerDocument->saveXML($node);
    }

    /**
     * Returns the text value of the first node picked by GetFirstMatchAction.
     *
     * @param mixed $nodes
     * @return string
     * @throws \Assert\AssertionFailedException
     */
    public function getFirstValue($nodes)
    {
        $val = GetFirstMatchAction::do($nodes);

        return $this->getValue($val);
    }

    /**
     * Returns the HTML of the first node picked by GetFirstMatchAction,
     * with <img> tags stripped.
     *
     * @param mixed $nodes
     * @return string|string[]|null empty string when no usable node was matched
     */
    public function getFirstValueHtml($nodes)
    {
        $val = GetFirstMatchAction::do($nodes);
        // Nothing matched (or the node has no document): there is no HTML to
        // serialize, so behave like getFirstValue() and yield an empty value.
        if (!$val instanceof \DOMNode || $val->ownerDocument === null) {
            return '';
        }
        $html = $val->ownerDocument->saveHTML($val);
        // Remove images from the parsed text
        $html = preg_replace("/<img[^>]+\>/i", "", $html);

        return $this->proccessValue($html);
    }

    /**
     * Extracts the trimmed text value of a DOM node, converted to UTF-8.
     *
     * Attributes yield their value, any other DOM node yields its nodeValue;
     * anything else (including null) yields an empty string.
     *
     * @param mixed $val
     * @return string
     */
    public function getValue($val)
    {
        $result = null;
        if ($val instanceof \DOMAttr) {
            $result = $this->valueStub($val, 'value');
        } elseif ($val instanceof \DOMNode) {
            $result = $this->valueStub($val, 'nodeValue');
        }

        // The cast matters: under strict_types trim(null) is a TypeError.
        return $this->proccessValue(trim((string) $result));
    }

    /**
     * Applies getValue() to every node of the given collection.
     *
     * @param iterable $nodes
     * @return array
     */
    public function getAllValues($nodes)
    {
        $result = [];
        foreach ($nodes as $node) {
            $result[] = $this->getValue($node);
        }

        return $result;
    }

    /**
     * Parses a URL, possibly a catalog.
     *
     * Loads the catalog URL, selects the items with xpathItem and builds one
     * result entry per item: 'link', optionally 'title', and, when goIntoCard
     * is enabled, the card parameters read from the item's own page.
     *
     * @return array list of parsed items
     * @throws \Assert\AssertionFailedException
     */
    public function parseUrl()
    {
        $this->zendParser->setSleepAfterRequest($this->sleepAfterRequest);
        $this->zendParser->setUrl($this->catalogUrl);
        $this->onBeforeDom();
        $items = $this->runXpathQuery($this->xpathItem);
        $pageHtml = $this->zendParser->getHtmlBuffer();
        $result = [];
        foreach ($items as $index => $item) {
            $newItem = [];
            $html = $this->composeHtml($item);
            $this->zendParser->setRawHtml($html);
            // proccessValue() may yield false when iconv fails; normalize so
            // preg_match() always receives a string.
            $link = (string) $this->getFirstValue($this->runXpathQuery($this->xpathLink));
            if (preg_match('/^http(s)?:\/\/.*$/i', $link)) {
                $newItem['link'] = $link;
            } else {
                $newItem['link'] = $this->siteBaseUrl.$link;
            }
            if ($this->xpathTitle) {
                // The title is looked up in the whole catalog page, not in the item.
                $this->zendParser->setRawHtml($pageHtml);
                $newItem['title'] = $this->getFirstValue($this->runXpathQuery($this->xpathTitle));
            }
            if ($this->goIntoCard && $newItem['link']) {
                $this->zendParser->setUrl($newItem['link']);
                $newItem = $this->collectCardValues($newItem);
            }
            $this->handleCallbacks($newItem);
            $result[] = $newItem;
            // Do not parse more than the limit if a limit is set
            if ($this->resultLimit && $this->resultLimit <= ($index + 1)) {
                break;
            }
        }

        return $result;
    }

    /**
     * Parses site search results.
     *
     * Appends the query to searchUrl as-is (no URL encoding is applied here)
     * and runs the catalog scenario on the resulting URL.
     *
     * @param string $textQuery
     * @return array
     * @throws \Assert\AssertionFailedException
     */
    public function parseSearch($textQuery)
    {
        $this->catalogUrl = $this->searchUrl.$textQuery;

        return $this->parseUrl();
    }

    /**
     * Parses a single card (page).
     *
     * @return array map of parameter name => extracted value; empty when no
     *               card XPaths are configured
     * @throws \Assert\AssertionFailedException
     */
    public function parseCard()
    {
        $this->zendParser->setUrl($this->pageUrl);
        // Start from an empty array so the result is defined even when
        // xpathOnCard is empty.
        $newItem = $this->collectCardValues([]);
        $this->handleCallbacks($newItem);

        return $newItem;
    }

    /**
     * Parses every URL produced by the configured URL generator as a card.
     *
     * @return array list of parseCard() results
     * @throws \Assert\AssertionFailedException
     */
    public function parseGenerator()
    {
        $generator = $this->urlGenerator;
        $results = [];
        foreach ($generator() as $url) {
            $this->pageUrl = $url;
            $results[] = $this->parseCard();
        }

        return $results;
    }

    /**
     * Runs all registered callbacks for a parsed item.
     *
     * Each callback is invoked as ($lastItem, $this->pageUrl); a callback that
     * declares its first parameter by reference can modify the item.
     *
     * @param mixed $lastItem
     * @return HtmlUniParser
     */
    public function handleCallbacks(&$lastItem)
    {
        // Iterate by value: a by-reference loop variable would stay bound to
        // the last element of $this->callbacks after the loop.
        foreach ($this->callbacks as $callback) {
            $callback($lastItem, $this->pageUrl);
        }

        return $this;
    }

    /**
     * Invokes the "before DOM" callback, if one is configured, with this instance.
     *
     * @return $this
     */
    public function onBeforeDom()
    {
        $callback = $this->beforeDomCallback;
        if ($callback) {
            $callback($this);
        }

        return $this;
    }

    /**
     * @return string
     */
    public function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * @param string $encoding
     * @return HtmlUniParser
     */
    public function setEncoding(string $encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }

    /**
     * Converts a value from the site encoding to UTF-8.
     *
     * The value is returned untouched when the configured encoding is
     * exactly 'UTF-8'.
     *
     * @param mixed $value
     * @return false|string false when iconv() fails
     */
    private function proccessValue($value)
    {
        if ($this->getEncoding() === 'UTF-8') {
            return $value;
        }

        return \iconv($this->getEncoding(), 'UTF-8', $value);
    }

    /**
     * @return string the configured mechanism, or 'curl' when none is set
     */
    public function getTypeMech(): string
    {
        return $this->typeMech ?: 'curl';
    }

    /**
     * @return ZendBasedParser
     */
    public function getZendParser(): ZendBasedParser
    {
        return $this->zendParser;
    }

    /**
     * @param string $typeMech
     * @return HtmlUniParser
     */
    public function setTypeMech(string $typeMech)
    {
        $this->typeMech = $typeMech;

        return $this;
    }

    /**
     * @param mixed $catalogUrl
     * @return HtmlUniParser
     */
    public function setCatalogUrl($catalogUrl)
    {
        $this->catalogUrl = $catalogUrl;

        return $this;
    }

    /**
     * @param mixed $searchUrl
     * @return HtmlUniParser
     */
    public function setSearchUrl($searchUrl)
    {
        $this->searchUrl = $searchUrl;

        return $this;
    }

    /**
     * @param mixed $pageUrl
     * @return HtmlUniParser
     */
    public function setPageUrl($pageUrl)
    {
        $this->pageUrl = $pageUrl;

        return $this;
    }

    /**
     * @param bool $forceOuterHtml
     * @return HtmlUniParser
     */
    public function setForceOuterHtml(bool $forceOuterHtml)
    {
        $this->forceOuterHtml = $forceOuterHtml;

        return $this;
    }

    /**
     * @return bool
     */
    public function isForceOuterHtml(): bool
    {
        return $this->forceOuterHtml;
    }

    /**
     * @param mixed $urlGenerator callable returning an iterable of URLs
     * @return HtmlUniParser
     */
    public function setUrlGenerator($urlGenerator)
    {
        $this->urlGenerator = $urlGenerator;

        return $this;
    }

    /**
     * @param string $siteBaseUrl
     * @return HtmlUniParser
     */
    public function setSiteBaseUrl(string $siteBaseUrl)
    {
        $this->siteBaseUrl = $siteBaseUrl;

        return $this;
    }

    /**
     * @param int|false $resultLimit maximum number of catalog items, false for no limit
     * @return HtmlUniParser
     */
    public function setResultLimit($resultLimit)
    {
        $this->resultLimit = $resultLimit;

        return $this;
    }

    /**
     * @param int $sleepAfterRequest
     * @return HtmlUniParser
     */
    public function setSleepAfterRequest(int $sleepAfterRequest)
    {
        $this->sleepAfterRequest = $sleepAfterRequest;

        return $this;
    }

    /**
     * @param bool $goIntoCard
     * @return HtmlUniParser
     */
    public function setGoIntoCard(bool $goIntoCard)
    {
        $this->goIntoCard = $goIntoCard;

        return $this;
    }

    /**
     * @param string $xpathItem
     * @return HtmlUniParser
     */
    public function setXpathItem(string $xpathItem)
    {
        $this->xpathItem = $xpathItem;

        return $this;
    }

    /**
     * @param string $xpathLink
     * @return HtmlUniParser
     */
    public function setXpathLink(string $xpathLink)
    {
        $this->xpathLink = $xpathLink;

        return $this;
    }

    /**
     * @param array $xpathOnCard map of parameter name => XPath
     * @return HtmlUniParser
     */
    public function setXpathOnCard(array $xpathOnCard)
    {
        $this->xpathOnCard = $xpathOnCard;

        return $this;
    }

    /**
     * Replaces the whole callback list.
     *
     * @param array $callbacks
     * @return HtmlUniParser
     */
    public function setCallbacks(array $callbacks)
    {
        $this->callbacks = $callbacks;

        return $this;
    }

    /**
     * @param array $xpathOnCardMany names of parameters that collect all matched values
     * @return HtmlUniParser
     */
    public function setXpathOnCardMany(array $xpathOnCardMany)
    {
        $this->xpathOnCardMany = $xpathOnCardMany;

        return $this;
    }

    /**
     * @param array $xpathOnCardHtml names of parameters returned as HTML
     * @return HtmlUniParser
     */
    public function setXpathOnCardHtml(array $xpathOnCardHtml)
    {
        $this->xpathOnCardHtml = $xpathOnCardHtml;

        return $this;
    }

    /**
     * @param ZendBasedParser $zendParser
     * @return HtmlUniParser
     */
    public function setZendParser(ZendBasedParser $zendParser)
    {
        $this->zendParser = $zendParser;

        return $this;
    }

    /**
     * @param mixed $beforeDomCallback callable invoked with this parser instance
     * @return HtmlUniParser
     */
    public function setBeforeDomCallback($beforeDomCallback)
    {
        $this->beforeDomCallback = $beforeDomCallback;

        return $this;
    }

    /**
     * @return string
     */
    public function getXpathTitle()
    {
        return $this->xpathTitle;
    }

    /**
     * @param string $xpathTitle
     * @return HtmlUniParser
     */
    public function setXpathTitle($xpathTitle)
    {
        $this->xpathTitle = $xpathTitle;

        return $this;
    }

    /**
     * Reads a property from an object if that property exists.
     *
     * @param mixed $object
     * @param string $method name of the property to read
     * @return mixed|null null when $object is not an object or lacks the property
     */
    private function valueStub($object, $method)
    {
        if (\is_object($object) && \property_exists($object, $method)) {
            return $object->{$method};
        }

        return null;
    }

    /**
     * Runs an XPath query against the DOM currently loaded in the parser,
     * using the configured encoding and mechanism.
     *
     * @param mixed $xpath
     * @return mixed whatever ZendBasedParser's DOM wrapper returns from queryXpath()
     */
    private function runXpathQuery($xpath)
    {
        return $this->zendParser
            ->dom($this->getEncoding(), $this->getTypeMech())
            ->queryXpath($xpath);
    }

    /**
     * Evaluates every configured card XPath against the current page and
     * stores the extracted values in the given item under the parameter name.
     *
     * Parameters listed in xpathOnCardMany get all matched values, those in
     * xpathOnCardHtml get the first match as HTML, the rest get the first
     * match as text. The "before DOM" callback runs before each query.
     *
     * @param array $item item to extend; existing keys with the same name are overwritten
     * @return array the extended item
     * @throws \Assert\AssertionFailedException
     */
    private function collectCardValues(array $item): array
    {
        foreach ($this->xpathOnCard as $param => $xpath) {
            $this->onBeforeDom();
            $nodes = $this->runXpathQuery($xpath);
            if (in_array($param, $this->xpathOnCardMany)) {
                $item[$param] = $this->getAllValues($nodes);
            } elseif (in_array($param, $this->xpathOnCardHtml)) {
                $item[$param] = $this->getFirstValueHtml($nodes);
            } else {
                $item[$param] = $this->getFirstValue($nodes);
            }
        }

        return $item;
    }
}
686