HtmlUniParser::setSiteBaseUrl()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 2
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 5
rs 10
1
<?php
2
declare(strict_types=1);
3
4
namespace kosuha606\HtmlUniParser;
5
6
use Assert\Assertion;
7
use kosuha606\HtmlUniParser\action\ComposeHtmlAction;
8
use kosuha606\HtmlUniParser\action\GetFirstMatchAction;
9
use kosuha606\HtmlUniParser\action\InitializeHtmlUniParserAction;
10
use kosuha606\HtmlUniParser\action\ValueStubAction;
11
12
/**
 * The main entry point for working with the package.
 *
 * Holds the scraping configuration (URLs, XPath expressions, callbacks) and
 * drives a ZendBasedParser to extract data in one of several scenarios:
 * catalog ({@see parseUrl()}), site search ({@see parseSearch()}), a single
 * page ({@see parseCard()}) or a list of generated URLs ({@see parseGenerator()}).
 *
 * @package kosuha606\HtmlUniParser
 */
class HtmlUniParser extends BaseObject
{
    /**
     * URL used by the catalog scenario; passed to the parser in parseUrl().
     * @var mixed
     */
    protected $catalogUrl;

    /**
     * URL prefix used by the site-search scenario; the text query is appended to it.
     * @var mixed
     */
    protected $searchUrl;

    /**
     * URL used by the single-page scenario; also handed to every result callback.
     * @var mixed
     */
    protected $pageUrl;

    /**
     * Force the parser to obtain outer HTML.
     * @var bool
     */
    protected $forceOuterHtml = false;

    /**
     * Callable invoked without arguments that returns the URLs to parse
     * in the generator scenario.
     * @var callable|null
     */
    protected $urlGenerator;

    /**
     * Optional callable invoked with this instance right before the DOM is queried.
     * @var callable|null
     */
    protected $beforeDomCallback;

    /**
     * Encoding of the site being parsed.
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Prefix prepended to links that do not start with http:// or https://.
     * @var string
     */
    protected $siteBaseUrl = '/';

    /**
     * Maximum number of catalog items to collect; false means "no limit".
     * @var int|bool
     */
    protected $resultLimit = false;

    /**
     * Value forwarded to ZendBasedParser::setSleepAfterRequest() by parseUrl().
     * @var int
     */
    protected $sleepAfterRequest = 0;

    /**
     * Whether parseUrl() should open every found link and collect the card fields.
     * @var bool
     */
    protected $goIntoCard = false;

    /**
     * XPath selecting the catalog items.
     * @var string
     */
    protected $xpathItem;

    /**
     * XPath selecting the link inside a catalog item.
     * @var string
     */
    protected $xpathLink;

    /**
     * Optional XPath selecting the item title.
     * @var string
     */
    protected $xpathTitle;

    /**
     * Fetch mechanism passed to ZendBasedParser::dom(); see getTypeMech() for the default.
     * @var string
     */
    protected $typeMech;

    /**
     * Map of result field name => XPath evaluated on a card page.
     * @var array
     */
    protected $xpathOnCard = [];

    /**
     * Callables invoked for every parsed item, see handleCallbacks().
     * @var array
     */
    protected $callbacks = [];

    /**
     * Names of card fields whose result must contain all matches, not only the first.
     * @var array
     */
    private $xpathOnCardMany = [];

    /**
     * Names of card fields whose result must be returned as HTML.
     * @var array
     */
    private $xpathOnCardHtml = [];

    /** @var ZendBasedParser */
    private $zendParser;

    /**
     * HtmlUniParser constructor.
     *
     * @param mixed $config Configuration forwarded unchanged to the BaseObject constructor.
     * @param ZendBasedParser $parser Parser handed to InitializeHtmlUniParserAction
     *                                (presumably installed as the working parser - confirm there).
     * @throws \Assert\AssertionFailedException
     */
    public function __construct($config, ZendBasedParser $parser)
    {
        parent::__construct($config);
        $this->checkPhpExtensions();
        InitializeHtmlUniParserAction::do($this, $parser);
    }

    /**
     * Verifies that the PHP extensions this class relies on are loaded.
     *
     * @throws \Assert\AssertionFailedException When "dom" or "iconv" is not loaded.
     */
    private function checkPhpExtensions()
    {
        // get_loaded_extensions() returns a numerically indexed list, so a
        // key-based assertion on it can never fail; ask PHP about each
        // extension directly instead.
        Assertion::true(
            \extension_loaded('dom'),
            'The dom extension in not loaded in system. HtmlUniParser cant work'
        );
        Assertion::true(
            \extension_loaded('iconv'),
            'The iconv extension in not loaded in system. HtmlUniParser cant work'
        );
    }

    /**
     * @return array Map of result field name => XPath evaluated on a card page.
     */
    public function getXpathOnCard()
    {
        return $this->xpathOnCard;
    }

    /**
     * Builds an instance through the package Factory.
     *
     * You can create instances of this class by yourself or you can use this
     * method, which also creates the ZendBasedParser dependency via the Factory.
     *
     * @param mixed $config Configuration passed as the first constructor argument.
     * @return HtmlUniParser
     * @throws \Assert\AssertionFailedException
     * @throws \ReflectionException
     */
    public static function create($config)
    {
        $parser = Factory::createObject(
            [
                'class' => ZendBasedParser::class,
            ]
        );

        return Factory::createObject(
            [
                'class' => static::class,
            ],
            [
                $config,
                $parser,
            ]
        );
    }

    /**
     * Registers one more result callback.
     *
     * @param mixed $callbacs Callable invoked as ($item, $pageUrl) by handleCallbacks().
     */
    public function addCallback($callbacs)
    {
        $this->callbacks[] = $callbacs;
    }

    /**
     * Builds the HTML for a node by delegating to ComposeHtmlAction.
     *
     * @param mixed $node Node taken from an XPath query result.
     * @return string
     * @throws \Assert\AssertionFailedException
     */
    public function composeHtml($node): string
    {
        return ComposeHtmlAction::do($this, $node);
    }

    /**
     * Serialises a node, including its own tag, through its owner document.
     *
     * @param mixed $node Expected to be a DOM node (its ownerDocument is used).
     * @return mixed Result of DOMDocument::saveXML() for the node.
     */
    public function queryOuterHtml($node)
    {
        return $node->ownerDocument->saveXML($node);
    }

    /**
     * Returns the processed text value of the first match in a node list.
     *
     * @param mixed $nodes XPath query result.
     * @return string
     * @throws \Assert\AssertionFailedException
     */
    public function getFirstValue($nodes)
    {
        return $this->getValue(GetFirstMatchAction::do($nodes));
    }

    /**
     * Returns the HTML of the first match in a node list with <img> tags removed.
     *
     * @param mixed $nodes XPath query result.
     * @return string|string[]|null
     * @throws \Assert\AssertionFailedException
     */
    public function getFirstValueHtml($nodes)
    {
        /** @var \DOMElement $val */
        $val = GetFirstMatchAction::do($nodes);
        $html = $val->ownerDocument->saveHTML($val);
        // Remove images from the parsed markup.
        $html = preg_replace("/<img[^>]+\>/i", "", $html);

        return $this->proccessValue($html);
    }

    /**
     * Extracts the trimmed, encoding-converted text of an attribute or element.
     *
     * Any other kind of value yields an empty string.
     *
     * @param mixed $val DOMAttr, DOMElement or anything else.
     * @return string
     * @throws \Assert\AssertionFailedException
     */
    public function getValue($val)
    {
        $result = null;
        if ($val instanceof \DOMAttr) {
            $result = ValueStubAction::do($val, 'value');
        } elseif ($val instanceof \DOMElement) {
            $result = ValueStubAction::do($val, 'nodeValue');
        }

        // With strict_types enabled trim(null) raises a TypeError, which used
        // to happen for every value that is neither an attribute nor an
        // element; normalise to a string first.
        return $this->proccessValue(trim((string) $result));
    }

    /**
     * Returns the processed text value of every node in the list.
     *
     * @param mixed $nodes Iterable XPath query result.
     * @return array
     */
    public function getAllValues($nodes)
    {
        $values = [];
        foreach ($nodes as $node) {
            $values[] = $this->getValue($node);
        }

        return $values;
    }

    /**
     * Parses the page at $catalogUrl as a catalog of items.
     *
     * For every node matched by $xpathItem an item array is built with a
     * "link" key, a "title" key when $xpathTitle is set, and - when
     * $goIntoCard is enabled - one key per entry of $xpathOnCard read from the
     * linked page. Result callbacks run for every item.
     *
     * @return array List of item arrays, at most $resultLimit entries when a limit is set.
     * @throws \Assert\AssertionFailedException
     */
    public function parseUrl()
    {
        $this->zendParser->setSleepAfterRequest($this->sleepAfterRequest);
        $this->zendParser->setUrl($this->catalogUrl);
        $this->onBeforeDom();
        $items = $this->queryDom($this->xpathItem);
        // Keep the catalog page markup: the parser is re-pointed at item
        // fragments and card pages inside the loop.
        $pageHtml = $this->zendParser->getHtmlBuffer();
        $result = [];
        foreach ($items as $index => $item) {
            $newItem = [];
            $this->zendParser->setRawHtml($this->composeHtml($item));
            $link = $this->getFirstValue($this->queryDom($this->xpathLink));
            // Absolute links are kept as they are, everything else is
            // prefixed with the configured site base URL.
            if (preg_match('/^http(s)?:\/\/.*$/i', $link)) {
                $newItem['link'] = $link;
            } else {
                $newItem['link'] = $this->siteBaseUrl.$link;
            }
            if ($this->xpathTitle) {
                // NOTE(review): the title is queried against the whole catalog
                // page, not the item fragment, so an item-relative XPath would
                // yield the same first match for every item - confirm intended.
                $this->zendParser->setRawHtml($pageHtml);
                $newItem['title'] = $this->getFirstValue($this->queryDom($this->xpathTitle));
            }
            if ($this->goIntoCard && $newItem['link']) {
                $this->zendParser->setUrl($newItem['link']);
                foreach ($this->xpathOnCard as $param => $xpath) {
                    $newItem[$param] = $this->extractCardValue($param, $xpath);
                }
            }
            $this->handleCallbacks($newItem);
            $result[] = $newItem;
            // Do not parse more items than the limit, if a limit is set.
            if ($this->resultLimit && $this->resultLimit <= ($index + 1)) {
                break;
            }
        }

        return $result;
    }

    /**
     * Parses the search results for a text query.
     *
     * Overwrites $catalogUrl with $searchUrl followed by the query and then
     * runs the catalog scenario. The query is appended as given; no URL
     * encoding is applied here.
     *
     * @param mixed $textQuery Text appended to the search URL.
     * @return array Same structure as parseUrl().
     * @throws \Assert\AssertionFailedException
     */
    public function parseSearch($textQuery)
    {
        $this->catalogUrl = $this->searchUrl.$textQuery;

        return $this->parseUrl();
    }

    /**
     * Parses the single card located at $pageUrl.
     *
     * @return array Map of field name => value for every entry of $xpathOnCard
     *               (an empty array when no card XPaths are configured).
     * @throws \Assert\AssertionFailedException
     */
    public function parseCard()
    {
        // Initialised up front so that an empty $xpathOnCard yields an empty
        // array instead of an undefined variable turned into null.
        $newItem = [];
        $this->zendParser->setUrl($this->pageUrl);
        foreach ($this->xpathOnCard as $param => $xpath) {
            $newItem[$param] = $this->extractCardValue($param, $xpath);
        }
        $this->handleCallbacks($newItem);

        return $newItem;
    }

    /**
     * Parses every URL produced by the configured URL generator as a card.
     *
     * Each URL is written to $pageUrl before parseCard() is called.
     *
     * @return array List of parseCard() results, in generator order.
     * @throws \Assert\AssertionFailedException
     */
    public function parseGenerator()
    {
        $generator = $this->urlGenerator;
        $results = [];
        foreach ($generator() as $url) {
            $this->pageUrl = $url;
            $results[] = $this->parseCard();
        }

        return $results;
    }

    /**
     * Runs an XPath query against the document currently loaded in the parser.
     *
     * @param mixed $xpath XPath expression.
     * @return mixed Query result as returned by the parser.
     */
    private function queryDom($xpath)
    {
        return $this->zendParser
            ->dom($this->getEncoding(), $this->getTypeMech())
            ->queryXpath($xpath);
    }

    /**
     * Reads one card field from the document currently loaded in the parser.
     *
     * @param mixed $param Field name, a key of $xpathOnCard.
     * @param mixed $xpath XPath expression configured for the field.
     * @return mixed All values, the first match as HTML or the first match as
     *               text, depending on $xpathOnCardMany / $xpathOnCardHtml.
     * @throws \Assert\AssertionFailedException
     */
    private function extractCardValue($param, $xpath)
    {
        $this->onBeforeDom();
        $nodes = $this->queryDom($xpath);
        // Loose comparison is kept on purpose: PHP casts numeric-string array
        // keys to int, which would no longer match string field names in the
        // configured lists under a strict check.
        if (in_array($param, $this->xpathOnCardMany)) {
            return $this->getAllValues($nodes);
        }
        if (in_array($param, $this->xpathOnCardHtml)) {
            return $this->getFirstValueHtml($nodes);
        }

        return $this->getFirstValue($nodes);
    }

    /**
     * Passes an item through every registered callback.
     *
     * Callbacks receive the item and the current $pageUrl (also in the
     * catalog scenario) and may modify the item if they accept it by reference.
     *
     * @param mixed $lastItem Item to process, passed by reference.
     * @return HtmlUniParser
     */
    public function handleCallbacks(&$lastItem)
    {
        foreach ($this->callbacks as $callback) {
            $callback($lastItem, $this->pageUrl);
        }

        return $this;
    }

    /**
     * Invokes the "before DOM" callback, if one is configured, with this instance.
     *
     * @return $this
     */
    public function onBeforeDom()
    {
        $callback = $this->beforeDomCallback;
        if ($callback) {
            $callback($this);
        }

        return $this;
    }

    /**
     * @return string Encoding of the site being parsed.
     */
    public function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * @param string $encoding Encoding of the site being parsed.
     * @return HtmlUniParser
     */
    public function setEncoding(string $encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }

    /**
     * Converts a value from the site encoding to UTF-8.
     *
     * @param mixed $value Value in the site encoding.
     * @return false|string The value unchanged when the site encoding is
     *                      already UTF-8, otherwise the iconv() result
     *                      (false when the conversion fails).
     */
    private function proccessValue($value)
    {
        if ($this->getEncoding() === 'UTF-8') {
            return $value;
        }

        return \iconv($this->getEncoding(), 'UTF-8', $value);
    }

    /**
     * @return string Configured fetch mechanism, or 'curl' when none (or an empty one) is set.
     */
    public function getTypeMech(): string
    {
        return $this->typeMech ?: 'curl';
    }

    /**
     * @return ZendBasedParser
     */
    public function getZendParser(): ZendBasedParser
    {
        return $this->zendParser;
    }

    /**
     * @param string $typeMech Fetch mechanism passed to ZendBasedParser::dom().
     * @return HtmlUniParser
     */
    public function setTypeMech(string $typeMech)
    {
        $this->typeMech = $typeMech;

        return $this;
    }

    /**
     * @param mixed $catalogUrl URL used by the catalog scenario.
     * @return HtmlUniParser
     */
    public function setCatalogUrl($catalogUrl)
    {
        $this->catalogUrl = $catalogUrl;

        return $this;
    }

    /**
     * @param mixed $searchUrl URL prefix used by the site-search scenario.
     * @return HtmlUniParser
     */
    public function setSearchUrl($searchUrl)
    {
        $this->searchUrl = $searchUrl;

        return $this;
    }

    /**
     * @param mixed $pageUrl URL used by the single-page scenario.
     * @return HtmlUniParser
     */
    public function setPageUrl($pageUrl)
    {
        $this->pageUrl = $pageUrl;

        return $this;
    }

    /**
     * @param bool $forceOuterHtml Whether the parser must obtain outer HTML.
     * @return HtmlUniParser
     */
    public function setForceOuterHtml(bool $forceOuterHtml)
    {
        $this->forceOuterHtml = $forceOuterHtml;

        return $this;
    }

    /**
     * @return bool Whether the parser must obtain outer HTML.
     */
    public function isForceOuterHtml(): bool
    {
        return $this->forceOuterHtml;
    }

    /**
     * @param mixed $urlGenerator Callable returning the URLs for parseGenerator().
     * @return HtmlUniParser
     */
    public function setUrlGenerator($urlGenerator)
    {
        $this->urlGenerator = $urlGenerator;

        return $this;
    }

    /**
     * @param string $siteBaseUrl Prefix prepended to non-absolute links.
     * @return HtmlUniParser
     */
    public function setSiteBaseUrl(string $siteBaseUrl)
    {
        $this->siteBaseUrl = $siteBaseUrl;

        return $this;
    }

    /**
     * @param int|bool $resultLimit Maximum number of catalog items, or false for no limit.
     * @return HtmlUniParser
     */
    public function setResultLimit($resultLimit)
    {
        $this->resultLimit = $resultLimit;

        return $this;
    }

    /**
     * @param int $sleepAfterRequest Value forwarded to the parser by parseUrl().
     * @return HtmlUniParser
     */
    public function setSleepAfterRequest(int $sleepAfterRequest)
    {
        $this->sleepAfterRequest = $sleepAfterRequest;

        return $this;
    }

    /**
     * @param bool $goIntoCard Whether parseUrl() should follow item links.
     * @return HtmlUniParser
     */
    public function setGoIntoCard(bool $goIntoCard)
    {
        $this->goIntoCard = $goIntoCard;

        return $this;
    }

    /**
     * @param string $xpathItem XPath selecting the catalog items.
     * @return HtmlUniParser
     */
    public function setXpathItem(string $xpathItem)
    {
        $this->xpathItem = $xpathItem;

        return $this;
    }

    /**
     * @param string $xpathLink XPath selecting the link inside a catalog item.
     * @return HtmlUniParser
     */
    public function setXpathLink(string $xpathLink)
    {
        $this->xpathLink = $xpathLink;

        return $this;
    }

    /**
     * @param array $xpathOnCard Map of result field name => XPath.
     * @return HtmlUniParser
     */
    public function setXpathOnCard(array $xpathOnCard)
    {
        $this->xpathOnCard = $xpathOnCard;

        return $this;
    }

    /**
     * Replaces the whole list of result callbacks.
     *
     * @param array $callbacks Callables invoked as ($item, $pageUrl).
     * @return HtmlUniParser
     */
    public function setCallbacks(array $callbacks)
    {
        $this->callbacks = $callbacks;

        return $this;
    }

    /**
     * @param array $xpathOnCardMany Names of card fields that collect all matches.
     * @return HtmlUniParser
     */
    public function setXpathOnCardMany(array $xpathOnCardMany)
    {
        $this->xpathOnCardMany = $xpathOnCardMany;

        return $this;
    }

    /**
     * @param array $xpathOnCardHtml Names of card fields returned as HTML.
     * @return HtmlUniParser
     */
    public function setXpathOnCardHtml(array $xpathOnCardHtml)
    {
        $this->xpathOnCardHtml = $xpathOnCardHtml;

        return $this;
    }

    /**
     * @param ZendBasedParser $zendParser Parser used for all requests and queries.
     * @return HtmlUniParser
     */
    public function setZendParser(ZendBasedParser $zendParser)
    {
        $this->zendParser = $zendParser;

        return $this;
    }

    /**
     * @param mixed $beforeDomCallback Callable invoked with this instance by onBeforeDom().
     * @return HtmlUniParser
     */
    public function setBeforeDomCallback($beforeDomCallback)
    {
        $this->beforeDomCallback = $beforeDomCallback;

        return $this;
    }

    /**
     * @return string XPath selecting the item title.
     */
    public function getXpathTitle()
    {
        return $this->xpathTitle;
    }

    /**
     * @param string $xpathTitle XPath selecting the item title.
     * @return HtmlUniParser
     */
    public function setXpathTitle($xpathTitle)
    {
        $this->xpathTitle = $xpathTitle;

        return $this;
    }
}
672