Completed
Push — master ( a69ea1...69dd71 )
by Kosuha
02:53 queued 10s
created

HtmlUniParser::getHtml()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 11
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 3
eloc 7
c 2
b 0
f 0
nc 3
nop 1
dl 0
loc 11
rs 10
1
<?php
2
3
namespace kosuha606\HtmlUniParser;
4
5
use kosuha606\HtmlUniParser\exceptions\ParserInvalidConfigException;
6
7
/**
8
 * Class HtmlUniParser
9
 * @package kosuha606\HtmlUniParser
10
 */
11
class HtmlUniParser extends BaseObject
12
{
13
    /**
14
     * Parsing according to the catalog scenario
15
     * @var
16
     */
17
    protected $catalogUrl;
18
19
    /**
20
     * Parsing according to the site-search scenario
21
     * @var
22
     */
23
    protected $searchUrl;
24
25
    /**
26
     * Parsing according to the scenario of fetching data from a single page
27
     * @var
28
     */
29
    protected $pageUrl;
30
31
    /**
32
     * Force the parser to fetch outer HTML
33
     * @var boolean
34
     */
35
    protected $forceOuterHtml = false;
36
37
    /**
38
     * Parsing of URLs produced by a generator
39
     * @var
40
     */
41
    protected $urlGenerator;
42
43
    /** @var  */
44
    protected $beforeDomCallback;
45
46
    /**
47
     * Site encoding
48
     * @var string
49
     */
50
    protected $encoding = 'UTF-8';
51
52
    /**
53
     * @var string
54
     */
55
    protected $siteBaseUrl = '/';
56
57
    /**
58
     * @var int|false maximum number of items to parse, or false for no limit
59
     */
60
    protected $resultLimit = false;
61
62
    /**
63
     * @var int
64
     */
65
    protected $sleepAfterRequest = 0;
66
67
    /**
68
     * @var bool
69
     */
70
    protected $goIntoCard = false;
71
72
    /**
73
     * @var string
74
     */
75
    protected $xpathItem;
76
77
    /**
78
     * @var string
79
     */
80
    protected $xpathLink;
81
82
    /**
83
     * @var array
84
     */
85
    protected $xpathOnCard = [];
86
87
    /** @var string */
88
    protected $typeMech;
89
90
    /**
91
     * @var array
92
     */
93
    protected $callbacks = [];
94
95
    /**
96
     * Parameters whose result must be multiple (a list of values)
97
     * @var array
98
     */
99
    private $xpathOnCardMany = [];
100
101
    /**
102
     * Parameters whose result is returned in HTML format
103
     * @var array
104
     */
105
    private $xpathOnCardHtml = [];
106
107
    /** @var ZendBasedParser */
108
    private $zendParser;
109
110
    /**
     * HtmlUniParser constructor.
     *
     * Checks that the required PHP extensions are loaded, passes $config to
     * the parent constructor, stores the parser, and then strips the "MANY"
     * and "HTML" markers from the card XPath expressions while remembering
     * which parameters carried them.
     *
     * @param mixed $config configuration passed to the parent constructor
     * @param ZendBasedParser $parser parser used to load pages and run XPath queries
     * @throws ParserInvalidConfigException when the dom or iconv extension is not loaded
     */
    public function __construct($config, ZendBasedParser $parser)
    {
        foreach (['dom', 'iconv'] as $extension) {
            if (!\extension_loaded($extension)) {
                throw new ParserInvalidConfigException(
                    'The ' . $extension . ' extension is not loaded in system'
                );
            }
        }
        parent::__construct($config);
        $this->zendParser = $parser;
        // Iterate by value and write back by key, so that no reference to the
        // last element of $this->xpathOnCard outlives the loop.
        foreach ($this->xpathOnCard as $param => $xpath) {
            if (\strpos($xpath, 'MANY') !== false) {
                $this->xpathOnCardMany[] = $param;
                $xpath = \str_replace('MANY', '', $xpath);
            }
            if (\strpos($xpath, 'HTML') !== false) {
                $this->xpathOnCardHtml[] = $param;
                $xpath = \str_replace('HTML', '', $xpath);
            }
            $this->xpathOnCard[$param] = $xpath;
        }
    }
137
138
    /**
     * Convenience factory: builds an instance wired to a fresh
     * ZendBasedParser. Instances can also be created directly through the
     * constructor.
     *
     * @param mixed $config configuration forwarded to the constructor
     * @return HtmlUniParser
     */
    public static function create($config)
    {
        $parser = new ZendBasedParser();

        return new static($config, $parser);
    }
149
150
    /**
     * Appends one callback to the list of registered callbacks.
     *
     * @param mixed $callbacs callback to register
     */
    public function addCallback($callbacs)
    {
        $this->callbacks[] = $callbacs;
    }
157
158
    /**
     * Serializes a node to markup: the node itself when outer HTML is forced,
     * otherwise the concatenated markup of its child nodes.
     *
     * @param mixed $node node exposing childNodes and ownerDocument
     *                    (presumably a \DOMNode; confirm against callers)
     * @return string
     */
    public function getHtml($node)
    {
        if (!$this->forceOuterHtml) {
            $markup = '';
            foreach ($node->childNodes as $childNode) {
                $markup .= $childNode->ownerDocument->saveXML($childNode);
            }

            return $markup;
        }

        return $this->getOuterHtml($node);
    }
174
175
    /**
     * Serializes the node, including its own tag, through its owner document.
     *
     * @param mixed $node node exposing ownerDocument
     * @return mixed result of saveXML() for the node
     */
    public function getOuterHtml($node)
    {
        $document = $node->ownerDocument;

        return $document->saveXML($node);
    }
183
184
    /**
     * Reads a property from an object without failing on bad input.
     *
     * @param mixed $object candidate object
     * @param string $method name of the property to read
     * @return mixed|null the property value, or null when $object is not an
     *                    object or has no such property
     */
    public function valueStub($object, $method)
    {
        $readable = \is_object($object) && \property_exists($object, $method);

        return $readable ? $object->{$method} : null;
    }
199
200
    /**
     * Returns the first element of an iterable result set.
     *
     * Iteration stops at the first element; walking the whole set would
     * yield the last element instead.
     *
     * @param iterable $results result set to read from
     * @return mixed the first element, or an empty array when the set is empty
     */
    public function getFirstMatch($results)
    {
        foreach ($results as $match) {
            return $match;
        }

        return [];
    }
212
213
    /**
     * Extracts the value of the first matched node.
     *
     * @param mixed $nodes iterable set of matched nodes
     * @return string
     */
    public function getFirstValue($nodes)
    {
        return $this->getValue($this->getFirstMatch($nodes));
    }
222
223
    /**
     * Returns the HTML of the first matched node with <img> tags removed.
     *
     * @param mixed $nodes iterable set of matched nodes
     * @return string|string[]|null processed HTML; an empty value when nothing matched
     */
    public function getFirstValueHtml($nodes)
    {
        $val = $this->getFirstMatch($nodes);
        // getFirstMatch() yields an empty array when nothing matched; there is
        // no owner document to serialize in that case.
        if (!$val instanceof \DOMNode) {
            return $this->proccessValue('');
        }
        $html = $val->ownerDocument->saveHTML($val);
        // Remove images from the parsed text.
        $html = preg_replace("/<img[^>]+\>/i", "", $html);

        return $this->proccessValue($html);
    }
236
237
    /**
     * Extracts the trimmed text value of an attribute or element node.
     *
     * Attributes are read through their `value` property and elements through
     * `nodeValue`; any other input produces an empty string.
     *
     * @param mixed $val node to read
     * @return string
     */
    public function getValue($val)
    {
        $result = null;
        if ($val instanceof \DOMAttr) {
            $result = $this->valueStub($val, 'value');
        } elseif ($val instanceof \DOMElement) {
            $result = $this->valueStub($val, 'nodeValue');
        }

        // Cast before trimming: passing null to trim() is deprecated since PHP 8.1.
        return $this->proccessValue(trim((string) $result));
    }
252
253
    /**
     * Extracts the value of every node in the set, in iteration order.
     *
     * @param mixed $nodes iterable set of matched nodes
     * @return array list of values as produced by getValue()
     */
    public function getAllValues($nodes)
    {
        $values = [];
        foreach ($nodes as $current) {
            $values[] = $this->getValue($current);
        }

        return $values;
    }
265
266
    /**
     * Parses a listing page (for example a catalog).
     *
     * Hands $catalogUrl to the parser, selects the items with $xpathItem and
     * resolves each item's link with $xpathLink; links that do not start with
     * http(s):// are prefixed with $siteBaseUrl. When $goIntoCard is enabled
     * the link is handed to the parser and every $xpathOnCard expression is
     * evaluated. Registered callbacks run for every item, and the loop stops
     * as soon as the item index reaches $resultLimit.
     *
     * @return array list of parsed items; each has a 'link' key plus one key
     *               per card parameter when cards are visited
     */
    public function parseUrl()
    {
        $this->zendParser->setSleepAfterRequest($this->sleepAfterRequest);
        $this->zendParser->setUrl($this->catalogUrl);
        $this->onBeforeDom();
        $items = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($this->xpathItem);
        $result = [];
        foreach ($items as $index => $item) {
            $newItem = [];
            $html = $this->getHtml($item);
            $this->zendParser->setRawHtml($html);
            $this->onBeforeDom();
            $link = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($this->xpathLink);
            $link = $this->getFirstValue($link);
            if (preg_match('/^http(s)?:\/\/.*$/i', $link)) {
                $newItem['link'] = $link;
            } else {
                $newItem['link'] = $this->siteBaseUrl.$link;
            }
            if ($this->goIntoCard && $newItem['link']) {
                $this->zendParser->setUrl($newItem['link']);
                foreach ($this->xpathOnCard as $param => $xpath) {
                    $this->onBeforeDom();
                    $temParam = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($xpath);
                    // Strict comparison: with loose comparison an integer key 0
                    // matches any non-numeric string flag on PHP < 8.
                    if (in_array($param, $this->xpathOnCardMany, true)) {
                        $newItem[$param] = $this->getAllValues($temParam);
                    } elseif (in_array($param, $this->xpathOnCardHtml, true)) {
                        $newItem[$param] = $this->getFirstValueHtml($temParam);
                    } else {
                        $newItem[$param] = $this->getFirstValue($temParam);
                    }
                }
            }
            $this->handleCallbacks($newItem);
            $result[] = $newItem;
            // Do not parse more than the limit, if a limit is set.
            if ($this->resultLimit && $this->resultLimit <= ($index + 1)) {
                break;
            }
        }

        return $result;
    }
311
312
    /**
     * Parses search results: appends the query to $searchUrl, stores the
     * result as the catalog URL and runs the listing parser.
     *
     * @param mixed $textQuery query text appended to the search URL as is
     * @return array result of parseUrl()
     */
    public function parseSearch($textQuery)
    {
        $this->catalogUrl = "{$this->searchUrl}{$textQuery}";

        return $this->parseUrl();
    }
320
321
    /**
     * Parses a single card page.
     *
     * Hands $pageUrl to the parser, evaluates every $xpathOnCard expression
     * and runs the registered callbacks on the collected item.
     *
     * @return array map of card parameter => extracted value; empty when no
     *               card XPath expressions are configured
     */
    public function parseCard()
    {
        // Initialise up front so an empty $xpathOnCard does not leave the
        // item undefined when it is passed on and returned.
        $newItem = [];
        $this->zendParser->setUrl($this->pageUrl);
        foreach ($this->xpathOnCard as $param => $xpath) {
            $this->onBeforeDom();
            $temParam = $this->zendParser->dom($this->getEncoding(), $this->getTypeMech())->queryXpath($xpath);
            // Strict comparison: with loose comparison an integer key 0
            // matches any non-numeric string flag on PHP < 8.
            if (in_array($param, $this->xpathOnCardMany, true)) {
                $newItem[$param] = $this->getAllValues($temParam);
            } elseif (in_array($param, $this->xpathOnCardHtml, true)) {
                $newItem[$param] = $this->getFirstValueHtml($temParam);
            } else {
                $newItem[$param] = $this->getFirstValue($temParam);
            }
        }
        $this->handleCallbacks($newItem);

        return $newItem;
    }
341
342
    /**
     * Calls the configured URL generator and parses each produced URL as a
     * single card.
     *
     * @return array list of parseCard() results, one per generated URL
     */
    public function parseGenerator()
    {
        $produceUrls = $this->urlGenerator;
        $cards = [];
        foreach ($produceUrls() as $cardUrl) {
            $this->pageUrl = $cardUrl;
            $cards[] = $this->parseCard();
        }

        return $cards;
    }
356
357
    /**
     * Runs every registered callback on the item. Each callback receives the
     * item and the current $pageUrl.
     *
     * @param mixed $lastItem item passed by reference to this method
     * @return HtmlUniParser
     */
    public function handleCallbacks(&$lastItem)
    {
        $currentUrl = null;
        foreach ($this->callbacks as &$handler) {
            $currentUrl = $this->pageUrl;
            $handler($lastItem, $currentUrl);
        }

        return $this;
    }
368
369
    /**
     * Invokes the before-DOM callback, if one is set, passing this parser.
     *
     * @return $this
     */
    public function onBeforeDom()
    {
        $hook = $this->beforeDomCallback;
        if (!$hook) {
            return $this;
        }
        $hook($this);

        return $this;
    }
380
381
    /**
     * Returns the configured site encoding.
     *
     * @return string
     */
    public function getEncoding(): string
    {
        return $this->encoding;
    }
388
389
    /**
     * Sets the site encoding.
     *
     * @param string $encoding encoding name
     * @return HtmlUniParser this instance, for chaining
     */
    public function setEncoding(string $encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }
398
399
    /**
     * Converts a value from the configured encoding to UTF-8. Values are
     * returned untouched when the configured encoding is exactly 'UTF-8'.
     *
     * @param mixed $value value to convert
     * @return false|string the converted value; iconv() yields false on failure
     */
    private function proccessValue($value)
    {
        $sourceEncoding = $this->getEncoding();
        if ($sourceEncoding !== 'UTF-8') {
            return \iconv($sourceEncoding, 'UTF-8', $value);
        }

        return $value;
    }
411
412
    /**
     * Returns the configured mechanism type, falling back to 'curl' when none
     * is set.
     *
     * @return string
     */
    public function getTypeMech(): string
    {
        if ($this->typeMech) {
            return $this->typeMech;
        }

        return 'curl';
    }
419
420
    /**
     * Returns the parser instance this object delegates to.
     *
     * @return ZendBasedParser
     */
    public function getZendParser(): ZendBasedParser
    {
        return $this->zendParser;
    }
427
428
    /**
     * Sets the mechanism type that getTypeMech() reports.
     *
     * @param string $typeMech mechanism type
     * @return HtmlUniParser this instance, for chaining
     */
    public function setTypeMech(string $typeMech)
    {
        $this->typeMech = $typeMech;

        return $this;
    }
437
438
    /**
     * Sets the URL used by parseUrl().
     *
     * @param mixed $catalogUrl catalog URL
     * @return HtmlUniParser this instance, for chaining
     */
    public function setCatalogUrl($catalogUrl)
    {
        $this->catalogUrl = $catalogUrl;

        return $this;
    }
447
448
    /**
     * Sets the URL prefix to which parseSearch() appends the query.
     *
     * @param mixed $searchUrl search URL prefix
     * @return HtmlUniParser this instance, for chaining
     */
    public function setSearchUrl($searchUrl)
    {
        $this->searchUrl = $searchUrl;

        return $this;
    }
457
458
    /**
     * Sets the URL used by parseCard().
     *
     * @param mixed $pageUrl page URL
     * @return HtmlUniParser this instance, for chaining
     */
    public function setPageUrl($pageUrl)
    {
        $this->pageUrl = $pageUrl;

        return $this;
    }
467
468
    /**
     * Chooses whether getHtml() returns the outer HTML of a node.
     *
     * @param bool $forceOuterHtml true to return outer HTML
     * @return HtmlUniParser this instance, for chaining
     */
    public function setForceOuterHtml(bool $forceOuterHtml)
    {
        $this->forceOuterHtml = $forceOuterHtml;

        return $this;
    }
477
478
    /**
     * Sets the callable that parseGenerator() invokes to obtain URLs.
     *
     * @param mixed $urlGenerator URL generator
     * @return HtmlUniParser this instance, for chaining
     */
    public function setUrlGenerator($urlGenerator)
    {
        $this->urlGenerator = $urlGenerator;

        return $this;
    }
487
488
    /**
     * Sets the prefix that parseUrl() puts in front of non-absolute links.
     *
     * @param string $siteBaseUrl base URL
     * @return HtmlUniParser this instance, for chaining
     */
    public function setSiteBaseUrl(string $siteBaseUrl)
    {
        $this->siteBaseUrl = $siteBaseUrl;

        return $this;
    }
497
498
    /**
     * Sets the limit at which parseUrl() stops iterating over items.
     *
     * @param int|false $resultLimit item limit; a falsy value disables the limit
     * @return HtmlUniParser this instance, for chaining
     */
    public function setResultLimit($resultLimit)
    {
        $this->resultLimit = $resultLimit;

        return $this;
    }
507
508
    /**
     * Sets the sleep value that parseUrl() forwards to the parser.
     *
     * @param int $sleepAfterRequest sleep value (unit is defined by the parser)
     * @return HtmlUniParser this instance, for chaining
     */
    public function setSleepAfterRequest(int $sleepAfterRequest)
    {
        $this->sleepAfterRequest = $sleepAfterRequest;

        return $this;
    }
517
518
    /**
     * Chooses whether parseUrl() visits each item's link to read card data.
     *
     * @param bool $goIntoCard true to visit the linked page
     * @return HtmlUniParser this instance, for chaining
     */
    public function setGoIntoCard(bool $goIntoCard)
    {
        $this->goIntoCard = $goIntoCard;

        return $this;
    }
527
528
    /**
     * Sets the XPath expression that selects items on a listing page.
     *
     * @param string $xpathItem XPath expression
     * @return HtmlUniParser this instance, for chaining
     */
    public function setXpathItem(string $xpathItem)
    {
        $this->xpathItem = $xpathItem;

        return $this;
    }
537
538
    /**
     * Sets the XPath expression that selects the link inside an item.
     *
     * @param string $xpathLink XPath expression
     * @return HtmlUniParser this instance, for chaining
     */
    public function setXpathLink(string $xpathLink)
    {
        $this->xpathLink = $xpathLink;

        return $this;
    }
547
548
    /**
     * Sets the map of card parameter => XPath expression. The expressions are
     * stored as given; this setter does not strip any markers.
     *
     * @param array $xpathOnCard map of parameter name to XPath expression
     * @return HtmlUniParser this instance, for chaining
     */
    public function setXpathOnCard(array $xpathOnCard)
    {
        $this->xpathOnCard = $xpathOnCard;

        return $this;
    }
557
558
    /**
     * Replaces the whole list of registered callbacks.
     *
     * @param array $callbacks callbacks to run for each parsed item
     * @return HtmlUniParser this instance, for chaining
     */
    public function setCallbacks(array $callbacks)
    {
        $this->callbacks = $callbacks;

        return $this;
    }
567
568
    /**
     * Replaces the list of card parameters whose values are collected as lists.
     *
     * @param array $xpathOnCardMany parameter names
     * @return HtmlUniParser this instance, for chaining
     */
    public function setXpathOnCardMany(array $xpathOnCardMany)
    {
        $this->xpathOnCardMany = $xpathOnCardMany;

        return $this;
    }
577
578
    /**
     * Replaces the list of card parameters whose values are returned as HTML.
     *
     * @param array $xpathOnCardHtml parameter names
     * @return HtmlUniParser this instance, for chaining
     */
    public function setXpathOnCardHtml(array $xpathOnCardHtml)
    {
        $this->xpathOnCardHtml = $xpathOnCardHtml;

        return $this;
    }
587
588
    /**
     * Replaces the parser instance this object delegates to.
     *
     * @param ZendBasedParser $zendParser parser instance
     * @return HtmlUniParser this instance, for chaining
     */
    public function setZendParser(ZendBasedParser $zendParser)
    {
        $this->zendParser = $zendParser;

        return $this;
    }
597
598
    /**
     * Sets the callback that onBeforeDom() invokes with this parser.
     *
     * @param mixed $beforeDomCallback callback, or a falsy value to disable it
     * @return HtmlUniParser this instance, for chaining
     */
    public function setBeforeDomCallback($beforeDomCallback)
    {
        $this->beforeDomCallback = $beforeDomCallback;

        return $this;
    }
607
}
608