1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace TreeHouse\IoBundle\Scrape\Parser; |
4
|
|
|
|
5
|
|
|
use Symfony\Component\DomCrawler\Crawler as DomCrawler; |
6
|
|
|
use Symfony\Component\EventDispatcher\EventDispatcher; |
7
|
|
|
use Symfony\Component\EventDispatcher\EventDispatcherInterface; |
8
|
|
|
use TreeHouse\Feeder\Event\FailedItemModificationEvent; |
9
|
|
|
use TreeHouse\Feeder\Exception\FilterException; |
10
|
|
|
use TreeHouse\Feeder\Exception\ModificationException; |
11
|
|
|
use TreeHouse\Feeder\Exception\ValidationException; |
12
|
|
|
use TreeHouse\Feeder\FeedEvents; |
13
|
|
|
use TreeHouse\Feeder\Modifier\Item\Filter\FilterInterface; |
14
|
|
|
use TreeHouse\Feeder\Modifier\Item\Mapper\MapperInterface; |
15
|
|
|
use TreeHouse\Feeder\Modifier\Item\ModifierInterface; |
16
|
|
|
use TreeHouse\Feeder\Modifier\Item\Transformer\TransformerInterface; |
17
|
|
|
use TreeHouse\Feeder\Modifier\Item\Validator\ValidatorInterface; |
18
|
|
|
use TreeHouse\IoBundle\Scrape\Modifier\Item\Mapper\CrawlerAwareInterface; |
19
|
|
|
use TreeHouse\IoBundle\Scrape\ScrapedItemBag; |
20
|
|
|
|
21
|
|
|
/**
 * Parser that runs a scraped item through a position-ordered list of modifiers.
 *
 * Modifiers are stored by numeric position. A second map, keyed by the same
 * positions, records whether parsing may continue after the modifier at that
 * position fails with a ModificationException.
 */
class DefaultParser implements ParserInterface
{
    /**
     * Dispatcher used to notify listeners about failed modifications.
     *
     * @var EventDispatcherInterface
     */
    protected $eventDispatcher;

    /**
     * Registered modifiers, keyed by position and kept sorted by key.
     *
     * @var ModifierInterface[]
     */
    protected $modifiers = [];

    /**
     * "Continue on exception" flags, keyed by modifier position.
     *
     * @var array
     */
    protected $continues = [];

    /**
     * @param EventDispatcherInterface $eventDispatcher Dispatcher to use; a new
     *                                                  EventDispatcher is created
     *                                                  when none is given
     */
    public function __construct(EventDispatcherInterface $eventDispatcher = null)
    {
        $this->eventDispatcher = $eventDispatcher ?: new EventDispatcher();
    }

    /**
     * Returns the event dispatcher used by this parser.
     *
     * @inheritdoc
     */
    public function getEventDispatcher()
    {
        return $this->eventDispatcher;
    }

    /**
     * Returns the registered modifiers, keyed by position.
     *
     * @inheritdoc
     */
    public function getModifiers()
    {
        return $this->modifiers;
    }

    /**
     * Registers a modifier at the given position.
     *
     * When no position is given, the modifier is placed directly after the
     * highest position currently in use (or at 0 when there are no modifiers).
     *
     * @inheritdoc
     *
     * @throws \InvalidArgumentException When the position is not numeric, or
     *                                   when the position is already taken
     */
    public function addModifier(ModifierInterface $modifier, $position = null, $continueOnException = false)
    {
        if (null === $position) {
            $position = count($this->modifiers) ? (max(array_keys($this->modifiers)) + 1) : 0;
        }

        if (!is_numeric($position)) {
            throw new \InvalidArgumentException('Position must be a number');
        }

        // PHP truncates float keys when writing to an array. Normalize up front
        // so the duplicate check below and the writes use the exact same key,
        // and so all positions are integers that ksort() orders consistently.
        $position = (int) $position;

        if (array_key_exists($position, $this->modifiers)) {
            throw new \InvalidArgumentException(sprintf('There already is a modifier at position %d', $position));
        }

        $this->modifiers[$position] = $modifier;
        $this->continues[$position] = $continueOnException;

        ksort($this->modifiers);
    }

    /**
     * Removes the first registered occurrence of the given modifier instance.
     *
     * Does nothing when the modifier is not registered.
     *
     * @param ModifierInterface $modifier
     */
    public function removeModifier(ModifierInterface $modifier)
    {
        foreach ($this->modifiers as $position => $_modifier) {
            if ($_modifier === $modifier) {
                // drop the continue flag too, so both maps stay in sync
                unset($this->modifiers[$position], $this->continues[$position]);

                break;
            }
        }
    }

    /**
     * Removes the modifier registered at the given position.
     *
     * @inheritdoc
     *
     * @throws \OutOfBoundsException When there is no modifier at that position
     */
    public function removeModifierAt($position)
    {
        if (!array_key_exists($position, $this->modifiers)) {
            throw new \OutOfBoundsException(sprintf('There is no modifier at position %d', $position));
        }

        // drop the continue flag too, so both maps stay in sync
        unset($this->modifiers[$position], $this->continues[$position]);
    }

    /**
     * Tells whether a modifier is registered at the given position.
     *
     * @inheritdoc
     */
    public function hasModifierAt($position)
    {
        return array_key_exists($position, $this->modifiers);
    }

    /**
     * Applies all modifiers, in position order, to the given item.
     *
     * A crawler is built from the item's original data and url, and handed to
     * every modifier implementing CrawlerAwareInterface. Each modifier is then
     * applied according to the interfaces it implements, checked in this order:
     * filter, mapper, transformer, validator.
     *
     * Filter and validation exceptions are always rethrown. For any other
     * modification exception an ITEM_MODIFICATION_FAILED event is dispatched;
     * the exception is rethrown unless the event indicates that parsing may
     * continue.
     *
     * @inheritdoc
     *
     * @throws FilterException
     * @throws ValidationException
     * @throws ModificationException
     */
    public function parse(ScrapedItemBag $item)
    {
        $crawler = $this->getDomCrawler($item->getOriginalData(), $item->getOriginalUrl());

        foreach ($this->modifiers as $position => $modifier) {
            // set crawler if needed
            if ($modifier instanceof CrawlerAwareInterface) {
                $modifier->setCrawler($crawler);
            }

            try {
                if ($modifier instanceof FilterInterface) {
                    $modifier->filter($item);
                }

                if ($modifier instanceof MapperInterface) {
                    $item = $modifier->map($item);
                }

                if ($modifier instanceof TransformerInterface) {
                    $modifier->transform($item);
                }

                if ($modifier instanceof ValidatorInterface) {
                    $modifier->validate($item);
                }
            } catch (FilterException $e) {
                // filter exceptions don't get to continue
                throw $e;
            } catch (ValidationException $e) {
                // validation exceptions don't get to continue
                throw $e;
            } catch (ModificationException $e) {
                // Modifiers put into $modifiers without going through
                // addModifier() (the property is protected) have no flag:
                // treat those as "do not continue" instead of raising a notice.
                $continue = isset($this->continues[$position]) ? $this->continues[$position] : false;

                // notify listeners of this failure, give them the option to stop propagation
                $event = new FailedItemModificationEvent($item, $modifier, $e);
                $event->setContinue($continue);

                $this->eventDispatcher->dispatch(FeedEvents::ITEM_MODIFICATION_FAILED, $event);

                if (!$event->getContinue()) {
                    throw $e;
                }
            }
        }
    }

    /**
     * Creates the crawler that is handed to crawler-aware modifiers.
     *
     * @param string $html
     * @param string $url
     *
     * @return DomCrawler
     */
    protected function getDomCrawler($html, $url)
    {
        return new DomCrawler($html, $url);
    }
}
179
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.