|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace TreeHouse\IoBundle\Scrape; |
|
4
|
|
|
|
|
5
|
|
|
use Symfony\Component\EventDispatcher\EventDispatcherInterface; |
|
6
|
|
|
use TreeHouse\IoBundle\Entity\Scraper as ScraperEntity; |
|
7
|
|
|
use TreeHouse\IoBundle\Scrape\Crawler\CrawlerInterface; |
|
8
|
|
|
use TreeHouse\IoBundle\Scrape\Handler\HandlerInterface; |
|
9
|
|
|
use TreeHouse\IoBundle\Scrape\Parser\ParserBuilder; |
|
10
|
|
|
use TreeHouse\IoBundle\Scrape\Parser\ParserInterface; |
|
11
|
|
|
use TreeHouse\IoBundle\Scrape\Parser\Type\ParserTypeInterface; |
|
12
|
|
|
|
|
13
|
|
|
/**
 * Registry and factory for scrapers.
 *
 * Crawlers, parser types and handlers are registered under an alias. A scraper
 * is then assembled from a scraper entity, which names the aliases to use.
 * Parsers are cached per scraper id so they are only built once per factory.
 */
class ScraperFactory
{
    /**
     * Dispatcher handed to the parser and scraper builders; null when none was injected.
     *
     * @var EventDispatcherInterface|null
     */
    protected $eventDispatcher;

    /**
     * Registered crawlers, keyed by alias.
     *
     * @var CrawlerInterface[]
     */
    protected $crawlers = [];

    /**
     * Registered parser types, keyed by alias.
     *
     * @var ParserTypeInterface[]
     */
    protected $parserTypes = [];

    /**
     * Registered handlers, keyed by alias.
     *
     * @var HandlerInterface[]
     */
    protected $handlers = [];

    /**
     * Parsers that were already built, keyed by scraper id.
     *
     * @var ParserInterface[]
     */
    protected $parsers = [];

    /**
     * @param EventDispatcherInterface|null $dispatcher Optional dispatcher passed on to the builders
     */
    public function __construct(EventDispatcherInterface $dispatcher = null)
    {
        $this->eventDispatcher = $dispatcher;
    }

    /**
     * @return EventDispatcherInterface|null The injected dispatcher, or null when none was given
     */
    public function getEventDispatcher()
    {
        return $this->eventDispatcher;
    }

    /**
     * Registers a crawler under the given alias.
     *
     * A crawler previously registered under the same alias is replaced.
     *
     * @param CrawlerInterface $crawler
     * @param string           $alias
     */
    public function registerCrawler(CrawlerInterface $crawler, $alias)
    {
        $this->crawlers[$alias] = $crawler;
    }

    /**
     * @param string $alias
     *
     * @throws \OutOfBoundsException When no crawler is registered under the alias
     *
     * @return CrawlerInterface
     */
    public function getCrawler($alias)
    {
        return $this->getRegistered(
            $this->crawlers,
            $alias,
            'Crawler',
            CrawlerInterface::class,
            'tree_house.io.scrape.crawler'
        );
    }

    /**
     * @return CrawlerInterface[] All registered crawlers, keyed by alias
     */
    public function getCrawlers()
    {
        return $this->crawlers;
    }

    /**
     * Registers a parser type under the given alias.
     *
     * A parser type previously registered under the same alias is replaced.
     *
     * @param ParserTypeInterface $parser
     * @param string              $alias
     */
    public function registerParserType(ParserTypeInterface $parser, $alias)
    {
        $this->parserTypes[$alias] = $parser;
    }

    /**
     * @param string $alias
     *
     * @throws \OutOfBoundsException When no parser type is registered under the alias
     *
     * @return ParserTypeInterface
     */
    public function getParserType($alias)
    {
        return $this->getRegistered(
            $this->parserTypes,
            $alias,
            'Parser type',
            ParserTypeInterface::class,
            'tree_house.io.scrape.parser_type'
        );
    }

    /**
     * @return ParserTypeInterface[] All registered parser types, keyed by alias
     */
    public function getParserTypes()
    {
        return $this->parserTypes;
    }

    /**
     * Registers a handler under the given alias.
     *
     * A handler previously registered under the same alias is replaced.
     *
     * @param HandlerInterface $handler
     * @param string           $alias
     */
    public function registerHandler(HandlerInterface $handler, $alias)
    {
        $this->handlers[$alias] = $handler;
    }

    /**
     * @param string $alias
     *
     * @throws \OutOfBoundsException When no handler is registered under the alias
     *
     * @return HandlerInterface
     */
    public function getHandler($alias)
    {
        return $this->getRegistered(
            $this->handlers,
            $alias,
            'Handler',
            HandlerInterface::class,
            'tree_house.io.scrape.handler'
        );
    }

    /**
     * @return HandlerInterface[] All registered handlers, keyed by alias
     */
    public function getHandlers()
    {
        return $this->handlers;
    }

    /**
     * Builds a scraper from the parser, crawler and handler the entity refers to.
     *
     * @param ScraperEntity $scraper
     *
     * @throws \OutOfBoundsException When the parser type, crawler or handler alias is not registered
     *
     * @return ScraperInterface
     */
    public function createScraper(ScraperEntity $scraper)
    {
        $parser = $this->getParser($scraper);
        $crawler = $this->getCrawler($scraper->getCrawler());
        $handler = $this->getHandler($scraper->getHandler());

        $builder = new ScraperBuilder($this->eventDispatcher);

        return $builder->build($crawler, $parser, $handler);
    }

    /**
     * Returns the parser for the given scraper, building it on first use.
     *
     * Parsers are cached by scraper id. A scraper whose id is null gets a
     * freshly built parser on every call: null would be cast to the array
     * key "", which would make all such scrapers share one cached parser.
     *
     * @param ScraperEntity $scraper
     *
     * @return ParserInterface
     */
    protected function getParser(ScraperEntity $scraper)
    {
        $id = $scraper->getId();

        // NOTE(review): presumably the id is null until the entity is persisted - confirm.
        if (null === $id) {
            return $this->createParser($scraper);
        }

        if (!isset($this->parsers[$id])) {
            $this->parsers[$id] = $this->createParser($scraper);
        }

        return $this->parsers[$id];
    }

    /**
     * Builds a new parser of the type configured on the scraper.
     *
     * The scraper itself is passed to the builder as the "scraper" option.
     * Because the scraper's own parser options are merged last, a "scraper"
     * key in those options overrides it.
     *
     * @param ScraperEntity $scraper
     *
     * @throws \OutOfBoundsException When the scraper's parser type is not registered
     *
     * @return ParserInterface
     */
    protected function createParser(ScraperEntity $scraper)
    {
        // assumes getParserOptions() returns an array - TODO confirm against the entity
        $options = array_merge(
            ['scraper' => $scraper],
            $scraper->getParserOptions()
        );

        $parserType = $this->getParserType($scraper->getParser());
        $builder = new ParserBuilder($this->eventDispatcher);

        return $builder->build($parserType, $options);
    }

    /**
     * Looks up a registered service by alias, shared by the crawler, parser type and handler getters.
     *
     * @param array  $registry  Registered services, keyed by alias
     * @param string $alias     Alias to look up
     * @param string $label     Human-readable service kind, used as the first word of the error message
     * @param string $interface Name of the interface the service must implement, for the error message
     * @param string $tag       Name of the service tag to use, for the error message
     *
     * @throws \OutOfBoundsException When nothing is registered under the alias
     *
     * @return mixed The service registered under the alias
     */
    private function getRegistered(array $registry, $alias, $label, $interface, $tag)
    {
        if (!array_key_exists($alias, $registry)) {
            throw new \OutOfBoundsException(
                sprintf(
                    '%s "%s" is not registered. You can add it by creating a service which implements %s, ' .
                    'and tag it with %s',
                    $label,
                    $alias,
                    $interface,
                    $tag
                )
            );
        }

        return $registry[$alias];
    }
}
|
217
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.