Total Complexity | 53 |
Total Lines | 325 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like ExternRefTransformer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use ExternRefTransformer, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
29 | class ExternRefTransformer implements ExternRefTransformerInterface |
||
30 | { |
||
31 | use SummaryExternTrait, RobotsRulesTrait, PublisherLogicTrait; |
||
32 | |||
33 | public const HTTP_REQUEST_LOOP_DELAY = 10; |
||
34 | public const SKIP_DOMAIN_FILENAME = __DIR__ . '/../resources/config_skip_domain.txt'; |
||
35 | public const REPLACE_404 = true; |
||
36 | public const CONFIG_PRESSE = __DIR__ . '/../resources/config_presse.yaml'; |
||
37 | public const CONFIG_NEWSPAPER_JSON = __DIR__ . '/../resources/data_newspapers.json'; |
||
38 | public const CONFIG_SCIENTIFIC_JSON = __DIR__ . '/../resources/data_scientific_domain.json'; |
||
39 | public const CONFIG_SCIENTIFIC_WIKI_JSON = __DIR__ . '/../resources/data_scientific_wiki.json'; |
||
40 | |||
41 | public $skipSiteBlacklisted = true; |
||
42 | public $skipRobotNoIndex = true; |
||
43 | /** |
||
44 | * @var array |
||
45 | */ |
||
46 | public $summaryLog = []; |
||
47 | /** |
||
48 | * @var LoggerInterface |
||
49 | */ |
||
50 | protected $log; |
||
51 | protected $config; |
||
52 | /** |
||
53 | * @var string |
||
54 | */ |
||
55 | protected $registrableDomain; |
||
56 | /** |
||
57 | * @var string |
||
58 | */ |
||
59 | protected $url; |
||
60 | /** |
||
61 | * @var ExternMapper |
||
62 | */ |
||
63 | protected $mapper; |
||
64 | /** |
||
65 | * @var array |
||
66 | */ |
||
67 | protected $publisherData = []; |
||
68 | /** |
||
69 | * @var array |
||
70 | */ |
||
71 | protected $skip_domain; |
||
72 | /** |
||
73 | * @var ExternPage |
||
74 | */ |
||
75 | protected $externalPage; |
||
76 | /** |
||
77 | * @var Summary|null |
||
78 | */ |
||
79 | protected $summary; |
||
80 | /** |
||
81 | * @var ExternHttpClientInterface |
||
82 | */ |
||
83 | protected $httpClient; |
||
84 | private $externHttpErrorLogic; |
||
85 | /** |
||
86 | * @var CheckURL |
||
87 | */ |
||
88 | private $urlChecker; |
||
89 | |||
/**
 * Wires the collaborators used by the transformer.
 *
 * When no logger is supplied a NullLogger is substituted, and that resolved
 * logger (never the raw nullable argument) is handed to every helper built here.
 *
 * @param ExternMapper              $externMapper mapper stored in $this->mapper
 * @param ExternHttpClientInterface $httpClient   HTTP client stored in $this->httpClient
 * @param LoggerInterface|null      $logger       optional logger
 */
public function __construct(ExternMapper $externMapper, ExternHttpClientInterface $httpClient, ?LoggerInterface $logger)
{
    $this->log = $logger ?? new NullLogger();
    // Defined outside this view; called right after the logger is set, as in the original order.
    $this->importConfigAndData();
    $this->mapper = $externMapper;
    $this->httpClient = $httpClient;
    $this->externHttpErrorLogic = new ExternHttpErrorLogic($this->log);
    // Use the resolved logger so CheckURL never receives null.
    $this->urlChecker = new CheckURL($this->log);
}
||
99 | |||
/**
 * Converts an external URL into serialized wiki-template text.
 *
 * Whenever a step decides to skip or cannot complete, the URL is returned
 * instead (in its normalized form once normalization has run), or the result
 * of the HTTP-error handler when page extraction throws.
 *
 * TODO refactor: chain of responsibility or composite pattern.
 *
 * @param string  $url     external URL to convert
 * @param Summary $summary summary object forwarded to addSummaryLog()
 *
 * @return string serialized template text, or a URL as fallback
 * @throws Exception
 */
public function process(string $url, Summary $summary): string
{
    $this->url = $url;
    if (!$this->urlChecker->isURLAuthorized($url)) {
        return $url;
    }

    $this->registrableDomain = $this->urlChecker->getRegistrableDomain($url); // hack

    // The blacklist is tested first; the domain configuration is only consulted when it passes.
    if ($this->isSiteBlackListed() || !$this->validateConfigWebDomain($this->registrableDomain)) {
        return $url;
    }

    try {
        $url = WikiTextUtil::normalizeUrlForTemplate($url);
        $rawPageData = $this->extractPageDataFromUrl($url); // ['JSON-LD'] & ['meta'] !!
    } catch (Exception $httpException) {
        return $this->externHttpErrorLogic->manageHttpErrors($httpException->getMessage(), $url);
    }

    if ($this->emptyPageData($rawPageData, $url)) {
        return $url;
    }

    // isRobotNoIndex() is evaluated even when the skip flag is off.
    $robotNoIndex = $this->isRobotNoIndex($rawPageData, $url);
    if ($robotNoIndex && $this->skipRobotNoIndex) {
        // TODO ? return {lien web| titre=Titre inconnu...
        // http://www.nydailynews.com/entertainment/jessica-barth-details-alleged-harvey-weinstein-encounter-article-1.3557986
        return $url;
    }

    $citationData = $this->mapper->process($rawPageData); // only json-ld or only meta, after postprocess
    if ($this->emptyMapData($citationData, $url)) {
        // TODO ? return {lien web| titre=Titre inconnu... site=prettydomain ...
        return $url;
    }
    $citationData = $this->unsetAccesLibre($citationData);

    $this->addSummaryLog($citationData, $summary);
    $this->tagAndLog($citationData);

    $wikiTemplate = $this->chooseTemplateByData($this->registrableDomain, $citationData);

    $citationData = $this->replaceSomeData($citationData, $wikiTemplate); // template specific + data + url
    $serialized = $this->optimizeAndSerialize($wikiTemplate, $citationData);

    // Normalizer::normalize() can return a non-string, hence the is_string() check.
    $normalized = Normalizer::normalize($serialized);
    if (is_string($normalized) && !empty($normalized)) {
        return $normalized;
    }

    // Fall back to the un-normalized text, then to the URL as a last resort.
    return !empty($serialized) ? $serialized : $url;
}
||
159 | |||
/**
 * Tells whether the current registrable domain must be skipped.
 *
 * Returns true only when $skipSiteBlacklisted is enabled and
 * $registrableDomain is found in $skip_domain; a notice is logged in that case.
 *
 * @return bool true when the site is skipped
 */
protected function isSiteBlackListed(): bool
{
    if (!$this->skipSiteBlacklisted) {
        return false;
    }

    // $skip_domain is declared without a default: treat a non-array value as an empty list.
    $skippedDomains = is_array($this->skip_domain) ? $this->skip_domain : [];

    // Strict comparison avoids loose-equality matches between unrelated values.
    if (!in_array($this->registrableDomain, $skippedDomains, true)) {
        return false;
    }

    $this->log->notice("Skip web site " . $this->registrableDomain);

    return true;
}
||
168 | |||
/**
 * Normalizes the configuration entry of a domain and tells whether the domain is enabled.
 *
 * A domain is disabled when its entry is the plain string 'deactivated' or an
 * array holding a 'deactivated' key. The string form is detected before the
 * entry is coerced to an array, and is rewritten as ['deactivated' => true]
 * so later calls for the same domain still see it as disabled.
 *
 * todo move transformer
 *
 * @param string $domain registrable domain used as key in $this->config
 *
 * @return bool false when the domain is disabled, true otherwise
 */
protected function validateConfigWebDomain(string $domain): bool
{
    $this->logDebugConfigWebDomain($domain);

    // todo move to config
    $domainConfig = $this->config[$domain] ?? [];
    if ($domainConfig === 'deactivated') {
        // Keep the flag in array form: the entry must stay an array for later offset access.
        $domainConfig = ['deactivated' => true];
    }
    $this->config[$domain] = is_array($domainConfig) ? $domainConfig : [];

    if (isset($this->config[$domain]['deactivated'])) {
        $this->log->info("Domain " . $domain . " disabled\n");

        return false;
    }

    return true;
}
||
188 | |||
/**
 * Writes a debug line telling whether $this->config has an entry for the domain.
 *
 * @param string $domain key looked up in $this->config
 */
protected function logDebugConfigWebDomain(string $domain): void
{
    $suffix = isset($this->config[$domain]) ? " configuré" : " non configuré";

    $this->log->debug("Domain " . $domain . $suffix);
}
||
200 | |||
/**
 * Builds the external page for a URL and returns its data.
 *
 * Pauses for HTTP_REQUEST_LOOP_DELAY seconds before the page is built, keeps
 * the page object in $this->externalPage and logs the data at debug level.
 *
 * @param string $url URL handed to ExternPageFactory::fromURL()
 *
 * @return array data returned by the page's getData()
 * @throws Exception
 */
protected function extractPageDataFromUrl(string $url): array
{
    // Fixed delay before each page build.
    sleep(self::HTTP_REQUEST_LOOP_DELAY);

    $page = ExternPageFactory::fromURL($url, $this->httpClient, $this->log);
    $this->externalPage = $page;

    $metadata = $page->getData();
    $this->log->debug('metaData', $metadata);

    return $metadata;
}
||
214 | |||
215 | // stay |
||
216 | |||
/**
 * Tells whether the page data holds neither 'JSON-LD' nor 'meta' content.
 *
 * A notice containing the URL is logged when nothing usable is found.
 *
 * @param array  $pageData data checked for non-empty 'JSON-LD' / 'meta' entries
 * @param string $url      URL used in the log message only
 *
 * @return bool true when both entries are missing or empty
 */
protected function emptyPageData(array $pageData, string $url): bool
{
    $hasJsonLd = !empty($pageData['JSON-LD']);
    $hasMeta = !empty($pageData['meta']);

    // An empty array has neither entry, so it is covered by the same test.
    if ($hasJsonLd || $hasMeta) {
        return false;
    }

    $this->log->notice('No metadata : ' . $url);

    return true;
}
||
229 | |||
/**
 * Checks that the mapped data is usable: both 'url' and 'titre' must be non-empty.
 *
 * The domain is not skipped here, because the problem may be a 404 or an
 * error affecting this URL only.
 *
 * @param array  $mapData mapped data to validate
 * @param string $url     URL used in the log message only
 *
 * @return bool true when the mapping is incomplete
 */
protected function emptyMapData(array $mapData, string $url): bool
{
    $isComplete = !empty($mapData['url']) && !empty($mapData['titre']);
    if ($isComplete) {
        return false;
    }

    $this->log->info('Mapping incomplet : ' . $url);

    return true;
}
||
243 | |||
244 | // stay |
||
245 | |||
/**
 * Removes the 'accès url' entry when its value is 'libre' (February 2021 discussion).
 *
 * Any other value, or a missing key, leaves the data untouched.
 *
 * @param array $mapData mapped data
 *
 * @return array the data without an 'accès url' => 'libre' entry
 */
protected function unsetAccesLibre(array $mapData): array
{
    $access = $mapData['accès url'] ?? null;
    if ($access === 'libre') {
        unset($mapData['accès url']);
    }

    return $mapData;
}
||
256 | |||
/**
 * Picks the wiki template ('article' or 'lien web') for the mapped data and builds it.
 *
 * 'article' is selected when a DOI is present, when the domain configuration
 * asks for it (directly, or 'auto' with DATA-ARTICLE set), when DATA-ARTICLE is
 * set for a domain listed under publisherData['newspaper'], or when
 * isScientificDomain() says so. A domain configured as 'lien web', or data
 * without a 'date' entry, always ends up as 'lien web'.
 * The per-template counter in $this->summary->memo is incremented.
 *
 * todo refactor for readability
 *
 * @param string $domain  registrable domain used as key in $this->config
 * @param array  $mapData mapped data
 *
 * @return AbstractWikiTemplate
 * @throws Exception
 */
protected function chooseTemplateByData(string $domain, array $mapData): AbstractWikiTemplate
{
    // Template selection logic
    $configuredTemplate = $this->config[$domain]['template'] ?? [];
    $this->config[$domain]['template'] = $configuredTemplate;
    $isArticleData = $mapData['DATA-ARTICLE'] ?? false;

    // Start from the DOI rule; 'lien web' is the default.
    $templateName = !empty($mapData['doi']) ? 'article' : 'lien web';

    if ($configuredTemplate === 'article'
        || ($configuredTemplate === 'auto' && $isArticleData)
        || ($isArticleData && !empty($this->publisherData['newspaper'][$domain]))
        || $this->isScientificDomain()
    ) {
        $templateName = 'article';
    }

    // Explicit 'lien web' configuration wins; a date is mandatory for {article}.
    if ($configuredTemplate === 'lien web' || !isset($mapData['date'])) {
        $templateName = 'lien web';
    }

    $wikiTemplate = WikiTemplateFactory::create($templateName);
    $wikiTemplate->userSeparator = " |";

    $memoKey = 'count ' . $templateName;
    $this->summary->memo[$memoKey] = 1 + ($this->summary->memo[$memoKey] ?? 0);

    return $wikiTemplate;
}
||
298 | |||
299 | protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array |
||
314 | } |
||
315 | |||
316 | // postprocess data |
||
317 | |||
318 | protected function fallbackIfSitenameNull(array $mapData, AbstractWikiTemplate $template): array |
||
328 | } |
||
329 | |||
/**
 * Puts the URL stored in $this->url back into the 'url' entry of the mapped data.
 *
 * @param array $mapData mapped data
 *
 * @return array the data with 'url' set to $this->url
 */
protected function replaceURLbyOriginal(array $mapData): array
{
    // array_replace() keeps every existing key and overwrites or appends 'url'.
    return array_replace($mapData, ['url' => $this->url]);
}
||
336 | |||
337 | /** |
||
338 | * @param AbstractWikiTemplate $template |
||
339 | * @param array $mapData |
||
340 | * |
||
341 | * @return string |
||
342 | * @throws Exception |
||
343 | */ |
||
344 | protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string |
||
356 |