Total Complexity | 60 |
Total Lines | 327 |
Duplicated Lines | 0 % |
Changes | 3 | ||
Bugs | 2 | Features | 1 |
Complex classes like ExternRefTransformer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use ExternRefTransformer, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
33 | class ExternRefTransformer implements ExternRefTransformerInterface |
||
34 | { |
||
35 | use SummaryExternTrait, PublisherLogicTrait; |
||
|
|||
36 | |||
37 | final public const HTTP_REQUEST_LOOP_DELAY = 10; |
||
38 | final public const SKIP_DOMAIN_FILENAME = __DIR__ . '/../resources/config_skip_domain.txt'; |
||
39 | final public const REPLACE_404 = true; |
||
40 | final public const REPLACE_410 = true; |
||
41 | final public const CONFIG_PRESSE = __DIR__ . '/../resources/config_presse.yaml'; |
||
42 | final public const CONFIG_NEWSPAPER_JSON = __DIR__ . '/../resources/data_newspapers.json'; |
||
43 | final public const CONFIG_SCIENTIFIC_JSON = __DIR__ . '/../resources/data_scientific_domain.json'; |
||
44 | final public const CONFIG_SCIENTIFIC_WIKI_JSON = __DIR__ . '/../resources/data_scientific_wiki.json'; |
||
45 | |||
46 | public bool $skipSiteBlacklisted = true; |
||
47 | public bool $skipRobotNoIndex = true; |
||
48 | public array $summaryLog = []; |
||
49 | |||
50 | protected $config; |
||
51 | protected ?string $registrableDomain = null; |
||
52 | protected string $url; |
||
53 | protected array $publisherData = []; |
||
54 | protected array $skip_domain = []; |
||
55 | protected ExternPage $externalPage; |
||
56 | protected ?Summary $summary = null; |
||
57 | protected ?string $originDomain = null; |
||
58 | protected array $options = []; |
||
59 | private readonly ExternHttpErrorLogic $externHttpErrorLogic; |
||
60 | private readonly CheckURL $urlChecker; |
||
61 | |||
/**
 * Wire the collaborators and load the configuration data.
 *
 * The promoted parameters become protected properties. The constructor then
 * calls importConfigAndData() and builds the two private helpers: the HTTP error
 * handler (wrapping a DeadLinkTransformer) and the URL checker.
 *
 * @param DeadlinkArchiverInterface[] $deadlinkArchivers archivers handed to the DeadLinkTransformer
 */
public function __construct(
    protected ExternMapper $mapper,
    protected HttpClientInterface $httpClient,
    protected InternetDomainParserInterface $domainParser,
    protected LoggerInterface $log = new NullLogger(),
    protected array $deadlinkArchivers = []
) {
    $this->importConfigAndData();

    $deadLinkTransformer = new DeadLinkTransformer($deadlinkArchivers, $domainParser, null, $log);
    $this->externHttpErrorLogic = new ExternHttpErrorLogic($deadLinkTransformer, $log);

    $this->urlChecker = new CheckURL($domainParser, $log);
}
||
80 | |||
/**
 * Transform "http://bla" => "{lien web|...}}", "{article}" or "{lien brisé}".
 *
 * The raw URL is returned unchanged whenever the link is skipped: unauthorized
 * URL, blacklisted site, domain disabled by config, empty page data, robots
 * NOINDEX (when $skipRobotNoIndex is true) or incomplete mapping.
 * An exception raised while fetching the page is delegated to the HTTP error
 * logic, which decides on the replacement wikitext.
 *
 * TODO refactor: chain of responsibility.
 * TODO refactor: return a data DTO? Too much responsibility!
 *
 * @param string  $url     external URL found in the wikitext
 * @param Summary $summary edit summary object passed to addSummaryLog()
 * @param array   $options only used to pass the registrable domain of an archived dead link
 *
 * @return string the serialized wiki template, or the URL itself as fallback
 *
 * @throws Exception
 */
public function process(string $url, Summary $summary = new Summary(), array $options = []): string
{
    $this->url = $url;
    $this->options = $options; // used only to pass RegistrableDomain of archived deadlink

    if (!$this->urlChecker->isURLAuthorized($url)) {
        return $url;
    }
    $this->registrableDomain = $this->urlChecker->getRegistrableDomain($url); // hack
    if ($this->isSiteBlackListed()) {
        $this->log->debug('Site blacklisted : ' . $this->registrableDomain, ['stats' => 'externref.skip.blacklisted']);

        return $url;
    }

    if ($this->registrableDomain && !$this->validateConfigWebDomain($this->registrableDomain)) {
        $this->log->debug(
            'Domain blocked by config : ' . $this->registrableDomain,
            ['stats' => 'externref.skip.domainDisabledByConfig']
        );

        return $url;
    }

    try {
        $url = WikiTextUtil::normalizeUrlForTemplate($url);
        $pageData = $this->extractPageDataFromUrl($url); // ['JSON-LD'] & ['meta'] !!
    } catch (Exception $exception) {
        return $this->externHttpErrorLogic->manageByHttpErrorMessage($exception->getMessage(), $url);
    }
    if ($this->emptyPageData($pageData, $url)) {
        $this->log->debug('Empty page data', ['stats' => 'externref.skip.emptyPageData']);

        return $url;
    }
    $noIndexValidator = new RobotNoIndexValidator($pageData, $url, $this->log); // todo inject
    if ($noIndexValidator->validate() && $this->skipRobotNoIndex) {
        $this->log->debug('NOINDEX detected', ['stats' => 'externref.skip.robotNoIndex']);
        // TODO ? return {lien web| titre=Titre inconnu... |note=noindex }
        // http://www.nydailynews.com/entertainment/jessica-barth-details-alleged-harvey-weinstein-encounter-article-1.3557986
        return $url;
    }

    $mappedData = $this->mapper->process($pageData); // only json-ld or only meta, after postprocess
    if ($this->emptyMapData($mappedData, $url)) {
        // Stats are reported through the logger context like every other skip path:
        // the logger is only known as a LoggerInterface (NullLogger by default),
        // which exposes no "stats" property.
        $this->log->debug('Empty map data', ['stats' => 'externref.skip.emptyMapData']);
        // TODO ? return {lien web| titre=Titre inconnu... site=prettydomain ...
        return $url;
    }
    $mappedData = $this->unsetAccesLibre($mappedData);

    $this->addSummaryLog($mappedData, $summary);
    $this->tagAndLog($mappedData);

    $template = $this->instanciateTemplate(
        $this->chooseTemplateNameByData($this->registrableDomain, $mappedData)
    );

    $mappedData = $this->replaceSomeData($mappedData, $template); // template specif + data + url

    $serialized = $this->optimizeAndSerialize($template, $mappedData);

    $normalized = Normalizer::normalize($serialized); // sometimes :bool
    if (!empty($normalized) && is_string($normalized)) {
        return $normalized;
    }
    if (!empty($serialized)) {
        return $serialized;
    }

    return $url; // error fallback
}
||
158 | |||
/**
 * Tell whether the current registrable domain is listed in $skip_domain.
 *
 * Always false when no registrable domain is known or when the
 * $skipSiteBlacklisted switch is off. Logs a notice when the site is skipped.
 */
protected function isSiteBlackListed(): bool
{
    if (!$this->registrableDomain || !$this->skipSiteBlacklisted) {
        return false;
    }
    if (!in_array($this->registrableDomain, $this->skip_domain)) {
        return false;
    }

    $this->log->notice("Skip web site " . $this->registrableDomain);

    return true;
}
||
167 | |||
/**
 * Check that the domain is not deactivated in the configuration.
 *
 * A domain counts as deactivated when its config entry is the string
 * 'deactivated' or an array holding a 'deactivated' key. As a side effect the
 * entry is normalized to an array so that later code can index it safely.
 *
 * todo move transformer
 *
 * @param string $domain registrable domain to look up in $this->config
 *
 * @return bool false when the domain is deactivated, true otherwise
 */
protected function validateConfigWebDomain(string $domain): bool
{
    $this->logDebugConfigWebDomain($domain);

    // todo move to config
    $domainConfig = $this->config[$domain] ?? [];

    // The string form must be tested BEFORE the entry is coerced to an array,
    // otherwise it can never match. It is stored as an array flag so that
    // subsequent calls for the same domain still see the deactivation.
    if ($domainConfig === 'deactivated') {
        $domainConfig = ['deactivated' => true];
    }
    $this->config[$domain] = is_array($domainConfig) ? $domainConfig : [];

    if (isset($this->config[$domain]['deactivated'])) {
        $this->log->info("Domain " . $domain . " disabled\n");

        return false;
    }

    return true;
}
||
187 | |||
/**
 * Log at debug level whether a config entry exists for the given domain.
 */
protected function logDebugConfigWebDomain(string $domain): void
{
    $suffix = isset($this->config[$domain]) ? " configuré" : " non configuré";

    $this->log->debug("Domain " . $domain . $suffix);
}
||
196 | |||
/**
 * Fetch the external page and return the data it exposes.
 *
 * Waits HTTP_REQUEST_LOOP_DELAY seconds before each request, stores the
 * fetched page in $this->externalPage and logs the data at debug level.
 *
 * Stay
 *
 * @throws Exception
 */
protected function extractPageDataFromUrl(string $url): array
{
    // Pause before every HTTP request.
    sleep(self::HTTP_REQUEST_LOOP_DELAY);

    $this->externalPage = (new ExternPageFactory($this->httpClient, $this->log))
        ->fromURL($url, $this->domainParser);

    $extractedData = $this->externalPage->getData();
    $this->log->debug('metaData', $extractedData);

    return $extractedData;
}
||
211 | |||
212 | // stay |
||
213 | |||
/**
 * Tell whether the page data carries neither 'JSON-LD' nor 'meta' content.
 *
 * Logs a notice with the URL when nothing usable was found.
 */
protected function emptyPageData(array $pageData, string $url): bool
{
    $hasJsonLd = !empty($pageData['JSON-LD']);
    $hasMeta = !empty($pageData['meta']);

    // An empty array has neither key, so it falls through to the notice as well.
    if ($hasJsonLd || $hasMeta) {
        return false;
    }

    $this->log->notice('No metadata : ' . $url);

    return true;
}
||
226 | |||
/**
 * Check that the mapped data is usable: both 'url' and 'titre' must be non-empty.
 *
 * The domain is not skipped here, because the failure may be a 404 or an error
 * affecting only this URL.
 *
 * @return bool true when the mapping is incomplete (an info message is logged)
 */
protected function emptyMapData(array $mapData, string $url): bool
{
    $hasUrl = !empty($mapData['url']);
    $hasTitle = !empty($mapData['titre']);

    if ($hasUrl && $hasTitle) {
        return false;
    }

    $this->log->info('Mapping incomplet : ' . $url);

    return true;
}
||
240 | |||
241 | // stay |
||
242 | |||
/**
 * Drop the 'accès url' entry when its value is 'libre'
 * (no 'accès url=libre', following the February 2021 discussion).
 * Any other value is left untouched.
 */
protected function unsetAccesLibre(array $mapData): array
{
    $access = $mapData['accès url'] ?? null;

    if ($access === 'libre') {
        unset($mapData['accès url']);
    }

    return $mapData;
}
||
253 | |||
/**
 * Pick the wiki template name ('article' or 'lien web') for the mapped data.
 *
 * 'article' is a candidate when the data has a DOI, when the domain config asks
 * for it ('article', or 'auto' with DATA-ARTICLE set), when DATA-ARTICLE is set
 * and the domain is a known newspaper, or when isScientificDomain() says so.
 * A config value of 'lien web' or a missing 'date' always forces 'lien web'.
 *
 * todo Stay ?
 * todo refactor for readability
 *
 * @throws Exception
 */
protected function chooseTemplateNameByData(?string $domain, array $mapData): string
{
    if (!$domain) {
        return 'lien web';
    }
    $this->config[$domain]['template'] ??= [];
    $mapData['DATA-ARTICLE'] ??= false;

    $configuredTemplate = $this->config[$domain]['template'];
    $flaggedAsArticle = $mapData['DATA-ARTICLE'];

    $articleCandidate = !empty($mapData['doi']);

    // Evaluated even when a DOI is present, in the same short-circuit order.
    if ($configuredTemplate === 'article'
        || ($configuredTemplate === 'auto' && $flaggedAsArticle)
        || ($flaggedAsArticle && !empty($this->publisherData['newspaper'][$domain]))
        || $this->isScientificDomain()
    ) {
        $articleCandidate = true;
    }

    if (!$articleCandidate || $configuredTemplate === 'lien web') {
        return 'lien web';
    }

    // A date is mandatory for {article}.
    return isset($mapData['date']) ? 'article' : 'lien web';
}
||
289 | |||
290 | protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array |
||
306 | } |
||
307 | |||
308 | // postprocess data |
||
309 | |||
310 | protected function fallbackIfSitenameNull(array $mapData, AbstractWikiTemplate $template): array |
||
320 | } |
||
321 | |||
/**
 * Prefix the 'site' value with the original registrable domain when the option
 * 'originalRegistrableDomain' is set, producing "<original> via <site>".
 *
 * The data is returned unchanged when the option is empty or when 'site' is
 * missing or empty.
 *
 * @param array $mapData mapped template data
 *
 * @return array the mapped data, with 'site' possibly rewritten
 */
protected function correctSiteViaWebarchiver(array $mapData): array
{
    // !empty() guards against a missing 'site' key, which would otherwise
    // raise an "Undefined array key" warning.
    if (!empty($this->options['originalRegistrableDomain']) && !empty($mapData['site'])) {
        $mapData['site'] = $this->options['originalRegistrableDomain'] . ' via ' . $mapData['site'];
    }

    return $mapData;
}
||
330 | |||
331 | protected function replaceURLbyOriginal(array $mapData): array |
||
336 | } |
||
337 | |||
338 | /** |
||
339 | * @throws Exception |
||
340 | */ |
||
341 | protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string |
||
351 | } |
||
352 | |||
353 | private function instanciateTemplate(string $templateName): AbstractWikiTemplate |
||
360 | } |
||
361 | } |
||
362 |