Total Complexity | 56 |
Total Lines | 307 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like ExternRefTransformer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use ExternRefTransformer, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
33 | class ExternRefTransformer implements ExternRefTransformerInterface |
||
34 | { |
||
35 | use SummaryExternTrait, PublisherLogicTrait; |
||
|
|||
36 | |||
37 | public const HTTP_REQUEST_LOOP_DELAY = 10; |
||
38 | public const SKIP_DOMAIN_FILENAME = __DIR__ . '/../resources/config_skip_domain.txt'; |
||
39 | public const REPLACE_404 = true; |
||
40 | public const REPLACE_410 = true; |
||
41 | public const CONFIG_PRESSE = __DIR__ . '/../resources/config_presse.yaml'; |
||
42 | public const CONFIG_NEWSPAPER_JSON = __DIR__ . '/../resources/data_newspapers.json'; |
||
43 | public const CONFIG_SCIENTIFIC_JSON = __DIR__ . '/../resources/data_scientific_domain.json'; |
||
44 | public const CONFIG_SCIENTIFIC_WIKI_JSON = __DIR__ . '/../resources/data_scientific_wiki.json'; |
||
45 | |||
46 | public bool $skipSiteBlacklisted = true; |
||
47 | public bool $skipRobotNoIndex = true; |
||
48 | public array $summaryLog = []; |
||
49 | |||
50 | protected $config; |
||
51 | protected string $registrableDomain; |
||
52 | protected string $url; |
||
53 | protected array $publisherData = []; |
||
54 | protected array $skip_domain = []; |
||
55 | protected ExternPage $externalPage; |
||
56 | protected ?Summary $summary; |
||
57 | protected ?string $originDomain; |
||
58 | protected array $options = []; |
||
59 | private readonly ExternHttpErrorLogic $externHttpErrorLogic; |
||
60 | private readonly CheckURL $urlChecker; |
||
61 | |||
/**
 * Wire the collaborators and load the configuration/data files.
 *
 * The HTTP error logic and the URL checker are built here from the
 * injected domain parser, logger and (optional) dead-link archiver.
 */
public function __construct(
    protected ExternMapper $mapper,
    protected ExternHttpClientInterface $httpClient,
    protected InternetDomainParserInterface $domainParser,
    protected LoggerInterface $log = new NullLogger(),
    protected ?DeadlinkArchiverInterface $deadlinkArchiver = null
) {
    // Configuration is imported first, before any helper object is created.
    $this->importConfigAndData();

    $deadLinkTransformer = new DeadLinkTransformer($deadlinkArchiver, $domainParser, null, $log);
    $this->externHttpErrorLogic = new ExternHttpErrorLogic($deadLinkTransformer, $log);

    $this->urlChecker = new CheckURL($domainParser, $log);
}
||
77 | |||
/**
 * Transform "http://bla" => "{lien web|...}}", "{article}" or "{lien brisé}".
 *
 * The given URL is returned as-is whenever a step declines to handle it:
 * unauthorized URL, blacklisted or deactivated domain, no metadata,
 * robots "noindex" skip, incomplete mapping, or empty serialization.
 *
 * TODO refac: chain of responsibility or composite pattern
 * TODO refac: return a data DTO? Too much responsibility here!
 *
 * @throws Exception
 */
public function process(string $url, Summary $summary = new Summary(), array $options = []): string
{
    $this->url = $url;
    $this->options = $options; // used only to pass RegistrableDomain of archived deadlink

    if (!$this->urlChecker->isURLAuthorized($url)) {
        return $url;
    }

    $this->registrableDomain = $this->urlChecker->getRegistrableDomain($url); // hack

    // Short-circuit keeps the original order: blacklist check first, then domain config.
    if ($this->isSiteBlackListed() || !$this->validateConfigWebDomain($this->registrableDomain)) {
        return $url;
    }

    try {
        $url = WikiTextUtil::normalizeUrlForTemplate($url);
        $metadata = $this->extractPageDataFromUrl($url); // ['JSON-LD'] & ['meta'] !!
    } catch (Exception $httpError) {
        // Any failure while fetching is delegated to the HTTP error logic.
        return $this->externHttpErrorLogic->manageByHttpErrorMessage($httpError->getMessage(), $url);
    }

    if ($this->emptyPageData($metadata, $url)) {
        return $url;
    }

    $robotValidator = new RobotNoIndexValidator($metadata, $url, $this->log); // todo inject
    // validate() is always evaluated first, exactly as before, then the skip flag.
    if ($robotValidator->validate() && $this->skipRobotNoIndex) {
        // TODO ? return {lien web| titre=Titre inconnu... |note=noindex }
        // http://www.nydailynews.com/entertainment/jessica-barth-details-alleged-harvey-weinstein-encounter-article-1.3557986
        return $url;
    }

    $templateData = $this->mapper->process($metadata); // only json-ld or only meta, after postprocess
    if ($this->emptyMapData($templateData, $url)) {
        // TODO ? return {lien web| titre=Titre inconnu... site=prettydomain ...
        return $url;
    }
    $templateData = $this->unsetAccesLibre($templateData);

    $this->addSummaryLog($templateData, $summary);
    $this->tagAndLog($templateData);

    $template = $this->chooseTemplateByData($this->registrableDomain, $templateData);
    $templateData = $this->replaceSomeData($templateData, $template); // template specific + data + url

    $wikiText = $this->optimizeAndSerialize($template, $templateData);
    $normalizedWikiText = Normalizer::normalize($wikiText); // sometimes :bool

    if (is_string($normalizedWikiText) && !empty($normalizedWikiText)) {
        return $normalizedWikiText;
    }

    // Fall back to the raw serialization, then to the URL itself on error.
    return empty($wikiText) ? $url : $wikiText;
}
||
145 | |||
/**
 * Tell whether the current registrable domain must be skipped.
 *
 * Returns true only when the skip option is enabled and the domain is
 * listed in $this->skip_domain; a notice is logged in that case.
 * The lookup is strict so that no loose type juggling can produce a match.
 */
protected function isSiteBlackListed(): bool
{
    if (!$this->skipSiteBlacklisted) {
        return false;
    }

    if (!in_array($this->registrableDomain, $this->skip_domain, true)) {
        return false;
    }

    $this->log->notice("Skip web site " . $this->registrableDomain);

    return true;
}
||
154 | |||
/**
 * Check that the given domain is not deactivated in the configuration.
 *
 * A domain is deactivated when its config entry is the string 'deactivated'
 * or an array holding a 'deactivated' key. The deactivation test is done
 * BEFORE the entry is normalized to an array: normalizing first would
 * overwrite the 'deactivated' string with [] and the domain would wrongly
 * be accepted.
 *
 * Side effect: $this->config[$domain] is always an array after this call.
 *
 * todo move transformer
 *
 * @return bool false when the domain is deactivated, true otherwise
 */
protected function validateConfigWebDomain(string $domain): bool
{
    $this->logDebugConfigWebDomain($domain);

    // todo move to config
    $domainConfig = $this->config[$domain] ?? [];

    $isDeactivated = $domainConfig === 'deactivated'
        || (is_array($domainConfig) && isset($domainConfig['deactivated']));

    // Later code indexes $this->config[$domain] as an array, so normalize it.
    $this->config[$domain] = is_array($domainConfig) ? $domainConfig : [];

    if ($isDeactivated) {
        $this->log->info("Domain " . $domain . " disabled\n");

        return false;
    }

    return true;
}
||
174 | |||
/**
 * Log (debug level) whether the domain has an entry in the configuration.
 */
protected function logDebugConfigWebDomain(string $domain): void
{
    $suffix = isset($this->config[$domain]) ? " configuré" : " non configuré";

    $this->log->debug("Domain " . $domain . $suffix);
}
||
183 | |||
/**
 * Fetch the external page for the URL and return its extracted data.
 *
 * Waits HTTP_REQUEST_LOOP_DELAY seconds before each request, stores the
 * page object in $this->externalPage and logs the data at debug level.
 *
 * Stay
 *
 * @throws Exception
 */
protected function extractPageDataFromUrl(string $url): array
{
    // Delay applied before every HTTP request.
    sleep(self::HTTP_REQUEST_LOOP_DELAY);

    $this->externalPage = ExternPageFactory::fromURL(
        $url,
        $this->domainParser,
        $this->httpClient,
        $this->log
    );

    $extracted = $this->externalPage->getData();
    $this->log->debug('metaData', $extracted);

    return $extracted;
}
||
197 | |||
198 | // stay |
||
199 | |||
/**
 * Tell whether the page data carries no usable metadata.
 *
 * Metadata is considered present when either the 'JSON-LD' or the 'meta'
 * entry is non-empty. A notice is logged when nothing is found.
 */
protected function emptyPageData(array $pageData, string $url): bool
{
    // An empty array has neither entry, so it is covered by the same test.
    $hasMetadata = !empty($pageData['JSON-LD']) || !empty($pageData['meta']);
    if ($hasMetadata) {
        return false;
    }

    $this->log->notice('No metadata : ' . $url);

    return true;
}
||
212 | |||
/**
 * Check that the mapped data is valid: both 'url' and 'titre' must be non-empty.
 *
 * The domain is not skipped here, because the problem may be a 404 or an
 * error that only concerns this particular URL.
 *
 * @return bool true when the mapping is incomplete (an info message is logged)
 */
protected function emptyMapData(array $mapData, string $url): bool
{
    $isComplete = !empty($mapData['url']) && !empty($mapData['titre']);
    if ($isComplete) {
        return false;
    }

    $this->log->info('Mapping incomplet : ' . $url);

    return true;
}
||
226 | |||
227 | // stay |
||
228 | |||
/**
 * Drop the 'accès url' entry when its value is 'libre'.
 *
 * No 'accès url=libre' parameter is emitted (discussion of February 2021).
 * Any other value of 'accès url' is kept untouched.
 */
protected function unsetAccesLibre(array $mapData): array
{
    if (($mapData['accès url'] ?? null) !== 'libre') {
        return $mapData;
    }

    unset($mapData['accès url']);

    return $mapData;
}
||
239 | |||
/**
 * Choose the wiki template ('article' or 'lien web') for the mapped data,
 * create it, and increment the matching 'count <template>' summary counter.
 *
 * 'article' is selected when the data has a DOI, when the domain config asks
 * for it ('article', or 'auto' with article data), when article data comes
 * from a known newspaper domain, or when the domain is scientific.
 * 'lien web' wins when the domain config forces it or when no date is set.
 *
 * todo Stay ?
 * todo refac: make this more readable
 *
 * @throws Exception
 */
protected function chooseTemplateByData(string $domain, array $mapData): AbstractWikiTemplate
{
    // Template choice logic
    $this->config[$domain]['template'] ??= [];
    $mapData['DATA-ARTICLE'] ??= false;

    $configuredTemplate = $this->config[$domain]['template'];
    $hasDoi = !empty($mapData['doi']);

    // Evaluated on its own so isScientificDomain() is reached under the same
    // conditions as before, whether or not a DOI is present.
    $matchesArticleRule = $configuredTemplate === 'article'
        || ($configuredTemplate === 'auto' && $mapData['DATA-ARTICLE'])
        || ($mapData['DATA-ARTICLE'] && !empty($this->publisherData['newspaper'][$domain]))
        || $this->isScientificDomain();

    $templateName = ($hasDoi || $matchesArticleRule) ? 'article' : 'lien web';

    // Explicit 'lien web' config wins; a date is mandatory for {article}.
    if ($configuredTemplate === 'lien web' || !isset($mapData['date'])) {
        $templateName = 'lien web';
    }

    $template = WikiTemplateFactory::create($templateName);
    $template->userSeparator = " |";

    $counterKey = 'count ' . $templateName;
    $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);

    return $template;
}
||
277 | |||
278 | protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array |
||
294 | } |
||
295 | |||
296 | // postprocess data |
||
297 | |||
298 | protected function fallbackIfSitenameNull(array $mapData, AbstractWikiTemplate $template): array |
||
308 | } |
||
309 | |||
/**
 * Put the original URL stored in $this->url back into the 'url' entry.
 */
protected function replaceURLbyOriginal(array $mapData): array
{
    // array_replace keeps existing keys and positions, like a direct assignment.
    return array_replace($mapData, ['url' => $this->url]);
}
||
316 | |||
317 | /** |
||
318 | * |
||
319 | * @throws Exception |
||
320 | */ |
||
321 | protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string |
||
331 | } |
||
332 | |||
333 | protected function correctSiteViaWebarchiver(array $mapData): array |
||
342 |