Total Complexity | 166 |
Total Lines | 765 |
Duplicated Lines | 0 % |
Changes | 15 | ||
Bugs | 8 | Features | 2 |
Complex classes like EntityDecoder often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use EntityDecoder, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
20 | class EntityDecoder |
||
21 | { |
||
22 | private $entitiesToParse = ['bold', 'italic', 'code', 'pre', 'text_mention', 'text_link', 'strikethrough', 'underline', 'spoiler', 'blockquote', 'custom_emoji']; |
||
23 | private $entities = []; |
||
24 | private $style; |
||
25 | |||
26 | /** |
||
27 | * @param string $style Either 'HTML', 'Markdown' or 'MarkdownV2'. |
||
28 | * |
||
29 | * @throws InvalidArgumentException if the provided style name is invalid. |
||
30 | */ |
||
31 | public function __construct(string $style = 'HTML') |
||
40 | } |
||
41 | } |
||
42 | |||
43 | /** |
||
44 | * Decode entities and return decoded text |
||
45 | * |
||
46 | * @param object $message message object to reconstruct Entities from (json decoded without assoc). |
||
47 | * @return string |
||
48 | */ |
||
49 | public function decode($message): string |
||
140 | } |
||
141 | |||
/**
 * Extract every outermost entity of a message as its own formatted string.
 *
 * The message text (or caption) is walked character by character. Style
 * markers are opened and closed at the positions reported by
 * checkForEntityStart() / checkForEntityStop(), and one string is collected
 * each time the stack of opened entities becomes empty again. Nested
 * entities are therefore rendered inside their parent; text outside of any
 * entity is not returned.
 *
 * @param object $message message object to reconstruct Entities from (json decoded without assoc).
 * @return array list of formatted entity strings; empty when the message has no text or no entities.
 *
 * @throws \Exception if $message is not an object.
 */
public function extractAllEntities($message): array
{
    if (!is_object($message))
    {
        throw new \Exception('message must be an object');
    }
    //Forget entities remembered from a previously processed message, otherwise
    //a message without entities would be decoded with the stale ones
    $this->entities = [];
    //Get available entities (for text or for attachment like photo, document, etc.)
    if (!empty($message->entities))
    {
        $this->entities = $message->entities;
    }
    if (!empty($message->caption_entities))
    {
        $this->entities = $message->caption_entities;
    }
    //Switch to UTF-8 and make sure the previous encoding is restored on every exit path
    $prevencoding = mb_internal_encoding();
    mb_internal_encoding('UTF-8');
    try
    {
        //Get available text (text message or caption for attachment).
        //Compared against '' explicitly so that a message consisting of "0" is not treated as empty
        $textToDecode = (isset($message->text) && $message->text !== '')
            ? $message->text
            : ($message->caption ?? '');
        //if the message has no entities or no text return the empty array
        if (empty($this->entities) || (string) $textToDecode === '')
        {
            return [];
        }

        $entitiesArray = [];
        $entitytext = "";
        $openedEntities = [];
        $currentPosition = 0;
        //Cycle characters one by one to calculate begins and ends of entities and escape special chars
        foreach ($this->splitCharAndLength($textToDecode) as $item)
        {
            $endPosition = $currentPosition + $item['length'];
            $entityCheckStart = $this->checkForEntityStart($currentPosition);
            $entityCheckStop = $this->checkForEntityStop($endPosition);
            if ($entityCheckStart !== false)
            {
                foreach ($entityCheckStart as $stEntity)
                {
                    $startChar = $this->getEntityStartString($stEntity);
                    $openedEntities[] = $stEntity;
                    $entitytext .= $startChar;
                }
            }
            //The character belongs to the output whenever an entity starts here,
            //stops here, or is still open; it is emitted exactly once
            if ($entityCheckStart !== false || $entityCheckStop !== false || !empty($openedEntities))
            {
                $entitytext .= $this->escapeSpecialChars($item['char'], true, $openedEntities);
            }
            if ($entityCheckStop !== false)
            {
                //Italic and underline closing together need a separator in MarkdownV2;
                //the check removes both of them from $entityCheckStop (passed by reference)
                if ($this->style == 'MarkdownV2' && $this->checkMarkdownV2AmbiguousEntities($entityCheckStop))
                {
                    $entitytext .= "_\r__";
                    array_pop($openedEntities);
                    array_pop($openedEntities);
                    if (empty($openedEntities))
                    {
                        $entitiesArray[] = $entitytext;
                        $entitytext = "";
                    }
                }
                foreach ($entityCheckStop as $stEntity)
                {
                    $entitytext .= $this->getEntityStopString($stEntity);
                    array_pop($openedEntities);
                    //The outermost entity has just been closed: store it and start a new one
                    if (empty($openedEntities))
                    {
                        $entitiesArray[] = $entitytext;
                        $entitytext = "";
                    }
                }
            }
            $currentPosition = $endPosition;
        }
        //Entities still open at the end of the text are closed innermost first and
        //stored once, so that no half-closed (unbalanced) string is returned
        if (!empty($openedEntities))
        {
            foreach (array_reverse($openedEntities) as $oe)
            {
                $entitytext .= $this->getEntityStopString($oe);
            }
            $entitiesArray[] = $entitytext;
        }
        return $entitiesArray;
    }
    finally
    {
        if ($prevencoding)
        {
            mb_internal_encoding($prevencoding);
        }
    }
}
||
253 | |||
/**
 * Split message text into an array of characters with their lengths.
 *
 * The text is first split into Unicode code points. Variation selector-16
 * (U+FE0F), the emoji skin tone modifiers (U+1F3FB..U+1F3FF) and the Zero
 * Width Joiner (U+200D) are appended to the preceding character, and the
 * code point that follows a Zero Width Joiner is appended as well, so that
 * such sequences are handled as a single character.
 *
 * @param string $string UTF-8 text to split.
 * @return array list of ['char' => string, 'length' => mixed] where 'length'
 *               is whatever getUTF16CodePointsLength() reports for the char.
 *
 * @throws \InvalidArgumentException if the text cannot be split (e.g. it is not valid UTF-8).
 */
protected function splitCharAndLength($string)
{
    //Split string in individual unicode points
    $codePoints = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
    if ($codePoints === false)
    {
        throw new \InvalidArgumentException('text is not valid UTF-8');
    }
    //Compared as characters: a UTF-16 hex dump of U+1F3FB..U+1F3FF would be a
    //surrogate pair and would never equal the code point number
    $modifiers = ["\u{FE0F}", "\u{1F3FB}", "\u{1F3FC}", "\u{1F3FD}", "\u{1F3FE}", "\u{1F3FF}"];
    $zeroWidthJoiner = "\u{200D}";

    $chars = [];
    $joiner = false;
    foreach ($codePoints as $codePoint)
    {
        $isModifier = in_array($codePoint, $modifiers, true);
        $isJoiner = ($codePoint === $zeroWidthJoiner);
        $last = count($chars) - 1;
        if ($last >= 0 && ($isModifier || $isJoiner || $joiner))
        {
            //Modifier, ZWJ, or the char right after a ZWJ: append to the previous char
            $chars[$last] .= $codePoint;
        }
        else
        {
            //New char (also when the text starts with a modifier or a ZWJ,
            //because there is no previous char to append to)
            $chars[] = $codePoint;
        }
        if ($isJoiner)
        {
            $joiner = true;
        }
        elseif (!$isModifier)
        {
            //A modifier leaves a pending joiner untouched; any other char consumes it
            $joiner = false;
        }
    }

    $data = [];
    foreach ($chars as $char)
    {
        $data[] = ["char" => $char, "length" => $this->getUTF16CodePointsLength($char)];
    }
    return $data;
}
||
302 | |||
/**
 * Apply Telegram escape rules for the chosen style to a single character.
 *
 * - Markdown, outside an entity: '*', '_', '[' and '`' are prefixed with a backslash.
 * - Markdown, inside an entity: a '*' or '_' equal to the start string of the
 *   first opened entity is returned as "<char>\<char><char>" (close the
 *   entity, emit the escaped char, reopen the entity); anything else is unchanged.
 * - HTML: '<', '>' and '&' are replaced by '&lt;', '&gt;' and '&amp;'.
 * - MarkdownV2: a newline inside an opened blockquote is followed by '>';
 *   otherwise the reserved characters are prefixed with a backslash.
 * - Any other style: the character is returned unchanged.
 *
 * @param string $char         character to escape.
 * @param bool   $isEntityOpen whether the character is inside an entity (only read for Markdown).
 * @param array  $entities     entities currently opened, outermost first.
 * @return string the escaped character.
 */
protected function escapeSpecialChars($char, $isEntityOpen, $entities)
{
    if ($this->style == 'Markdown')
    {
        if (!$isEntityOpen)
        {
            return in_array($char, ['*', '_', '[', '`'], true) ? "\\".$char : $char;
        }
        //Guard against an empty list so that no undefined offset is read
        $entity = $entities[0] ?? null;
        if ($entity !== null
            && ($char === '*' || $char === '_')
            && $char === $this->getEntityStartString($entity))
        {
            return $char."\\".$char.$char;
        }
        return $char;
    }
    if ($this->style == 'HTML')
    {
        //Only these three characters are replaced
        $htmlEscapes = ['<' => '&lt;', '>' => '&gt;', '&' => '&amp;'];
        return $htmlEscapes[$char] ?? $char;
    }
    if ($this->style == 'MarkdownV2')
    {
        if ($char == "\n")
        {
            //Every new line of a blockquote has to start with '>'
            foreach ($entities as $entity)
            {
                if ($entity->type === 'blockquote')
                {
                    return $char.'>';
                }
            }
        }
        $reserved = ['_', '*', '[', ']', '(', ')', '~', '`', '>', '#', '+', '-', '=', '|', '{', '}', '.', '!', '\\'];
        return in_array($char, $reserved, true) ? '\\'.$char : $char;
    }
    return $char;
}
||
367 | |||
/**
 * Return the opening marker of an entity for the chosen style.
 *
 * The marker depends on the configured style ('Markdown', 'HTML' or
 * 'MarkdownV2') and on the entity type. An empty string is returned when
 * the style is none of the three or the type has no marker in that style.
 *
 * @param object $entity entity object; its 'type' is read, plus 'language',
 *                       'url', 'user' or 'custom_emoji_id' for the types that use them.
 * @return string the opening marker, possibly empty.
 */
protected function getEntityStartString($entity)
{
    switch ($this->style)
    {
        case 'Markdown':
            switch ($entity->type)
            {
                case 'bold':
                    return '*';
                case 'italic':
                    return '_';
                case 'code':
                    return '`';
                case 'pre':
                    //Fenced block, optionally followed by the language name
                    return '```'.(isset($entity->language) ? $entity->language : '')."\n";
                case 'text_mention':
                case 'text_link':
                    return '[';
            }
            return '';

        case 'HTML':
            switch ($entity->type)
            {
                case 'bold':
                    return '<b>';
                case 'italic':
                    return '<i>';
                case 'underline':
                    return '<u>';
                case 'strikethrough':
                    return '<s>';
                case 'spoiler':
                    return '<span class="tg-spoiler">';
                case 'code':
                    return '<code>';
                case 'pre':
                    //A language adds a nested <code> element carrying the language class
                    if (isset($entity->language))
                    {
                        return '<pre>'.'<code class="language-'.$entity->language.'">';
                    }
                    return '<pre>';
                case 'text_mention':
                    return '<a href="tg://user?id='.$entity->user->id.'">';
                case 'text_link':
                    return '<a href="'.$entity->url.'">';
                case 'custom_emoji':
                    return '<tg-emoji emoji-id="'.$entity->custom_emoji_id.'">';
                case 'blockquote':
                    return '<blockquote>';
            }
            return '';

        case 'MarkdownV2':
            switch ($entity->type)
            {
                case 'bold':
                    return '*';
                case 'italic':
                    return '_';
                case 'spoiler':
                    return '||';
                case 'code':
                    return '`';
                case 'pre':
                    //Fenced block, optionally followed by the language name
                    return '```'.(isset($entity->language) ? $entity->language : '')."\n";
                case 'underline':
                    return '__';
                case 'strikethrough':
                    return '~';
                case 'text_mention':
                case 'text_link':
                    return '[';
                case 'custom_emoji':
                    return '![';
                case 'blockquote':
                    return '>';
            }
            return '';
    }
    return '';
}
||
540 | |||
541 | /** |
||
542 | * Check if there are entities that start at the given position and return them |
||
543 | */ |
||
544 | protected function checkForEntityStart($pos) |
||
561 | } |
||
562 | } |
||
563 | |||
564 | /** |
||
565 | * Get the end string of the entity for the chosen style |
||
566 | */ |
||
567 | protected function getEntityStopString($entity) |
||
724 | } |
||
725 | |||
/**
 * Find the parsable entities that end exactly at the given position.
 *
 * An entity ends at $pos when its offset plus its length equals $pos; only
 * types listed in $this->entitiesToParse are considered. Matches are
 * returned in reverse order of appearance because nested entities have to
 * be closed innermost first.
 *
 * @param int $pos position to test.
 * @return array|false the matching entities in reverse order, or false when there is none.
 */
protected function checkForEntityStop($pos)
{
    $ending = [];
    foreach ($this->entities as $candidate)
    {
        $endsHere = ($candidate->offset + $candidate->length == $pos);
        if ($endsHere && in_array($candidate->type, $this->entitiesToParse))
        {
            $ending[] = $candidate;
        }
    }
    return empty($ending) ? false : array_reverse($ending);
}
||
748 | |||
/**
 * Detect the ambiguous italic + underline pair of the MarkdownV2 style.
 *
 * When exactly two of the given entities are of type 'italic' or
 * 'underline', those two are removed from $entitiesToCheck (the array is
 * passed by reference and replaced by the remaining entities, in order) and
 * true is returned. In every other case the array is left untouched and
 * false is returned.
 *
 * @param array $entitiesToCheck entities to inspect; modified in place on a match.
 * @return bool true when the ambiguous pair was found and removed.
 */
protected function checkMarkdownV2AmbiguousEntities(&$entitiesToCheck)
{
    $ambiguousCount = 0;
    $others = [];
    //Single pass: count the italic/underline entities and keep everything else
    foreach ($entitiesToCheck as $candidate)
    {
        if ($candidate->type == 'italic' || $candidate->type == 'underline')
        {
            $ambiguousCount++;
            continue;
        }
        $others[] = $candidate;
    }
    if ($ambiguousCount != 2)
    {
        return false;
    }
    $entitiesToCheck = $others;
    return true;
}
||
778 | |||
779 | /** |
||
780 | * Count UTF-16 code units of the char passed |
||
781 | */ |
||
782 | protected function getUTF16CodePointsLength($char) { |
||
785 | } |
||
786 | } |
||
787 |