| Conditions | 20 |
| Paths | 1336 |
| Total Lines | 296 |
| Code Lines | 129 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 2 | ||
| Bugs | 1 | Features | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | <?php |
||
/**
 * Replaces unwanted invisible and formatting characters in a UTF-8 string.
 *
 * Processing happens in stages:
 *  1. A single regex pass replaces characters that are disallowed outright
 *     for the requested level, or disallowed in the context where they occur.
 *  2. If no join controls, regional indicators, emoji or variation selectors
 *     remain, the string is returned immediately.
 *  3. Otherwise the string is normalized via utf8_normalize_c(), recognised
 *     emoji sequences and sanctioned variation sequences are swapped for
 *     placeholders, and the remaining variation selectors and join controls
 *     are replaced unless they occur in a permitted script context.
 *  4. Placeholders are turned back into the characters they stand for.
 *
 * NOTE(review): the return values of preg_replace() and
 * preg_replace_callback() are not checked. Per the PHP manual they are null
 * on failure, so a PCRE error propagates through the later steps unchanged.
 *
 * @param string $string The text to sanitize. Cast to string on entry.
 * @param int $level Strictness, clamped to the range 0..2.
 *    - 2: CGJ, ZWNJ, ZWJ, variation selectors and tag characters are always
 *         replaced, plus everything level 1 replaces.
 *    - 1: ZWSP, Word Joiner, Bidi_Control characters, Hangul fillers and the
 *         shorthand/musical formatting characters are always replaced.
 *    - 0: those characters are replaced only in the specific contexts
 *         listed in the function body.
 * @param string $substitute Text inserted in place of each replaced
 *    character. Cast to string on entry.
 * @return string The sanitized string.
 */
function utf8_sanitize_invisibles($string, $level, $substitute)
{
	global $sourcedir;

	$string = (string) $string;
	$substitute = (string) $substitute;

	// Force the level into the supported range 0..2.
	$level = min(max((int) $level, 0), 2);

	require_once($sourcedir . '/Unicode/RegularExpressions.php');
	$props = utf8_regex_properties();

	/*
	 * Stage 1: collect the alternatives for the first replacement pass.
	 */
	$banned = array();

	// Control characters other than tab, carriage return and line feed.
	$banned[] = '[^\P{Cc}\t\r\n]';

	// Private use characters and non-characters. \p{Cc} and \p{Co} never
	// change, but \p{Cn} is different in every Unicode version, so our own
	// data is used for it rather than whatever the installed PCRE knows.
	$banned[] = '[\p{Co}' . $props['Cn'] . ']';

	// Other characters that are unwanted at every level.
	$banned[] = '['
		// Soft Hyphen.
		. '\x{AD}'
		// Khmer Vowel Inherent AQ and Khmer Vowel Inherent AA.
		// Unicode Standard ch. 16 says: "they are insufficient for [their]
		// purpose and should be considered errors in the encoding."
		. '\x{17B4}-\x{17B5}'
		// Invisible math characters.
		. '\x{2061}-\x{2064}'
		// Deprecated formatting characters.
		. '\x{206A}-\x{206F}'
		// Zero Width No-Break Space, a.k.a. Byte Order Mark.
		. '\x{FEFF}'
		// Annotation characters and Object Replacement Character.
		. '\x{FFF9}-\x{FFFC}'
		. ']';

	// Level 2 only: these are removed unconditionally.
	if ($level === 2)
	{
		$banned[] = '['
			// Combining Grapheme Character.
			. '\x{34F}'
			// Zero Width Non-Joiner.
			. '\x{200C}'
			// Zero Width Joiner.
			. '\x{200D}'
			// All variation selectors.
			. $props['Variation_Selector']
			// Tag characters.
			. '\x{E0000}-\x{E007F}'
			. ']';
	}

	if ($level >= 1)
	{
		// Levels 1 and 2: these are removed unconditionally.
		$banned[] = '['
			// Zero Width Space.
			. '\x{200B}'
			// Word Joiner.
			. '\x{2060}'
			// "Bidi_Control" characters. Without them, all characters behave
			// according to their default bidirectional text properties.
			. $props['Bidi_Control']
			// Hangul filler characters.
			// Used as placeholders in incomplete ideographs.
			. '\x{115F}\x{1160}\x{3164}\x{FFA0}'
			// Shorthand formatting characters.
			. '\x{1BCA0}-\x{1BCA3}'
			// Musical formatting characters.
			. '\x{1D173}-\x{1D17A}'
			. ']';
	}
	else
	{
		// Level 0: the same characters are removed only where out of place.

		// Zero Width Space is only allowed in certain scripts.
		$banned[] = '(?<![\p{Thai}\p{Myanmar}\p{Khmer}\p{Hiragana}\p{Katakana}])\x{200B}';

		// Word Joiner is disallowed inside words. (Yes, \w is Unicode safe.)
		$banned[] = '(?<=\w)\x{2060}(?=\w)';

		// Hangul Choseong Filler and Hangul Jungseong Filler must be followed
		// by more Hangul Jamo characters.
		$banned[] = '[\x{115F}\x{1160}](?![\x{1100}-\x{11FF}\x{A960}-\x{A97F}\x{D7B0}-\x{D7FF}])';

		// Hangul Filler must precede Hangul compatibility characters.
		$banned[] = '\x{3164}(?![\x{3130}-\x{318F}])';

		// Halfwidth Hangul Filler must precede halfwidth Hangul compatibility
		// characters.
		$banned[] = '\x{FFA0}(?![\x{FFA1}-\x{FFDC}])';

		// Shorthand formatting characters only next to other shorthand chars.
		$banned[] = '[\x{1BCA0}-\x{1BCA3}](?![\x{1BC00}-\x{1BC9F}])';
		$banned[] = '(?<![\x{1BC00}-\x{1BC9F}])[\x{1BCA0}-\x{1BCA3}]';

		// Musical formatting characters only next to other musical chars.
		$banned[] = '[\x{1D173}\x{1D175}\x{1D177}\x{1D179}](?![\x{1D100}-\x{1D1FF}])';
		$banned[] = '(?<![\x{1D100}-\x{1D1FF}])[\x{1D174}\x{1D176}\x{1D178}\x{1D17A}]';
	}

	if ($level < 2)
	{
		// Combining Grapheme Character has two uses: overriding standard
		// search and collation behaviour (never wanted), and ensuring correct
		// behaviour of combining marks in a few exceptional cases (legitimate).
		// So it is kept only when a combining mark follows it.
		$banned[] = '\x{34F}(?!\p{M})';

		// Tag characters are not allowed inside words.
		$banned[] = '(?<=\w)[\x{E0000}-\x{E007F}](?=\w)';
	}

	$string = preg_replace('/' . implode('|', $banned) . '/u', $substitute, $string);

	/*
	 * Stage 2: stop here unless something context-sensitive is left.
	 */
	$needs_more_work = '/[' . $props['Join_Control'] . $props['Regional_Indicator'] . $props['Emoji'] . $props['Variation_Selector'] . ']/u';

	if (!preg_match($needs_more_work, $string))
		return $string;

	/*
	 * Stage 3: context-sensitive checks.
	 */

	// The checks below only work on Normalization Form C.
	$string = utf8_normalize_c($string);

	$placeholders = array();

	// Placeholders are an MD5 hash wrapped in these two byte sequences.
	$ph_open = "\xEE\xB3\x9B";
	$ph_close = "\xEE\xB3\x9C";

	// Records a placeholder for the matched text and returns it.
	$stash = function ($matches) use (&$placeholders, $ph_open, $ph_close)
	{
		$placeholders[$matches[0]] = $ph_open . md5($matches[0]) . $ph_close;
		return $placeholders[$matches[0]];
	};

	// Shield known emoji from further processing.
	// Regex source is https://unicode.org/reports/tr51/#EBNF_and_Regex
	$emoji_char = '[' . $props['Emoji'] . ']';

	// An emoji may be followed by one of several kinds of modifier.
	$emoji_suffix = '(' .
			'[' . $props['Emoji_Modifier'] . ']' .
			'|' .
			'\x{FE0F}\x{20E3}?' .
			'|' .
			'[\x{E0020}-\x{E007E}]+\x{E007F}' .
		')?';

	$emoji_regex = '/' .
		// Flag emojis
		'[' . $props['Regional_Indicator'] . ']{2}' .
		// Or
		'|' .
		// An emoji character with its optional modifier
		$emoji_char . $emoji_suffix .
		// Possibly concatenated with Zero Width Joiner and more emojis
		// (e.g. the "family" emoji sequences)
		'(' . '\x{200D}' . $emoji_char . $emoji_suffix . ')*' .
		'/u';

	$string = preg_replace_callback(
		$emoji_regex,
		function ($matches) use ($stash)
		{
			// Leave lone ASCII characters that are not actually part of an
			// emoji sequence alone. They match because the digits 0-9, '*'
			// and '#' are the base characters of the "Emoji_Keycap_Sequence"
			// emojis.
			if (strlen($matches[0]) === 1)
				return $matches[0];

			return $stash($matches);
		},
		$string
	);

	// Deal with variation selectors, if any survived.
	$any_variation_selector = '/[' . $props['Variation_Selector'] . ']/u';

	if (preg_match($any_variation_selector, $string))
	{
		// Unicode publishes lists of sanctioned variation sequences and says
		// that any other use of a variation selector is unsanctioned.
		$sanctioned = array('/[' . $props['Ideographic'] . ']\K[\x{E0100}-\x{E01EF}]/u');

		foreach (utf8_regex_variation_selectors() as $selector => $base_chars)
			$sanctioned[] = '/[' . $base_chars . ']\K[' . $selector . ']/u';

		// Sanctioned selectors are hidden behind placeholders...
		$string = preg_replace_callback($sanctioned, $stash, $string);

		// ...so that whatever is still visible can be replaced.
		$string = preg_replace($any_variation_selector, $substitute, $string);
	}

	// Join controls are only allowed inside words in special circumstances.
	// See https://unicode.org/reports/tr31/#Layout_and_Format_Control_Characters
	if (preg_match('/[' . $props['Join_Control'] . ']/u', $string))
	{
		// Zero Width Non-Joiner (U+200C)
		$zwnj = "\xE2\x80\x8C";
		// Zero Width Joiner (U+200D)
		$zwj = "\xE2\x80\x8D";

		$placeholders[$zwnj] = "\xEE\x80\x8C";
		$placeholders[$zwj] = "\xEE\x80\x8D";

		// At level 0, ZWJ is allowed at word boundaries.
		if ($level === 0)
			$string = preg_replace('/\b\x{200D}|\x{200D}\b/u', $placeholders[$zwj], $string);

		// Swaps every known sequence inside a match for its placeholder.
		// $placeholders does not change inside the loop below, so a single
		// by-value snapshot taken here is enough.
		$protect = function ($matches) use ($placeholders)
		{
			return strtr($matches[0], $placeholders);
		};

		// Tests for Zero Width Joiner and Zero Width Non-Joiner.
		$joining_type_classes = utf8_regex_joining_type();
		$indic_classes = utf8_regex_indic();

		foreach (array_merge($joining_type_classes, $indic_classes) as $script => $classes)
		{
			if (isset($joining_type_classes[$script]))
			{
				// Cursive scripts like Arabic use ZWNJ in certain contexts.
				// For these scripts, use test A1 for allowing ZWNJ.
				// https://unicode.org/reports/tr31/#A1
				$left = empty($classes['Left_Joining']) ? '' : $classes['Left_Joining'];
				$right = empty($classes['Right_Joining']) ? '' : $classes['Right_Joining'];
				$transparent = empty($classes['Transparent']) ? '' : '[' . $classes['Transparent'] . ']*';

				// Dual joining characters count on both sides.
				if (!empty($classes['Dual_Joining']))
				{
					$left .= $classes['Dual_Joining'];
					$right .= $classes['Dual_Joining'];
				}

				$pattern = '[' . $left . ']' . $transparent . $zwnj . $transparent . '[' . $right . ']';
			}
			else
			{
				// Indic scripts with viramas use ZWNJ and ZWJ in certain
				// contexts. For these scripts, use tests A2 and B.
				// https://unicode.org/reports/tr31/#A2
				// https://unicode.org/reports/tr31/#B

				// A letter that is part of this particular script.
				$letter = '[' . $classes['Letter'] . ']';

				// Zero or more non-spacing marks used in this script.
				$marks = '[' . $classes['Nonspacing_Mark'] . ']*';

				// Zero or more non-spacing combining marks used in this script.
				$combining_marks = '[' . $classes['Nonspacing_Combining_Mark'] . ']*';

				// ZWNJ must be followed by another letter in the same script.
				$zwnj_ok = '\x{200C}(?=' . $combining_marks . $letter . ')';

				// ZWJ must NOT be followed by a vowel dependent character in
				// this script or by any character from a different script.
				$vowel_dependent = '';

				if (!empty($classes['Vowel_Dependent']))
					$vowel_dependent = '[' . $classes['Vowel_Dependent'] . ']|';

				$zwj_ok = '\x{200D}(?!' . $vowel_dependent . '[^' . $classes['All'] . '])';

				// Letter, marks, virama, combining marks, then the join control.
				$pattern = $letter . $marks . '[' . $classes['viramas'] . ']' . $combining_marks . '\K(?:' . $zwj_ok . '|' . $zwnj_ok . ')';
			}

			// Hide the join controls that this script permits.
			$string = preg_replace_callback('/' . $pattern . '/u', $protect, $string);

			// No need to try more scripts once no raw join control is left.
			$raw_left = strpos($string, $zwnj) !== false || strpos($string, $zwj) !== false;

			if (!$raw_left)
				break;
		}

		// Apart from the exceptions above, ZWNJ and ZWJ are not allowed.
		$string = str_replace(array($zwj, $zwnj), $substitute, $string);
	}

	/*
	 * Stage 4: put the protected sequences back.
	 */
	$string = strtr($string, array_flip($placeholders));

	return $string;
}
||
| 709 | ?> |