Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like MimeMagic often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use MimeMagic, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 144 | class MimeMagic { |
||
| 145 | /** |
||
| 146 | * @var array Mapping of media types to arrays of MIME types. |
||
| 147 | * This is used by findMediaType and getMediaType, respectively |
||
| 148 | */ |
||
| 149 | protected $mMediaTypes = null; |
||
| 150 | |||
| 151 | /** @var array Map of MIME type aliases |
||
| 152 | */ |
||
| 153 | protected $mMimeTypeAliases = null; |
||
| 154 | |||
| 155 | /** @var array Map of MIME types to file extensions (as a space separated list) |
||
| 156 | */ |
||
| 157 | protected $mMimeToExt = null; |
||
| 158 | |||
| 159 | /** @var array Map of file extensions to MIME types (as a space separated list) |
||
| 160 | */ |
||
| 161 | public $mExtToMime = null; |
||
| 162 | |||
| 163 | /** @var IEContentAnalyzer |
||
| 164 | */ |
||
| 165 | protected $mIEAnalyzer; |
||
| 166 | |||
| 167 | /** @var string Extra MIME types, set for example by media handling extensions |
||
| 168 | */ |
||
| 169 | private $mExtraTypes = ''; |
||
| 170 | |||
| 171 | /** @var string Extra MIME info, set for example by media handling extensions |
||
| 172 | */ |
||
| 173 | private $mExtraInfo = ''; |
||
| 174 | |||
| 175 | /** @var Config */ |
||
| 176 | private $mConfig; |
||
| 177 | |||
| 178 | /** @var MimeMagic The singleton instance |
||
| 179 | */ |
||
| 180 | private static $instance = null; |
||
| 181 | |||
| 182 | /** Initializes the MimeMagic object. This is called by MimeMagic::singleton(). |
||
| 183 | * |
||
| 184 | * This constructor parses the mime.types and mime.info files and builds internal mappings. |
||
| 185 | * |
||
| 186 | * @todo Make this constructor private once everything uses the singleton instance |
||
| 187 | * @param Config $config |
||
| 188 | */ |
||
| 189 | function __construct( Config $config = null ) { |
||
| 361 | |||
| 362 | /** |
||
| 363 | * Get an instance of this class |
||
| 364 | * @return MimeMagic |
||
| 365 | */ |
||
| 366 | public static function singleton() { |
||
| 374 | |||
| 375 | /** |
||
| 376 | * Adds to the list mapping MIME to file extensions. |
||
| 377 | * As an extension author, you are encouraged to submit patches to |
||
| 378 | * MediaWiki's core to add new MIME types to mime.types. |
||
| 379 | * @param string $types |
||
| 380 | */ |
||
| 381 | public function addExtraTypes( $types ) { |
||
| 384 | |||
| 385 | /** |
||
| 386 | * Adds to the list mapping MIME to media type. |
||
| 387 | * As an extension author, you are encouraged to submit patches to |
||
| 388 | * MediaWiki's core to add new MIME info to mime.info. |
||
| 389 | * @param string $info |
||
| 390 | */ |
||
| 391 | public function addExtraInfo( $info ) { |
||
| 394 | |||
| 395 | /** |
||
| 396 | * Returns a list of file extensions for a given MIME type as a space |
||
| 397 | * separated string or null if the MIME type was unrecognized. Resolves |
||
| 398 | * MIME type aliases. |
||
| 399 | * |
||
| 400 | * @param string $mime |
||
| 401 | * @return string|null |
||
| 402 | */ |
||
| 403 | public function getExtensionsForType( $mime ) { |
||
| 421 | |||
| 422 | /** |
||
| 423 | * Returns a list of MIME types for a given file extension as a space |
||
| 424 | * separated string or null if the extension was unrecognized. |
||
| 425 | * |
||
| 426 | * @param string $ext |
||
| 427 | * @return string|null |
||
| 428 | */ |
||
| 429 | public function getTypesForExtension( $ext ) { |
||
| 435 | |||
| 436 | /** |
||
| 437 | * Returns a single MIME type for a given file extension or null if unknown. |
||
| 438 | * This is always the first type from the list returned by getTypesForExtension($ext). |
||
| 439 | * |
||
| 440 | * @param string $ext |
||
| 441 | * @return string|null |
||
| 442 | */ |
||
| 443 | public function guessTypesForExtension( $ext ) { |
||
| 455 | |||
| 456 | /** |
||
| 457 | * Tests if the extension matches the given MIME type. Returns true if a |
||
| 458 | * match was found, null if the MIME type is unknown, and false if the |
||
| 459 | * MIME type is known but no matches were found. |
||
| 460 | * |
||
| 461 | * @param string $extension |
||
| 462 | * @param string $mime |
||
| 463 | * @return bool|null |
||
| 464 | */ |
||
| 465 | public function isMatchingExtension( $extension, $mime ) { |
||
| 477 | |||
| 478 | /** |
||
| 479 | * Returns true if the MIME type is known to represent an image format |
||
| 480 | * supported by the PHP GD library. |
||
| 481 | * |
||
| 482 | * @param string $mime |
||
| 483 | * |
||
| 484 | * @return bool |
||
| 485 | */ |
||
| 486 | public function isPHPImageType( $mime ) { |
||
| 500 | |||
| 501 | /** |
||
| 502 | * Returns true if the extension represents a type which can |
||
| 503 | * be reliably detected from its content. Use this to determine |
||
| 504 | * whether strict content checks should be applied to reject |
||
| 505 | * invalid uploads; if we can't identify the type we won't |
||
| 506 | * be able to say if it's invalid. |
||
| 507 | * |
||
| 508 | * @todo Be more accurate when using fancy MIME detector plugins; |
||
| 509 | * right now this is the bare minimum getimagesize() list. |
||
| 510 | * @param string $extension |
||
| 511 | * @return bool |
||
| 512 | */ |
||
| 513 | function isRecognizableExtension( $extension ) { |
||
| 531 | |||
| 532 | /** |
||
| 533 | * Improves a MIME type using the file extension. Some file formats are very generic, |
||
| 534 | * so their MIME type is not very meaningful. A more useful MIME type can be derived |
||
| 535 | * by looking at the file extension. Typically, this method would be called on the |
||
| 536 | * result of guessMimeType(). |
||
| 537 | * |
||
| 538 | * @param string $mime The MIME type, typically guessed from a file's content. |
||
| 539 | * @param string $ext The file extension, as taken from the file name |
||
| 540 | * |
||
| 541 | * @return string The MIME type |
||
| 542 | */ |
||
| 543 | public function improveTypeFromExtension( $mime, $ext ) { |
||
| 581 | |||
| 582 | /** |
||
| 583 | * MIME type detection. This uses detectMimeType to detect the MIME type |
||
| 584 | * of the file, but applies additional checks to determine some well known |
||
| 585 | * file formats that may be missed or misinterpreted by the default MIME |
||
| 586 | * detection (namely XML based formats like XHTML or SVG, as well as ZIP |
||
| 587 | * based formats like OPC/ODF files). |
||
| 588 | * |
||
| 589 | * @param string $file The file to check |
||
| 590 | * @param string|bool $ext The file extension, or true (default) to extract it from the filename. |
||
| 591 | * Set it to false to ignore the extension. DEPRECATED! Set to false, use |
||
| 592 | * improveTypeFromExtension($mime, $ext) later to improve MIME type. |
||
| 593 | * |
||
| 594 | * @return string The MIME type of $file |
||
| 595 | */ |
||
| 596 | public function guessMimeType( $file, $ext = true ) { |
||
| 616 | |||
/**
 * Guess the MIME type from the file contents.
 *
 * Reads the first 1024 bytes and up to the last 65558 bytes of the file, then
 * runs a fixed sequence of checks and returns on the first one that matches:
 * hardcoded magic numbers, EBML (Matroska/WebM), WebP, PHP markers,
 * well-formed XML, shebang scripts, ZIP variants, getimagesize(), DjVu and
 * finally the 'MimeMagicGuessFromContent' hook.
 *
 * @todo Remove $ext param
 *
 * @param string $file Path of the file to inspect
 * @param mixed $ext Not interpreted here; only passed through to detectZipType()
 * @return bool|string The detected MIME type. 'unknown/unknown' if the file
 *  cannot be opened, its size cannot be determined, or it is an EBML file of
 *  an unrecognized doctype. False if no check matched and no hook handler
 *  set a type.
 * @throws MWException If seeking to the tail of the file fails
 */
private function doGuessMimeType( $file, $ext ) {
	// Read a chunk of the file
	MediaWiki\suppressWarnings();
	$f = fopen( $file, 'rb' );
	MediaWiki\restoreWarnings();

	if ( !$f ) {
		return 'unknown/unknown';
	}

	$fsize = filesize( $file );
	if ( $fsize === false ) {
		// Release the handle opened above before bailing out
		fclose( $f );
		return 'unknown/unknown';
	}

	$head = fread( $f, 1024 );
	$tailLength = min( 65558, $fsize ); // 65558 = maximum size of a zip EOCDR
	if ( fseek( $f, -1 * $tailLength, SEEK_END ) === -1 ) {
		// Release the handle before propagating the failure
		fclose( $f );
		throw new MWException(
			"Seeking $tailLength bytes from EOF failed in " . __METHOD__ );
	}
	$tail = $tailLength ? fread( $f, $tailLength ) : '';
	fclose( $f );

	wfDebug( __METHOD__ . ": analyzing head and tail of $file for magic numbers.\n" );

	// Hardcode a few magic number checks...
	$headers = [
		// Multimedia...
		'MThd' => 'audio/midi',
		'OggS' => 'application/ogg',

		// Image formats...
		// Note that WMF may have a bare header, no magic number.
		"\x01\x00\x09\x00" => 'application/x-msmetafile', // Possibly prone to false positives?
		"\xd7\xcd\xc6\x9a" => 'application/x-msmetafile',
		'%PDF' => 'application/pdf',
		'gimp xcf' => 'image/x-xcf',

		// Some forbidden fruit...
		'MZ' => 'application/octet-stream', // DOS/Windows executable
		"\xca\xfe\xba\xbe" => 'application/octet-stream', // Mach-O binary
		"\x7fELF" => 'application/octet-stream', // ELF binary
	];

	foreach ( $headers as $magic => $candidate ) {
		if ( strncmp( $head, $magic, strlen( $magic ) ) === 0 ) {
			wfDebug( __METHOD__ . ": magic header in $file recognized as $candidate\n" );
			return $candidate;
		}
	}

	/* Look for WebM and Matroska files */
	if ( strncmp( $head, pack( "C4", 0x1a, 0x45, 0xdf, 0xa3 ), 4 ) === 0 ) {
		// Offset 0 is occupied by the EBML magic checked above, so a match
		// for the DocType element ID can never be at position 0.
		$doctype = strpos( $head, "\x42\x82" );
		if ( $doctype !== false ) {
			// Next byte is datasize, then data (sizes larger than 1 byte are very stupid muxers)
			$data = substr( $head, $doctype + 3, 8 );
			if ( strncmp( $data, "matroska", 8 ) === 0 ) {
				wfDebug( __METHOD__ . ": recognized file as video/x-matroska\n" );
				return "video/x-matroska";
			} elseif ( strncmp( $data, "webm", 4 ) === 0 ) {
				wfDebug( __METHOD__ . ": recognized file as video/webm\n" );
				return "video/webm";
			}
		}
		wfDebug( __METHOD__ . ": unknown EBML file\n" );
		return "unknown/unknown";
	}

	/* Look for WebP */
	if ( strncmp( $head, "RIFF", 4 ) === 0
		&& strncmp( substr( $head, 8, 7 ), "WEBPVP8", 7 ) === 0
	) {
		wfDebug( __METHOD__ . ": recognized file as image/webp\n" );
		return "image/webp";
	}

	/**
	 * Look for PHP. Check for this before HTML/XML... Warning: this is a
	 * heuristic, and won't match a file with a lot of non-PHP before. It
	 * will also match text files which could be PHP. :)
	 *
	 * @todo FIXME: For this reason, the check is probably useless -- an attacker
	 * could almost certainly just pad the file with a lot of nonsense to
	 * circumvent the check in any case where it would be a security
	 * problem. On the other hand, it causes harmful false positives (bug
	 * 16583). The heuristic has been cut down to exclude three-character
	 * strings like "<? ", but should it be axed completely?
	 */
	if ( ( strpos( $head, '<?php' ) !== false ) ||
		( strpos( $head, "<\x00?\x00p\x00h\x00p" ) !== false ) ||
		( strpos( $head, "<\x00?\x00 " ) !== false ) ||
		( strpos( $head, "<\x00?\x00\n" ) !== false ) ||
		( strpos( $head, "<\x00?\x00\t" ) !== false ) ||
		( strpos( $head, "<\x00?\x00=" ) !== false ) ) {

		wfDebug( __METHOD__ . ": recognized $file as application/x-php\n" );
		return 'application/x-php';
	}

	/**
	 * look for XML formats (XHTML and SVG)
	 */
	$xml = new XmlTypeCheck( $file );
	if ( $xml->wellFormed ) {
		$xmlMimeTypes = $this->mConfig->get( 'XMLMimeTypes' );
		if ( isset( $xmlMimeTypes[$xml->getRootElement()] ) ) {
			return $xmlMimeTypes[$xml->getRootElement()];
		} else {
			return 'application/xml';
		}
	}

	/**
	 * look for shell scripts
	 */
	$script_type = null;

	# detect by shebang, optionally preceded by a byte order mark.
	# The comparison length is derived from each literal. Previously the
	# UTF-16 branches compared a 7-byte substring against shorter literals
	# and therefore never matched a real script.
	$shebangs = [
		"#!" => "ASCII",
		"\xef\xbb\xbf#!" => "UTF-8",
		"\xfe\xff\x00#\x00!" => "UTF-16BE",
		"\xff\xfe#\x00!" => "UTF-16LE",
	];
	foreach ( $shebangs as $prefix => $encoding ) {
		if ( strncmp( $head, $prefix, strlen( $prefix ) ) === 0 ) {
			$script_type = $encoding;
			break;
		}
	}

	if ( $script_type ) {
		if ( $script_type !== "UTF-8" && $script_type !== "ASCII" ) {
			// Quick and dirty fold down to ASCII!
			$pack = [ 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' ];
			// Skip the 2-byte BOM, then decode 16-bit code units
			$chars = unpack( $pack[$script_type], substr( $head, 2 ) );
			$head = '';
			foreach ( $chars as $codepoint ) {
				if ( $codepoint < 128 ) {
					$head .= chr( $codepoint );
				} else {
					$head .= '?';
				}
			}
		}

		$match = [];

		// The word following the last path separator of the first
		// path-like token is used as the type suffix
		if ( preg_match( '%/?([^\s]+/)(\w+)%', $head, $match ) ) {
			$mime = "application/x-{$match[2]}";
			wfDebug( __METHOD__ . ": shell script recognized as $mime\n" );
			return $mime;
		}
	}

	// Check for ZIP variants (before getimagesize)
	if ( strpos( $tail, "PK\x05\x06" ) !== false ) {
		wfDebug( __METHOD__ . ": ZIP header present in $file\n" );
		return $this->detectZipType( $head, $tail, $ext );
	}

	MediaWiki\suppressWarnings();
	$gis = getimagesize( $file );
	MediaWiki\restoreWarnings();

	if ( $gis && isset( $gis['mime'] ) ) {
		$mime = $gis['mime'];
		wfDebug( __METHOD__ . ": getimagesize detected $file as $mime\n" );
		return $mime;
	}

	// Also test DjVu
	$deja = new DjVuImage( $file );
	if ( $deja->isValid() ) {
		wfDebug( __METHOD__ . ": detected $file as image/vnd.djvu\n" );
		return 'image/vnd.djvu';
	}

	# Media handling extensions can guess the MIME by content
	# It's intentionally here so that if core is wrong about a type (false positive),
	# people will hopefully nag and submit patches :)
	$mime = false;
	# Some strings by reference for performance - assuming well-behaved hooks
	Hooks::run(
		'MimeMagicGuessFromContent',
		[ $this, &$head, &$tail, $file, &$mime ]
	);

	return $mime;
}
||
| 814 | |||
| 815 | /** |
||
| 816 | * Detect application-specific file type of a given ZIP file from its |
||
| 817 | * header data. Currently works for OpenDocument and OpenXML types... |
||
| 818 | * If can't tell, returns 'application/zip'. |
||
| 819 | * |
||
| 820 | * @param string $header Some reasonably-sized chunk of file header |
||
| 821 | * @param string|null $tail The tail of the file |
||
| 822 | * @param string|bool $ext The file extension, or true to extract it from the filename. |
||
| 823 | * Set it to false (default) to ignore the extension. DEPRECATED! Set to false, |
||
| 824 | * use improveTypeFromExtension($mime, $ext) later to improve MIME type. |
||
| 825 | * |
||
| 826 | * @return string |
||
| 827 | */ |
||
| 828 | function detectZipType( $header, $tail = null, $ext = false ) { |
||
| 914 | |||
| 915 | /** |
||
| 916 | * Internal MIME type detection. Detection is done using an external |
||
| 917 | * program, if $wgMimeDetectorCommand is set. Otherwise, the fileinfo |
||
| 918 | * extension is tried if it is available. If detection fails and $ext |
||
| 919 | * is not false, the MIME type is guessed from the file extension, |
||
| 920 | * using guessTypesForExtension. |
||
| 921 | * |
||
| 922 | * If the MIME type is still unknown, getimagesize is used to detect the |
||
| 923 | * MIME type if the file is an image. If no MIME type can be determined, |
||
| 924 | * this function returns 'unknown/unknown'. |
||
| 925 | * |
||
| 926 | * @param string $file The file to check |
||
| 927 | * @param string|bool $ext The file extension, or true (default) to extract it from the filename. |
||
| 928 | * Set it to false to ignore the extension. DEPRECATED! Set to false, use |
||
| 929 | * improveTypeFromExtension($mime, $ext) later to improve MIME type. |
||
| 930 | * |
||
| 931 | * @return string The MIME type of $file |
||
| 932 | */ |
||
| 933 | private function detectMimeType( $file, $ext = true ) { |
||
| 994 | |||
| 995 | /** |
||
| 996 | * Determine the media type code for a file, using its MIME type, name and |
||
| 997 | * possibly its contents. |
||
| 998 | * |
||
| 999 | * This function relies on the findMediaType(), mapping extensions and MIME |
||
| 1000 | * types to media types. |
||
| 1001 | * |
||
| 1002 | * @todo analyse file if need be |
||
| 1003 | * @todo look at multiple extension, separately and together. |
||
| 1004 | * |
||
| 1005 | * @param string $path Full path to the image file, in case we have to look at the contents |
||
| 1006 | * (if null, only the MIME type is used to determine the media type code). |
||
| 1007 | * @param string $mime MIME type. If null it will be guessed using guessMimeType. |
||
| 1008 | * |
||
| 1009 | * @return string A value to be used with the MEDIATYPE_xxx constants. |
||
| 1010 | */ |
||
| 1011 | function getMediaType( $path = null, $mime = null ) { |
||
| 1087 | |||
| 1088 | /** |
||
| 1089 | * Returns a media code matching the given MIME type or file extension. |
||
| 1090 | * File extensions are represented by a string starting with a dot (.) to |
||
| 1091 | * distinguish them from MIME types. |
||
| 1092 | * |
||
| 1093 | * This function relies on the mapping defined by $this->mMediaTypes |
||
| 1094 | * @access private |
||
| 1095 | * @param string $extMime |
||
| 1096 | * @return int|string |
||
| 1097 | */ |
||
| 1098 | function findMediaType( $extMime ) { |
||
| 1126 | |||
| 1127 | /** |
||
| 1128 | * Get the MIME types that various versions of Internet Explorer would |
||
| 1129 | * detect from a chunk of the content. |
||
| 1130 | * |
||
| 1131 | * @param string $fileName The file name (unused at present) |
||
| 1132 | * @param string $chunk The first 256 bytes of the file |
||
| 1133 | * @param string $proposed The MIME type proposed by the server |
||
| 1134 | * @return array |
||
| 1135 | */ |
||
| 1136 | public function getIEMimeTypes( $fileName, $chunk, $proposed ) { |
||
| 1140 | |||
| 1141 | /** |
||
| 1142 | * Get a cached instance of IEContentAnalyzer |
||
| 1143 | * |
||
| 1144 | * @return IEContentAnalyzer |
||
| 1145 | */ |
||
| 1146 | protected function getIEContentAnalyzer() { |
||
| 1152 | } |
||
| 1153 |
In PHP, under loose comparison (like `==`, or `!=`, or `switch` conditions), values of different types might be equal.

For `string` values, the empty string `''` is a special case; in particular, the following results might be unexpected: