Complex classes like Taster often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Taster, and based on these observations, apply Extract Interface, too.
| 1 | <?php  | 
            ||
| 60 | class Taster  | 
            ||
| 61 | { | 
            ||
| 62 | /**  | 
            ||
| 63 | * End-of-line constants.  | 
            ||
| 64 | */  | 
            ||
| 65 | const EOL_UNIX = 'lf';  | 
            ||
| 66 | const EOL_TRS80 = 'cr';  | 
            ||
| 67 | const EOL_WINDOWS = 'crlf';  | 
            ||
| 68 | |||
| 69 | /**  | 
            ||
| 70 | * ASCII character codes for "invisibles".  | 
            ||
| 71 | */  | 
            ||
| 72 | const HORIZONTAL_TAB = 9;  | 
            ||
| 73 | const LINE_FEED = 10;  | 
            ||
| 74 | const CARRIAGE_RETURN = 13;  | 
            ||
| 75 | const SPACE = 32;  | 
            ||
| 76 | |||
| 77 | /**  | 
            ||
| 78 | * Data types -- Used within the lickQuotingStyle method.  | 
            ||
| 79 | */  | 
            ||
| 80 | const DATA_NONNUMERIC = 'nonnumeric';  | 
            ||
| 81 | const DATA_SPECIAL = 'special';  | 
            ||
| 82 | const DATA_UNKNOWN = 'unknown';  | 
            ||
| 83 | |||
| 84 | /**  | 
            ||
| 85 | * Placeholder strings -- hold the place of newlines and delimiters contained  | 
            ||
| 86 | * within quoted text so that the explode method doesn't split incorrectly.  | 
            ||
| 87 | */  | 
            ||
| 88 | const PLACEHOLDER_NEWLINE = '[__NEWLINE__]';  | 
            ||
| 89 | const PLACEHOLDER_DELIM = '[__DELIM__]';  | 
            ||
| 90 | |||
| 91 | /**  | 
            ||
| 92 | * Recommended data sample size.  | 
            ||
| 93 | */  | 
            ||
| 94 | const SAMPLE_SIZE = 2500;  | 
            ||
| 95 | |||
| 96 | /**  | 
            ||
| 97 | * Column data types -- used within the lickHeader method to determine  | 
            ||
| 98 | * whether the first row contains different types of data than the rest of  | 
            ||
| 99 | * the rows (and thus, is likely a header row).  | 
            ||
| 100 | */  | 
            ||
| 101 | // +-987  | 
            ||
| 102 | const TYPE_NUMBER = 'number';  | 
            ||
| 103 | // +-12.387  | 
            ||
| 104 | const TYPE_DOUBLE = 'double';  | 
            ||
| 105 | // I am a string. I can contain all kinds of stuff.  | 
            ||
| 106 | const TYPE_STRING = 'string';  | 
            ||
| 107 | // 2010-04-23 04:23:00  | 
            ||
| 108 | const TYPE_DATETIME = 'datetime';  | 
            ||
| 109 | // 10-Jul-15, 9/1/2007, April 1st, 2006, etc.  | 
            ||
| 110 | const TYPE_DATE = 'date';  | 
            ||
| 111 | // 10:00pm, 5pm, 13:08, etc.  | 
            ||
| 112 | const TYPE_TIME = 'time';  | 
            ||
| 113 | // $98.96, ¥12389, £6.08, €87.00  | 
            ||
| 114 | const TYPE_CURRENCY = 'currency';  | 
            ||
| 115 | // 12ab44m1n2_asdf  | 
            ||
| 116 | const TYPE_ALNUM = 'alnum';  | 
            ||
| 117 | // abababab  | 
            ||
| 118 | const TYPE_ALPHA = 'alpha';  | 
            ||
| 119 | |||
| 120 | /** @var Contract\Streamable The source of data to examine */  | 
            ||
| 121 | protected $input;  | 
            ||
| 122 | |||
| 123 | /** @var string Sample of CSV data to use for tasting (determining CSV flavor) */  | 
            ||
| 124 | protected $sample;  | 
            ||
| 125 | |||
| 126 | /** @var array Possible delimiter characters in (roughly) the order of likelihood */  | 
            ||
| 127 | protected $delims = [',', "\t", ';', '|', ':', '-', '_', '#', '/', '\\', '$', '+', '=', '&', '@'];  | 
            ||
| 128 | |||
| 129 | /**  | 
            ||
| 130 | * Class constructor--accepts a CSV input source.  | 
            ||
| 131 | *  | 
            ||
| 132 | * @param Contract\Streamable The source of CSV data  | 
            ||
| 133 | *  | 
            ||
| 134 | * @throws TasterException  | 
            ||
| 135 | *  | 
            ||
| 136 | * @todo It may be a good idea to skip the first line or two for the sample  | 
            ||
| 137 | * so that the header line(s) don't throw things off (with the exception  | 
            ||
| 138 | * of lickHeader() obviously)  | 
            ||
| 139 | */  | 
            ||
| 140 | public function __construct(Streamable $input)  | 
            ||
| 148 | |||
| 149 | /**  | 
            ||
| 150 | * "Invoke" magic method.  | 
            ||
| 151 | *  | 
            ||
| 152 | * Called when an object is invoked as if it were a function, for instance  | 
            ||
| 153 | * `$taster()`. This is simply an alias to the lick method.  | 
            ||
| 154 | *  | 
            ||
| 155 | * @throws TasterException  | 
            ||
| 156 | *  | 
            ||
| 157 | * @return Flavor A flavor object  | 
            ||
| 158 | */  | 
            ||
| 159 | public function __invoke()  | 
            ||
| 163 | |||
| 164 | /**  | 
            ||
| 165 | * Examine the input source and determine what "Flavor" of CSV it contains.  | 
            ||
| 166 | * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),  | 
            ||
| 167 | * doesn't necessarily always conform to it. And it doesn't provide metadata  | 
            ||
| 168 | * such as the delimiting character, quote character, or what types of data  | 
            ||
| 169 | * are quoted.  | 
            ||
| 170 | *  | 
            ||
| 171 | * @throws TasterException  | 
            ||
| 172 | *  | 
            ||
| 173 | * @return Flavor The metadata that the CSV format doesn't provide  | 
            ||
| 174 | *  | 
            ||
| 175 | * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails  | 
            ||
| 176 | * @todo Should there be a lickEscapeChar method? The Python module that inspired  | 
            ||
| 177 | * this library doesn't include one...  | 
            ||
| 178 | * @todo This should cache the results and only regenerate if $this->sample  | 
            ||
| 179 | * changes (or $this->input)  | 
            ||
| 180 | */  | 
            ||
| 181 | public function lick()  | 
            ||
| 202 | |||
| 203 | /**  | 
            ||
| 204 | * Examines the contents of the CSV data to make a determination of whether  | 
            ||
| 205 | * or not it contains a header row. To make this determination, it creates  | 
            ||
| 206 | * an array of each column's (in each row)'s data type and length and then  | 
            ||
| 207 | * compares them. If all of the rows except the header look similar, it will  | 
            ||
| 208 | * return true. This is only a guess though. There is no programmatic way to  | 
            ||
| 209 | * determine 100% whether a CSV file has a header. The format does not  | 
            ||
| 210 | * provide metadata such as that.  | 
            ||
| 211 | *  | 
            ||
| 212 | * @param string $delim The CSV data's delimiting char (can be a variety of chars but  | 
            ||
| 213 | * typically is either a comma or a tab, sometimes a pipe)  | 
            ||
| 214 | * @param string $eol The CSV data's end-of-line char(s) (\n \r or \r\n)  | 
            ||
| 215 | *  | 
            ||
| 216 | * @return bool True if the data (most likely) contains a header row  | 
            ||
| 217 | *  | 
            ||
| 218 | * @todo This method needs a total refactor. It's not necessary to loop twice  | 
            ||
| 219 | * You could get away with one loop and that would allow for me to do  | 
            ||
| 220 | * something like only examining enough rows to get to a particular  | 
            ||
| 221 | * "hasHeader" score (+-100 for instance) & then just return true|false  | 
            ||
| 222 | * @todo Also, break out of the first loop after a certain (perhaps even a  | 
            ||
| 223 | * configurable) amount of lines (you only need to examine so much data  | 
            ||
| 224 | * to reliably make a determination and this is an expensive method)  | 
            ||
| 225 | * @todo I could remove the need for quote, delim, and eol by "licking" the  | 
            ||
| 226 | * data sample provided in the first argument. Also, I could actually  | 
            ||
| 227 | * create a Reader object to read the data here.  | 
            ||
| 228 | */  | 
            ||
| 229 | public function lickHeader($delim, $eol)  | 
            ||
| 295 | |||
| 296 | /**  | 
            ||
| 297 | * Replaces all quoted columns with a blank string. I was using this method  | 
            ||
| 298 | * to prevent explode() from incorrectly splitting at delimiters and newlines  | 
            ||
| 299 | * within quotes when parsing a file. But this was before I wrote the  | 
            ||
| 300 | * replaceQuotedSpecialChars method which (at least to me) makes more sense.  | 
            ||
| 301 | *  | 
            ||
| 302 | * @param string $data The string to replace quoted strings within  | 
            ||
| 303 | *  | 
            ||
| 304 | * @return string The input string with quoted strings removed  | 
            ||
| 305 | *  | 
            ||
| 306 | * @todo Replace code that uses this method with the replaceQuotedSpecialChars  | 
            ||
| 307 | * method instead. I think it's cleaner.  | 
            ||
| 308 | */  | 
            ||
| 309 | protected function removeQuotedStrings($data)  | 
            ||
| 313 | |||
| 314 | /**  | 
            ||
| 315 | * Examine the input source to determine which character(s) are being used  | 
            ||
| 316 | * as the end-of-line character.  | 
            ||
| 317 | *  | 
            ||
| 318 | * @return string The end-of-line char for the input data  | 
            ||
| 319 | * @credit pulled from stackoverflow thread *tips hat to username "Harm"*  | 
            ||
| 320 | *  | 
            ||
| 321 | * @todo This should throw an exception if it cannot determine the line ending  | 
            ||
| 322 | * @todo I probably will make this method protected when I'm done with testing...  | 
            ||
| 323 | * @todo If there is any way for this method to fail (for instance if a file  | 
            ||
| 324 | * is totally empty or contains no line breaks), then it needs to throw  | 
            ||
| 325 | * a relevant TasterException  | 
            ||
| 326 | * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()  | 
            ||
| 327 | */  | 
            ||
| 328 | protected function lickLineEndings()  | 
            ||
| 349 | |||
| 350 | /**  | 
            ||
| 351 | * The best way to determine quote and delimiter characters is when columns  | 
            ||
| 352 | * are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim  | 
            ||
| 353 | * but this only works if you have quoted columns. If you don't you have to  | 
            ||
| 354 | * determine these characters some other way... (see lickDelimiter).  | 
            ||
| 355 | *  | 
            ||
| 356 | * @throws TasterException  | 
            ||
| 357 | *  | 
            ||
| 358 | * @return array A two-row array containing quotechar, delimchar  | 
            ||
| 359 | *  | 
            ||
| 360 | * @todo make protected  | 
            ||
| 361 | * @todo This should throw an exception if it cannot determine the delimiter  | 
            ||
| 362 | * this way.  | 
            ||
| 363 | * @todo This should check for any line endings not just \n  | 
            ||
| 364 | */  | 
            ||
| 365 | protected function lickQuoteAndDelim()  | 
            ||
| 405 | |||
| 406 | /**  | 
            ||
| 407 | * Take a list of likely delimiter characters and find the one that occurs  | 
            ||
| 408 | * the most consistent amount of times within the provided data.  | 
            ||
| 409 | *  | 
            ||
| 410 | * @param string $eol The character(s) used for newlines  | 
            ||
| 411 | *  | 
            ||
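| 412 | * @return string The most likely delimiter character (NOTE(review): the previous text described Flavor::QUOTING_* constants, apparently copied from lickQuotingStyle -- confirm against the method body)  | 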
            ||
| 413 | *  | 
            ||
| 414 | * @see Flavor for possible quote style constants  | 
            ||
| 415 | *  | 
            ||
| 416 | * @todo Refactor this method--It needs more thorough testing against a wider  | 
            ||
| 417 | * variety of CSV data to be sure it works reliably. And I'm sure there  | 
            ||
| 418 | * are many performance and logic improvements that could be made. This  | 
            ||
| 419 | * is essentially a first draft.  | 
            ||
| 420 | * @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings  | 
            ||
| 421 | * because the former requires you to know the delimiter  | 
            ||
| 422 | */  | 
            ||
| 423 | protected function lickDelimiter($eol = "\n")  | 
            ||
| 531 | |||
| 532 | /**  | 
            ||
| 533 | * Compare positional consistency of several characters to determine the  | 
            ||
| 534 | * probable delimiter character. The idea behind this is that the delimiter  | 
            ||
| 535 | * character is likely more consistently distributed than false-positive  | 
            ||
| 536 | * delimiter characters produced by lickDelimiter(). For instance, consider  | 
            ||
| 537 | * a series of rows similar to the following:  | 
            ||
| 538 | *  | 
            ||
| 539 | * 1,luke,visinoni,luke@example.com,(530) 413-3076,04-23-1986  | 
            ||
| 540 | *  | 
            ||
| 541 | * The lickDelimiter() method will often not be able to determine whether the  | 
            ||
| 542 | * delimiter is a comma or a dash because they occur the same number of times  | 
            ||
| 543 | * on just about every line (5 for comma, 3 for dash). The difference is  | 
            ||
| 544 | * obvious to you, no doubt. But us humans are pattern-recognition machines!  | 
            ||
| 545 | * The difference between the comma and the dash is that the comma is dist-  | 
            ||
| 546 | * ributed almost evenly throughout the line. The dash characters occur  | 
            ||
| 547 | * entirely at the end of the line. This method accepts any number of possible  | 
            ||
| 548 | * delimiter characters and returns the one that is distributed most evenly.  | 
            ||
| 549 | *  | 
            ||
| 550 | * If delim character cannot be determined by lickQuoteAndDelim(), taster  | 
            ||
| 551 | * tries lickDelimiter(). When that method runs into a tie, it will use this  | 
            ||
| 552 | * as a tie-breaker.  | 
            ||
| 553 | *  | 
            ||
| 554 | * @param array $delims Possible delimiter characters (method chooses from  | 
            ||
| 555 | * this array of characters)  | 
            ||
| 556 | * @param string $eol The end-of-line character (or set of characters)  | 
            ||
| 557 | *  | 
            ||
| 558 | * @throws TasterException  | 
            ||
| 559 | *  | 
            ||
| 560 | * @return string The probable delimiter character  | 
            ||
| 561 | */  | 
            ||
| 562 | protected function guessDelimByDistribution(array $delims, $eol = "\n")  | 
            ||
| 598 | |||
| 599 | /**  | 
            ||
| 600 | * Determine the "style" of data quoting. The CSV format, while having an RFC  | 
            ||
| 601 | * (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform  | 
            ||
| 602 | * to it. And it doesn't provide metadata such as the delimiting character,  | 
            ||
| 603 | * quote character, or what types of data are quoted. So this method makes a  | 
            ||
| 604 | * logical guess by finding which columns have been quoted (if any) and  | 
            ||
| 605 | * examining their data type. Most often, CSV files will only use quotes  | 
            ||
| 606 | * around columns that contain special characters such as the delimiter,  | 
            ||
| 607 | * the quoting character, newlines, etc. (we refer to this style as  | 
            ||
| 608 | * QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data  | 
            ||
| 609 | * (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns  | 
            ||
| 610 | * (QUOTE_ALL) and those that quote none (QUOTE_NONE).  | 
            ||
| 611 | *  | 
            ||
| 612 | * @param string $delim The character used as the column delimiter  | 
            ||
| 613 | * @param string $eol The character used for newlines  | 
            ||
| 614 | *  | 
            ||
| 615 | * @return string One of four "QUOTING_" constants defined above--see this  | 
            ||
| 616 | * method's description for more info.  | 
            ||
| 617 | *  | 
            ||
| 618 | * @todo Refactor this method--It needs more thorough testing against a wider  | 
            ||
| 619 | * variety of CSV data to be sure it works reliably. And I'm sure there  | 
            ||
| 620 | * are many performance and logic improvements that could be made. This  | 
            ||
| 621 | * is essentially a first draft.  | 
            ||
| 622 | */  | 
            ||
| 623 | protected function lickQuotingStyle($delim, $eol)  | 
            ||
| 698 | |||
| 699 | /**  | 
            ||
| 700 | * Remove quotes around a piece of text (if there are any).  | 
            ||
| 701 | *  | 
            ||
| 702 | * @param string $data The data to "unquote"  | 
            ||
| 703 | *  | 
            ||
| 704 | * @return string The data passed in, only with quotes stripped (off the edges)  | 
            ||
| 705 | */  | 
            ||
| 706 | protected function unQuote($data)  | 
            ||
| 710 | |||
| 711 | /**  | 
            ||
| 712 | * Determine whether a particular string of data has quotes around it.  | 
            ||
| 713 | *  | 
            ||
| 714 | * @param string $data The data to check  | 
            ||
| 715 | *  | 
            ||
| 716 | * @return bool Whether the data is quoted or not  | 
            ||
| 717 | */  | 
            ||
| 718 | protected function isQuoted($data)  | 
            ||
| 722 | |||
| 723 | /**  | 
            ||
| 724 | * Determine what type of data is contained within a variable  | 
            ||
| 725 | * Possible types:  | 
            ||
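| 726 | * - nonnumeric - contains characters other than numbers (NOTE(review): previously read "only numbers", which contradicts the name -- confirm against the method body)  | 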
            ||
| 727 | * - special - contains characters that could potentially need to be quoted (possible delimiter characters)  | 
            ||
| 728 | * - unknown - everything else  | 
            ||
| 729 | * This method is really only used within the "lickQuotingStyle" method to  | 
            ||
| 730 | * help determine whether a particular column has been quoted due to it being  | 
            ||
| 731 | * nonnumeric or because it has some special character in it such as a delimiter  | 
            ||
| 732 | * or newline or quote.  | 
            ||
| 733 | *  | 
            ||
| 734 | * @param string $data The data to determine the type of  | 
            ||
| 735 | *  | 
            ||
| 736 | * @return string The type of data (one of the "DATA_" constants above)  | 
            ||
| 737 | *  | 
            ||
| 738 | * @todo I could probably eliminate this method and use an anonymous function  | 
            ||
| 739 | * instead. It isn't used anywhere else and its name could be misleading.  | 
            ||
| 740 | * Especially since I also have a lickType method that is used within the  | 
            ||
| 741 | * lickHeader method.  | 
            ||
| 742 | */  | 
            ||
| 743 | protected function lickDataType($data)  | 
            ||
| 755 | |||
| 756 | /**  | 
            ||
| 757 | * Replace all instances of newlines and whatever character you specify (as  | 
            ||
| 758 | * the delimiter) that are contained within quoted text. The replacements are  | 
            ||
| 759 | * simply a special placeholder string. This is done so that I can use the  | 
            ||
| 760 | * very unsmart "explode" function and not have to worry about it exploding  | 
            ||
| 761 | * on delimiters or newlines within quotes. Once I have exploded, I typically  | 
            ||
| 762 | * sub back in the real characters before doing anything else. Although  | 
            ||
| 763 | * currently there is no dedicated method for doing so I just use str_replace.  | 
            ||
| 764 | *  | 
            ||
| 765 | * @param string $data The string to do the replacements on  | 
            ||
| 766 | * @param string $delim The delimiter character to replace  | 
            ||
| 767 | *  | 
            ||
| 768 | * @return string The data with replacements performed  | 
            ||
| 769 | *  | 
            ||
| 770 | * @todo I could probably pass in (maybe optionally) the newline character I  | 
            ||
| 771 | * want to replace as well. I'll do that if I need to.  | 
            ||
| 772 | */  | 
            ||
| 773 | protected function replaceQuotedSpecialChars($data, $delim)  | 
            ||
| 782 | |||
| 783 | /**  | 
            ||
| 784 | * Determine the "type" of a particular string of data. Used for the lickHeader  | 
            ||
| 785 | * method to assign a type to each column to try to determine whether the  | 
            ||
| 786 | * first row is different than a consistent column type.  | 
            ||
| 787 | *  | 
            ||
| 788 | * @todo As I'm writing this method I'm beginning to realize how expensive  | 
            ||
| 789 | * the lickHeader method is going to end up being since it has to apply all  | 
            ||
| 790 | * these regexes (potentially) to every column. I may end up writing a much  | 
            ||
| 791 | * simpler type-checking method than this if it proves to be too expensive  | 
            ||
| 792 | * to be practical.  | 
            ||
| 793 | *  | 
            ||
| 794 | * @param string $data The string of data to check the type of  | 
            ||
| 795 | *  | 
            ||
| 796 | * @return string One of the TYPE_ string constants above  | 
            ||
| 797 | */  | 
            ||
| 798 | protected function lickType($data)  | 
            ||
| 842 | }  | 
            ||
| 843 | 
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.