Complex classes like Taster often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Taster, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
60 | class Taster |
||
61 | { |
||
62 | /** |
||
63 | * End-of-line constants. |
||
64 | */ |
||
65 | const EOL_UNIX = 'lf'; |
||
66 | const EOL_TRS80 = 'cr'; |
||
67 | const EOL_WINDOWS = 'crlf'; |
||
68 | |||
69 | /** |
||
70 | * ASCII character codes for "invisibles". |
||
71 | */ |
||
72 | const HORIZONTAL_TAB = 9; |
||
73 | const LINE_FEED = 10; |
||
74 | const CARRIAGE_RETURN = 13; |
||
75 | const SPACE = 32; |
||
76 | |||
77 | /** |
||
78 | * Data types -- Used within the lickQuotingStyle method. |
||
79 | */ |
||
80 | const DATA_NONNUMERIC = 'nonnumeric'; |
||
81 | const DATA_SPECIAL = 'special'; |
||
82 | const DATA_UNKNOWN = 'unknown'; |
||
83 | |||
84 | /** |
||
85 | * Placeholder strings -- hold the place of newlines and delimiters contained |
||
86 | * within quoted text so that the explode method doesn't split incorrectly. |
||
87 | */ |
||
88 | const PLACEHOLDER_NEWLINE = '[__NEWLINE__]'; |
||
89 | const PLACEHOLDER_DELIM = '[__DELIM__]'; |
||
90 | |||
91 | /** |
||
92 | * Recommended data sample size. |
||
93 | */ |
||
94 | const SAMPLE_SIZE = 2500; |
||
95 | |||
96 | /** |
||
97 | * Column data types -- used within the lickHeader method to determine |
||
98 | * whether the first row contains different types of data than the rest of |
||
99 | * the rows (and thus, is likely a header row). |
||
100 | */ |
||
101 | // +-987 |
||
102 | const TYPE_NUMBER = 'number'; |
||
103 | // +-12.387 |
||
104 | const TYPE_DOUBLE = 'double'; |
||
105 | // I am a string. I can contain all kinds of stuff. |
||
106 | const TYPE_STRING = 'string'; |
||
107 | // 2010-04-23 04:23:00 |
||
108 | const TYPE_DATETIME = 'datetime'; |
||
109 | // 10-Jul-15, 9/1/2007, April 1st, 2006, etc. |
||
110 | const TYPE_DATE = 'date'; |
||
111 | // 10:00pm, 5pm, 13:08, etc. |
||
112 | const TYPE_TIME = 'time'; |
||
113 | // $98.96, ¥12389, £6.08, €87.00 |
||
114 | const TYPE_CURRENCY = 'currency'; |
||
115 | // 12ab44m1n2_asdf |
||
116 | const TYPE_ALNUM = 'alnum'; |
||
117 | // abababab |
||
118 | const TYPE_ALPHA = 'alpha'; |
||
119 | |||
120 | /** @var Contract\Streamable The source of data to examine */ |
||
121 | protected $input; |
||
122 | |||
123 | /** @var string Sample of CSV data to use for tasting (determining CSV flavor) */ |
||
124 | protected $sample; |
||
125 | |||
126 | /** @var CharCollection Possible delimiter characters in (roughly) the order of likelihood */ |
||
127 | protected $delims; |
||
128 | |||
129 | /** |
||
130 | * Class constructor--accepts a CSV input source. |
||
131 | * |
||
132 | * @param Contract\Streamable The source of CSV data |
||
133 | * |
||
134 | * @throws TasterException |
||
135 | * |
||
136 | * @todo It may be a good idea to skip the first line or two for the sample |
||
137 | * so that the header line(s) don't throw things off (with the exception |
||
138 | * of lickHeader() obviously) |
||
139 | */ |
||
140 | public function __construct(Streamable $input) |
||
148 | |||
149 | /** |
||
150 | * "Invoke" magic method. |
||
151 | * |
||
152 | * Called when an object is invoked as if it were a function. So, for instance, calling $taster() does the same thing as calling $taster->lick(). |
||
153 | * This is simply an alias to the lick method. |
||
154 | * |
||
155 | * @throws TasterException |
||
156 | * |
||
157 | * @return Flavor A flavor object |
||
158 | */ |
||
159 | public function __invoke() |
||
163 | |||
164 | /** |
||
165 | * Examine the input source and determine what "Flavor" of CSV it contains. |
||
166 | * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180), |
||
167 | * doesn't necessarily always conform to it. And it doesn't provide metadata |
||
168 | * such as the delimiting character, quote character, or what types of data |
||
169 | * are quoted. |
||
170 | * |
||
171 | * @throws TasterException |
||
172 | * |
||
173 | * @return Flavor The metadata that the CSV format doesn't provide |
||
174 | * |
||
175 | * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails |
||
176 | * @todo Should there be a lickEscapeChar method? The Python module that inspired |
||
177 | * this library doesn't include one... |
||
178 | * @todo This should cache the results and only regenerate if $this->sample |
||
179 | * changes (or $this->input) |
||
180 | */ |
||
181 | public function lick() |
||
202 | |||
203 | /** |
||
204 | * Examines the contents of the CSV data to make a determination of whether |
||
205 | * or not it contains a header row. To make this determination, it creates |
||
206 | * an array of each column's (in each row) data type and length and then |
||
207 | * compares them. If all of the rows except the header look similar, it will |
||
208 | * return true. This is only a guess though. There is no programmatic way to |
||
209 | * determine 100% whether a CSV file has a header. The format does not |
||
210 | * provide metadata such as that. |
||
211 | * |
||
212 | * @param string $delim The CSV data's delimiting char (can be a variety of chars but |
||
213 | * typically is either a comma or a tab, sometimes a pipe) |
||
214 | * @param string $eol The CSV data's end-of-line char(s) (\n \r or \r\n) |
||
215 | * |
||
216 | * @return bool True if the data (most likely) contains a header row |
||
217 | * |
||
218 | * @todo This method needs a total refactor. It's not necessary to loop twice |
||
219 | * You could get away with one loop and that would allow for me to do |
||
220 | * something like only examining enough rows to get to a particular |
||
221 | * "hasHeader" score (+-100 for instance) & then just return true|false |
||
222 | * @todo Also, break out of the first loop after a certain (perhaps even a |
||
223 | * configurable) amount of lines (you only need to examine so much data |
||
224 | * to reliably make a determination and this is an expensive method) |
||
225 | * @todo I could remove the need for quote, delim, and eol by "licking" the |
||
226 | * data sample provided in the first argument. Also, I could actually |
||
227 | * create a Reader object to read the data here. |
||
228 | */ |
||
229 | public function lickHeader($delim, $eol) |
||
295 | |||
296 | /** |
||
297 | * Replaces all quoted columns with a blank string. I was using this method |
||
298 | * to prevent explode() from incorrectly splitting at delimiters and newlines |
||
299 | * within quotes when parsing a file. But this was before I wrote the |
||
300 | * replaceQuotedSpecialChars method which (at least to me) makes more sense. |
||
301 | * |
||
302 | * @param string $data The string to replace quoted strings within |
||
303 | * |
||
304 | * @return string The input string with quoted strings removed |
||
305 | * |
||
306 | * @todo Replace code that uses this method with the replaceQuotedSpecialChars |
||
307 | * method instead. I think it's cleaner. |
||
308 | */ |
||
309 | protected function removeQuotedStrings($data) |
||
313 | |||
314 | /** |
||
315 | * Examine the input source to determine which character(s) are being used |
||
316 | * as the end-of-line character. |
||
317 | * |
||
318 | * @return string The end-of-line char for the input data |
||
319 | * @credit pulled from stackoverflow thread *tips hat to username "Harm"* |
||
320 | * |
||
321 | * @todo This should throw an exception if it cannot determine the line ending |
||
322 | * @todo I probably will make this method protected when I'm done with testing... |
||
323 | * @todo If there is any way for this method to fail (for instance if a file |
||
324 | * is totally empty or contains no line breaks), then it needs to throw |
||
325 | * a relevant TasterException |
||
326 | * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings() |
||
327 | */ |
||
328 | protected function lickLineEndings() |
||
349 | |||
350 | /** |
||
351 | * The best way to determine quote and delimiter characters is when columns |
||
352 | * are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim |
||
353 | * but this only works if you have quoted columns. If you don't you have to |
||
354 | * determine these characters some other way... (see lickDelimiter). |
||
355 | * |
||
356 | * @throws TasterException |
||
357 | * |
||
358 | * @return array A two-row array containing quotechar, delimchar |
||
359 | * |
||
360 | * @todo make protected |
||
361 | * @todo This should throw an exception if it cannot determine the delimiter |
||
362 | * this way. |
||
363 | * @todo This should check for any line endings not just \n |
||
364 | */ |
||
365 | protected function lickQuoteAndDelim() |
||
405 | |||
406 | /** |
||
407 | * Take a list of likely delimiter characters and find the one that occurs |
||
408 | * the most consistent amount of times within the provided data. |
||
409 | * |
||
410 | * @param string $eol The character(s) used for newlines |
||
411 | * |
||
412 | * @return string The delimiter character that occurs most consistently in the data |
||
413 | * |
||
414 | * @see Flavor for possible quote style constants |
||
415 | * |
||
416 | * @todo Refactor this method--It needs more thorough testing against a wider |
||
417 | * variety of CSV data to be sure it works reliably. And I'm sure there |
||
418 | * are many performance and logic improvements that could be made. This |
||
419 | * is essentially a first draft. |
||
420 | * @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings |
||
421 | * because the former requires you to know the delimiter |
||
422 | */ |
||
423 | protected function lickDelimiter($eol = "\n") |
||
529 | |||
530 | /** |
||
531 | * Compare positional consistency of several characters to determine the |
||
532 | * probable delimiter character. The idea behind this is that the delimiter |
||
533 | * character is likely more consistently distributed than false-positive |
||
534 | * delimiter characters produced by lickDelimiter(). For instance, consider |
||
535 | * a series of rows similar to the following: |
||
536 | * |
||
537 | * 1,luke,visinoni,luke@example.com,(530) 413-3076,04-23-1986 |
||
538 | * |
||
539 | * The lickDelimiter() method will often not be able to determine whether the |
||
540 | * delimiter is a comma or a dash because they occur the same number of times |
||
541 | * on just about every line (5 for comma, 3 for dash). The difference is |
||
542 | * obvious to you, no doubt. But we humans are pattern-recognition machines! |
||
543 | * The difference between the comma and the dash are that the comma is dist- |
||
544 | * ributed almost evenly throughout the line. The dash characters occur |
||
545 | * entirely at the end of the line. This method accepts any number of possible |
||
546 | * delimiter characters and returns the one that is distributed most evenly. |
||
547 | * |
||
548 | * If delim character cannot be determined by lickQuoteAndDelim(), taster |
||
549 | * tries lickDelimiter(). When that method runs into a tie, it will use this |
||
550 | * as a tie-breaker. |
||
551 | * |
||
552 | * @param array $delims Possible delimiter characters (method chooses from |
||
553 | * this array of characters) |
||
554 | * @param string $eol The end-of-line character (or set of characters) |
||
555 | * |
||
556 | * @throws TasterException |
||
557 | * |
||
558 | * @return string The probable delimiter character |
||
559 | */ |
||
560 | protected function guessDelimByDistribution(array $delims, $eol = "\n") |
||
596 | |||
597 | /** |
||
598 | * Determine the "style" of data quoting. The CSV format, while having an RFC |
||
599 | * (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform |
||
600 | * to it. And it doesn't provide metadata such as the delimiting character, |
||
601 | * quote character, or what types of data are quoted. So this method makes a |
||
602 | * logical guess by finding which columns have been quoted (if any) and |
||
603 | * examining their data type. Most often, CSV files will only use quotes |
||
604 | * around columns that contain special characters such as the delimiter, |
||
605 | * the quoting character, newlines, etc. (we refer to this style as |
||
606 | * QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data |
||
607 | * (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns |
||
608 | * (QUOTE_ALL) and those that quote none (QUOTE_NONE). |
||
609 | * |
||
610 | * @param string $delim The character used as the column delimiter |
||
611 | * @param string $eol The character used for newlines |
||
612 | * |
||
613 | * @return string One of four "QUOTING_" constants defined above--see this |
||
614 | * method's description for more info. |
||
615 | * |
||
616 | * @todo Refactor this method--It needs more thorough testing against a wider |
||
617 | * variety of CSV data to be sure it works reliably. And I'm sure there |
||
618 | * are many performance and logic improvements that could be made. This |
||
619 | * is essentially a first draft. |
||
620 | */ |
||
621 | protected function lickQuotingStyle($delim, $eol) |
||
696 | |||
697 | /** |
||
698 | * Remove quotes around a piece of text (if there are any). |
||
699 | * |
||
700 | * @param string $data The data to "unquote" |
||
701 | * |
||
702 | * @return string The data passed in, only with quotes stripped (off the edges) |
||
703 | */ |
||
704 | protected function unQuote($data) |
||
708 | |||
709 | /** |
||
710 | * Determine whether a particular string of data has quotes around it. |
||
711 | * |
||
712 | * @param string $data The data to check |
||
713 | * |
||
714 | * @return bool Whether the data is quoted or not |
||
715 | */ |
||
716 | protected function isQuoted($data) |
||
720 | |||
721 | /** |
||
722 | * Determine what type of data is contained within a variable |
||
723 | * Possible types: |
||
724 | * - nonnumeric - only numbers |
||
725 | * - special - contains characters that could potentially need to be quoted (possible delimiter characters) |
||
726 | * - unknown - everything else |
||
727 | * This method is really only used within the "lickQuotingStyle" method to |
||
728 | * help determine whether a particular column has been quoted due to it being |
||
729 | * nonnumeric or because it has some special character in it such as a delimiter |
||
730 | * or newline or quote. |
||
731 | * |
||
732 | * @param string $data The data to determine the type of |
||
733 | * |
||
734 | * @return string The type of data (one of the "DATA_" constants above) |
||
735 | * |
||
736 | * @todo I could probably eliminate this method and use an anonymous function |
||
737 | * instead. It isn't used anywhere else and its name could be misleading. |
||
738 | * Especially since I also have a lickType method that is used within the |
||
739 | * lickHeader method. |
||
740 | */ |
||
741 | protected function lickDataType($data) |
||
753 | |||
754 | /** |
||
755 | * Replace all instances of newlines and whatever character you specify (as |
||
756 | * the delimiter) that are contained within quoted text. The replacements are |
||
757 | * simply a special placeholder string. This is done so that I can use the |
||
758 | * very unsmart "explode" function and not have to worry about it exploding |
||
759 | * on delimiters or newlines within quotes. Once I have exploded, I typically |
||
760 | * sub back in the real characters before doing anything else. Although |
||
761 | * currently there is no dedicated method for doing so; I just use str_replace. |
||
762 | * |
||
763 | * @param string $data The string to do the replacements on |
||
764 | * @param string $delim The delimiter character to replace |
||
765 | * |
||
766 | * @return string The data with replacements performed |
||
767 | * |
||
768 | * @todo I could probably pass in (maybe optionally) the newline character I |
||
769 | * want to replace as well. I'll do that if I need to. |
||
770 | */ |
||
771 | protected function replaceQuotedSpecialChars($data, $delim) |
||
780 | |||
781 | /** |
||
782 | * Determine the "type" of a particular string of data. Used for the lickHeader |
||
783 | * method to assign a type to each column to try to determine whether the |
||
784 | * first row is different than a consistent column type. |
||
785 | * |
||
786 | * @todo As I'm writing this method I'm beginning to realize how expensive |
||
787 | * the lickHeader method is going to end up being since it has to apply all |
||
788 | * these regexes (potentially) to every column. I may end up writing a much |
||
789 | * simpler type-checking method than this if it proves to be too expensive |
||
790 | * to be practical. |
||
791 | * |
||
792 | * @param string $data The string of data to check the type of |
||
793 | * |
||
794 | * @return string One of the TYPE_ string constants above |
||
795 | */ |
||
796 | protected function lickType($data) |
||
840 | } |
||
841 |