| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  |  * Copyright (c) 2008-2011 Andreas Heigl<[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  * Permission is hereby granted, free of charge, to any person obtaining a copy | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  * of this software and associated documentation files (the "Software"), to deal | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |  * in the Software without restriction, including without limitation the rights | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  * copies of the Software, and to permit persons to whom the Software is | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  * furnished to do so, subject to the following conditions: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  * The above copyright notice and this permission notice shall be included in | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |  * all copies or substantial portions of the Software. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |  * THE SOFTWARE. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |  * @category   Hyphenation | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |  * @package    Org_Heigl_Hyphenator | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |  * @subpackage Tokenizer | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |  * @author     Andreas Heigl <[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |  * @copyright  2008-2011 Andreas Heigl<[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |  * @license    http://www.opensource.org/licenses/mit-license.php MIT-License | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |  * @version    2.0.1 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |  * @link       http://github.com/heiglandreas/Hyphenator | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |  * @since      11.11.2011 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |  */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  | namespace Org\Heigl\Hyphenator\Tokenizer; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  | /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |  * Use Whitespace to split any input into tokens | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |  * @category   Hyphenation | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |  * @package    Org_Heigl_Hyphenator | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |  * @subpackage Tokenizer | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |  * @author     Andreas Heigl <[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |  * @copyright  2008-2011 Andreas Heigl<[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |  * @license    http://www.opensource.org/licenses/mit-license.php MIT-License | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |  * @version    2.0.1 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |  * @link       http://github.com/heiglandreas/Hyphenator | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |  * @since      04.11.2011 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |  */ | 
            
                                                                                                            
                            
            
                                    
            
            
                class WhitespaceTokenizer implements Tokenizer
                {
                    /**
                     * Fragments that make up the whitespace character class.
                     *
                     * The entries are concatenated and placed between "[" and "]" inside
                     * the regular expressions built in tokenize(), so every entry has to
                     * be valid inside a PCRE character class.
                     *
                     * @var string[]
                     */
                    protected $whitespaces = array(
                      '\s',           // white space
                      "\xE2\x80\xAF", // non-breaking thin white space
                      "\xC2\xA0",     // non-breaking space
                    );

                    /**
                     * Split the given input into tokens using whitespace as splitter
                     *
                     * The input can be a string or a TokenRegistry.
                     *
                     * For a TokenRegistry every contained WordToken is tokenized again; all
                     * other tokens are skipped. When the tokenization of a WordToken yields
                     * something that differs from the token itself, the result is handed to
                     * the registry's replace() method. The very same registry instance is
                     * returned.
                     *
                     * For any other input the value is tokenized as a string and the
                     * resulting tokens are added to a newly created TokenRegistry.
                     *
                     * @param string|\Org\Heigl\Hyphenator\Tokenizer\TokenRegistry $input The
                     * input to be tokenized
                     *
                     * @return \Org\Heigl\Hyphenator\Tokenizer\TokenRegistry
                     */
                    public function run($input)
                    {
                        if ($input instanceof TokenRegistry) {
                            // Tokenize a TokenRegistry
                            foreach ($input as $token) {
                                if (! $token instanceof WordToken) {
                                    continue;
                                }
                                $newTokens = $this->tokenize($token->get());
                                // Loose comparison on purpose: an unchanged word yields one
                                // WordToken that is equal (not identical) to the original.
                                if ($newTokens == array($token)) {
                                    continue;
                                }
                                $input->replace($token, $newTokens);
                            }

                            return $input;
                        }

                        // Tokenize a simple string.
                        $registry = new TokenRegistry();
                        foreach ($this->tokenize($input) as $item) {
                            $registry->add($item);
                        }

                        return $registry;
                    }

                    /**
                     * Split the given string into tokens using whitespace.
                     *
                     * Each run of whitespace is placed in a WhitespaceToken and everything
                     * else is placed in a WordToken-Object. The original order of the parts
                     * is kept. Empty parts (e.g. in front of leading whitespace) are kept as
                     * WordTokens as well.
                     *
                     * When the input can not be split (preg_split() reports a failure, for
                     * example because the input is not valid UTF-8) the complete input is
                     * returned as one single WordToken instead of being dropped.
                     *
                     * @param string $input The String to tokenize
                     *
                     * @return Token[] List of WhitespaceToken and WordToken instances
                     */
                    private function tokenize($input)
                    {
                        // Build the character class once instead of once per loop iteration.
                        $characterClass = implode('', $this->whitespaces);

                        $splits = preg_split(
                            '/([' . $characterClass . ']+)/u',
                            $input,
                            -1,
                            PREG_SPLIT_DELIM_CAPTURE
                        );

                        if (false === $splits) {
                            // preg_split() failed; keep the text instead of losing it.
                            return array(new WordToken($input));
                        }

                        $whitespaceOnly = '/^[' . $characterClass . ']+$/um';

                        $tokens = array();
                        foreach ($splits as $split) {
                            if (preg_match($whitespaceOnly, $split)) {
                                $tokens[] = new WhitespaceToken($split);
                                continue;
                            }
                            $tokens[] = new WordToken($split);
                        }

                        return $tokens;
                    }
                }
            
                                                        
            
                                    
            
            
                | 122 |  |  |  |