| Total Complexity | 10 |
| Total Lines | 46 |
| Duplicated Lines | 0 % |
| Changes | 1 | ||
| Bugs | 0 | Features | 0 |
| 1 | <?php |
||
| 9 | class NGramTokenizer extends WordTokenizer |
||
| 10 | { |
||
| 11 | /** |
||
| 12 | * @var int |
||
| 13 | */ |
||
| 14 | private $minGram; |
||
| 15 | |||
| 16 | /** |
||
| 17 | * @var int |
||
| 18 | */ |
||
| 19 | private $maxGram; |
||
| 20 | |||
/**
 * Validates the minGram/maxGram bounds and stores them on the instance.
 *
 * @param int $minGram lower bound; must be at least 1 and must not exceed $maxGram
 * @param int $maxGram upper bound; must be at least 1
 *
 * @throws InvalidArgumentException when either bound is below 1 or $minGram is greater than $maxGram
 */
public function __construct(int $minGram = 1, int $maxGram = 2)
{
    // Both bounds must be positive and correctly ordered before anything is stored.
    $boundsAreValid = $minGram >= 1 && $maxGram >= 1 && $minGram <= $maxGram;

    if (!$boundsAreValid) {
        $message = sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram);

        throw new InvalidArgumentException($message);
    }

    $this->maxGram = $maxGram;
    $this->minGram = $minGram;
}
||
| 30 | |||
/**
 * {@inheritdoc}
 *
 * Finds every run of two or more word characters in $text (pattern /\w\w+/u) and passes each
 * match to generateNGrams() together with the result array, which that helper receives by
 * reference.
 */
public function tokenize(string $text): array
{
    // preg_match_all() fills $matches through its by-reference parameter;
    // index 0 holds the full-pattern matches.
    $matches = [];
    preg_match_all('/\w\w+/u', $text, $matches);

    $result = [];
    foreach ($matches[0] as $candidate) {
        $this->generateNGrams($candidate, $result);
    }

    return $result;
}
||
| 46 | |||
| 47 | private function generateNGrams(string $word, array &$nGrams): void |
||
| 55 | } |
||
| 56 | } |
||
| 57 | } |
||
| 60 |