Test Setup Failed
Pull Request — master (#350)
by Arkadiusz
07:19
created

NGramTokenizer::generateNGrams()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 12
rs 9.8666
c 0
b 0
f 0
cc 4
nc 4
nop 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Tokenization;
6
7
use Phpml\Exception\InvalidArgumentException;
8
9
/**
 * Tokenizer that emits character n-grams taken from inside each word of a text.
 *
 * Words are the matches of the pattern /\w\w+/u (runs of two or more `\w`
 * characters). For every word, all contiguous substrings whose length lies
 * between minGram and maxGram (inclusive) are produced.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * Smallest n-gram length (in characters) to emit; always >= 1.
     *
     * @var int
     */
    private $minGram;

    /**
     * Largest n-gram length (in characters) to emit; always >= $minGram.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram smallest n-gram length to generate (must be >= 1)
     * @param int $maxGram largest n-gram length to generate (must be >= $minGram)
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a flat list of n-grams: words are processed in the order they
     * appear in $text; within a word, n-grams are ordered by increasing
     * length and, for equal length, by increasing start offset.
     *
     * @return string[]
     */
    public function tokenize(string $text): array
    {
        $words = [];
        preg_match_all('/\w\w+/u', $text, $words);

        $nGrams = [];
        // preg_match_all() can fail (it returns false, e.g. for a subject that
        // is not valid UTF-8 under the /u modifier); fall back to an empty word
        // list instead of iterating over a full-match group that may be absent.
        foreach ($words[0] ?? [] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends every n-gram of $word with a length in [minGram, maxGram] to $nGrams.
     *
     * @param string   $word   word to slice; lengths and offsets are counted in characters (mb_*)
     * @param string[] $nGrams accumulator the generated n-grams are appended to
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        $length = mb_strlen($word);

        // Sizes below minGram are never emitted and sizes above the word
        // length cannot fit, so only the useful range is walked.
        $largestSize = min($this->maxGram, $length);

        for ($size = $this->minGram; $size <= $largestSize; $size++) {
            $lastOffset = $length - $size;

            for ($offset = 0; $offset <= $lastOffset; $offset++) {
                $nGrams[] = mb_substr($word, $offset, $size);
            }
        }
    }
}
60