NGramWordTokenizer   A
last analyzed

Complexity

Total Complexity 10

Size/Duplication

Total Lines 54
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 10
eloc 22
c 1
b 0
f 0
dl 0
loc 54
rs 10

3 Methods

Rating   Name   Duplication   Size   Complexity  
A tokenize() 0 12 2
A getNgrams() 0 15 4
A __construct() 0 8 4
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Tokenization;
6
7
use Phpml\Exception\InvalidArgumentException;
8
9
/**
 * Word tokenizer that emits word-level n-grams instead of single words.
 *
 * For every size between the configured minimum and maximum (inclusive), each
 * run of that many consecutive words is joined with a single space and returned
 * as one token.
 */
class NGramWordTokenizer extends WordTokenizer
{
    /**
     * Smallest n-gram size, in words, that is generated.
     *
     * @var int
     */
    private $minGram;

    /**
     * Largest n-gram size, in words, that is generated.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram lower bound of the n-gram size range; must be at least 1
     * @param int $maxGram upper bound of the n-gram size range; must be at least 1 and not below $minGram
     *
     * @throws InvalidArgumentException when either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        $isValidRange = $minGram >= 1 && $maxGram >= 1 && $minGram <= $maxGram;
        if (!$isValidRange) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * Extracts the words of the text (runs of two or more word characters) and
     * returns all word n-grams, grouped by size from minGram up to maxGram and,
     * within each size, in order of appearance.
     *
     * {@inheritdoc}
     */
    public function tokenize(string $text): array
    {
        preg_match_all('/\w\w+/u', $text, $matches);
        $tokens = $matches[0];

        $result = [];
        foreach (range($this->minGram, $this->maxGram) as $size) {
            // Append one by one so the result stays a sequentially indexed list.
            foreach ($this->getNgrams($tokens, $size) as $gram) {
                $result[] = $gram;
            }
        }

        return $result;
    }

    /**
     * Builds every n-gram of exactly $n consecutive entries of $match.
     *
     * @param array $match sequentially indexed list of words
     * @param int   $n     number of words per n-gram; the constructor guarantees this is at least 1
     *
     * @return array list of n-grams, each made of $n words joined by single spaces
     */
    private function getNgrams(array $match, int $n = 2): array
    {
        $grams = [];
        $total = count($match);

        // The first complete window ends at index $n - 1; every later index closes one more window.
        for ($end = $n - 1; $end < $total; $end++) {
            $window = array_slice($match, $end - $n + 1, $n);
            $grams[] = trim(implode(' ', $window));
        }

        return $grams;
    }
}
65