Test Setup Failed
Pull Request — master (#350)
by Pol
02:59
created

NGramTokenizer   A

Complexity

Total Complexity 5

Size/Duplication

Total Lines 42
Duplicated Lines 0 %

Coupling/Cohesion

Components 0
Dependencies 2

Importance

Changes 0
Metric Value
wmc 5
lcom 0
cbo 2
dl 0
loc 42
rs 10
c 0
b 0
f 0

1 Method

Rating   Name   Duplication   Size   Complexity  
A tokenize() 0 36 5
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Tokenization;
6
7
/**
 * Tokenizer that expands every word of a text into character n-grams.
 *
 * Words come from the parent WordTokenizer; for each word, n-grams of every
 * size from 1 up to the word's character count are produced.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * Splits the text into words and returns the character n-grams of each word.
     *
     * For every word returned by the parent tokenizer, n-grams of each size
     * from 1 to the number of characters in the word are requested from the
     * n-gram factory and each n-gram is joined back into a string. Results
     * are appended in word order, then by increasing n-gram size.
     *
     * @param string $text Text to tokenize.
     *
     * @return string[] N-gram strings; empty when the parent tokenizer yields no words.
     */
    public function tokenize(string $text): array
    {
        $words = parent::tokenize($text);
        $ngramsFactory = new \drupol\phpngrams\NGrams();

        // One chunk per (word, size) pair; merged once at the end instead of
        // re-copying the growing result on every iteration.
        $chunks = [];

        foreach ($words as $word) {
            // Split on UTF-8 character boundaries so multibyte characters are
            // not torn into invalid byte fragments.
            $characters = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);

            if ($characters === false) {
                // Not valid UTF-8: fall back to byte-wise splitting.
                $characters = $word === '' ? [] : str_split($word);
            }

            $characterCount = count($characters);

            for ($size = 1; $size <= $characterCount; $size++) {
                $chunks[] = array_map(
                    'implode',
                    iterator_to_array(
                        $ngramsFactory->ngrams($characters, $size)
                    )
                );
            }
        }

        // The leading [] keeps the call valid when no chunk was collected.
        return array_merge([], ...$chunks);
    }
}
52