Test Setup Failed
Pull Request — master (#350)
by Pol
02:39
created

NGramTokenizer   A

Complexity

Total Complexity 6

Size/Duplication

Total Lines 49
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 3

Importance

Changes 0
Metric Value
wmc 6
lcom 1
cbo 3
dl 0
loc 49
rs 10
c 0
b 0
f 0

2 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 5 1
A tokenize() 0 20 5
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Tokenization;
6
7
use drupol\phpngrams\NGrams;
8
9
/**
 * Tokenizer that expands every word of a text into character n-grams.
 *
 * Words are obtained from the parent WordTokenizer; each word is split into
 * characters and handed to the n-gram factory once per requested size. The
 * resulting character groups are glued back into strings and returned as a
 * flat list.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * N-gram sizes to generate for every word, or null to generate every size
     * from 1 up to the number of characters in the word.
     *
     * @var array|null
     */
    private $sizes;

    /**
     * The NGram factory.
     *
     * @var \drupol\phpngrams\NGramsInterface
     */
    private $ngramsFactory;

    /**
     * @param array|null $sizes N-gram sizes passed to the factory for each word;
     *                          null means every size from 1 to the word length.
     */
    public function __construct(?array $sizes = null)
    {
        $this->sizes = $sizes;
        $this->ngramsFactory = new NGrams();
    }

    /**
     * {@inheritdoc}
     *
     * Each word produced by the parent tokenizer is replaced by its character
     * n-grams. Words are split on character boundaries (not bytes), so
     * multibyte UTF-8 characters are never cut in half.
     *
     * @param string $text Text to tokenize.
     *
     * @return string[] Flat list of n-grams, grouped by word and then by size.
     */
    public function tokenize(string $text): array
    {
        $tokens = [];

        foreach (parent::tokenize($text) as $word) {
            // Split once per word; the result is reused for every size.
            $characters = $this->splitCharacters($word);

            if ($characters === []) {
                // Nothing to build n-grams from; also avoids range(1, 0),
                // which would yield the descending list [1, 0].
                continue;
            }

            $lengths = $this->sizes ?? range(1, count($characters));

            foreach ($lengths as $length) {
                foreach ($this->ngramsFactory->ngrams($characters, $length) as $ngram) {
                    $tokens[] = implode('', $ngram);
                }
            }
        }

        return $tokens;
    }

    /**
     * Splits a word into a list of single characters.
     *
     * The word is treated as UTF-8. When it is not valid UTF-8 the PCRE split
     * fails, and the word is split byte by byte instead so it still
     * contributes tokens.
     *
     * @return string[]
     */
    private function splitCharacters(string $word): array
    {
        if ($word === '') {
            return [];
        }

        $characters = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);

        return $characters !== false ? $characters : str_split($word);
    }
}
61