NGramTokenizer - Code Metrics - php-ai/php-ml - Measure and Improve Code Quality continuously with Scrutinizer

NGramTokenizer A
last analyzed 2020-05-15 05:48 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	46
Duplicated Lines	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
wmc	10
eloc	18
c	1
b	0
f	0
dl	0
loc	46
rs	10

3 Methods

Rating	Name	Size	Complexity
A	generateNGrams()	8	4
A	__construct()	8	4
A	tokenize()	11	2

<?php

declare(strict_types=1);

namespace Phpml\Tokenization;

use Phpml\Exception\InvalidArgumentException;

/**
 * Tokenizer that emits the character n-grams of every word found in a text.
 *
 * A "word" is any match of the pattern used in tokenize(): a run of two or
 * more word characters, matched in UTF-8 mode. For each word, every substring
 * whose length (in characters) lies between minGram and maxGram inclusive is
 * produced.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * Encoding passed to every multibyte string call.
     *
     * The word pattern is matched with the `u` modifier, i.e. as UTF-8, so the
     * mb_* functions must use the same encoding instead of whatever the
     * process-wide mbstring default happens to be.
     */
    private const ENCODING = 'UTF-8';

    /**
     * Smallest n-gram length (in characters) that is emitted.
     *
     * @var int
     */
    private $minGram;

    /**
     * Largest n-gram length (in characters) that is emitted.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram Minimum n-gram length, at least 1
     * @param int $maxGram Maximum n-gram length, at least $minGram
     *
     * @throws InvalidArgumentException When either bound is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * Splits the text into words and returns the n-grams of all words.
     *
     * N-grams are listed word by word; within a word they are ordered by
     * length first and by start position second.
     *
     * @param string $text Text to tokenize
     *
     * @return string[] Flat list of n-grams; empty when no word matches or
     *                  when the pattern match itself fails
     */
    public function tokenize(string $text): array
    {
        $words = [];

        // preg_match_all() returns false when matching fails (with the `u`
        // modifier that includes a subject that is not valid UTF-8). The
        // content of $words is not trusted in that case.
        if (preg_match_all('/\w\w+/u', $text, $words) === false) {
            return [];
        }

        $nGrams = [];
        foreach ($words[0] as $word) {
            $this->generateNGrams($word, $nGrams);
        }

        return $nGrams;
    }

    /**
     * Appends every n-gram of a single word to the given list.
     *
     * @param string   $word   Word to slice
     * @param string[] $nGrams List the n-grams are appended to (modified in place)
     */
    private function generateNGrams(string $word, array &$nGrams): void
    {
        $length = mb_strlen($word, self::ENCODING);

        // An n-gram can never be longer than the word it is taken from.
        $longest = min($this->maxGram, $length);

        // Start directly at minGram: shorter sizes never produce output.
        for ($size = $this->minGram; $size <= $longest; ++$size) {
            $lastOffset = $length - $size;

            for ($offset = 0; $offset <= $lastOffset; ++$offset) {
                $nGrams[] = mb_substr($word, $offset, $size, self::ENCODING);
            }
        }
    }
}


1			<?php
2
3			declare(strict_types=1);
4
5			namespace Phpml\Tokenization;
6
7			use Phpml\Exception\InvalidArgumentException;
8
9			class NGramTokenizer extends WordTokenizer
10			{
11			/**
12			* @var int
13			*/
14			private $minGram;
15
16			/**
17			* @var int
18			*/
19			private $maxGram;
20
21			public function __construct(int $minGram = 1, int $maxGram = 2)
22			{
23			if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
24			throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
25			}
26
27			$this->minGram = $minGram;
28			$this->maxGram = $maxGram;
29			}
30
31			/**
32			* {@inheritdoc}
33			*/
34			public function tokenize(string $text): array
35			{
36			$words = [];
37			preg_match_all('/\w\w+/u', $text, $words);
38
39			$nGrams = [];
40			foreach ($words[0] as $word) {
41			$this->generateNGrams($word, $nGrams);
42			}
43
44			return $nGrams;
45			}
46
47			private function generateNGrams(string $word, array &$nGrams): void
48			{
49			$length = mb_strlen($word);
50
51			for ($j = 1; $j <= $this->maxGram; $j++) {
52			for ($k = 0; $k < $length - $j + 1; $k++) {
53			if ($j >= $this->minGram) {
54			$nGrams[] = mb_substr($word, $k, $j);
55			}
56			}
57			}
58			}
59			}
60

php-ai / php-ml

NGramTokenizer A last analyzed 2020-05-15 05:48 UTC

Complexity

Size/Duplication

Importance

3 Methods

Duplication Side-by-Side

Filter issues like

NGramTokenizer A
last analyzed 2020-05-15 05:48 UTC