NGramWordTokenizer - Code Metrics - php-ai/php-ml - Measure and Improve Code Quality continuously with Scrutinizer

NGramWordTokenizer A
last analyzed 2020-05-15 05:48 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	54
Duplicated Lines	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
wmc	10
eloc	22
c	1
b	0
f	0
dl	0
loc	54
rs	10

3 Methods

Rating	Name	Size	Complexity
A	tokenize()	12	2
A	getNgrams()	15	4
A	__construct()	8	4

<?php

declare(strict_types=1);

namespace Phpml\Tokenization;

use Phpml\Exception\InvalidArgumentException;

/**
 * Word tokenizer that emits word n-grams instead of single words.
 *
 * The text is first split into words (runs of two or more word characters),
 * then every run of N consecutive words, for each N between the configured
 * minimum and maximum, is joined with a single space and returned as a token.
 */
class NGramWordTokenizer extends WordTokenizer
{
    /**
     * Smallest n-gram size (in words) to emit.
     *
     * @var int
     */
    private $minGram;

    /**
     * Largest n-gram size (in words) to emit.
     *
     * @var int
     */
    private $maxGram;

    /**
     * @param int $minGram Smallest n-gram size in words; must be at least 1
     * @param int $maxGram Largest n-gram size in words; must be at least $minGram
     *
     * @throws InvalidArgumentException When either size is below 1 or $minGram exceeds $maxGram
     */
    public function __construct(int $minGram = 1, int $maxGram = 2)
    {
        if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
            throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
        }

        $this->minGram = $minGram;
        $this->maxGram = $maxGram;
    }

    /**
     * Splits the text into words and returns every word n-gram whose size lies
     * between minGram and maxGram (inclusive).
     *
     * Tokens are grouped by size, smallest size first; within one size they
     * appear in the order they occur in the text.
     *
     * @param string $text Text to tokenize
     *
     * @return string[] Space-joined n-grams; empty when the text has fewer words than minGram
     */
    public function tokenize(string $text): array
    {
        preg_match_all('/\w\w+/u', $text, $matches);

        // Fall back to an empty list if the full-match group was not populated,
        // so the array-typed helper below never receives null.
        $words = $matches[0] ?? [];

        // Collect one list per n-gram size and merge once at the end rather
        // than re-copying the growing result on every iteration.
        $groups = [];
        for ($size = $this->minGram; $size <= $this->maxGram; $size++) {
            $groups[] = $this->getNgrams($words, $size);
        }

        // The leading [] keeps array_merge valid even with no groups to unpack.
        return array_merge([], ...$groups);
    }

    /**
     * Builds all n-grams of exactly $n consecutive words.
     *
     * @param string[] $match Sequentially indexed list of words
     * @param int      $n     Number of words per n-gram; expected to be at least 1
     *
     * @return string[] One space-joined string per window of $n words; empty when fewer than $n words are given
     */
    private function getNgrams(array $match, int $n = 2): array
    {
        $ngrams = [];

        // Index of the last position at which a full window of $n words still fits.
        $lastStart = count($match) - $n;
        for ($start = 0; $start <= $lastStart; $start++) {
            $ngrams[] = implode(' ', array_slice($match, $start, $n));
        }

        return $ngrams;
    }
}


1			<?php
2
3			declare(strict_types=1);
4
5			namespace Phpml\Tokenization;
6
7			use Phpml\Exception\InvalidArgumentException;
8
9			class NGramWordTokenizer extends WordTokenizer
10			{
11			/**
12			* @var int
13			*/
14			private $minGram;
15
16			/**
17			* @var int
18			*/
19			private $maxGram;
20
21			public function __construct(int $minGram = 1, int $maxGram = 2)
22			{
23			if ($minGram < 1 || $maxGram < 1 || $minGram > $maxGram) {
24			throw new InvalidArgumentException(sprintf('Invalid (%s, %s) minGram and maxGram value combination', $minGram, $maxGram));
25			}
26
27			$this->minGram = $minGram;
28			$this->maxGram = $maxGram;
29			}
30
31			/**
32			* {@inheritdoc}
33			*/
34			public function tokenize(string $text): array
35			{
36			preg_match_all('/\w\w+/u', $text, $words);
37
38			$words = $words[0];
39
40			$nGrams = [];
41			for ($j = $this->minGram; $j <= $this->maxGram; $j++) {
42			$nGrams = array_merge($nGrams, $this->getNgrams($words, $j));
43			}
44
45			return $nGrams;
46			}
47
48			private function getNgrams(array $match, int $n = 2): array
49			{
50			$ngrams = [];
51			$len = count($match);
52			for ($i = 0; $i < $len; $i++) {
53			if ($i > ($n - 2)) {
54			$ng = '';
55			for ($j = $n - 1; $j >= 0; $j--) {
56			$ng .= ' '.$match[$i - $j];
57			}
58			$ngrams[] = trim($ng);
59			}
60			}
61
62			return $ngrams;
63			}
64			}
65

php-ai / php-ml

NGramWordTokenizer A last analyzed 2020-05-15 05:48 UTC

Complexity

Size/Duplication

Importance

3 Methods

Duplication Side-by-Side

Filter issues like

NGramWordTokenizer A
last analyzed 2020-05-15 05:48 UTC