NGramTokenizer::createNGrams()   C
last analyzed

Complexity

Conditions 7
Paths 8

Size

Total Lines 31
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 17
CRAP Score 7

Importance

Changes 0
Metric Value
dl 0
loc 31
ccs 17
cts 17
cp 1
rs 6.7272
c 0
b 0
f 0
cc 7
eloc 14
nc 8
nop 3
crap 7
1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
/**
 * Tokenizer that splits a text into overlapping character n-grams.
 *
 * An optional inner tokenizer can be injected; when present, its tokens are
 * joined with a single space before the n-grams are created.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class NGramTokenizer implements Tokenizer {

	/**
	 * Optional tokenizer that pre-processes the input, null when not used
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Number of characters per n-gram
	 *
	 * @var integer
	 */
	private $ngramSize = 2;

	/**
	 * Whether word boundaries are marked with an underscore
	 *
	 * @var boolean
	 */
	private $withMarker = false;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer tokenizer applied before the n-grams are built
	 * @param integer $ngramSize number of characters per n-gram
	 */
	public function __construct( Tokenizer $tokenizer = null, $ngramSize = 2 ) {
		$this->tokenizer = $tokenizer;
		$this->ngramSize = (int)$ngramSize;
	}

	/**
	 * Enables or disables the boundary marker mode.
	 *
	 * When enabled, the text is prefixed with `_`, every space is replaced by
	 * `_`, and n-grams at the end of the text that are shorter than the n-gram
	 * size are padded with `_` instead of being dropped.
	 *
	 * @since 0.1
	 *
	 * @param boolean $withMarker
	 */
	public function withMarker( $withMarker ) {
		$this->withMarker = (bool)$withMarker;
	}

	/**
	 * @since 0.1
	 *
	 * @param integer $ngramSize number of characters per n-gram
	 */
	public function setNgramSize( $ngramSize ) {
		$this->ngramSize = (int)$ngramSize;
	}

	/**
	 * Forwards the option to the inner tokenizer; does nothing when no inner
	 * tokenizer was injected.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Returns the list of n-grams for the given string.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function tokenize( $string ) {

		// Let the inner tokenizer normalize the input first and continue with
		// its tokens joined by a single space
		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		// createNGrams always returns an array, no failure value to translate
		return $this->createNGrams( $string, $this->ngramSize, $this->withMarker );
	}

	/**
	 * Builds the lower-cased, overlapping n-grams of a text.
	 *
	 * @param string $text
	 * @param integer $ngramSize number of characters per n-gram
	 * @param boolean $withMarker whether to mark boundaries with `_` and to pad
	 *  trailing n-grams instead of dropping them
	 *
	 * @return string[] sequentially indexed list of n-grams
	 */
	private function createNGrams( $text, $ngramSize, $withMarker ) {

		$ngramList = array();

		// A size below 1 cannot produce a meaningful n-gram (mb_substr would
		// return empty strings or arbitrary tails of the text)
		if ( $ngramSize < 1 ) {
			return $ngramList;
		}

		// Identify the beginning-of-word and end-of-word
		if ( $withMarker ) {
			$text = '_' . str_replace( ' ', '_', $text );
		}

		// Use an explicit encoding so that lower-casing agrees with the length
		// and substring calls below, independent of mb_internal_encoding()
		$text = mb_strtolower( $text, 'UTF-8' );
		$textLength = mb_strlen( $text, 'UTF-8' );

		// Without marker only full-length n-grams are kept, hence stop at the
		// last position that still has $ngramSize characters available; with
		// marker every position yields a (possibly padded) n-gram
		$limit = $withMarker ? $textLength : $textLength - $ngramSize + 1;

		for ( $i = 0; $i < $limit; ++$i ) {

			$ngram = mb_substr( $text, $i, $ngramSize, 'UTF-8' );

			// str_pad has issues with utf-8 length
			if ( $withMarker && ( $ngl = mb_strlen( $ngram, 'UTF-8' ) ) < $ngramSize ) {
				$ngram .= str_repeat( '_', $ngramSize - $ngl );
			}

			$ngramList[] = $ngram;
		}

		return $ngramList;
	}

}
130