Completed
Push — master ( 9401a4...2cc7a2 )
by mw
13:59
created

NGramTokenizer::createNGrams()   C

Complexity

Conditions 7
Paths 8

Size

Total Lines 31
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 31
rs 6.7272
cc 7
eloc 14
nc 8
nop 3
1
<?php
2
3
namespace Onoi\Tesa\Tokenizer;
4
5
/**
 * Tokenizer that splits a string into overlapping character n-grams,
 * optionally running the input through another tokenizer beforehand.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class NGramTokenizer implements Tokenizer {

	/**
	 * Optional tokenizer applied to the input before n-grams are built
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Number of characters per n-gram
	 *
	 * @var integer
	 */
	private $ngramSize = 2;

	/**
	 * Whether `_` is used to mark word boundaries and to pad short n-grams
	 *
	 * @var boolean
	 */
	private $withMarker = false;

	/**
	 * @since 0.1
	 *
	 * @param Tokenizer|null $tokenizer tokenizer to run first, or null for none
	 * @param integer $ngramSize number of characters per n-gram
	 */
	public function __construct( Tokenizer $tokenizer = null, $ngramSize = 2 ) {
		$this->tokenizer = $tokenizer;
		$this->ngramSize = (int)$ngramSize;
	}

	/**
	 * Enables or disables the `_` boundary marker.
	 *
	 * @since 0.1
	 *
	 * @param boolean $withMarker
	 */
	public function withMarker( $withMarker ) {
		$this->withMarker = (bool)$withMarker;
	}

	/**
	 * Sets the number of characters per n-gram.
	 *
	 * @since 0.1
	 *
	 * @param integer $ngramSize
	 */
	public function setNgramSize( $ngramSize ) {
		$this->ngramSize = (int)$ngramSize;
	}

	/**
	 * Forwards the option to the wrapped tokenizer; it is ignored when no
	 * tokenizer has been injected.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {
		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}
	}

	/**
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return false;
	}

	/**
	 * Builds the n-gram list for the given string. When a tokenizer was
	 * injected, its tokens are joined by a single space before the n-grams
	 * are created.
	 *
	 * @since 0.1
	 *
	 * {@inheritDoc}
	 */
	public function tokenize( $string ) {

		if ( $this->tokenizer !== null ) {
			$string = implode( " ", $this->tokenizer->tokenize( $string ) );
		}

		// createNGrams always returns an array, no failure value to translate
		return $this->createNGrams( $string, $this->ngramSize, $this->withMarker );
	}

	/**
	 * Creates the list of lower-cased n-grams for a text.
	 *
	 * Without marker only n-grams of the full size are returned. With marker
	 * the text is prefixed with `_`, spaces are replaced by `_`, and n-grams
	 * that run past the end of the text are padded with `_` up to the
	 * requested size.
	 *
	 * @param string $text
	 * @param integer $ngramSize
	 * @param boolean $withMarker
	 *
	 * @return string[] empty when $ngramSize is smaller than 1
	 */
	private function createNGrams( $text, $ngramSize, $withMarker ) {

		// A non-positive size would only produce empty or truncated garbage
		// tokens from mb_substr, hence no n-grams are returned at all
		if ( $ngramSize < 1 ) {
			return [];
		}

		// Identify the beginning-of-word and end-of-word
		if ( $withMarker ) {
			$text = '_' . str_replace( ' ', '_', $text );
		}

		// Explicit encoding to agree with mb_strlen/mb_substr below instead of
		// depending on the configured internal encoding
		$text = mb_strtolower( $text, 'UTF-8' );
		$textLength = mb_strlen( $text, 'UTF-8' );

		// Without marker, offsets that cannot supply a full-size n-gram are
		// left out; with marker every offset is used and padded when short
		$lastOffset = $withMarker ? $textLength - 1 : $textLength - $ngramSize;

		$ngramList = [];

		for ( $i = 0; $i <= $lastOffset; ++$i ) {

			$ngram = mb_substr( $text, $i, $ngramSize, 'UTF-8' );

			// str_pad has issues with utf-8 length
			if ( $withMarker ) {
				$missing = $ngramSize - mb_strlen( $ngram, 'UTF-8' );

				if ( $missing > 0 ) {
					$ngram .= str_repeat( '_', $missing );
				}
			}

			$ngramList[] = $ngram;
		}

		return $ngramList;
	}

}
130