for testing and deploying your application
for finding and fixing issues
for empowering human code reviews
<?php
declare(strict_types=1);
namespace Phpml\Tokenization;
use drupol\phpngrams\NGrams;
/**
 * Class NGramTokenizer
 *
 * Expands every token produced by the parent tokenizer into its character
 * n-grams, using the drupol/phpngrams factory to build them.
 */
class NGramTokenizer extends WordTokenizer
{
    /**
     * The sizes.
     *
     * N-gram lengths to generate for every word. When null, every length from
     * 1 up to the number of characters in the current word is used.
     *
     * @var array|null
     */
    private $sizes;

    /**
     * The NGram factory.
     *
     * @var \drupol\phpngrams\NGramsInterface
     */
    private $ngramsFactory;

    /**
     * NGramTokenizer constructor.
     *
     * @param array|null $sizes N-gram lengths to generate, or null to generate
     *                          all lengths from 1 to the length of each word.
     */
    public function __construct(?array $sizes = null)
    {
        $this->sizes = $sizes;
        $this->ngramsFactory = new NGrams();
    }

    /**
     * {@inheritdoc}
     *
     * Each token returned by the parent tokenizer is split into characters and
     * passed to the n-gram factory once per requested length; every resulting
     * n-gram is joined back into a string and appended to the result.
     *
     * @param string $text The text to tokenize.
     *
     * @return array Flat list of n-gram strings, in the order they are produced.
     */
    public function tokenize(string $text): array
    {
        $ngramDataset = [];

        foreach (parent::tokenize($text) as $word) {
            $characters = $this->splitCharacters($word);

            // Nothing to build from an empty word; also avoids range(1, 0),
            // which would yield the descending list [1, 0].
            if ($characters === []) {
                continue;
            }

            // Explicit sizes win; otherwise use every length the word allows.
            $lengths = $this->sizes ?? range(1, count($characters));

            foreach ($lengths as $length) {
                foreach ($this->ngramsFactory->ngrams($characters, $length) as $ngram) {
                    $ngramDataset[] = implode('', $ngram);
                }
            }
        }

        return $ngramDataset;
    }

    /**
     * Splits a word into its individual characters.
     *
     * The split is UTF-8 aware so that multibyte characters are never cut in
     * the middle. If the word is not valid UTF-8, the split falls back to a
     * byte-wise one.
     *
     * @param string $word The word to split.
     *
     * @return array List of single-character strings (empty for an empty word).
     */
    private function splitCharacters(string $word): array
    {
        if ($word === '') {
            return [];
        }

        $characters = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false when the subject is not valid UTF-8.
        return $characters === false ? str_split($word) : $characters;
    }
}