for testing and deploying your application
for finding and fixing issues
for empowering human code reviews
<?php
namespace Onoi\Tesa\Tokenizer;
/**
 * Tokenizer that splits a string on runs of punctuation, symbol and
 * whitespace characters. It can optionally wrap another tokenizer whose
 * output is joined with spaces and then split again by this class.
 *
 * @license GNU GPL v2+
 * @since 0.1
 *
 * @author mwjames
 */
class PunctuationRegExTokenizer implements Tokenizer {

	/**
	 * Optional inner tokenizer applied before the punctuation split.
	 *
	 * @var Tokenizer|null
	 */
	private $tokenizer;

	/**
	 * Text removed from the separator list before the split pattern is
	 * built; set through the REGEX_EXEMPTION option.
	 *
	 * @var string
	 */
	private $patternExemption = '';

	/**
	 * @param Tokenizer|null $tokenizer
	 */
	public function __construct( Tokenizer $tokenizer = null ) {
		$this->tokenizer = $tokenizer;
	}

	/**
	 * Forwards the option to the inner tokenizer (when one is set) and
	 * records the value of REGEX_EXEMPTION for use by tokenize().
	 *
	 * {@inheritDoc}
	 */
	public function setOption( $name, $value ) {

		if ( $this->tokenizer !== null ) {
			$this->tokenizer->setOption( $name, $value );
		}

		if ( $name === self::REGEX_EXEMPTION ) {
			$this->patternExemption = $value;
		}
	}

	/**
	 * Delegates to the inner tokenizer when one is set, otherwise
	 * reports true.
	 *
	 * {@inheritDoc}
	 */
	public function isWordTokenizer() {
		return $this->tokenizer !== null ? $this->tokenizer->isWordTokenizer() : true;
	}

	/**
	 * Splits the string on any run of separator characters and drops
	 * empty tokens.
	 *
	 * @param string $string
	 *
	 * @return array list of tokens; an empty array when splitting fails
	 */
	public function tokenize( $string ) {

		// The inner tokenizer is optional (the constructor accepts null),
		// so only delegate when one was actually provided.
		if ( $this->tokenizer !== null ) {
			$tokens = $this->tokenizer->tokenize( $string );

			// The tokenizer contract is documented as array|false; a failed
			// inner tokenization yields no tokens instead of a type error
			// in implode().
			if ( !is_array( $tokens ) ) {
				return array();
			}

			$string = implode( " ", $tokens );
		}

		// Members of a regex character class. The hyphen is escaped so it
		// is a literal and not a range operator, `]` is escaped so it does
		// not close the class early, and `'` is escaped for the PHP string.
		// NOTE(review): the first and second halves of this list overlap
		// heavily; the first half may originally have carried full-width
		// forms of these characters - confirm against upstream.
		$pattern = str_replace(
			$this->patternExemption,
			'',
			'_\-・,、;:!?.。…◆★◇□■()【】《》〈〉;:“”"〃\'`[\]{}「」@*\/&#%`^+<=>|~≪≫─$"_\-・,、;:!?.。()[\]{}「」@*\/&#%`^+<=>|~«»$"\s'
		);

		// -1 means "no limit"; passing null for the int limit parameter is
		// deprecated as of PHP 8.1.
		$result = preg_split( '/[' . $pattern . ']+/u', $string, -1, PREG_SPLIT_NO_EMPTY );

		// preg_split() returns false on failure (e.g. invalid UTF-8 input
		// or a pattern emptied by the exemption).
		if ( $result === false ) {
			$result = array();
		}

		return $result;
	}

}