1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace PhpSpellcheck\Spellchecker; |
6
|
|
|
|
7
|
|
|
use PhpSpellcheck\Misspelling; |
8
|
|
|
use PhpSpellcheck\Spellchecker\LanguageTool\LanguageToolApiClient; |
9
|
|
|
use PhpSpellcheck\Utils\SortedNumericArrayNearestValueFinder; |
10
|
|
|
use PhpSpellcheck\Utils\TextEncoding; |
11
|
|
|
use Webmozart\Assert\Assert; |
12
|
|
|
|
13
|
|
|
/**
 * Spellchecker backed by a LanguageTool API client.
 *
 * Matches returned by the API client carry an offset relative to the whole
 * text; this class converts that offset into a (line offset, line number)
 * pair before building each Misspelling.
 */
class LanguageTool implements SpellcheckerInterface
{
    /**
     * Client every spellcheck and supported-language request is delegated to.
     *
     * @var LanguageToolApiClient
     */
    private $apiClient;

    public function __construct(LanguageToolApiClient $apiClient)
    {
        $this->apiClient = $apiClient;
    }

    /**
     * Spellchecks $text and yields one Misspelling per match reported by the API client.
     *
     * This method is a generator: nothing runs (including the assertion on
     * $languages and the API call) until the returned iterable is iterated.
     *
     * @param string      $text      text to check
     * @param array       $languages languages forwarded to the API client; must not be empty
     * @param array       $context   the entry keyed by this class name (if any) is forwarded to the
     *                               API client as options; the whole array is merged over the
     *                               per-match 'sentence', 'spellingErrorMessage' and 'ruleUsed'
     *                               entries, so colliding keys from $context win
     * @param string|null $encoding  encoding used to locate line breaks in $text; null falls back
     *                               to the mbstring internal encoding
     *
     * @return Misspelling[]
     */
    public function check(
        string $text,
        array $languages = [],
        array $context = [],
        ?string $encoding = TextEncoding::UTF8
    ): iterable {
        Assert::notEmpty($languages, 'LanguageTool requires at least one language to run it\'s spellchecking process');

        $check = $this->apiClient->spellCheck($text, $languages, $context[self::class] ?? []);
        $lineBreaksOffset = $this->getLineBreaksOffset($text, $encoding);

        foreach ($check['matches'] as $match) {
            [$offsetFromLine, $line] = $this->computeRealOffsetAndLine($match, $lineBreaksOffset);

            yield new Misspelling(
                // The misspelled word is cut out of the context snippet attached to the match.
                mb_substr($match['context']['text'], $match['context']['offset'], $match['context']['length']),
                $offsetFromLine,
                $line, // line break index transformed in line number
                array_column($match['replacements'], 'value'),
                array_merge(
                    [
                        'sentence' => $match['sentence'],
                        'spellingErrorMessage' => $match['message'],
                        'ruleUsed' => $match['rule'],
                    ],
                    $context
                )
            );
        }
    }

    /**
     * {@inheritdoc}
     */
    public function getSupportedLanguages(): iterable
    {
        return $this->apiClient->getSupportedLanguages();
    }

    /**
     * Converts the text-wide offset of a match into an offset within its line and a 1-based line number.
     *
     * For matches after the first line the offset is measured from the position
     * of the preceding line break; on the first line it is the text-wide offset.
     *
     * NOTE(review): the branch order below relies on findIndex(..., FIND_HIGHER)
     * returning the index of the nearest line break at or after the offset and
     * falling back to the last line break when there is none. Confirm against
     * SortedNumericArrayNearestValueFinder.
     *
     * @param array $match            a match entry from the API response; only 'offset' is read
     * @param array $lineBreaksOffset ascending positions of the line breaks in the checked text
     *
     * @return array [offset from line, line number]
     */
    private function computeRealOffsetAndLine(array $match, array $lineBreaksOffset): array
    {
        $languageToolsOffset = (int) $match['offset'];

        if ($lineBreaksOffset === []) {
            // Single-line text: there is nothing to search, the match is on line 1.
            return [$languageToolsOffset, 1];
        }

        $index = SortedNumericArrayNearestValueFinder::findIndex(
            $languageToolsOffset,
            $lineBreaksOffset,
            SortedNumericArrayNearestValueFinder::FIND_HIGHER
        );

        if ($languageToolsOffset > $lineBreaksOffset[$index]) {
            // The match lies past the line break found, so it sits on the line that
            // follows it. This must be tested before the first-line case: with a
            // single line break (or one at position 0) $index is 0 here as well.
            return [$languageToolsOffset - $lineBreaksOffset[$index], $index + 2];
        }

        if ($index === 0) {
            // word is on the first line
            return [$languageToolsOffset, 1];
        }

        // The match sits between the line breaks at $index - 1 and $index.
        return [$languageToolsOffset - $lineBreaksOffset[$index - 1], $index + 1];
    }

    /**
     * Collects the character positions of every PHP_EOL occurrence in $text, in ascending order.
     *
     * @param string      $text     text to scan
     * @param string|null $encoding encoding passed to mb_strpos; null falls back to the
     *                              mbstring internal encoding
     *
     * @return array positions of the line breaks; empty when $text has none
     */
    private function getLineBreaksOffset(string $text, ?string $encoding): array
    {
        if ($encoding === null) {
            $encoding = \Safe\mb_internal_encoding();
        }

        $start = 0;
        $lineBreaksOffset = [];

        // Strict comparison is required: a line break at position 0 yields 0,
        // which a loose comparison would treat as "not found".
        while (($pos = \mb_strpos($text, PHP_EOL, $start, $encoding)) !== false) {
            $lineBreaksOffset[] = $pos;
            $start = $pos + 1; // start searching from next position.
        }

        return $lineBreaksOffset;
    }
}
110
|
|
|
|