|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
declare(strict_types=1); |
|
4
|
|
|
|
|
5
|
|
|
namespace PhpSpellcheck\Spellchecker; |
|
6
|
|
|
|
|
7
|
|
|
use PhpSpellcheck\Misspelling; |
|
8
|
|
|
use PhpSpellcheck\Spellchecker\LanguageTool\LanguageToolApiClient; |
|
9
|
|
|
use PhpSpellcheck\Utils\SortedNumericArrayNearestValueFinder; |
|
10
|
|
|
use PhpSpellcheck\Utils\TextEncoding; |
|
11
|
|
|
use Webmozart\Assert\Assert; |
|
12
|
|
|
|
|
13
|
|
|
/**
 * Spellchecker backed by a LanguageTool API client.
 *
 * Delegates the actual check to {@see LanguageToolApiClient} and converts each
 * returned match into a {@see Misspelling}, translating the offset reported for
 * the whole text into an offset/line pair.
 */
class LanguageTool implements SpellcheckerInterface
{
    /**
     * @var LanguageToolApiClient
     */
    private $apiClient;

    public function __construct(LanguageToolApiClient $apiClient)
    {
        $this->apiClient = $apiClient;
    }

    /**
     * Checks $text through the API client and yields one Misspelling per match.
     *
     * This method is a generator: the assertion on $languages and the API call
     * only run once the returned iterable starts being consumed.
     *
     * @param string      $text      text to check
     * @param array       $languages languages forwarded to the API client; must not be empty
     * @param array       $context   merged into every Misspelling's context; the entry keyed by
     *                               this class name (if any) is forwarded to the API client as options
     * @param string|null $encoding  encoding used to locate line breaks in $text;
     *                               null falls back to the mbstring internal encoding
     *
     * @return Misspelling[]
     */
    public function check(
        string $text,
        array $languages = [],
        array $context = [],
        ?string $encoding = TextEncoding::UTF8
    ): iterable {
        Assert::notEmpty($languages, 'LanguageTool requires at least one language to run it\'s spellchecking process');

        $check = $this->apiClient->spellCheck($text, $languages, $context[self::class] ?? []);
        $lineBreaksOffset = $this->getLineBreaksOffset($text, $encoding);

        foreach ($check['matches'] as $match) {
            [$offsetFromLine, $line] = $this->computeRealOffsetAndLine($match, $lineBreaksOffset);

            yield new Misspelling(
                // The misspelled word is cut out of the context snippet attached to the match.
                mb_substr($match['context']['text'], $match['context']['offset'], $match['context']['length']),
                $offsetFromLine,
                $line, // line break index transformed in line number
                array_column($match['replacements'], 'value'),
                // $context comes last, so caller-supplied keys win over the ones set here.
                array_merge(
                    [
                        'sentence' => $match['sentence'],
                        'spellingErrorMessage' => $match['message'],
                        'ruleUsed' => $match['rule'],
                    ],
                    $context
                )
            );
        }
    }

    /**
     * {@inheritdoc}
     */
    public function getSupportedLanguages(): iterable
    {
        return $this->apiClient->getSupportedLanguages();
    }

    /**
     * Converts the match offset (relative to the whole text) into an offset/line pair.
     *
     * @param array $match            a single match; only its 'offset' entry is read
     * @param array $lineBreaksOffset ascending line break positions, as returned by getLineBreaksOffset()
     *
     * @return array [offset from line, line number], the first line being line 1
     */
    private function computeRealOffsetAndLine(array $match, array $lineBreaksOffset): array
    {
        $languageToolsOffset = (int) $match['offset'];

        if ($lineBreaksOffset === []) {
            // No line break at all: the text is a single line, nothing to look up.
            return [$languageToolsOffset, 1];
        }

        // NOTE(review): assumes FIND_HIGHER returns the index of the nearest line break at or
        // after the offset, and the last index when there is none - confirm against the finder.
        $index = SortedNumericArrayNearestValueFinder::findIndex(
            $languageToolsOffset,
            $lineBreaksOffset,
            SortedNumericArrayNearestValueFinder::FIND_HIGHER
        );

        if ($languageToolsOffset > $lineBreaksOffset[$index]) {
            // The offset lies beyond the returned line break: the word is on the last line.
            // This is tested before the first-line case so that a text with a single line
            // break (where $index is 0) is not mistaken for a first-line match.
            $offsetFromLine = $languageToolsOffset - $lineBreaksOffset[$index];
            $line = $index + 2;
        } elseif ($index === 0) {
            // word is on the first line
            $offsetFromLine = $languageToolsOffset;
            $line = 1;
        } else {
            // The word sits between the previous line break and the returned one.
            $offsetFromLine = $languageToolsOffset - $lineBreaksOffset[$index - 1];
            $line = $index + 1;
        }

        return [$offsetFromLine, $line];
    }

    /**
     * Collects the position of every PHP_EOL occurrence in $text.
     *
     * @param string      $text     text to scan
     * @param string|null $encoding encoding passed to mb_strpos(); null uses the mbstring internal encoding
     *
     * @return int[] positions in ascending order; empty when $text contains no PHP_EOL
     */
    private function getLineBreaksOffset(string $text, ?string $encoding): array
    {
        if ($encoding === null) {
            $encoding = \Safe\mb_internal_encoding();
        }

        $start = 0;
        $lineBreaksOffset = [];

        // Strict comparison is required: a line break at position 0 is a valid hit,
        // and a loose comparison against false would end the scan on it.
        while (($pos = \mb_strpos($text, PHP_EOL, $start, $encoding)) !== false) {
            $lineBreaksOffset[] = $pos;
            $start = $pos + 1; // start searching from next position.
        }

        return $lineBreaksOffset;
    }
}
|
110
|
|
|
|