1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* FullTextSearch_ElasticSearch - Use Elasticsearch to index the content of your nextcloud |
4
|
|
|
* |
5
|
|
|
* This file is licensed under the Affero General Public License version 3 or |
6
|
|
|
* later. See the COPYING file. |
7
|
|
|
* |
8
|
|
|
* @author Maxence Lange <[email protected]> |
9
|
|
|
* @copyright 2018 |
10
|
|
|
* @license GNU AGPL version 3 or any later version |
11
|
|
|
* |
12
|
|
|
* This program is free software: you can redistribute it and/or modify |
13
|
|
|
* it under the terms of the GNU Affero General Public License as |
14
|
|
|
* published by the Free Software Foundation, either version 3 of the |
15
|
|
|
* License, or (at your option) any later version. |
16
|
|
|
* |
17
|
|
|
* This program is distributed in the hope that it will be useful, |
18
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
19
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
20
|
|
|
* GNU Affero General Public License for more details. |
21
|
|
|
* |
22
|
|
|
* You should have received a copy of the GNU Affero General Public License |
23
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>. |
24
|
|
|
* |
25
|
|
|
*/ |
26
|
|
|
|
27
|
|
|
namespace OCA\FullTextSearch_ElasticSearch\Service; |
28
|
|
|
|
29
|
|
|
use Elasticsearch\Client; |
30
|
|
|
use Elasticsearch\Common\Exceptions\Missing404Exception; |
31
|
|
|
use OCA\FullTextSearch\Model\IndexDocument; |
32
|
|
|
use OCA\FullTextSearch_ElasticSearch\Exceptions\AccessIsEmptyException; |
33
|
|
|
use OCA\FullTextSearch_ElasticSearch\Exceptions\ConfigurationException; |
34
|
|
|
|
35
|
|
|
|
36
|
|
|
/**
 * Builds the request arrays sent to Elasticsearch by this app: per-document
 * index/update/delete requests, the index settings and mappings, the
 * 'attachment' ingest pipeline definition and the delete-by-provider query.
 */
class IndexMappingService {

    /** @var ConfigService */
    private $configService;

    /** @var MiscService */
    private $miscService;


    /**
     * IndexMappingService constructor.
     *
     * @param ConfigService $configService
     * @param MiscService $miscService
     */
    public function __construct(ConfigService $configService, MiscService $miscService) {
        $this->configService = $configService;
        $this->miscService = $miscService;
    }


    /**
     * Index a document as a new entry.
     *
     * The entry id is "<providerId>:<documentId>" and the type is 'standard'.
     *
     * @param Client $client
     * @param IndexDocument $document
     *
     * @return array the value returned by $client->index()
     * @throws ConfigurationException
     * @throws AccessIsEmptyException
     */
    public function indexDocumentNew(Client $client, IndexDocument $document) {
        $request = [
            'index' => [
                'index' => $this->configService->getElasticIndex(),
                'id'    => $document->getProviderId() . ':' . $document->getId(),
                'type'  => 'standard',
                'body'  => $this->generateIndexBody($document)
            ]
        ];

        // May add the 'pipeline' entry to $request['index'].
        $this->onIndexingDocument($document, $request);

        return $client->index($request['index']);
    }


    /**
     * Update an already indexed document.
     *
     * When the client reports that the entry does not exist
     * (Missing404Exception), the document is indexed as a new entry through
     * indexDocumentNew() instead.
     *
     * @param Client $client
     * @param IndexDocument $document
     *
     * @return array the value returned by $client->update(), or by
     *               indexDocumentNew() on fallback
     * @throws ConfigurationException
     * @throws AccessIsEmptyException
     */
    public function indexDocumentUpdate(Client $client, IndexDocument $document) {
        $request = [
            'index' => [
                'index' => $this->configService->getElasticIndex(),
                'id'    => $document->getProviderId() . ':' . $document->getId(),
                'type'  => 'standard',
                // Update requests wrap the fields in a 'doc' entry.
                'body'  => ['doc' => $this->generateIndexBody($document)]
            ]
        ];

        $this->onIndexingDocument($document, $request);

        try {
            return $client->update($request['index']);
        } catch (Missing404Exception $e) {
            return $this->indexDocumentNew($client, $document);
        }
    }


    /**
     * Remove the entry "<providerId>:<documentId>" from the index.
     *
     * @param Client $client
     * @param string $providerId
     * @param string|int $documentId
     *
     * @throws ConfigurationException
     */
    public function indexDocumentRemove(Client $client, $providerId, $documentId) {
        $request = [
            'index' => [
                'index' => $this->configService->getElasticIndex(),
                'id'    => $providerId . ':' . $documentId,
                'type'  => 'standard'
            ]
        ];

        try {
            $client->delete($request['index']);
        } catch (Missing404Exception $e) {
            // Deliberately ignored: the entry is already absent from the index.
        }
    }


    /**
     * Hook applied to an index/update request before it is sent.
     *
     * @param IndexDocument $document
     * @param array $arr request wrapper, modified in place: when
     *                   $document->isContentEncoded() equals
     *                   IndexDocument::ENCODED_BASE64, $arr['index']['pipeline']
     *                   is set to 'attachment'
     */
    public function onIndexingDocument(IndexDocument $document, &$arr) {
        if ($document->isContentEncoded() !== IndexDocument::ENCODED_BASE64) {
            return;
        }

        $arr['index']['pipeline'] = 'attachment';
    }


    /**
     * Build the field set stored for a document.
     *
     * The result is the document's info entries (getInfoAll()) merged with the
     * access, tags, hash, provider, source, title and parts fields; on key
     * collision the latter win. 'content' is only present when the document
     * has a non-null content.
     *
     * @param IndexDocument $document
     *
     * @return array
     * @throws AccessIsEmptyException when the document has no access object
     */
    public function generateIndexBody(IndexDocument $document) {
        $access = $document->getAccess();
        if ($access === null) {
            throw new AccessIsEmptyException('DocumentAccess is Empty');
        }

        $body = [
            'owner'    => $access->getOwnerId(),
            'users'    => $access->getUsers(),
            'groups'   => $access->getGroups(),
            'circles'  => $access->getCircles(),
            'tags'     => $document->getTags(),
            'hash'     => $document->getHash(),
            'provider' => $document->getProviderId(),
            'source'   => $document->getSource(),
            'title'    => $document->getTitle(),
            'parts'    => $document->getParts()
        ];

        $content = $document->getContent();
        if ($content !== null) {
            $body['content'] = $content;
        }

        return array_merge($document->getInfoAll(), $body);
    }


    /**
     * Build the parameters describing the index.
     *
     * @param bool $complete when false, only the 'index' name is returned;
     *                       otherwise a 'body' with the analysis settings and
     *                       the 'standard' type mappings is added
     *
     * @return array<string,mixed>
     * @throws ConfigurationException
     */
    public function generateGlobalMap($complete = true) {
        $params = [
            'index' => $this->configService->getElasticIndex()
        ];

        if ($complete === false) {
            return $params;
        }

        // Word list shared by the two negation char filters below.
        $negations = 'never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt'
            . '|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint';

        $params['body'] = [
            'settings' => [
                'analysis' => [
                    'filter'      => [
                        'shingle' => [
                            'type' => 'shingle'
                        ]
                    ],
                    'char_filter' => [
                        // Prefixes the word preceding a negation with '~'.
                        'pre_negs'  => [
                            'type'        => 'pattern_replace',
                            'pattern'     => '(\\w+)\\s+((?i:' . $negations . '))\\b',
                            'replacement' => '~$1 $2'
                        ],
                        // Prefixes the word following a negation with '~'.
                        'post_negs' => [
                            'type'        => 'pattern_replace',
                            'pattern'     => '\\b((?i:' . $negations . '))\\s+(\\w+)',
                            'replacement' => '$1 ~$2'
                        ]
                    ],
                    'analyzer'    => [
                        'analyzer' => [
                            'type'      => 'custom',
                            'tokenizer' => $this->configService->getAppValue(
                                ConfigService::ANALYZER_TOKENIZER
                            ),
                            'filter'    => ['lowercase', 'stop', 'kstem']
                        ]
                    ]
                ]
            ],
            'mappings' => [
                'standard' => [
                    'dynamic'    => true,
                    'properties' => [
                        'source'   => [
                            'type' => 'keyword'
                        ],
                        'title'    => [
                            'type'        => 'text',
                            'analyzer'    => 'keyword',
                            'term_vector' => 'yes',
                            'copy_to'     => 'combined'
                        ],
                        'provider' => [
                            'type' => 'keyword'
                        ],
                        'tags'     => [
                            'type' => 'keyword'
                        ],
                        'content'  => [
                            'type'        => 'text',
                            'analyzer'    => 'analyzer',
                            'term_vector' => 'yes',
                            'copy_to'     => 'combined'
                        ],
                        'owner'    => [
                            'type' => 'keyword'
                        ],
                        'users'    => [
                            'type' => 'keyword'
                        ],
                        'groups'   => [
                            'type' => 'keyword'
                        ],
                        'circles'  => [
                            'type' => 'keyword'
                        ],
                        'hash'     => [
                            'type' => 'keyword'
                        ],
                        // Receives 'title' and 'content' through copy_to.
                        'combined' => [
                            'type'        => 'text',
                            'analyzer'    => 'analyzer',
                            'term_vector' => 'yes'
                        ]
                    ]
                ]
            ]
        ];

        return $params;
    }


    /**
     * Build the parameters describing the 'attachment' ingest pipeline.
     *
     * @param bool $complete when false, only the pipeline 'id' is returned;
     *                       otherwise a 'body' with the description and the
     *                       attachment/convert/remove processors is added
     *
     * @return array<string,string|array<string,string|array<int,array<string,array<string,string|int|bool>>>>>
     */
    public function generateGlobalIngest($complete = true) {
        $params = ['id' => 'attachment'];

        if ($complete === false) {
            return $params;
        }

        $params['body'] = [
            'description' => 'attachment',
            'processors'  => [
                [
                    'attachment' => [
                        'field'         => 'content',
                        // -1: no character limit, per the ingest-attachment docs.
                        'indexed_chars' => -1
                    ],
                    'convert'    => [
                        'field'        => 'attachment.content',
                        'type'         => 'string',
                        'target_field' => 'content'
                    ],
                    'remove'     => [
                        'field'          => 'attachment.content',
                        'ignore_failure' => true
                    ]
                ]
            ]
        ];

        return $params;
    }


    /**
     * Build the query parameters matching every 'standard' entry whose
     * 'provider' field matches the given provider id.
     *
     * @param string $providerId
     *
     * @return array<string,mixed>
     * @throws ConfigurationException
     */
    public function generateDeleteQuery($providerId) {
        return [
            'index' => $this->configService->getElasticIndex(),
            'type'  => 'standard',
            'body'  => [
                'query' => [
                    'match' => ['provider' => $providerId]
                ]
            ]
        ];
    }

}
345
|
|
|
|
This check compares the return type specified in the `@return` annotation of a function or method doc comment with the types returned by the function and raises an issue if they mismatch. If the return type contains the type `array`, this check recommends the use of a more specific type like `String[]` or `array<String>`.