Completed
Push — ezp30481_turkish_i_breaks_lega... ( be9c59 )
by
unknown
20:35
created

DoctrineDatabase::indexWords()   B

Complexity

Conditions 5
Paths 5

Size

Total Lines 45

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
nc 5
nop 4
dl 0
loc 45
rs 8.8888
c 0
b 0
f 0
1
<?php
2
/**
3
 * File containing the DoctrineDatabase Content search Gateway class.
4
 *
5
 * @copyright Copyright (C) eZ Systems AS. All rights reserved.
6
 * @license For full copyright and license information view LICENSE file distributed with this source code.
7
 */
8
namespace eZ\Publish\Core\Search\Legacy\Content\WordIndexer\Gateway;
9
10
use eZ\Publish\Core\Search\Legacy\Content\WordIndexer\Gateway;
11
use eZ\Publish\Core\Persistence\Database\DatabaseHandler;
12
use eZ\Publish\Core\Persistence\TransformationProcessor;
13
use eZ\Publish\Core\Search\Legacy\Content\WordIndexer\Repository\SearchIndex;
14
use eZ\Publish\Core\Search\Legacy\Content\FullTextData;
15
use eZ\Publish\SPI\Persistence\Content;
16
use eZ\Publish\SPI\Persistence\Content\Type\Handler as SPITypeHandler;
17
use eZ\Publish\SPI\Search\Field;
18
19
/**
 * WordIndexer gateway implementation using the Doctrine database.
 *
 * Splits the full text values of a content item into words and stores them through the
 * SearchIndex repository: one entry per distinct word, plus one object/word link per
 * word occurrence.
 */
class DoctrineDatabase extends Gateway
{
    /**
     * Max acceptable by any DBMS INT value.
     *
     * Note: 2^31-1 seems to be the most reasonable value that should work in any setup.
     */
    const DB_INT_MAX = 2147483647;

    /**
     * Database handler.
     *
     * Only used in this class to open, commit and roll back transactions.
     *
     * @var \eZ\Publish\Core\Persistence\Database\DatabaseHandler
     * @deprecated Start to use DBAL $connection instead.
     */
    protected $dbHandler;

    /**
     * SPI Content Type Handler.
     *
     * Need this for being able to pick fields that are searchable.
     *
     * @var \eZ\Publish\SPI\Persistence\Content\Type\Handler
     */
    protected $typeHandler;

    /**
     * Transformation processor.
     *
     * Need this for being able to transform text to searchable value.
     *
     * @var \eZ\Publish\Core\Persistence\TransformationProcessor
     */
    protected $transformationProcessor;

    /**
     * LegacySearchService.
     *
     * Need this for queries on ezsearch* tables.
     *
     * @var \eZ\Publish\Core\Search\Legacy\Content\WordIndexer\Repository\SearchIndex
     */
    protected $searchIndex;

    /**
     * Full text search configuration options.
     *
     * The 'commands' key is read by index() and passed to the transformation processor.
     *
     * @var array
     */
    protected $fullTextSearchConfiguration;

    /**
     * Construct from database handler and collaborators.
     *
     * @param \eZ\Publish\Core\Persistence\Database\DatabaseHandler $dbHandler
     * @param \eZ\Publish\SPI\Persistence\Content\Type\Handler $typeHandler
     * @param \eZ\Publish\Core\Persistence\TransformationProcessor $transformationProcessor
     * @param \eZ\Publish\Core\Search\Legacy\Content\WordIndexer\Repository\SearchIndex $searchIndex
     * @param array $fullTextSearchConfiguration
     */
    public function __construct(
        DatabaseHandler $dbHandler,
        SPITypeHandler $typeHandler,
        TransformationProcessor $transformationProcessor,
        SearchIndex $searchIndex,
        array $fullTextSearchConfiguration
    ) {
        $this->dbHandler = $dbHandler;
        $this->typeHandler = $typeHandler;
        $this->transformationProcessor = $transformationProcessor;
        $this->searchIndex = $searchIndex;
        $this->fullTextSearchConfiguration = $fullTextSearchConfiguration;
    }

    /**
     * Index search engine full text data corresponding to content object field values.
     *
     * Any previously indexed data for the content is removed first, then every value is
     * transformed, split into words and linked to the content in chunks of 1000 entries.
     *
     * Ported from the legacy code
     * @see https://github.com/ezsystems/ezpublish-legacy/blob/master/kernel/search/plugins/ezsearchengine/ezsearchengine.php#L45
     *
     * @param \eZ\Publish\Core\Search\Legacy\Content\FullTextData $fullTextData
     *
     * @throws \Exception Anything thrown while writing the links is rethrown after the
     *                    transaction opened by this method has been rolled back.
     */
    public function index(FullTextData $fullTextData)
    {
        $indexArray = [];
        $indexArrayOnlyWords = [];

        // Remove previously indexed content if exists to avoid keeping in index removed field values
        $this->remove($fullTextData->id);

        foreach ($fullTextData->values as $fullTextValue) {
            /** @var \eZ\Publish\Core\Search\Legacy\Content\FullTextValue $fullTextValue */
            // Numeric values are additionally stored as integers; anything that is not
            // numeric, or exceeds DB_INT_MAX, is stored as 0.
            $integerValue = 0;
            if (is_numeric(trim($fullTextValue->value))) {
                $integerValue = (int)$fullTextValue->value;
                if ($integerValue > self::DB_INT_MAX) {
                    $integerValue = 0;
                }
            }

            $text = $this->transformationProcessor->transform(
                $fullTextValue->value,
                $this->fullTextSearchConfiguration['commands']
            );

            // split by non-words
            $wordArray = preg_split('/\W/u', $text, -1, PREG_SPLIT_NO_EMPTY);
            foreach ($wordArray as $word) {
                if (trim($word) === '') {
                    continue;
                }

                // words stored in search index are limited to 150 characters
                if (mb_strlen($word) > 150) {
                    $word = mb_substr($word, 0, 150);
                }

                $entry = [
                    'Word' => $word,
                    'ContentClassAttributeID' => $fullTextValue->fieldDefinitionId,
                    'identifier' => $fullTextValue->fieldDefinitionIdentifier,
                    'integer_value' => $integerValue,
                ];
                $indexArray[] = $entry;
                $indexArrayOnlyWords[$word] = 1;

                // if we have "www." before word than
                // treat it as url and add additional entry to the index
                // NOTE(review): the split pattern above already breaks text on ".", so a
                // word starting with "www." does not look reachable here - confirm before
                // relying on this branch.
                if (mb_strtolower(mb_substr($word, 0, 4)) === 'www.') {
                    // Character-based, like the prefix test above.
                    $additionalUrlWord = mb_substr($word, 4);
                    $entry['Word'] = $additionalUrlWord;
                    $indexArray[] = $entry;
                    $indexArrayOnlyWords[$additionalUrlWord] = 1;
                }
            }
        }

        $wordCount = count($indexArray);
        $wordIDArray = $this->buildWordIDArray(array_keys($indexArrayOnlyWords));
        $placement = 0;

        $this->dbHandler->beginTransaction();
        try {
            // Link the words to the content in chunks of 1000 entries; the placement
            // counter is carried over from one chunk to the next.
            for ($arrayCount = 0; $arrayCount < $wordCount; $arrayCount += 1000) {
                $placement = $this->indexWords(
                    $fullTextData,
                    array_slice($indexArray, $arrayCount, 1000),
                    $wordIDArray,
                    $placement
                );
            }
            $this->dbHandler->commit();
        } catch (\Exception $e) {
            // Do not leave the transaction open when writing a link fails.
            $this->dbHandler->rollback();
            throw $e;
        }
    }

    /**
     * Indexes an array of FullTextData objects.
     *
     * Note: on large amounts of data make sure to iterate with several calls to this function with
     * a limited set of FullTextData objects. Amount you have memory for depends on server, size
     * of FullTextData objects & PHP version.
     *
     * @param \eZ\Publish\Core\Search\Legacy\Content\FullTextData[] $fullTextBulkData
     */
    public function bulkIndex(array $fullTextBulkData)
    {
        foreach ($fullTextBulkData as $fullTextData) {
            $this->index($fullTextData);
        }
    }

    /**
     * Remove whole content or a specific version from index.
     *
     * Decrements the object count of every word linked to the content, deletes words
     * left without objects and finally deletes the content's object/word links.
     * Nothing is written when the content has no indexed words.
     *
     * Ported from the legacy code
     * @see https://github.com/ezsystems/ezpublish-legacy/blob/master/kernel/search/plugins/ezsearchengine/ezsearchengine.php#L386
     *
     * @param mixed $contentId
     * @param mixed|null $versionId Not used by this implementation: the whole content is always removed.
     *
     * @return bool Always true.
     *
     * @throws \Exception Anything thrown by the search index is rethrown after the
     *                    transaction opened by this method has been rolled back.
     */
    public function remove($contentId, $versionId = null)
    {
        $this->dbHandler->beginTransaction();
        try {
            // fetch all the words and decrease the object count on all the words
            $wordIDList = $this->searchIndex->getContentObjectWords($contentId);
            if (count($wordIDList) > 0) {
                $this->searchIndex->decrementWordObjectCount($wordIDList);
                $this->searchIndex->deleteWordsWithoutObjects();
                $this->searchIndex->deleteObjectWordsLink($contentId);
            }
            $this->dbHandler->commit();
        } catch (\Exception $e) {
            // Do not leave the transaction open when a search index call fails.
            $this->dbHandler->rollback();
            throw $e;
        }

        return true;
    }

    /**
     * Remove entire search index.
     */
    public function purgeIndex()
    {
        $this->searchIndex->purge();
    }

    /**
     * Index wordIndex.
     *
     * Adds one object/word link per entry of $indexArray, recording the IDs of the
     * previous and next word and a running placement counter. Entries whose word, or
     * whose following word, has no (or a null) ID in $wordIDArray are skipped.
     *
     * Ported from the legacy code
     *
     * @see https://github.com/ezsystems/ezpublish-legacy/blob/master/kernel/search/plugins/ezsearchengine/ezsearchengine.php#L255
     *
     * @param \eZ\Publish\Core\Search\Legacy\Content\FullTextData $fullTextData
     * @param array $indexArray List of entries with the keys 'Word', 'ContentClassAttributeID',
     *                          'identifier' and 'integer_value'
     * @param array $wordIDArray Map of word => word ID
     * @param int $placement Placement to assign to the first link written
     *
     * @return int last placement
     */
    private function indexWords(FullTextData $fullTextData, array $indexArray, array $wordIDArray, $placement = 0)
    {
        $contentId = $fullTextData->id;
        // The first entry of every call has no predecessor.
        $prevWordId = 0;
        // Computed once: the array does not change while iterating.
        $entryCount = count($indexArray);

        for ($i = 0; $i < $entryCount; ++$i) {
            $entry = $indexArray[$i];
            $indexWord = $entry['Word'];
            // Guarded lookup: a word without an ID is skipped below instead of raising
            // an undefined index notice.
            $wordId = isset($wordIDArray[$indexWord]) ? $wordIDArray[$indexWord] : null;

            // 0 marks "no next word" for the last entry.
            $nextWordId = 0;
            if (isset($indexArray[$i + 1])) {
                $nextIndexWord = $indexArray[$i + 1]['Word'];
                $nextWordId = isset($wordIDArray[$nextIndexWord]) ? $wordIDArray[$nextIndexWord] : null;
            }

            if ($wordId === null || $nextWordId === null) {
                continue;
            }

            $frequency = 0;
            $this->searchIndex->addObjectWordLink(
                $wordId,
                $contentId,
                $frequency,
                $placement,
                $nextWordId,
                $prevWordId,
                $fullTextData->contentTypeId,
                $entry['ContentClassAttributeID'],
                $fullTextData->published,
                $fullTextData->sectionId,
                $entry['identifier'],
                $entry['integer_value']
            );
            $prevWordId = $wordId;
            ++$placement;
        }

        return $placement;
    }

    /**
     * Build WordIDArray and update ezsearch_word table.
     *
     * Works in chunks of 500 words: words already known to the search index get their
     * object count incremented, unknown words are added and then read back to obtain
     * their IDs.
     *
     * Ported from the legacy code
     *
     * @see https://github.com/ezsystems/ezpublish-legacy/blob/master/kernel/search/plugins/ezsearchengine/ezsearchengine.php#L155
     *
     * @param array $indexArrayOnlyWords words for object to add
     *
     * @return array wordIDArray Map of word => word ID
     *
     * @throws \Exception Anything thrown by the search index is rethrown after the
     *                    transaction opened by this method has been rolled back.
     */
    private function buildWordIDArray(array $indexArrayOnlyWords)
    {
        $wordCount = count($indexArrayOnlyWords);
        $wordArray = [];

        // store the words in the index and remember the ID
        $this->dbHandler->beginTransaction();
        try {
            for ($arrayCount = 0; $arrayCount < $wordCount; $arrayCount += 500) {
                // Fetch already indexed words from database
                $wordArrayChunk = array_slice($indexArrayOnlyWords, $arrayCount, 500);
                $wordRes = $this->searchIndex->getWords($wordArrayChunk);

                // Build a hash of the existing words. The ID list is rebuilt for every
                // chunk so that IDs collected for earlier chunks are not incremented again.
                $existingWordIds = [];
                $existingWordArray = [];
                foreach ($wordRes as $wordRow) {
                    $existingWordIds[] = $wordRow['id'];
                    $existingWordArray[] = $wordRow['word'];
                    $wordArray[$wordRow['word']] = $wordRow['id'];
                }

                // Update the object count of existing words by one
                if (count($existingWordIds) > 0) {
                    $this->searchIndex->incrementWordObjectCount($existingWordIds);
                }

                // Insert if there are any new words
                $newWordArray = array_diff($wordArrayChunk, $existingWordArray);
                if (count($newWordArray) > 0) {
                    $this->searchIndex->addWords($newWordArray);
                    $newWordRes = $this->searchIndex->getWords($newWordArray);
                    foreach ($newWordRes as $newWordRow) {
                        // Newly added words are keyed by their "lowercase" transformation
                        // rather than by the word as read back from the index.
                        $wordLowercase = $this->transformationProcessor->transformByGroup(
                            $newWordRow['word'],
                            'lowercase'
                        );
                        $wordArray[$wordLowercase] = $newWordRow['id'];
                    }
                }
            }
            $this->dbHandler->commit();
        } catch (\Exception $e) {
            // Do not leave the transaction open when a search index call fails.
            $this->dbHandler->rollback();
            throw $e;
        }

        return $wordArray;
    }
}
338