SEARCHENGINE::get_spelling_correction()   A
last analyzed

Complexity

Conditions 3
Paths 2

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 3
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 5
rs 10
ccs 0
cts 4
cp 0
crap 12
1
<?php
2
3
# vim:sw=4:ts=4:et:nowrap
4
5
/*
6
SEARCHENGINE class 2004-05-26
7
[email protected]
8
9
Example usage:
10
11
        include_once INCLUDESPATH."easyparliament/searchengine.php";
12
13
        $searchengine = new SEARCHENGINE($searchstring);
14
        $description = $searchengine->query_description();
15
        $short_description = $searchengine->query_description_short();
16
17
        $count = $searchengine->run_count($first_result, $results_per_page);
18
19
        // $first_result begins at 0
20
        $searchengine->run_search($first_result, $results_per_page);
21
        $gids = $searchengine->get_gids();
22
        $relevances = $searchengine->get_relevances();
23
24
        $bestpos = $searchengine->position_of_first_word($body);
25
        $extract = $searchengine->highlight($extract);
26
27
*/
28
29
// Load the Xapian PHP bindings, but only when a search database is configured.
// A missing bindings file is reported through the debug log rather than failing here.
if (defined('XAPIANDB') && XAPIANDB != '') {
    if (!file_exists('/usr/share/php/xapian.php')) {
        twfy_debug('SEARCH', '/usr/share/php/xapian.php does not exist');
    } else {
        include_once '/usr/share/php/xapian.php';
    }
}
36
37
class SEARCHENGINE {
38
    /** @var bool Set to true at the end of the constructor once the query has been parsed. */
    public $valid = false;
    /** @var string|null Parser error message (passed through _htmlspecialchars()) for a bad query. */
    public $error;

    // The properties below used to be created dynamically by the methods of
    // this class. They are declared explicitly (dynamic properties are
    // deprecated as of PHP 8.2) and deliberately left without defaults so that
    // the isset() checks in the constructor keep behaving as before.

    /** @var string Query string, rewritten by the constructor into Xapian syntax. */
    public $query;
    /** @var XapianStem English stemmer. */
    public $stemmer;
    /** @var XapianEnquire */
    public $enquire;
    /** @var XapianQueryParser */
    public $queryparser;
    /** @var XapianDateValueRangeProcessor */
    public $datevaluerange;
    /** @var string Regex character-class body for characters treated as part of a word. */
    public $wordchars;
    /** @var string As $wordchars, but without ',', '.' and ':'. */
    public $wordcharsnodigit;
    /** @var string[] Plain (unquoted, unprefixed) lower-cased search words. */
    public $words;
    /** @var string[][] Quoted phrases, each as a list of lower-cased words. */
    public $phrases;
    /** @var array[] Prefixed items handled outside Xapian, as [name, value] pairs. */
    public $prefixed;
    /** @var string|null Spelling-corrected query string reported by the query parser. */
    public $corrected;
    /** @var string Human-readable description of the parsed query. */
    public $query_desc;
    /** @var mixed Match set returned by XapianEnquire::get_mset() in run_count(). */
    public $matches;
    /** @var string[] GIDs of the current results, filled by run_search(). */
    public $gids;
    /** @var array Creation values of the current results (only for 'created' sort order). */
    public $created;
    /** @var array Collapse counts of the current results. */
    public $collapsed;
    /** @var array Relevance percentages of the current results. */
    public $relevances;
40
41
    /**
     * Parse a search string and prepare the Xapian query for it.
     *
     * Does nothing when XAPIANDB is not configured. Otherwise it:
     *  - opens (or reuses, via the global $xapiandb) the Xapian database;
     *  - splits the query into plain words, quoted phrases and prefixed items
     *    (section:, groupby:, from:, to:, ...) and rewrites it into Xapian syntax;
     *  - parses it once for spelling correction and once more with the
     *    "exclude the other language" filter applied;
     *  - turns the parsed query back into a readable description ($this->query_desc).
     *
     * On a parse error $this->error is set and $this->valid stays false.
     *
     * @param string $query Raw search string.
     * @param string $lang  'cy' (or a LANGUAGE constant of 'cy') makes section:wales
     *                      map to major 11 and excludes major 10 by default;
     *                      anything else maps to major 10 and excludes major 11.
     */
    public function __construct($query, $lang = '') {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return;
        }

        global $xapiandb, $PAGE;
        if (!$xapiandb) {
            if (strstr(XAPIANDB, ":")) {
                // A "host:port" setting selects the remote backend
                [$xapian_host, $xapian_port] = explode(":", XAPIANDB);
                twfy_debug("SEARCH", "Using Xapian remote backend: " . $xapian_host . " port " . $xapian_port);
                $xapiandb_remote = remote_open($xapian_host, intval($xapian_port));
                $xapiandb = new XapianDatabase($xapiandb_remote);
            } else {
                $xapiandb = new XapianDatabase(XAPIANDB);
            }
        }
        $this->query = $query;
        if (!isset($this->stemmer)) {
            $this->stemmer = new XapianStem('english');
        }
        if (!isset($this->enquire)) {
            $this->enquire = new XapianEnquire($xapiandb);
        }
        if (!isset($this->queryparser)) {
            $this->queryparser = new XapianQueryParser();
            $this->datevaluerange = new XapianDateValueRangeProcessor(1);
            $this->queryparser->set_stemmer($this->stemmer);
            $this->queryparser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
            $this->queryparser->set_database($xapiandb);
            $this->queryparser->set_default_op(Query_OP_AND);
            $this->queryparser->add_boolean_prefix('speaker', 'S');
            $this->queryparser->add_boolean_prefix('major', 'M');
            $this->queryparser->add_boolean_prefix('date', 'D');
            $this->queryparser->add_boolean_prefix('batch', 'B');
            $this->queryparser->add_boolean_prefix('segment', 'U');
            $this->queryparser->add_boolean_prefix('department', 'G');
            $this->queryparser->add_boolean_prefix('party', 'P');
            $this->queryparser->add_boolean_prefix('column', 'C');
            $this->queryparser->add_boolean_prefix('gid', 'Q');
            $this->queryparser->add_valuerangeprocessor($this->datevaluerange);
        }

        # Force words to lower case
        $this->query = preg_replace_callback('#(department|party):.+?\b#i', function ($m) {
            return strtolower($m[0]);
        }, $this->query);

        // Any characters other than this are treated as, basically, white space
        // (apart from quotes and minuses, special case below)
        // The colon is in here for prefixes speaker:10043 and so on.
        $this->wordchars = "A-Za-z0-9,.'&:_\x80-\xbf\xc2-\xf4";
        $this->wordcharsnodigit = "A-Za-z0-9'&_\x80-\xbf\xc2-\xf4";

        // An array of normal words.
        $this->words = [];
        // All quoted phrases, as an (array of (arrays of words in each phrase)).
        $this->phrases = [];
        // Items prefixed with a colon (speaker:10024) as an (array of (name, value))
        $this->prefixed = [];

        // Split words up into individual words, and quoted phrases
        preg_match_all('/(' .
            '"|' . # match either a quote, or...
            '(?:(?<![' . $this->wordchars . '])-)?' . # optionally a - (exclude)
            # if at start of word (i.e. not preceded by a word character, in
            # which case it is probably a hyphenated-word)
            '[' . $this->wordchars . ']+' . # followed by a string of word-characters
            ')/', $this->query, $all_words);
        $all_words = $all_words[0] ?? [];

        // section: keywords and the "major" number(s) they stand for.
        // 'wales' depends on the language and is handled separately below.
        $sections = [
            'debates' => 1, 'debate' => 1,
            'whall' => 2, 'westminster' => 2, 'westminhall' => 2,
            'wrans' => 3, 'wran' => 3,
            'wms' => 4, 'statements' => 4, 'statement' => 4,
            'lordsdebates' => 101, 'lords' => 101,
            'ni' => 5, 'nidebates' => 5,
            'pbc' => 6, 'standing' => 6,
            'sp' => 7,
            'spwrans' => 8, 'spwran' => 8,
            'lmqs' => 9,
            'uk' => [1, 2, 3, 4, 6, 101],
            'scotland' => [7, 8],
            'future' => 'F',
        ];

        $in_quote = false;
        $from = '';
        $to = '';
        foreach ($all_words as $word) {
            if ($word == '"') {
                $in_quote = !$in_quote;
                if ($in_quote) {
                    array_push($this->phrases, []);
                }
                // Closing quote straight after the opening one: discard the empty phrase
                if (!$in_quote && !count($this->phrases[count($this->phrases) - 1])) {
                    array_pop($this->phrases);
                }
                continue;
            }
            if ($word == '') {
                continue;
            }

            if (strpos($word, ':') !== false) {
                $items = explode(":", strtolower($word));
                $type = $items[0];
                if (substr($type, 0, 1) == '-') {
                    $type = substr($type, 1);
                }
                $value = strtolower(join(":", array_slice($items, 1)));
                if ($type == 'section') {
                    if ($value == 'wales') {
                        $newv = ($lang == 'cy' || LANGUAGE == 'cy') ? 11 : 10;
                    } else {
                        // Unknown section names are passed through unchanged
                        $newv = $sections[$value] ?? $value;
                    }
                    if (is_array($newv)) {
                        $newv = 'major:' . join(' major:', $newv);
                    } else {
                        $newv = "major:$newv";
                    }
                    $this->query = str_ireplace("$type:$value", $newv, $this->query);
                } elseif ($type == 'groupby') {
                    $newv = $value;
                    if ($value == 'debates' || $value == 'debate') {
                        $newv = 'debate';
                    }
                    if ($value == 'speech' || $value == 'speeches') {
                        $newv = 'speech';
                    }
                    // groupby is not a Xapian term: remove it from the query and remember it
                    $this->query = str_ireplace("$type:$value", '', $this->query);
                    array_push($this->prefixed, [$type, $newv]);
                } elseif ($type == 'from') {
                    $from = $value;
                } elseif ($type == 'to') {
                    $to = $value;
                }
            } elseif (strpos($word, '-') !== false) {
                // excluded or hyphenated word: not recorded for highlighting
            } elseif ($in_quote) {
                array_push($this->phrases[count($this->phrases) - 1], strtolower($word));
            } elseif (strpos($word, '..') !== false) {
                // range expression: not a highlightable word
            } elseif ($word == 'OR' || $word == 'AND' || $word == 'XOR' || $word == 'NEAR') {
                // boolean operator: not a highlightable word
            } else {
                array_push($this->words, strtolower($word));
            }
        }
        // An unterminated quote with nothing after it leaves an empty phrase
        // behind; drop it so later phrase matching never sees an empty phrase.
        if ($this->phrases && !count($this->phrases[count($this->phrases) - 1])) {
            array_pop($this->phrases);
        }

        // Turn from:/to: into a Xapian date range, defaulting the missing end
        if ($from && $to) {
            $this->query = str_ireplace("from:$from", '', $this->query);
            $this->query = str_ireplace("to:$to", '', $this->query);
            $this->query .= " $from..$to";
        } elseif ($from) {
            $this->query = str_ireplace("from:$from", '', $this->query);
            $this->query .= " $from.." . date('Ymd');
        } elseif ($to) {
            $this->query = str_ireplace("to:$to", '', $this->query);
            $this->query .= " 19990101..$to";
        }

        # Merged people
        $db = new ParlDB();
        $merged = $db->query('SELECT * FROM gidredirect WHERE gid_from LIKE :gid_from', [':gid_from' => "uk.org.publicwhip/person/%"]);
        foreach ($merged as $row) {
            $from_id = str_replace('uk.org.publicwhip/person/', '', $row['gid_from']);
            $to_id = str_replace('uk.org.publicwhip/person/', '', $row['gid_to']);
            // The trailing \b stops e.g. speaker:100 from matching the start of speaker:1000
            $speaker_re = '#speaker:(?:' . preg_quote($from_id, '#') . '|' . preg_quote($to_id, '#') . ')\b#i';
            $this->query = preg_replace($speaker_re, "(speaker:$from_id OR speaker:$to_id)", $this->query);
        }

        twfy_debug("SEARCH", "prefixed: " . var_export($this->prefixed, true));

        twfy_debug("SEARCH", "query -- " . $this->query);
        $flags = XapianQueryParser::FLAG_BOOLEAN | XapianQueryParser::FLAG_LOVEHATE |
            XapianQueryParser::FLAG_WILDCARD | XapianQueryParser::FLAG_SPELLING_CORRECTION;
        $flags = $flags | XapianQueryParser::FLAG_PHRASE;

        # Without Welsh handling first, for spelling correction
        try {
            $this->queryparser->parse_query($this->query, $flags);
            $this->corrected = $this->queryparser->get_corrected_query_string();
        } catch (Exception $e) {
            # Nothing we can really do with a bad query
            $this->error = _htmlspecialchars($e->getMessage());

            return;
        }

        # Now stick in an 'exclude other language' if needed
        if (!preg_match('#major:#', $this->query)) {
            if ($lang == 'cy' || LANGUAGE == 'cy') {
                $this->query = "($this->query) -major:10";
            } else {
                $this->query = "($this->query) -major:11";
            }
        }

        try {
            $query = $this->queryparser->parse_query($this->query, $flags);
        } catch (Exception $e) {
            # Nothing we can really do with a bad query
            $this->error = _htmlspecialchars($e->getMessage());

            return;
        }

        $this->enquire->set_query($query);

        # Now parse the parsed query back into a query string, yummy

        $qd = $query->get_description();
        twfy_debug("SEARCH", "queryparser original description -- " . $qd);
        $qd = substr($qd, 6, -1); # Strip "Query()" around description
        $qd = preg_replace('#@[0-9]+#', '', $qd); # Strip position variable
        $qd = preg_replace('#^\((.*) AND_NOT M1[01]\)$#', '$1', $qd); # Strip Welsh handling
        # Date range
        $qd = preg_replace_callback('#VALUE_RANGE 1 (\d+) (\d+)#', function ($m) {
            return preg_replace("#(\d{4})(\d\d)(\d\d)#", '$3/$2/$1', $m[1])
                . ".." . preg_replace("#(\d{4})(\d\d)(\d\d)#", '$3/$2/$1', $m[2]);
        }, $qd);
        # Replace phrases with the phrase in quotes
        preg_match_all('#\(([^(]*? PHRASE [^(]*?)\)#', $qd, $m);
        foreach ($m[1] as $phrase) {
            $phrase_new = preg_replace('# PHRASE \d+#', '', $phrase);
            $qd = str_replace("($phrase)", '"' . $phrase_new . '"', $qd);
        }
        preg_match_all('#\(([^(]*? NEAR [^(]*?)\)#', $qd, $m);
        foreach ($m[1] as $mm) {
            $mmn = preg_replace('# NEAR \d+ #', ' NEAR ', $mm);
            $qd = str_replace("($mm)", "($mmn)", $qd);
        }
        # Awesome regexes to get rid of superfluous matching brackets
        $qd = preg_replace('/( \( ( (?: (?>[^ ()]+) | (?1) ) (?: [ ](?:AND|OR|XOR|FILTER|NEAR[ ]\d+|PHRASE[ ]\d+)[ ] (?: (?>[^ ()]+) | (?1) ) )*  ) \) ) [ ] (FILTER|AND_NOT)/x', '$2 $3', $qd);
        $qd = preg_replace('/(?:FILTER | 0 [ ] \* ) [ ] ( \( ( (?: (?>[^ ()]+) | (?1) ) (?: [ ](?:AND|OR|XOR)[ ] (?: (?>[^ ()]+) | (?1) ) )*  ) \) )/x', '$2', $qd);
        $qd = preg_replace('/(?:FILTER | 0 [ ] \* ) [ ] ( [^()] )/x', '$1', $qd);
        $qd = str_replace('AND ', '', $qd); # AND is the default
        $qd = preg_replace('/^ ( \( ( (?: (?>[^()]+) | (?1) )* ) \) ) $/x', '$2', $qd);
        # Other prefixes
        $qd = preg_replace('#\bU(\d+)\b#', 'segment:$1', $qd);
        $qd = preg_replace('#\bC(\d+)\b#', 'column:$1', $qd);
        $qd = preg_replace('#\bQ(.*?)\b#', 'gid:$1', $qd);
        $qd = preg_replace_callback('#\bP(.*?)\b#', function ($m) {
            global $parties;
            $pu = ucfirst($m[1]);
            return "party:" . ($parties[$pu] ?? $m[1]);
        }, $qd);
        $qd = preg_replace('#\bD(.*?)\b#', 'date:$1', $qd);
        $qd = preg_replace('#\bG(.*?)\b#', 'department:$1', $qd); # XXX Lookup to show proper name of dept
        if (strstr($qd, 'M1 OR M2 OR M3 OR M4 OR M6 OR M101')) {
            $qd = str_replace('M1 OR M2 OR M3 OR M4 OR M6 OR M101', 'section:uk', $qd);
        } elseif (strstr($qd, 'M7 OR M8')) {
            $qd = str_replace('M7 OR M8', 'section:scotland', $qd);
        }
        $qd = preg_replace_callback('#\bM(\d+)\b#', function ($m) {
            global $hansardmajors;
            $title = $hansardmajors[$m[1]]["title"] ?? $m[1];
            return sprintf(gettext("in the '%s'"), $title);
        }, $qd);
        $qd = preg_replace('#\bMF\b#', gettext('in Future Business'), $qd);

        # Replace stemmed things with their unstemmed terms from the query
        $used = [];
        preg_match_all('#Z[^\s()]+#', $qd, $m);
        foreach ($m[0] as $mm) {
            $iter = $this->queryparser->unstem_begin($mm);
            $end = $this->queryparser->unstem_end($mm);
            // Reset per term so a value from a previous term can never leak in
            $tt = null;
            while (!$iter->equals($end)) {
                $tt = $iter->get_term();
                if (!in_array($tt, $used)) {
                    break;
                }
                $iter->next();
            }
            if ($tt === null) {
                // No unstemmed form is known for this term; leave it as it is
                continue;
            }
            // If every unstemmed form was already used, the last one is reused
            $used[] = $tt;
            $qd = preg_replace('#' . preg_quote($mm, '#') . '#', $tt, $qd, 1);
        }

        # Speakers
        foreach ($merged as $row) {
            $from_id = str_replace('uk.org.publicwhip/person/', '', $row['gid_from']);
            $to_id = str_replace('uk.org.publicwhip/person/', '', $row['gid_to']);
            $qd = str_replace("(S$from_id OR S$to_id)", "S$to_id", $qd);
            $qd = str_replace("S$from_id OR S$to_id", "S$to_id", $qd);
        }

        preg_match_all('#S(\d+)#', $qd, $m);
        foreach ($m[1] as $mm) {
            $member = new MEMBER(['person_id' => $mm]);
            $name = $member->full_name();
            $qd = str_replace("S$mm", "speaker:$name", $qd);
        }

        # Simplify display of excluded words
        $qd = preg_replace('#AND_NOT ([a-z0-9"]+)#', '-$1', $qd);
        preg_match_all('#AND_NOT \((.*?)\)#', $qd, $m);
        foreach ($m[1] as $mm) {
            $mmn = '-' . join(' -', explode(' OR ', $mm));
            $qd = str_replace("AND_NOT ($mm)", $mmn, $qd);
        }

        foreach ($this->prefixed as $items) {
            if ($items[0] == 'groupby') {
                if ($items[1] == 'debate') {
                    $qd .= ' ' . gettext('grouped by debate');
                } elseif ($items[1] == 'speech') {
                    $qd .= ' ' . gettext('showing all speeches');
                } else {
                    $PAGE->error_message("Unknown group by '$items[1]' ignored");
                }
            }
        }

        $this->query_desc = trim($qd);

        twfy_debug("SEARCH", "words: " . var_export($this->words, true));
        twfy_debug("SEARCH", "phrases: " . var_export($this->phrases, true));
        twfy_debug("SEARCH", "queryparser description -- " . $this->query_desc);

        $this->valid = true;
    }
374
375
    /**
     * Shared implementation behind the short and long query descriptions.
     *
     * @param bool $long Accepted for the two public wrappers; the returned
     *                   text does not currently depend on it.
     * @return string '' when search is not configured, '[bad query]' when the
     *                query failed to parse, otherwise the stored description.
     */
    public function query_description_internal($long) {
        $search_configured = defined('XAPIANDB') && XAPIANDB;
        if (!$search_configured) {
            return '';
        }

        return $this->valid ? $this->query_desc : '[bad query]';
    }
385
386
    // Return textual description of search
387
    /**
     * Short form of the query description.
     *
     * @return string See query_description_internal().
     */
    public function query_description_short() {
        $long = false;

        return $this->query_description_internal($long);
    }
390
391
    // Return textual description of search
392
    /**
     * Long form of the query description.
     *
     * @return string See query_description_internal().
     */
    public function query_description_long() {
        $long = true;

        return $this->query_description_internal($long);
    }
395
396
    // Return stem of a word
397
    /**
     * Lower-case a word and run it through the stemmer set up by the constructor.
     *
     * @param string $word
     * @return mixed Whatever the stemmer's apply() returns for the lower-cased word.
     */
    public function stem($word) {
        $lowered = strtolower($word);

        return $this->stemmer->apply($lowered);
    }
400
401
    /**
     * Spelling-corrected query string recorded by the constructor.
     *
     * @return string|null The value stored from get_corrected_query_string(),
     *                     or null when search is not configured or the query
     *                     failed to parse before a correction was recorded.
     */
    public function get_spelling_correction() {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return null;
        }

        // The constructor bails out before assigning this when parsing throws,
        // so guard against reading a property that was never set.
        return $this->corrected ?? null;
    }
407
408
    // Perform partial query to get a count of number of matches
409
    /**
     * Configure sorting and collapsing, fetch the match set for the requested
     * page and return the estimated total number of matches.
     *
     * The match set is kept in $this->matches for run_search() to read.
     *
     * @param int    $first_result     Offset of the first result (0-based).
     * @param int    $results_per_page Number of results to fetch.
     * @param string $sort_order       'date'/'newest', 'oldest' or 'created';
     *                                 anything else leaves the default relevance order.
     * @return mixed|null Estimated match count, or null when search is not configured.
     */
    public function run_count($first_result, $results_per_page, $sort_order = 'relevance') {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return null;
        }

        $started_at = getmicrotime();

        // Any other sort order keeps the default ordering, which is by relevance
        if ($sort_order == 'date' || $sort_order == 'newest') {
            $this->enquire->set_sort_by_value(0, true);
        } elseif ($sort_order == 'oldest') {
            $this->enquire->set_sort_by_value(0, false);
        } elseif ($sort_order == 'created') {
            $this->enquire->set_sort_by_value(2, false);
        }

        // Set collapsing: speaker/segment searches and explicit groupby: items
        // override the default grouping below
        global $PAGE;
        $collapsed = (bool) preg_match('#(speaker|segment):\d+#', $this->query);
        foreach ($this->prefixed as $prefix) {
            if ($prefix[0] != 'groupby') {
                continue;
            }
            $collapsed = true;
            $grouping = $prefix[1];
            if ($grouping == 'debate') {
                $this->enquire->set_collapse_key(3);
            } elseif ($grouping != 'speech') {
                // 'speech' deliberately sets no collapse key; anything else is unknown
                $PAGE->error_message("Unknown group by '$grouping' ignored");
            }
        }

        // default to grouping by subdebate, i.e. by page
        if (!$collapsed) {
            $this->enquire->set_collapse_key(3);
        }

        // Fetch exactly the page we need and use Xapian's estimate as the
        // total count (the estimate is sometimes too large).
        $this->matches = $this->enquire->get_mset($first_result, $results_per_page, 100);
        $count = $this->matches->get_matches_estimated();

        $elapsed = getmicrotime() - $started_at;
        twfy_debug("SEARCH", "Search count took $elapsed seconds.");

        return $count;
    }
490
491
    // Perform the full search...
492
    /**
     * Walk the match set fetched by run_count() and record each result's
     * GID, collapse count and relevance (and creation value for 'created' sort).
     *
     * run_count() must have been called first: the page of results is the
     * one it stored in $this->matches, so $first_result and $results_per_page
     * are not used here. When search is not configured or no match set is
     * available, the result lists are simply left empty.
     *
     * @param int    $first_result     Unused; kept for interface compatibility.
     * @param int    $results_per_page Unused; kept for interface compatibility.
     * @param string $sort_order       'created' additionally fills the created list.
     */
    public function run_search($first_result, $results_per_page, $sort_order = 'relevance') {
        $start = getmicrotime();

        $this->gids = [];
        $this->created = [];
        $this->collapsed = [];
        $this->relevances = [];

        // Same guard as the other public methods, plus protection against
        // run_count() not having produced a match set (previously a fatal
        // "call to a member function on null").
        if (!defined('XAPIANDB') || !XAPIANDB || !isset($this->matches)) {
            return;
        }

        $matches = $this->matches;
        $iter = $matches->begin();
        $end = $matches->end();
        while (!$iter->equals($end)) {
            $relevancy = $iter->get_percent();
            $weight    = $iter->get_weight();
            $collapsed = $iter->get_collapse_count();
            $doc       = $iter->get_document();
            $gid       = $doc->get_data();
            if ($sort_order == 'created') {
                $this->created[] = join('', unpack('N', $doc->get_value(2))); # XXX Needs fixing
            }
            twfy_debug("SEARCH", "gid: $gid relevancy: $relevancy% weight: $weight");
            $this->gids[] = "uk.org.publicwhip/" . $gid;
            $this->collapsed[] = $collapsed;
            $this->relevances[] = $relevancy;
            $iter->next();
        }
        $duration = getmicrotime() - $start;
        twfy_debug("SEARCH", "Run search took $duration seconds.");
    }
521
    // ... use these to get the results
522
    /**
     * GIDs collected by the last run_search() call, each prefixed with
     * "uk.org.publicwhip/".
     *
     * @return string[]
     */
    public function get_gids() {
        return $this->gids;
    }
525
    /**
     * Relevance percentages collected by the last run_search() call, in the
     * same order as get_gids().
     *
     * @return array
     */
    public function get_relevances() {
        return $this->relevances;
    }
528
    /**
     * Creation values collected by the last run_search() call; only filled
     * when that call used the 'created' sort order.
     *
     * @return array
     */
    public function get_createds() {
        return $this->created;
    }
531
532
    // Puts HTML highlighting round all the matching words in the text
533
    /**
     * Wrap every matching search word/phrase in $body in highlight markup.
     *
     * @param string|array $body One piece of text, or an array of them
     *                           (keys are preserved).
     * @return string|array The highlighted text in the same shape as the input;
     *                      returned untouched when search is not configured.
     */
    public function highlight($body) {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return $body;
        }

        // Stem the plain search words once, then reuse them for every text
        $stemmed_words = array_map([$this, 'stem'], $this->words);

        if (!is_array($body)) {
            return $this->highlight_internal($body, $stemmed_words);
        }

        foreach ($body as $key => $text) {
            $body[$key] = $this->highlight_internal($text, $stemmed_words);
        }

        return $body;
    }
549
550
    // HTML entities for <, >, " and &. highlight_internal() swaps these for the
    // upper-case forms before decoding entities, then swaps them back afterwards.
    private $specialchars = ['&lt;', '&gt;', '&quot;', '&amp;'];
551
    // Upper-case stand-ins for the entries of $specialchars, in the same order;
    // used as temporary placeholders by highlight_internal().
    private $specialchars_upper = ['&LT;', '&GT;', '&QUOT;', '&AMP;'];
552
553
    /**
     * Highlight search words and phrases in a single piece of HTML text.
     *
     * Words are matched by stem: each run of word characters in $body is
     * stemmed and compared with $stemmed_words. Quoted phrases from the query
     * are then matched literally (case-insensitively) outside of tags.
     *
     * @param string   $body          Text to highlight.
     * @param string[] $stemmed_words Stems of the plain search words.
     * @return string $body with matches wrapped in <span class="hi">…</span>;
     *                returned untouched when search is not configured.
     */
    public function highlight_internal($body, $stemmed_words) {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return $body;
        }

        # Does html_entity_decode without the htmlspecialchars
        // NOTE(review): the "HTML-ENTITIES" encoding is deprecated in mbstring
        // as of PHP 8.2; left as is because a replacement may decode differently.
        $body = str_replace($this->specialchars, $this->specialchars_upper, $body);
        $body = mb_convert_encoding($body, "UTF-8", "HTML-ENTITIES");
        $body = str_replace($this->specialchars_upper, $this->specialchars, $body);

        // Split into tags, numbers and words, keeping the delimiters so the
        // text can be reassembled unchanged apart from the added markup
        $splitextract = preg_split('/(<[^>]*>|[0-9,.]+|[' . $this->wordcharsnodigit . ']+)/', $body, -1, PREG_SPLIT_DELIM_CAPTURE);
        $hlextract = "";
        foreach ($splitextract as $extractword) {
            // Tags are copied through untouched
            if (preg_match('/^<[^>]*>$/', $extractword)) {
                $hlextract .= $extractword;
                continue;
            }
            // A trailing & (start of an entity) must stay outside the highlight span
            $endswithamp = '';
            if (substr($extractword, -1) == '&') {
                $extractword = substr($extractword, 0, -1);
                $endswithamp = '&';
            }
            $hl = false;
            $matchword = $this->stem($extractword);
            foreach ($stemmed_words as $word) {
                if ($word == '') {
                    continue;
                }
                if ($matchword == $word) {
                    $hl = true;
                    break;
                }
            }
            if ($hl) {
                $hlextract .= "<span class=\"hi\">$extractword</span>$endswithamp";
            } else {
                $hlextract .= $extractword . $endswithamp;
            }
        }
        // Merge adjacent highlights separated only by white space
        $body = preg_replace("#</span>\s+<span class=\"hi\">#", " ", $hlextract);

        // Contents will be used in preg_replace() to highlight the search terms.
        $findwords = [];
        $replacewords = [];

        foreach ($this->phrases as $phrase) {
            if (!$phrase) {
                // An empty phrase would yield a pattern matching at every word boundary
                continue;
            }
            // Phrase words are user input: quote them before building the pattern
            $quoted = array_map(function ($w) {
                return preg_quote($w, '/');
            }, $phrase);
            // implode(separator, array): the reversed legacy order is a TypeError in PHP 8
            $phrasematch = implode('[^' . $this->wordchars . ']+', $quoted);
            // The look-ahead rejects matches that sit inside an HTML tag
            $findwords[] = "/\b($phrasematch)\b(?!(?>[^<>]*>))/i";
            $replacewords[] = "<span class=\"hi\">\\1</span>";
        }

        // Highlight search phrases.
        return preg_replace($findwords, $replacewords, $body);
    }
622
623
    /**
     * Find the position of the first of the search words/phrases in $body.
     *
     * The search runs in four stages and returns as soon as one of them finds
     * something:
     *   1. whole phrases from $this->phrases;
     *   2. tokens of the text whose stem equals the stem of a search word;
     *   3. search words delimited by non-word characters;
     *   4. search words appearing anywhere, even inside a longer word.
     *
     * @param string $body Text to search.
     *
     * @return int Byte offset into the lower-cased, entity-decoded, space-padded
     *             copy of $body, or 0 when nothing matches.
     */
    public function position_of_first_word($body) {
        $lcbody = ' ' . html_entity_decode(strtolower($body)) . ' '; // spaces to make regexp mapping easier
        $wordchars = $this->wordchars;

        // Stage 1: look for phrases.
        $patterns = [];
        foreach ($this->phrases as $phrase) {
            // implode() takes the separator first; the legacy array-first order
            // that join() was called with here is a TypeError on PHP 8.
            $phrasematch = implode('[^' . $wordchars . ']+', $phrase);
            $patterns[] = '/([^' . $wordchars . ']' . $phrasematch . '[^A-Za-z0-9])/';
        }
        $pos = $this->earliest_match_offset($patterns, $lcbody);
        if ($pos != -1) {
            return $pos;
        }

        // Stage 2: compare the stem of every piece of the text with the stemmed
        // search words. Pieces arrive in text order with their real offsets, so
        // the first hit is the earliest one.
        $stemmed_words = array_filter(
            array_map([$this, 'stem'], $this->words),
            function ($stem) {
                return $stem != '';
            }
        );
        if ($stemmed_words) {
            $pieces = preg_split(
                '/([0-9,.]+|[' . $this->wordcharsnodigit . ']+)/',
                $lcbody,
                -1,
                PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE
            );
            if ($pieces === false) {
                $pieces = [];
            }
            foreach ($pieces as [$piece, $offset]) {
                $piece = preg_replace('/&$/', '', $piece);
                if ($piece === null || $piece === '') {
                    continue;
                }
                // Stem once per piece rather than once per search word.
                $matchword = $this->stem($piece);
                foreach ($stemmed_words as $word) {
                    if ($matchword == $word) {
                        return $offset;
                    }
                }
            }
        }

        // Stages 3 and 4 share the same per-word fragment: a digits-only word
        // also matches its thousands-separated form.
        $fragments = [];
        foreach ($this->words as $word) {
            if (ctype_digit($word)) {
                $word = '(?:' . $word . '|' . number_format($word) . ')';
            }
            $fragments[] = $word;
        }

        // Stage 3: whole words, delimited by non-word characters.
        $patterns = [];
        foreach ($fragments as $fragment) {
            $patterns[] = '/([^' . $wordchars . ']' . $fragment . '[^' . $wordchars . '])/';
        }
        $pos = $this->earliest_match_offset($patterns, $lcbody);
        if ($pos != -1) {
            return $pos;
        }

        // Stage 4: only look for something containing the word (ie. something
        // stemmed, but doesn't work all the time) if no whole word was found.
        $patterns = [];
        foreach ($fragments as $fragment) {
            $patterns[] = '/(' . $fragment . ')/';
        }
        $pos = $this->earliest_match_offset($patterns, $lcbody);

        return $pos == -1 ? 0 : $pos;
    }

    /**
     * Return the smallest offset at which any of $patterns matches $subject.
     *
     * The offset is read from the regex engine directly, so a match at offset 0
     * counts as a match.
     *
     * @param string[] $patterns Complete PCRE patterns, delimiters included.
     * @param string   $subject  Text to search.
     *
     * @return int Smallest match offset, or -1 when no pattern matches.
     */
    private function earliest_match_offset($patterns, $subject) {
        $earliest = -1;
        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $subject, $matches, PREG_OFFSET_CAPTURE)) {
                $offset = $matches[0][1];
                if ($earliest == -1 || $offset < $earliest) {
                    $earliest = $offset;
                }
            }
        }

        return $earliest;
    }
708
}
709
710
// Bind $SEARCHENGINE to the global symbol table, so the name refers to the
// global variable even if this file is included from inside a function.
global $SEARCHENGINE;
711
// Start with no search engine object in $SEARCHENGINE.
// NOTE(review): presumably assigned elsewhere once a search is constructed — confirm against callers.
$SEARCHENGINE = null;
712