|
1
|
|
|
<?php |
|
|
|
|
|
|
2
|
|
|
# vim:sw=4:ts=4:et:nowrap |
|
3
|
|
|
|
|
4
|
|
|
/* |
|
|
|
|
|
|
5
|
|
|
SEARCHENGINE class 2004-05-26 |
|
6
|
|
|
[email protected] |
|
7
|
|
|
|
|
8
|
|
|
Example usage: |
|
9
|
|
|
|
|
10
|
|
|
include_once INCLUDESPATH."easyparliament/searchengine.php"; |
|
11
|
|
|
|
|
12
|
|
|
$searchengine = new SEARCHENGINE($searchstring); |
|
13
|
|
|
$description = $searchengine->query_description_long(); |
|
14
|
|
|
$short_description = $searchengine->query_description_short(); |
|
15
|
|
|
|
|
16
|
|
|
$count = $searchengine->run_count($first_result, $results_per_page); |
|
17
|
|
|
|
|
18
|
|
|
// $first_result begins at 0 |
|
19
|
|
|
$searchengine->run_search($first_result, $results_per_page); |
|
20
|
|
|
$gids = $searchengine->get_gids(); |
|
21
|
|
|
$relevances = $searchengine->get_relevances(); |
|
22
|
|
|
|
|
23
|
|
|
$bestpos = $searchengine->position_of_first_word($body); |
|
24
|
|
|
$extract = $searchengine->highlight($extract); |
|
25
|
|
|
|
|
26
|
|
|
*/ |
|
27
|
|
|
|
|
28
|
|
|
// Load the Xapian PHP bindings, but only when a Xapian database has been
// configured. A missing bindings file is logged rather than treated as fatal.
if (defined('XAPIANDB') && XAPIANDB != '') {
    if (!file_exists('/usr/share/php/xapian.php')) {
        twfy_debug('SEARCH', '/usr/share/php/xapian.php does not exist');
    } else {
        include_once '/usr/share/php/xapian.php';
    }
}
|
35
|
|
|
|
|
36
|
|
|
/**
 * Xapian-backed search over Hansard content.
 *
 * The constructor parses the user's query string, rewrites the convenience
 * prefixes (section:, groupby:, from:, to:) into what the Xapian query parser
 * understands, and builds a human readable description of the parsed query.
 * run_count() then fetches the match set and run_search() unpacks it.
 *
 * Every public method is a no-op (returning null, '' or its input) when the
 * XAPIANDB constant is undefined or empty.
 */
class SEARCHENGINE {

    /** True once the query has been parsed successfully. */
    public $valid = false;

    /** HTML-escaped parser error message; set only when parsing fails. */
    public $error;

    /** Query string, after prefix rewriting. */
    public $query;

    /** Human readable description of the parsed query. */
    public $query_desc;

    // Xapian objects created by the constructor.
    public $stemmer;
    public $enquire;
    public $queryparser;
    public $datevaluerange;

    // Any characters other than these are treated as, basically, white space
    // (apart from quotes and minuses, which are special-cased when the query
    // is split). The colon is in here for prefixes such as speaker:10043.
    public $wordchars = "A-Za-z0-9,.'&:_\x80-\xbf\xc2-\xf4";
    public $wordcharsnodigit = "A-Za-z0-9'&_\x80-\xbf\xc2-\xf4";

    /** Plain (unquoted, unprefixed) search words, lower-cased. */
    public $words = array();

    /** Quoted phrases, as an array of arrays of lower-cased words. */
    public $phrases = array();

    /** Prefixed items handled here rather than by Xapian, as array(name, value). */
    public $prefixed = array();

    /** Match set produced by run_count() and consumed by run_search(). */
    public $matches;

    // Parallel result arrays filled in by run_search().
    public $gids;
    public $created;
    public $collapsed;
    public $relevances;

    // The four markup-significant entities and upper-cased stand-ins for them.
    // highlight_internal() swaps them to the upper-case form (which the entity
    // decoder is expected to leave alone - TODO confirm on the deployed PHP
    // version) so they survive decoding, then swaps them back.
    private $specialchars = array('&lt;', '&gt;', '&quot;', '&amp;');
    private $specialchars_upper = array('&LT;', '&GT;', '&QUOT;', '&AMP;');

    /**
     * Parse $query and prepare the Xapian enquire object.
     *
     * On a parse failure $this->error is set and $this->valid stays false.
     *
     * @param string $query Raw search string as typed by the user.
     */
    public function __construct($query) {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return;
        }

        // The database handle is shared between instances via a global.
        global $xapiandb;
        if (!$xapiandb) {
            $xapiandb = $this->open_database();
        }

        $this->query = $query;
        if (!isset($this->stemmer)) {
            $this->stemmer = new XapianStem('english');
        }
        if (!isset($this->enquire)) {
            $this->enquire = new XapianEnquire($xapiandb);
        }
        if (!isset($this->queryparser)) {
            $this->queryparser = new XapianQueryParser();
            $this->datevaluerange = new XapianDateValueRangeProcessor(1);
            $this->queryparser->set_stemmer($this->stemmer);
            $this->queryparser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
            $this->queryparser->set_database($xapiandb);
            $this->queryparser->set_default_op(Query_OP_AND);
            $this->queryparser->add_boolean_prefix('speaker', 'S');
            $this->queryparser->add_boolean_prefix('major', 'M');
            $this->queryparser->add_boolean_prefix('date', 'D');
            $this->queryparser->add_boolean_prefix('batch', 'B');
            $this->queryparser->add_boolean_prefix('segment', 'U');
            $this->queryparser->add_boolean_prefix('department', 'G');
            $this->queryparser->add_boolean_prefix('party', 'P');
            $this->queryparser->add_boolean_prefix('column', 'C');
            $this->queryparser->add_boolean_prefix('gid', 'Q');
            $this->queryparser->add_valuerangeprocessor($this->datevaluerange);
        }

        // Force department:/party: terms to lower case. This used the /e
        // regex modifier, which no longer exists in PHP 7+.
        $this->query = preg_replace_callback(
            '#(department|party):.+?\b#i',
            function ($m) {
                return strtolower($m[0]);
            },
            $this->query
        );

        $this->parse_query_terms();

        // Merged people: searching for either id of a merged person should
        // find speeches recorded under both.
        $db = new ParlDB;
        $merged = $db->query(
            'SELECT * FROM gidredirect WHERE gid_from LIKE :gid_from',
            array(':gid_from' => "uk.org.publicwhip/person/%")
        );
        $merged_people = array();
        for ($n = 0; $n < $merged->rows(); $n++) {
            $merged_people[] = array(
                str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_from')),
                str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_to')),
            );
        }
        foreach ($merged_people as $pair) {
            list($from_id, $to_id) = $pair;
            // The trailing \b stops speaker:10 being rewritten inside speaker:100.
            $ids = preg_quote($from_id, '#') . '|' . preg_quote($to_id, '#');
            $this->query = preg_replace(
                "#speaker:($ids)\\b#i",
                "(speaker:$from_id OR speaker:$to_id)",
                $this->query
            );
        }

        twfy_debug("SEARCH", "prefixed: " . var_export($this->prefixed, true));
        twfy_debug("SEARCH", "query -- ". $this->query);

        $flags = XapianQueryParser::FLAG_BOOLEAN | XapianQueryParser::FLAG_LOVEHATE |
            XapianQueryParser::FLAG_WILDCARD | XapianQueryParser::FLAG_SPELLING_CORRECTION |
            XapianQueryParser::FLAG_PHRASE;
        try {
            $parsed = $this->queryparser->parse_query($this->query, $flags);
        } catch (Exception $e) {
            # Nothing we can really do with a bad query
            $this->error = _htmlspecialchars($e->getMessage());
            return;
        }

        $this->enquire->set_query($parsed);

        $this->query_desc = $this->describe_query($parsed, $merged_people);

        twfy_debug("SEARCH", "words: " . var_export($this->words, true));
        twfy_debug("SEARCH", "phrases: " . var_export($this->phrases, true));
        twfy_debug("SEARCH", "queryparser description -- " . $this->query_desc);

        $this->valid = true;
    }

    /**
     * Open the Xapian database named by XAPIANDB: "host:port" selects the
     * remote backend, anything else is passed to XapianDatabase as is.
     */
    private function open_database() {
        if (strstr(XAPIANDB, ":")) {
            list($xapian_host, $xapian_port) = explode(":", XAPIANDB);
            twfy_debug("SEARCH", "Using Xapian remote backend: " . $xapian_host . " port " . $xapian_port);
            return new XapianDatabase(remote_open($xapian_host, intval($xapian_port)));
        }
        return new XapianDatabase(XAPIANDB);
    }

    /**
     * Split $this->query into words, quoted phrases and locally handled
     * prefixes, rewriting section:, groupby:, from: and to: inside
     * $this->query as it goes.
     */
    private function parse_query_terms() {
        $this->words = array();
        $this->phrases = array();
        $this->prefixed = array();

        // Split the query up into individual words and quote marks.
        preg_match_all('/(' .
            '"|' . # match either a quote, or...
            '(?:(?<![' . $this->wordchars . '])-)?' . # optionally a - (exclude)
            # if at start of word (i.e. not preceded by a word character, in
            # which case it is probably a hyphenated-word)
            '[' . $this->wordchars . ']+' . # followed by a string of word-characters
            ')/', $this->query, $all_words);
        $all_words = $all_words ? $all_words[0] : array();

        $in_quote = false;
        $from = '';
        $to = '';
        foreach ($all_words as $word) {
            if ($word == '"') {
                $in_quote = !$in_quote;
                if ($in_quote) {
                    array_push($this->phrases, array());
                } elseif (!count($this->phrases[count($this->phrases) - 1])) {
                    // Closing quote of a phrase that collected no words.
                    array_pop($this->phrases);
                }
                continue;
            }
            if ($word == '') {
                continue;
            }

            if (strpos($word, ':') !== false) {
                $items = explode(":", strtolower($word));
                $type = $items[0];
                if (substr($type, 0, 1) == '-') {
                    $type = substr($type, 1);
                }
                $value = strtolower(implode(":", array_slice($items, 1)));
                if ($type == 'section') {
                    $this->query = str_ireplace("$type:$value", self::section_to_major_terms($value), $this->query);
                } elseif ($type == 'groupby') {
                    $newv = $value;
                    if ($value == 'debates' || $value == 'debate') {
                        $newv = 'debate';
                    }
                    if ($value == 'speech' || $value == 'speeches') {
                        $newv = 'speech';
                    }
                    $this->query = str_ireplace("$type:$value", '', $this->query);
                    array_push($this->prefixed, array($type, $newv));
                } elseif ($type == 'from') {
                    $from = $value;
                } elseif ($type == 'to') {
                    $to = $value;
                }
            } elseif (strpos($word, '-') !== false) {
                // Hyphenated or excluded term: left for Xapian, never highlighted.
            } elseif ($in_quote) {
                array_push($this->phrases[count($this->phrases) - 1], strtolower($word));
            } elseif (strpos($word, '..') !== false) {
                // Value range: left for Xapian.
            } elseif ($word == 'OR' || $word == 'AND' || $word == 'XOR' || $word == 'NEAR') {
                // Boolean operator: not a search word.
            } else {
                array_push($this->words, strtolower($word));
            }
        }

        // Turn from:/to: into a Xapian value range appended to the query.
        if ($from && $to) {
            $this->query = str_ireplace("from:$from", '', $this->query);
            $this->query = str_ireplace("to:$to", '', $this->query);
            $this->query .= " $from..$to";
        } elseif ($from) {
            $this->query = str_ireplace("from:$from", '', $this->query);
            $this->query .= " $from.." . date('Ymd');
        } elseif ($to) {
            $this->query = str_ireplace("to:$to", '', $this->query);
            $this->query .= " 19990101..$to";
        }
    }

    /**
     * Map a section: value to the equivalent "major:N" term(s); values that
     * are not recognised are passed through as "major:<value>".
     */
    private static function section_to_major_terms($value) {
        $sections = array(
            'debates' => 1, 'debate' => 1,
            'whall' => 2, 'westminster' => 2, 'westminhall' => 2,
            'wrans' => 3, 'wran' => 3,
            'wms' => 4, 'statements' => 4, 'statement' => 4,
            'lordsdebates' => 101, 'lords' => 101,
            'ni' => 5, 'nidebates' => 5,
            'pbc' => 6, 'standing' => 6,
            'sp' => 7,
            'spwrans' => 8, 'spwran' => 8,
            'uk' => array(1, 2, 3, 4, 6, 101),
            'scotland' => array(7, 8),
            'future' => 'F',
        );
        $majors = isset($sections[$value]) ? $sections[$value] : $value;
        if (is_array($majors)) {
            return 'major:' . implode(' major:', $majors);
        }
        return "major:$majors";
    }

    /**
     * Turn Xapian's description of the parsed query back into something
     * resembling a query string, for display.
     *
     * @param object $query         Parsed query returned by parse_query().
     * @param array  $merged_people List of array(from_id, to_id) person merges.
     * @return string
     */
    private function describe_query($query, $merged_people) {
        global $PAGE, $hansardmajors, $parties;

        $qd = $query->get_description();
        twfy_debug("SEARCH", "queryparser original description -- " . $qd);
        $qd = substr($qd, 14, -1); # Strip Xapian::Query()
        $qd = preg_replace('#:\(.*?\)#', '', $qd); # Don't need pos or weight

        # Date range: show YYYYMMDD..YYYYMMDD as DD/MM/YYYY..DD/MM/YYYY
        $qd = preg_replace_callback(
            '#VALUE_RANGE 1 (\d+) (\d+)#',
            function ($m) {
                $ymd = '#(\d{4})(\d\d)(\d\d)#';
                return preg_replace($ymd, '$3/$2/$1', $m[1]) . '..' . preg_replace($ymd, '$3/$2/$1', $m[2]);
            },
            $qd
        );

        # Replace phrases with the phrase in quotes
        preg_match_all('#\(([^(]*? PHRASE [^(]*?)\)#', $qd, $m);
        foreach ($m[1] as $phrase) {
            $phrase_new = preg_replace('# PHRASE \d+#', '', $phrase);
            $qd = str_replace("($phrase)", '"' . $phrase_new . '"', $qd);
        }
        preg_match_all('#\(([^(]*? NEAR [^(]*?)\)#', $qd, $m);
        foreach ($m[1] as $mm) {
            $mmn = preg_replace('# NEAR \d+ #', ' NEAR ', $mm);
            $qd = str_replace("($mm)", "($mmn)", $qd);
        }

        # Awesome regexes to get rid of superfluous matching brackets
        $qd = preg_replace('/( \( ( (?: (?>[^ ()]+) | (?1) ) (?: [ ](?:AND|OR|XOR|FILTER|NEAR[ ]\d+|PHRASE[ ]\d+)[ ] (?: (?>[^ ()]+) | (?1) ) )* ) \) ) [ ] (FILTER|AND_NOT)/x', '$2 $3', $qd);
        $qd = preg_replace('/(?:FILTER | 0 [ ] \* ) [ ] ( \( ( (?: (?>[^ ()]+) | (?1) ) (?: [ ](?:AND|OR|XOR)[ ] (?: (?>[^ ()]+) | (?1) ) )* ) \) )/x', '$2', $qd);
        $qd = preg_replace('/(?:FILTER | 0 [ ] \* ) [ ] ( [^()] )/x', '$1', $qd);
        $qd = str_replace('AND ', '', $qd); # AND is the default
        $qd = preg_replace('/^ ( \( ( (?: (?>[^()]+) | (?1) )* ) \) ) $/x', '$2', $qd);

        # Other prefixes
        $qd = preg_replace('#\bU(\d+)\b#', 'segment:$1', $qd);
        $qd = preg_replace('#\bC(\d+)\b#', 'column:$1', $qd);
        $qd = preg_replace('#\bQ(.*?)\b#', 'gid:$1', $qd);
        $qd = preg_replace_callback(
            '#\bP(.*?)\b#',
            function ($m) use ($parties) {
                $key = ucfirst($m[1]);
                return 'party:' . (isset($parties[$key]) ? $parties[$key] : $m[1]);
            },
            $qd
        );
        $qd = preg_replace('#\bD(.*?)\b#', 'date:$1', $qd);
        $qd = preg_replace('#\bG(.*?)\b#', 'department:$1', $qd); # XXX Lookup to show proper name of dept
        if (strstr($qd, 'M1 OR M2 OR M3 OR M4 OR M6 OR M101')) {
            $qd = str_replace('M1 OR M2 OR M3 OR M4 OR M6 OR M101', 'section:uk', $qd);
        } elseif (strstr($qd, 'M7 OR M8')) {
            $qd = str_replace('M7 OR M8', 'section:scotland', $qd);
        }
        $qd = preg_replace_callback(
            '#\bM(\d+)\b#',
            function ($m) use ($hansardmajors) {
                if (isset($hansardmajors[$m[1]]['title'])) {
                    return "in the '" . $hansardmajors[$m[1]]['title'] . "'";
                }
                // Unknown major: output kept exactly as before (no closing quote).
                return "in the '" . $m[1];
            },
            $qd
        );
        $qd = preg_replace('#\bMF\b#', 'in Future Business', $qd);

        # Replace stemmed things with their unstemmed terms from the query
        $used = array();
        preg_match_all('#Z[^\s()]+#', $qd, $m);
        foreach ($m[0] as $mm) {
            $tt = null;
            $iter = $this->queryparser->unstem_begin($mm);
            $end = $this->queryparser->unstem_end($mm);
            while (!$iter->equals($end)) {
                $tt = $iter->get_term();
                if (!in_array($tt, $used)) {
                    break;
                }
                $iter->next();
            }
            if ($tt === null) {
                // No unstemmed form is known: show the stem without its Z
                // prefix instead of reusing a term left over from an earlier
                // iteration (or an undefined variable).
                $tt = substr($mm, 1);
            }
            $used[] = $tt;
            $qd = preg_replace('#' . preg_quote($mm, '#') . '#', $tt, $qd, 1);
        }

        # Speakers: collapse the expansion of merged people, then show names
        foreach ($merged_people as $pair) {
            list($from_id, $to_id) = $pair;
            $qd = str_replace("(S$from_id OR S$to_id)", "S$to_id", $qd);
            $qd = str_replace("S$from_id OR S$to_id", "S$to_id", $qd);
        }
        preg_match_all('#S(\d+)#', $qd, $m);
        foreach ($m[1] as $mm) {
            $member = new MEMBER(array('person_id' => $mm));
            $name = $member->full_name();
            $qd = str_replace("S$mm", "speaker:$name", $qd);
        }

        # Simplify display of excluded words
        $qd = preg_replace('#AND_NOT ([a-z0-9"]+)#', '-$1', $qd);
        preg_match_all('#AND_NOT \((.*?)\)#', $qd, $m);
        foreach ($m[1] as $mm) {
            $mmn = '-' . implode(' -', explode(' OR ', $mm));
            $qd = str_replace("AND_NOT ($mm)", $mmn, $qd);
        }

        foreach ($this->prefixed as $items) {
            if ($items[0] != 'groupby') {
                continue;
            }
            if ($items[1] == 'debate') {
                $qd .= ' grouped by debate';
            } elseif ($items[1] == 'speech') {
                $qd .= ' showing all speeches';
            } else {
                $PAGE->error_message("Unknown group by '$items[1]' ignored");
            }
        }

        return trim($qd);
    }

    /**
     * Shared implementation of the two description getters.
     *
     * @param bool $long Currently unused: both variants return the same text.
     * @return string '' when search is not configured, '[bad query]' when the
     *                query failed to parse, otherwise the description.
     */
    public function query_description_internal($long) {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return '';
        }
        if (!$this->valid) {
            return '[bad query]';
        }

        return $this->query_desc;
    }

    // Return textual description of search
    public function query_description_short() {
        return $this->query_description_internal(false);
    }

    // Return textual description of search
    public function query_description_long() {
        return $this->query_description_internal(true);
    }

    // Return stem of a word
    public function stem($word) {
        return $this->stemmer->apply(strtolower($word));
    }

    /**
     * Return the query parser's spelling-corrected query string, or null
     * when search is not configured.
     */
    public function get_spelling_correction() {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return null;
        }

        return $this->queryparser->get_corrected_query_string();
    }

    /**
     * Configure sorting and collapsing, fetch the requested page of matches
     * (kept in $this->matches for run_search()) and return the estimated
     * total number of matches.
     *
     * @param int    $first_result     Zero-based index of the first result.
     * @param int    $results_per_page Number of results to fetch.
     * @param string $sort_order       'relevance' (default), 'date', 'newest',
     *                                 'oldest' or 'created'.
     * @return int|null Estimated match count; null when search is not configured.
     */
    public function run_count($first_result, $results_per_page, $sort_order='relevance') {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return null;
        }

        $start = getmicrotime();

        switch ($sort_order) {
            case 'date':
            case 'newest':
                $this->enquire->set_sort_by_value(0, true);
                break;
            case 'oldest':
                $this->enquire->set_sort_by_value(0, false);
                break;
            case 'created':
                $this->enquire->set_sort_by_value(2);
                break;
            default:
                // do nothing, default ordering is by relevance
                break;
        }

        // Set collapsing
        global $PAGE;
        $collapsed = false;
        if (preg_match('#(speaker|segment):\d+#', $this->query)) {
            $collapsed = true;
        }
        foreach ($this->prefixed as $items) {
            if ($items[0] != 'groupby') {
                continue;
            }
            $collapsed = true;
            if ($items[1] == 'debate') {
                $this->enquire->set_collapse_key(3);
            } elseif ($items[1] != 'speech') {
                // 'speech' deliberately sets no collapse key.
                $PAGE->error_message("Unknown group by '$items[1]' ignored");
            }
        }

        // default to grouping by subdebate, i.e. by page
        if (!$collapsed) {
            $this->enquire->set_collapse_key(3);
        }

        // The count returned is Xapian's estimate, which is sometimes too
        // large. The third argument is 100 as in the original call; see the
        // Xapian get_mset() documentation for its exact meaning.
        $this->matches = $this->enquire->get_mset($first_result, $results_per_page, 100);
        $count = $this->matches->get_matches_estimated();

        $duration = getmicrotime() - $start;
        twfy_debug ("SEARCH", "Search count took $duration seconds.");

        return $count;
    }

    /**
     * Unpack the match set fetched by run_count() into the arrays returned
     * by get_gids(), get_relevances() and get_createds().
     *
     * The paging arguments are accepted for interface compatibility only;
     * paging is decided by the preceding run_count() call. If there is no
     * match set (run_count() not called, or search not configured) the result
     * arrays are simply left empty.
     */
    public function run_search($first_result, $results_per_page, $sort_order='relevance') {
        $this->gids = array();
        $this->created = array();
        $this->collapsed = array();
        $this->relevances = array();

        $matches = $this->matches;
        if (!$matches) {
            return;
        }

        $start = getmicrotime();

        $iter = $matches->begin();
        $end = $matches->end();
        while (!$iter->equals($end)) {
            $relevancy = $iter->get_percent();
            $weight = $iter->get_weight();
            $collapsed = $iter->get_collapse_count();
            $doc = $iter->get_document();
            $gid = $doc->get_data();
            if ($sort_order == 'created') {
                array_push($this->created, implode('', unpack('N', $doc->get_value(2)))); # XXX Needs fixing
            }
            twfy_debug("SEARCH", "gid: $gid relevancy: $relevancy% weight: $weight");
            array_push($this->gids, "uk.org.publicwhip/" . $gid);
            array_push($this->collapsed, $collapsed);
            array_push($this->relevances, $relevancy);
            $iter->next();
        }

        $duration = getmicrotime() - $start;
        twfy_debug ("SEARCH", "Run search took $duration seconds.");
    }

    // ... use these to get the results
    public function get_gids() {
        return $this->gids;
    }

    public function get_relevances() {
        return $this->relevances;
    }

    public function get_createds() {
        return $this->created;
    }

    /**
     * Put HTML highlighting round all the matching words in the text.
     *
     * @param string|array $body One extract, or an array of extracts.
     * @return string|array Same shape as $body.
     */
    public function highlight($body) {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return $body;
        }

        $stemmed_words = array_map(array($this, 'stem'), $this->words);
        if (!is_array($body)) {
            return $this->highlight_internal($body, $stemmed_words);
        }

        foreach ($body as $k => $b) {
            $body[$k] = $this->highlight_internal($b, $stemmed_words);
        }
        return $body;
    }

    /**
     * Highlight one extract: single words are matched by stem, quoted
     * phrases by a regex over the already word-highlighted text.
     *
     * @param string $body          HTML extract.
     * @param array  $stemmed_words Stems of the plain search words.
     * @return string
     */
    public function highlight_internal($body, $stemmed_words) {
        if (!defined('XAPIANDB') || !XAPIANDB) {
            return $body;
        }

        # Does html_entity_decode without the htmlspecialchars: the four
        # markup-significant entities are swapped for upper-case stand-ins,
        # everything else is decoded, then the four are restored.
        # NOTE(review): the "HTML-ENTITIES" mbstring encoding is deprecated
        # as of PHP 8.2; kept because its handling of raw high bytes differs
        # from html_entity_decode().
        $body = str_replace($this->specialchars, $this->specialchars_upper, $body);
        $body = mb_convert_encoding($body, "UTF-8", "HTML-ENTITIES");
        $body = str_replace($this->specialchars_upper, $this->specialchars, $body);

        $pieces = preg_split('/(<[^>]*>|[0-9,.]+|[' . $this->wordcharsnodigit . ']+)/', $body, -1, PREG_SPLIT_DELIM_CAPTURE);
        $hlextract = "";
        foreach ($pieces as $piece) {
            // Tags pass through untouched.
            if (preg_match('/^<[^>]*>$/', $piece)) {
                $hlextract .= $piece;
                continue;
            }
            // A trailing & is not part of the word; keep it outside the span.
            $endswithamp = '';
            if (substr($piece, -1) == '&') {
                $piece = substr($piece, 0, -1);
                $endswithamp = '&';
            }
            if ($this->matches_stemmed_word($piece, $stemmed_words)) {
                $hlextract .= "<span class=\"hi\">$piece</span>$endswithamp";
            } else {
                $hlextract .= $piece . $endswithamp;
            }
        }
        // Merge adjacent highlights into a single span.
        $body = preg_replace("#</span>\s+<span class=\"hi\">#", " ", $hlextract);

        // Highlight search phrases, but never inside a tag.
        $findwords = array();
        $replacewords = array();
        foreach ($this->phrases as $phrase) {
            if (!$phrase) {
                // Left behind by an unbalanced quote; an empty pattern would
                // match at every word boundary.
                continue;
            }
            $phrasematch = $this->phrase_pattern($phrase);
            $findwords[] = "/\b($phrasematch)\b(?!(?>[^<>]*>))/i";
            $replacewords[] = "<span class=\"hi\">\\1</span>";
        }

        return preg_replace($findwords, $replacewords, $body);
    }

    /**
     * Find the position of the first of the search words/phrases in $body.
     *
     * Tries, in order: quoted phrases, words matching by stem, whole words,
     * then words appearing anywhere (even inside another word). Returns 0
     * when nothing is found.
     *
     * @param string $body
     * @return int
     */
    public function position_of_first_word($body) {
        $lcbody = ' ' . html_entity_decode(strtolower($body)) . ' '; // spaces to make regexp mapping easier
        $notword = '[^' . $this->wordchars . ']';
        $pos = -1;

        // look for phrases
        foreach ($this->phrases as $phrase) {
            if (!$phrase) {
                continue;
            }
            $pattern = '/(' . $notword . $this->phrase_pattern($phrase) . '[^A-Za-z0-9])/';
            $pos = $this->earliest_match($pattern, $lcbody, $pos);
        }
        if ($pos != -1) {
            return $pos;
        }

        // look for words with the same stem as a search word
        $pieces = preg_split('/([0-9,.]+|[' . $this->wordcharsnodigit . ']+)/', $lcbody, -1, PREG_SPLIT_DELIM_CAPTURE);
        $stemmed_words = array_map(array($this, 'stem'), $this->words);
        foreach ($pieces as $piece) {
            $piece = preg_replace('/&$/', '', $piece);
            if (!$piece) {
                continue;
            }
            $wordpos = strpos($lcbody, $piece);
            if (!$wordpos) {
                continue;
            }
            // Only bother stemming when this piece would improve on $pos.
            if (($pos == -1 || $wordpos < $pos) && $this->matches_stemmed_word($piece, $stemmed_words)) {
                $pos = $wordpos;
            }
        }
        // only look for earlier words if phrases weren't found
        if ($pos != -1) {
            return $pos;
        }

        // look for whole words
        foreach ($this->words as $word) {
            $pattern = '/(' . $notword . $this->word_pattern($word) . $notword . ')/';
            $pos = $this->earliest_match($pattern, $lcbody, $pos);
        }
        // only look for something containing the word (ie. something stemmed,
        // but doesn't work all the time) if no whole word was found
        if ($pos != -1) {
            return $pos;
        }

        foreach ($this->words as $word) {
            $pos = $this->earliest_match('/(' . $this->word_pattern($word) . ')/', $lcbody, $pos);
        }

        return $pos == -1 ? 0 : $pos;
    }

    /**
     * True when the stem of $text equals one of $stemmed_words. Stemming is
     * skipped altogether when there is nothing to compare against.
     */
    private function matches_stemmed_word($text, $stemmed_words) {
        if (!$stemmed_words) {
            return false;
        }
        $stem = $this->stem($text);
        foreach ($stemmed_words as $word) {
            if ($word == '') {
                continue;
            }
            if ($stem == $word) {
                return true;
            }
        }
        return false;
    }

    /**
     * Regex fragment (for a "/"-delimited pattern) matching the words of
     * $phrase separated by runs of non-word characters.
     */
    private function phrase_pattern($phrase) {
        $quoted = array();
        foreach ($phrase as $word) {
            $quoted[] = preg_quote($word, '/');
        }
        return implode('[^' . $this->wordchars . ']+', $quoted);
    }

    /**
     * Regex fragment (for a "/"-delimited pattern) matching a search word;
     * an all-digit word also matches its thousands-separated form.
     */
    private function word_pattern($word) {
        if (ctype_digit($word)) {
            return '(?:' . $word . '|' . number_format((float) $word) . ')';
        }
        return preg_quote($word, '/');
    }

    /**
     * Return the smaller of $pos and the offset of the first match of
     * $pattern in $text; -1 means "nothing found yet". Offset 0 is a valid
     * match (a word at the very start of the body).
     */
    private function earliest_match($pattern, $text, $pos) {
        if (preg_match($pattern, $text, $found, PREG_OFFSET_CAPTURE)) {
            $offset = $found[0][1];
            if ($pos == -1 || $offset < $pos) {
                return $offset;
            }
        }
        return $pos;
    }
}
|
618
|
|
|
|
|
619
|
|
|
// Reset the shared $SEARCHENGINE global to null whenever this file is
// included. The `global` statement binds the name to the global scope even
// if the include happens inside a function.
global $SEARCHENGINE;
$SEARCHENGINE = null;
|
621
|
|
|
|
The PSR-1: Basic Coding Standard recommends that a file should either introduce new symbols, that is classes, functions, constants or similar, or have side effects. Side effects are anything that executes logic, like for example printing output, changing ini settings or writing to a file.
The idea behind this recommendation is that merely auto-loading a class should not change the state of an application. It also promotes a cleaner style of programming and makes your code less prone to errors, because the logic is not spread out all over the place.
To learn more about the PSR-1, please see the PHP-FIG site on the PSR-1.