|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* @package midcom.helper.datamanager2 |
|
4
|
|
|
* @author The Midgard Project, http://www.midgard-project.org |
|
5
|
|
|
* @copyright The Midgard Project, http://www.midgard-project.org |
|
6
|
|
|
* @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License |
|
7
|
|
|
*/ |
|
8
|
|
|
|
|
9
|
|
|
namespace midcom\datamanager\indexer; |
|
10
|
|
|
|
|
11
|
|
|
use Symfony\Component\Form\FormView; |
|
12
|
|
|
use midcom\datamanager\datamanager; |
|
13
|
|
|
use midcom\datamanager\template\view; |
|
14
|
|
|
use midcom\datamanager\renderer; |
|
15
|
|
|
use midcom_services_indexer_document_midcom; |
|
16
|
|
|
use midcom_error; |
|
17
|
|
|
|
|
18
|
|
|
/** |
|
19
|
|
|
* This class is geared to ease indexing of datamanager driven documents. The |
|
20
|
|
|
* user invoking the indexing must have full read permissions to the object. |
|
21
|
|
|
* |
|
22
|
|
|
* <b>Basic indexing operation</b> |
|
23
|
|
|
* |
|
24
|
|
|
* This class uses a number of conventions, see below, to merge an existing |
|
25
|
|
|
* datamanager driven document into an indexing capable document. It requires |
|
26
|
|
|
* the callee to instantiate the datamanager, as this class would have no |
|
27
|
|
|
* idea where to take the schema database from. |
|
28
|
|
|
* |
|
29
|
|
|
* The RI (the GUID) from the base class is left untouched. |
|
30
|
|
|
* |
|
31
|
|
|
* <b>Indexing field defaults:</b> |
|
32
|
|
|
* |
|
33
|
|
|
* Unless you specify anything else explicitly in the schema, |
|
34
|
|
|
* the class will merge all text based fields together to form the <i>content</i> |
|
35
|
|
|
* field of the index record, to allow for easy searching of the document. |
|
36
|
|
|
* This will *not* include any metadata like keywords or summaries. |
|
37
|
|
|
* |
|
38
|
|
|
* If the schema contains a field <i>abstract</i>, it will also be used as |
|
39
|
|
|
* abstract field for the indexing process. In the same way, fields named |
|
40
|
|
|
* <i>title</i> or <i>author</i> will be used for the index document's title |
|
41
|
|
|
* or author respectively. The contents of abstract, title and author will also |
|
42
|
|
|
* be appended to the content field at the end of the object construction, |
|
43
|
|
|
* easing searching over these fields. |
|
44
|
|
|
* |
|
45
|
|
|
* If no abstract field is present, the first 200 characters of the content |
|
46
|
|
|
* area are used instead. |
|
47
|
|
|
* |
|
48
|
|
|
* Not all types can be indexed, check the various types in question about their |
|
49
|
|
|
* indexing capabilities. In general, if the system should index any non-text |
|
50
|
|
|
* field, it will use the CSV representation for implicit conversion. |
|
51
|
|
|
* |
|
52
|
|
|
* Metadata processing is done by the base class. |
|
53
|
|
|
* |
|
54
|
|
|
* <b>Document title:</b> |
|
55
|
|
|
* |
|
56
|
|
|
* You should either have an auto-indexed title field, or an assortment |
|
57
|
|
|
* of other fields manually assigned to index to the title field. |
|
58
|
|
|
* |
|
59
|
|
|
* <b>Configurability using the Datamanager schema:</b> |
|
60
|
|
|
* |
|
61
|
|
|
* You can decorate datamanager fields with various directives influencing |
|
62
|
|
|
* the indexing. See the Datamanager's schema documentation for details. |
|
63
|
|
|
* Basically, you can choose from the following indexing methods using the |
|
64
|
|
|
* key 'index_method' for each field: |
|
65
|
|
|
* |
|
66
|
|
|
* - The default <i>auto</i> mode will use the above guidelines to determine |
|
67
|
|
|
* the indexing destination automatically, adding data to the content, abstract, |
|
68
|
|
|
* title and author fields respectively. |
|
69
|
|
|
* - You can specify <i>abstract</i>, <i>content</i>, <i>title</i> or |
|
70
|
|
|
* <i>author</i> to indicate that the field should be used for the indicated |
|
71
|
|
|
* document fields. The content selector may be specified more than once, |
|
72
|
|
|
* indicating that the content of the relevant fields should be merged. |
|
73
|
|
|
* - Any date field can be indexed into its own, range-filterable field using |
|
74
|
|
|
* the <i>date</i> method. In this case, two document fields will be created |
|
75
|
|
|
* actually. One containing the filterable timestamp named directly after |
|
76
|
|
|
* the schema field, and a second one, having the _TS postfix which is set as |
|
77
|
|
|
* noindex containing the plain timestamp. |
|
78
|
|
|
* - Finally, you can explicitly index a field as a separate document field |
|
79
|
|
|
* using one of the four field types <i>keyword</i>, <i>unindexed</i>, |
|
80
|
|
|
* <i>unstored</i> or <i>text</i>. You can further control if the content |
|
81
|
|
|
* of these fields is also added to the main content field. This is useful |
|
82
|
|
|
* if you want to have fields searchable both by explicit field specification |
|
83
|
|
|
* and the default field for simpler searches. This is controlled by setting |
|
84
|
|
|
* the boolean key 'index_merge_with_content' in the field, which defaults |
|
85
|
|
|
* to true. |
|
86
|
|
|
* - <i>noindex</i> will prevent indexing of this field. |
|
87
|
|
|
* |
|
88
|
|
|
* The documents type is "midcom_datamanager". |
|
89
|
|
|
* |
|
90
|
|
|
* @see midcom_services_indexer |
|
91
|
|
|
*/ |
|
92
|
|
|
class document extends midcom_services_indexer_document_midcom
{
    /**
     * The datamanager instance of the document we need to index.
     *
     * @var datamanager
     */
    private $datamanager;

    /**
     * Builds the index document from a datamanager instance.
     *
     * The datamanager's storage value is handed to the parent constructor, the
     * document type is set to 'datamanager', and afterwards process_datamanager()
     * and complete_fields() populate the document fields.
     *
     * The document is ready for indexing after construction.
     *
     * @param datamanager $datamanager The fully initialized datamanager instance to use
     * @throws midcom_error If a field declares an unknown indexing method
     */
    public function __construct($datamanager)
    {
        parent::__construct($datamanager->get_storage()->get_value());

        $this->_set_type('datamanager');

        $this->datamanager = $datamanager;

        $this->process_datamanager();
        $this->complete_fields();
    }

    /**
     * Completes all fields which are not yet complete:
     *
     * Author and title are appended to the content, followed by the abstract
     * unless the content already contains it.
     *
     * If no title is set, the document URL is used as title. This happens after
     * the content has been completed, so the URL is not added to the content.
     */
    private function complete_fields()
    {
        $this->content .= "{$this->author}\n{$this->title}\n";

        // Add the abstract only if the content does not contain it yet.
        // strpos() takes the haystack (content) first and the needle (abstract)
        // second; the empty-string guard avoids the "empty needle" warning that
        // strpos() raises before PHP 8.
        $abstract = (string) $this->abstract;
        if (   $abstract !== ''
            && strpos($this->content, $abstract) === false)
        {
            $this->content .= "{$abstract}\n";
        }

        if (!$this->title)
        {
            $this->title = $this->document_url;
        }
    }

    /**
     * Processes the information contained in the datamanager instance.
     *
     * The function iterates over the fields of the form view and dispatches on
     * each field's 'index_method' ('auto' is resolved by field name first).
     * If no abstract has been collected afterwards, it is derived from the
     * first 200 bytes of the text version of the content.
     *
     * @throws midcom_error If a field declares an unknown indexing method
     */
    private function process_datamanager()
    {
        $view = $this->datamanager->get_form()->createView();
        $renderer = $this->datamanager->get_renderer();
        $renderer->set_template($view, new view($renderer));

        foreach ($view as $name => $field)
        {
            $method = $field->vars['index_method'];
            if ($method === 'auto')
            {
                $method = $this->resolve_auto_method($field->vars['name']);
            }

            switch ($method)
            {
                case 'abstract':
                case 'title':
                case 'author':
                    // The method name doubles as the name of the target property
                    $this->{$method} = $renderer->widget($field);
                    break;

                case 'content':
                    $this->content .= $renderer->widget($field) . "\n";
                    break;

                case 'date':
                    // add_as_date_field() is declared with a FormView parameter,
                    // so the field view has to be passed, not the field name
                    $this->add_as_date_field($field);
                    break;

                case 'attachment':
                    $attachments = $field->vars['value'];
                    if (   !empty($attachments)
                        && is_array($attachments))
                    {
                        // Only index the first attachment for now. reset() operates
                        // on the local copy, so the view's value list stays intact.
                        $attachment = reset($attachments);
                        if (   !$attachment instanceof \midcom_db_attachment
                            && !empty($attachment['object']))
                        {
                            //This is the form edit case
                            //@todo: In create case, nothing is found currently
                            $attachment = $attachment['object'];
                        }
                        if ($attachment instanceof \midcom_db_attachment)
                        {
                            $att_doc = new \midcom_services_indexer_document_attachment($attachment, $view->vars['value']->get_value());
                            $this->content .= $att_doc->content;
                            $this->abstract .= $att_doc->abstract;
                        }
                    }
                    break;

                case 'unstored':
                case 'unindexed':
                case 'text':
                case 'keyword':
                    // Dispatches to add_unstored(), add_unindexed(), add_text()
                    // or add_keyword() respectively
                    $data = $renderer->widget($field);
                    $function = 'add_' . $method;
                    $this->$function($name, $data);
                    if ($field->vars['index_merge_with_content'])
                    {
                        $this->content .= $data . "\n";
                    }
                    break;

                case 'noindex':
                    break;

                default:
                    throw new midcom_error(" Unknown indexing method {$method} for field {$name} discovered, aborting.");
            }
        }

        if ($this->abstract == '')
        {
            $this->abstract = $this->html2text($this->content);
            if (strlen($this->abstract) > 200)
            {
                $this->abstract = substr($this->abstract, 0, 200) . ' ...';
            }
        }
    }

    /**
     * This function tries to convert the given field into a date
     * representation and stores it using add_date_pair().
     *
     * Fields whose 'dm2_type' is 'date' use the Unix timestamp of their value
     * (0 if the field is empty), all other fields are parsed with strtotime.
     *
     * Strings which are not parseable using strtotime will be
     * stored as a "0" timestamp.
     *
     * Be aware, that this will work only for current dates in range of an
     * UNIX timestamp. For all other cases you should use an ISO 8601 representation,
     * which should work as well with Lucene range queries.
     *
     * @todo Refactor this to use DateTime
     * @param FormView $field The view of the field that should be stored
     */
    private function add_as_date_field(FormView $field)
    {
        $name = $field->vars['name'];
        // NOTE(review): relies on the datamanager exposing a `types` collection
        // keyed by field name - confirm against the datamanager class
        $type = $this->datamanager->types[$name];

        if ($field->vars['dm2_type'] == 'date')
        {
            $timestamp = 0;
            if (!$type->is_empty())
            {
                $timestamp = $type->value->format('U');
            }
        }
        else
        {
            $string = $type->convert_to_html();
            $timestamp = strtotime($string);
            // strtotime() signals failure with false, -1 is a valid timestamp
            if ($timestamp === false)
            {
                debug_add("The string representation of the field {$name} could not be parsed into a timestamp; treating as 0.", MIDCOM_LOG_INFO);
                debug_print_r('String representation was:', $string);
                $timestamp = 0;
            }
        }

        $this->add_date_pair($name, $timestamp);
    }

    /**
     * Maps a field name to the index method used in 'auto' mode: fields named
     * abstract, title or author go to the document field of the same name,
     * everything else goes to the content.
     *
     * @param string $name The field name
     * @return string index method
     */
    private function resolve_auto_method($name)
    {
        if (in_array($name, array('abstract', 'title', 'author'), true))
        {
            return $name;
        }
        return 'content';
    }
}
|
290
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.