document::resolve_method()   A
last analyzed

Complexity

Conditions 3
Paths 3

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 3

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 5
c 1
b 0
f 0
nc 3
nop 1
dl 0
loc 9
ccs 6
cts 6
cp 1
crap 3
rs 10
1
<?php
2
/**
3
 * @author The Midgard Project, http://www.midgard-project.org
4
 * @copyright The Midgard Project, http://www.midgard-project.org
5
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
6
 */
7
8
namespace midcom\datamanager\indexer;
9
10
use Symfony\Component\Form\FormView;
11
use midcom\datamanager\datamanager;
12
use midcom_services_indexer_document_midcom;
13
use midcom_error;
14
15
/**
16
 * This class is geared to ease indexing of datamanager driven documents. The
17
 * user invoking the indexing must have full read permissions to the object.
18
 *
19
 * <b>Basic indexing operation</b>
20
 *
21
 * This class uses a number of conventions, see below, to merge an existing
22
 * datamanager driven document into an indexing capable document. It requires
23
 * the callee to instantiate the datamanager, as this class would have no
24
 * idea where to take the schema database from.
25
 *
26
 * The RI (the GUID) from the base class is left untouched.
27
 *
28
 * <b>Indexing field defaults:</b>
29
 *
30
 * Unless you specify anything else explicitly in the schema,
31
 * the class will merge all text based fields together to form the <i>content</i>
32
 * field of the index record, to allow for easy searching of the document.
33
 * This will *not* include any metadata like keywords or summaries.
34
 *
35
 * If the schema contains a field <i>abstract</i>, it will also be used as
36
 * abstract field for the indexing process. In the same way, fields named
37
 * <i>title</i> or <i>author</i> will be used for the index document's title
38
 * or author respectively. The contents of abstract, title and author will also
39
 * be appended to the content field at the end of the object construction,
40
 * easing searching over these fields.
41
 *
42
 * If no abstract field is present, the first 200 characters of the content
43
 * area are used instead.
44
 *
45
 * Not all types can be indexed, check the various types in question about their
46
 * indexing capabilities. In general, if the system should index any non-text
47
 * field, it will use the CSV representation for implicit conversion.
48
 *
49
 * Metadata processing is done by the base class.
50
 *
51
 * <b>Document title:</b>
52
 *
53
 * You should either have an auto-indexed title field, or an assortment
54
 * of other fields manually assigned to index to the title field.
55
 *
56
 * <b>Configurability using the Datamanager schema:</b>
57
 *
58
 * You can decorate datamanager fields with various directives influencing
59
 * the indexing. See the Datamanager's schema documentation for details.
60
 * Basically, you can choose from the following indexing methods using the
61
 * key 'index_method' for each field:
62
 *
63
 * - The default <i>auto</i> mode will use the above guidelines to determine
64
 *   the indexing destination automatically, adding data to the content, abstract,
65
 *   title and author fields respectively.
66
 * - You can specify <i>abstract</i>, <i>content</i>, <i>title</i> or
67
 *   <i>author</i> to indicate that the field should be used for the indicated
68
 *   document fields. The content selector may be specified more than once,
69
 *   indicating that the content of the relevant fields should be merged.
70
 * - Any date field can be indexed into its own, range-filterable field using
71
 *   the <i>date</i> method. In this case, two document fields will be created
72
 *   actually. One containing the filterable timestamp named directly after
73
 *   the schema field, and a second one, having the _TS postfix which is set as
74
 *   noindex containing the plain timestamp.
75
 * - Finally, you can explicitly index a field as a separate document field
76
 *   using one of the four field types <i>keyword</i>, <i>unindexed</i>,
77
 *   <i>unstored</i> or <i>text</i>. You can further control if the content
78
 *   of these fields is also added to the main content field. This is useful
79
 *   if you want to have fields searchable both by explicit field specification
80
 *   and the default field for simpler searches. This is controlled by setting
81
 *   the boolean key 'index_merge_with_content' in the field, which defaults
82
 *   to true.
83
 * - <i>noindex</i> will prevent indexing of this field.
84
 *
85
 * The documents type is "midcom_datamanager".
86
 *
87
 * @see midcom_services_indexer
88
 */
89
class document extends midcom_services_indexer_document_midcom
{
    /**
     * The datamanager instance of the document we need to index.
     */
    private datamanager $datamanager;

    /**
     * Builds the index document from the given datamanager.
     *
     * The parent constructor receives the value of the datamanager's storage.
     * Afterwards the document type is set, the schema fields are processed
     * (process_datamanager) and the remaining fields are filled in
     * (complete_fields), so the document is ready for indexing once
     * construction has finished.
     *
     * @param datamanager $datamanager Fully initialized datamanager of the object to index.
     * @throws midcom_error If a field declares an unknown indexing method.
     */
    public function __construct(datamanager $datamanager)
    {
        parent::__construct($datamanager->get_storage()->get_value());

        $this->_set_type('datamanager');

        $this->datamanager = $datamanager;

        $this->process_datamanager();
        $this->complete_fields();
    }

    /**
     * Completes all fields which are not yet complete:
     *
     * content is completed with author, title and, if necessary, abstract.
     *
     * The title is set to the document's URL in case that no title is set yet.
     * Because the content is extended before that fallback is applied, the URL
     * is not added to the content field in that case.
     */
    private function complete_fields() : void
    {
        $this->content .= "{$this->author}\n{$this->title}\n";

        // Add the abstract only if we haven't done so already.
        if (!str_contains($this->content, $this->abstract)) {
            $this->content .= "{$this->abstract}\n";
        }

        if (!$this->title) {
            $this->title = $this->document_url;
        }
    }

    /**
     * Processes the information contained in the datamanager instance.
     *
     * The function iterates over the fields of the 'view' renderer's form view
     * and dispatches each one according to its resolved indexing method.
     * If no abstract was produced by any field, it is derived from the text
     * version of the content, shortened to 200 characters plus ' ...'.
     *
     * @throws midcom_error If a field resolves to an unknown indexing method.
     */
    private function process_datamanager() : void
    {
        $renderer = $this->datamanager->get_renderer('view');
        foreach ($renderer->get_view() as $name => $field) {
            $method = $this->resolve_method($field->vars);

            switch ($method) {
                case 'abstract':
                case 'title':
                case 'author':
                    // The method name doubles as the name of the document property
                    $this->{$method} = $renderer->widget($field);
                    break;

                case 'content':
                    $this->content .= $renderer->widget($field) . "\n";
                    break;

                case 'date':
                    $this->add_as_date_field($field);
                    break;

                case 'attachment':
                    if (!empty($field->vars['value'])) {
                        // Passed by value, so the view's own list stays untouched
                        $this->merge_first_attachment($field->vars['value']);
                    }
                    break;

                case 'unstored':
                case 'unindexed':
                case 'text':
                case 'keyword':
                    $data = $renderer->widget($field);
                    // Dispatches to add_unstored / add_unindexed / add_text / add_keyword
                    $function = 'add_' . $method;
                    $this->$function($name, $data);
                    // Merging with the content is documented to default to true,
                    // so a missing key must not silently disable it
                    if ($field->vars['index_merge_with_content'] ?? true) {
                        $this->content .= $data . "\n";
                    }
                    break;

                case 'noindex':
                    break;

                default:
                    throw new midcom_error(" Unknown indexing method {$method} for field {$name} discovered, aborting.");
            }
        }

        if ($this->abstract == '') {
            $this->abstract = $this->html2text($this->content);
            if (mb_strlen($this->abstract) > 200) {
                $this->abstract = mb_substr($this->abstract, 0, 200) . ' ...';
            }
        }
    }

    /**
     * Appends content and abstract of the first attachment in the list to this document.
     *
     * Only the first attachment is indexed for now. The list is received by
     * value and only read, so the form view it originates from is not modified.
     *
     * @param array $attachments Non-empty list of attachments; entries are either
     *     midcom_db_attachment instances or arrays carrying the attachment under
     *     the key 'object'. Anything else is ignored.
     */
    private function merge_first_attachment(array $attachments) : void
    {
        $attachment = reset($attachments);
        if (   !$attachment instanceof \midcom_db_attachment
            && !empty($attachment['object'])) {
            //This is the form edit case
            //@todo: In create case, nothing is found currently
            $attachment = $attachment['object'];
        }
        if ($attachment instanceof \midcom_db_attachment) {
            $att_doc = new \midcom_services_indexer_document_attachment($attachment);
            $this->content .= $att_doc->content;
            $this->abstract .= $att_doc->abstract;
        }
    }

    /**
     * This function tries to convert the $field into a date
     * representation. Array values carrying a 'date' entry are converted
     * through that entry's format('U') (an empty entry yields 0), all other
     * values are cast to string and parsed with strtotime.
     *
     * Invalid strings which are not parseable using strtotime will be
     * stored as a "0" timestamp.
     *
     * Be aware, that this will work only for current dates in range of an
     * UNIX timestamp. For all other cases you should use an ISO 8601 representation,
     * which should work as well with Lucene range queries.
     *
     * @param FormView $field The view of the field to index; its 'name' var is
     *     used as the name of the resulting date pair.
     */
    private function add_as_date_field(FormView $field) : void
    {
        if (is_array($field->vars['value']) && array_key_exists('date', $field->vars['value'])) {
            $timestamp = 0;
            if (!empty($field->vars['value']['date'])) {
                $timestamp = (int) $field->vars['value']['date']->format('U');
            }
        } else {
            $string = (string) $field->vars['value'];
            $timestamp = strtotime($string);
            if ($timestamp === false) {
                debug_add("The string representation of the field {$field->vars['name']} could not be parsed into a timestamp; treating as 0.", MIDCOM_LOG_INFO);
                debug_print_r('String representation was:', $string);
                $timestamp = 0;
            }
        }
        $this->add_date_pair($field->vars['name'], $timestamp);
    }

    /**
     * Determines the effective indexing method of a field.
     *
     * With the method 'auto', fields named abstract, title or author map to the
     * method of the same name and every other field maps to 'content'. Any
     * other configured method is returned unchanged.
     *
     * @param array $vars The view vars of the field; 'index_method' and 'name' are read.
     * @return string The indexing method to apply.
     */
    private function resolve_method(array $vars) : string
    {
        if ($vars['index_method'] === 'auto') {
            if (in_array($vars['name'], ['abstract', 'title', 'author'], true)) {
                return $vars['name'];
            }
            return 'content';
        }
        return $vars['index_method'];
    }
}
254