Passed
Push — master ( 5347c1...625fac )
by Andreas
18:24
created

midcom_services_indexer_document::html2text()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 12
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 8
nc 1
nop 1
dl 0
loc 12
ccs 5
cts 5
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * @package midcom.services
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
/**
10
 * This class encapsulates a single indexer document. It is used for both indexing
11
 * and retrieval.
12
 *
13
 * A document consists of a number of fields, each field has different properties
14
 * when handled by the indexer (exact behavior depends, as always, on the indexer
15
 * backend in use). On retrieval, this field information is lost, all fields being
16
 * of the same type (naturally). The core indexer backend supports these field
17
 * types:
18
 *
19
 * - <i>date</i> is a date-wrapped field suitable for use with the Date Filter.
20
 * - <i>keyword</i> is stored and indexed, but not tokenized.
21
 * - <i>unindexed</i> is stored but neither indexed nor tokenized.
22
 * - <i>unstored</i> is not stored, but indexed and tokenized.
23
 * - <i>text</i> is stored, indexed and tokenized.
24
 *
25
 * This class should not be instantiated directly, a new instance of this class
26
 * can be obtained using the midcom_services_indexer class.
27
 *
28
 * A number of predefined fields are available using member fields. These fields
29
 * are all meta-fields. See their individual documentation for details. All fields
30
 * are mandatory unless mentioned otherwise explicitly and, as always, assumed to
31
 * be in the local charset.
32
 *
33
 * Remember, that both date and unstored fields are not available on retrieval.
34
 * For the core fields, all timestamps are therefore stored twice: once as a searchable
35
 * field, and once as readable timestamp.
36
 *
37
 * The class will automatically pass all data to the i18n charset conversion functions,
38
 * thus you work using your site's charset like usual. UTF-8 conversion is done
39
 * implicitly.
40
 *
41
 * @package midcom.services
42
 * @see midcom_services_indexer
43
 */
44
class midcom_services_indexer_document
45
{
46
    /**
47
     * An associative array containing all fields of the current document.
48
     *
49
     * Each field is indexed by its name (a string). The value is another
50
     * array containing the fields "name", "type" and "content".
51
     *
52
     * @var array
53
     */
54
    private $_fields = [];
55
56
    /**
57
     * The i18n service, used for charset conversion.
58
     *
59
     * @var midcom_services_i18n
60
     */
61
    protected $_i18n;
62
63
    /**
64
     * This is the score of this document. Only populated on resultset documents,
65
     * of course.
66
     *
67
     * @var double
68
     */
69
    public $score = 0.0;
70
71
    /* ------ START OF DOCUMENT FIELDS --------- */
72
73
    /**
74
     * The Resource Identifier of this document.
75
     *
76
     * Must be UTF-8 on assignment already.
77
     *
78
     * This field is mandatory.
79
     *
80
     * @var string
81
     */
82
    public $RI = '';
83
84
    /**
85
     * Two letter language code of the document content
86
     *
87
     * This field is optional.
88
     *
89
     * @var string
90
     */
91
    public $lang = '';
92
93
    /**
94
     * The GUID of the topic the document is assigned to.
95
     *
96
     * May be empty for non-midgard resources.
97
     *
98
     * This field is mandatory.
99
     *
100
     * @var string GUID
101
     */
102
    public $topic_guid = '';
103
104
    /**
105
     * The name of the component responsible for the document.
106
     *
107
     * May be empty for non-midgard resources.
108
     *
109
     * This field is mandatory.
110
     *
111
     * @var string
112
     */
113
    public $component = '';
114
115
    /**
116
     * The fully qualified URL to the document, this should be a PermaLink.
117
     *
118
     * This field is mandatory.
119
     *
120
     * @var string
121
     */
122
    public $document_url = '';
123
124
    /**
125
     * The time of document creation, this is a UNIX timestamp.
126
     *
127
     * This field is mandatory.
128
     *
129
     * @var int
130
     */
131
    public $created = 0;
132
133
    /**
134
     * The time of the last document modification, this is a UNIX timestamp.
135
     *
136
     * This field is mandatory.
137
     *
138
     * @var int
139
     */
140
    public $edited = 0;
141
142
    /**
143
     * The timestamp of indexing.
144
     *
145
     * This field is added automatically and to be considered read-only.
146
     *
147
     * @var int
148
     */
149
    public $indexed = 0;
150
151
    /**
152
     * The MidgardPerson who created the object.
153
     *
154
     * This is optional.
155
     *
156
     * @var midcom_db_person
157
     */
158
    public $creator;
159
160
    /**
161
     * The MidgardPerson who modified the object the last time.
162
     *
163
     * This is optional.
164
     *
165
     * @var midcom_db_person
166
     */
167
    public $editor;
168
169
    /**
170
     * The title of the document
171
     *
172
     * This is mandatory.
173
     *
174
     * @var string
175
     */
176
    public $title = '';
177
178
    /**
179
     * The content of the document
180
     *
181
     * This is mandatory.
182
     *
183
     * This field is empty on documents retrieved from the index.
184
     *
185
     * @var string
186
     */
187
    public $content = '';
188
189
    /**
190
     * The abstract of the document
191
     *
192
     * This is optional.
193
     *
194
     * @var string
195
     */
196
    public $abstract = '';
197
198
    /**
199
     * The author of the document
200
     *
201
     * This is optional.
202
     *
203
     * @var string
204
     */
205
    public $author = '';
206
207
    /**
208
     * An additional tag indicating the source of the document for use by the
209
     * component doing the indexing.
210
     *
211
     * This value is not indexed and should not be used by anybody except the
212
     * component doing the indexing.
213
     *
214
     * This is optional.
215
     *
216
     * @var string
217
     */
218
    public $source = '';
219
220
    /**
221
     * The full path to the topic that houses the document.
222
     *
223
     * For external resources, this should be either a MidCOM topic, to which this
224
     * resource is associated or some "directory" after which you could filter.
225
     * You may also leave it empty prohibiting it to appear on any topic-specific search.
226
     *
227
     * The value should be fully qualified, as returned by MIDCOM_NAV_FULLURL, including
228
     * a trailing slash, f.x. https://host/path/to/topic/
229
     *
230
     * This is optional.
231
     *
232
     * @var string
233
     */
234
    public $topic_url = '';
235
236
    /**
237
     * The type of the document, set by subclasses and added to the index
238
     * automatically.
239
     *
240
     * The type *must* reflect the original type hierarchy. It is to be set
241
     * using the $this->_set_type call <i>after</i> initializing the base class.
242
     *
243
     * @see is_a()
244
     * @see _set_type()
245
     * @var string
246
     */
247
    public $type = '';
248
249
    /**
250
     * This is here to support #651 without rewriting all components' index methods
251
     *
252
     * If set to false the indexer backend will silently skip this document.
253
     *
254
     * @see http://trac.midgard-project.org/ticket/651
255
     * @var boolean
256
     */
257
    public $actually_index = true;
258
259
    /* ------ END OF DOCUMENT FIELDS --------- */
260
261
    /**
     * Set up the document by fetching the i18n service from the MidCOM
     * application object and keeping it in $_i18n for later use.
     */
    public function __construct()
    {
        $application = midcom::get();
        $this->_i18n = $application->i18n;
    }
268
269
    /**
     * Look up the content of a single field.
     *
     * The stored content is passed through the i18n service's
     * convert_from_utf8() before it is handed back.
     *
     * @param string $name The name of the field to read.
     * @return mixed The converted field content, or false (after logging an
     *               info-level debug message) if no such field exists.
     */
    public function get_field(string $name)
    {
        if (array_key_exists($name, $this->_fields)) {
            $stored = $this->_fields[$name]['content'];
            return $this->_i18n->convert_from_utf8($stored);
        }

        debug_add("Field {$name} not found in the document.", MIDCOM_LOG_INFO);
        return false;
    }
282
283
    /**
     * Hand back the raw internal field records.
     *
     * Each record carries the field's name, type and content exactly as
     * stored by _add_field(); no charset conversion is applied here.
     *
     * This is geared towards the indexer backends, which need the full field
     * information when indexing, and should normally not be used elsewhere.
     *
     * @return array The field records, keyed by field name.
     */
    public function get_fields() : array
    {
        return $this->_fields;
    }
294
295
    /**
     * Drop a field from the document.
     *
     * Asking to remove a field that was never added is not an error; the
     * call simply has no effect.
     *
     * @param string $name The name of the field to remove.
     */
    public function remove_field(string $name)
    {
        unset($this->_fields[$name]);
    }
302
303
    /**
     * Add a date field. A UNIX timestamp is expected, which is automatically
     * converted to an ISO timestamp in GMT (YYYY-MM-DDTHH:MM:SSZ) before storage.
     *
     * Direct specification of the ISO timestamp is not yet possible due
     * to lacking validation outside the timestamp range.
     *
     * If a field of the same name is already present, it is overwritten
     * silently.
     *
     * @param string $name The field's name.
     * @param int $timestamp The UNIX timestamp to store.
     */
    public function add_date(string $name, int $timestamp)
    {
        // gmdate() replaces gmstrftime(), which is deprecated as of PHP 8.1 and
        // produced the same string for this format. The result is plain ASCII,
        // so it is passed on flagged as already being UTF-8.
        $this->_add_field($name, 'date', gmdate('Y-m-d\TH:i:s\Z', $timestamp), true);
    }
318
319
    /**
     * Create a normal date field and an unindexed _TS-postfixed timestamp field at the same time.
     *
     * This is useful because date fields are not stored in a readable format;
     * it can't even be determined that they were a date in the first place.
     * The _TS field keeps the original timestamp value for later retrieval.
     *
     * @param string $name The field's name, "_TS" is appended for the plain-timestamp field.
     * @param int $timestamp The UNIX timestamp to store in both fields.
     */
    public function add_date_pair(string $name, int $timestamp)
    {
        $this->add_date($name, $timestamp);
        // add_unindexed() declares a string parameter; cast explicitly instead
        // of relying on implicit int-to-string coercion.
        $this->add_unindexed("{$name}_TS", (string) $timestamp);
    }
334
335
    /**
     * Add a field of type "keyword". The content is stored as given, after
     * charset conversion in _add_field().
     *
     * @param string $name The field's name.
     * @param string $content The field's content, in the local charset.
     */
    public function add_keyword(string $name, string $content)
    {
        $this->_add_field($name, 'keyword', $content);
    }
339
340
    /**
     * Add a field of type "unindexed". The content is stored as given, after
     * charset conversion in _add_field().
     *
     * @param string $name The field's name.
     * @param string $content The field's content, in the local charset.
     */
    public function add_unindexed(string $name, string $content)
    {
        $this->_add_field($name, 'unindexed', $content);
    }
344
345
    /**
     * Add a field of type "unstored". The content is reduced to plain text
     * with html2text() before it is stored.
     *
     * @param string $name The field's name.
     * @param string $content The field's content, may contain HTML.
     */
    public function add_unstored(string $name, string $content)
    {
        $plain = $this->html2text($content);
        $this->_add_field($name, 'unstored', $plain);
    }
349
350
    /**
     * Add a field of type "text". The content is reduced to plain text
     * with html2text() before it is stored.
     *
     * @param string $name The field's name.
     * @param string $content The field's content, may contain HTML.
     */
    public function add_text(string $name, string $content)
    {
        $plain = $this->html2text($content);
        $this->_add_field($name, 'text', $plain);
    }
354
355
    /**
     * Add a field of type "result".
     *
     * This is meant for the indexer, which calls it while building a document
     * out of a search result; it should normally not be called manually.
     *
     * @param string $name The field's name.
     * @param string $content The field's content, which is <b>assumed to be UTF-8 already</b>
     *                        and is therefore stored without charset conversion.
     */
    public function add_result(string $name, $content)
    {
        $already_utf8 = true;
        $this->_add_field($name, 'result', $content, $already_utf8);
    }
366
367
    /**
     * Add a person field: a text field holding the person's GUID.
     *
     * An empty string is stored when no person is given or the person has
     * no guid set.
     *
     * @param string $name The field's name.
     * @param midcom_db_person|null $person The person to reference, if any.
     */
    private function add_person(string $name, ?midcom_db_person $person)
    {
        $guid = isset($person->guid) ? $person->guid : '';
        $this->add_text($name, $guid);
    }
374
375
    /**
     * Translate all member variables into their field records. Backends
     * should call this immediately before indexing.
     *
     * Before the fields are written, $indexed is set to the current time and,
     * if $author is still empty, it is filled with the creator's name
     * (provided a creator with a name is set).
     */
    public function members_to_fields()
    {
        // Complete the members that are derived automatically.
        $this->indexed = time();
        if ($this->author == '') {
            if (isset($this->creator->name)) {
                $this->author = $this->creator->name;
            }
        }

        // __RI does not need to be populated, this is done by backends.
        $this->add_unindexed('__LANG', $this->lang);
        $this->add_text('__TOPIC_GUID', $this->topic_guid);
        $this->add_text('__COMPONENT', $this->component);
        $this->add_unindexed('__DOCUMENT_URL', $this->document_url);
        $this->add_text('__TOPIC_URL', $this->topic_url);

        // Timestamps go in twice: as date field and as plain "_TS" value.
        $this->add_date_pair('__CREATED', $this->created);
        $this->add_date_pair('__EDITED', $this->edited);
        $this->add_date_pair('__INDEXED', $this->indexed);

        $this->add_text('title', $this->title);
        $this->add_unstored('content', $this->content);

        $this->add_unindexed('__SOURCE', $this->source);
        $this->add_person('__CREATOR', $this->creator);
        $this->add_person('__EDITOR', $this->editor);

        $this->add_text('author', $this->author);
        $this->add_text('abstract', $this->abstract);
        $this->add_text('__TYPE', $this->type);
    }
412
413
    /**
     * Populate all relevant members with the respective values after
     * retrieving a document from the index.
     *
     * get_field() returns false for fields that are not present. The values
     * are therefore cast to the declared member types, so that a missing
     * field yields '' for string members and 0 for the timestamp members
     * instead of leaking false (or a numeric string) into them.
     *
     * $creator and $editor are only touched when the corresponding field
     * holds a non-empty value; they are set to whatever read_person()
     * returns for it, which is null if the person cannot be loaded.
     */
    public function fields_to_members()
    {
        $this->RI = (string) $this->get_field('__RI');
        $this->lang = (string) $this->get_field('__LANG');
        $this->topic_guid = (string) $this->get_field('__TOPIC_GUID');
        $this->component = (string) $this->get_field('__COMPONENT');
        $this->document_url = (string) $this->get_field('__DOCUMENT_URL');
        $this->topic_url = (string) $this->get_field('__TOPIC_URL');

        // The readable timestamps live in the "_TS" companions written by add_date_pair().
        $this->created = (int) $this->get_field('__CREATED_TS');
        $this->edited = (int) $this->get_field('__EDITED_TS');
        $this->indexed = (int) $this->get_field('__INDEXED_TS');

        $this->title = (string) $this->get_field('title');

        $this->source = (string) $this->get_field('__SOURCE');
        if ($creator = $this->get_field('__CREATOR')) {
            $this->creator = $this->read_person($creator);
        }
        if ($editor = $this->get_field('__EDITOR')) {
            $this->editor = $this->read_person($editor);
        }
        $this->author = (string) $this->get_field('author');
        $this->abstract = (string) $this->get_field('abstract');
        $this->type = (string) $this->get_field('__TYPE');
    }
441
442
    /**
     * Internal helper which actually stores a field record.
     *
     * An existing record with the same name is replaced.
     *
     * @param string $name The field's name, also used as the record's key.
     * @param string $type The field type to record.
     * @param mixed $content The field's content.
     * @param bool $is_utf8 Set to true if the content is UTF-8 already; otherwise it is
     *                      run through the i18n service's convert_to_utf8() first.
     */
    protected function _add_field(string $name, string $type, $content, bool $is_utf8 = false)
    {
        if (!$is_utf8) {
            $content = $this->_i18n->convert_to_utf8($content);
        }

        $this->_fields[$name] = [
            'name' => $name,
            'type' => $type,
            'content' => $content
        ];
    }
453
454
    /**
     * Convert HTML to plain text (relatively simple):
     *
     * Script blocks and HTML tags are stripped, HTML entities are decoded via
     * the i18n service, and runs of whitespace are collapsed to single spaces.
     *
     * Tags are replaced with a space rather than an empty string, so that
     * constructs like <li>torben</li><li>nehmer</li> still yield separate words.
     *
     * @param string $text The HTML source.
     * @return string The trimmed plain-text representation.
     */
    public function html2text(string $text) : string
    {
        $search = [
            "'\s*<script[^>]*?>.*?</script>\s*'si", // Strip out javascript
            "'\s*<[\/\!]*?[^<>]*?>\s*'si", // Strip out html tags
        ];
        $stripped = preg_replace($search, ' ', $text);
        if ($stripped === null) {
            // preg_replace() returns null when PCRE fails (for example when the
            // backtrack limit is hit). Fall back to strip_tags() instead of
            // handing null to the entity decoder; the space inserted before
            // each "<" keeps neighbouring elements from merging into one word.
            $stripped = strip_tags(str_replace('<', ' <', $text));
        }

        $decoded = $this->_i18n->html_entity_decode($stripped);

        // Same null guard for the whitespace collapse: keep the decoded text
        // rather than violating the string return type.
        $collapsed = preg_replace('/\s+/s', ' ', $decoded);
        return trim($collapsed ?? $decoded);
    }
477
478
    /**
     * Checks whether this document is an instance of the given document type.
     *
     * This is the document counterpart of the is_a object hierarchy check:
     * the document matches if its $type string begins with $document_type.
     *
     * @see $type
     * @see _set_type()
     * @param string $document_type The type (prefix) to test against.
     */
    public function is_a(string $document_type) : bool
    {
        $own_type = $this->type;
        return str_starts_with($own_type, $document_type);
    }
491
492
    /**
     * Sets the type of the object, reflecting the inheritance hierarchy.
     *
     * The first call sets $type to the given value; every further call
     * appends the value, separated by an underscore.
     *
     * @see $type
     * @see is_a()
     * @param string $type The type name to set or append.
     */
    protected function _set_type(string $type)
    {
        $this->type = empty($this->type) ? $type : "{$this->type}_{$type}";
    }
506
507
    /**
     * Tries to determine topic GUID, topic URL and component through NAP's
     * reverse lookup of the GUID held in $source.
     *
     * If the lookup fails, a debug message is logged and nothing is changed;
     * $topic_guid, $topic_url and $component then have to be set manually.
     * When the GUID resolves to a leaf, the node it belongs to is used.
     */
    protected function process_topic()
    {
        $nav = new midcom_helper_nav();
        $node = $nav->resolve_guid($this->source, true);
        if (!$node) {
            debug_add("Failed to resolve the topic, skipping autodetection.");
            return;
        }

        if ($node[MIDCOM_NAV_TYPE] == 'leaf') {
            // Swap the leaf for the node that contains it.
            $node = $nav->get_node($node[MIDCOM_NAV_NODEID]);
        }

        $this->topic_guid = $node[MIDCOM_NAV_GUID];
        $this->topic_url = $node[MIDCOM_NAV_FULLURL];
        $this->component = $node[MIDCOM_NAV_COMPONENT];
    }
528
529
    /**
     * Fills created, edited, author, creator and editor from the metadata of
     * a Midgard object.
     *
     * - created: the published time if it is non-empty, the created time otherwise
     * - edited: the revised time
     * - author: resolved from metadata authors, or from the creator if authors is empty
     * - creator / editor: the persons referenced by metadata creator / revisor, if set
     */
    public function read_metadata_from_object(midcom_core_dbaobject $object)
    {
        // A non-empty published value takes precedence as creation date.
        $published = $object->metadata->published;
        $this->created = $published ? $published : $object->metadata->created;

        $this->edited = $object->metadata->revised;

        // Author heuristics: the first non-empty candidate wins.
        foreach (['authors', 'creator'] as $candidate) {
            if (!empty($object->metadata->$candidate)) {
                $this->author = $this->read_authorname($object->metadata->$candidate);
                break;
            }
        }

        if (isset($object->metadata->creator)) {
            $this->creator = $this->read_person($object->metadata->creator);
        }

        if (isset($object->metadata->revisor)) {
            $this->editor = $this->read_person($object->metadata->revisor);
        }
    }
553
554
    /**
     * Load a person by GUID through midcom_db_person::get_cached().
     *
     * @param string $guid The GUID of the person to load.
     * @return midcom_db_person|null The person, or null if loading raised a midcom_error.
     */
    private function read_person(string $guid) : ?midcom_db_person
    {
        $person = null;
        try {
            $person = midcom_db_person::get_cached($guid);
        } catch (midcom_error $e) {
            // Lookup failed; report "no person" to the caller.
        }
        return $person;
    }
565
566
    /**
     * Gets the person name for the given identifier.
     *
     * If the input is an imploded_wrapped list (values separated by "|"),
     * the first non-empty entry is used. A list without any non-empty entry
     * yields an empty string.
     *
     * @param string $input A single identifier or a "|"-separated list of them.
     * @return string The user's name, or '' if no name could be determined.
     */
    private function read_authorname(string $input) : string
    {
        // Check for imploded_wrapped datamanager storage.
        if (str_contains($input, '|')) {
            // Find the first non-empty value in the list and use that.
            $ids = array_values(array_filter(explode('|', $input)));
            if ($ids === []) {
                // Nothing usable (e.g. "|"): skip the lookup instead of
                // passing null to the auth service.
                return '';
            }
            $input = $ids[0];
        }

        return midcom::get()->auth->get_user($input)->name ?? '';
    }
580
}
581