Text::extractTextFromNeo()   B
last analyzed

Complexity

Conditions 10
Paths 15

Size

Total Lines 31
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 110

Importance

Changes 0
Metric Value
eloc 17
c 0
b 0
f 0
dl 0
loc 31
ccs 0
cts 18
cp 0
rs 7.6666
cc 10
nc 15
nop 2
crap 110

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

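Commonly applied refactorings include Extract Method: move a cohesive group of statements out of the long method into its own well-named method.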

1
<?php
2
/**
3
 * SEOmatic plugin for Craft CMS
4
 *
5
 * A turnkey SEO implementation for Craft CMS that is comprehensive, powerful,
6
 * and flexible
7
 *
8
 * @link      https://nystudio107.com
9
 * @copyright Copyright (c) 2017 nystudio107
10
 */
11
12
namespace nystudio107\seomatic\helpers;
13
14
use benf\neo\elements\Block as NeoBlock;
15
use benf\neo\elements\db\BlockQuery as NeoBlockQuery;
16
use craft\elements\db\MatrixBlockQuery;
17
use craft\elements\db\TagQuery;
18
use craft\elements\MatrixBlock;
19
use craft\elements\Tag;
20
use craft\helpers\HtmlPurifier;
21
use craft\models\FieldLayout;
22
use Illuminate\Support\Collection;
23
use nystudio107\seomatic\helpers\Field as FieldHelper;
24
use nystudio107\seomatic\Seomatic;
25
use PhpScience\TextRank\TextRankFacade;
26
use PhpScience\TextRank\Tool\StopWords\StopWordsAbstract;
27
use Stringy\Stringy;
28
use verbb\doxter\Doxter;
29
use verbb\doxter\fields\data\DoxterData;
30
use verbb\supertable\elements\db\SuperTableBlockQuery;
31
use verbb\supertable\elements\SuperTableBlockElement as SuperTableBlock;
32
use yii\base\InvalidConfigException;
33
use function array_slice;
34
use function function_exists;
35
use function is_array;
36
37
/**
38
 * @author    nystudio107
39
 * @package   Seomatic
40
 * @since     3.0.0
41
 */
42
class Text
43
{
44
    // Constants
45
    // =========================================================================
46
47
    public const LANGUAGE_MAP = [
48
        'en' => 'English',
49
        'fr' => 'French',
50
        'de' => 'German',
51
        'it' => 'Italian',
52
        'no' => 'Norwegian',
53
        'es' => 'Spanish',
54
    ];
55
56
    // Public Static Methods
57
    // =========================================================================
58
59
    /**
     * Truncates the string to a given length. If $substring is provided, and
     * truncating occurs, the string is further truncated so that the substring
     * may be appended without exceeding the desired length.
     *
     * The string is run through HtmlPurifier with no allowed HTML and has its
     * HTML entities decoded before it is truncated.
     *
     * @param string|null $string The string to truncate
     * @param int $length Desired length of the truncated string
     * @param string $substring The substring to append if it can fit
     *
     * @return string The truncated string. An empty value is returned cast to a
     *                string ('' for null and other non-scalar empty values)
     */
    public static function truncate($string, $length, $substring = '…'): string
    {
        // Nothing to truncate. Cast explicitly: handing back a null (e.g. an
        // unset field value) would violate the string return type.
        if (empty($string)) {
            return is_scalar($string) ? (string)$string : '';
        }

        $string = HtmlPurifier::process($string, ['HTML.Allowed' => '']);
        $string = html_entity_decode($string, ENT_NOQUOTES, 'UTF-8');

        return (string)Stringy::create($string)->truncate($length, $substring);
    }
82
83
    /**
     * Truncates the string to a given length, while ensuring that it does not
     * split words. If $substring is provided, and truncating occurs, the
     * string is further truncated so that the substring may be appended without
     * exceeding the desired length.
     *
     * The string is run through HtmlPurifier with no allowed HTML and has its
     * HTML entities decoded before it is truncated.
     *
     * @param string|null $string The string to truncate
     * @param int $length Desired length of the truncated string
     * @param string $substring The substring to append if it can fit
     *
     * @return string The truncated string. An empty value is returned cast to a
     *                string ('' for null and other non-scalar empty values)
     */
    public static function truncateOnWord($string, $length, $substring = '…'): string
    {
        // Nothing to truncate. Cast explicitly: handing back a null (e.g. an
        // unset field value) would violate the string return type.
        if (empty($string)) {
            return is_scalar($string) ? (string)$string : '';
        }

        $string = HtmlPurifier::process($string, ['HTML.Allowed' => '']);
        $string = html_entity_decode($string, ENT_NOQUOTES, 'UTF-8');

        return (string)Stringy::create($string)->safeTruncate($length, $substring);
    }
107
108
    /**
     * Extract plain old text from a field
     *
     * Matrix, Neo, Super Table and Tag values (either an element query, or an
     * array/Collection whose element at index 0 is such an element) are handed
     * to the matching extractTextFrom*() method. Doxter data is parsed as
     * Markdown and stripped of tags. Anything else is cast to a string and
     * stripped of tags; for other array-like values only the element at
     * index 0 is used.
     *
     * @param mixed $field The field value to extract the text from
     *
     * @return string The extracted text, passed through sanitizeUserInput()
     */
    public static function extractTextFromField($field): string
    {
        if (empty($field)) {
            return '';
        }
        // Read index 0 once, tolerating array-likes that have no such index
        // (an empty Collection, or an array with other keys), which would
        // otherwise raise an "undefined array key" error
        $isArrayLike = self::isArrayLike($field);
        $first = $isArrayLike ? ($field[0] ?? null) : null;

        if ($field instanceof MatrixBlockQuery || $first instanceof MatrixBlock) {
            $result = self::extractTextFromMatrix($field);
        } elseif ($field instanceof NeoBlockQuery || $first instanceof NeoBlock) {
            $result = self::extractTextFromNeo($field);
        } elseif ($field instanceof SuperTableBlockQuery || $first instanceof SuperTableBlock) {
            $result = self::extractTextFromSuperTable($field);
        } elseif ($field instanceof TagQuery || $first instanceof Tag) {
            $result = self::extractTextFromTags($field);
        } elseif ($field instanceof DoxterData) {
            $result = self::smartStripTags(Doxter::$plugin->getService()->parseMarkdown($field->getRaw()));
        } elseif ($isArrayLike) {
            // A missing index 0 yields an empty string here
            $result = self::smartStripTags((string)$first);
        } else {
            $result = self::smartStripTags((string)$field);
        }

        return self::sanitizeUserInput($result);
    }
145
146
    /**
     * Build a comma-delimited string out of the titles of the passed in tags
     *
     * @param TagQuery|Tag[]|array $tags A tag query, or a list of tag elements
     *
     * @return string The tag titles joined with ", "; empty if there are no tags
     */
    public static function extractTextFromTags($tags): string
    {
        if (empty($tags)) {
            return '';
        }
        // Resolve a query into its tag elements before reading the titles
        $tagElements = $tags instanceof TagQuery ? $tags->all() : $tags;
        $titles = [];
        foreach ($tagElements as $tagElement) {
            $titles[] = $tagElement->title;
        }

        // rtrim() also drops any trailing commas or spaces left by the final title
        return rtrim(implode(', ', $titles), ', ');
    }
171
172
    /**
     * Extract text from all of the blocks in a matrix field, concatenating it
     * together.
     *
     * For every block, each custom field of the block type that is an instance
     * of one of the text field classes listed in FieldHelper::FIELD_CLASSES
     * contributes its extracted text exactly once, followed by a space.
     *
     * @param MatrixBlockQuery|MatrixBlock[]|array $blocks A block query, or a list of blocks
     * @param string $fieldHandle If not empty, only the field with this handle is used
     *
     * @return string The concatenated text; empty if there are no blocks
     */
    public static function extractTextFromMatrix($blocks, $fieldHandle = ''): string
    {
        if (empty($blocks)) {
            return '';
        }
        // Resolve a query into its matrix blocks
        if ($blocks instanceof MatrixBlockQuery) {
            $blocks = $blocks->all();
        }
        $result = '';
        foreach ($blocks as $block) {
            try {
                $matrixBlockTypeModel = $block->getType();
            } catch (InvalidConfigException $e) {
                // A block whose type cannot be resolved is skipped
                $matrixBlockTypeModel = null;
            }
            if (!$matrixBlockTypeModel) {
                continue;
            }
            /** @var array $fieldClasses */
            $fieldClasses = FieldHelper::FIELD_CLASSES[FieldHelper::TEXT_FIELD_CLASS_KEY];
            // Find any text fields inside of the matrix block
            foreach ($matrixBlockTypeModel->getCustomFields() as $field) {
                if (!empty($fieldHandle) && $field->handle !== $fieldHandle) {
                    continue;
                }
                foreach ($fieldClasses as $fieldClassKey) {
                    if ($field instanceof $fieldClassKey) {
                        $result .= self::extractTextFromField($block[$field->handle]) . ' ';
                        // One match is enough; checking the remaining classes
                        // could append the same field's text again
                        break;
                    }
                }
            }
        }

        return $result;
    }
217
218
    /**
     * Extract text from all of the blocks in a Neo field, concatenating it
     * together.
     *
     * For every block that has a field layout, each custom field that is an
     * instance of one of the text field classes listed in
     * FieldHelper::FIELD_CLASSES contributes its extracted text exactly once,
     * followed by a space.
     *
     * @param NeoBlockQuery|NeoBlock[]|array $blocks A block query, or a list of blocks
     * @param string $fieldHandle If not empty, only the field with this handle is used
     *
     * @return string The concatenated text; empty if there are no blocks
     */
    public static function extractTextFromNeo($blocks, $fieldHandle = ''): string
    {
        if (empty($blocks)) {
            return '';
        }
        // Resolve a query into its Neo blocks
        if ($blocks instanceof NeoBlockQuery) {
            $blocks = $blocks->all();
        }
        /** @var array $fieldClasses */
        $fieldClasses = FieldHelper::FIELD_CLASSES[FieldHelper::TEXT_FIELD_CLASS_KEY];
        $result = '';
        foreach ($blocks as $block) {
            $layout = $block->getFieldLayout();
            if (!$layout) {
                continue;
            }
            // Find any text fields inside of the neo block
            foreach ($layout->getCustomFieldElements() as $fieldElement) {
                $field = $fieldElement->getField();
                if (!empty($fieldHandle) && $field->handle !== $fieldHandle) {
                    continue;
                }
                if (self::isInstanceOfAnyClass($field, $fieldClasses)) {
                    $result .= self::extractTextFromField($block[$field->handle]) . ' ';
                }
            }
        }

        return $result;
    }

    /**
     * Is $field an instance of at least one of the passed in classes?
     *
     * @param mixed $field The field to check
     * @param array $classNames The class names to check the field against
     *
     * @return bool
     */
    private static function isInstanceOfAnyClass($field, array $classNames): bool
    {
        foreach ($classNames as $className) {
            if ($field instanceof $className) {
                return true;
            }
        }

        return false;
    }
259
260
    /**
     * Extract text from all of the blocks in a Super Table field, concatenating
     * it together.
     *
     * For every block whose type and field layout can be resolved, each custom
     * field that is an instance of one of the text field classes listed in
     * FieldHelper::FIELD_CLASSES contributes its extracted text exactly once,
     * followed by a space.
     *
     * @param SuperTableBlockQuery|SuperTableBlock[]|array $blocks A block query, or a list of blocks
     * @param string $fieldHandle If not empty, only the field with this handle is used
     *
     * @return string The concatenated text; empty if there are no blocks
     */
    public static function extractTextFromSuperTable($blocks, $fieldHandle = ''): string
    {
        if (empty($blocks)) {
            return '';
        }
        // Resolve a query into its supertable blocks
        if ($blocks instanceof SuperTableBlockQuery) {
            $blocks = $blocks->all();
        }
        $result = '';
        foreach ($blocks as $block) {
            try {
                $superTableBlockTypeModel = $block->getType();
            } catch (InvalidConfigException $e) {
                // A block whose type cannot be resolved is skipped
                $superTableBlockTypeModel = null;
            }
            if (!$superTableBlockTypeModel) {
                continue;
            }
            /** @var array $fieldClasses */
            $fieldClasses = FieldHelper::FIELD_CLASSES[FieldHelper::TEXT_FIELD_CLASS_KEY];
            /** @var ?FieldLayout $layout */
            // The SuperTableBlockType class lacks @mixin FieldLayoutBehavior in its annotations
            /** @phpstan-ignore-next-line */
            $layout = $superTableBlockTypeModel->getFieldLayout();
            // The layout is nullable; without one there are no fields to read
            if (!$layout) {
                continue;
            }
            // Find any text fields inside of the supertable block
            foreach ($layout->getCustomFieldElements() as $fieldElement) {
                $field = $fieldElement->getField();
                if (!empty($fieldHandle) && $field->handle !== $fieldHandle) {
                    continue;
                }
                foreach ($fieldClasses as $fieldClassKey) {
                    if ($field instanceof $fieldClassKey) {
                        $result .= self::extractTextFromField($block[$field->handle]) . ' ';
                        // One match is enough; checking the remaining classes
                        // could append the same field's text again
                        break;
                    }
                }
            }
        }

        return $result;
    }
309
310
    /**
     * Rank the keywords in the text with TextRank, and return the top ones as
     * a comma-delimited string
     *
     * @param string $text The text to extract the keywords from
     * @param int $limit The maximum number of keywords to return
     * @param bool $useStopWords Whether stop words for the language in
     *                           Seomatic::$language should be ignored
     *
     * @return string The sanitized keywords joined with ", "; the unchanged
     *                $text if no keywords were found; '' if $text is empty
     */
    public static function extractKeywords($text, $limit = 15, $useStopWords = true): string
    {
        if (empty($text)) {
            return '';
        }
        $textRank = new TextRankFacade();
        if ($useStopWords) {
            // The stop words are looked up by the first two letters of the language
            $languageCode = strtolower(substr(Seomatic::$language, 0, 2));
            $stopWords = self::stopWordsForLanguage($languageCode);
            if ($stopWords !== null) {
                $textRank->setStopWords($stopWords);
            }
        }
        // The keywords are the array keys of what TextRank returns
        $rankedKeywords = $textRank->getOnlyKeyWords(self::cleanupText($text));
        if (empty($rankedKeywords)) {
            // Nothing was found, so hand back the text itself
            return $text;
        }
        $topKeywords = array_slice(array_keys($rankedKeywords), 0, $limit);

        return self::sanitizeUserInput(implode(', ', $topKeywords));
    }
346
347
    /**
     * Extract a summary made up of the sentences that TextRank highlights as
     * the most important ones in the text
     *
     * @param string $text The text to summarize
     * @param bool $useStopWords Whether stop words for the language in
     *                           Seomatic::$language should be ignored
     *
     * @return string The sanitized sentences joined with a space; the unchanged
     *                $text if no sentences were found; '' if $text is empty
     */
    public static function extractSummary($text, $useStopWords = true): string
    {
        if (empty($text)) {
            return '';
        }
        $textRank = new TextRankFacade();
        if ($useStopWords) {
            // The stop words are looked up by the first two letters of the language
            $languageCode = strtolower(substr(Seomatic::$language, 0, 2));
            $stopWords = self::stopWordsForLanguage($languageCode);
            if ($stopWords !== null) {
                $textRank->setStopWords($stopWords);
            }
        }
        $highlights = $textRank->getHighlights(self::cleanupText($text));
        if (empty($highlights)) {
            // Nothing was found, so hand back the text itself
            return $text;
        }

        return self::sanitizeUserInput(implode(' ', $highlights));
    }
382
383
384
    /**
     * Sanitize user input by decoding any HTML Entities, URL decoding the text,
     * then removing any newlines, stripping tags, stripping Twig tags, and changing
     * single {}'s into ()'s
     *
     * @param string|null $str The text to sanitize; null is treated as ''
     *
     * @return string The sanitized text. A non-empty value such as "0" is kept
     */
    public static function sanitizeUserInput($str): string
    {
        // Cast up front so that the string functions below never receive null
        $str = (string)$str;
        // Do some general cleanup
        $str = html_entity_decode($str, ENT_NOQUOTES, 'UTF-8');
        $str = rawurldecode($str);
        // Remove any linebreaks
        $str = str_replace(["\r", "\n"], '', $str);
        $str = HtmlPurifier::process($str, ['HTML.Allowed' => '']);
        $str = html_entity_decode($str, ENT_NOQUOTES, 'UTF-8');
        // Remove any embedded Twig code. preg_replace() returns null on error,
        // which the cast turns into an empty string
        $str = (string)preg_replace(['/{{.*?}}/', '/{%.*?%}/'], '', $str);

        // Change single brackets to parenthesis
        return str_replace(['{', '}'], ['(', ')'], $str);
    }
413
414
    /**
     * Remove the HTML tags from a string, leaving a space where each tag
     * started rather than letting the surrounding text run together
     *
     * @param $str
     * @return string
     */
    public static function smartStripTags($str)
    {
        // Put a space in front of every tag opener before the tags are removed
        $spaced = str_replace('<', ' <', $str);
        $stripped = HtmlPurifier::process($spaced, ['HTML.Allowed' => '']);
        $decoded = html_entity_decode($stripped, ENT_NOQUOTES, 'UTF-8');

        // Replace double spaces with a single one
        return str_replace('  ', ' ', $decoded);
    }
429
430
    /**
     * Clean up the passed in text by converting it to UTF-8, stripping tags,
     * removing whitespace, and decoding HTML entities
     *
     * @param string $text The text to clean up
     *
     * @return string The cleaned up text; '' if $text is empty
     */
    public static function cleanupText($text): string
    {
        if (empty($text)) {
            return '';
        }
        $text = (string)$text;
        // Convert to UTF-8
        $converted = false;
        if (function_exists('iconv')) {
            // mb_detect_encoding() returns false when none of the candidate
            // encodings match; assume UTF-8 then, rather than handing false
            // to iconv() as the source encoding
            $encoding = mb_detect_encoding($text, mb_detect_order(), true);
            $converted = iconv($encoding !== false ? $encoding : 'UTF-8', 'UTF-8//IGNORE', $text);
        }
        if ($converted === false) {
            // iconv is unavailable, or it failed: let mbstring drop the
            // characters that are not valid UTF-8 instead
            ini_set('mbstring.substitute_character', 'none');
            $converted = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
        }
        $text = (string)$converted;
        // Strip HTML tags
        $text = HtmlPurifier::process($text, ['HTML.Allowed' => '']);
        $text = html_entity_decode($text, ENT_NOQUOTES, 'UTF-8');
        // Remove excess whitespace. preg_replace() returns null on error, in
        // which case the text is kept as it is
        $text = preg_replace('/\s{2,}/u', ' ', $text) ?? $text;

        // Decode any HTML entities
        return html_entity_decode($text);
    }
460
461
    /**
     * Tell whether $var is a PHP array or a Collection object
     *
     * @param $var
     * @return bool
     */
    public static function isArrayLike($var): bool
    {
        if ($var instanceof Collection) {
            return true;
        }

        return is_array($var);
    }
471
472
    // Protected Static Methods
473
    // =========================================================================
474
475
    /**
     * Create the TextRank stop words object for a language code
     *
     * @param string $language A key of self::LANGUAGE_MAP, e.g. 'en'; any other
     *                         value falls back to English
     *
     * @return null|StopWordsAbstract null if the stop words class does not exist
     */
    protected static function stopWordsForLanguage(string $language)
    {
        // Language codes that are not in the map use the English stop words
        $languageName = empty(self::LANGUAGE_MAP[$language])
            ? 'English'
            : self::LANGUAGE_MAP[$language];
        $className = 'PhpScience\\TextRank\\Tool\\StopWords\\' . ucfirst($languageName);

        return class_exists($className) ? new $className() : null;
    }
496
}
497