Passed
Push — master ( 2556d0...3ec723 )
by Dispositif
02:20
created

SeoSanitizer::cleanSEOTitle()   A

Complexity

Conditions 6
Paths 5

Size

Total Lines 27
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 6
eloc 14
nc 5
nop 2
dl 0
loc 27
rs 9.2222
c 1
b 0
f 0
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Utils\TextUtil;
13
14
class SeoSanitizer
{
    /**
     * Length threshold (in characters) applied to the first kept segment:
     * the second segment is appended only when the first one is strictly shorter than this value.
     */
    private const MAX_LENGTH_FIRST_SEG_ALLOWING_SECOND_SEG = 30;

    /** Separator used when the title is rebuilt from its first two kept segments. */
    private const REBUILD_SEPARATOR = ' - ';

    /**
     * Naive SEO sanitization of a web page title.
     *
     * The title is split on the first separator style found (" | ", " / ", " - ", " : "),
     * segments that contain the site name are dropped, and at most two segments are kept.
     *
     * @param string      $prettyDomainName pretty domain name such as "google.com" or "google.co.uk"
     * @param string|null $title            raw page title
     *
     * @return string|null null when the title is null or blank; the title itself (dashes and
     *                     backslashes normalized, not trimmed) when no separator is found;
     *                     otherwise the cleaned title
     */
    public function cleanSEOTitle(string $prettyDomainName, ?string $title): ?string
    {
        // Explicit null check: under strict_types, trim(null) throws a TypeError.
        // Strict comparison with '' (instead of empty()) keeps a literal "0" title.
        if (null === $title || '' === trim($title)) {
            return null;
        }

        // Normalize en dash and em dash to hyphen, and backslash to slash,
        // so that the separator detection below only has to deal with ASCII separators.
        $title = str_replace(['–', '—', '\\'], ['-', '-', '/'], $title);

        $seoSegments = $this->extractSEOSegments($title);
        // No SEO segmentation found
        if (count($seoSegments) < 2) {
            return $title;
        }

        $seoSegmentsFiltered = $this->deleteSegmentsContainingSitename($prettyDomainName, $seoSegments);
        // Every segment mentions the site name (or is blank): fall back to the first raw segment.
        if ([] === $seoSegmentsFiltered) {
            return trim($seoSegments[0]);
        }

        // If only one segment remains, or the first one is long enough, return it alone.
        if (
            1 === count($seoSegmentsFiltered)
            || mb_strlen($seoSegmentsFiltered[0]) >= self::MAX_LENGTH_FIRST_SEG_ALLOWING_SECOND_SEG
        ) {
            return trim($seoSegmentsFiltered[0]);
        }

        // Rebuild the title, keeping only the first 2 SEO segments.
        return trim($seoSegmentsFiltered[0]) . self::REBUILD_SEPARATOR . trim($seoSegmentsFiltered[1]);
    }

    /**
     * Split the title on the detected SEO separator.
     *
     * @return string[] the segments, or a single-element array holding the whole title
     *                  when no separator is present
     */
    private function extractSEOSegments(string $title): array
    {
        $seoSeparator = $this->getSEOSeparator($title);
        if (null === $seoSeparator) {
            return [$title];
        }

        return explode($seoSeparator, $title);
    }

    /**
     * Return the first separator present in the title, tested in this priority order:
     * " | ", " / ", " - ", " : ". Returns null when none of them occurs.
     */
    private function getSEOSeparator(string $title): ?string
    {
        foreach ([' | ', ' / ', ' - ', ' : '] as $separator) {
            if (false !== strpos($title, $separator)) {
                return $separator;
            }
        }

        return null;
    }

    /**
     * Remove SEO segments containing the website domain name or site name, as well as blank segments.
     *
     * @param string   $prettyDomainName pretty domain name such as "google.com" or "google.co.uk"
     * @param string[] $seoSegments      segments produced by extractSEOSegments()
     *
     * @return string[] kept segments, re-indexed from 0, in their original order
     */
    private function deleteSegmentsContainingSitename(string $prettyDomainName, array $seoSegments): array
    {
        // strip string after last dot in prettyDomainName : blabla.com => blabla
        $siteName = (string) preg_replace('/\.[^.]*$/', '', $prettyDomainName);
        // strip string if only 2 chars after dot : so 'blabla.co.uk' => blabla
        $siteName = (string) preg_replace('/\.[^.]{2}$/', '', $siteName);
        $siteName = TextUtil::stripPunctuation($siteName); // bla-bla => blabla

        // Needles are loop-invariant, so build them once.
        // They are lower-cased because segments are lower-cased before comparison.
        // Empty needles are skipped: strpos() with '' matches everything on PHP 8
        // and raises a warning on PHP 7.
        $needles = [];
        foreach ([$prettyDomainName, $siteName] as $candidate) {
            $needle = mb_strtolower(str_replace(['.', '-'], '', $candidate));
            if ('' !== $needle) {
                $needles[] = $needle;
            }
        }

        return array_values(array_filter(
            $seoSegments,
            static function (string $segment) use ($needles): bool {
                // Drop blank segments (a literal "0" segment is not blank).
                if ('' === trim($segment)) {
                    return false;
                }

                // Compare on a normalized form: accents and punctuation removed by the
                // TextUtil helpers, lower-cased, then spaces and hyphens removed.
                $strippedSegment = str_replace(
                    [' ', '-'],
                    '',
                    mb_strtolower(TextUtil::stripPunctuation(TextUtil::stripAccents($segment)))
                );

                foreach ($needles as $needle) {
                    if (false !== strpos($strippedSegment, $needle)) {
                        return false;
                    }
                }

                return true;
            }
        ));
    }
}