Passed
Push — master ( dff8a4...2556d0 )
by Dispositif
08:19
created

SeoSanitizer::cleanSEOTitle()   B

Complexity

Conditions 8
Paths 5

Size

Total Lines 37
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 18
c 1
b 0
f 0
nc 5
nop 2
dl 0
loc 37
rs 8.4444
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Utils\TextUtil;
13
14
class SeoSanitizer
{
    /**
     * Candidate SEO separators, in priority order: the first one found in the title is used.
     */
    private const SEO_SEPARATORS = [' | ', ' - ', ' / ', ' : '];

    /**
     * Length (in characters) from which the first kept segment is returned alone.
     */
    private const LONG_SEGMENT_LENGTH = 25;

    /**
     * Naive SEO sanitization of a web page title.
     *
     * The title is split on the first separator found among " | ", " - ", " / ", " : ".
     * Segments that are blank, or that contain the domain name (with or without its
     * extension, ignoring dots, hyphens, spaces and case), are dropped. The result is
     * built from the first one or two remaining segments.
     *
     * @param string      $prettyDomainName pretty domain name as "google.com" or "google.co.uk"
     * @param string|null $title            raw page title
     *
     * @return string|null null when the title is null or empty() once trimmed
     *                     (note: this includes the literal title "0"); otherwise the cleaned title
     */
    public function cleanSEOTitle(string $prettyDomainName, ?string $title): ?string
    {
        // Explicit null guard: trim(null) raises a TypeError under strict_types
        // (and is deprecated since PHP 8.1 otherwise).
        if (null === $title || empty(trim($title))) {
            return null;
        }

        // Normalize en dash and em dash to hyphen, and backslash to slash.
        $title = str_replace(['–', '—', '\\'], ['-', '-', '/'], $title);

        // Do not trim before splitting: a trailing " - " must still be seen as a separator.
        $seoSegments = $this->extractSEOSegments($title);

        // No SEO segmentation found
        if (count($seoSegments) < 2) {
            return trim($title);
        }

        // Needles do not depend on the segment: compute them once, outside the filter.
        $domainNeedles = $this->buildDomainNeedles($prettyDomainName);

        // remove blank segments and segments containing the website domain name
        $seoSegmentsFiltered = array_values(array_filter(
            $seoSegments,
            static function (string $segment) use ($domainNeedles): bool {
                if (empty(trim($segment))) {
                    return false;
                }

                $strippedSegment = str_replace(
                    [' ', '-'],
                    '',
                    mb_strtolower(TextUtil::stripPunctuation(TextUtil::stripAccents($segment)))
                );

                foreach ($domainNeedles as $needle) {
                    if (false !== strpos($strippedSegment, $needle)) {
                        return false;
                    }
                }

                return true;
            }
        ));

        // Everything was filtered out: fall back to the first segment before filtering.
        if ([] === $seoSegmentsFiltered) {
            return trim($seoSegments[0]);
        }

        $firstSegment = trim($seoSegmentsFiltered[0]);

        // if only one segment or first segment is long enough, return it
        if (1 === count($seoSegmentsFiltered) || mb_strlen($firstSegment) >= self::LONG_SEGMENT_LENGTH) {
            return $firstSegment;
        }

        // rebuild bestTitle but keep only the first 2 SEO segments
        return $firstSegment . ' - ' . trim($seoSegmentsFiltered[1]);
    }

    /**
     * Build the lower-cased strings searched for in each segment to detect the domain name.
     *
     * Two candidates are derived from the domain: the full name ("google.co.uk" => "googlecouk")
     * and the name without its extension ("google.co.uk" => "google"). Dots and hyphens are removed.
     * Empty candidates are discarded, because strpos() with an empty needle matches every
     * string on PHP 8 and would cause all segments to be filtered out.
     *
     * @return string[] distinct, non-empty needles
     */
    private function buildDomainNeedles(string $prettyDomainName): array
    {
        // Segments are lower-cased before comparison, so the domain must be too.
        $domain = mb_strtolower($prettyDomainName);

        // strip string after last dot : blabla.com => blabla
        $domainMain = (string) preg_replace('/\.[^.]*$/', '', $domain);
        // strip string if only 2 chars after dot : like in 'blabla.co.uk'
        $domainMain = (string) preg_replace('/\.[^.]{2}$/', '', $domainMain);

        $needles = [];
        foreach ([$domain, $domainMain] as $candidate) {
            $needle = str_replace(['.', '-'], '', $candidate);
            if ('' !== $needle) {
                $needles[] = $needle;
            }
        }

        return array_values(array_unique($needles));
    }

    /**
     * Split the title on its SEO separator.
     *
     * @return string[] the segments (not trimmed), or a single-element array holding
     *                  the whole title when no separator is found
     */
    private function extractSEOSegments(string $title): array
    {
        $seoSeparator = $this->getSEOSeparator($title);
        if (null === $seoSeparator) {
            return [$title];
        }

        return explode($seoSeparator, $title);
    }

    /**
     * Return the first separator of SEO_SEPARATORS present in the title, or null if none is.
     *
     * The title is expected to be already dash-normalized by cleanSEOTitle(),
     * so " — " does not need to be handled here.
     */
    private function getSEOSeparator(string $title): ?string
    {
        foreach (self::SEO_SEPARATORS as $separator) {
            if (false !== strpos($title, $separator)) {
                return $separator;
            }
        }

        return null;
    }
}