Passed
Push — master ( 3ec723...d98805 )
by Dispositif
02:16
created

SeoSanitizer   A

Complexity

Total Complexity 18

Size/Duplication

Total Lines 114
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 43
c 1
b 0
f 0
dl 0
loc 114
rs 10
wmc 18

6 Methods

Rating   Name   Duplication   Size   Complexity  
A buildNewTitle() 0 12 3
A cleanSEOTitle() 0 18 4
A extractSEOSegments() 0 5 2
A extractSiteName() 0 7 1
A deleteSegmentsContainingSitename() 0 16 3
A getSEOSeparator() 0 17 5
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Utils\TextUtil;
13
14
/**
 * Naive clean-up of web page titles that carry SEO decoration.
 *
 * A title such as "Some article | Section | Example News" is split on the first
 * separator style found, the segments that merely repeat the website name are
 * dropped, and at most two of the remaining segments are re-assembled.
 */
class SeoSanitizer
{
    /** A first segment at least this long is returned alone, without the second one. */
    private const MAX_LENGTH_FIRST_SEG_ALLOWING_SECOND_SEG = 30;

    /** Separator used when two segments are glued back together. */
    private const REBUILD_SEPARATOR = ' - ';

    /**
     * Candidate segment separators, by decreasing priority: the first one found
     * in the title is the only one used for splitting.
     */
    private const SEO_SEPARATORS = [' | ', ' / ', ' - ', ' : '];

    /** Characters removed from both sides before comparing a segment with the site name. */
    private const IGNORED_CHARS = ['.', '-', ' '];

    /**
     * Naive SEO sanitization of a web page title.
     *
     * @param string      $prettyDomainName domain such as "google.com" or "google.co.uk"
     * @param string|null $title            raw page title
     *
     * @return string|null the cleaned title, or null when the title is null or blank
     */
    public function cleanSEOTitle(string $prettyDomainName, ?string $title): ?string
    {
        // Check null explicitly: trim(null) raises a TypeError under strict_types.
        // Compare with '' rather than empty() so that a title like "0" is kept.
        if (null === $title || '' === trim($title)) {
            return null;
        }

        // Normalize en dash and em dash to a hyphen, and backslash to a slash,
        // so that the separator detection below only deals with ASCII separators.
        $title = str_replace(['–', '—', '\\'], ['-', '-', '/'], $title);

        $seoSegments = $this->extractSEOSegments($title);
        // No SEO segmentation found
        if (count($seoSegments) < 2) {
            return $title;
        }

        $seoSegmentsFiltered = $this->deleteSegmentsContainingSitename($prettyDomainName, $seoSegments);
        // Every segment mentions the site name (or is blank): fall back to the first one.
        if ([] === $seoSegmentsFiltered) {
            return trim($seoSegments[0]);
        }

        return $this->buildNewTitle($seoSegmentsFiltered);
    }

    /**
     * Re-assemble a title from the kept segments, using at most the first two.
     *
     * @param string[] $titleSegments non-empty list of segments
     *
     * @return string the first segment alone when it is the only one or is already
     *                long enough, otherwise the first two joined by REBUILD_SEPARATOR
     */
    protected function buildNewTitle(array $titleSegments): string
    {
        $firstSegment = $titleSegments[0];

        // if only one segment or first segment is long enough, return it
        if (
            1 === count($titleSegments)
            || mb_strlen($firstSegment) >= self::MAX_LENGTH_FIRST_SEG_ALLOWING_SECOND_SEG
        ) {
            return trim($firstSegment);
        }

        // rebuild title but keep only the first 2 segments
        return trim($firstSegment) . self::REBUILD_SEPARATOR . trim($titleSegments[1]);
    }

    /**
     * Split the title on the separator chosen by getSEOSeparator().
     *
     * @return string[] the segments, or the whole title as a single element when
     *                  no separator is present
     */
    private function extractSEOSegments(string $title): array
    {
        $seoSeparator = $this->getSEOSeparator($title);
        if (null === $seoSeparator) {
            return [$title];
        }

        return explode($seoSeparator, $title);
    }

    /**
     * Find which separator style the title uses.
     *
     * @return string|null the first entry of SEO_SEPARATORS contained in the title,
     *                     or null when none is present
     */
    private function getSEOSeparator(string $title): ?string
    {
        // Order matters: see SEO_SEPARATORS.
        foreach (self::SEO_SEPARATORS as $separator) {
            if (false !== strpos($title, $separator)) {
                return $separator;
            }
        }

        return null;
    }

    /**
     * Remove SEO segments containing the website domain name or site name,
     * as well as blank segments.
     *
     * @param string[] $seoSegments
     *
     * @return string[] the kept segments, re-indexed from 0
     */
    private function deleteSegmentsContainingSitename(string $prettyDomainName, array $seoSegments): array
    {
        // Segments are lowercased before comparison, so the needles must be too.
        // Empty needles are discarded: strpos() reports a match at offset 0 for an
        // empty needle on PHP 8, which would otherwise throw away every segment.
        $needles = array_filter(
            [
                $this->normalizeForComparison($prettyDomainName),
                $this->normalizeForComparison($this->extractSiteName($prettyDomainName)),
            ],
            static function (string $needle): bool {
                return '' !== $needle;
            }
        );

        $keptSegments = array_filter(
            $seoSegments,
            function (string $segment) use ($needles): bool {
                if ('' === trim($segment)) {
                    return false;
                }

                $strippedSegment = $this->normalizeForComparison(TextUtil::stripAccents($segment));
                foreach ($needles as $needle) {
                    if (false !== strpos($strippedSegment, $needle)) {
                        return false;
                    }
                }

                return true;
            }
        );

        // array_filter() preserves keys; callers index the result from 0.
        return array_values($keptSegments);
    }

    /**
     * Reduce a text to a comparable form: punctuation stripped by TextUtil,
     * lowercased, then dots, hyphens and spaces removed.
     */
    private function normalizeForComparison(string $text): string
    {
        $lowered = mb_strtolower(TextUtil::stripPunctuation($text));

        return str_replace(self::IGNORED_CHARS, '', $lowered);
    }

    /**
     * Get site name from pretty domain name.
     * Ex: "google.com" => "google"
     * Ex: "my-news.co.uk" => "my-news"
     */
    private function extractSiteName(string $prettyDomainName): string
    {
        // strip string after last dot; preg_replace() yields null on a PCRE error,
        // in which case the input is kept so the declared return type holds.
        $withoutLastLabel = preg_replace('/\.[^.]*$/', '', $prettyDomainName) ?? $prettyDomainName;

        // strip a remaining trailing label if it is exactly 2 chars long (the "co" of "co.uk")
        return preg_replace('/\.[^.]{2}$/', '', $withoutLastLabel) ?? $withoutLastLabel;
    }
}