Passed
Push — master ( dff8a4...2556d0 )
by Dispositif
08:19
created

OpenGraphMapper::chooseBestTitle()   A

Complexity

Conditions 5
Paths 3

Size

Total Lines 16
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 30

Importance

Changes 0
Metric Value
cc 5
eloc 8
c 0
b 0
f 0
nc 3
nop 3
dl 0
loc 16
ccs 0
cts 0
cp 0
crap 30
rs 9.6111
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain\Publisher;
12
13
14
use Exception;
15
16
/**
 * Parsing/mapping Open Graph and Dublin Core meta data, and HTML meta tags.
 * Currently only used by ExternMapper (mixed with JSON-LD mapping) for ExternRefWorker.
 */
class OpenGraphMapper implements MapperInterface
{
    use ExternConverterTrait;

    /**
     * Allowing use of HTML <title> or <h1> to predict web page title ?
     *
     * @var bool
     */
    protected $htmlTitleAllowed = true;

    /**
     * Set to true by predictBestTitle() when no meta-data title exists and a non-empty 'html-title' is available.
     * NOTE(review): the flag is never reset to false inside this class — confirm that one mapper
     * instance is not reused for several pages.
     *
     * @var bool
     */
    protected $titleFromHtmlState = false;

    /**
     * @param array|null $options Supported key: 'htmlTitleAllowed' (bool). Non-boolean values are ignored.
     */
    public function __construct(?array $options = [])
    {
        // isset() rather than !empty(): empty(false) is true, so an explicit `false`
        // (the only value differing from the default) would otherwise be silently ignored.
        if (isset($options['htmlTitleAllowed']) && is_bool($options['htmlTitleAllowed'])) {
            $this->htmlTitleAllowed = $options['htmlTitleAllowed'];
        }
    }

    /**
     * Mapping from Open Graph and Dublin Core meta tags
     * https://ogp.me/
     * https://www.dublincore.org/schemas/
     * todo extract DC ?
     *
     * Each output field takes the first non-null value among several candidate meta keys;
     * most values are then passed through a converter/cleaner helper
     * (presumably provided by ExternConverterTrait — not visible in this file).
     *
     * @param array $meta Flat array of meta tags indexed by tag name.
     *
     * @return array Template-like data indexed by (French) parameter name, plus 'DATA-TYPE' and 'DATA-ARTICLE'.
     * @throws Exception
     */
    public function process($meta): array
    {
        $seoSanitizer = new SeoSanitizer();

        return [
            'DATA-TYPE' => 'Open Graph/Dublin Core',
            'DATA-ARTICLE' => $this->isAnArticle($meta['og:type'] ?? ''),
            'site' => $this->clean($meta['og:site_name'] ?? null),
            'titre' => $seoSanitizer->cleanSEOTitle(
                // `?? null` passes the same value as before when the key is missing, without the warning
                $meta['prettyDomainName'] ?? null,
                $this->predictBestTitle($meta)
            ),
            'url' => $meta['og:url'] ?? $meta['URL'] ?? $meta['html-url'] ?? null,
            'langue' => $this->convertLangue(
                $meta['og:locale'] ?? $meta['DC.language'] ?? $meta['citation_language'] ?? $meta['lang'] ??
                $meta['language'] ??
                $meta['content-language'] ?? $meta['Content-Language'] ?? $meta['html-lang'] ?? null
            ),
            // access date = day of processing
            'consulté le' => date('d-m-Y'),
            'auteur' => $this->cleanAuthor(
                $meta['og:article:author'] ??
                $meta['article:author'] ?? $meta['citation_author'] ?? $meta['article:author_name'] ?? null
            ),
            'format' => $this->convertOGtype2format($meta['og:type'] ?? null),
            'date' => $this->convertDate(
                $meta['og:article:published_time'] ?? $meta['article:published_time'] ??
                $meta['DC.date'] ?? $meta['citation_date'] ?? $meta['citation_publication_date'] ?? null
            ),
            'accès url' => $this->convertURLaccess($meta),

            // DUBLIN CORE ONLY
            'périodique' => $this->clean($meta['DC.isPartOf'] ?? $meta['citation_journal_title'] ?? null),
            'et al.' => $this->authorsEtAl(
                $meta['citation_authors'] ?? $meta['DC.Contributor'] ?? null,
                true
            ),
            'auteur1' => $this->wikifyPressAgency(
                $this->cleanAuthor(
                    $this->authorsEtAl($meta['citation_authors'] ?? $meta['DC.Contributor'] ?? $meta['Author'] ?? null)
                )
            ),
            'volume' => $meta['citation_volume'] ?? null,
            'numéro' => $meta['citation_issue'] ?? null,
            'page' => $this->convertDCpage($meta),
            'doi' => $meta['citation_doi'] ?? $meta['DOI'] ?? null,
            'éditeur' => $meta['DC.publisher'] ?? $meta['dc.publisher'] ?? null, // messy on Persée, todo?
            'pmid' => $meta['citation_pmid'] ?? null,
            'issn' => $meta["citation_issn"] ?? null,
            'isbn' => $meta["citation_isbn"] ?? null,
            // "prism.eIssn" => "2262-7197"
        ];
    }

    /**
     * Tells whether predictBestTitle() flagged the title as coming from the HTML page
     * (no meta-data title and a non-empty 'html-title').
     */
    public function isTitleFromHtmlState(): bool
    {
        return $this->titleFromHtmlState;
    }

    /**
     * Predict the page title from meta-data, optionally falling back on HTML <title>/<h1>.
     * Todo: extract this logic to ExternConverterTrait or ExternPageTitlePredictor ?
     *
     * @param array $meta Meta tags; 'html-title' and 'html-h1' are optional.
     *
     * @return string|null The meta-data title only when HTML titles are not allowed,
     *                     otherwise the result of chooseBestTitle().
     */
    private function predictBestTitle(array $meta): ?string
    {
        // Computed once: the lookup does not depend on anything changed below.
        $metaTitle = $this->getBestTitleFromMetadata($meta);

        // "No HTML title" mode
        if (!$this->htmlTitleAllowed) {
            return $metaTitle;
        }

        // Missing keys are treated as null instead of triggering an "undefined array key" warning.
        $htmlTitle = $meta['html-title'] ?? null;
        $htmlH1 = $meta['html-h1'] ?? null;

        if (null === $metaTitle && !empty($htmlTitle)) {
            $this->titleFromHtmlState = true;
        }

        // Responsibility ?!! sanitize title here conflicts with ExternMapper:postprocess()
        return $this->chooseBestTitle($metaTitle, $htmlTitle, $htmlH1);
    }

    /**
     * Choose page's title from Open Graph, Twitter card or Dublin Core meta-data.
     * Priority: 'og:title', then 'twitter:title', then 'DC.title'; empty values are skipped.
     *
     * @return string|null null when none of the three keys holds a non-empty value.
     */
    private function getBestTitleFromMetadata(array $meta): ?string
    {
        if (!empty($meta['og:title'])) {
            return $meta['og:title'];
        }
        if (!empty($meta['twitter:title'])) {
            return $meta['twitter:title'];
        }
        if (!empty($meta['DC.title'])) {
            return $meta['DC.title'];
        }

        return null;
    }

    /**
     * Choose best title from meta-title, html-title and html-h1.
     * Title is sanitized in ExternMapper::postprocess()
     *
     * Priority:
     *  1. the meta-data title when not empty;
     *  2. the <h1> when it is contained in the <title> (the <title> often appends the site name — presumably);
     *  3. the <title>, then the <h1>, skipping null and empty strings.
     *
     * @return string|null null when no usable title is found.
     */
    public function chooseBestTitle(?string $metaTitle, ?string $htmlTitle, ?string $htmlH1): ?string
    {
        // clean all titles
        $metaTitle = $this->clean($metaTitle);
        $htmlTitle = $this->clean($htmlTitle);
        $htmlH1 = $this->clean($htmlH1);

        if (!empty($metaTitle)) {
            return $metaTitle;
        }

        // check if htmlh1 included in htmltitle, if yes use htmlh1
        if (!empty($htmlH1) && !empty($htmlTitle) && strpos($htmlTitle, $htmlH1) !== false) {
            return $htmlH1;
        }

        // `??` only skips null: an empty-string <title> would shadow a usable <h1>,
        // so empty strings are skipped explicitly.
        if (null !== $htmlTitle && '' !== $htmlTitle) {
            return $htmlTitle;
        }
        if (null !== $htmlH1 && '' !== $htmlH1) {
            return $htmlH1;
        }

        return null;
    }
}
174