ExternMapper::processJsonLDMapping()   A
last analyzed

Complexity

Conditions 5
Paths 4

Size

Total Lines 13
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 5

Importance

Changes 0
Metric Value
cc 5
eloc 6
c 0
b 0
f 0
nc 4
nop 1
dl 0
loc 13
ccs 2
cts 2
cp 1
crap 5
rs 9.6111
1
<?php
2
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Utils\ArrayProcessTrait;
13
use App\Domain\Utils\TextUtil;
14
use Psr\Log\LoggerInterface;
15
16
/**
 * Generic mapper from web page data to wiki-template reference parameters.
 *
 * Targets press/revue articles on the web and converts them
 * to {article}, {lien web} or {lien brisé}.
 * Uses JSON-LD, Open Graph and Dublin Core meta extracted from HTML.
 */
class ExternMapper implements MapperInterface
{
    use ArrayProcessTrait;

    // Added if title extracted from HTML <title> or <h1>
    final public const TITLE_TO_VERIFY_COMMENT = '<!-- Vérifiez ce titre -->';
    // if title extracted from json-ld, or anything else
    final public const TITLE_VERY_MAX_LENGTH = 150;
    // if title extracted from HTML <title> or <h1> is too long, it's probably SEO
    final public const TITLE_HTML_MAX_LENGTH = 80;
    // if title contains too many all-caps words, it's probably SEO
    final public const TITLE_MAX_ALLCAPS = 2;
    // if site name extracted for meta data is too long, it's probably SEO
    final public const SITE_MAX_LENGTH = 40;
    final public const SITE_MAX_ALLCAPS = 1;

    // HTML-extracted titles shorter than this are kept as-is (no cut, no "verify" comment)
    private const TITLE_VERIFY_MIN_LENGTH = 30;

    // True only when the title of the map returned by processMapping() was taken from HTML
    private bool $titleFromHtmlState = false;

    /**
     * @param mixed[] $options forwarded unchanged to OpenGraphMapper
     */
    public function __construct(private readonly LoggerInterface $log, private readonly ?array $options = [])
    {
    }

    /**
     * Map raw page data and sanitize the result.
     *
     * @param mixed $data array-like structure read through the 'JSON-LD' and 'meta' keys
     *
     * @return array sanitized mapping, or [] when nothing could be mapped
     */
    public function process($data): array
    {
        $mapped = $this->processMapping($data);
        if ($mapped === []) {
            return [];
        }

        return $this->postProcess($mapped);
    }

    /**
     * Build the mapping from JSON-LD when usable, otherwise from the meta tags.
     *
     * When the JSON-LD mapping is used, only 'langue' (if missing) and 'accès url'
     * are taken from the meta mapping.
     *
     * @param mixed $data array-like structure read through the 'JSON-LD' and 'meta' keys
     */
    protected function processMapping($data): array
    {
        $this->titleFromHtmlState = false;

        $mapJson = [];
        if (!empty($data['JSON-LD'])) {
            $mapJson = $this->processJsonLDMapping($data['JSON-LD']);
        }

        $mapMeta = [];
        $metaTitleFromHtml = false;
        if (!empty($data['meta'])) {
            $openGraphMapper = new OpenGraphMapper($this->options); // todo inject/extract to reduce instantiations ?
            $mapMeta = $openGraphMapper->process($data['meta']);
            $metaTitleFromHtml = $openGraphMapper->isTitleFromHtmlState();
        }

        if ($mapJson === []) {
            // The meta mapping is returned as-is, so its title origin applies.
            $this->titleFromHtmlState = $metaTitleFromHtml;

            return $mapMeta;
        }

        // From here the JSON-LD mapping is returned: no title is copied from meta,
        // so the "title from HTML" flag must stay false (otherwise a JSON-LD title
        // would be cut to TITLE_HTML_MAX_LENGTH and tagged with the verify comment).

        // Language may be missing from JSON-LD; a blind array_merge would be risky (duplicates)
        if (!isset($mapJson['langue']) && isset($mapMeta['langue'])) {
            $mapJson['langue'] = $mapMeta['langue'];
            $mapJson['DATA-TYPE'] = 'JSON-LD+META';
        }
        // Take "accès url" from OpenGraph (it prevails over JSON-LD 'isAccessibleForFree')
        if (isset($mapMeta['accès url'])) {
            $mapJson['accès url'] = $mapMeta['accès url'];
            $mapJson['DATA-TYPE'] = 'JSON-LD+META';
        }

        return $mapJson;
    }

    /**
     * Map the first usable JSON-LD object: either $LDdata itself or one of its array items.
     *
     * todo move to mapper ?
     *
     * @return array JsonLDMapper result, or [] when no object passes checkJSONLD()
     */
    private function processJsonLDMapping(array $LDdata): array
    {
        if ($this->checkJSONLD($LDdata)) {
            return (new JsonLDMapper())->process($LDdata);
        }

        // Handle pages exposing several JSON-LD objects (e.g. Figaro)
        foreach ($LDdata as $candidate) {
            if (is_array($candidate) && $this->checkJSONLD($candidate)) {
                return (new JsonLDMapper())->process($candidate);
            }
        }

        return [];
    }

    /**
     * A JSON-LD object is considered usable when it has both 'headline' and '@type' set.
     */
    protected function checkJSONLD(array $jsonLD): bool
    {
        return isset($jsonLD['headline'], $jsonLD['@type']);
    }

    /**
     * Data sanitization.
     * todo complexity/conditions
     * todo Config parameter for post-process
     */
    protected function postProcess(array $data): array
    {
        $data = $this->deleteEmptyValueArray($data);

        // 'fr' language is dropped from the result
        if (isset($data['langue']) && 'fr' === $data['langue']) {
            unset($data['langue']);
        }

        $data = $this->sanitizeSite($data);

        return $this->sanitizeTitle($data);
    }

    /**
     * Clean the 'site' value: Gallica format quirk, all-caps names, over-long names.
     */
    private function sanitizeSite(array $data): array
    {
        // Gallica reports "vidéo" for digitized books, so the format is dropped
        if (isset($data['site']) && $data['site'] === 'Gallica') {
            unset($data['format']);
        }

        if (isset($data['site']) && TextUtil::countAllCapsWords($data['site']) > self::SITE_MAX_ALLCAPS) {
            $this->log->debug('lowercase site name');
            $data['site'] = TextUtil::mb_ucfirst(mb_strtolower((string) $data['site']));
        }

        // SEO : cut site name if too long if no domain.name and no wiki link
        if (
            isset($data['site'])
            && !str_contains((string) $data['site'], '.')
            && !str_contains((string) $data['site'], '[[')
        ) {
            $data['site'] = TextUtil::cutTextOnSpace($data['site'], self::SITE_MAX_LENGTH);
        }

        return $data;
    }

    /**
     * Clean the 'titre' value: all-caps titles, maximum length, optional verify comment.
     */
    private function sanitizeTitle(array $data): array
    {
        // lowercase title if too many ALLCAPS words
        if (isset($data['titre']) && TextUtil::countAllCapsWords($data['titre']) > self::TITLE_MAX_ALLCAPS) {
            $this->log->debug('lowercase title');
            $data['titre'] = TextUtil::mb_ucfirst(mb_strtolower((string) $data['titre']));
        }

        // title is limited to TITLE_VERY_MAX_LENGTH chars by TextUtil::cutTextOnSpace()
        if (isset($data['titre'])) {
            $data['titre'] = TextUtil::cutTextOnSpace($data['titre'], self::TITLE_VERY_MAX_LENGTH);
            $data['titre'] = $this->addVerifyCommentIfNecessary($data['titre']);
        }

        return $data;
    }

    /**
     * Cut an HTML-extracted title to TITLE_HTML_MAX_LENGTH and append the verify comment.
     * Titles that are empty, shorter than TITLE_VERIFY_MIN_LENGTH, or not taken
     * from HTML are returned unchanged.
     *
     * todo Create a {titre à vérifier} template ?
     */
    private function addVerifyCommentIfNecessary(?string $title): ?string
    {
        if (
            empty($title)
            || mb_strlen($title) < self::TITLE_VERIFY_MIN_LENGTH
            || !$this->titleFromHtmlState
        ) {
            return $title;
        }

        /** @noinspection PhpRedundantOptionalArgumentInspection */
        $title = TextUtil::cutTextOnSpace($title, self::TITLE_HTML_MAX_LENGTH);

        return $title . self::TITLE_TO_VERIFY_COMMENT;
    }
}
171