Passed
Branch dev (1a31b5)
by Dispositif
03:03
created

WebMapper::clean()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 4
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 8
rs 10
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application
4
 * 2019 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the LICENSE file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Enums\Language;
13
use App\Domain\Utils\ArrayProcessTrait;
14
use App\Infrastructure\Logger;
15
use DateTime;
16
use Exception;
17
use Psr\Log\LoggerInterface;
18
19
/**
20
 * Generic mapper for press/journal articles published on the web.
21
 * Using JSON-LD and meta tags to obtain {article} data.
22
 * Class WebMapper
23
 *
24
 * @package App\Domain\Publisher
25
 */
26
class WebMapper implements MapperInterface
27
{
28
    use ArrayProcessTrait, WebOGMapperTrait, WebLDMapperTrait;
29
30
    /**
     * Logger injected through the constructor, which accepts any PSR-3 implementation.
     *
     * @var LoggerInterface
     */
    private $log;
34
35
    /**
     * Stores the injected PSR-3 logger on the instance.
     *
     * @param LoggerInterface $log
     */
    public function __construct(LoggerInterface $log)
36
    {
37
        $this->log = $log;
1 ignored issue
show
Documentation Bug introduced by
$log is of type Psr\Log\LoggerInterface, but the property $log was declared to be of type App\Infrastructure\Logger. Are you sure that you always receive this specific sub-class here, or does it make sense to add an instanceof check?

Our type inference engine has found a suspicous assignment of a value to a property. This check raises an issue when a value that can be of a given class or a super-class is assigned to a property that is type hinted more strictly.

Either this assignment is in error or an instanceof check should be added for that assignment.

class Alien {}

class Dalek extends Alien {}

class Plot
{
    /** @var  Dalek */
    public $villain;
}

$alien = new Alien();
$plot = new Plot();
if ($alien instanceof Dalek) {
    $plot->villain = $alien;
}
Loading history...
38
    }
39
40
    /**
     * Maps the raw extracted page data to article fields, then post-processes the result.
     *
     * @param mixed $data raw data; processMapping() reads its 'JSON-LD' and 'meta' keys
     *
     * @return array
     */
    public function process($data): array
    {
        return $this->postProcess($this->processMapping($data));
    }
46
47
    /**
     * Chooses the data source to map from and delegates to the matching mapper.
     *
     * Order of preference:
     *  1. $data['JSON-LD'] itself, when it passes checkJSONLD();
     *  2. the first array element of $data['JSON-LD'] that passes checkJSONLD();
     *  3. $data['meta'], mapped through mapLienwebFromOpenGraph();
     *  4. an empty array when nothing usable is present.
     *
     * A non-array 'JSON-LD' value is skipped: checkJSONLD() only accepts arrays,
     * so passing anything else would raise a TypeError under strict_types.
     *
     * @param mixed $data
     *
     * @return array
     */
    protected function processMapping($data): array
    {
        if (!empty($data['JSON-LD']) && is_array($data['JSON-LD'])) {
            $jsonLD = $data['JSON-LD'];

            if ($this->checkJSONLD($jsonLD)) {
                return $this->mapArticleDataFromJSONLD($jsonLD);
            }

            // Handle pages exposing several JSON-LD objects (e.g. Le Figaro): take the first valid one.
            foreach ($jsonLD as $candidate) {
                if (is_array($candidate) && $this->checkJSONLD($candidate)) {
                    return $this->mapArticleDataFromJSONLD($candidate);
                }
            }
        }

        if (!empty($data['meta'])) {
            // Dublin Core mapping included ;-)
            return $this->mapLienwebFromOpenGraph($data['meta']);
        }

        return [];
    }
67
68
    /**
     * Tells whether a JSON-LD object carries both keys needed for mapping:
     * 'headline' and '@type'. A key holding null counts as missing.
     *
     * @param array $jsonLD
     *
     * @return bool
     */
    protected function checkJSONLD(array $jsonLD): bool
    {
        return isset($jsonLD['headline'], $jsonLD['@type']);
    }
72
73
    /**
     * Final clean-up of the mapped data.
     *
     * Passes the data through deleteEmptyValueArray(), removes 'langue' when it is
     * exactly 'fr', and removes 'format' when 'site' is exactly 'Gallica'.
     *
     * todo Refac/move domain special mapping
     *
     * @param array $dat
     *
     * @return array
     */
    protected function postProcess(array $dat): array
    {
        $dat = $this->deleteEmptyValueArray($dat);

        $isFrench = 'fr' === ($dat['langue'] ?? null);
        if ($isFrench) {
            unset($dat['langue']);
        }

        $isGallica = 'Gallica' === ($dat['site'] ?? null);
        if ($isGallica) {
            // presumably Gallica pages report a misleading format such as "vidéo" — TODO confirm
            unset($dat['format']);
        }

        return $dat;
    }
93
94
    /**
     * Tells whether the given type string is 'article' or 'journalArticle'.
     *
     * @param string|null $str
     *
     * @return bool
     */
    protected function isAnArticle(?string $str): bool
    {
        // The argument is always string|null, so strict comparison gives the same result.
        return in_array($str, ['article', 'journalArticle'], true);
    }
102
103
    /**
     * Derives the access level of the article URL: 'ouvert' (free) or 'limité' (restricted).
     *
     * The first key that is set decides, in this order:
     *  - 'isAccessibleForFree' (seen on NYT, Figaro): truthy means free;
     *  - 'DC.rights': 'free' or 'public domain' means free;
     *  - 'og:article:content_tier': 'free' means free.
     *
     * 'isAccessibleForFree' is sometimes published as the string "False" rather than
     * a boolean; that string is truthy in PHP, so it is normalised to false first.
     *
     * @param mixed $data
     *
     * @return string|null null when none of the keys is set
     */
    protected function convertURLaccess($data): ?string
    {
        // NYT, Figaro
        if (isset($data['isAccessibleForFree'])) {
            $flag = $data['isAccessibleForFree'];
            if (is_string($flag) && 'false' === strtolower(trim($flag))) {
                $flag = false;
            }

            return $flag ? 'ouvert' : 'limité';
        }
        if (isset($data['DC.rights'])) {
            return (in_array($data['DC.rights'], ['free', 'public domain'])) ? 'ouvert' : 'limité';
        }
        if (isset($data['og:article:content_tier'])) {
            return ($data['og:article:content_tier'] === 'free') ? 'ouvert' : 'limité';
        }

        return null;
    }
118
119
    /**
     * Shortens a long author list.
     *
     * A list is "long" when it has at least three ';' separators, or failing that at
     * least three ',' separators, with non-empty text between them. For a long list:
     *  - with $modeEtAl false, only the first two separated items are returned;
     *  - with $modeEtAl true, 'oui' is returned (value for the "et al." parameter).
     * For any other non-empty list, the input is returned unchanged when $modeEtAl is
     * false, and null when it is true.
     *
     * NOTE(review): the previous comment said the first 3 authors are kept, but the
     * patterns capture only the first two — confirm which is intended.
     * TODO : wikifyPressAgency()
     *
     * @param string|null $authors
     * @param bool        $modeEtAl
     *
     * @return string|null null for an empty input
     */
    protected function authorsEtAl(?string $authors, $modeEtAl = false): ?string
    {
        if (empty($authors)) {
            return null;
        }

        // Tried in order; the first pattern that matches decides. TODO : refactor
        $longListPatterns = [
            // Bob, Martin ; Yul, Bar ; ... ; ...
            '#([^;]+;[^;]+);[^;]+;.+#',
            // Bob Martin, Yul Bar, ..., ...,...
            '#([^,]+,[^,]+),[^,]+,.+#',
        ];

        foreach ($longListPatterns as $pattern) {
            if (preg_match($pattern, $authors, $matches)) {
                return ($modeEtAl) ? 'oui' : $matches[1];
            }
        }

        return ($modeEtAl) ? null : $authors;
    }
146
147
    /**
     * Builds the page reference from citation meta tags.
     *
     * Returns 'citation_firstpage' alone, or "first–last" (en dash) when
     * 'citation_lastpage' is also set.
     *
     * @param array $meta
     *
     * @return string|null null when 'citation_firstpage' is not set
     */
    protected function convertDCpage(array $meta): ?string
    {
        if (!isset($meta['citation_firstpage'])) {
            return null;
        }

        $pages = $meta['citation_firstpage'];
        if (isset($meta['citation_lastpage'])) {
            $pages = $pages.'–'.$meta['citation_lastpage'];
        }

        return (string)$pages;
    }
160
161
    /**
     * Normalises a scraped text value.
     *
     * Applies these substitutions in order, then decodes the remaining HTML entities:
     * '&apos;' becomes an apostrophe, line feeds are removed, '&#10;' becomes a space
     * and '|' becomes '/'.
     *
     * @param string|null $str
     *
     * @return string|null null when the input is null
     */
    protected function clean(?string $str = null): ?string
    {
        if (null === $str) {
            return null;
        }

        // Insertion order is the replacement order used by str_replace().
        $substitutions = [
            '&apos;' => "'",
            "\n" => '',
            "&#10;" => ' ',
            "|" => '/',
        ];
        $replaced = str_replace(array_keys($substitutions), array_values($substitutions), $str);

        return html_entity_decode($replaced);
    }
170
171
    /**
     * Converts an Open Graph type into a format label.
     *
     * og:type = default: website / video.movie / video.tv_show video.other / article, book, profile
     * A type containing 'video' gives 'vidéo'; otherwise one containing 'book' gives 'livre'.
     *
     * @param string|null $ogType
     *
     * @return string|null null for an empty type or when neither keyword is found
     */
    protected function convertOGtype2format(?string $ogType): ?string
    {
        if (empty($ogType)) {
            return null;
        }

        // Checked in order: 'video' takes precedence over 'book'.
        $formatByKeyword = [
            'video' => 'vidéo',
            'book' => 'livre',
        ];
        foreach ($formatByKeyword as $keyword => $format) {
            if (false !== strpos($ogType, $keyword)) {
                return $format;
            }
        }

        return null;
    }
186
187
    /**
     * Converts a language or locale code.
     *
     * A locale written as two lowercase letters, an underscore and two uppercase
     * letters (e.g. en_GB) is reduced to its two-letter language part. Anything else
     * is passed to Language::all2wiki().
     *
     * https://developers.facebook.com/docs/internationalization#locales
     *
     * @param string|null $lang
     *
     * @return string|null null for an empty input
     */
    protected function convertLangue(?string $lang = null): ?string
    {
        if (empty($lang)) {
            return null;
        }

        // en_GB
        $isLocale = 1 === preg_match('#^([a-z]{2})_[A-Z]{2}$#', $lang, $matches);

        return $isLocale ? $matches[1] : Language::all2wiki($lang);
    }
202
203
    /**
     * Extracts the HTML-entity-decoded name of the author at position $indice.
     *
     * Supported shapes of $data['author']:
     *  - a plain string ("Bob"): returned only when $indice === 1;
     *  - a single object (['name' => 'Bob', '@type' => 'Person']): used only when $indice === 0;
     *  - a list of objects ([0 => ['name' => 'Bob'], 1 => ...]): entry $indice is used.
     * Objects whose '@type' is set to anything other than 'Person' are ignored.
     * When 'name' is itself a list, its first element is used.
     *
     * Returns null for every shape that cannot yield a string name (string author with
     * another index, non-array list entry, missing 'name', empty 'name' list) rather
     * than reading illegal offsets or passing null to html_entity_decode().
     *
     * @param mixed $data
     * @param mixed $indice
     *
     * @return string|null
     */
    protected function convertAuteur($data, $indice)
    {
        if (!isset($data['author'])) {
            return null;
        }
        $author = $data['author'];

        // author=Bob
        if (is_string($author)) {
            return (1 === $indice) ? html_entity_decode($author) : null;
        }
        if (!is_array($author)) {
            return null;
        }

        // Accepts 'name' as a string or as a list whose first element is a string.
        $decodeName = static function ($name): ?string {
            if (is_array($name)) {
                $name = $name[0] ?? null;
            }

            return is_string($name) ? html_entity_decode($name) : null;
        };

        // An entry without '@type' is treated like a person.
        $isPerson = static function (array $entry): bool {
            return !isset($entry['@type']) || 'Person' === $entry['@type'];
        };

        // author ['name'=>'Bob','@type'=>'Person']
        if (0 === $indice && isset($author['name']) && $isPerson($author)) {
            return $decodeName($author['name']);
        }

        // author [ 0 => ['name'=>'Bob'], 1=> ...]
        if (isset($author[$indice]) && is_array($author[$indice]) && $isPerson($author[$indice])) {
            // "author" => [ "@type" => "Person", "name" => [] ] yields null
            return $decodeName($author[$indice]['name'] ?? null);
        }

        return null;
    }
239
240
    /**
     * Extracts the HTML-entity-decoded name of an institutional (non-person) author.
     *
     * Only the first entry of $data['author'] is considered, and only when it has an
     * '@type' other than 'Person'. Returns null when that entry has no string 'name',
     * so that html_entity_decode() is never called with a non-string.
     *
     * @param mixed $data
     *
     * @return string|null
     */
    protected function convertInstitutionnel($data)
    {
        $first = $data['author'][0] ?? null;

        if (!is_array($first) || !isset($first['@type']) || 'Person' === $first['@type']) {
            return null;
        }

        $name = $first['name'] ?? null;

        return is_string($name) ? html_entity_decode($name) : null;
    }
250
251
    /**
     * Normalises a date string.
     *
     * A bare four-digit year starting with 1 or 2 (e.g. "2012") is returned as is.
     * Any other value is parsed with DateTime and formatted as 'd-m-Y'. When parsing
     * fails, the failure is logged and the input is returned unchanged.
     *
     * @param string|null $str
     *
     * @return string|null null for an empty input
     */
    protected function convertDate(?string $str): ?string
    {
        if (empty($str)) {
            return null;
        }

        // "2012"
        if (preg_match('#^[12][0-9]{3}$#', $str)) {
            return $str;
        }

        try {
            $date = new DateTime($str);
        } catch (Exception $e) {
            // Report through the injected logger rather than a debug dump() call.
            if ($this->log instanceof LoggerInterface) {
                $this->log->warning('EXCEPTION DATE', ['date' => $str, 'message' => $e->getMessage()]);
            }

            return $str;
        }

        return $date->format('d-m-Y');
    }
277
278
    /**
     * Turns press agency names and acronyms into wikilinks.
     *
     * A string that already contains '[' is returned untouched, to skip potential
     * wikilinks. Otherwise the replacements below are applied one after another, in
     * the listed order.
     *
     * @param string|null $str
     *
     * @return string|null null for an empty input
     */
    protected function wikifyPressAgency(?string $str): ?string
    {
        if (empty($str)) {
            return null;
        }
        // skip potential wikilinks
        if (false !== strpos($str, '[')) {
            return $str;
        }

        // Each step: [uses a regex?, search, replacement].
        $steps = [
            [true, '#\b(AFP)\b#i', '[[Agence France-Presse|AFP]]'],
            [false, 'Reuters', '[[Reuters]]'],
            [false, 'Associated Press', '[[Associated Press]]'],
            [true, '#\b(PA)\b#', '[[Press Association|PA]]'],
            [true, '#\b(AP)\b#', '[[Associated Press|AP]]'],
            [false, 'Xinhua', '[[Xinhua]]'],
            [true, '#\b(ATS)\b#', '[[Agence télégraphique suisse|ATS]]'],
            [true, '#\b(PC|CP)\b#', '[[La Presse canadienne|PC]]'],
        ];

        foreach ($steps as [$isRegex, $search, $replacement]) {
            $str = $isRegex
                ? preg_replace($search, $replacement, $str)
                : str_replace($search, $replacement, $str);
        }

        return $str;
    }
305
}
306