Scrapper::initializeData()   B
last analyzed

Complexity

Conditions 10
Paths 58

Size

Total Lines 56
Code Lines 31

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 10
eloc 31
nc 58
nop 4
dl 0
loc 56
rs 7.6666
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the long method into its own, well-named private method.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace ReliqArts\Scavenger\Service;
6
7
use Exception;
8
use Illuminate\Console\Command;
9
use Illuminate\Database\QueryException;
10
use Illuminate\Support\Collection;
11
use InvalidArgumentException;
12
use JsonException;
13
use Psr\Log\LoggerInterface;
14
use ReliqArts\Scavenger\Helper\FormattedMessage;
15
use ReliqArts\Scavenger\Helper\TargetKey;
16
use ReliqArts\Scavenger\Model\Scrap;
17
use ReliqArts\Scavenger\Model\Target;
18
use ReliqArts\Scavenger\TitleLink;
19
use Symfony\Component\DomCrawler\Crawler;
20
21
/**
 * Gathers scrap data from crawled pages and turns it into Scrap models.
 *
 * Every collected page becomes an associative data array (title, special keys and the
 * attributes described by the target's markup). That array is preprocessed, checked for
 * bad words, hashed and wrapped in a Scrap model, which can later be saved or converted.
 */
class Scrapper extends Communicator
{
    /** Placeholder stored when markup points at a special key that holds no value. */
    private const ERROR_INVALID_SPECIAL_KEY = '!#ERR - S.Key not valid!';
    private const KEY_PREFIX = ConfigProvider::SPECIAL_KEY_PREFIX;
    private const KEY_TARGET = self::KEY_PREFIX . 'target';
    private const KEY_ID = self::KEY_PREFIX . 'id';
    private const KEY_SERP_RESULT = self::KEY_PREFIX . 'serp_result';

    /** Charset every attribute value is normalized to. */
    private const ENCODE_CHARSET = 'UTF-8';

    /**
     * Finalized data arrays, keyed by scrap id (hash).
     *
     * @var array[]
     */
    private array $raw;
    private LoggerInterface $logger;

    /**
     * Number of built scraps that did not exist yet when they were built.
     */
    private int $newScrapsCount;

    /**
     * Objects returned by scrap conversion.
     */
    private Collection $relatedObjects;
    private Scanner $scanner;

    /**
     * Scraps found so far.
     */
    private Collection $scraps;

    /**
     * Scrapper constructor.
     *
     * @param LoggerInterface $logger         receives warnings, notices and errors raised while scrapping
     * @param null|Command    $callingCommand handed to the parent communicator
     * @param string          $hashAlgorithm  algorithm name passed to hash() when building scrap ids
     * @param int             $verbosity      compared against self::VERBOSITY_HIGH to decide how much detail to emit
     */
    public function __construct(
        LoggerInterface $logger,
        ?Command $callingCommand = null,
        string $hashAlgorithm = self::HASH_ALGORITHM,
        int $verbosity = self::VERBOSITY_LOW
    ) {
        parent::__construct($callingCommand);

        $this->raw = [];
        $this->logger = $logger;
        $this->scanner = new Scanner();
        $this->hashAlgorithm = $hashAlgorithm;
        $this->scraps = collect([]);
        $this->relatedObjects = collect([]);
        $this->newScrapsCount = 0;
        $this->verbosity = $verbosity;
    }

    /**
     * Collect scrap array from crawler using markup.
     *
     * The data is initialized, preprocessed and verified; data that fails verification is
     * dropped silently. Accepted data is finalized, remembered by id and pushed as a Scrap.
     *
     * @throws JsonException
     */
    public function collect(TitleLink $titleLink, Target $target, Crawler $crawler): void
    {
        $markup = $target->getMarkup();
        $data = $this->initializeData(
            $titleLink,
            $target,
            $crawler,
            $markup[TargetKey::special(TargetKey::MARKUP_INSIDE)] ?? []
        );
        $data = $this->preprocess($data, $target);

        if (!$this->verifyData($data, $target)) {
            return;
        }

        $data = $this->finalizeData($data, $target);

        $this->raw[$data[self::KEY_ID]] = $data;
        $this->scraps->push($this->buildScrapFromData($data));

        // feedback; the full payload is only appended at high verbosity
        $details = $this->verbosity >= self::VERBOSITY_HIGH
            ? '-- ' . json_encode($data, JSON_THROW_ON_ERROR)
            : '';
        $this->tell(FormattedMessage::get(FormattedMessage::SCRAP_GATHERED, $data[self::KEY_ID]) . $details);
    }

    /**
     * Convert every gathered scrap and remember the non-null objects the conversions return.
     *
     * A QueryException raised by a conversion is logged as a warning and does not stop
     * the remaining scraps from being converted.
     */
    public function convertScraps(bool $convertDuplicates = false, bool $storeRelatedReferences = false): void
    {
        // each() rather than map(): the callback is run for its side effects only
        $this->scraps->each(
            function (Scrap $scrap) use ($convertDuplicates, $storeRelatedReferences): void {
                try {
                    $relatedObject = $scrap->convert($convertDuplicates, $storeRelatedReferences);
                    if ($relatedObject !== null) {
                        $this->relatedObjects->push($relatedObject);
                    }
                } catch (QueryException $e) {
                    $this->logger->warning($e, ['scrap' => $scrap]);
                }
            }
        );
    }

    /**
     * Number of scraps built so far that were not already existing records.
     */
    public function getNewScrapsCount(): int
    {
        return $this->newScrapsCount;
    }

    /**
     * Scraps gathered so far.
     */
    public function getScraps(): Collection
    {
        return $this->scraps;
    }

    /**
     * Save every gathered scrap.
     *
     * A QueryException raised by a save is reported via tell() and logged as an error
     * (with the scrap JSON appended at high verbosity); remaining scraps are still saved.
     */
    public function saveScraps(): void
    {
        // each() rather than map(): the callback is run for its side effects only
        $this->scraps->each(
            function (Scrap $scrap): void {
                try {
                    $scrap->save();
                } catch (QueryException $e) {
                    $errorMessage = FormattedMessage::get(
                        FormattedMessage::SCRAP_SAVE_EXCEPTION,
                        $scrap->hash,
                        $e->getMessage()
                    );
                    if ($this->verbosity >= self::VERBOSITY_HIGH) {
                        $errorMessage .= ' -- ' . $scrap->toJson();
                    }

                    $this->tell($errorMessage);
                    $this->logger->error($errorMessage);
                }
            }
        );
    }

    /**
     * Retrieve related objects which are a result of scrap conversion.
     */
    public function getRelatedObjects(): Collection
    {
        return $this->relatedObjects;
    }

    /**
     * Initialize data.
     *
     * Seeds the data with the title and the link/position/source/model special keys, then
     * fills one entry per markup attribute: a markup value that is itself a special key is
     * copied from the seeded data, any other non-special attribute is read from the crawler.
     *
     * @param array $markupOverride markup to use instead of the target's own markup when not empty
     *
     * @noinspection PhpTooManyParametersInspection
     */
    private function initializeData(
        TitleLink $titleLink,
        Target $target,
        Crawler $crawler,
        array $markupOverride = []
    ): array {
        $markup = !empty($markupOverride) ? $markupOverride : $target->getMarkup();

        $data = [
            TargetKey::TITLE => $titleLink->getTitle(),
            TargetKey::special(TargetKey::LINK) => $titleLink->getLink(),
            TargetKey::special(TargetKey::POSITION) => $target->getCursor(),
            TargetKey::special(TargetKey::SOURCE) => $titleLink->getLink(),
            TargetKey::special(TargetKey::MODEL) => $target->getModel(),
        ];

        // build initial scrap data from markup and dissect
        foreach ($markup as $attr => $path) {
            if (ConfigProvider::isSpecialKey($path)) {
                // path is special key, use special key value from scrap
                $data[$attr] = !empty($data[$path]) ? $data[$path] : self::ERROR_INVALID_SPECIAL_KEY;

                continue;
            }

            if (ConfigProvider::isSpecialKey($attr)) {
                // special attributes carry configuration, not a path to crawl
                continue;
            }

            $data = $this->extractAttribute($data, $attr, $path, $target, $crawler);
        }

        return $data;
    }

    /**
     * Read a single attribute from the crawler into the data and dissect it if configured.
     *
     * An InvalidArgumentException raised along the way is reported and logged as a warning;
     * whatever had been written to the data before the exception is kept.
     *
     * @param array $data data gathered so far
     * @param mixed $attr attribute name (markup key)
     * @param mixed $path value the crawler is filtered with for this attribute
     *
     * @return array data including the extracted attribute and any plucked details
     */
    private function extractAttribute(array $data, $attr, $path, Target $target, Crawler $crawler): array
    {
        try {
            $attrCrawler = $crawler->filter($path);
            $data[$attr] = $attr === TargetKey::TITLE ? $attrCrawler->text() : $attrCrawler->html();

            // split single attributes into multiple based on regex
            $dissectMap = $target->getDissect()[$attr] ?? null;
            if (!empty($dissectMap)) {
                // check _retain meta property
                // to determine whether details should be left in source attribute after extraction
                $retainKey = TargetKey::special(TargetKey::RETAIN);
                $retain = !empty($dissectMap[$retainKey]) ? $dissectMap[$retainKey] : false;
                unset($dissectMap[$retainKey]);

                // Extract details into scrap
                $data = array_merge($data, Scanner::pluckDetails($data[$attr], $dissectMap, $retain));
            }
        } catch (InvalidArgumentException $e) {
            $exMessage = FormattedMessage::get(
                FormattedMessage::EXCEPTION_THROWN_FOR_ATTRIBUTE,
                $attr,
                $target->getName(),
                $e->getMessage()
            );
            $this->tell($exMessage);
            $this->logger->warning($exMessage, [$path]);
        }

        return $data;
    }

    /**
     * Encode, preprocess and remap scrap data parts.
     *
     * Every value is first normalized by encodeAttribute(). Attributes with a configured
     * preprocessor are passed through it, and attributes with a configured remap are copied
     * to their new name (the original is removed unless it is the title).
     */
    private function preprocess(array $data, Target $target): array
    {
        foreach (array_keys($data) as $attr) {
            $data[$attr] = $this->encodeAttribute($data, $attr);

            // preprocess
            if (!empty($target->getPreprocess()[$attr])) {
                $data[$attr] = $this->applyPreprocessor($data[$attr], $attr, $target);
            }

            // remap entity attribute name if specified
            $newAttrName = $target->getRemap()[$attr] ?? null;
            if (!empty($newAttrName)) {
                // NOTE(review): empty() also maps the string '0' to null here — confirm that is intended
                $data[$newAttrName] = !empty($data[$attr]) ? $data[$attr] : null;

                if ($attr !== TargetKey::TITLE) {
                    unset($data[$attr]);
                }
            }
        }

        return $data;
    }

    /**
     * Run the target's preprocessor for an attribute on the given value.
     *
     * @param mixed $value current attribute value
     * @param mixed $attr  attribute name
     *
     * @return mixed the preprocessor's result, or the unchanged value when the preprocessor
     *               is not callable or throws
     */
    private function applyPreprocessor($value, $attr, Target $target)
    {
        $preprocess = $target->getPreprocess()[$attr];

        // check for optional third parameter of array, which indicates that callable method needs an instance
        if (is_array($preprocess) && isset($preprocess[2])) {
            // if callable needs instance, resolve object
            if ($preprocess[2]) {
                $preprocess[0] = resolve($preprocess[0]);
            }
            unset($preprocess[2]);
        }

        if (!is_callable($preprocess)) {
            $this->logger->warning(
                FormattedMessage::get(
                    FormattedMessage::PREPROCESS_NOT_CALLABLE,
                    $attr,
                    $target->getName()
                )
            );

            return $value;
        }

        try {
            return $preprocess($value);
        } catch (Exception $e) {
            $this->logger->error($e);

            return $value;
        }
    }

    /**
     * Cast an attribute value to string and normalize it to self::ENCODE_CHARSET.
     *
     * The title is additionally lowercased and then given upper-case word initials.
     */
    private function encodeAttribute(array $data, string $attr): string
    {
        $attributeText = $this->toEncodeCharset((string)$data[$attr]);

        // ensure title has UC words
        if ($attr === TargetKey::TITLE) {
            // no utf8_encode() here: it treats its input as ISO-8859-1 and would
            // double-encode text that has already been normalized to UTF-8
            return ucwords(mb_strtolower($attributeText, self::ENCODE_CHARSET));
        }

        return $attributeText;
    }

    /**
     * Convert text to self::ENCODE_CHARSET.
     *
     * When the source encoding cannot be detected, or the conversion fails, ill-formed
     * byte sequences are replaced via mb_scrub() so a string is always returned.
     */
    private function toEncodeCharset(string $text): string
    {
        $detectedEncoding = mb_detect_encoding($text, mb_detect_order(), true);
        if ($detectedEncoding === false) {
            return mb_scrub($text, self::ENCODE_CHARSET);
        }

        $converted = iconv($detectedEncoding, self::ENCODE_CHARSET, $text);

        return $converted === false ? mb_scrub($text, self::ENCODE_CHARSET) : $converted;
    }

    /**
     * Check the data against the target's bad words.
     *
     * @return bool false when the scanner reports a bad word (the finding is logged as a
     *              notice and, at high verbosity, also told); true otherwise
     *
     * @throws JsonException
     */
    private function verifyData(array $data, Target $target): bool
    {
        if (!$this->scanner->hasBadWords($data, $target->getBadWords())) {
            return true;
        }

        $badWordMessage = sprintf(
            FormattedMessage::SCRAP_CONTAINS_BAD_WORD,
            json_encode($data, JSON_THROW_ON_ERROR)
        );
        $this->logger->notice($badWordMessage);
        if ($this->verbosity >= self::VERBOSITY_HIGH) {
            $this->tell($badWordMessage);
        }

        return false;
    }

    /**
     * Finalize scrap.
     *
     * The id is the hash of the JSON-encoded data as it is before the id, SERP flag and
     * target name are added. The result is sorted by key.
     *
     * @throws JsonException
     */
    private function finalizeData(array $data, Target $target): array
    {
        $data[self::KEY_ID] = hash($this->hashAlgorithm, json_encode($data, JSON_THROW_ON_ERROR));
        $data[self::KEY_SERP_RESULT] = $target->isSearchEngineRequestPages();
        $data[self::KEY_TARGET] = $target->getName();

        ksort($data);

        return $data;
    }

    /**
     * Build the Scrap for finalized data via Scrap::firstOrNew(), keyed by the data's id hash.
     *
     * Increments the new-scrap counter when the resulting scrap does not exist yet.
     *
     * @throws JsonException
     */
    private function buildScrapFromData(array $data): Scrap
    {
        $scrap = Scrap::firstOrNew(
            [
                'hash' => $data[self::KEY_ID],
            ],
            [
                'model' => $data[TargetKey::special(TargetKey::MODEL)],
                'source' => $data[TargetKey::special(TargetKey::SOURCE)],
                'title' => $data[TargetKey::TITLE],
                // throw instead of silently storing false when encoding fails
                'data' => json_encode($data, JSON_THROW_ON_ERROR),
            ]
        );

        if (!$scrap->exists) {
            ++$this->newScrapsCount;
        }

        return $scrap;
    }
}
338