Passed
Pull Request — main (#16)
by Stefano
12:56
created

ImportSitemapCommand::loadBlackList()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 5
c 1
b 0
f 0
nc 3
nop 1
dl 0
loc 10
rs 10
1
<?php
2
declare(strict_types=1);
3
4
/**
5
 * BEdita Brevia plugin
6
 *
7
 * Copyright 2024 Atlas Srl
8
 */
9
namespace Brevia\BEdita\Command;
10
11
use BEdita\Core\Utility\LoggedUser;
12
use Brevia\BEdita\Client\BreviaClient;
13
use Brevia\BEdita\Utility\ReadCSVTrait;
14
use Cake\Command\Command;
15
use Cake\Console\Arguments;
16
use Cake\Console\ConsoleIo;
17
use Cake\Console\ConsoleOptionParser;
18
use Cake\Log\LogTrait;
19
use Cake\ORM\Table;
20
use Cake\Utility\Hash;
21
22
/**
 * Import links from a sitemap into a Brevia collection.
 *
 * The command reads a sitemap (file path or URL), looks up the target collection
 * through the Brevia API, creates a `Links` entity for every sitemap URL that is
 * not already related to the collection, not black-listed and (optionally) matching
 * a prefix, then relates the new links to the collection via `has_documents`.
 *
 * @property \BEdita\Core\Model\Table\ObjectsTable $Collections
 */
class ImportSitemapCommand extends Command
{
    use LogTrait;
    use ReadCSVTrait;

    /**
     * Brevia API client
     *
     * @var \Brevia\BEdita\Client\BreviaClient
     */
    protected BreviaClient $client;

    /**
     * Links Table
     *
     * @var \Cake\ORM\Table
     */
    protected Table $Links;

    /**
     * @inheritDoc
     */
    public $defaultTable = 'Collections';

    /**
     * @inheritDoc
     */
    protected function buildOptionParser(ConsoleOptionParser $parser): ConsoleOptionParser
    {
        return $parser->addOption('sitemap', [
                'help' => 'File path or URL of sitemap to import',
                'short' => 's',
                'required' => true,
            ])
            ->addOption('prefix', [
                'help' => 'Optional path prefix of URLs to import',
                'short' => 'p',
                'required' => false,
            ])
            ->addOption('black-list', [
                'help' => 'Path to a black list file containing URLs to exclude - txt file with one URL per line',
                'short' => 'b',
                'required' => false,
            ])
            ->addOption('collection', [
                'help' => 'Collection used to index (use the unique collection name)',
                'short' => 'c',
                'required' => true,
            ]);
    }

    /**
     * @inheritDoc
     */
    public function initialize(): void
    {
        $this->client = new BreviaClient();
        $this->Links = $this->fetchTable('Links');
    }

    /**
     * Import sitemap URLs as links of the requested collection.
     *
     * Aborts (via `ConsoleIo::abort()`) when the sitemap cannot be read, the
     * collection is not found, the black list file is missing/unreadable or the
     * sitemap contains no URLs.
     *
     * @param \Cake\Console\Arguments $args The command arguments.
     * @param \Cake\Console\ConsoleIo $io The console io
     * @return int|null
     */
    public function execute(Arguments $args, ConsoleIo $io)
    {
        $sitemap = $args->getOption('sitemap');
        $content = '';
        if (!empty($sitemap)) {
            $content = file_get_contents($sitemap);
            if ($content === false) {
                $io->abort(sprintf('Error reading sitemap file: %s', $sitemap));
            }
        }

        // Resolve the collection id through the Brevia API, using the collection name.
        $name = $args->getOption('collection');
        $response = $this->client->get('/collections', compact('name'));
        $collectionId = Hash::get($response->getJson(), '0.cmetadata.id');
        if (empty($collectionId)) {
            $io->abort(sprintf('Collection not found: %s', $name));
        }
        $collection = $this->Collections->get($collectionId, ['contain' => ['HasDocuments']]);

        // URLs of documents already related to the collection; each related object is
        // reloaded through its own table before reading `url`
        // (presumably to obtain the concrete entity type - confirm).
        $currentUrls = array_filter(array_map(
            function ($link) {
                $link = $link->getTable()->get($link->id);

                return $link->get('url');
            },
            (array)$collection->get('has_documents')
        ));
        $prefix = $args->getOption('prefix');
        // Pass the console io explicitly: this class never sets `$this->io`.
        $blackList = $this->loadBlackList((string)$args->getOption('black-list'), $io);

        $urls = $this->extractSitemapUrls((string)$content);
        if (empty($urls)) {
            $io->abort('No URLs found in sitemap');
        }

        // Loop-invariant: read the per-collection link options once.
        $linkLoadOptions = (array)$collection->get('link_load_options');
        $entities = [];
        LoggedUser::setUserAdmin();
        foreach ($urls as $url) {
            if (
                in_array($url, $currentUrls, true) ||
                in_array(urldecode($url), $currentUrls, true) ||
                in_array($url, $blackList, true) ||
                ($prefix && strpos($url, $prefix) !== 0)
            ) {
                continue;
            }
            $io->info('Adding link: ' . $url);
            $linkData = [
                'status' => 'on',
                'title' => $url,
                'url' => $url,
                'extra' => [
                    'brevia' => [
                        'metadata' => [
                            'type' => 'links',
                            'url' => $url,
                        ],
                        'options' => $this->linkOptions($url, $linkLoadOptions),
                    ],
                ],
            ];
            $entity = $this->Links->newEntity($linkData);
            $entities[] = $this->Links->saveOrFail($entity);
            // Remember the URL so a sitemap listing it twice does not create a duplicate link.
            $currentUrls[] = $url;
        }
        // @phpstan-ignore-next-line
        $this->Collections->addRelated($collection, 'has_documents', $entities);

        $io->out('Done. Link added successfully: ' . count($entities));

        return null;
    }

    /**
     * Extract the `<loc>` value of every `<url>` element of a sitemap.
     *
     * SimpleXML nodes are iterated directly instead of going through a JSON
     * round-trip: with JSON a sitemap holding a single `<url>` decodes to a map
     * rather than a list, and empty/CDATA `<loc>` elements decode to non-string values.
     *
     * @param string $content Sitemap XML content
     * @return string[] Trimmed, non-empty URLs in document order; empty on blank or unparsable content
     */
    protected function extractSitemapUrls(string $content): array
    {
        if (trim($content) === '') {
            return [];
        }
        $xml = simplexml_load_string($content);
        if ($xml === false) {
            return [];
        }
        $urls = [];
        foreach ($xml->url as $node) {
            $loc = trim((string)$node->loc);
            if ($loc !== '') {
                $urls[] = $loc;
            }
        }

        return $urls;
    }

    /**
     * Get link options
     *
     * Looks for the first option whose `url` equals `$url` and returns its selector
     * when not empty; otherwise falls back to the selector of the first option whose
     * `url` is a prefix of `$url`.
     *
     * @param string $url URL
     * @param array $linkLoadOptions Link load options, each item expected to carry `url` and `selector` keys
     * @return array Array with a single `selector` key (`null` when nothing matches)
     */
    protected function linkOptions(string $url, array $linkLoadOptions): array
    {
        // array_filter() preserves keys: re-index so that the first match is at offset 0.
        $exact = array_values(array_filter($linkLoadOptions, function ($o) use ($url) {
            return ($o['url'] ?? null) === $url;
        }));
        $selector = Hash::get($exact, '0.selector');
        if (!empty($selector)) {
            return compact('selector');
        }
        $byPrefix = array_values(array_filter($linkLoadOptions, function ($o) use ($url) {
            $optionUrl = $o['url'] ?? null;

            // An empty prefix would match every URL (or raise a warning on PHP 7): skip it.
            return is_string($optionUrl) && $optionUrl !== '' && strpos($url, $optionUrl) === 0;
        }));

        return ['selector' => Hash::get($byPrefix, '0.selector')];
    }

    /**
     * Load black list from file and return as array
     *
     * The file is read line by line; every line is trimmed and blank lines are dropped.
     * On a missing or unreadable file the command is aborted through `$io` when given,
     * otherwise a `\RuntimeException` is thrown.
     *
     * @param string $blackListPath Path to black list file; empty string means no black list
     * @param \Cake\Console\ConsoleIo|null $io Console io used to abort with an error message
     * @return string[]
     * @throws \RuntimeException When the file cannot be loaded and no `$io` is available
     */
    protected function loadBlackList(string $blackListPath, ?ConsoleIo $io = null): array
    {
        if ($blackListPath === '') {
            return [];
        }

        $error = null;
        $lines = [];
        if (!is_file($blackListPath)) {
            $error = 'Blacklist file not found: %s';
        } else {
            $lines = file($blackListPath, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lines === false) {
                $error = 'Error reading blacklist file: %s';
            }
        }
        if ($error !== null) {
            $message = sprintf($error, $blackListPath);
            if ($io !== null) {
                // ConsoleIo::abort() prints the error and stops the command.
                $io->abort($message);
            }
            throw new \RuntimeException($message);
        }

        $lines = array_map('trim', (array)$lines);

        return array_values(array_filter($lines, function (string $line): bool {
            return $line !== '';
        }));
    }
}
203