Passed
Pull Request — master (#49)
by Teye
04:15
created

MetadataBuilder::dataSources()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 8
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 8
ccs 4
cts 4
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
crap 1
1
<?php
2
declare(strict_types=1);
3
4
namespace Level23\Druid\Metadata;
5
6
use DateTime;
7
use Exception;
8
use InvalidArgumentException;
9
use Level23\Druid\DruidClient;
10
use Level23\Druid\Types\TimeBound;
11
use Level23\Druid\Filters\FilterBuilder;
12
use Level23\Druid\Context\ContextInterface;
13
use Level23\Druid\DataSources\DataSourceInterface;
14
use Level23\Druid\Exceptions\QueryResponseException;
15
16
class MetadataBuilder
17
{
18
    /**
     * Create a metadata builder which is bound to the given druid client.
     *
     * The client is kept on the instance (as the protected property $client) and is used
     * for every request which is executed by this builder.
     *
     * @param \Level23\Druid\DruidClient $client
     */
    public function __construct(protected DruidClient $client)
    {
    }
24
25
    /**
     * Return all intervals for the given dataSource, as reported by the coordinator.
     *
     * The result is an array where the key is the interval and the value describes that interval.
     *
     * Responses are kept in a static cache to prevent multiple requests. The cache is keyed by the complete
     * request url (and not only by the dataSource name), so builders which talk to different coordinators
     * never receive each other's cached data.
     *
     * Example response:
     * [
     *   "2019-08-19T14:00:00.000Z/2019-08-19T15:00:00.000Z" => [ "size" => 75208,  "count" => 4 ],
     *   "2019-08-19T13:00:00.000Z/2019-08-19T14:00:00.000Z" => [ "size" => 161870, "count" => 8 ],
     * ]
     *
     * @param string $dataSource
     *
     * @return array<string,array<string,string|int>>
     *
     * @throws \Level23\Druid\Exceptions\QueryResponseException
     * @throws \GuzzleHttp\Exception\GuzzleException
     */
    public function intervals(string $dataSource): array
    {
        /** @var array<string,array<string,array<string,string|int>>> $cache */
        static $cache = [];

        $url = $this->client->config('coordinator_url')
            . '/druid/coordinator/v1/datasources/' . urlencode($dataSource) . '/intervals';

        // array_key_exists() is used so that every stored value, whatever it is, counts as a cache hit.
        if (!array_key_exists($url, $cache)) {
            $cache[$url] = $this->client->executeRawRequest('get', $url, ['simple' => '']);
        }

        return $cache[$url];
    }
56
57
    /**
     * Return the time boundary for the given dataSource.
     * This finds the first and/or last occurrence of a record in the given dataSource.
     * Optionally, you can also apply a filter. For example, to only see when the first and/or last occurrence
     * was for a record where a specific condition was met.
     *
     * The return type varies per given $bound. If TimeBound::BOTH was given (or null, which is the same),
     * we will return an array with the minTime and maxTime:
     * ```
     * array(
     *  'minTime' => \DateTime object,
     *  'maxTime' => \DateTime object
     * )
     * ```
     *
     * If only one time was requested with either TimeBound::MIN_TIME or TimeBound::MAX_TIME, we will return
     * a DateTime object.
     *
     * @param string|\Level23\Druid\DataSources\DataSourceInterface $dataSource
     * @param string|\Level23\Druid\Types\TimeBound|null            $bound
     * @param \Closure|null                                         $filterBuilder Receives a FilterBuilder instance
     *                                                                             as its only argument.
     * @param \Level23\Druid\Context\ContextInterface|null          $context
     *
     * @return \DateTime|array<string,\DateTime>
     *
     * @phpstan-return ($bound is null|"both"|\Level23\Druid\Types\TimeBound::BOTH ? array<string,\DateTime> : \DateTime)
     *
     * @throws \GuzzleHttp\Exception\GuzzleException
     * @throws \Level23\Druid\Exceptions\QueryResponseException
     * @throws \Exception When the response is malformed or a returned time cannot be parsed.
     */
    public function timeBoundary(
        string|DataSourceInterface $dataSource,
        null|string|TimeBound $bound = TimeBound::BOTH,
        ?\Closure $filterBuilder = null,
        ?ContextInterface $context = null
    ): DateTime|array {

        $query = [
            'queryType'  => 'timeBoundary',
            'dataSource' => is_string($dataSource) ? $dataSource : $dataSource->toArray(),
        ];

        if (is_string($bound)) {
            $bound = TimeBound::from($bound);
        }

        // "both" (or null) is expressed by not sending a bound at all.
        if ($bound !== null && $bound !== TimeBound::BOTH) {
            $query['bound'] = $bound->value;
        }

        if ($filterBuilder !== null) {
            $builder = new FilterBuilder();
            $filterBuilder($builder);
            $filter = $builder->getFilter();

            if ($filter) {
                $query['filter'] = $filter->toArray();
            }
        }

        if ($context !== null) {
            $query['context'] = $context->toArray();
        }

        $url = $this->client->config('broker_url') . '/druid/v2';

        /** @var array<int,null|array<string,string[]|string>> $response */
        $response = $this->client->executeRawRequest('post', $url, $query);

        $result = $response[0]['result'] ?? null;
        if (empty($result) || !is_array($result)) {
            throw new Exception('Received incorrect response: ' . var_export($response, true));
        }

        // A single entry means that only one boundary was requested: return it as a plain DateTime.
        if (count($result) === 1) {
            return $this->parseBoundaryTime(reset($result));
        }

        $dates = [];
        foreach ($result as $key => $dateString) {
            /** @var string $key */
            $dates[$key] = $this->parseBoundaryTime($dateString);
        }

        return $dates;
    }

    /**
     * Convert a time as returned by a timeBoundary query (like "2019-08-19T14:00:00.000Z") to a DateTime object.
     *
     * @param mixed $dateString
     *
     * @return \DateTime
     * @throws \Exception When the given value is not a string or does not match the expected format.
     */
    private function parseBoundaryTime(mixed $dateString): DateTime
    {
        // "v" accepts any millisecond value; the previous literal ".000" rejected times with non-zero milliseconds.
        // NOTE(review): the trailing "Z" is matched as a literal character, so the value is interpreted in PHP's
        // default timezone - confirm that this is intended.
        $date = is_string($dateString)
            ? DateTime::createFromFormat('Y-m-d\TH:i:s.v\Z', $dateString)
            : false;

        if ($date === false) {
            throw new Exception(
                'Failed to parse time: ' . (is_string($dateString) ? $dateString : var_export($dateString, true))
            );
        }

        return $date;
    }
156
157
    /**
     * Request the full details of one interval of the given dataSource from the coordinator.
     *
     * The response is a map of segment intervals contained within the specified interval to a map of segment
     * metadata to a set of server names that contain the segment for an interval.
     * The latest intervals will come as first, the oldest as last.
     *
     * Example response:
     *
     * [
     *     '2017-01-01T00:00:00.000Z/2017-01-02T00:00:00.000Z' => [
     *         'traffic-conversions_2017-01-01T00:00:00.000Z_2017-01-02T00:00:00.000Z_2019-05-15T11:29:56.874Z' => [
     *             'metadata' => [
     *                 'dataSource'    => 'traffic-conversions',
     *                 'interval'      => '2017-01-01T00:00:00.000Z/2017-01-02T00:00:00.000Z',
     *                 'version'       => '2019-05-15T11:29:56.874Z',
     *                 'loadSpec'      => [
     *                     'type'     => 's3_zip',
     *                     'bucket'   => 'level23-druid-data',
     *                     'key'      => 'druid/segments/traffic-conversions/2017-01-01T00:00:00.000Z_2017-01-02T00:00:00.000Z/2019-05-15T11:29:56.874Z/0/index.zip',
     *                     'S3Schema' => 's3n',
     *                 ],
     *                 'dimensions'    => 'country_iso,flags,mccmnc,offer_id,product_type_id,promo_id,test_data_id,test_data_reason,third_party_id',
     *                 'metrics'       => 'conversion_time,conversions,revenue_external,revenue_internal',
     *                 'shardSpec'     => [
     *                     'type'         => 'numbered',
     *                     'partitionNum' => 0,
     *                     'partitions'   => 0,
     *                 ],
     *                 'binaryVersion' => 9,
     *                 'size'          => 272709,
     *                 'identifier'    => 'traffic-conversions_2017-01-01T00:00:00.000Z_2017-01-02T00:00:00.000Z_2019-05-15T11:29:56.874Z',
     *             ],
     *             'servers'  => [
     *                 0 => '172.31.23.160:8083',
     *                 1 => '172.31.3.204:8083',
     *             ],
     *         ],
     *     ],
     * ]
     *
     * @param string $dataSource
     * @param string $interval
     *
     * @return array<string,array<mixed>|string|int>
     * @throws \Level23\Druid\Exceptions\QueryResponseException
     * @throws \GuzzleHttp\Exception\GuzzleException
     */
    public function interval(string $dataSource, string $interval): array
    {
        $coordinatorUrl = $this->client->config('coordinator_url');

        // Both the dataSource and the interval are url-encoded before they become part of the path.
        $endpoint = '/druid/coordinator/v1/datasources/' . urlencode($dataSource)
            . '/intervals/' . urlencode($interval);

        return $this->client->executeRawRequest('get', $coordinatorUrl . $endpoint, ['full' => '']);
    }
228
229
    /**
     * Get the columns for the given interval by executing a segmentMetadata query on the dataSource.
     *
     * The result is a list with one entry per column. Each entry contains the keys "field", "type",
     * "hasMultipleValues", "size", "cardinality", "minValue", "maxValue" and "errorMessage".
     *
     * Example result (one row per list entry, an empty cell means the value was empty):
     *
     *  | # | field    | type   | hasMultipleValues | size | cardinality | minValue | maxValue | errorMessage |
     *  |---|----------|--------|-------------------|------|-------------|----------|----------|--------------|
     *  | 0 | __time   | LONG   |                   | 0    |             |          |          |              |
     *  | 1 | delta    | LONG   |                   | 0    |             |          |          |              |
     *  | 2 | cityName | STRING |                   | 0    | 59          | af       | zm       |              |
     *  | 3 | comment  | STRING |                   | 0    | 84          |          | 74807    |              |
     *  | 4 | added    | LONG   |                   | 0    |             |          |          |              |
     *
     * @param string $dataSource
     * @param string $interval
     *
     * @return array<int,array<string,string>>
     * @throws \Level23\Druid\Exceptions\QueryResponseException
     * @throws \Exception
     * @throws \GuzzleHttp\Exception\GuzzleException
     */
    protected function getColumnsForInterval(string $dataSource, string $interval): array
    {
        $queryBuilder = $this->client->query($dataSource);

        // Continue with the value returned by each step, exactly like the fluent chain would do.
        $scopedBuilder = $queryBuilder->interval($interval);
        $metadata      = $scopedBuilder->segmentMetadata();

        return $metadata->data();
    }
307
308
    /**
     * Return the druid interval by the shorthand "first" or "last".
     *
     * We will return something like "2017-01-01T00:00:00.000Z/2017-01-02T00:00:00.000Z"
     *
     * We will return an empty string when no interval data is found.
     *
     * The shorthand is matched case-insensitive. "last" resolves to the first entry of the list returned by
     * intervals(), "first" resolves to the final entry of that list.
     *
     * @param string $dataSource
     * @param string $shortHand
     *
     * @return string
     * @throws \InvalidArgumentException When the shorthand is neither "first" nor "last".
     * @throws \Level23\Druid\Exceptions\QueryResponseException
     * @throws \GuzzleHttp\Exception\GuzzleException
     */
    protected function getIntervalByShorthand(string $dataSource, string $shortHand): string
    {
        // Get the interval which we will use to do a "structure" scan.
        $shortHand = strtolower($shortHand);
        if ($shortHand !== 'last' && $shortHand !== 'first') {
            throw new InvalidArgumentException('Only shorthand "first" and "last" are supported!');
        }

        $intervals = $this->intervals($dataSource);

        // Both functions return null for an empty list, which we translate to an empty string below.
        $key = $shortHand === 'last' ? array_key_first($intervals) : array_key_last($intervals);

        // The cast protects the string return type in case php converted a numeric key to an integer.
        return $key === null ? '' : (string)$key;
    }
338
339
    /**
     * Generate a structure class for the given dataSource.
     *
     * The dimension and metric names are read from the segment metadata of the given interval. The type of each
     * of these columns is taken from the columns as returned by getColumnsForInterval().
     *
     * @param string $dataSource
     * @param string $interval "last", "first" or a raw interval string as returned by druid.
     *
     * @return \Level23\Druid\Metadata\Structure
     * @throws \InvalidArgumentException When no interval could be determined for the dataSource.
     * @throws \GuzzleHttp\Exception\GuzzleException
     * @throws \Level23\Druid\Exceptions\QueryResponseException
     */
    public function structure(string $dataSource, string $interval = 'last'): Structure
    {
        // shorthand given? Then retrieve the real interval for them.
        if (in_array(strtolower($interval), ['first', 'last'], true)) {
            $interval = $this->getIntervalByShorthand($dataSource, $interval);
        }

        if (empty($interval)) {
            throw new InvalidArgumentException(
                'Error, interval "' . $interval . '" is invalid. Maybe there are no intervals for this dataSource?'
            );
        }

        $rawStructure = $this->interval($dataSource, $interval);

        $structureData = reset($rawStructure);
        if (!$structureData || !is_array($structureData)) {
            throw new QueryResponseException([],
                'We failed to retrieve a correct structure for dataSource: ' . $dataSource . '.' . PHP_EOL .
                'Failed to parse raw interval structure data: ' . var_export($rawStructure, true)

            );
        }

        /** @var array<string|string[]> $data */
        $data = reset($structureData);

        $dimensionFields = explode(',', $data['metadata']['dimensions'] ?? '');
        $metricFields    = explode(',', $data['metadata']['metrics'] ?? '');

        $dimensions = [];
        $metrics    = [];

        $columns = $this->getColumnsForInterval($dataSource, $interval);

        foreach ($columns as $info) {
            // Skip incomplete entries: a missing (null) field would otherwise be compared against the names.
            if (!isset($info['field'], $info['type'])) {
                continue;
            }

            $column = $info['field'];

            // Strict comparison, so numeric-looking names like "1" and "1.0" are not treated as the same column.
            if (in_array($column, $dimensionFields, true)) {
                $dimensions[$column] = $info['type'];
            }
            if (in_array($column, $metricFields, true)) {
                $metrics[$column] = $info['type'];
            }
        }

        return new Structure($dataSource, $dimensions, $metrics);
    }
397
398
    /**
     * Request the list of all known dataSources from the coordinator.
     *
     * @return array<string>
     * @throws \GuzzleHttp\Exception\GuzzleException
     * @throws \Level23\Druid\Exceptions\QueryResponseException
     */
    public function dataSources(): array
    {
        $endpoint = '/druid/coordinator/v1/datasources';

        /** @var array<int,string> $names */
        $names = $this->client->executeRawRequest('get', $this->client->config('coordinator_url') . $endpoint);

        return $names;
    }
414
}