Passed
Pull Request — master (#46)
by Teye
05:09
created

MetadataBuilder::getColumnsForInterval()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 4
c 0
b 0
f 0
dl 0
loc 7
ccs 5
cts 5
cp 1
rs 10
cc 1
nc 1
nop 2
crap 1
1
<?php
2
declare(strict_types=1);
3
4
namespace Level23\Druid\Metadata;
5
6
use DateTime;
7
use Exception;
8
use InvalidArgumentException;
9
use Level23\Druid\DruidClient;
10
use Level23\Druid\Types\TimeBound;
11
use Level23\Druid\Filters\FilterBuilder;
12
use Level23\Druid\Context\ContextInterface;
13
use Level23\Druid\DataSources\DataSourceInterface;
14
use Level23\Druid\Exceptions\QueryResponseException;
15
16
class MetadataBuilder
17
{
18
    /**
     * Create a metadata builder which performs its requests through the given druid client.
     *
     * The client is stored in the protected $client property, which is declared here by
     * means of constructor property promotion.
     *
     * @param \Level23\Druid\DruidClient $client
     */
    public function __construct(protected DruidClient $client)
    {
    }
24
25
    /**
26
     * Return all intervals for the given dataSource.
27
     * Return an array containing the interval.
28
     *
29
     * We will store the result in static cache to prevent multiple requests.
30
     *
31
     * Example response:
32
     * [
33
     *   "2019-08-19T14:00:00.000Z/2019-08-19T15:00:00.000Z" => [ "size" => 75208,  "count" => 4 ],
34
     *   "2019-08-19T13:00:00.000Z/2019-08-19T14:00:00.000Z" => [ "size" => 161870, "count" => 8 ],
35
     * ]
36
     *
37
     * @param string $dataSource
38
     *
39
     * @return array<string,array<string,string|int>>
40
     *
41
     * @throws \Level23\Druid\Exceptions\QueryResponseException
42
     * @throws \GuzzleHttp\Exception\GuzzleException
43
     */
44 1
    public function intervals(string $dataSource): array
    {
        // Static cache: it is shared by every MetadataBuilder instance for the lifetime of the process.
        static $intervals = [];

        $url = $this->client->config('coordinator_url')
            . '/druid/coordinator/v1/datasources/' . urlencode($dataSource) . '/intervals';

        // The cache is keyed on the complete request url and not only on the dataSource name. Because the
        // cache is shared between instances, a builder using a client with another coordinator url would
        // otherwise receive the intervals which were fetched for the first coordinator.
        if (!array_key_exists($url, $intervals)) {
            // "simple" is sent as a parameter with an empty value.
            $intervals[$url] = $this->client->executeRawRequest('get', $url, ['simple' => '']);
        }

        return $intervals[$url];
    }
56
57
    /**
58
     * Return the time boundary for the given dataSource.
59
     * This finds the first and/or last occurrence of a record in the given dataSource.
60
     * Optionally, you can also apply a filter. For example, to only see when the first and/or last occurrence
61
     * was for a record where a specific condition was met.
62
     *
63
     * The return type varies per given $bound. If TimeBound::BOTH was given (or null, which is the same),
64
     * we will return an array with the minTime and maxTime:
65
     * ```
66
     * array(
67
     *  'minTime' => \DateTime object,
68
     *  'maxTime' => \DateTime object
69
     * )
70
     * ```
71
     *
72
     * If only one time was requested with either TimeBound::MIN_TIME or TimeBound::MAX_TIME, we will return
73
     * a DateTime object.
74
     *
75
     * @param string|\Level23\Druid\DataSources\DataSourceInterface $dataSource
76
     * @param string|\Level23\Druid\Types\TimeBound|null            $bound
77
     * @param \Closure|null                                         $filterBuilder
78
     * @param \Level23\Druid\Context\ContextInterface|null          $context
79
     *
80
     * @return ($bound is null|"both"|\Level23\Druid\Types\TimeBound::BOTH ? array<string,\DateTime> : \DateTime)
81
     * @throws \GuzzleHttp\Exception\GuzzleException
82
     * @throws \Level23\Druid\Exceptions\QueryResponseException
83
     * @throws \Exception
84
     */
85 10
    public function timeBoundary(
        string|DataSourceInterface $dataSource,
        null|string|TimeBound $bound = TimeBound::BOTH,
        ?\Closure $filterBuilder = null,
        ?ContextInterface $context = null
    ): DateTime|array {
        $query = [
            'queryType'  => 'timeBoundary',
            'dataSource' => is_string($dataSource) ? $dataSource : $dataSource->toArray(),
        ];

        // Plain string values are accepted as well. TimeBound::from() throws for a value which is not a case.
        if (is_string($bound)) {
            $bound = TimeBound::from($bound);
        }

        // The "bound" key is only added when one specific side was requested. For null and TimeBound::BOTH
        // it is left out of the query.
        if ($bound !== null && $bound !== TimeBound::BOTH) {
            $query['bound'] = $bound->value;
        }

        if ($filterBuilder !== null) {
            // Let the callback describe the filter on a fresh builder.
            $builder = new FilterBuilder();
            $filterBuilder($builder);
            $filter = $builder->getFilter();

            // The callback may leave the builder without a filter; in that case no filter is sent.
            if ($filter) {
                $query['filter'] = $filter->toArray();
            }
        }

        if ($context !== null) {
            $query['context'] = $context->toArray();
        }

        $url = $this->client->config('broker_url') . '/druid/v2';

        /** @var array<int,null|array<string,string[]|string>> $response */
        $response = $this->client->executeRawRequest('post', $url, $query);

        // We need a non-empty "result" array in the first response row, anything else is unusable.
        $rawTimes = $response[0]['result'] ?? null;
        if (!is_array($rawTimes) || $rawTimes === []) {
            throw new Exception('Received incorrect response: ' . var_export($response, true));
        }

        $times = [];
        foreach ($rawTimes as $key => $dateString) {
            // "v" accepts any millisecond value. A literal ".000" made every timestamp with non-zero
            // milliseconds fail to parse.
            // NOTE: the trailing "Z" is still matched as a literal character, so no timezone is parsed and
            // the DateTime is created in the default timezone (this is unchanged behaviour).
            $date = DateTime::createFromFormat('Y-m-d\TH:i:s.v\Z', $dateString);

            if ($date === false) {
                throw new Exception('Failed to parse time: ' . $dateString);
            }

            $times[$key] = $date;
        }

        // A single entry is returned as a DateTime object, multiple entries as an array which keeps the
        // keys of the response.
        return count($times) === 1 ? reset($times) : $times;
    }
155
156
    /**
157
     * Returns a map of segment intervals contained within the specified interval to a map of segment metadata to a set
158
     * of server names that contain the segment for an interval.
159
     * The latest intervals will come as first, the oldest as last.
160
     *
161
     * Example response:
162
     *
163
     * Array
164
     * (
165
     *     [2017-01-01T00:00:00.000Z/2017-01-02T00:00:00.000Z] => Array
166
     *         (
167
     *             [traffic-conversions_2017-01-01T00:00:00.000Z_2017-01-02T00:00:00.000Z_2019-05-15T11:29:56.874Z] =>
168
     *             Array
169
     *                 (
170
     *                     [metadata] => Array
171
     *                         (
172
     *                             [dataSource] => traffic-conversions
173
     *                             [interval] => 2017-01-01T00:00:00.000Z/2017-01-02T00:00:00.000Z
174
     *                             [version] => 2019-05-15T11:29:56.874Z
175
     *                             [loadSpec] => Array
176
     *                                 (
177
     *                                     [type] => s3_zip
178
     *                                     [bucket] => level23-druid-data
179
     *                                     [key] =>
180
     *                                     druid/segments/traffic-conversions/2017-01-01T00:00:00.000Z_2017-01-02T00:00:00.000Z/2019-05-15T11:29:56.874Z/0/index.zip
181
     *                                     [S3Schema] => s3n
182
     *                                 )
183
     *
184
     *                             [dimensions] =>
185
     *                             country_iso,flags,mccmnc,offer_id,product_type_id,promo_id,test_data_id,test_data_reason,third_party_id
186
     *                             [metrics] => conversion_time,conversions,revenue_external,revenue_internal
187
     *                             [shardSpec] => Array
188
     *                                 (
189
     *                                     [type] => numbered
190
     *                                     [partitionNum] => 0
191
     *                                     [partitions] => 0
192
     *                                 )
193
     *
194
     *                             [binaryVersion] => 9
195
     *                             [size] => 272709
196
     *                             [identifier] =>
197
     *                             traffic-conversions_2017-01-01T00:00:00.000Z_2017-01-02T00:00:00.000Z_2019-05-15T11:29:56.874Z
198
     *                         )
199
     *
200
     *                     [servers] => Array
201
     *                         (
202
     *                             [0] => 172.31.23.160:8083
203
     *                             [1] => 172.31.3.204:8083
204
     *                         )
205
     *
206
     *                 )
207
     *
208
     *         )
209
     *
210
     * )
211
     *
212
     * @param string $dataSource
213
     * @param string $interval
214
     *
215
     * @return array<string,array<mixed>|string|int>
216
     * @throws \Level23\Druid\Exceptions\QueryResponseException
217
     * @throws \GuzzleHttp\Exception\GuzzleException
218
     */
219 1
    public function interval(string $dataSource, string $interval): array
    {
        // Build the coordinator endpoint. The dataSource and the interval are both url encoded
        // before they are placed in the path.
        $endpoint = sprintf(
            '%s/druid/coordinator/v1/datasources/%s/intervals/%s',
            $this->client->config('coordinator_url'),
            urlencode($dataSource),
            urlencode($interval)
        );

        // "full" is sent as a parameter with an empty value.
        return $this->client->executeRawRequest('get', $endpoint, ['full' => '']);
    }
227
228
    /**
229
     * Get the columns for the given interval. This will return something like this:
230
     *
231
     *   Array
232
     *  (
233
     *      0 => Array
234
     *          (
235
     *              [field] => __time
236
     *              [type] => LONG
237
     *              [hasMultipleValues] =>
238
     *              [size] => 0
239
     *              [cardinality] =>
240
     *              [minValue] =>
241
     *              [maxValue] =>
242
     *              [errorMessage] =>
243
     *          )
244
     *      1 => Array
245
     *          (
246
     *              [field] => delta
247
     *              [type] => LONG
248
     *              [hasMultipleValues] =>
249
     *              [size] => 0
250
     *              [cardinality] =>
251
     *              [minValue] =>
252
     *              [maxValue] =>
253
     *              [errorMessage] =>
254
     *          )
255
     *      2 => Array
256
     *          (
257
     *              [field] => cityName
258
     *              [type] => STRING
259
     *              [hasMultipleValues] =>
260
     *              [size] => 0
261
     *              [cardinality] => 59
262
     *              [minValue] => af
263
     *              [maxValue] => zm
264
     *              [errorMessage] =>
265
     *          )
266
     *      3 => Array
267
     *          (
268
     *              [field] => comment
269
     *              [type] => STRING
270
     *              [hasMultipleValues] =>
271
     *              [size] => 0
272
     *              [cardinality] => 84
273
     *              [minValue] =>
274
     *              [maxValue] => 74807
275
     *              [errorMessage] =>
276
     *          )
277
     *      4 => Array
278
     *          (
279
     *              [field] => added
280
     *              [type] => LONG
281
     *              [hasMultipleValues] =>
282
     *              [size] => 0
283
     *              [cardinality] =>
284
     *              [minValue] =>
285
     *              [maxValue] =>
286
     *              [errorMessage] =>
287
     *          )
288
     *  )
289
     *
290
     * @param string $dataSource
291
     * @param string $interval
292
     *
293
     * @return array<int,array<string,string>>
294
     * @throws \Level23\Druid\Exceptions\QueryResponseException
295
     * @throws \Exception
296
     * @throws \GuzzleHttp\Exception\GuzzleException
297
     */
298 1
    protected function getColumnsForInterval(string $dataSource, string $interval): array
    {
        // Start a query for the dataSource and limit it to the requested interval.
        $queryBuilder = $this->client->query($dataSource);

        // Execute it as a segmentMetadata query.
        $segmentMetadata = $queryBuilder
            ->interval($interval)
            ->segmentMetadata();

        // Only the data of the response is handed back to the caller.
        return $segmentMetadata->data();
    }
306
307
    /**
308
     * Return the druid interval by the shorthand "first" or "last".
309
     *
310
     * We will return something like "2017-01-01T00:00:00.000Z/2017-01-02T00:00:00.000Z"
311
     *
312
     * We will return an empty string when no interval data is found.
313
     *
314
     * @param string $dataSource
315
     * @param string $shortHand
316
     *
317
     * @return string
318
     * @throws \Level23\Druid\Exceptions\QueryResponseException
319
     * @throws \GuzzleHttp\Exception\GuzzleException
320
     */
321 3
    protected function getIntervalByShorthand(string $dataSource, string $shortHand): string
    {
        // The shorthand is matched case-insensitive.
        $position = strtolower($shortHand);

        // Reject unknown shorthands before we request anything.
        if (!in_array($position, ['first', 'last'], true)) {
            throw new InvalidArgumentException('Only shorthand "first" and "last" are supported!');
        }

        // The intervals are the keys of this list.
        $knownIntervals = $this->intervals($dataSource);

        // "last" is the first key of the list, "first" is the final key of the list.
        $intervalKey = $position === 'last'
            ? array_key_first($knownIntervals)
            : array_key_last($knownIntervals);

        // No intervals at all? Then we answer with an empty string.
        return $intervalKey ?? '';
    }
337
338
    /**
339
     * Generate a structure class for the given dataSource.
340
     *
341
     * @param string $dataSource
342
     * @param string $interval "last", "first" or a raw interval string as returned by druid.
343
     *
344
     * @return \Level23\Druid\Metadata\Structure
345
     * @throws \GuzzleHttp\Exception\GuzzleException
346
     * @throws \Level23\Druid\Exceptions\QueryResponseException
347
     */
348 6
    public function structure(string $dataSource, string $interval = 'last'): Structure
    {
        // shorthand given? Then retrieve the real interval for them.
        if (in_array(strtolower($interval), ['first', 'last'], true)) {
            $interval = $this->getIntervalByShorthand($dataSource, $interval);
        }

        // The shorthand lookup answers with an empty string when the dataSource has no intervals.
        if (empty($interval)) {
            throw new InvalidArgumentException(
                'Error, interval "' . $interval . '" is invalid. Maybe there are no intervals for this dataSource?'
            );
        }

        $rawStructure = $this->interval($dataSource, $interval);

        // The first entry of the response holds the segments of the interval.
        $structureData = reset($rawStructure);
        if (!$structureData || !is_array($structureData)) {
            throw new QueryResponseException([],
                'We failed to retrieve a correct structure for dataSource: ' . $dataSource . '.' . PHP_EOL .
                'Failed to parse raw interval structure data: ' . var_export($rawStructure, true)

            );
        }

        // We use the first segment to find out which fields are dimensions and which are metrics.
        /** @var array<string|string[]> $data */
        $data = reset($structureData);

        // Both lists are read as comma separated strings.
        $dimensionFields = explode(',', $data['metadata']['dimensions'] ?? '');
        $metricFields    = explode(',', $data['metadata']['metrics'] ?? '');

        $dimensions = [];
        $metrics    = [];

        $columns = $this->getColumnsForInterval($dataSource, $interval);

        foreach ($columns as $info) {
            // Compare the names as strings and strict. A loose comparison treats numeric looking names
            // (for example "1e1" and "10") as equal. The cast makes sure that a name which is delivered
            // as an integer still matches the exploded string lists.
            $column = (string)$info['field'];

            if (in_array($column, $dimensionFields, true)) {
                $dimensions[$column] = $info['type'];
            }
            if (in_array($column, $metricFields, true)) {
                $metrics[$column] = $info['type'];
            }
        }

        return new Structure($dataSource, $dimensions, $metrics);
    }
396
}