|
1
|
|
|
<?php |
|
2
|
|
|
declare(strict_types=1); |
|
3
|
|
|
|
|
4
|
|
|
namespace Level23\Druid\Aggregations; |
|
5
|
|
|
|
|
6
|
|
|
use Level23\Druid\Collections\DimensionCollection; |
|
7
|
|
|
|
|
8
|
|
|
class CardinalityAggregator implements AggregatorInterface
{
    /**
     * Name under which the aggregation result is reported.
     */
    protected string $outputName;

    /**
     * Whether the cardinality is computed per row (distinct dimension combinations)
     * instead of over the union of all dimension values.
     */
    protected bool $byRow;

    /**
     * Whether the estimated value should be rounded off to a whole number.
     */
    protected bool $round;

    /**
     * The dimensions over which the cardinality is computed.
     */
    protected DimensionCollection $dimensions;

    /**
     * Build a "cardinality" aggregator.
     *
     * This aggregator estimates the cardinality of a set of Apache Druid dimensions using HyperLogLog.
     * It is considerably slower than indexing a column with the hyperUnique aggregator, and because it
     * operates on a dimension column, that string dimension cannot be dropped from the dataset to
     * improve rollup. Unless the individual values of the dimension matter to you, the hyperUnique
     * aggregator is strongly recommended over this one.
     *
     * HyperLogLog yields decimal estimates with some error. Pass true for "round" to have the estimate
     * rounded off to a whole number; the result remains an estimate either way. The "round" setting
     * only influences query-time behavior and is ignored at ingestion-time.
     *
     * With byRow set to false (the default), the result is the cardinality of the union of all values
     * of all given dimensions. For one dimension this corresponds to:
     * ```
     * SELECT COUNT(DISTINCT(dimension)) FROM <datasource>
     * ```
     *
     * For several dimensions it corresponds to something like:
     * ```
     * SELECT COUNT(DISTINCT(value)) FROM (
     *   SELECT dim_1 as value FROM <datasource>
     *   UNION
     *   SELECT dim_2 as value FROM <datasource>
     *   UNION
     *   SELECT dim_3 as value FROM <datasource>
     * )
     * ```
     *
     * With byRow set to true, the cardinality is computed by row, i.e. the number of distinct
     * dimension combinations. This corresponds to something like:
     *
     * ```
     * SELECT COUNT(*) FROM ( SELECT DIM1, DIM2, DIM3 FROM <datasource> GROUP BY DIM1, DIM2, DIM3 )
     * ```
     *
     * @see https://druid.apache.org/docs/latest/querying/hll-old.html
     *
     * @param string                                         $outputName Name of the output field.
     * @param \Level23\Druid\Collections\DimensionCollection $dimensions Dimensions to compute the cardinality over.
     * @param bool                                           $byRow      Compute by row instead of over the value union.
     * @param bool                                           $round      Round the estimate off to a whole number.
     */
    public function __construct(
        string $outputName,
        DimensionCollection $dimensions,
        bool $byRow = false,
        bool $round = false
    ) {
        $this->outputName = $outputName;
        $this->dimensions = $dimensions;
        $this->byRow      = $byRow;
        $this->round      = $round;
    }

    /**
     * Convert this aggregator into the array structure used in a druid query.
     *
     * The result holds the keys "type", "name", "fields", "byRow" and "round", in that order.
     *
     * @return array<string|bool|array<int,array<mixed>>>
     */
    public function toArray(): array
    {
        $definition = ['type' => 'cardinality'];

        $definition['name']   = $this->outputName;
        $definition['fields'] = $this->dimensions->toArray();
        $definition['byRow']  = $this->byRow;
        $definition['round']  = $this->round;

        return $definition;
    }
}