FieldsInferrer::inferField()   B
last analyzed

Complexity

Conditions 5
Paths 8

Size

Total Lines 25
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 5
eloc 15
nc 8
nop 2
dl 0
loc 25
rs 8.439
c 1
b 0
f 1
1
<?php
2
3
namespace frictionlessdata\tableschema\Fields;
4
5
use frictionlessdata\tableschema\Exceptions\FieldValidationException;
6
7
/**
 * Infers the most suitable field object for every column of a set of rows.
 *
 * Each added row is inferred value by value through FieldsFactory::infer();
 * the number of times every inferred type was seen per column is tracked and
 * infer() then picks, per column, the most popular type that is able to cast
 * all of the rows received so far.
 */
class FieldsInferrer
{
    /**
     * @var array[] every row passed to addRows(), in insertion order (field name => raw value)
     */
    protected $inputRows = [];

    /**
     * @var array[] row number => (field name => cast value), filled by inferField()
     */
    protected $castRows = [];

    /**
     * @var array[] field name => (infer identifier => number of rows inferred as that type),
     *              re-sorted with arsort() after every increment
     */
    protected $fieldsPopularity = [];

    /**
     * @var array[] field name => (infer identifier => first field object inferred with that identifier)
     */
    protected $fieldsPopularityObjects = [];

    /**
     * @var bool flag forwarded to FieldsFactory::infer() and to getInferIdentifier()
     */
    protected $lenient;

    /**
     * @param null|array $rows    optional initial rows to infer by, each row is an array of field name => field value
     * @param bool       $lenient forwarded as-is to FieldsFactory::infer() and getInferIdentifier()
     *
     * @throws FieldValidationException
     */
    public function __construct($rows = null, $lenient = false)
    {
        $this->lenient = $lenient;
        if (!empty($rows)) {
            $this->addRows($rows);
        }
    }

    /**
     * add rows and updates the fieldsPopularity array - to make the inferred fields more accurate.
     *
     * The field names are taken from the first row ever added (see getFieldNames()).
     * A row that lacks one of those fields simply does not vote for that field's type.
     *
     * @param array $rows list of rows, each row is an array of field name => field value
     *
     * @throws FieldValidationException
     */
    public function addRows($rows)
    {
        foreach ($rows as $row) {
            // the row is stored before inference because getFieldNames() reads the first stored row
            $this->inputRows[] = $row;
            $inferredRow = $this->inferRow($row);
            foreach ($this->getFieldNames() as $fieldName) {
                if (!isset($inferredRow[$fieldName])) {
                    // ragged row: there is no inferred field to call getInferIdentifier() on
                    continue;
                }
                /** @var BaseField $inferredField */
                $inferredField = $inferredRow[$fieldName];
                $inferredFieldType = $inferredField->getInferIdentifier($this->lenient);
                if (!array_key_exists($fieldName, $this->fieldsPopularity)) {
                    $this->fieldsPopularity[$fieldName] = [];
                    $this->fieldsPopularityObjects[$fieldName] = [];
                }
                if (!array_key_exists($inferredFieldType, $this->fieldsPopularity[$fieldName])) {
                    $this->fieldsPopularity[$fieldName][$inferredFieldType] = 0;
                    // only the first field object seen for a given identifier is remembered
                    $this->fieldsPopularityObjects[$fieldName][$inferredFieldType] = $inferredField;
                }
                ++$this->fieldsPopularity[$fieldName][$inferredFieldType];
                // kept inside the loop on purpose: sorting after every increment decides
                // the relative order of types that end up with the same count
                arsort($this->fieldsPopularity[$fieldName]);
            }
        }
    }

    /**
     * return the best inferred fields along with the best value casting according to the rows received so far.
     *
     * As a side effect the cast values returned by castRows() are refreshed.
     *
     * @return array field name => inferred field object
     *
     * @throws FieldValidationException
     */
    public function infer()
    {
        $bestInferredFields = [];
        foreach ($this->fieldsPopularity as $fieldName => $fieldTypesPopularity) {
            $bestInferredFields[$fieldName] = $this->inferField($fieldName, $fieldTypesPopularity);
        }

        return $bestInferredFields;
    }

    /**
     * returns all the input rows got so far with the best cast value for each field.
     *
     * The values are only written by inferField(), so this is empty until infer() has run.
     *
     * @return array of arrays of field name => best cast value
     */
    public function castRows()
    {
        return $this->castRows;
    }

    /**
     * infer field objects for the given row
     * raises exception if fails to infer a field.
     *
     * @param $row array field name => value to infer by
     *
     * @return array field name => inferred field object
     *
     * @throws FieldValidationException
     */
    protected function inferRow($row)
    {
        $rowFields = [];
        foreach ($row as $k => $v) {
            $rowFields[$k] = FieldsFactory::infer($v, (object) ['name' => $k], $this->lenient);
        }

        return $rowFields;
    }

    /**
     * @return array the field names, empty when no row was added yet
     */
    protected function getFieldNames()
    {
        if (empty($this->inputRows)) {
            return [];
        }

        // we assume csv file where all rows have the same column positions
        // so we can use the first row to get the field names
        return array_keys(reset($this->inputRows));
    }

    /**
     * finds the best inferred fields for the given field name according to the popularity
     * also updates the castRows array with the latest cast values.
     *
     * Candidates are tried in the order of $fieldTypesPopularity. A candidate is accepted
     * only when it casts the value of every input row; its cast values are then written to
     * castRows in one go, so a candidate that fails part-way leaves castRows untouched.
     *
     * @param string $fieldName
     * @param array  $fieldTypesPopularity infer identifier => count, most popular first
     *
     * @return BaseField|null null when there are no candidates; when no candidate can cast
     *                        all rows the last candidate tried is returned and castRows is
     *                        not modified for this field
     */
    protected function inferField($fieldName, $fieldTypesPopularity)
    {
        // the $fieldTypesPopularity array is already sorted with most popular fields first
        $inferredField = null;
        foreach (array_keys($fieldTypesPopularity) as $inferredFieldType) {
            /** @var BaseField $inferredField */
            $inferredField = $this->fieldsPopularityObjects[$fieldName][$inferredFieldType];
            $castColumn = [];
            try {
                // array_values() gives the same 0-based row numbering regardless of the stored keys
                foreach (array_values($this->inputRows) as $rowNum => $inputRow) {
                    // a row without this field is cast from null instead of raising an undefined index notice
                    $rawValue = isset($inputRow[$fieldName]) ? $inputRow[$fieldName] : null;
                    $castColumn[$rowNum] = $inferredField->castValue($rawValue);
                }
            } catch (FieldValidationException $e) {
                // a row failed validation for this field type, will continue to the next one according to popularity
                continue;
            }

            // every row was cast successfully - only now publish the values
            foreach ($castColumn as $rowNum => $castValue) {
                if (!array_key_exists($rowNum, $this->castRows)) {
                    $this->castRows[$rowNum] = [];
                }
                $this->castRows[$rowNum][$fieldName] = $castValue;
            }
            break;
        }

        return $inferredField;
    }
}
148