Passed
Push — master ( 451fb6...e51e88 )
by Andreas
28:25
created

org_openpsa_contacts_duplicates_check::mark_all()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 18
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 0
Metric Value
cc 3
eloc 12
c 0
b 0
f 0
nc 3
nop 1
dl 0
loc 18
ccs 0
cts 13
cp 0
crap 12
rs 9.8666
1
<?php
2
/**
3
 * @package org.openpsa.contacts
4
 * @author Nemein Oy http://www.nemein.com/
5
 * @copyright Nemein Oy http://www.nemein.com/
6
 * @license http://www.gnu.org/licenses/gpl.html GNU General Public License
7
 */
8
9
/**
10
 * Search for duplicate persons and groups in database
11
 *
12
 * @package org.openpsa.contacts
13
 */
14
abstract class org_openpsa_contacts_duplicates_check
{
    /**
     * Map of probabilities collected by check_all(), stored as
     * [guid1][guid2] => result array of p_duplicate()
     *
     * @var array
     */
    private $p_map = [];

    /**
     * Minimum score to count as duplicate
     *
     * @var int
     */
    private $threshold = 1;

    /**
     * Calculates P for the given two candidates being duplicates
     *
     * Both candidates are field arrays as produced by normalize_fields(),
     * i.e. trimmed, lowercased values plus a 'guid' key.
     *
     * @return array with overall P (key 'p') and matched checks
     */
    abstract protected function p_duplicate(array $candidate1, array $candidate2) : array;

    /**
     * Name of the DBA class to work on. It is used for the static
     * new_collector() and get_cached() calls.
     */
    abstract protected function get_class() : string;

    /**
     * Names of the object properties that are loaded and compared
     *
     * @return string[]
     */
    abstract protected function get_fields() : array;

    /**
     * Find all duplicates and mark them
     *
     * Every pair returned by check_all() gets the parameter
     * org.openpsa.contacts.duplicates:possible_duplicate set on both objects,
     * with the other object's GUID as name and the probability as value.
     *
     * @param bool $output Whether progress messages should be echoed as well
     */
    public function mark_all(bool $output)
    {
        $time_start = time();
        $this->output($output, 'Starting');

        foreach ($this->check_all() as $guid1 => $duplicates) {
            $duplicate1 = $this->load($guid1);
            foreach ($duplicates as $guid2 => $details) {
                $duplicate2 = $this->load($guid2);
                $msg = "Marking {$guid1} (#{$duplicate1->id}) and {$guid2} (#{$duplicate2->id}) as duplicates with P {$details['p']}";
                $duplicate1->set_parameter('org.openpsa.contacts.duplicates:possible_duplicate', $guid2, $details['p']);
                $duplicate2->set_parameter('org.openpsa.contacts.duplicates:possible_duplicate', $guid1, $details['p']);
                $this->output($output, $msg, '&nbsp;&nbsp;&nbsp;');
            }
        }

        $this->output($output, "DONE. Elapsed time " . (time() - $time_start) . " seconds");
    }

    /**
     * Find duplicates for given object
     *
     * @param midcom_core_dbaobject $object The object to find duplicates for
     * @param int $threshold Minimum P a candidate needs to be returned
     * @return midcom_core_dbaobject[] List of possible duplicates
     */
    public function find_duplicates(midcom_core_dbaobject $object, int $threshold = 1) : array
    {
        // Collect the object's values without a by-reference loop, so that no
        // dangling reference into the array survives the iteration
        $values = [];
        foreach ($this->get_fields() as $name) {
            $values[$name] = $object->$name;
        }
        $normalized = $this->normalize_fields($values, $object->guid);

        $ret = [];
        foreach ($this->get_candidates($object) as $candidate) {
            $p_array = $this->p_duplicate($normalized, $candidate);
            if ($p_array['p'] >= $threshold) {
                $ret[] = $this->load($candidate['guid']);
            }
        }

        return $ret;
    }

    /**
     * Load the normalized field values of all candidates
     *
     * Without an object, every row of the collector is returned. With an
     * object, the rows are limited to those where at least one non-empty field
     * of the object matches via LIKE, and the object itself is excluded if it
     * has an ID.
     *
     * @return array[] List of field arrays as produced by normalize_fields()
     */
    private function get_candidates(?midcom_core_dbaobject $object = null) : array
    {
        $classname = $this->get_class();
        $fields = $this->get_fields();
        $mc = $classname::new_collector();

        if ($object !== null) {
            if ($object->id) {
                $mc->add_constraint('id', '<>', $object->id);
            }
            // TODO: Avoid objects marked as not_duplicate already in this phase.
            $mc->begin_group('OR');
            foreach ($fields as $field) {
                if ($field !== 'id' && $object->$field) {
                    $mc->add_constraint($field, 'LIKE', $object->$field);
                }
            }
            $mc->end_group();
        }

        $results = [];
        foreach ($mc->get_rows($fields) as $guid => $result) {
            $results[] = $this->normalize_fields($result, $guid);
        }
        return $results;
    }

    /**
     * Check if a property is filled in the first data set and has the same
     * value in the second one
     *
     * Scalar values are compared as strings, so that numeric-looking values
     * like '1e3' and '1000' or '00123' and '123' do not count as equal.
     */
    protected function match(string $property, array $data1, array $data2) : bool
    {
        if (empty($data1[$property]) || !isset($data2[$property])) {
            return false;
        }

        $value1 = $data1[$property];
        $value2 = $data2[$property];
        if (is_scalar($value1) && is_scalar($value2)) {
            return (string) $value1 === (string) $value2;
        }
        // Non-scalar values keep the loose comparison
        return $value1 == $value2;
    }

    /**
     * Load an object of the class returned by get_class() by its GUID
     */
    private function load(string $guid) : midcom_core_dbaobject
    {
        $classname = $this->get_class();
        return $classname::get_cached($guid);
    }

    /**
     * Prepare fields for easier comparison
     *
     * All values are cast to string, trimmed and lowercased. The GUID is added
     * unchanged under the key 'guid'.
     */
    private function normalize_fields(array $fields, string $guid) : array
    {
        $normalized = array_map(static function ($value) : string {
            // The cast keeps NULL values from reaching trim() directly
            return self::lowercase(trim((string) $value));
        }, $fields);
        $normalized['guid'] = $guid;

        return $normalized;
    }

    /**
     * Lowercase a string, including non-ASCII letters if mbstring is available
     *
     * NOTE(review): assumes the field values are UTF-8 encoded - confirm
     */
    private static function lowercase(string $value) : string
    {
        if (function_exists('mb_strtolower')) {
            return mb_strtolower($value, 'UTF-8');
        }
        return strtolower($value);
    }

    /**
     * Find duplicates in database
     *
     * @param int $threshold Minimum P a pair needs to be included
     * @return array array of persons with their possible duplicates
     */
    protected function check_all(int $threshold = 1) : array
    {
        $this->p_map = []; //Make sure this is clean before starting
        $this->threshold = $threshold;
        midcom::get()->disable_limits();

        // PONDER: Can we do this in smaller batches using find_duplicated_person
        /*
         IDEA: Make an AT method for checking single persons duplicates, then another to batch
         register a check for every person in batches of say 500.
         */
        $candidates = $this->get_candidates();

        foreach ($candidates as $key => $candidate) {
            $this->check_all_arraywalk($candidate, $key, $candidates);
        }

        return $this->p_map;
    }

    /**
     * Used by check_all() to check one candidate against all the others
     *
     * Pairs reaching the threshold are stored in the probability map, unless
     * one of the objects cannot be loaded or either one carries a
     * not_duplicate parameter for the other.
     *
     * @param array $arr1 Normalized fields of the candidate to check
     * @param mixed $key1 Position of the candidate in $objects
     * @param array $objects Normalized fields of all candidates
     */
    protected function check_all_arraywalk(array $arr1, $key1, array $objects)
    {
        $guid1 = $arr1['guid'];

        foreach ($objects as $key2 => $arr2) {
            $guid2 = $arr2['guid'];

            // Strict comparison: with ==, different GUIDs that both look like
            // numbers (e.g. "0e...") could be treated as the same object
            if ($guid1 === $guid2) {
                continue;
            }

            // we've already examined this combination from the other end
            if ($key2 < $key1) {
                if (isset($this->p_map[$guid2][$guid1])) {
                    $this->p_map[$guid1][$guid2] = $this->p_map[$guid2][$guid1];
                }
                continue;
            }

            $p_arr = $this->p_duplicate($arr1, $arr2);

            if ($p_arr['p'] < $this->threshold) {
                continue;
            }

            try {
                $obj1 = $this->load($guid1);
                $obj2 = $this->load($guid2);
            } catch (midcom_error $e) {
                $e->log();
                continue;
            }

            if (   $obj1->get_parameter('org.openpsa.contacts.duplicates:not_duplicate', $obj2->guid)
                || $obj2->get_parameter('org.openpsa.contacts.duplicates:not_duplicate', $obj1->guid)) {
                // Not-duplicate parameter found, skip this pair
                continue;
            }

            $this->p_map[$guid1][$guid2] = $p_arr;
        }
    }

    /**
     * Write a message to the debug log and optionally echo it as well
     *
     * @param mixed $output Whether the message should be echoed
     * @param string $message The message
     * @param string $indent Prefix for the echoed line
     */
    protected function output($output, string $message, string $indent = '')
    {
        debug_add($message);
        if ($output) {
            echo $indent . 'INFO: ' . $message . "<br/>\n";
            flush();
        }
    }
}
221