<?php
/**
 * @package org.openpsa.contacts
 * @author Nemein Oy http://www.nemein.com/
 * @copyright Nemein Oy http://www.nemein.com/
 * @license http://www.gnu.org/licenses/gpl.html GNU General Public License
 */

/**
 * Search for duplicate persons and groups in database
 *
 * @package org.openpsa.contacts
 */
abstract class org_openpsa_contacts_duplicates_check
{
    /**
     * Used to store map of probabilities when seeking duplicates for given person/group
     *
     * Shape: [guid1 => [guid2 => result of p_duplicate()]]
     *
     * @var array
     */
    private $p_map = [];

    /**
     * Minimum score to count as duplicate
     *
     * @var int
     */
    private $threshold = 1;

    /**
     * Calculates P for the given two candidates being duplicates
     *
     * Both arguments are field arrays as produced by normalize_fields(),
     * i.e. they carry the configured fields plus a 'guid' entry.
     *
     * @return array with overall P (under key 'p') and matched checks
     */
    abstract protected function p_duplicate(array $candidate1, array $candidate2) : array;

    /**
     * Name of the class whose static new_collector() and get_cached() are used
     * to query and load the objects being compared
     */
    abstract protected function get_class() : string;

    /**
     * Names of the object properties that are read and handed to p_duplicate()
     */
    abstract protected function get_fields() : array;

    /**
     * Find all duplicates and mark them
     *
     * For every pair reported by check_all(), set_parameter() is called on both
     * objects in the domain org.openpsa.contacts.duplicates:possible_duplicate
     * with the other object's GUID and the pair's P value.
     *
     * @param bool $output Whether progress messages are echoed in addition to being logged
     */
    public function mark_all(bool $output)
    {
        $time_start = time();
        $this->output($output, 'Starting');

        foreach ($this->check_all() as $guid1 => $duplicates) {
            $duplicate1 = $this->load($guid1);
            foreach ($duplicates as $guid2 => $details) {
                $duplicate2 = $this->load($guid2);
                $msg = "Marking {$guid1} (#{$duplicate1->id}) and {$guid2} (#{$duplicate2->id}) as duplicates with P {$details['p']}";
                $duplicate1->set_parameter('org.openpsa.contacts.duplicates:possible_duplicate', $guid2, $details['p']);
                $duplicate2->set_parameter('org.openpsa.contacts.duplicates:possible_duplicate', $guid1, $details['p']);
                $this->output($output, $msg, '&nbsp;&nbsp;&nbsp;');
            }
        }

        $this->output($output, "DONE. Elapsed time " . (time() - $time_start) . " seconds");
    }

    /**
     * Find duplicates for given object
     *
     * @param midcom_core_dbaobject $object Object to find duplicates for
     * @param int $threshold Minimum P a candidate must reach to be returned
     * @return midcom_core_dbaobject[] List of possible duplicates
     */
    public function find_duplicates(midcom_core_dbaobject $object, int $threshold = 1) : array
    {
        // Collect the object's own values by name; no by-reference loop, so no
        // dangling reference is left behind in the array
        $fields = [];
        foreach ($this->get_fields() as $name) {
            $fields[$name] = $object->$name;
        }
        // Cast guards the string type hint in case the object has no GUID yet
        $normalized = $this->normalize_fields($fields, (string) $object->guid);

        $ret = [];
        foreach ($this->get_candidates($object) as $candidate) {
            $p_array = $this->p_duplicate($normalized, $candidate);
            if ($p_array['p'] >= $threshold) {
                $ret[] = $this->load($candidate['guid']);
            }
        }

        return $ret;
    }

    /**
     * Load normalized field arrays of possible duplicates
     *
     * Without an object, every row of the class is returned. With an object,
     * rows are limited to those where at least one configured field is LIKE
     * the object's value, excluding the object itself when it has an ID.
     *
     * @param midcom_core_dbaobject|null $object Object to find candidates for
     * @return array[] List of normalized field arrays (see normalize_fields())
     */
    private function get_candidates(?midcom_core_dbaobject $object = null) : array
    {
        $classname = $this->get_class();
        $fields = $this->get_fields();
        $mc = $classname::new_collector();

        if ($object) {
            if ($object->id) {
                $mc->add_constraint('id', '<>', $object->id);
            }
            // TODO: Avoid objects marked as not_duplicate already in this phase.
            // NOTE(review): if none of the fields has a value, the OR group stays
            // empty; how the collector treats that is not visible here - confirm.
            $mc->begin_group('OR');
            foreach ($fields as $field) {
                if ($field !== 'id' && $object->$field) {
                    $mc->add_constraint($field, 'LIKE', $object->$field);
                }
            }
            $mc->end_group();
        }

        $results = [];
        foreach ($mc->get_rows($fields) as $guid => $result) {
            $results[] = $this->normalize_fields($result, $guid);
        }
        return $results;
    }

    /**
     * Check whether both data sets hold the same non-empty value for a property
     *
     * Two strings are compared byte for byte. Loose comparison would treat
     * numeric-looking strings as numbers, so that e.g. "0123" and "123" or
     * "1e3" and "1000" would wrongly count as a match.
     *
     * @param string $property Key to compare
     * @param array $data1 First data set; an empty() value never matches
     * @param array $data2 Second data set; a missing key never matches
     */
    protected function match(string $property, array $data1, array $data2) : bool
    {
        if (empty($data1[$property])) {
            return false;
        }

        $left = $data1[$property];
        $right = $data2[$property] ?? null;

        if (is_string($left) && is_string($right)) {
            return $left === $right;
        }
        // Non-string values keep the previous loose comparison
        return $left == $right;
    }

    /**
     * Load the object with the given GUID through the class' get_cached()
     */
    private function load(string $guid) : midcom_core_dbaobject
    {
        $classname = $this->get_class();
        return $classname::get_cached($guid);
    }

    /**
     * Prepare fields for easier comparison
     *
     * Every value is cast to string, trimmed and lowercased; the GUID is added
     * under the key 'guid'.
     *
     * @param array $fields Field values indexed by field name
     * @param string $guid GUID of the object the values belong to
     * @return array Normalized values plus the 'guid' entry
     */
    private function normalize_fields(array $fields, string $guid) : array
    {
        // strtolower() only folds ASCII (PHP 8+), so prefer the multibyte
        // variant when the extension is available
        $multibyte = function_exists('mb_strtolower');

        $normalized = [];
        foreach ($fields as $name => $value) {
            // The cast keeps null values from reaching trim(), which is deprecated
            $value = trim((string) $value);
            $normalized[$name] = $multibyte ? mb_strtolower($value, 'UTF-8') : strtolower($value);
        }
        $normalized['guid'] = $guid;

        return $normalized;
    }

    /**
     * Find duplicates in database
     *
     * @param int $threshold Minimum P a pair must reach to be reported
     * @return array Possible duplicates as [guid => [duplicate guid => p_duplicate() result]]
     */
    protected function check_all(int $threshold = 1) : array
    {
        $this->p_map = []; //Make sure this is clean before starting
        $this->threshold = $threshold;
        midcom::get()->disable_limits();

        // PONDER: Can we do this in smaller batches using find_duplicated_person
        /*
        IDEA: Make an AT method for checking single persons duplicates, then another to batch
        register a check for every person in batches of say 500.
        */
        $candidates = $this->get_candidates();

        foreach ($candidates as $key => $candidate) {
            $this->check_all_arraywalk($candidate, $key, $candidates);
        }

        return $this->p_map;
    }

    /**
     * Used by check_all() to compare one candidate against all the others
     *
     * Pairs reaching the threshold are stored in $this->p_map.
     *
     * @param array $arr1 Normalized fields of the candidate being checked
     * @param mixed $key1 Position of $arr1 within $objects
     * @param array $objects All candidates
     */
    protected function check_all_arraywalk(array $arr1, $key1, array $objects)
    {
        $guid1 = $arr1['guid'];

        foreach ($objects as $key2 => $arr2) {
            $guid2 = $arr2['guid'];
            if ($guid1 === $guid2) {
                continue;
            }

            // we've already examined this combination from the other end
            if ($key2 < $key1) {
                if (isset($this->p_map[$guid2][$guid1])) {
                    // Mirror the earlier result so the map is symmetric
                    $this->p_map[$guid1][$guid2] = $this->p_map[$guid2][$guid1];
                }
                continue;
            }

            $p_arr = $this->p_duplicate($arr1, $arr2);

            if ($p_arr['p'] < $this->threshold) {
                continue;
            }

            try {
                $obj1 = $this->load($guid1);
                $obj2 = $this->load($guid2);
            } catch (midcom_error $e) {
                $e->log();
                continue;
            }

            if (   $obj1->get_parameter('org.openpsa.contacts.duplicates:not_duplicate', $obj2->guid)
                || $obj2->get_parameter('org.openpsa.contacts.duplicates:not_duplicate', $obj1->guid)) {
                // Not-duplicate parameter found on either side, skip this pair
                continue;
            }

            $this->p_map[$guid1][$guid2] = $p_arr;
        }
    }

    /**
     * Log a message and optionally echo it
     *
     * @param mixed $output Echo the message (followed by a line break) when truthy
     * @param string $message Message text
     * @param string $indent Prefix put in front of the echoed line
     */
    protected function output($output, string $message, string $indent = '')
    {
        debug_add($message);
        if ($output) {
            echo $indent . 'INFO: ' . $message . "<br/>\n";
            flush();
        }
    }
}