1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* User: jensk |
4
|
|
|
* Date: 1-3-2017 |
5
|
|
|
* Time: 10:34 |
6
|
|
|
*/ |
7
|
|
|
|
8
|
|
|
namespace library\search\indexer; |
9
|
|
|
|
10
|
|
|
use library\search\Indexer; |
11
|
|
|
|
12
|
|
|
/** |
13
|
|
|
* Calculate, relatively how often a term is used in a document |
14
|
|
|
* Where relatively means compared to the total of terms, how often is term X |
15
|
|
|
* used. For example: |
16
|
|
|
* doc1 has the following terms: |
17
|
|
|
* - term1 (count 2) |
18
|
|
|
* - term2 (count 1) |
19
|
|
|
* The total count of terms = 3 |
20
|
|
|
* The frequency of term1 in doc1 is: |
21
|
|
|
* count of term 1 / total count of terms |
22
|
|
|
* = |
23
|
|
|
* 2 / 3 = 0.66666666667 |
24
|
|
|
*/ |
25
|
|
|
class TermFrequency
{
    /**
     * Database handle used to read from `term_count` and to write to
     * `term_frequency`.
     *
     * @var \PDO
     */
    protected $dbHandle;

    /**
     * TermFrequency constructor.
     *
     * @param \PDO $dbHandle
     */
    public function __construct($dbHandle)
    {
        $this->dbHandle = $dbHandle;
    }

    /**
     * Calculates the frequency of every term per document/field combination
     * and stores the results in the `term_frequency` table.
     *
     * For each (documentPath, field) group found in `term_count`, the
     * frequency of a term is its count divided by the summed count of all
     * terms in that group. Rows are written with multi-row INSERT statements
     * of at most Indexer::SQLITE_MAX_COMPOUND_SELECT rows each.
     *
     * @return void
     *
     * @throws \Exception When one of the INSERT statements fails.
     */
    public function execute()
    {
        $db = $this->dbHandle;

        // The statement prefix is identical for every batch, so build it once.
        $sql = '
            INSERT INTO term_frequency (documentPath, field, term, frequency)
                 VALUES
        ';

        foreach ($this->getTotalTermCountPerDocument($db) as $documentField) {
            // A total of 0 (or NULL) would make the division below fail with a
            // DivisionByZeroError on PHP 8; there is no meaningful frequency
            // to store for such a group, so skip it.
            if ((float) $documentField->totalTermCount === 0.0) {
                continue;
            }

            $termsForDocumentField = $this->getTermsForDocumentField(
                $documentField->documentPath,
                $documentField->field
            );
            $quotedDocumentPath = $db->quote($documentField->documentPath);
            $quotedField = $db->quote($documentField->field);

            $values = array();
            foreach ($termsForDocumentField as $term) {
                $frequency = intval($term->count) / $documentField->totalTermCount;
                $values[] = $quotedDocumentPath . ',' . $quotedField . ', '
                    . $db->quote($term->term) . ', ' . $db->quote((string) $frequency);

                // Flush as soon as the batch reaches the configured row limit.
                // NOTE(review): the constant presumably mirrors SQLite's
                // compound-select limit - confirm against Indexer.
                if (count($values) >= Indexer::SQLITE_MAX_COMPOUND_SELECT) {
                    $this->executeStore($sql, $values, $db);
                    $values = array();
                }
            }

            // Store whatever is left over from the last, partially filled batch.
            if (count($values) !== 0) {
                $this->executeStore($sql, $values, $db);
            }
        }
    }

    /**
     * Fetches all terms with their counts for one document/field combination.
     *
     * @param string $documentPath Value matched against `term_count.documentPath`.
     * @param string $field        Value matched against `term_count.field`.
     *
     * @return array List of objects with the properties `term` and `count`.
     */
    private function getTermsForDocumentField($documentPath, $field)
    {
        $db = $this->dbHandle;
        $stmt = $db->prepare('
            SELECT `term`, `count`
              FROM `term_count`
             WHERE `documentPath` = :documentPath
               AND `field` = :field
        ');
        $stmt->bindValue(':documentPath', $documentPath);
        $stmt->bindValue(':field', $field);
        $stmt->execute();

        return $stmt->fetchAll(\PDO::FETCH_CLASS);
    }

    /**
     * Fetches the summed term count for every document/field combination.
     *
     * @param \PDO $db
     *
     * @return array List of objects with the properties `documentPath`,
     *               `field` and `totalTermCount`.
     */
    private function getTotalTermCountPerDocument($db)
    {
        $stmt = $db->prepare('
            SELECT documentPath, field, SUM(count) as totalTermCount
              FROM term_count
          GROUP BY documentPath, field
        ');
        $stmt->execute();

        return $stmt->fetchAll(\PDO::FETCH_CLASS);
    }

    /**
     * Appends the given value tuples to the INSERT prefix and executes the
     * resulting multi-row statement.
     *
     * @param string   $sql    INSERT statement up to and including `VALUES`.
     * @param string[] $values Already quoted, comma separated column values,
     *                         one entry per row (without parentheses).
     * @param \PDO     $db
     *
     * @return void
     *
     * @throws \Exception When the statement could not be executed.
     */
    private function executeStore($sql, $values, $db)
    {
        $sql .= '(' . implode('),' . PHP_EOL . '(', $values) . ');';

        // exec() is the PDO call for statements without a result set; it
        // returns false on failure (0 affected rows is not an error, hence
        // the strict comparison).
        if ($db->exec($sql) === false) {
            $errorInfo = $db->errorInfo();
            $errorMsg = $errorInfo[2];
            throw new \Exception('SQLite Exception: ' . $errorMsg . ' in SQL: <br /><pre>' . $sql . '</pre>');
        }
    }
}
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.