@@ -13,47 +13,47 @@ |
||
13 | 13 | */ |
14 | 14 | class Tokenizer |
15 | 15 | { |
16 | - protected $inputString; |
|
17 | - protected $tokenVector = array(); |
|
18 | - |
|
19 | - /** |
|
20 | - * Tokenizer constructor. |
|
21 | - * |
|
22 | - * @param string $string Should preferably be parsed wit \CloudControl\Cms\search\CharacterFilter |
|
23 | - * @see \CloudControl\Cms\search\CharacterFilter |
|
24 | - */ |
|
25 | - public function __construct($string) |
|
26 | - { |
|
27 | - $this->inputString = $string; |
|
28 | - $this->tokenize(); |
|
29 | - } |
|
30 | - |
|
31 | - protected function tokenize() |
|
32 | - { |
|
33 | - $tokens = explode(' ', $this->inputString); |
|
34 | - foreach ($tokens as $token) { |
|
35 | - $this->addTokenToVector($token); |
|
36 | - } |
|
37 | - } |
|
38 | - |
|
39 | - protected function addTokenToVector($token) |
|
40 | - { |
|
41 | - if (!empty($token)) { |
|
42 | - if (isset($this->tokenVector[$token])) { |
|
43 | - $this->tokenVector[$token] += 1; |
|
44 | - } else { |
|
45 | - $this->tokenVector[$token] = 1; |
|
46 | - } |
|
47 | - } |
|
48 | - } |
|
49 | - |
|
50 | - /** |
|
51 | - * @return array |
|
52 | - */ |
|
53 | - public function getTokenVector() |
|
54 | - { |
|
55 | - return $this->tokenVector; |
|
56 | - } |
|
16 | + protected $inputString; |
|
17 | + protected $tokenVector = array(); |
|
18 | + |
|
19 | + /** |
|
20 | + * Tokenizer constructor. |
|
21 | + * |
|
22 | + * @param string $string Should preferably be parsed with \CloudControl\Cms\search\CharacterFilter |
|
23 | + * @see \CloudControl\Cms\search\CharacterFilter |
|
24 | + */ |
|
25 | + public function __construct($string) |
|
26 | + { |
|
27 | + $this->inputString = $string; |
|
28 | + $this->tokenize(); |
|
29 | + } |
|
30 | + |
|
31 | + protected function tokenize() |
|
32 | + { |
|
33 | + $tokens = explode(' ', $this->inputString); |
|
34 | + foreach ($tokens as $token) { |
|
35 | + $this->addTokenToVector($token); |
|
36 | + } |
|
37 | + } |
|
38 | + |
|
39 | + protected function addTokenToVector($token) |
|
40 | + { |
|
41 | + if (!empty($token)) { |
|
42 | + if (isset($this->tokenVector[$token])) { |
|
43 | + $this->tokenVector[$token] += 1; |
|
44 | + } else { |
|
45 | + $this->tokenVector[$token] = 1; |
|
46 | + } |
|
47 | + } |
|
48 | + } |
|
49 | + |
|
50 | + /** |
|
51 | + * @return array |
|
52 | + */ |
|
53 | + public function getTokenVector() |
|
54 | + { |
|
55 | + return $this->tokenVector; |
|
56 | + } |
|
57 | 57 | |
58 | 58 | |
59 | 59 | } |
60 | 60 | \ No newline at end of file |
@@ -11,15 +11,15 @@ |
||
11 | 11 | |
12 | 12 | interface Filter |
13 | 13 | { |
14 | - /** |
|
15 | - * Filter constructor. |
|
16 | - * |
|
17 | - * @param array $tokens |
|
18 | - */ |
|
19 | - public function __construct($tokens); |
|
14 | + /** |
|
15 | + * Filter constructor. |
|
16 | + * |
|
17 | + * @param array $tokens |
|
18 | + */ |
|
19 | + public function __construct($tokens); |
|
20 | 20 | |
21 | - /** |
|
22 | - * @return array |
|
23 | - */ |
|
24 | - public function getFilterResults(); |
|
21 | + /** |
|
22 | + * @return array |
|
23 | + */ |
|
24 | + public function getFilterResults(); |
|
25 | 25 | } |
26 | 26 | \ No newline at end of file |
@@ -13,149 +13,149 @@ |
||
13 | 13 | |
14 | 14 | class DocumentTokenizer |
15 | 15 | { |
16 | - /** |
|
17 | - * @var Document |
|
18 | - */ |
|
19 | - protected $document; |
|
20 | - |
|
21 | - /** |
|
22 | - * @var array |
|
23 | - */ |
|
24 | - protected $tokenVector = array(); |
|
25 | - protected $storage; |
|
26 | - |
|
27 | - /** |
|
28 | - * Tokenizer constructor. |
|
29 | - * |
|
30 | - * @param \CloudControl\Cms\storage\Document $document |
|
31 | - * @param Storage $storage |
|
32 | - */ |
|
33 | - public function __construct(Document $document, Storage $storage) |
|
34 | - { |
|
35 | - $this->document = $document; |
|
36 | - $this->storage = $storage; |
|
37 | - $this->tokenize(); |
|
38 | - } |
|
39 | - |
|
40 | - /** |
|
41 | - * Execute tokenization of all document fields |
|
42 | - */ |
|
43 | - private function tokenize() |
|
44 | - { |
|
45 | - $this->tokenizeTitle(); |
|
46 | - $this->tokenizeFields(); |
|
47 | - $this->tokenizeBricks(); |
|
48 | - $this->tokenizeDynamicBricks(); |
|
49 | - $this->tokenVector = array_filter($this->tokenVector); |
|
50 | - arsort($this->tokenVector); |
|
51 | - } |
|
52 | - |
|
53 | - private function tokenizeTitle() |
|
54 | - { |
|
55 | - $filteredString = new CharacterFilter($this->document->title); |
|
56 | - $tokenizer = new Tokenizer($filteredString); |
|
57 | - $this->addTokenVectorToVector($tokenizer->getTokenVector(), 'title'); |
|
58 | - } |
|
59 | - |
|
60 | - private function tokenizeFields() |
|
61 | - { |
|
62 | - $fields = $this->document->fields; |
|
63 | - $documentDefinition = $this->storage->getDocumentTypes()->getDocumentTypeBySlug($this->document->documentTypeSlug); |
|
64 | - foreach ($fields as $fieldName => $field) { |
|
65 | - $fieldType = $this->getFieldType($fieldName, $documentDefinition); |
|
66 | - $this->tokenizeField($field, $fieldName, $fieldType); |
|
67 | - } |
|
68 | - } |
|
69 | - |
|
70 | - private function tokenizeField($field, $fieldName, $fieldType) |
|
71 | - { |
|
72 | - foreach ($field as $value) { |
|
73 | - // Only index fields that contain text |
|
74 | - if (in_array($fieldType, array('String', 'Text', 'Rich Text'))) { |
|
75 | - $filteredString = new CharacterFilter($value); |
|
76 | - $tokenizer = new Tokenizer($filteredString); |
|
77 | - $this->addTokenVectorToVector($tokenizer->getTokenVector(), $fieldName); |
|
78 | - } |
|
79 | - } |
|
80 | - } |
|
81 | - |
|
82 | - private function tokenizeBricks() |
|
83 | - { |
|
84 | - $bricks = $this->document->bricks; |
|
85 | - foreach ($bricks as $brickSlug => $bricks) { |
|
86 | - foreach ($bricks as $brick) { |
|
87 | - $this->tokenizeBrick($brick, $brickSlug); |
|
88 | - } |
|
89 | - } |
|
90 | - } |
|
91 | - |
|
92 | - private function tokenizeBrick($brick, $brickSlug) |
|
93 | - { |
|
94 | - $fields = $brick->fields; |
|
95 | - $brickDefinition = $this->storage->getBricks()->getBrickBySlug($brick->type); |
|
96 | - foreach ($fields as $fieldName => $field) { |
|
97 | - $fieldType = $this->getFieldType($fieldName, $brickDefinition); |
|
98 | - $this->tokenizeField($field, $brickSlug . '__' . $fieldName, $fieldType); |
|
99 | - } |
|
100 | - } |
|
101 | - |
|
102 | - private function tokenizeDynamicBricks() |
|
103 | - { |
|
104 | - $dynamicBricks = $this->document->dynamicBricks; |
|
105 | - foreach ($dynamicBricks as $key => $brick) { |
|
106 | - $this->tokenizeBrick($brick, 'dynamicBricks__' . $brick->type . $key); |
|
107 | - } |
|
108 | - } |
|
109 | - |
|
110 | - public function getTokens() |
|
111 | - { |
|
112 | - return $this->tokenVector; |
|
113 | - } |
|
114 | - |
|
115 | - /** |
|
116 | - * Add a token to the existing tokenvector |
|
117 | - * @param $token |
|
118 | - * @param string $field |
|
119 | - * @param int $count |
|
120 | - */ |
|
121 | - private function addTokenToVector($token, $field, $count = 1) |
|
122 | - { |
|
123 | - if (!empty($token)) { |
|
124 | - if (isset($this->tokenVector[$field][$token])) { |
|
125 | - $this->tokenVector[$field][$token] += $count; |
|
126 | - } else { |
|
127 | - $this->tokenVector[$field][$token] = $count; |
|
128 | - } |
|
129 | - } |
|
130 | - } |
|
131 | - |
|
132 | - /** |
|
133 | - * Add a complete token vector to the existing one. |
|
134 | - * @param $tokenVector |
|
135 | - * @param $field |
|
136 | - */ |
|
137 | - private function addTokenVectorToVector($tokenVector, $field) |
|
138 | - { |
|
139 | - foreach ($tokenVector as $token => $count) { |
|
140 | - $this->addTokenToVector($token, $field, $count); |
|
141 | - } |
|
142 | - } |
|
143 | - |
|
144 | - /** |
|
145 | - * Get the type for a field |
|
146 | - * @param $fieldName |
|
147 | - * @param $documentDefinition |
|
148 | - * @return mixed |
|
149 | - * @throws \Exception |
|
150 | - */ |
|
151 | - private function getFieldType($fieldName, $documentDefinition) |
|
152 | - { |
|
153 | - foreach ($documentDefinition->fields as $fieldTypeDefinition) { |
|
154 | - if ($fieldTypeDefinition->slug === $fieldName) { |
|
155 | - return $fieldTypeDefinition->type; |
|
156 | - } |
|
157 | - } |
|
158 | - |
|
159 | - throw new \Exception('Unknown field type for field' . $fieldName . ' in document ' . $this->document->path); |
|
160 | - } |
|
16 | + /** |
|
17 | + * @var Document |
|
18 | + */ |
|
19 | + protected $document; |
|
20 | + |
|
21 | + /** |
|
22 | + * @var array |
|
23 | + */ |
|
24 | + protected $tokenVector = array(); |
|
25 | + protected $storage; |
|
26 | + |
|
27 | + /** |
|
28 | + * DocumentTokenizer constructor. |
|
29 | + * |
|
30 | + * @param \CloudControl\Cms\storage\Document $document |
|
31 | + * @param Storage $storage |
|
32 | + */ |
|
33 | + public function __construct(Document $document, Storage $storage) |
|
34 | + { |
|
35 | + $this->document = $document; |
|
36 | + $this->storage = $storage; |
|
37 | + $this->tokenize(); |
|
38 | + } |
|
39 | + |
|
40 | + /** |
|
41 | + * Execute tokenization of all document fields |
|
42 | + */ |
|
43 | + private function tokenize() |
|
44 | + { |
|
45 | + $this->tokenizeTitle(); |
|
46 | + $this->tokenizeFields(); |
|
47 | + $this->tokenizeBricks(); |
|
48 | + $this->tokenizeDynamicBricks(); |
|
49 | + $this->tokenVector = array_filter($this->tokenVector); |
|
50 | + arsort($this->tokenVector); |
|
51 | + } |
|
52 | + |
|
53 | + private function tokenizeTitle() |
|
54 | + { |
|
55 | + $filteredString = new CharacterFilter($this->document->title); |
|
56 | + $tokenizer = new Tokenizer($filteredString); |
|
57 | + $this->addTokenVectorToVector($tokenizer->getTokenVector(), 'title'); |
|
58 | + } |
|
59 | + |
|
60 | + private function tokenizeFields() |
|
61 | + { |
|
62 | + $fields = $this->document->fields; |
|
63 | + $documentDefinition = $this->storage->getDocumentTypes()->getDocumentTypeBySlug($this->document->documentTypeSlug); |
|
64 | + foreach ($fields as $fieldName => $field) { |
|
65 | + $fieldType = $this->getFieldType($fieldName, $documentDefinition); |
|
66 | + $this->tokenizeField($field, $fieldName, $fieldType); |
|
67 | + } |
|
68 | + } |
|
69 | + |
|
70 | + private function tokenizeField($field, $fieldName, $fieldType) |
|
71 | + { |
|
72 | + foreach ($field as $value) { |
|
73 | + // Only index fields that contain text |
|
74 | + if (in_array($fieldType, array('String', 'Text', 'Rich Text'))) { |
|
75 | + $filteredString = new CharacterFilter($value); |
|
76 | + $tokenizer = new Tokenizer($filteredString); |
|
77 | + $this->addTokenVectorToVector($tokenizer->getTokenVector(), $fieldName); |
|
78 | + } |
|
79 | + } |
|
80 | + } |
|
81 | + |
|
82 | + private function tokenizeBricks() |
|
83 | + { |
|
84 | + $bricks = $this->document->bricks; |
|
85 | + foreach ($bricks as $brickSlug => $bricks) { |
|
86 | + foreach ($bricks as $brick) { |
|
87 | + $this->tokenizeBrick($brick, $brickSlug); |
|
88 | + } |
|
89 | + } |
|
90 | + } |
|
91 | + |
|
92 | + private function tokenizeBrick($brick, $brickSlug) |
|
93 | + { |
|
94 | + $fields = $brick->fields; |
|
95 | + $brickDefinition = $this->storage->getBricks()->getBrickBySlug($brick->type); |
|
96 | + foreach ($fields as $fieldName => $field) { |
|
97 | + $fieldType = $this->getFieldType($fieldName, $brickDefinition); |
|
98 | + $this->tokenizeField($field, $brickSlug . '__' . $fieldName, $fieldType); |
|
99 | + } |
|
100 | + } |
|
101 | + |
|
102 | + private function tokenizeDynamicBricks() |
|
103 | + { |
|
104 | + $dynamicBricks = $this->document->dynamicBricks; |
|
105 | + foreach ($dynamicBricks as $key => $brick) { |
|
106 | + $this->tokenizeBrick($brick, 'dynamicBricks__' . $brick->type . $key); |
|
107 | + } |
|
108 | + } |
|
109 | + |
|
110 | + public function getTokens() |
|
111 | + { |
|
112 | + return $this->tokenVector; |
|
113 | + } |
|
114 | + |
|
115 | + /** |
|
116 | + * Add a token to the existing tokenvector |
|
117 | + * @param $token |
|
118 | + * @param string $field |
|
119 | + * @param int $count |
|
120 | + */ |
|
121 | + private function addTokenToVector($token, $field, $count = 1) |
|
122 | + { |
|
123 | + if (!empty($token)) { |
|
124 | + if (isset($this->tokenVector[$field][$token])) { |
|
125 | + $this->tokenVector[$field][$token] += $count; |
|
126 | + } else { |
|
127 | + $this->tokenVector[$field][$token] = $count; |
|
128 | + } |
|
129 | + } |
|
130 | + } |
|
131 | + |
|
132 | + /** |
|
133 | + * Add a complete token vector to the existing one. |
|
134 | + * @param $tokenVector |
|
135 | + * @param $field |
|
136 | + */ |
|
137 | + private function addTokenVectorToVector($tokenVector, $field) |
|
138 | + { |
|
139 | + foreach ($tokenVector as $token => $count) { |
|
140 | + $this->addTokenToVector($token, $field, $count); |
|
141 | + } |
|
142 | + } |
|
143 | + |
|
144 | + /** |
|
145 | + * Get the type for a field |
|
146 | + * @param $fieldName |
|
147 | + * @param $documentDefinition |
|
148 | + * @return mixed |
|
149 | + * @throws \Exception |
|
150 | + */ |
|
151 | + private function getFieldType($fieldName, $documentDefinition) |
|
152 | + { |
|
153 | + foreach ($documentDefinition->fields as $fieldTypeDefinition) { |
|
154 | + if ($fieldTypeDefinition->slug === $fieldName) { |
|
155 | + return $fieldTypeDefinition->type; |
|
156 | + } |
|
157 | + } |
|
158 | + |
|
159 | + throw new \Exception('Unknown field type for field' . $fieldName . ' in document ' . $this->document->path); |
|
160 | + } |
|
161 | 161 | } |
162 | 162 | \ No newline at end of file |
@@ -9,76 +9,76 @@ |
||
9 | 9 | |
10 | 10 | class CharacterFilter |
11 | 11 | { |
12 | - protected $originalString; |
|
13 | - protected $filteredString = ''; |
|
12 | + protected $originalString; |
|
13 | + protected $filteredString = ''; |
|
14 | 14 | |
15 | - /** |
|
16 | - * CharacterFilter constructor. |
|
17 | - * |
|
18 | - * @param $string |
|
19 | - */ |
|
20 | - public function __construct($string) |
|
21 | - { |
|
22 | - $this->originalString = $string; |
|
23 | - $string = $this->convertToUTF8($string); |
|
24 | - $string = mb_strtolower($string); |
|
25 | - $string = $this->filterSpecialCharacters($string); |
|
26 | - $this->filteredString = $string; |
|
27 | - } |
|
15 | + /** |
|
16 | + * CharacterFilter constructor. |
|
17 | + * |
|
18 | + * @param $string |
|
19 | + */ |
|
20 | + public function __construct($string) |
|
21 | + { |
|
22 | + $this->originalString = $string; |
|
23 | + $string = $this->convertToUTF8($string); |
|
24 | + $string = mb_strtolower($string); |
|
25 | + $string = $this->filterSpecialCharacters($string); |
|
26 | + $this->filteredString = $string; |
|
27 | + } |
|
28 | 28 | |
29 | - /** |
|
30 | - * Returns the filtered string |
|
31 | - * @return string|void |
|
32 | - */ |
|
33 | - public function __toString() |
|
34 | - { |
|
35 | - return $this->filteredString; |
|
36 | - } |
|
29 | + /** |
|
30 | + * Returns the filtered string |
|
31 | + * @return string|void |
|
32 | + */ |
|
33 | + public function __toString() |
|
34 | + { |
|
35 | + return $this->filteredString; |
|
36 | + } |
|
37 | 37 | |
38 | - /** |
|
39 | - * Filter out all special characters, like punctuation and characters with accents |
|
40 | - * |
|
41 | - * @param $string |
|
42 | - * |
|
43 | - * @return mixed|string |
|
44 | - */ |
|
45 | - private function filterSpecialCharacters($string) |
|
46 | - { |
|
47 | - $string = str_replace('<', ' <', $string); // This is need, otherwise this: <h1>something</h1><h2>something</h2> will result in somethingsomething |
|
48 | - $string = strip_tags($string); |
|
49 | - $string = trim($string); |
|
50 | - $string = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $string); // Remove special alphanumeric characters |
|
51 | - $string = str_replace(array('+', '=', '!', ',', '.',';', ':', '?'), ' ', $string); // Replace sentence breaking charaters with spaces |
|
52 | - $string = preg_replace("/[\r\n]+/", " ", $string); // Replace multiple newlines with a single space. |
|
53 | - $string = preg_replace("/[\t]+/", " ", $string); // Replace multiple tabs with a single space. |
|
54 | - $string = preg_replace("/[^a-zA-Z0-9 ]/", '', $string); // Filter out everything that is not alphanumeric or a space |
|
55 | - $string = preg_replace('!\s+!', ' ', $string); // Replace multiple spaces with a single space |
|
56 | - return $string; |
|
57 | - } |
|
38 | + /** |
|
39 | + * Filter out all special characters, like punctuation and characters with accents |
|
40 | + * |
|
41 | + * @param $string |
|
42 | + * |
|
43 | + * @return mixed|string |
|
44 | + */ |
|
45 | + private function filterSpecialCharacters($string) |
|
46 | + { |
|
47 | + $string = str_replace('<', ' <', $string); // This is needed, otherwise this: <h1>something</h1><h2>something</h2> will result in somethingsomething |
|
48 | + $string = strip_tags($string); |
|
49 | + $string = trim($string); |
|
50 | + $string = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $string); // Remove special alphanumeric characters |
|
51 | + $string = str_replace(array('+', '=', '!', ',', '.',';', ':', '?'), ' ', $string); // Replace sentence-breaking characters with spaces |
|
52 | + $string = preg_replace("/[\r\n]+/", " ", $string); // Replace multiple newlines with a single space. |
|
53 | + $string = preg_replace("/[\t]+/", " ", $string); // Replace multiple tabs with a single space. |
|
54 | + $string = preg_replace("/[^a-zA-Z0-9 ]/", '', $string); // Filter out everything that is not alphanumeric or a space |
|
55 | + $string = preg_replace('!\s+!', ' ', $string); // Replace multiple spaces with a single space |
|
56 | + return $string; |
|
57 | + } |
|
58 | 58 | |
59 | - /** |
|
60 | - * Convert the string to UTF-8 encoding |
|
61 | - * @param $string |
|
62 | - * |
|
63 | - * @return string |
|
64 | - */ |
|
65 | - private function convertToUTF8($string) |
|
66 | - { |
|
67 | - $encoding = mb_detect_encoding($string, mb_detect_order(), false); |
|
59 | + /** |
|
60 | + * Convert the string to UTF-8 encoding |
|
61 | + * @param $string |
|
62 | + * |
|
63 | + * @return string |
|
64 | + */ |
|
65 | + private function convertToUTF8($string) |
|
66 | + { |
|
67 | + $encoding = mb_detect_encoding($string, mb_detect_order(), false); |
|
68 | 68 | |
69 | - if($encoding == "UTF-8") { |
|
70 | - $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8'); |
|
71 | - } |
|
69 | + if($encoding == "UTF-8") { |
|
70 | + $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8'); |
|
71 | + } |
|
72 | 72 | |
73 | - $out = iconv(mb_detect_encoding($string, mb_detect_order(), false), "UTF-8//IGNORE", $string); |
|
74 | - return $out; |
|
75 | - } |
|
73 | + $out = iconv(mb_detect_encoding($string, mb_detect_order(), false), "UTF-8//IGNORE", $string); |
|
74 | + return $out; |
|
75 | + } |
|
76 | 76 | |
77 | - /** |
|
78 | - * @return mixed|string |
|
79 | - */ |
|
80 | - public function getFilteredString() |
|
81 | - { |
|
82 | - return $this->filteredString; |
|
83 | - } |
|
77 | + /** |
|
78 | + * @return mixed|string |
|
79 | + */ |
|
80 | + public function getFilteredString() |
|
81 | + { |
|
82 | + return $this->filteredString; |
|
83 | + } |
|
84 | 84 | } |
85 | 85 | \ No newline at end of file |
@@ -10,5 +10,5 @@ |
||
10 | 10 | |
11 | 11 | class EnglishStopWords extends StopWordsFilter |
12 | 12 | { |
13 | - protected $stopWords = array('a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'arent', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'cant', 'cannot', 'could', 'couldnt', 'did', 'didnt', 'do', 'does', 'doesnt', 'doing', 'dont', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hed', 'hell', 'hes', 'her', 'here', 'heres', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'hows', 'i', 'id', 'ill', 'im', 'ive', 'if', 'in', 'into', 'is', 'isnt', 'it', 'its', 'its', 'itself', 'lets', 'me', 'more', 'most', 'mustnt', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours ourselves', 'out', 'over', 'own', 'same', 'shant', 'she', 'shed', 'shell', 'shes', 'should', 'shouldnt', 'so', 'some', 'such', 'than', 'that', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'theres', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasnt', 'we', 'wed', 'well', 'were', 'weve', 'were', 'werent', 'what', 'whats', 'when', 'whens', 'where', 'wheres', 'which', 'while', 'who', 'whos', 'whom', 'why', 'whys', 'with', 'wont', 'would', 'wouldnt', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves'); |
|
13 | + protected $stopWords = array('a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', 'arent', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'cant', 'cannot', 'could', 'couldnt', 'did', 'didnt', 'do', 'does', 'doesnt', 'doing', 'dont', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadnt', 'has', 'hasnt', 'have', 'havent', 'having', 'he', 'hed', 'hell', 'hes', 'her', 'here', 'heres', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'hows', 'i', 'id', 'ill', 'im', 'ive', 'if', 'in', 'into', 'is', 'isnt', 'it', 'its', 'its', 'itself', 'lets', 'me', 'more', 'most', 'mustnt', 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours ourselves', 'out', 'over', 'own', 'same', 'shant', 'she', 'shed', 'shell', 'shes', 'should', 'shouldnt', 'so', 'some', 'such', 'than', 'that', 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'theres', 'these', 'they', 'theyd', 'theyll', 'theyre', 'theyve', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', 'wasnt', 'we', 'wed', 'well', 'were', 'weve', 'were', 'werent', 'what', 'whats', 'when', 'whens', 'where', 'wheres', 'which', 'while', 'who', 'whos', 'whom', 'why', 'whys', 'with', 'wont', 'would', 'wouldnt', 'you', 'youd', 'youll', 'youre', 'youve', 'your', 'yours', 'yourself', 'yourselves'); |
|
14 | 14 | } |
15 | 15 | \ No newline at end of file |
@@ -10,5 +10,5 @@ |
||
10 | 10 | |
11 | 11 | class DutchStopWords extends StopWordsFilter |
12 | 12 | { |
13 | - protected $stopWords = array('aan','af','al','alles','als','altijd','andere','ben','bij','daar','dan','dat','de','der','deze','die','dit','doch','doen','door','doorgaans','dus','een','eens','en','er','ge','geen','geweest','haar','had','heb','hebben','heeft','hem','het','hier','hij','hoe','hun','iemand','iets','ik','in','is','ja','je','kan','kon','kunnen','maar','me','meer','men','met','mij','mijn','moet','na','naar','niet','niets','nog','nu','of','om','omdat','ons','ook','op','over','reeds','te','tegen','toch','toen','tot','u','uit','uw','van','veel','voor','want','waren','was','wat','we','wel','werd','wezen','wie','wij','wil','worden','zal','ze','zei','zelf','zich','zij','zijn','zo','zodat','zonder','zou'); |
|
13 | + protected $stopWords = array('aan','af','al','alles','als','altijd','andere','ben','bij','daar','dan','dat','de','der','deze','die','dit','doch','doen','door','doorgaans','dus','een','eens','en','er','ge','geen','geweest','haar','had','heb','hebben','heeft','hem','het','hier','hij','hoe','hun','iemand','iets','ik','in','is','ja','je','kan','kon','kunnen','maar','me','meer','men','met','mij','mijn','moet','na','naar','niet','niets','nog','nu','of','om','omdat','ons','ook','op','over','reeds','te','tegen','toch','toen','tot','u','uit','uw','van','veel','voor','want','waren','was','wat','we','wel','werd','wezen','wie','wij','wil','worden','zal','ze','zei','zelf','zich','zij','zijn','zo','zodat','zonder','zou'); |
|
14 | 14 | } |
15 | 15 | \ No newline at end of file |
@@ -22,87 +22,87 @@ discard block |
||
22 | 22 | */ |
23 | 23 | class Indexer extends SearchDbConnected |
24 | 24 | { |
25 | - const SQLITE_MAX_COMPOUND_SELECT = 100; |
|
26 | - protected $filters = array( |
|
27 | - 'DutchStopWords', |
|
28 | - 'EnglishStopWords' |
|
29 | - ); |
|
30 | - protected $storageDir; |
|
31 | - /** |
|
32 | - * @var double |
|
33 | - */ |
|
34 | - protected $loggingStart; |
|
35 | - /** |
|
36 | - * @var string |
|
37 | - */ |
|
38 | - protected $log; |
|
39 | - /** |
|
40 | - * @var double |
|
41 | - */ |
|
42 | - protected $lastLog; |
|
43 | - |
|
44 | - const SEARCH_TEMP_DB = 'search_tmp.db'; |
|
45 | - |
|
46 | - /** |
|
47 | - * Creates a new temporary search db, cleans it if it exists |
|
48 | - * then calculates and stores the search index in this db |
|
49 | - * and finally if indexing completed replaces the current search |
|
50 | - * db with the temporary one. Returns the log in string format. |
|
51 | - * @return string |
|
52 | - */ |
|
53 | - public function updateIndex() |
|
54 | - { |
|
55 | - $this->startLogging(); |
|
56 | - $this->addLog('Indexing start.'); |
|
57 | - $this->addLog('Clearing index.'); |
|
58 | - $this->resetIndex(); |
|
59 | - $this->addLog('Cleaning Published Deleted Documents'); |
|
60 | - $this->storage->getDocuments()->cleanPublishedDeletedDocuments(); |
|
61 | - $this->addLog('Retrieving documents to be indexed.'); |
|
62 | - $documents = $this->storage->getDocuments()->getPublishedDocumentsNoFolders(); |
|
63 | - $this->addLog('Start Document Term Count for ' . count($documents) . ' documents'); |
|
64 | - $this->createDocumentTermCount($documents); |
|
65 | - $this->addLog('Start Document Term Frequency.'); |
|
66 | - $this->createDocumentTermFrequency(); |
|
67 | - $this->addLog('Start Term Field Length Norm.'); |
|
68 | - $this->createTermFieldLengthNorm(); |
|
69 | - $this->addLog('Start Inverse Document Frequency.'); |
|
70 | - $this->createInverseDocumentFrequency(); |
|
71 | - $this->addLog('Replacing old index.'); |
|
72 | - $this->replaceOldIndex(); |
|
73 | - $this->addLog('Indexing complete.'); |
|
74 | - return $this->log; |
|
75 | - } |
|
76 | - |
|
77 | - /** |
|
78 | - * Count how often a term is used in a document |
|
79 | - * |
|
80 | - * @param $documents |
|
81 | - */ |
|
82 | - public function createDocumentTermCount($documents) |
|
83 | - { |
|
84 | - $termCount = new TermCount($this->getSearchDbHandle(), $documents, $this->filters, $this->storage); |
|
85 | - $termCount->execute(); |
|
86 | - } |
|
87 | - |
|
88 | - /** |
|
89 | - * Calculate the frequency index for a term with |
|
90 | - * a field |
|
91 | - */ |
|
92 | - public function createDocumentTermFrequency() |
|
93 | - { |
|
94 | - $termFrequency = new TermFrequency($this->getSearchDbHandle()); |
|
95 | - $termFrequency->execute(); |
|
96 | - } |
|
97 | - |
|
98 | - |
|
99 | - /** |
|
100 | - * Resets the entire index |
|
101 | - */ |
|
102 | - public function resetIndex() |
|
103 | - { |
|
104 | - $db = $this->getSearchDbHandle(); |
|
105 | - $sql = ' |
|
25 | + const SQLITE_MAX_COMPOUND_SELECT = 100; |
|
26 | + protected $filters = array( |
|
27 | + 'DutchStopWords', |
|
28 | + 'EnglishStopWords' |
|
29 | + ); |
|
30 | + protected $storageDir; |
|
31 | + /** |
|
32 | + * @var double |
|
33 | + */ |
|
34 | + protected $loggingStart; |
|
35 | + /** |
|
36 | + * @var string |
|
37 | + */ |
|
38 | + protected $log; |
|
39 | + /** |
|
40 | + * @var double |
|
41 | + */ |
|
42 | + protected $lastLog; |
|
43 | + |
|
44 | + const SEARCH_TEMP_DB = 'search_tmp.db'; |
|
45 | + |
|
46 | + /** |
|
47 | + * Creates a new temporary search db, cleans it if it exists |
|
48 | + * then calculates and stores the search index in this db |
|
49 | + * and finally if indexing completed replaces the current search |
|
50 | + * db with the temporary one. Returns the log in string format. |
|
51 | + * @return string |
|
52 | + */ |
|
53 | + public function updateIndex() |
|
54 | + { |
|
55 | + $this->startLogging(); |
|
56 | + $this->addLog('Indexing start.'); |
|
57 | + $this->addLog('Clearing index.'); |
|
58 | + $this->resetIndex(); |
|
59 | + $this->addLog('Cleaning Published Deleted Documents'); |
|
60 | + $this->storage->getDocuments()->cleanPublishedDeletedDocuments(); |
|
61 | + $this->addLog('Retrieving documents to be indexed.'); |
|
62 | + $documents = $this->storage->getDocuments()->getPublishedDocumentsNoFolders(); |
|
63 | + $this->addLog('Start Document Term Count for ' . count($documents) . ' documents'); |
|
64 | + $this->createDocumentTermCount($documents); |
|
65 | + $this->addLog('Start Document Term Frequency.'); |
|
66 | + $this->createDocumentTermFrequency(); |
|
67 | + $this->addLog('Start Term Field Length Norm.'); |
|
68 | + $this->createTermFieldLengthNorm(); |
|
69 | + $this->addLog('Start Inverse Document Frequency.'); |
|
70 | + $this->createInverseDocumentFrequency(); |
|
71 | + $this->addLog('Replacing old index.'); |
|
72 | + $this->replaceOldIndex(); |
|
73 | + $this->addLog('Indexing complete.'); |
|
74 | + return $this->log; |
|
75 | + } |
|
76 | + |
|
77 | + /** |
|
78 | + * Count how often a term is used in a document |
|
79 | + * |
|
80 | + * @param $documents |
|
81 | + */ |
|
82 | + public function createDocumentTermCount($documents) |
|
83 | + { |
|
84 | + $termCount = new TermCount($this->getSearchDbHandle(), $documents, $this->filters, $this->storage); |
|
85 | + $termCount->execute(); |
|
86 | + } |
|
87 | + |
|
88 | + /** |
|
89 | + * Calculate the frequency index for a term with |
|
90 | + * a field |
|
91 | + */ |
|
92 | + public function createDocumentTermFrequency() |
|
93 | + { |
|
94 | + $termFrequency = new TermFrequency($this->getSearchDbHandle()); |
|
95 | + $termFrequency->execute(); |
|
96 | + } |
|
97 | + |
|
98 | + |
|
99 | + /** |
|
100 | + * Resets the entire index |
|
101 | + */ |
|
102 | + public function resetIndex() |
|
103 | + { |
|
104 | + $db = $this->getSearchDbHandle(); |
|
105 | + $sql = ' |
|
106 | 106 | DELETE FROM term_count; |
107 | 107 | DELETE FROM term_frequency; |
108 | 108 | DELETE FROM inverse_document_frequency; |
@@ -110,82 +110,82 @@ discard block |
||
110 | 110 | UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'term_frequency\'; |
111 | 111 | UPDATE `sqlite_sequence` SET `seq`= 0 WHERE `name`=\'inverse_document_frequency\'; |
112 | 112 | '; |
113 | - $db->exec($sql); |
|
114 | - } |
|
115 | - |
|
116 | - /** |
|
117 | - * Calculates the inverse document frequency for each |
|
118 | - * term. This is a representation of how often a certain |
|
119 | - * term is used in comparison to all terms. |
|
120 | - */ |
|
121 | - public function createInverseDocumentFrequency() |
|
122 | - { |
|
123 | - $documentCount = $this->getTotalDocumentCount(); |
|
124 | - $inverseDocumentFrequency = new InverseDocumentFrequency($this->getSearchDbHandle(), $documentCount); |
|
125 | - $inverseDocumentFrequency->execute(); |
|
126 | - } |
|
127 | - |
|
128 | - /** |
|
129 | - * @return int|mixed |
|
130 | - */ |
|
131 | - private function getTotalDocumentCount() |
|
132 | - { |
|
133 | - return $this->storage->getDocuments()->getTotalDocumentCount(); |
|
134 | - } |
|
135 | - |
|
136 | - /** |
|
137 | - * Calculates the Term Field Length Norm. |
|
138 | - * This is an index determining how important a |
|
139 | - * term is, based on the total length of the field |
|
140 | - * it comes from. |
|
141 | - */ |
|
142 | - public function createTermFieldLengthNorm() |
|
143 | - { |
|
144 | - $termFieldLengthNorm = new TermFieldLengthNorm($this->getSearchDbHandle()); |
|
145 | - $termFieldLengthNorm->execute(); |
|
146 | - } |
|
147 | - |
|
148 | - /** |
|
149 | - * Stores the time the indexing started in memory |
|
150 | - */ |
|
151 | - private function startLogging() |
|
152 | - { |
|
153 | - $this->loggingStart = round(microtime(true) * 1000); |
|
154 | - $this->lastLog = $this->loggingStart; |
|
155 | - } |
|
156 | - |
|
157 | - /** |
|
158 | - * Adds a logline with the time since last log |
|
159 | - * @param $string |
|
160 | - */ |
|
161 | - private function addLog($string) |
|
162 | - { |
|
163 | - $currentTime = round(microtime(true) * 1000); |
|
164 | - $this->log .= date('d-m-Y H:i:s - ') . str_pad($string, 50, " ", STR_PAD_RIGHT) . "\t" . ($currentTime - $this->lastLog) . 'ms since last log. ' . "\t" . ($currentTime - $this->loggingStart) . 'ms since start.' . PHP_EOL; |
|
165 | - $this->lastLog = round(microtime(true) * 1000); |
|
166 | - } |
|
167 | - |
|
168 | - /** |
|
169 | - * Creates the SQLite \PDO object if it doesnt |
|
170 | - * exist and returns it. |
|
171 | - * @return \PDO |
|
172 | - */ |
|
173 | - protected function getSearchDbHandle() |
|
174 | - { |
|
175 | - if ($this->searchDbHandle === null) { |
|
176 | - $path = $this->storageDir . DIRECTORY_SEPARATOR; |
|
177 | - $this->searchDbHandle = new \PDO('sqlite:' . $path . self::SEARCH_TEMP_DB); |
|
178 | - } |
|
179 | - return $this->searchDbHandle; |
|
180 | - } |
|
181 | - |
|
182 | - /** |
|
183 | - * Replaces the old search index database with the new one. |
|
184 | - */ |
|
185 | - public function replaceOldIndex() |
|
186 | - { |
|
187 | - $this->searchDbHandle = null; |
|
188 | - $path = $this->storageDir . DIRECTORY_SEPARATOR; |
|
189 | - rename($path . self::SEARCH_TEMP_DB, $path . 'search.db'); |
|
190 | - } |
|
113 | + $db->exec($sql); |
|
114 | + } |
|
115 | + |
|
116 | + /** |
|
117 | + * Calculates the inverse document frequency for each |
|
118 | + * term. This is a representation of how often a certain |
|
119 | + * term is used in comparison to all terms. |
|
120 | + */ |
|
121 | + public function createInverseDocumentFrequency() |
|
122 | + { |
|
123 | + $documentCount = $this->getTotalDocumentCount(); |
|
124 | + $inverseDocumentFrequency = new InverseDocumentFrequency($this->getSearchDbHandle(), $documentCount); |
|
125 | + $inverseDocumentFrequency->execute(); |
|
126 | + } |
|
127 | + |
|
128 | + /** |
|
129 | + * @return int|mixed |
|
130 | + */ |
|
131 | + private function getTotalDocumentCount() |
|
132 | + { |
|
133 | + return $this->storage->getDocuments()->getTotalDocumentCount(); |
|
134 | + } |
|
135 | + |
|
136 | + /** |
|
137 | + * Calculates the Term Field Length Norm. |
|
138 | + * This is an index determining how important a |
|
139 | + * term is, based on the total length of the field |
|
140 | + * it comes from. |
|
141 | + */ |
|
142 | + public function createTermFieldLengthNorm() |
|
143 | + { |
|
144 | + $termFieldLengthNorm = new TermFieldLengthNorm($this->getSearchDbHandle()); |
|
145 | + $termFieldLengthNorm->execute(); |
|
146 | + } |
|
147 | + |
|
148 | + /** |
|
149 | + * Stores the time the indexing started in memory |
|
150 | + */ |
|
151 | + private function startLogging() |
|
152 | + { |
|
153 | + $this->loggingStart = round(microtime(true) * 1000); |
|
154 | + $this->lastLog = $this->loggingStart; |
|
155 | + } |
|
156 | + |
|
157 | + /** |
|
158 | + * Adds a logline with the time since last log |
|
159 | + * @param $string |
|
160 | + */ |
|
161 | + private function addLog($string) |
|
162 | + { |
|
163 | + $currentTime = round(microtime(true) * 1000); |
|
164 | + $this->log .= date('d-m-Y H:i:s - ') . str_pad($string, 50, " ", STR_PAD_RIGHT) . "\t" . ($currentTime - $this->lastLog) . 'ms since last log. ' . "\t" . ($currentTime - $this->loggingStart) . 'ms since start.' . PHP_EOL; |
|
165 | + $this->lastLog = round(microtime(true) * 1000); |
|
166 | + } |
|
167 | + |
|
168 | + /** |
|
169 | + * Creates the SQLite \PDO object if it doesnt |
|
170 | + * exist and returns it. |
|
171 | + * @return \PDO |
|
172 | + */ |
|
173 | + protected function getSearchDbHandle() |
|
174 | + { |
|
175 | + if ($this->searchDbHandle === null) { |
|
176 | + $path = $this->storageDir . DIRECTORY_SEPARATOR; |
|
177 | + $this->searchDbHandle = new \PDO('sqlite:' . $path . self::SEARCH_TEMP_DB); |
|
178 | + } |
|
179 | + return $this->searchDbHandle; |
|
180 | + } |
|
181 | + |
|
182 | + /** |
|
183 | + * Replaces the old search index database with the new one. |
|
184 | + */ |
|
185 | + public function replaceOldIndex() |
|
186 | + { |
|
187 | + $this->searchDbHandle = null; |
|
188 | + $path = $this->storageDir . DIRECTORY_SEPARATOR; |
|
189 | + rename($path . self::SEARCH_TEMP_DB, $path . 'search.db'); |
|
190 | + } |
|
191 | 191 | } |
192 | 192 | \ No newline at end of file |
@@ -13,43 +13,43 @@ |
||
13 | 13 | |
14 | 14 | class SearchResult |
15 | 15 | { |
16 | - /** |
|
17 | - * @var string |
|
18 | - */ |
|
19 | - public $documentPath; |
|
20 | - /** |
|
21 | - * @var array |
|
22 | - */ |
|
23 | - public $matchingTokens; |
|
24 | - /** |
|
25 | - * @var float |
|
26 | - */ |
|
27 | - public $score; |
|
28 | - |
|
29 | - protected $document; |
|
30 | - /** |
|
31 | - * @var Storage |
|
32 | - */ |
|
33 | - protected $storage; |
|
34 | - |
|
35 | - /** |
|
36 | - * @return Document |
|
37 | - */ |
|
38 | - public function getDocument() |
|
39 | - { |
|
40 | - if ($this->document instanceof Document) { |
|
41 | - return $this->document; |
|
42 | - } else { |
|
43 | - $this->document = $this->storage->getDocuments()->getDocumentBySlug(substr($this->documentPath, 1)); |
|
44 | - $this->document->dbHandle = $this->storage->getContentDbHandle(); |
|
45 | - $this->document->documentStorage = $this->storage->getRepository(); |
|
46 | - |
|
47 | - return $this->document; |
|
48 | - } |
|
49 | - } |
|
50 | - |
|
51 | - public function setStorage($storage) |
|
52 | - { |
|
53 | - $this->storage = $storage; |
|
54 | - } |
|
16 | + /** |
|
17 | + * @var string |
|
18 | + */ |
|
19 | + public $documentPath; |
|
20 | + /** |
|
21 | + * @var array |
|
22 | + */ |
|
23 | + public $matchingTokens; |
|
24 | + /** |
|
25 | + * @var float |
|
26 | + */ |
|
27 | + public $score; |
|
28 | + |
|
29 | + protected $document; |
|
30 | + /** |
|
31 | + * @var Storage |
|
32 | + */ |
|
33 | + protected $storage; |
|
34 | + |
|
35 | + /** |
|
36 | + * @return Document |
|
37 | + */ |
|
38 | + public function getDocument() |
|
39 | + { |
|
40 | + if ($this->document instanceof Document) { |
|
41 | + return $this->document; |
|
42 | + } else { |
|
43 | + $this->document = $this->storage->getDocuments()->getDocumentBySlug(substr($this->documentPath, 1)); |
|
44 | + $this->document->dbHandle = $this->storage->getContentDbHandle(); |
|
45 | + $this->document->documentStorage = $this->storage->getRepository(); |
|
46 | + |
|
47 | + return $this->document; |
|
48 | + } |
|
49 | + } |
|
50 | + |
|
51 | + public function setStorage($storage) |
|
52 | + { |
|
53 | + $this->storage = $storage; |
|
54 | + } |
|
55 | 55 | } |
56 | 56 | \ No newline at end of file |
@@ -10,7 +10,7 @@ |
||
10 | 10 | |
11 | 11 | class SearchSuggestion |
12 | 12 | { |
13 | - public $original; |
|
14 | - public $term; |
|
15 | - public $editDistance; |
|
13 | + public $original; |
|
14 | + public $term; |
|
15 | + public $editDistance; |
|
16 | 16 | } |
17 | 17 | \ No newline at end of file |