<?php
/**
 * @package midcom.services
 * @author The Midgard Project, http://www.midgard-project.org
 * @copyright The Midgard Project, http://www.midgard-project.org
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
 */

/**
 * This is a class geared at indexing attachments. It requires you to "assign" the
 * attachment to a topic, which is used as TOPIC_URL for permission purposes. The GUID
 * of the attachment's parent object is stored as the document's source, and the
 * attachment's own GUID is used as the document's resource identifier (RI).
 *
 * The documents type is "midcom_attachment", though it is *not* derived from midcom
 * for several reasons directly. They should be compatible though, in terms of usage.
 *
 * <b>Example Usage:</b>
 *
 * <code>
 * $document = new midcom_services_indexer_document_attachment($attachment);
 * $indexer->index($document);
 * </code>
 *
 * Where $attachment is the attachment to be indexed. The corresponding topic is
 * resolved by process_topic(), which the constructor calls after the attachment
 * itself has been processed. If this fails, you have to set the members $topic_guid,
 * $topic_url and $component manually.
 *
 * @package midcom.services
 * @see midcom_services_indexer
 */
class midcom_services_indexer_document_attachment extends midcom_services_indexer_document
{
    /**
     * Upper bound (in bytes) of content read from a file for indexing: 4 MB.
     */
    private const MAX_CONTENT_BYTES = 4194304;

    /**
     * The attachment this document is built from.
     *
     * Stays uninitialized when the constructor bails out because the attachment
     * is unreadable.
     */
    private midcom_db_attachment $attachment;

    /**
     * Create a new attachment document
     *
     * If the attachment cannot be opened for reading, an error is logged and the
     * constructor returns early: the parent constructor is not called and none of
     * the document fields are populated.
     *
     * @param midcom_db_attachment $attachment The attachment to index
     */
    public function __construct(midcom_db_attachment $attachment)
    {
        // Before doing anything else, verify that the attachment is readable,
        // otherwise we might get stuck in endless loops later on
        if (!$attachment->open('r')) {
            debug_add('Attachment ' . $attachment->guid . ' cannot be read, aborting. Last midgard error: ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
            return;
        }
        $attachment->close();

        parent::__construct();

        $this->_set_type('midcom_attachment');

        $this->attachment = $attachment;
        $this->source = $attachment->parentguid;
        $this->RI = $attachment->guid;
        $this->document_url = midcom::get()->permalinks->create_attachment_link($this->RI, $attachment->name);

        $this->process_attachment();
        $this->process_topic();
    }

    /**
     * Populate metadata, content, title and abstract from the attachment.
     *
     * The content extraction strategy is chosen by the attachment's mimetype;
     * anything that is neither a known document format nor of major type "text"
     * is treated as binary.
     */
    private function process_attachment() : void
    {
        $this->creator = new midcom_db_person($this->attachment->metadata->creator);
        $this->created = $this->attachment->metadata->created;
        // Attachments carry no separate revision data here, so editor/edited
        // mirror creator/created.
        $this->editor = $this->creator;
        $this->edited = $this->created;
        $this->author = $this->creator->name;
        $this->add_text('mimetype', $this->attachment->mimetype);
        $this->add_text('filename', $this->attachment->name);

        $mimetype = explode('/', (string) $this->attachment->mimetype);
        debug_print_r("Evaluating this Mime Type:", $mimetype);

        // A mimetype without a "/" yields a single-element array; fall back to an
        // empty subtype instead of reading an undefined index.
        $subtype = $mimetype[1] ?? '';

        switch ($subtype) {
            case 'html':
            case 'xml':
                $this->process_mime_html();
                break;

            case 'rtf':
            case 'richtext':
                $this->process_mime_richtext();
                break;

            case 'xml-dtd':
                $this->process_mime_plaintext();
                break;

            case 'pdf':
                $this->process_mime_pdf();
                break;

            case 'msword':
            case 'vnd.ms-word':
                $this->process_mime_word();
                break;

            default:
                if ($mimetype[0] === 'text') {
                    $this->process_mime_plaintext();
                } else {
                    $this->process_mime_binary();
                }
                break;
        }

        // Compare against '' rather than using empty(), so that a title of "0"
        // is still recognized as a title.
        if (trim((string) $this->attachment->title) !== '') {
            $this->title = "{$this->attachment->title} ({$this->attachment->name})";
            $this->content .= "\n{$this->attachment->title}\n{$this->attachment->name}";
        } else {
            $this->title = $this->attachment->name;
            $this->content .= "\n{$this->attachment->name}";
        }

        if (mb_strlen($this->content) > 200) {
            $this->abstract = mb_substr($this->content, 0, 200) . ' ...';
        } else {
            $this->abstract = $this->content;
        }
    }

    /**
     * Convert a Word attachment to plain text and index it.
     *
     * Falls back to binary handling if the catdoc utility is not configured.
     */
    private function process_mime_word() : void
    {
        $path = $this->check_utility('catdoc');
        if ($path === null) {
            return;
        }

        debug_add("Converting Word-Attachment to plain text");
        $wordfile = $this->attachment->get_path();
        $txtfile = "{$wordfile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'utf-8' : '8859-1';

        // File names are escaped so that spaces or shell metacharacters in the
        // path cannot break or alter the command.
        $command = $path . " -d{$encoding} -a " . escapeshellarg($wordfile) . ' > ' . escapeshellarg($txtfile);
        $this->process_command($command, $txtfile);
    }

    /**
     * Convert a PDF attachment to plain text and index it.
     *
     * Falls back to binary handling if the pdftotext utility is not configured.
     */
    private function process_mime_pdf() : void
    {
        $path = $this->check_utility('pdftotext');
        if ($path === null) {
            return;
        }

        debug_add("Converting PDF-Attachment to plain text");
        $pdffile = $this->attachment->get_path();
        $txtfile = "{$pdffile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'UTF-8' : 'Latin1';

        // File names are escaped so that spaces or shell metacharacters in the
        // path cannot break or alter the command.
        $command = $path . " -enc {$encoding} -nopgbrk -eol unix " . escapeshellarg($pdffile) . ' ' . escapeshellarg($txtfile) . ' 2>&1';
        $this->process_command($command, $txtfile);
    }

    /**
     * Convert an RTF attachment to plain text and index it.
     *
     * Falls back to binary handling if the unrtf utility is not configured.
     */
    private function process_mime_richtext() : void
    {
        $path = $this->check_utility('unrtf');
        if ($path === null) {
            return;
        }

        debug_add("Converting RTF-Attachment to plain text");
        $rtffile = $this->attachment->get_path();
        $txtfile = "{$rtffile}.txt";

        // Kill the first five lines, they are crap from the converter.
        // File names are escaped so that spaces or shell metacharacters in the
        // path cannot break or alter the command.
        $command = $path . ' --nopict --text ' . escapeshellarg($rtffile) . " | sed '1,5d' > " . escapeshellarg($txtfile);
        $this->process_command($command, $txtfile);
    }

    /**
     * Look up the configured path of an external conversion utility.
     *
     * The path is read from the config key "utility_<name>". If nothing is
     * configured, this is logged, the attachment is handled as binary and
     * null is returned.
     *
     * @param string $name Utility name, e.g. "catdoc"
     * @return string|null The configured path, or null if none is configured
     */
    private function check_utility(string $name) : ?string
    {
        $path = midcom::get()->config->get('utility_' . $name);
        if ($path) {
            return $path;
        }
        debug_add('Could not find ' . $name . ', indexing as binary.', MIDCOM_LOG_INFO);
        $this->process_mime_binary();
        return null;
    }

    /**
     * Run a conversion command and load its text output as document content.
     *
     * If the output file does not exist or cannot be opened afterwards, the
     * attachment is handled as binary. The output file is removed once it
     * has been processed.
     *
     * @param string $command The complete shell command to execute
     * @param string $txtfile Path of the text file the command is expected to write
     */
    private function process_command(string $command, string $txtfile) : void
    {
        $result = [];
        $returncode = 0;

        debug_add("Executing: {$command}");
        exec($command, $result, $returncode);
        debug_print_r("Execution returned {$returncode}: ", $result);

        if (!file_exists($txtfile)) {
            // We were unable to read the document into text
            $this->process_mime_binary();
            return;
        }

        $handle = fopen($txtfile, 'r');
        if ($handle === false) {
            debug_add("Could not open converter output {$txtfile}, indexing as binary.", MIDCOM_LOG_ERROR);
            unlink($txtfile);
            $this->process_mime_binary();
            return;
        }

        try {
            // Kill all ^L (FF) characters
            $this->content = str_replace("\x0C", '', $this->get_content($handle));
        } finally {
            // Always release the handle and remove the temporary file, even if
            // reading or charset conversion fails.
            fclose($handle);
            unlink($txtfile);
        }
    }

    /**
     * Simple plain-text driver, just copies the attachment.
     */
    private function process_mime_plaintext() : void
    {
        $this->content = $this->get_content();
    }

    /**
     * Processes HTML-style attachments (should therefore work with XML too),
     * strips tags and resolves entities.
     */
    private function process_mime_html() : void
    {
        $this->content = $this->html2text($this->get_content());
    }

    /**
     * Any binary file will have its name in the abstract unless no title
     * is defined, in which case the documents title already contains the file's
     * name.
     *
     * NOTE(review): every call to this method happens while process_attachment()
     * is choosing a content handler, and process_attachment() assigns the title
     * and unconditionally reassigns the abstract afterwards - so the value set
     * here looks like it is always overwritten. Confirm whether that is intended.
     */
    private function process_mime_binary() : void
    {
        if (trim((string) $this->title) !== '') {
            $this->abstract = $this->attachment->name;
        }
    }

    /**
     * Returns the first four megabytes of the File referenced by $handle.
     * The limit is in place to
     * avoid clashes with the PHP Memory limit, it should be enough for most text
     * based attachments anyway.
     *
     * If you omit $handle, a handle to the documents' attachment is created. If no
     * handle is specified, it is automatically closed after reading the data, otherwise
     * you have to close it yourselves afterwards.
     *
     * If the attachment cannot be opened or reading fails, an empty string is
     * returned.
     *
     * @param resource $handle A valid file-handle to read from, or null to automatically create a
     *     handle to the current attachment.
     * @return string The content that was read, converted to the current charset
     */
    private function get_content($handle = null) : string
    {
        $close = false;
        if ($handle === null) {
            $handle = $this->attachment->open('r');
            if (!$handle) {
                // Passing a failed open() result to fread() would raise a TypeError.
                debug_add('Attachment ' . $this->attachment->guid . ' cannot be read, indexing without content.', MIDCOM_LOG_ERROR);
                return '';
            }
            $close = true;
        }

        // Read a max of 4 MB
        $content = fread($handle, self::MAX_CONTENT_BYTES);

        if ($close) {
            $this->attachment->close();
        }

        if ($content === false) {
            return '';
        }
        return $this->_i18n->convert_to_current_charset($content);
    }
}