1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* @package midcom.services |
4
|
|
|
* @author The Midgard Project, http://www.midgard-project.org |
5
|
|
|
* @copyright The Midgard Project, http://www.midgard-project.org |
6
|
|
|
* @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License |
7
|
|
|
*/ |
8
|
|
|
|
9
|
|
|
/**
 * This is a class geared at indexing attachments. It requires you to "assign" the
 * attachment to a topic, which is used as TOPIC_URL for permission purposes. In addition
 * the GUID of the object the attachment belongs to is stored in the __SOURCE field
 * of the index.
 *
 * The document's type is "midcom_attachment", though it is *not* derived from midcom
 * for several reasons directly. They should be compatible though, in terms of usage.
 *
 * <b>Example Usage:</b>
 *
 * <code>
 * $document = new midcom_services_indexer_document_attachment($attachment);
 * $indexer->index($document);
 * </code>
 *
 * Where $attachment is the attachment to be indexed. The GUID of the object it is
 * attached to ($attachment->parentguid) is used as the document's source, and the
 * corresponding topic is looked up by the inherited process_topic() call. If this fails,
 * you have to set the members $topic_guid, $topic_url and $component manually.
 *
 * @package midcom.services
 * @see midcom_services_indexer
 */
class midcom_services_indexer_document_attachment extends midcom_services_indexer_document
{
    /**
     * The attachment this document is built from.
     */
    private midcom_db_attachment $attachment;

    /**
     * Create a new attachment document
     *
     * The attachment is opened once up front to make sure it is readable. If it is not,
     * an error is logged and the constructor returns early.
     *
     * NOTE(review): on that early return neither the parent constructor has run nor has
     * $attachment been assigned, so the resulting object is only partially initialized.
     * Callers apparently rely on the constructor not throwing, so this is kept as-is;
     * confirm whether an exception would be acceptable here.
     *
     * @param midcom_db_attachment $attachment The attachment to index
     */
    public function __construct(midcom_db_attachment $attachment)
    {
        //before doing anything else, verify that the attachment is readable, otherwise we might get stuck in endless loops later on
        if (!$attachment->open('r')) {
            debug_add('Attachment ' . $attachment->guid . ' cannot be read, aborting. Last midgard error: ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
            return;
        }
        $attachment->close();

        parent::__construct();

        $this->_set_type('midcom_attachment');

        $this->attachment = $attachment;
        $this->source = $attachment->parentguid;
        $this->RI = $attachment->guid;
        $this->document_url = midcom::get()->permalinks->create_attachment_link($this->RI, $attachment->name);

        $this->process_attachment();
        $this->process_topic();
    }

    /**
     * Populate the document from the attachment: metadata fields, content (via the
     * handler matching the MIME type), title and abstract.
     */
    private function process_attachment() : void
    {
        // Attachments carry no separate revision info here, so editor/edited mirror creator/created
        $this->creator = new midcom_db_person($this->attachment->metadata->creator);
        $this->created = $this->attachment->metadata->created;
        $this->editor = $this->creator;
        $this->edited = $this->created;
        $this->author = $this->creator->name;
        $this->add_text('mimetype', $this->attachment->mimetype);
        $this->add_text('filename', $this->attachment->name);

        $mimetype = explode("/", (string) $this->attachment->mimetype);
        debug_print_r("Evaluating this Mime Type:", $mimetype);

        // A malformed MIME type without "/" has no subtype; treat it as empty so that
        // we end up in the default branch instead of triggering an undefined-key warning
        $subtype = $mimetype[1] ?? '';

        switch ($subtype) {
            case 'html':
            case 'xml':
                $this->process_mime_html();
                break;

            case 'rtf':
            case 'richtext':
                $this->process_mime_richtext();
                break;

            case 'xml-dtd':
                $this->process_mime_plaintext();
                break;

            case 'pdf':
                $this->process_mime_pdf();
                break;

            case 'msword':
            case 'vnd.ms-word':
                $this->process_mime_word();
                break;

            default:
                if ($mimetype[0] === 'text') {
                    $this->process_mime_plaintext();
                } else {
                    $this->process_mime_binary();
                }
                break;
        }

        // Explicit comparison instead of empty(), so that a title of "0" still counts as a title
        if (trim((string) $this->attachment->title) !== '') {
            $this->title = "{$this->attachment->title} ({$this->attachment->name})";
            $this->content .= "\n{$this->attachment->title}\n{$this->attachment->name}";
        } else {
            $this->title = $this->attachment->name;
            $this->content .= "\n{$this->attachment->name}";
        }

        // The abstract is the first 200 characters of the content
        if (mb_strlen($this->content) > 200) {
            $this->abstract = mb_substr($this->content, 0, 200) . ' ...';
        } else {
            $this->abstract = $this->content;
        }
    }

    /**
     * Convert a Word attachment to plain text and index it.
     *
     * Uses the configured catdoc utility; if it is not configured, check_utility()
     * has already switched to binary handling.
     */
    private function process_mime_word() : void
    {
        if ($path = $this->check_utility('catdoc')) {
            debug_add("Converting Word-Attachment to plain text");
            $wordfile = $this->attachment->get_path();
            $txtfile = "{$wordfile}.txt";
            $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'utf-8' : '8859-1';

            // File names are escaped so that unusual characters in the path cannot alter the command
            $command = $path . " -d{$encoding} -a " . escapeshellarg($wordfile) . ' > ' . escapeshellarg($txtfile);
            $this->process_command($command, $txtfile);
        }
    }

    /**
     * Convert a PDF attachment to plain text and index it.
     *
     * Uses the configured pdftotext utility; if it is not configured, check_utility()
     * has already switched to binary handling.
     */
    private function process_mime_pdf() : void
    {
        if ($path = $this->check_utility('pdftotext')) {
            debug_add("Converting PDF-Attachment to plain text");
            $pdffile = $this->attachment->get_path();
            $txtfile = "{$pdffile}.txt";
            $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'UTF-8' : 'Latin1';

            // File names are escaped so that unusual characters in the path cannot alter the command
            $command = $path . " -enc {$encoding} -nopgbrk -eol unix " . escapeshellarg($pdffile) . ' ' . escapeshellarg($txtfile) . ' 2>&1';
            $this->process_command($command, $txtfile);
        }
    }

    /**
     * Convert an RTF attachment to plain text and index it.
     *
     * Uses the configured unrtf utility; if it is not configured, check_utility()
     * has already switched to binary handling.
     */
    private function process_mime_richtext() : void
    {
        if ($path = $this->check_utility('unrtf')) {
            debug_add("Converting RTF-Attachment to plain text");
            $rtffile = $this->attachment->get_path();
            $txtfile = "{$rtffile}.txt";

            // Kill the first five lines, they are crap from the converter.
            // File names are escaped so that unusual characters in the path cannot alter the command
            $command = $path . ' --nopict --text ' . escapeshellarg($rtffile) . " | sed '1,5d' > " . escapeshellarg($txtfile);
            $this->process_command($command, $txtfile);
        }
    }

    /**
     * Look up the configured path of a conversion utility.
     *
     * Reads the config key "utility_<name>". If nothing is configured, the attachment
     * is handled as binary right away and null is returned.
     *
     * @param string $name Utility name, appended to "utility_" to form the config key
     * @return string|null The configured value, or null if the utility is unavailable
     */
    private function check_utility(string $name) : ?string
    {
        if ($path = midcom::get()->config->get('utility_' . $name)) {
            return $path;
        }
        debug_add('Could not find ' . $name . ', indexing as binary.', MIDCOM_LOG_INFO);
        $this->process_mime_binary();
        return null;
    }

    /**
     * Run a conversion command and load the text file it produced into the content.
     *
     * If the text file is missing or cannot be opened afterwards, the attachment is
     * handled as binary. The text file is removed once it has been processed.
     *
     * @param string $command The complete shell command to execute
     * @param string $txtfile Path of the text file the command is expected to write
     */
    private function process_command(string $command, string $txtfile) : void
    {
        debug_add("Executing: {$command}");
        exec($command, $result, $returncode);
        debug_print_r("Execution returned {$returncode}: ", $result);

        if (!file_exists($txtfile)) {
            // We were unable to read the document into text
            $this->process_mime_binary();
            return;
        }

        $handle = fopen($txtfile, "r");
        if ($handle === false) {
            // The converter output exists but is unreadable: clean up and fall back
            debug_add("Could not open {$txtfile} for reading, indexing as binary.", MIDCOM_LOG_ERROR);
            unlink($txtfile);
            $this->process_mime_binary();
            return;
        }

        try {
            // Kill all ^L (FF) characters
            $this->content = str_replace("\x0C", '', $this->get_content($handle));
        } finally {
            // Never leave the handle open or the temporary file behind
            fclose($handle);
            unlink($txtfile);
        }
    }

    /**
     * Simple plain-text driver, just copies the attachment.
     */
    private function process_mime_plaintext() : void
    {
        $this->content = $this->get_content();
    }

    /**
     * Processes HTML-style attachments (should therefore work with XML too),
     * strips tags and resolves entities.
     */
    private function process_mime_html() : void
    {
        $this->content = $this->html2text($this->get_content());
    }

    /**
     * Any binary file will have its name in the abstract unless no title
     * is defined, in which case the document's title already contains the file's
     * name.
     *
     * NOTE(review): process_attachment() calls this before it assigns the title and
     * reassigns the abstract afterwards, so on that path the value set here does not
     * survive. Behavior is left untouched; confirm whether that is intended.
     */
    private function process_mime_binary() : void
    {
        if (trim((string) $this->title) !== '') {
            $this->abstract = $this->attachment->name;
        }
    }

    /**
     * Returns the first four megabytes of the File referenced by $handle.
     * The limit is in place to
     * avoid clashes with the PHP Memory limit, it should be enough for most text
     * based attachments anyway.
     *
     * If you omit $handle, a handle to the document's attachment is created. If no
     * handle is specified, it is automatically closed after reading the data, otherwise
     * you have to close it yourself afterwards.
     *
     * An empty string is returned if the attachment cannot be opened or reading fails.
     *
     * @param resource $handle A valid file-handle to read from, or null to automatically create a
     *     handle to the current attachment.
     * @return string The data read, converted to the current charset
     */
    private function get_content($handle = null) : string
    {
        // Read a max of 4 MB
        $max = 4194304;
        $close = false;
        if ($handle === null) {
            $handle = $this->attachment->open('r');
            if (!$handle) {
                debug_add('Attachment ' . $this->attachment->guid . ' cannot be read, no content indexed.', MIDCOM_LOG_ERROR);
                return '';
            }
            $close = true;
        }
        $content = fread($handle, $max);

        if ($close) {
            $this->attachment->close();
        }
        if ($content === false) {
            // Reading failed, there is nothing to convert
            return '';
        }
        return $this->_i18n->convert_to_current_charset($content);
    }
}
259
|
|
|
|