Passed
Push — master ( a6b8d2...cbe674 )
by Andreas
18:08
created

midcom_services_indexer_document_attachment::write_attachment_tmpfile()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 11
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 8
nc 1
nop 0
dl 0
loc 11
ccs 0
cts 10
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * @package midcom.services
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
/**
10
 * This is a class geared at indexing attachments. It requires you to "assign" the
11
 * attachment to a topic, which is used as TOPIC_URL for permission purposes. In addition
12
 * you may set another MidgardObject as source object, its GUID is stored in the
13
 * __SOURCE field of the index.
14
 *
15
 * The document's type is "midcom_attachment", though it is *not* derived from midcom
16
 * for several reasons directly. They should be compatible though, in terms of usage.
17
 *
18
 * <b>Example Usage:</b>
19
 *
20
 * <code>
21
 * $document = new midcom_services_indexer_document_attachment($attachment, $object);
22
 * $indexer->index($document);
23
 * </code>
24
 *
25
 * Where $attachment is the attachment to be indexed and $object is the object the attachment
26
 * is associated with. The corresponding topic will be detected using the object's GUID
27
 * through NAP. If this fails, you have to set the members $topic_guid, $topic_url and
28
 * $component manually.
29
 *
30
 * @todo More DBA stuff: use DBA classes, which allow you to implicitly load the parent
31
 *     object using get_parent.
32
 *
33
 * @package midcom.services
34
 * @see midcom_services_indexer
35
 * @see midcom_helper_metadata
36
 */
37
class midcom_services_indexer_document_attachment extends midcom_services_indexer_document
38
{
39
    private $attachment;
40
41
    /**
     * Build an indexer document for the given attachment.
     *
     * The attachment is probed for readability first. If it cannot be opened,
     * the failure is logged and construction stops early, without running the
     * parent constructor or any of the processing steps.
     *
     * @param midcom_db_attachment $attachment The Attachment to index.
     */
    public function __construct(midcom_db_attachment $attachment)
    {
        // Probe readability up front: an unreadable attachment might otherwise
        // get us stuck in endless loops later on.
        $readable = $attachment->open('r');
        if (!$readable) {
            $message = 'Attachment ' . $attachment->guid . ' cannot be read, aborting. Last midgard error: ' . midcom_connection::get_error_string();
            debug_add($message, MIDCOM_LOG_ERROR);
            return;
        }
        // The probe handle is not needed any further.
        $attachment->close();

        parent::__construct();
        $this->_set_type('midcom_attachment');
        $this->attachment = $attachment;

        debug_print_r("Processing this attachment:", $attachment);

        // The parent object's GUID is the source, the attachment's own GUID the resource identifier.
        $this->source = $attachment->parentguid;
        $this->RI = $attachment->guid;
        $permalinks = midcom::get()->permalinks;
        $this->document_url = $permalinks->create_attachment_link($this->RI, $attachment->name);

        $this->process_attachment();
        $this->process_topic();
    }
70
71
    private function process_attachment()
72
    {
73
        $this->creator = new midcom_db_person($this->attachment->metadata->creator);
74
        $this->created = $this->attachment->metadata->created;
75
        $this->editor = $this->creator;
76
        $this->edited = $this->created;
77
        $this->author = $this->creator->name;
78
        $this->add_text('mimetype', $this->attachment->mimetype);
79
        $this->add_text('filename', $this->attachment->name);
80
81
        $mimetype = explode("/", $this->attachment->mimetype);
82
        debug_print_r("Evaluating this Mime Type:", $mimetype);
83
84
        switch ($mimetype[1]) {
85
            case 'html':
86
            case 'xml':
87
                $this->process_mime_html();
88
                break;
89
90
            case 'rtf':
91
            case 'richtext':
92
                $this->process_mime_richtext();
93
                break;
94
95
            case 'xml-dtd':
96
                $this->process_mime_plaintext();
97
                break;
98
99
            case 'pdf':
100
                $this->process_mime_pdf();
101
                break;
102
103
            case 'msword':
104
            case 'vnd.ms-word':
105
                $this->process_mime_word();
106
                break;
107
108
            default:
109
                if ($mimetype[0] === 'text') {
110
                    $this->process_mime_plaintext();
111
                } else {
112
                    $this->process_mime_binary();
113
                }
114
                break;
115
        }
116
117
        if (!empty(trim($this->attachment->title))) {
118
            $this->title = "{$this->attachment->title} ({$this->attachment->name})";
119
            $this->content .= "\n{$this->attachment->title}\n{$this->attachment->name}";
120
        } else {
121
            $this->title = $this->attachment->name;
122
            $this->content .= "\n{$this->attachment->name}";
123
        }
124
125
        if (mb_strlen($this->content) > 200) {
126
            $this->abstract = mb_substr($this->content, 0, 200) . ' ...';
127
        } else {
128
            $this->abstract = $this->content;
129
        }
130
    }
131
132
    /**
     * Convert a Word attachment to plain text and index it.
     *
     * The conversion is delegated to the configured catdoc utility, which
     * writes into a temporary "<attachment path>.txt" file. That file is
     * removed again once it has been read. When catdoc is not configured, or
     * the text file is missing or cannot be opened, the attachment is indexed
     * as binary instead.
     */
    private function process_mime_word()
    {
        $catdoc = midcom::get()->config->get('utility_catdoc');
        if (!$catdoc) {
            debug_add('Could not find catdoc, indexing as binary.', MIDCOM_LOG_INFO);
            $this->process_mime_binary();
            return;
        }

        debug_add("Converting Word-Attachment to plain text");
        $wordfile = $this->attachment->get_path();
        $txtfile = "{$wordfile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) == 'UTF-8') ? 'utf-8' : '8859-1';

        // Both paths are quoted so that whitespace or shell metacharacters in
        // them cannot break or alter the command line.
        $command = $catdoc . " -d{$encoding} -a " . escapeshellarg($wordfile) . ' > ' . escapeshellarg($txtfile);
        debug_add("Executing: {$command}");
        exec($command, $result, $returncode);
        debug_print_r("Execution returned {$returncode}: ", $result);

        if (!file_exists($txtfile)) {
            // We were unable to read the document into text
            $this->process_mime_binary();
            return;
        }

        $handle = fopen($txtfile, "r");
        if ($handle === false) {
            // The converted file exists but is not readable: clean up and fall back.
            debug_add("Could not open {$txtfile} for reading, indexing as binary.", MIDCOM_LOG_INFO);
            unlink($txtfile);
            $this->process_mime_binary();
            return;
        }

        $this->content = $this->get_attachment_content($handle);
        // Kill all ^L (FF) characters
        $this->content = str_replace("\x0C", '', $this->content);
        fclose($handle);

        unlink($txtfile);
    }
167
168
    /**
     * Convert a PDF attachment to plain text and index it.
     *
     * The conversion is delegated to the configured pdftotext utility, which
     * writes into a temporary "<attachment path>.txt" file. That file is
     * removed again once it has been read. When pdftotext is not configured,
     * or the text file is missing or cannot be opened, the attachment is
     * indexed as binary instead.
     */
    private function process_mime_pdf()
    {
        $pdftotext = midcom::get()->config->get('utility_pdftotext');
        if (!$pdftotext) {
            debug_add('Could not find pdftotext, indexing as binary.', MIDCOM_LOG_INFO);
            $this->process_mime_binary();
            return;
        }

        debug_add("Converting PDF-Attachment to plain text");
        $pdffile = $this->attachment->get_path();
        $txtfile = "{$pdffile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) == 'UTF-8') ? 'UTF-8' : 'Latin1';

        // Both paths are quoted so that whitespace or shell metacharacters in
        // them cannot break or alter the command line.
        $command = $pdftotext . " -enc {$encoding} -nopgbrk -eol unix " . escapeshellarg($pdffile) . ' ' . escapeshellarg($txtfile) . ' 2>&1';
        debug_add("Executing: {$command}");
        exec($command, $result, $returncode);
        debug_print_r("Execution returned {$returncode}: ", $result);

        if (!file_exists($txtfile)) {
            // We were unable to read the document into text
            $this->process_mime_binary();
            return;
        }

        $handle = fopen($txtfile, 'r');
        if ($handle === false) {
            // The converted file exists but is not readable: clean up and fall back.
            debug_add("Could not open {$txtfile} for reading, indexing as binary.", MIDCOM_LOG_INFO);
            unlink($txtfile);
            $this->process_mime_binary();
            return;
        }

        $this->content = $this->get_attachment_content($handle);
        fclose($handle);

        unlink($txtfile);
    }
201
202
    /**
     * Convert an RTF attachment to plain text and index it.
     *
     * The conversion is delegated to the configured unrtf utility; its output
     * is piped through sed into a temporary "<attachment path>.txt" file, which
     * is removed again once it has been read. The text is converted to the
     * current charset before being stored. When unrtf is not configured, or
     * the text file is missing or cannot be opened, the attachment is indexed
     * as binary instead.
     */
    private function process_mime_richtext()
    {
        $unrtf = midcom::get()->config->get('utility_unrtf');
        if (!$unrtf) {
            debug_add('Could not find unrtf, indexing as binary.', MIDCOM_LOG_INFO);
            $this->process_mime_binary();
            return;
        }

        debug_add("Converting RTF-Attachment to plain text");
        $rtffile = $this->attachment->get_path();
        $txtfile = "{$rtffile}.txt";

        // Kill the first five lines, they are crap from the converter.
        // Both paths are quoted so that whitespace or shell metacharacters in
        // them cannot break or alter the command line.
        $command = $unrtf . ' --nopict --text ' . escapeshellarg($rtffile) . " | sed '1,5d' > " . escapeshellarg($txtfile);
        debug_add("Executing: {$command}");
        exec($command, $result, $returncode);
        debug_print_r("Execution returned {$returncode}: ", $result);

        if (!file_exists($txtfile)) {
            // We were unable to read the document into text
            $this->process_mime_binary();
            return;
        }

        $handle = fopen($txtfile, 'r');
        if ($handle === false) {
            // The converted file exists but is not readable: clean up and fall back.
            debug_add("Could not open {$txtfile} for reading, indexing as binary.", MIDCOM_LOG_INFO);
            unlink($txtfile);
            $this->process_mime_binary();
            return;
        }

        $this->content = $this->_i18n->convert_to_current_charset($this->get_attachment_content($handle));
        fclose($handle);

        unlink($txtfile);
    }
235
236
    /**
     * Plain-text driver: the attachment's data is taken as-is and only
     * converted to the current charset.
     */
    private function process_mime_plaintext()
    {
        $raw = $this->get_attachment_content();
        $this->content = $this->_i18n->convert_to_current_charset($raw);
    }
243
244
    /**
     * Driver for HTML-style attachments (also used for XML): the attachment's
     * data is passed through html2text() and the result is converted to the
     * current charset.
     */
    private function process_mime_html()
    {
        $markup = $this->get_attachment_content();
        $text = $this->html2text($markup);
        $this->content = $this->_i18n->convert_to_current_charset($text);
    }
252
253
    /**
     * Binary driver: no content is extracted from the file.
     *
     * If the document has a non-empty title, the attachment's file name is
     * used as the abstract. Without a title nothing is changed, since the
     * document's title then already contains the file's name.
     */
    private function process_mime_binary()
    {
        $title = trim($this->title);
        if (empty($title)) {
            return;
        }

        $this->abstract = $this->attachment->name;
    }
264
265
    /**
     * Returns up to the first four megabytes of the file referenced by $handle.
     * The limit is in place to avoid clashes with the PHP memory limit, it
     * should be enough for most text based attachments anyway.
     *
     * If you omit $handle, a handle to the document's attachment is opened and
     * closed again after reading. A handle you pass in yourself is left open;
     * you have to close it afterwards.
     *
     * @param resource $handle A valid file-handle to read from, or null to automatically create a
     *        handle to the current attachment.
     * @return string The data read, or an empty string if no usable handle was
     *         available or the read failed.
     */
    private function get_attachment_content($handle = null)
    {
        debug_add("Returning File content of handle {$handle}");

        // Read a max of 4 MB
        $max = 4194304;

        // Only a handle opened here is ours to close.
        $close = ($handle === null);
        if ($close) {
            $handle = $this->attachment->open('r');
        }

        if (!$handle) {
            // Either the attachment could not be opened or the caller passed a
            // failed handle; there is nothing to read from.
            debug_add('No readable handle available, returning empty content.', MIDCOM_LOG_ERROR);
            return '';
        }

        $content = fread($handle, $max);
        if ($close) {
            $this->attachment->close();
        }

        // fread() signals failure with false; callers expect string content.
        return ($content === false) ? '' : $content;
    }
294
}
295