Passed
Push — master ( cbe674...dbb482 )
by Andreas
18:37
created

get_content()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 14
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 9
c 1
b 0
f 0
nc 4
nop 1
dl 0
loc 14
ccs 0
cts 13
cp 0
crap 12
rs 9.9666
1
<?php
2
/**
3
 * @package midcom.services
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
/**
10
 * This is a class geared at indexing attachments. It requires you to "assign" the
11
 * attachment to a topic, which is used as TOPIC_URL for permission purposes. In addition
12
 * you may set another MidgardObject as source object, its GUID is stored in the
13
 * __SOURCE field of the index.
14
 *
15
 * The document's type is "midcom_attachment", though it is *not* derived from midcom
16
 * for several reasons directly. They should be compatible though, in terms of usage.
17
 *
18
 * <b>Example Usage:</b>
19
 *
20
 * <code>
21
 * $document = new midcom_services_indexer_document_attachment($attachment, $object);
22
 * $indexer->index($document);
23
 * </code>
24
 *
25
 * Where $attachment is the attachment to be indexed and $object is the object the attachment
26
 * is associated with. The corresponding topic will be detected using the object's GUID
27
 * through NAP. If this fails, you have to set the members $topic_guid, $topic_url and
28
 * $component manually.
29
 *
30
 * @todo More DBA stuff: use DBA classes, which allow you to implicitly load the parent
31
 *     object using get_parent.
32
 *
33
 * @package midcom.services
34
 * @see midcom_services_indexer
35
 * @see midcom_helper_metadata
36
 */
37
class midcom_services_indexer_document_attachment extends midcom_services_indexer_document
{
    /**
     * The attachment being indexed. It is only assigned after the constructor
     * has verified that the attachment can be opened for reading.
     *
     * @var midcom_db_attachment
     */
    private $attachment;

    /**
     * Create a new attachment document
     *
     * The attachment is opened and closed once up front as a readability check.
     * If that check fails, the error is logged and the constructor returns
     * before the parent constructor runs and before any document field is set.
     *
     * NOTE(review): callers get no signal about that early return other than the
     * log entry; confirm whether they guard against such half-initialised documents.
     *
     * @param midcom_db_attachment $attachment The Attachment to index.
     */
    public function __construct(midcom_db_attachment $attachment)
    {
        //before doing anything else, verify that the attachment is readable, otherwise we might get stuck in endless loops later on
        if (!$attachment->open('r')) {
            debug_add('Attachment ' . $attachment->guid . ' cannot be read, aborting. Last midgard error: ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
            return;
        }
        $attachment->close();

        parent::__construct();

        $this->_set_type('midcom_attachment');

        $this->attachment = $attachment;

        debug_print_r("Processing this attachment:", $attachment);

        $this->source = $attachment->parentguid;
        $this->RI = $attachment->guid;
        $this->document_url = midcom::get()->permalinks->create_attachment_link($this->RI, $attachment->name);

        $this->process_attachment();
        $this->process_topic();
    }

    /**
     * Fill the document fields from the attachment: creator/editor metadata,
     * mimetype and filename text fields, the content (via the MIME-specific
     * driver), the title and finally the abstract.
     */
    private function process_attachment()
    {
        $this->creator = new midcom_db_person($this->attachment->metadata->creator);
        $this->created = $this->attachment->metadata->created;
        // Attachments carry no separate revision data here, so editor/edited mirror creator/created.
        $this->editor = $this->creator;
        $this->edited = $this->created;
        $this->author = $this->creator->name;
        $this->add_text('mimetype', $this->attachment->mimetype);
        $this->add_text('filename', $this->attachment->name);

        $mimetype = explode("/", (string) $this->attachment->mimetype);
        debug_print_r("Evaluating this Mime Type:", $mimetype);

        // A MIME type without a "/" has no subtype; fall through to the default branch
        // instead of reading an undefined array key.
        $subtype = $mimetype[1] ?? '';

        switch ($subtype) {
            case 'html':
            case 'xml':
                $this->process_mime_html();
                break;

            case 'rtf':
            case 'richtext':
                $this->process_mime_richtext();
                break;

            case 'xml-dtd':
                $this->process_mime_plaintext();
                break;

            case 'pdf':
                $this->process_mime_pdf();
                break;

            case 'msword':
            case 'vnd.ms-word':
                $this->process_mime_word();
                break;

            default:
                if ($mimetype[0] === 'text') {
                    $this->process_mime_plaintext();
                } else {
                    $this->process_mime_binary();
                }
                break;
        }

        // Compare against '' rather than using empty(), so that a title of "0" is kept.
        if (trim((string) $this->attachment->title) !== '') {
            $this->title = "{$this->attachment->title} ({$this->attachment->name})";
            $this->content .= "\n{$this->attachment->title}\n{$this->attachment->name}";
        } else {
            $this->title = $this->attachment->name;
            $this->content .= "\n{$this->attachment->name}";
        }

        // The abstract is the first 200 characters of the content.
        if (mb_strlen($this->content) > 200) {
            $this->abstract = mb_substr($this->content, 0, 200) . ' ...';
        } else {
            $this->abstract = $this->content;
        }
    }

    /**
     * Convert a Word attachment to plain text and index it.
     *
     * Falls back to binary indexing when the 'utility_catdoc' configuration
     * value is not set.
     */
    private function process_mime_word()
    {
        $catdoc = midcom::get()->config->get('utility_catdoc');
        if (!$catdoc) {
            debug_add('Could not find catdoc, indexing as binary.', MIDCOM_LOG_INFO);
            $this->process_mime_binary();
            return;
        }

        debug_add("Converting Word-Attachment to plain text");
        $wordfile = $this->attachment->get_path();
        $txtfile = "{$wordfile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'utf-8' : '8859-1';

        // Paths are quoted so that whitespace or shell metacharacters cannot alter the command.
        $command = $catdoc . " -d{$encoding} -a " . escapeshellarg($wordfile) . ' > ' . escapeshellarg($txtfile);
        $this->process_command($command, $txtfile);
    }

    /**
     * Convert a PDF attachment to plain text and index it.
     *
     * Falls back to binary indexing when the 'utility_pdftotext' configuration
     * value is not set.
     */
    private function process_mime_pdf()
    {
        $pdftotext = midcom::get()->config->get('utility_pdftotext');
        if (!$pdftotext) {
            debug_add('Could not find pdftotext, indexing as binary.', MIDCOM_LOG_INFO);
            $this->process_mime_binary();
            return;
        }

        debug_add("Converting PDF-Attachment to plain text");
        $pdffile = $this->attachment->get_path();
        $txtfile = "{$pdffile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'UTF-8' : 'Latin1';

        // Paths are quoted so that whitespace or shell metacharacters cannot alter the command.
        $command = $pdftotext . " -enc {$encoding} -nopgbrk -eol unix " . escapeshellarg($pdffile) . ' ' . escapeshellarg($txtfile) . ' 2>&1';
        $this->process_command($command, $txtfile);
    }

    /**
     * Convert an RTF attachment to plain text and index it.
     *
     * Falls back to binary indexing when the 'utility_unrtf' configuration
     * value is not set.
     */
    private function process_mime_richtext()
    {
        $unrtf = midcom::get()->config->get('utility_unrtf');
        if (!$unrtf) {
            debug_add('Could not find unrtf, indexing as binary.', MIDCOM_LOG_INFO);
            $this->process_mime_binary();
            return;
        }

        debug_add("Converting RTF-Attachment to plain text");
        $rtffile = $this->attachment->get_path();
        $txtfile = "{$rtffile}.txt";

        // Kill the first five lines, they are crap from the converter.
        // Paths are quoted so that whitespace or shell metacharacters cannot alter the command.
        $command = $unrtf . ' --nopict --text ' . escapeshellarg($rtffile) . " | sed '1,5d' > " . escapeshellarg($txtfile);

        $this->process_command($command, $txtfile);
    }

    /**
     * Run a converter command and load the text file it produced as content.
     *
     * If the output file does not exist or cannot be opened, the attachment is
     * indexed as binary instead. The output file is removed once it has been
     * handled, even when reading it fails.
     *
     * @param string $command The complete shell command line to execute.
     * @param string $txtfile Path of the text file the command is expected to write.
     */
    private function process_command(string $command, string $txtfile)
    {
        debug_add("Executing: {$command}");
        $output = [];
        exec($command, $output, $returncode);
        debug_print_r("Execution returned {$returncode}: ", $output);

        if (!file_exists($txtfile)) {
            // We were unable to read the document into text
            $this->process_mime_binary();
            return;
        }

        $handle = fopen($txtfile, "r");
        if ($handle === false) {
            debug_add("Could not open converter output {$txtfile}, indexing as binary.", MIDCOM_LOG_ERROR);
            unlink($txtfile);
            $this->process_mime_binary();
            return;
        }

        try {
            $text = $this->get_content($handle);
        } finally {
            // Never leave the handle open or the temporary file behind.
            fclose($handle);
            unlink($txtfile);
        }

        // Kill all ^L (FF) characters
        $this->content = str_replace("\x0C", '', $text);
    }

    /**
     * Simple plain-text driver, just copies the attachment.
     */
    private function process_mime_plaintext()
    {
        $this->content = $this->get_content();
    }

    /**
     * Processes HTML-style attachments (should therefore work with XML too),
     * strips tags and resolves entities.
     */
    private function process_mime_html()
    {
        $this->content = $this->html2text($this->get_content());
    }

    /**
     * Binary driver: no content is extracted. If the document title is
     * non-empty, the attachment's file name is used as abstract.
     *
     * NOTE(review): process_attachment() invokes the MIME drivers before it
     * assigns $this->title and unconditionally rewrites $this->abstract
     * afterwards, so an abstract set here does not survive that call path.
     * Confirm whether the check was meant to look at the attachment's title.
     */
    private function process_mime_binary()
    {
        if (!empty(trim($this->title))) {
            $this->abstract = $this->attachment->name;
        }
    }

    /**
     * Returns at most the first four megabytes of the file referenced by $handle,
     * converted to the current charset. The limit is in place to avoid clashes
     * with the PHP memory limit; it should be enough for most text based
     * attachments anyway.
     *
     * If $handle is omitted, a handle to the document's attachment is opened and
     * closed again after reading. A handle passed in by the caller is left open;
     * the caller has to close it.
     *
     * An empty string is returned when no usable handle is available or when
     * reading fails.
     *
     * @param resource $handle A valid file-handle to read from, or null to automatically create a
     *        handle to the current attachment.
     * @return string The content that was read, in the current charset.
     */
    private function get_content($handle = null) : string
    {
        // Read a max of 4 MB
        $max = 4194304;
        $close = false;
        if ($handle === null) {
            $handle = $this->attachment->open('r');
            $close = true;
        }

        if (!is_resource($handle)) {
            debug_add('No readable handle available for attachment ' . $this->attachment->guid . ', returning empty content.', MIDCOM_LOG_ERROR);
            return '';
        }

        $content = fread($handle, $max);
        if ($close) {
            $this->attachment->close();
        }

        if ($content === false) {
            // fread() signals failure with false, which must not reach the charset conversion.
            return '';
        }
        return $this->_i18n->convert_to_current_charset($content);
    }
}
271