process_mime_html()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
ccs 0
cts 2
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * @package midcom.services
4
 * @author The Midgard Project, http://www.midgard-project.org
5
 * @copyright The Midgard Project, http://www.midgard-project.org
6
 * @license http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
7
 */
8
9
/**
 * This is a class geared at indexing attachments. The attachment's parent object
 * GUID ($attachment->parentguid) is stored as the source of the document, and the
 * attachment is "assigned" to a topic, which is used as TOPIC_URL for permission
 * purposes.
 *
 * The document's type is "midcom_attachment", though it is *not* derived from midcom
 * for several reasons directly. They should be compatible though, in terms of usage.
 *
 * <b>Example Usage:</b>
 *
 * <code>
 * $document = new midcom_services_indexer_document_attachment($attachment);
 * $indexer->index($document);
 * </code>
 *
 * Where $attachment is the attachment to be indexed. The corresponding topic is
 * resolved by the inherited process_topic() call (per the original documentation:
 * detected through NAP using the GUID of the object the attachment is associated
 * with). If this fails, you have to set the members $topic_guid, $topic_url and
 * $component manually.
 *
 * Text extraction depends on the MIME subtype: HTML/XML is stripped of tags, plain
 * text is copied, and Word/PDF/RTF files are converted with the external utilities
 * configured as 'utility_catdoc', 'utility_pdftotext' and 'utility_unrtf'. Everything
 * else is indexed by name/title only.
 *
 * @package midcom.services
 * @see midcom_services_indexer
 */
class midcom_services_indexer_document_attachment extends midcom_services_indexer_document
{
    /**
     * Maximum number of bytes read from any one file (4 MB). The limit is in place
     * to avoid clashes with the PHP memory limit.
     */
    private const MAX_CONTENT_BYTES = 4194304;

    private midcom_db_attachment $attachment;

    /**
     * Create a new attachment document
     *
     * NOTE(review): if the attachment cannot be opened for reading, the error is
     * logged and the constructor returns *before* parent::__construct() is called
     * and before $this->attachment is assigned. Callers receive a half-initialised
     * object in that case; confirm that they cope with this.
     *
     * @param midcom_db_attachment $attachment The attachment to index
     */
    public function __construct(midcom_db_attachment $attachment)
    {
        //before doing anything else, verify that the attachment is readable, otherwise we might get stuck in endless loops later on
        if (!$attachment->open('r')) {
            debug_add('Attachment ' . $attachment->guid . ' cannot be read, aborting. Last midgard error: ' . midcom_connection::get_error_string(), MIDCOM_LOG_ERROR);
            return;
        }
        $attachment->close();

        parent::__construct();

        $this->_set_type('midcom_attachment');

        $this->attachment = $attachment;
        $this->source = $attachment->parentguid;
        $this->RI = $attachment->guid;
        $this->document_url = midcom::get()->permalinks->create_attachment_link($this->RI, $attachment->name);

        $this->process_attachment();
        $this->process_topic();
    }

    /**
     * Populate metadata, content, title and abstract from the attachment.
     *
     * The content extractor is chosen by MIME subtype (the part after the first
     * "/"); unknown subtypes fall back to plain text for "text/*" and to binary
     * handling for everything else.
     */
    private function process_attachment() : void
    {
        $this->creator = new midcom_db_person($this->attachment->metadata->creator);
        $this->created = $this->attachment->metadata->created;
        // Only creation metadata is read here, so editor/edited mirror creator/created
        $this->editor = $this->creator;
        $this->edited = $this->created;
        $this->author = $this->creator->name;
        $this->add_text('mimetype', $this->attachment->mimetype);
        $this->add_text('filename', $this->attachment->name);

        $mimetype = explode("/", (string) $this->attachment->mimetype);
        debug_print_r("Evaluating this Mime Type:", $mimetype);

        // A MIME type without "/" yields a single-element array; avoid the
        // undefined-index warning and let the default branch handle it.
        $subtype = $mimetype[1] ?? '';

        switch ($subtype) {
            case 'html':
            case 'xml':
                $this->process_mime_html();
                break;

            case 'rtf':
            case 'richtext':
                $this->process_mime_richtext();
                break;

            case 'xml-dtd':
                $this->process_mime_plaintext();
                break;

            case 'pdf':
                $this->process_mime_pdf();
                break;

            case 'msword':
            case 'vnd.ms-word':
                $this->process_mime_word();
                break;

            default:
                if ($mimetype[0] === 'text') {
                    $this->process_mime_plaintext();
                } else {
                    $this->process_mime_binary();
                }
                break;
        }

        // Title and file name are appended to the content so that they are searchable as well
        if (!empty(trim($this->attachment->title))) {
            $this->title = "{$this->attachment->title} ({$this->attachment->name})";
            $this->content .= "\n{$this->attachment->title}\n{$this->attachment->name}";
        } else {
            $this->title = $this->attachment->name;
            $this->content .= "\n{$this->attachment->name}";
        }

        // The abstract is the first 200 characters of the content
        if (mb_strlen($this->content) > 200) {
            $this->abstract = mb_substr($this->content, 0, 200) . ' ...';
        } else {
            $this->abstract = $this->content;
        }
    }

    /**
     * Convert a Word attachment to plain text and index it.
     *
     * Uses the utility configured as 'utility_catdoc'; the output is written to
     * "<attachment path>.txt" and removed again by process_command().
     */
    private function process_mime_word() : void
    {
        $path = $this->check_utility('catdoc');
        if ($path === null) {
            return;
        }

        debug_add("Converting Word-Attachment to plain text");
        $wordfile = $this->attachment->get_path();
        $txtfile = "{$wordfile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'utf-8' : '8859-1';

        // File names are escaped so that spaces or shell metacharacters in the path cannot break the command
        $command = $path . " -d{$encoding} -a " . escapeshellarg($wordfile) . ' > ' . escapeshellarg($txtfile);
        $this->process_command($command, $txtfile);
    }

    /**
     * Convert a PDF attachment to plain text and index it.
     *
     * Uses the utility configured as 'utility_pdftotext'; the output is written to
     * "<attachment path>.txt" and removed again by process_command().
     */
    private function process_mime_pdf() : void
    {
        $path = $this->check_utility('pdftotext');
        if ($path === null) {
            return;
        }

        debug_add("Converting PDF-Attachment to plain text");
        $pdffile = $this->attachment->get_path();
        $txtfile = "{$pdffile}.txt";
        $encoding = (strtoupper($this->_i18n->get_current_charset()) === 'UTF-8') ? 'UTF-8' : 'Latin1';

        // File names are escaped so that spaces or shell metacharacters in the path cannot break the command
        $command = $path . " -enc {$encoding} -nopgbrk -eol unix " . escapeshellarg($pdffile) . ' ' . escapeshellarg($txtfile) . ' 2>&1';
        $this->process_command($command, $txtfile);
    }

    /**
     * Convert an RTF attachment to plain text and index it.
     *
     * Uses the utility configured as 'utility_unrtf'; the output is written to
     * "<attachment path>.txt" and removed again by process_command().
     */
    private function process_mime_richtext() : void
    {
        $path = $this->check_utility('unrtf');
        if ($path === null) {
            return;
        }

        debug_add("Converting RTF-Attachment to plain text");
        $rtffile = $this->attachment->get_path();
        $txtfile = "{$rtffile}.txt";

        // Kill the first five lines, they are crap from the converter.
        // File names are escaped so that spaces or shell metacharacters in the path cannot break the command
        $command = $path . ' --nopict --text ' . escapeshellarg($rtffile) . " | sed '1,5d' > " . escapeshellarg($txtfile);
        $this->process_command($command, $txtfile);
    }

    /**
     * Look up the configured path of an external conversion utility.
     *
     * If the config key "utility_<name>" is empty, the attachment is handled as
     * binary instead and null is returned.
     *
     * @param string $name Utility name, used as suffix of the config key
     * @return string|null The configured path/command, or null if none is configured
     */
    private function check_utility(string $name) : ?string
    {
        if ($path = midcom::get()->config->get('utility_' . $name)) {
            return $path;
        }
        debug_add('Could not find ' . $name . ', indexing as binary.', MIDCOM_LOG_INFO);
        $this->process_mime_binary();
        return null;
    }

    /**
     * Run a conversion command and load the text file it produced as content.
     *
     * The text file is deleted after it has been read. If the command did not
     * produce the file, or the file cannot be opened, the attachment is handled
     * as binary.
     *
     * @param string $command The complete shell command line to execute
     * @param string $txtfile Path of the file the command is expected to write
     */
    private function process_command(string $command, string $txtfile) : void
    {
        debug_add("Executing: {$command}");
        exec($command, $result, $returncode);
        debug_print_r("Execution returned {$returncode}: ", $result);

        if (!file_exists($txtfile)) {
            // We were unable to read the document into text
            $this->process_mime_binary();
            return;
        }

        $handle = fopen($txtfile, "r");
        if ($handle === false) {
            debug_add("Could not open {$txtfile} for reading, indexing as binary.", MIDCOM_LOG_ERROR);
            unlink($txtfile);
            $this->process_mime_binary();
            return;
        }

        try {
            // Kill all ^L (FF) characters
            $this->content = str_replace("\x0C", '', $this->get_content($handle));
        } finally {
            // Never leave the handle open or the temporary file behind
            fclose($handle);
            unlink($txtfile);
        }
    }

    /**
     * Simple plain-text driver, just copies the attachment.
     */
    private function process_mime_plaintext() : void
    {
        $this->content = $this->get_content();
    }

    /**
     * Processes HTML-style attachments (should therefore work with XML too),
     * strips tags and resolves entities.
     */
    private function process_mime_html() : void
    {
        $this->content = $this->html2text($this->get_content());
    }

    /**
     * Any binary file will have its name in the abstract unless no title
     * is defined, in which case the documents title already contains the file's
     * name.
     *
     * NOTE(review): every call in this class happens from within
     * process_attachment() before $this->title is assigned there, and
     * process_attachment() overwrites $this->abstract at its end, so this
     * method may currently have no lasting effect - confirm the intent.
     */
    private function process_mime_binary() : void
    {
        if (!empty(trim($this->title))) {
            $this->abstract = $this->attachment->name;
        }
    }

    /**
     * Returns the first four megabytes of the File referenced by $handle,
     * converted to the current charset. The limit is in place to
     * avoid clashes with the PHP Memory limit, it should be enough for most text
     * based attachments anyway.
     *
     * If you omit $handle, a handle to the document's attachment is created and
     * automatically closed after reading the data. A handle you pass in yourself
     * has to be closed by you afterwards.
     *
     * An empty string is returned if the attachment cannot be opened or nothing
     * can be read.
     *
     * @param resource $handle A valid file-handle to read from, or null to automatically create a
     *        handle to the current attachment.
     */
    private function get_content($handle = null) : string
    {
        $close = false;
        if ($handle === null) {
            $handle = $this->attachment->open('r');
            if (!is_resource($handle)) {
                // open() can fail (e.g. return false); fread() would raise a TypeError then
                debug_add('Attachment ' . $this->attachment->guid . ' cannot be read, no content indexed.', MIDCOM_LOG_ERROR);
                return '';
            }
            $close = true;
        }

        $content = fread($handle, self::MAX_CONTENT_BYTES);
        if ($close) {
            $this->attachment->close();
        }
        if ($content === false) {
            // Read failure: treat like an empty file
            $content = '';
        }
        return $this->_i18n->convert_to_current_charset($content);
    }
}
259