Passed
Pull Request — master (#1249)
by
unknown
19:13
created

ExtractingQuery   A

Complexity

Total Complexity 12

Size/Duplication

Total Lines 123
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Test Coverage

Coverage 69.77%

Importance

Changes 0
Metric Value
wmc 12
lcom 1
cbo 1
dl 0
loc 123
ccs 30
cts 43
cp 0.6977
rs 10
c 0
b 0
f 0

8 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 7 1
A getMultiPartPostDataBoundary() 0 4 1
A getFile() 0 4 1
A setFile() 0 6 2
A getFileName() 0 4 1
A getRawPostFileData() 0 22 3
A setExtractOnly() 0 8 2
A getQueryParameters() 0 13 1
1
<?php
2
namespace ApacheSolrForTypo3\Solr;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2010-2015 Ingo Renner <[email protected]>
8
 *  All rights reserved
9
 *
10
 *  This script is part of the TYPO3 project. The TYPO3 project is
11
 *  free software; you can redistribute it and/or modify
12
 *  it under the terms of the GNU General Public License as published by
13
 *  the Free Software Foundation; either version 2 of the License, or
14
 *  (at your option) any later version.
15
 *
16
 *  The GNU General Public License can be found at
17
 *  http://www.gnu.org/copyleft/gpl.html.
18
 *
19
 *  This script is distributed in the hope that it will be useful,
20
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
21
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22
 *  GNU General Public License for more details.
23
 *
24
 *  This copyright notice MUST APPEAR in all copies of the script!
25
 ***************************************************************/
26
27
/**
 * Specialized query for content extraction using Solr Cell.
 *
 * Holds the path of a single file and builds both the multi-part form-data
 * POST body carrying that file's content and the query parameters that
 * accompany it.
 */
class ExtractingQuery extends Query
{
    /**
     * Path to the file to extract content and meta data from.
     *
     * @var string
     */
    protected $file;

    /**
     * Default boundary used for the multi-part form-data POST body.
     *
     * @var string
     */
    protected $multiPartPostDataBoundary;

    /**
     * Constructor
     *
     * The path is stored as given; unlike setFile() no existence check is
     * performed here. The check happens when the file is actually read in
     * getRawPostFileData().
     *
     * @param string $file Absolute path to the file to extract content and meta data from.
     */
    public function __construct($file)
    {
        parent::__construct('');

        $this->file = $file;
        $this->multiPartPostDataBoundary = '--' . md5(uniqid(time()));
    }

    /**
     * Returns the boundary used for this multi-part form-data POST body data.
     *
     * @return string multi-part form-data POST boundary
     */
    public function getMultiPartPostDataBoundary()
    {
        return $this->multiPartPostDataBoundary;
    }

    /**
     * Gets the absolute path to the file to extract content and meta data from.
     *
     * @return string Absolute path to the file to extract content and meta data from.
     */
    public function getFile()
    {
        return $this->file;
    }

    /**
     * Sets the absolute path to the file to extract content and meta data from.
     *
     * The path is only taken over if it points to an existing regular file;
     * otherwise the call is silently ignored and the previous path is kept.
     *
     * @param string $file Absolute path to the file to extract content and meta data from.
     */
    public function setFile($file)
    {
        if (is_file($file)) {
            $this->file = $file;
        }
    }

    /**
     * Gets the filename portion of the file.
     *
     * @return string The filename.
     */
    public function getFileName()
    {
        return basename($this->file);
    }

    /**
     * Constructs a multi-part form-data POST body from the file's content.
     *
     * @param string $boundary Optional boundary to use; when omitted or empty,
     *                         the boundary generated in the constructor is used.
     * @return string The file to extract as raw POST data.
     * @throws \Apache_Solr_InvalidArgumentException if the file does not exist,
     *                                               is not a regular file, or cannot be read.
     */
    public function getRawPostFileData($boundary = '')
    {
        // Deliberately not empty(): a caller-supplied boundary of "0" is a
        // usable value and must not be replaced by the default.
        if (!is_string($boundary) || $boundary === '') {
            $boundary = $this->multiPartPostDataBoundary;
        }

        $errorMessage = 'Could not retrieve content from file ' . $this->file;

        // Validate before reading: file_get_contents() raises a PHP warning on
        // a missing or unreadable path, and is_file() additionally rejects
        // directories. This mirrors the is_file() check done in setFile().
        if (!is_file($this->file) || !is_readable($this->file)) {
            throw new \Apache_Solr_InvalidArgumentException($errorMessage);
        }

        $fileData = file_get_contents($this->file);
        if ($fileData === false) {
            throw new \Apache_Solr_InvalidArgumentException($errorMessage);
        }

        $data = "--{$boundary}\r\n";
        // The 'filename' used here becomes the property name in the response.
        $data .= 'Content-Disposition: form-data; name="file"; filename="extracted"';
        $data .= "\r\nContent-Type: application/octet-stream\r\n\r\n";
        $data .= $fileData;
        $data .= "\r\n--{$boundary}--\r\n";

        return $data;
    }

    /**
     * Enables or disables extraction only.
     *
     * Enabling sets the 'extractOnly' query parameter to the string 'true';
     * disabling removes the parameter altogether.
     *
     * @param bool $extractOnly If TRUE, only extracts content from the given file without indexing
     */
    public function setExtractOnly($extractOnly = true)
    {
        if ($extractOnly) {
            $this->queryParameters['extractOnly'] = 'true';
        } else {
            unset($this->queryParameters['extractOnly']);
        }
    }

    /**
     * Builds an array of query parameters to use for the extraction query.
     *
     * @return array An array ready to use with query parameters
     */
    public function getQueryParameters()
    {
        // TODO create an Apache Solr patch to support Apache Tika's -m (and -l) options
        $extractParameters = [
            'resource.name' => $this->getFileName(),
            // Matches the -t command for the tika CLI app.
            'extractFormat' => 'text',
        ];

        // For string keys, entries already present in $this->queryParameters
        // override the defaults defined above.
        return array_merge($extractParameters, $this->queryParameters);
    }
}
154