1
|
|
|
<?php |
2
|
|
|
namespace ApacheSolrForTypo3\Solr; |
3
|
|
|
|
4
|
|
|
/*************************************************************** |
5
|
|
|
* Copyright notice |
6
|
|
|
* |
7
|
|
|
* (c) 2010-2015 Ingo Renner <[email protected]> |
8
|
|
|
* All rights reserved |
9
|
|
|
* |
10
|
|
|
* This script is part of the TYPO3 project. The TYPO3 project is |
11
|
|
|
* free software; you can redistribute it and/or modify |
12
|
|
|
* it under the terms of the GNU General Public License as published by |
13
|
|
|
* the Free Software Foundation; either version 2 of the License, or |
14
|
|
|
* (at your option) any later version. |
15
|
|
|
* |
16
|
|
|
* The GNU General Public License can be found at |
17
|
|
|
* http://www.gnu.org/copyleft/gpl.html. |
18
|
|
|
* |
19
|
|
|
* This script is distributed in the hope that it will be useful, |
20
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
21
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
22
|
|
|
* GNU General Public License for more details. |
23
|
|
|
* |
24
|
|
|
* This copyright notice MUST APPEAR in all copies of the script! |
25
|
|
|
***************************************************************/ |
26
|
|
|
|
27
|
|
|
/**
 * Specialized query for content extraction using Solr Cell.
 *
 * Holds the path of the file to extract and a multi-part boundary, renders
 * the file as a multi-part form-data POST body and provides the query
 * parameters that accompany the extraction request.
 */
class ExtractingQuery extends Query
{
    /**
     * Path to the file to extract content and meta data from.
     *
     * @var string
     */
    protected $file;

    /**
     * Boundary generated once per instance for the multi-part POST body.
     *
     * @var string
     */
    protected $multiPartPostDataBoundary;

    /**
     * Constructor
     *
     * @param string $file Absolute path to the file to extract content and meta data from.
     */
    public function __construct($file)
    {
        parent::__construct('');

        // The path is stored as given; unlike setFile(), no is_file() check
        // is performed here.
        $this->file = $file;
        $this->multiPartPostDataBoundary = '--' . md5(uniqid(time()));
    }

    /**
     * Returns the boundary used for this multi-part form-data POST body data.
     *
     * @return string multi-part form-data POST boundary
     */
    public function getMultiPartPostDataBoundary()
    {
        return $this->multiPartPostDataBoundary;
    }

    /**
     * Gets the absolute path to the file to extract content and meta data from.
     *
     * @return string Absolute path to the file to extract content and meta data from.
     */
    public function getFile()
    {
        return $this->file;
    }

    /**
     * Sets the absolute path to the file to extract content and meta data from.
     *
     * The path is only taken over when it points to an existing regular file;
     * otherwise the call is a no-op and the previous path is kept.
     *
     * @param string $file Absolute path to the file to extract content and meta data from.
     */
    public function setFile($file)
    {
        if (is_file($file)) {
            $this->file = $file;
        }
    }

    /**
     * Gets the filename portion of the file.
     *
     * @return string The filename.
     */
    public function getFileName()
    {
        return basename($this->file);
    }

    /**
     * Constructs a multi-part form-data POST body from the file's content.
     *
     * @param string $boundary Optional boundary to use; when omitted the
     *        boundary generated in the constructor is used.
     * @return string The file to extract as raw POST data.
     * @throws \Apache_Solr_InvalidArgumentException if the file's content cannot be read
     */
    public function getRawPostFileData($boundary = '')
    {
        // empty() also reports the string '0' as empty, although '0' is a
        // usable boundary. Only fall back to the instance boundary when no
        // boundary was actually supplied.
        if (empty($boundary) && $boundary !== '0') {
            $boundary = $this->multiPartPostDataBoundary;
        }

        $fileData = file_get_contents($this->file);
        if ($fileData === false) {
            throw new \Apache_Solr_InvalidArgumentException(
                'Could not retrieve content from file ' . $this->file
            );
        }

        // Single-part body: opening delimiter, part headers, blank line,
        // raw file content, closing delimiter.
        $data = "--{$boundary}\r\n";
        // The 'filename' used here becomes the property name in the response.
        $data .= 'Content-Disposition: form-data; name="file"; filename="extracted"';
        $data .= "\r\nContent-Type: application/octet-stream\r\n\r\n";
        $data .= $fileData;
        $data .= "\r\n--{$boundary}--\r\n";

        return $data;
    }

    /**
     * En / Disables extraction only
     *
     * @param bool $extractOnly If TRUE, only extracts content from the given file without indexing
     */
    public function setExtractOnly($extractOnly = true)
    {
        if ($extractOnly) {
            $this->queryParameters['extractOnly'] = 'true';
        } else {
            // Disabling removes the parameter instead of sending 'false'.
            unset($this->queryParameters['extractOnly']);
        }
    }

    /**
     * Builds an array of query parameters to use for the search query.
     *
     * Extraction defaults are merged with the parameters held in
     * $this->queryParameters; for identical keys the latter take precedence.
     *
     * @return array An array ready to use with query parameters
     */
    public function getQueryParameters()
    {
        // TODO create an Apache Solr patch to support Apache Tika's -m (and -l) options
        $extractionDefaults = [
            'resource.name' => $this->getFileName(),
            // Matches the -t command for the tika CLI app.
            'extractFormat' => 'text',
        ];

        return array_merge($extractionDefaults, $this->queryParameters);
    }
}
154
|
|
|
|