SolrCellTextExtractor   A
last analyzed

Complexity

Total Complexity 16

Size/Duplication

Total Lines 141
Duplicated Lines 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
eloc 52
c 2
b 0
f 0
dl 0
loc 141
rs 10
wmc 16

6 Methods

Rating   Name   Duplication   Size   Complexity  
A supportsMime() 0 4 1
A isAvailable() 0 5 1
A supportsExtension() 0 8 1
A setHttpClient() 0 4 1
A getHttpClient() 0 7 2
B getContent() 0 56 10
1
<?php
2
3
namespace SilverStripe\TextExtraction\Extractor;
4
5
use Exception;
6
use GuzzleHttp\Client;
7
use GuzzleHttp\Psr7\Response;
8
use InvalidArgumentException;
9
use Psr\Log\LoggerInterface;
10
use SilverStripe\Assets\File;
11
use SilverStripe\Core\Injector\Injector;
12
13
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * The file is POSTed as a multipart request to the configured base URL and
 * the extracted text is pulled out of the XML response body with a regular
 * expression.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for Solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * Never accessed as a property; it is read through the config API
     * ($this->config()->get('base_url')) in isAvailable() and getContent().
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * Config value, not referenced directly inside this class.
     * NOTE(review): presumably used by the surrounding framework to rank
     * extractors against each other - confirm against FileTextExtractor.
     *
     * @var int
     * @config
     */
    private static $priority = 75;

    /**
     * HTTP client used to talk to Solr; created lazily by getHttpClient()
     * unless one was injected through setHttpClient().
     *
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, creating a default Guzzle client on first use.
     *
     * @return Client
     */
    public function getHttpClient()
    {
        if (!$this->httpClient) {
            $this->httpClient = new Client();
        }

        return $this->httpClient;
    }

    /**
     * Replaces the HTTP client used for extraction requests.
     *
     * @param  Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;

        return $this;
    }

    /**
     * Reports whether this extractor can be used, i.e. whether a base URL
     * has been configured. No request is made to check the endpoint.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $url = $this->config()->get('base_url');

        return (bool) $url;
    }

    /**
     * Checks the (case-insensitive) file extension against the list of
     * formats this extractor handles.
     *
     * @param  string $extension
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv'
            ],
            true
        );
    }

    /**
     * Always false: this extractor is selected by file extension only.
     *
     * @param  string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Sends the file to the configured extraction endpoint and returns the
     * text found in the response.
     *
     * @param  File|string $file File object, or a path to a file on disk
     * @return string|null Extracted text; an empty string when no file was given
     *                     (or the path does not exist); null when the request
     *                     failed or the response held no entry for the file name
     * @throws InvalidArgumentException When base_url is not configured
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }

        $fileName = $file instanceof File ? $file->getFilename() : basename($file);
        $client = $this->getHttpClient();

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        try {
            $stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
            /** @var Response $response */
            $response = $client
                ->post($baseUrl, [
                    'multipart' => [
                        ['name' => 'extractOnly', 'contents' => 'true'],
                        ['name' => 'extractFormat', 'contents' => 'text'],
                        ['name' => 'myfile', 'contents' => $stream],
                    ]
                ]);
        } catch (InvalidArgumentException $e) {
            $this->logExtractionFailure('Error extracting text from "%s" (message: %s)', $fileName, $e);

            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
            // query in some cases.
            $this->logExtractionFailure(
                'Tika server error attempting to extract from "%s" (message: %s)',
                $fileName,
                $e
            );

            return null;
        }

        $matches = [];
        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The file name is quoted with the "/" delimiter so that names containing a
        // slash (e.g. a folder prefix) cannot terminate the pattern early.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName, '/')),
            (string) $response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }

    /**
     * Formats an extraction failure and writes it to the logger at "notice" level.
     *
     * @param string    $template sprintf() template taking the file name and the exception message
     * @param string    $fileName Name of the file that was being processed
     * @param Exception $e        The exception that interrupted the request
     */
    private function logExtractionFailure($template, $fileName, Exception $e)
    {
        $msg = sprintf($template, $fileName, $e->getMessage());
        Injector::inst()->get(LoggerInterface::class)->notice($msg);
    }
}
165