Completed
Push — master ( bde4cf...61750e )
by Daniel
9s
created

SolrCellTextExtractor::isAvailable()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 1
Metric Value
c 3
b 0
f 1
dl 0
loc 5
rs 9.4285
cc 1
eloc 3
nc 1
nop 0
1
<?php
2
use Guzzle\Http\Client;
3
4
/**
5
 * Text extractor that calls an Apache Solr instance
6
 * and extracts content via the "ExtractingRequestHandler" endpoint.
7
 * Does not alter the Solr index itself, but uses it purely
8
 * for its file parsing abilities.
9
 * 
10
 * @author ischommer
11
 * @see  http://wiki.apache.org/solr/ExtractingRequestHandler
12
 */
13
class SolrCellTextExtractor extends FileTextExtractor
0 ignored issues
show
Coding Style Compatibility introduced by
PSR1 recommends that each class must be in a namespace of at least one level to avoid collisions.

You can fix this by adding a namespace to your class:

namespace YourVendor;

class YourClass { }

When choosing a vendor namespace, try to pick something that is not too generic to avoid conflicts with other libraries.

Loading history...
14
{
15
    /**
16
     * Base URL to use for solr text extraction.
17
     * E.g. http://localhost:8983/solr/update/extract
18
     *
19
     * @config
20
     * @var string
21
     */
22
    private static $base_url;
0 ignored issues
show
Unused Code introduced by
The property $base_url is not used and could be removed.

This check marks private properties in classes that are never used. Those properties can be removed.

Loading history...
23
24
    /**
     * NOTE(review): presumably a configuration value ranking this extractor against
     * other FileTextExtractor implementations - confirm how the parent class reads it.
     *
     * @var int
     */
    private static $priority = 75;
0 ignored issues
show
Comprehensibility introduced by
Consider using a different property name as you override a private property of the parent class.
Loading history...
Unused Code introduced by
The property $priority is not used and could be removed.

This check marks private properties in classes that are never used. Those properties can be removed.

Loading history...
25
26
    /**
     * HTTP client used for requests to Solr. Created on first use by getHttpClient()
     * unless one has already been supplied through setHttpClient().
     *
     * @var Client|null
     */
    protected $httpClient;
27
28
    /**
     * Returns the HTTP client used to talk to Solr, creating it on first use.
     *
     * The "base_url" configuration value is checked on every call, so a missing
     * value raises an exception even when a client has already been set.
     *
     * @return Client
     * @throws InvalidArgumentException When "base_url" is not configured.
     */
    public function getHttpClient()
    {
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        // Reuse a previously created (or injected) client.
        if ($this->httpClient) {
            return $this->httpClient;
        }

        $this->httpClient = new Client($baseUrl);

        return $this->httpClient;
    }
38
39
    /**
     * Replaces the HTTP client that getHttpClient() hands out.
     *
     * @param Client $client Client instance to use for subsequent requests.
     * @return void
     */
    public function setHttpClient($client)
    {
        $this->httpClient = $client;
    }
43
44
    /**
     * Reports whether this extractor can be used.
     *
     * Availability is decided solely by whether the "base_url" configuration
     * value is truthy; no request is made to check that it is reachable.
     *
     * @return bool
     */
    public function isAvailable()
    {
        // Canonical short-form cast; "(boolean)" is a legacy alias.
        return (bool) $this->config()->get('base_url');
    }
49
50
    /**
     * Tells whether files with the given extension are handled by this extractor.
     *
     * The comparison is case-insensitive: the extension is lower-cased before
     * it is looked up in the list of supported extensions.
     *
     * @param string $extension File extension without the leading dot, e.g. "pdf".
     * @return bool True when the extension is in the supported list.
     */
    public function supportsExtension($extension)
    {
        $supported = array(
            'pdf', 'doc', 'docx', 'xls', 'xlsx',
            'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
            'ppt', 'pptx', 'odp', 'fodp', 'csv'
        );

        // Strict comparison: both sides are strings, so no type juggling is wanted.
        return in_array(strtolower($extension), $supported, true);
    }
61
62
    /**
     * MIME-type matching is not offered by this extractor.
     *
     * @param string $mime MIME type to check (ignored).
     * @return bool Always false.
     */
    public function supportsMime($mime)
    {
        // Always decline here; matching is left to supportsExtension().
        return false;
    }
67
    
68
    /**
     * Sends the file at $path to Solr and returns the text extracted from it.
     *
     * The file is POSTed with "extractOnly" set to "true" and "extractFormat" set
     * to "text". The text is then cut out of the response body with a regular
     * expression and returned exactly as it appears there (no entity decoding is
     * performed here).
     *
     * @param string $path Path of the file to extract text from.
     * @return string|null Empty string when no path is given; the extracted text on
     *                     success; null when sending the request raises an
     *                     InvalidArgumentException, when the response has no entry
     *                     named after the file, or when the regular expression fails.
     * @throws InvalidArgumentException When "base_url" is not configured (raised by
     *                                  getHttpClient(), outside the try block).
     */
    public function getContent($path)
    {
        if (!$path) {
            return ""; // no file
        }

        $fileName = basename($path);
        $client = $this->getHttpClient();
        try {
            $request = $client
                ->post()
                ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
                ->addPostFiles(array('myfile' => $path));
            $response = $request->send();
        } catch (InvalidArgumentException $e) {
            SS_Log::log(
                sprintf(
                    'Error extracting text from "%s" (message: %s)',
                    $path,
                    $e->getMessage()
                ),
                SS_Log::NOTICE
            );
            return null;
        }

        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The "/" delimiter is handed to preg_quote() so it is escaped along with the
        // other special characters in the file name.
        $pattern = sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName, '/'));
        $matched = preg_match($pattern, (string)$response->getBody(), $matches);

        if ($matched === false) {
            // preg_match() itself failed; record it instead of silently
            // reporting "no content".
            SS_Log::log(
                sprintf(
                    'Error parsing extracted text for "%s" (PCRE error code: %d)',
                    $path,
                    preg_last_error()
                ),
                SS_Log::NOTICE
            );
            return null;
        }

        return $matched ? $matches[1] : null;
    }
102
}
103