Completed
Pull Request — master (#28)
by
unknown
08:00
created

PDFTextExtractor   A

Complexity

Total Complexity 18

Size/Duplication

Total Lines 133
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 2

Importance

Changes 8
Bugs 1 Features 2
Metric Value
c 8
b 1
f 2
dl 0
loc 133
wmc 18
lcom 1
cbo 2
rs 10

7 Methods

Rating   Name   Duplication   Size   Complexity  
A isAvailable() 0 5 3
A supportsExtension() 0 4 1
A supportsMime() 0 12 1
B bin() 0 23 5
A getContent() 0 8 2
B getRawOutput() 0 19 5
A cleanupLigatures() 0 13 1
1
<?php
2
3
/**
4
 * Text extractor that calls pdftotext to do the conversion.
5
 * @author mstephens
6
 *
7
 */
8
class PDFTextExtractor extends FileTextExtractor
0 ignored issues
show
Coding Style Compatibility introduced by
PSR-1 recommends that each class be placed in a namespace of at least one level to avoid collisions.

You can fix this by adding a namespace to your class:

namespace YourVendor;

class YourClass { }

When choosing a vendor namespace, try to pick something that is not too generic to avoid conflicts with other libraries.

Loading history...
9
{
10
    /**
11
     * Set to bin path this extractor can execute
12
     *
13
     * @var string
14
     */
15
    private static $binary_location = null;
0 ignored issues
show
Unused Code introduced by
The property $binary_location is not used and could be removed.

This check marks private properties in classes that are never used. Those properties can be removed.

Loading history...
16
17
    /**
18
     * Used if binary_location isn't set.
19
     * List of locations to search for a given binary in
20
     *
21
     * @config
22
     * @var array
23
     */
24
    private static $search_binary_locations = array(
0 ignored issues
show
Unused Code introduced by
The property $search_binary_locations is not used and could be removed.

This check marks private properties in classes that are never used. Those properties can be removed.

Loading history...
25
        '/usr/bin',
26
        '/usr/local/bin',
27
    );
28
29
    /**
     * Reports whether the pdftotext binary was located and can be executed.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $binary = $this->bin('pdftotext');

        // bin() yields null when the program is not found in any search path
        if (!$binary) {
            return false;
        }

        return file_exists($binary) && is_executable($binary);
    }
34
35
    /**
     * Reports whether the given file extension is handled by this extractor.
     *
     * The comparison is case-insensitive and expects the extension without
     * a leading dot.
     *
     * @param string $extension
     * @return bool
     */
    public function supportsExtension($extension)
    {
        $normalised = strtolower($extension);

        return 'pdf' === $normalised;
    }
39
40
    /**
     * Reports whether the given MIME type is one of the PDF types handled
     * by this extractor. The comparison is case-insensitive.
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        $pdfMimeTypes = array(
            'application/pdf',
            'application/x-pdf',
            'application/x-bzpdf',
            'application/x-gzpdf',
        );

        // Needle and haystack are both strings, so a strict lookup is equivalent
        return in_array(strtolower($mime), $pdfMimeTypes, true);
    }
52
53
    /**
     * Locates the given program on disk.
     *
     * When the binary_location config value is set, only that directory is
     * searched; otherwise every directory in search_binary_locations is
     * tried in order. In each directory the bare program name is checked
     * first, then the name with an ".exe" suffix.
     *
     * @param string $program Name of binary
     * @return string|null Path of the first match, or null if none exists
     */
    protected function bin($program = '')
    {
        // An explicitly configured location replaces the search list
        $configured = $this->config()->binary_location;
        $directories = $configured
            ? array($configured)
            : $this->config()->search_binary_locations;

        foreach ($directories as $directory) {
            $candidate = $directory . '/' . $program;

            // Plain name first, then the ".exe" variant
            foreach (array($candidate, $candidate . '.exe') as $file) {
                if (file_exists($file)) {
                    return $file;
                }
            }
        }

        // Not found in any directory
        return null;
    }
82
83
    /**
     * Extracts the text of the PDF at the given path, with ligatures
     * replaced by their plain-letter equivalents.
     *
     * @param string $path Path of the file to read; a falsy value yields ""
     * @return string
     */
    public function getContent($path)
    {
        // No file, nothing to extract
        if (empty($path)) {
            return '';
        }

        return $this->cleanupLigatures($this->getRawOutput($path));
    }
91
92
    /**
     * Invoke pdftotext with the given path
     *
     * The command redirects stderr into stdout ("2>&1"), so the captured
     * output lines hold either the extracted text or the tool's error
     * message. Lines are concatenated without any separator.
     *
     * @param string $path
     * @return string Output
     * @throws FileTextExtractor_Exception If the extractor is unavailable or
     *                                     pdftotext exits with a non-zero status
     */
    protected function getRawOutput($path)
    {
        if (!$this->isAvailable()) {
            throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
        }

        $command = sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path));

        // exec() appends to an existing array, so start from an empty one
        $output = array();
        exec($command, $output, $exitCode);

        // Any non-zero exit status is a failure. The status itself is an
        // integer, so the message is always built from the captured output.
        if ($exitCode !== 0) {
            throw new FileTextExtractor_Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                implode('', $output)
            ));
        }

        return implode('', $output);
    }
118
119
    /**
     * Removes utf-8 ligatures.
     *
     * Each single-character ligature in the range U+FB00 to U+FB06 is
     * replaced by its plain-letter spelling. The keys are written as
     * explicit UTF-8 byte sequences so they survive editors and tools
     * that normalise or re-encode the file.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        $mapping = array(
            "\xEF\xAC\x80" => 'ff',  // U+FB00
            "\xEF\xAC\x81" => 'fi',  // U+FB01
            "\xEF\xAC\x82" => 'fl',  // U+FB02
            "\xEF\xAC\x83" => 'ffi', // U+FB03
            "\xEF\xAC\x84" => 'ffl', // U+FB04
            // NOTE(review): U+FB05 is the "long s + t" ligature; the 'ft'
            // replacement is kept as found - confirm whether 'st' was intended
            "\xEF\xAC\x85" => 'ft',  // U+FB05
            "\xEF\xAC\x86" => 'st',  // U+FB06
        );

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
140
}
141