Issues (98)

application/mQueue/Service/SearchEngine.php (1 issue)

1
<?php
2
3
namespace mQueue\Service;
4
5
/**
 * Search Engine to find sources for movies, given its title.
 *
 * It relies on Nova (a set of Python scripts shipped in library/searchengine)
 * to query several popular websites, then parses and scores the results.
 */
class SearchEngine
{
    /**
     * Returns the command for the appropriate version of Nova
     *
     * The installed Python version is detected by running `python --version`.
     * Python 3 and later use the "nova3" scripts, anything else (including an
     * undetectable version) uses the legacy "nova" scripts.
     *
     * @return string
     */
    protected function getNovaCmd()
    {
        // shell_exec() returns null when the command fails or prints nothing
        $output = (string) shell_exec('python --version 2>&1');

        // Without a recognizable version number we fall back to the legacy scripts
        $version = preg_match('/\d+\.\d+\.\d+/', $output, $matches) ? $matches[0] : '0.0.0';
        $directory = version_compare($version, '3.0.0', '>=') ? 'nova3' : 'nova';

        return 'python ' . APPLICATION_PATH . '/../library/searchengine/' . $directory . '/nova2.py';
    }

    /**
     * Execute a shell command with a timeout
     *
     * @param string $cmd shell command
     * @param int $timeout seconds after which the process will be killed
     *
     * @return string the stdout of the command, empty if the process could not be started
     */
    protected function execute($cmd, $timeout)
    {
        $maximumTime = time() + $timeout;
        $stdout = '';

        $pipes = [];
        $process = proc_open(
            $cmd, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipes
        );

        if (!is_resource($process)) {
            return $stdout;
        }

        // Give group id to process (to later kill all its children)
        $status = proc_get_status($process);
        $pid = $status['pid'];
        posix_setpgid($pid, $pid);

        // We never write to the command, and we must never block while reading from it
        fclose($pipes[0]);
        stream_set_blocking($pipes[1], 0);
        stream_set_blocking($pipes[2], 0);

        while (true) {
            $stdout .= stream_get_contents($pipes[1]);

            // Discard stderr so that a full pipe buffer cannot stall the command
            stream_get_contents($pipes[2]);

            $status = proc_get_status($process);
            if (!$status['running']) {
                // Collect whatever was written between our last read and the exit
                $stdout .= stream_get_contents($pipes[1]);

                fclose($pipes[1]);
                fclose($pipes[2]);
                proc_close($process);

                break;
            }

            if (time() > $maximumTime) {
                // sends SIGKILL to all processes inside group
                posix_kill(-$pid, 9);
                proc_terminate($process, 9);
            }

            // 1 second will not make accurate timeout, but we don't really need accuracy
            sleep(1);
        }

        return $stdout;
    }

    /**
     * Parse string content and return an array of unique sources
     *
     * Each line is expected to hold seven values separated by "|" (link, name,
     * size, seeds, leech, engine_url, page). Lines with a different number of
     * values are ignored. Two sources with the same name and size are
     * considered duplicates and only the first one is kept.
     *
     * @param string $content
     *
     * @return array sources indexed by their link
     */
    public function parse($content)
    {
        $data = [];
        $keys = ['link', 'name', 'size', 'seeds', 'leech', 'engine_url', 'page'];
        $duplicates = [];

        // Accept any kind of line ending, whatever the platform we are running on
        $lines = preg_split('/\R/', trim((string) $content));
        foreach ($lines as $line) {
            $values = explode('|', trim($line));
            if (count($keys) !== count($values)) {
                continue;
            }

            $source = array_combine($keys, $values);

            // Values cannot contain "|", so it is a safe separator for an unambiguous key
            $duplicateKey = $source['name'] . '|' . $source['size'];
            if (!isset($duplicates[$duplicateKey])) {
                $data[$source['link']] = $source;
                $duplicates[$duplicateKey] = true;
            }
        }

        return $data;
    }

    /**
     * Search for the given title and return an array of sources
     *
     * The command is echoed before being run, and the raw output of the
     * command is also written to a file in the system temporary directory.
     *
     * @param string $title
     *
     * @return array sources
     */
    public function search($title)
    {
        $query = escapeshellarg(str_replace(' ', '+', $title));
        $cmd = $this->getNovaCmd() . ' all movies ' . $query . ' 2>&1';
        echo $cmd . PHP_EOL;
        $content = $this->execute($cmd, 5 * 60); // 5 minutes to search

        // A title may contain path separators (eg: "Face/Off"), which must not end up in the file name
        $fileName = str_replace(['/', '\\', "\0"], '_', $title);
        $path = sys_get_temp_dir() . '/mqueue_' . $fileName;
        file_put_contents($path, $content);

        return $this->parse($content);
    }

    /**
     * Returns the given name simplified as much as possible
     *
     * The result is lowercase, ASCII only, without anything between
     * parentheses, brackets or braces, and made only of alphanumeric
     * words separated by a single space.
     *
     * @param string $name
     *
     * @return string
     */
    protected function cleanName($name)
    {
        // Insert space before uppercase letters
        $name = preg_replace('/([A-Z])/', ' \1', $name);
        $name = mb_strtolower($name);

        // Get rid of all accents. If the conversion fails we keep the name as is,
        // remaining non-ASCII bytes will be dropped by the alphanum filter below
        $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $name);
        if ($ascii !== false) {
            $name = $ascii;
        }

        // Replace special character with common representation
        $name = str_replace('&', 'and', $name);

        // Remove all grouped things (may include team name at the beginning of name)
        // Or remove incomplete grouped things at the end of string
        $name = preg_replace('/\(.*(\)|$)/U', ' ', $name);
        $name = preg_replace('/\[.*(\]|$)/U', ' ', $name);
        $name = preg_replace('/\{.*(\}|$)/U', ' ', $name);

        // Keep only alphanum character
        $name = preg_replace('/[^[:alnum:]]/', ' ', $name);

        // Keep only a single space character
        $name = preg_replace('/[[:space:]]+/', ' ', $name);

        return trim($name);
    }

    /**
     * Compute scores of all sources according to the title searched
     *
     * Each source receives three additional keys:
     *  - identity: how similar the beginning of the source name is to the title
     *    (percentage from similar_text()), plus 20 if the name contains the year
     *    found at the end of the title
     *  - quality: sum of the points of all matching rules, plus 10 for big files
     *  - score: 2 * identity + quality, or 0 if identity or quality is too low
     *
     * @param string $title title of the movie, optionally ending with its year in parentheses
     * @param array $sources
     *
     * @return array sources sorted by score, then seeds, then leech, all descending
     */
    public function computeScores($title, array $sources)
    {
        $rules = [
            // @see http://en.wikipedia.org/wiki/Pirated_movie_release_types
            '/\b(dvdrip|bdrip|brrip|blu-ray|bluray|bdr|bd5|bd9)\b/i' => 80, // Good sources
            '/\b(dvdr|dvd-full|full-rip|iso|dvd-5|dvd-9)\b/i' => 40, // Ok sources
            '/\b(dsr|dsrip|dthrip|dvbrip|hdtv|pdtv|tvrip|hdtvrip|vodrip|vodr)\b/i' => 10, // Soso sources
            '/\b(cam|camrip|ts|telesync|pdvd|wp|workprint|tc|telecine|ppv|ppvrip|scr|screener|dvdscr|dvdscreener|bdscr|ddc|r5)\b/i' => -20, // Low quality or unsure sources
            '/\b(maxspeed|axxo|dimension|fxg)\b/i' => 50, // Well known teams
            '/\b(swesub|nlt-release)\b/i' => -30, // Avoid teams specialized in foreign versions
            '/\b(french|fra|truefrench|italian|ita|russian|german)\b/i' => -30, // Avoid dubbed language
            '/\b1080p\b/i' => 20,
            '/\b720p\b/i' => 30, // Favor 720p instead of 1080p because of the filesize and better "compatibility" for low powered computer
            '/\b(x264|xvid)\b/i' => 20, // Good formats
            '/\b(uncut|unrated|extended|director\'s cut|director cut)\b/i' => 20, // Director's cut version is supposedly better
        ];

        $cleanTitle = $this->cleanName($title);

        // The year is optional: without it, no source can get the year boost
        $year = null;
        if (preg_match('/((18|19|20)\d{2})(– )?\)$/', $title, $matches)) {
            $year = $matches[1];
        }

        foreach ($sources as &$source) {
            $identity = 0;
            $quality = 0;

            // TODO: re-evaluate an alternate identity method (cutting the source name at the year) with more data

            // Identity mostly is matching title in source name
            $cleanSource = mb_substr($this->cleanName($source['name']), 0, mb_strlen($cleanTitle));
            similar_text($cleanTitle, $cleanSource, $identity);

            // If the name contains the year of the movie, boost identity
            if ($year !== null && preg_match('/\b' . $year . '\b/', $source['name'])) {
                $identity += 20;
            }

            // Apply all regexp based quality rules
            foreach ($rules as $pattern => $score) {
                if (preg_match($pattern, $source['name'])) {
                    $quality += $score;
                }
            }

            // File should be at the very minimum +500MB
            if ($source['size'] > 500 * 1024 * 1024) {
                $quality += 10;
            }

            $source['identity'] = $identity;
            $source['quality'] = $quality;
            $source['score'] = $identity >= 100 && $quality > 80 ? 2 * $identity + $quality : 0;
        }
        // Break the reference, so that $sources cannot be modified by accident later on
        unset($source);

        // Compare two values for a descending sort. We must return a true integer, because
        // usort() casts floats to int and would consider a difference of 0.5 as an equality
        $descending = function ($value, $otherValue) {
            if ($otherValue == $value) {
                return 0;
            }

            return $otherValue > $value ? 1 : -1;
        };

        // Sort by score, then seeds, then leech
        usort($sources, function ($source, $other) use ($descending) {
            foreach (['score', 'seeds', 'leech'] as $key) {
                $result = $descending($source[$key], $other[$key]);
                if ($result !== 0) {
                    return $result;
                }
            }

            return strcmp($other['link'], $source['link']);
        });

        return $sources;
    }
}
257