1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace mQueue\Service; |
4
|
|
|
|
5
|
|
|
/**
 * Search Engine to find sources for movies, given its title.
 * It relies on Nova to query several popular websites.
 */
class SearchEngine
{
    /**
     * Returns the command for the appropriate version of Nova
     *
     * The version reported by `python --version` selects between the `nova3`
     * (Python >= 3.0.0) and `nova` scripts. When no version can be detected,
     * the `nova` script is used.
     *
     * @return string
     */
    protected function getNovaCmd()
    {
        // shell_exec() may return null/false when the command cannot be run
        $output = (string) shell_exec('python --version 2>&1');

        $isPython3 = preg_match('/\d+\.\d+\.\d+/', $output, $matches) === 1
            && version_compare($matches[0], '3.0.0', '>=');

        $directory = $isPython3 ? 'nova3' : 'nova';

        return 'python ' . APPLICATION_PATH . '/../library/searchengine/' . $directory . '/nova2.py';
    }

    /**
     * Execute a shell command with a timeout
     *
     * Only stdout is collected; stderr is read and discarded so the child
     * cannot block on a full stderr pipe.
     *
     * @param string $cmd shell command
     * @param int $timeout seconds after which the process will be killed
     *
     * @return string the stdout of the command (empty string if the process could not be started)
     */
    protected function execute($cmd, $timeout)
    {
        $maximumTime = time() + $timeout;
        $stdout = '';

        $pipes = [];
        $process = proc_open(
            $cmd, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipes
        );

        if (!is_resource($process)) {
            return $stdout;
        }

        // Give group id to process (to later kill all its children)
        $status = proc_get_status($process);
        $pid = $status['pid'];
        posix_setpgid($pid, $pid);

        // Nothing is ever sent to the command
        fclose($pipes[0]);
        stream_set_blocking($pipes[1], 0);
        stream_set_blocking($pipes[2], 0);

        while (true) {
            $stdout .= stream_get_contents($pipes[1]);
            stream_get_contents($pipes[2]);

            if (time() > $maximumTime) {
                // sends SIGKILL to all processes inside group
                posix_kill(-$pid, 9);
                proc_terminate($process, 9);
            }

            $status = proc_get_status($process);
            if (!$status['running']) {
                // Collect whatever was written between the last read and the exit
                $stdout .= stream_get_contents($pipes[1]);

                break;
            }

            // 1 second will not make accurate timeout, but we don't really need accuracy
            sleep(1);
        }

        fclose($pipes[1]);
        fclose($pipes[2]);
        proc_close($process);

        return $stdout;
    }

    /**
     * Parse string content and return an array of unique sources
     *
     * Each valid line holds seven `|` separated values (link, name, size,
     * seeds, leech, engine_url, page). Lines with a different number of values
     * are ignored. Sources sharing the same name and size are considered
     * duplicates and only the first one is kept. The result is indexed by link.
     *
     * @param string $content
     *
     * @return array
     */
    public function parse($content)
    {
        $data = [];
        $keys = ['link', 'name', 'size', 'seeds', 'leech', 'engine_url', 'page'];
        $duplicates = [];

        // \R accepts any kind of line ending, whatever the platform that produced the content
        $lines = preg_split('/\R/', trim((string) $content));
        foreach ($lines as $line) {
            $values = explode('|', trim($line));
            if (count($values) !== count($keys)) {
                continue;
            }

            $source = array_combine($keys, $values);
            $duplicateKey = $source['name'] . $source['size'];
            if (isset($duplicates[$duplicateKey])) {
                continue;
            }

            $data[$source['link']] = $source;
            $duplicates[$duplicateKey] = true;
        }

        return $data;
    }

    /**
     * Search for the given title and return an array of sources
     *
     * The command is printed to stdout, and the raw output of the command is
     * dumped in a file named after the title in the system temporary directory.
     *
     * @param string $title
     *
     * @return array sources
     */
    public function search($title)
    {
        $cmd = $this->getNovaCmd() . ' all movies ' . escapeshellarg(str_replace(' ', '+', $title)) . ' 2>&1';
        echo $cmd . PHP_EOL;
        $content = $this->execute($cmd, 5 * 60); // 5 minutes to search

        // A title may contain directory separators (eg: "Face/Off"), which must not end up in the path
        $fileName = 'mqueue_' . str_replace(['/', '\\', "\0"], '_', $title);
        file_put_contents(sys_get_temp_dir() . '/' . $fileName, $content);

        return $this->parse($content);
    }

    /**
     * Returns the given name simplified as much as possible
     *
     * The result is lowercase, contains only alphanumeric characters separated
     * by single spaces, and has no grouped things such as `(...)`, `[...]` or `{...}`.
     *
     * @param string $name
     *
     * @return string
     */
    protected function cleanName($name)
    {
        // Insert space before uppercase letters
        $name = preg_replace('/([A-Z])/', ' \1', $name);
        $name = mb_strtolower($name);

        // Get rid of all accents, but keep the name as is if the conversion fails
        $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $name);
        if ($ascii !== false) {
            $name = $ascii;
        }

        // Replace special character with common representation
        $name = str_replace('&', 'and', $name);

        // Remove all grouped things (may include team name at the beginning of name)
        // Or remove incomplete grouped things at the end of string
        $name = preg_replace('/\(.*(\)|$)/U', ' ', $name);
        $name = preg_replace('/\[.*(\]|$)/U', ' ', $name);
        $name = preg_replace('/\{.*(\}|$)/U', ' ', $name);

        // Keep only alphanum character
        $name = preg_replace('/[^[:alnum:]]/', ' ', $name);

        // Keep only a single space character
        $name = preg_replace('/[[:space:]]+/', ' ', $name);

        return trim($name);
    }

    /**
     * Compute scores of all sources according to the title searched
     *
     * Each returned source gets three additional keys:
     *  - `identity`: similarity, in percent, between the cleaned title and the
     *    beginning of the cleaned source name, plus 20 if the source name
     *    contains the year found at the end of the title
     *  - `quality`: sum of all matching quality rules
     *  - `score`: `2 * identity + quality` if identity >= 100 and quality > 80, otherwise 0
     *
     * The returned list is re-indexed and sorted by descending score, then
     * seeds, then leech, then link.
     *
     * @param string $title
     * @param array $sources
     *
     * @return array
     */
    public function computeScores($title, array $sources)
    {
        $rules = [
            // @see http://en.wikipedia.org/wiki/Pirated_movie_release_types
            '/\b(dvdrip|bdrip|brrip|blu-ray|bluray|bdr|bd5|bd9)\b/i' => 80, // Good sources
            '/\b(dvdr|dvd-full|full-rip|iso|dvd-5|dvd-9)\b/i' => 40, // Ok sources
            '/\b(dsr|dsrip|dthrip|dvbrip|hdtv|pdtv|tvrip|hdtvrip|vodrip|vodr)\b/i' => 10, // Soso sources
            '/\b(cam|camrip|ts|telesync|pdvd|wp|workprint|tc|telecine|ppv|ppvrip|scr|screener|dvdscr|dvdscreener|bdscr|ddc|r5)\b/i' => -20, // Low quality or unsure sources
            '/\b(maxspeed|axxo|dimension|fxg)\b/i' => 50, // Well known teams
            '/\b(swesub|nlt-release)\b/i' => -30, // Avoid teams specialized in foreign versions
            '/\b(french|fra|truefrench|italian|ita|russian|german)\b/i' => -30, // Avoid dubbed language
            '/\b1080p\b/i' => 20,
            '/\b720p\b/i' => 30, // Favor 720p instead of 1080p because of the filesize and better "compatibility" for low powered computer
            '/\b(x264|xvid)\b/i' => 20, // Good formats
            '/\b(uncut|unrated|extended|director\'s cut|director cut)\b/i' => 20, // Director's cut version is supposedly better
        ];

        $cleanTitle = $this->cleanName($title);
        $titleLength = mb_strlen($cleanTitle);

        // The year is only known if the title ends with something like "(2010)".
        // Without a year there must be no pattern at all, otherwise an empty
        // year would match nearly every name and boost all identities.
        $yearPattern = null;
        if (preg_match('/((18|19|20)\d{2})(– )?\)$/', $title, $matches)) {
            $yearPattern = '/\b' . $matches[1] . '\b/';
        }

        // TODO: re-evaluate the alternate identity method with more data or drop it entirely
        // (it stripped the year, and everything after it, from the source name before comparing it to the title)
        foreach ($sources as $key => $source) {
            // Identity mostly is matching title in source name
            $identity = 0;
            $cleanSource = mb_substr($this->cleanName($source['name']), 0, $titleLength);
            similar_text($cleanTitle, $cleanSource, $identity);

            // If the name contains the year of the movie, boost identity
            if ($yearPattern !== null && preg_match($yearPattern, $source['name'])) {
                $identity += 20;
            }

            // Apply all regexp based quality rules
            $quality = 0;
            foreach ($rules as $pattern => $score) {
                if (preg_match($pattern, $source['name'])) {
                    $quality += $score;
                }
            }

            // File should be at the very minimum +500MB
            if ($source['size'] > 500 * 1024 * 1024) {
                $quality += 10;
            }

            $source['identity'] = $identity;
            $source['quality'] = $quality;
            $source['score'] = $identity >= 100 && $quality > 80 ? 2 * $identity + $quality : 0;

            $sources[$key] = $source;
        }

        // Sort by score, then seeds, then leech
        usort($sources, function ($source, $other) {
            foreach (['score', 'seeds', 'leech'] as $criterion) {
                // Values may be numeric strings or floats, and usort() truncates
                // the returned value to an int, so never return a raw difference
                $sourceValue = (float) $source[$criterion];
                $otherValue = (float) $other[$criterion];
                if ($otherValue != $sourceValue) {
                    return $otherValue > $sourceValue ? 1 : -1;
                }
            }

            return strcmp($other['link'], $source['link']);
        });

        return $sources;
    }
}
257
|
|
|
|