|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace mQueue\Service; |
|
4
|
|
|
|
|
5
|
|
|
/**
 * Search engine to find sources for a movie, given its title.
 *
 * It shells out to the Nova search script (nova2.py) to query several
 * websites, parses the pipe-separated output into sources, and can score
 * and sort those sources against the searched title.
 */
class SearchEngine
{
    /**
     * Returns the command for the appropriate version of Nova
     *
     * The version reported by `python --version` decides between the
     * Python 3 flavour (nova3) and the legacy one (nova). When the version
     * cannot be determined (python missing, unexpected output), the legacy
     * flavour is used.
     *
     * @return string
     */
    protected function getNovaCmd()
    {
        // shell_exec() returns null/false when nothing could be executed or printed
        $output = (string) shell_exec('python --version 2>&1');

        $isPython3 = preg_match('/\d+\.\d+\.\d+/', $output, $matches) === 1
            && version_compare($matches[0], '3.0.0', '>=');

        $directory = $isPython3 ? 'nova3' : 'nova';

        return APPLICATION_PATH . '/../library/searchengine/' . $directory . '/nova2.py';
    }

    /**
     * Execute a shell command with a timeout
     *
     * The command's stdout is collected by polling once per second until the
     * process ends. Once the timeout is exceeded, SIGKILL is sent to the
     * process group and to the process itself.
     *
     * @param string $cmd shell command
     * @param int $timeout seconds after which the process will be killed
     *
     * @return string the stdout of the command (empty string if the process could not be started)
     */
    protected function execute($cmd, $timeout)
    {
        $maximumTime = time() + $timeout;
        $stdout = '';

        $pipes = [];
        $process = proc_open(
            $cmd, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipes
        );

        if (!is_resource($process)) {
            return $stdout;
        }

        // Give group id to process (to later kill all its children)
        $status = proc_get_status($process);
        $pid = $status['pid'];
        posix_setpgid($pid, $pid);

        // Nothing is ever written to the command's stdin
        fclose($pipes[0]);

        // Reads must never block, otherwise the timeout could not be enforced
        stream_set_blocking($pipes[1], 0);
        stream_set_blocking($pipes[2], 0);

        while (true) {
            $stdout .= stream_get_contents($pipes[1]);

            // Drain stderr so that the child never blocks on a full pipe
            stream_get_contents($pipes[2]);

            $status = proc_get_status($process);
            if (!$status['running']) {
                break;
            }

            if (time() > $maximumTime) {
                // sends SIGKILL to all processes inside group
                posix_kill(-$pid, 9);
                proc_terminate($process, 9);
            }

            // 1 second will not make accurate timeout, but we don't really need accuracy
            sleep(1);
        }

        // Collect whatever was written between the last read and the end of the process
        $stdout .= stream_get_contents($pipes[1]);

        fclose($pipes[1]);
        fclose($pipes[2]);
        proc_close($process);

        return $stdout;
    }

    /**
     * Parse string content and return an array of unique sources
     *
     * Each line is expected to hold exactly seven pipe-separated values
     * (link, name, size, seeds, leech, engine_url, page); any other line is
     * ignored. Two sources with the same name and size are considered
     * duplicates and only the first one is kept. The result is indexed by link.
     *
     * @param string $content
     *
     * @return array
     */
    public function parse($content)
    {
        $keys = ['link', 'name', 'size', 'seeds', 'leech', 'engine_url', 'page'];
        $expectedCount = count($keys);
        $data = [];
        $duplicates = [];

        // \R matches any newline sequence, whatever the platform that produced the content
        foreach (preg_split('/\R/', trim((string) $content)) as $line) {
            $values = explode('|', trim($line));
            if (count($values) !== $expectedCount) {
                continue;
            }

            $source = array_combine($keys, $values);

            // Values cannot contain '|', so it is a safe separator that avoids
            // collisions such as ('a1', '23') versus ('a12', '3')
            $duplicateKey = $source['name'] . '|' . $source['size'];
            if (isset($duplicates[$duplicateKey])) {
                continue;
            }

            $duplicates[$duplicateKey] = true;
            $data[$source['link']] = $source;
        }

        return $data;
    }

    /**
     * Search for the given title and return an array of sources
     *
     * The command is echoed before being run, and the raw output of the
     * search is also written to a file named after the title in the system
     * temporary directory.
     *
     * @param string $title
     *
     * @return array sources
     */
    public function search($title)
    {
        $cmd = $this->getNovaCmd() . ' all movies ' . escapeshellarg(str_replace(' ', '+', $title)) . ' 2>&1';
        echo $cmd . PHP_EOL;
        $content = $this->execute($cmd, 5 * 60); // 5 minutes to search

        // Titles may contain directory separators (eg: "Face/Off"), which must
        // neither break the write nor let it escape the temporary directory
        $fileName = 'mqueue_' . str_replace(['/', '\\', "\0"], '_', $title);
        $path = sys_get_temp_dir() . '/' . $fileName;
        file_put_contents($path, $content);

        return $this->parse($content);
    }

    /**
     * Returns the given name simplified as much as possible
     *
     * The result is lowercase, transliterated to ASCII when possible,
     * stripped of anything between brackets, and contains only alphanumeric
     * words separated by single spaces.
     *
     * @param string $name
     *
     * @return string
     */
    protected function cleanName($name)
    {
        // Insert space before uppercase letters
        $name = preg_replace('/([A-Z])/', ' \1', $name);
        $name = mb_strtolower($name);

        // Get rid of all accents, keeping the name untouched if the conversion fails
        $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $name);
        if ($ascii !== false) {
            $name = $ascii;
        }

        // Replace special character with common representation
        $name = str_replace('&', 'and', $name);

        // Remove all grouped things (may include team name at the beginning of name)
        // Or remove incomplete grouped things at the end of string
        $name = preg_replace('/\(.*(\)|$)/U', ' ', $name);
        $name = preg_replace('/\[.*(\]|$)/U', ' ', $name);
        $name = preg_replace('/\{.*(\}|$)/U', ' ', $name);

        // Keep only alphanum character
        $name = preg_replace('/[^[:alnum:]]/', ' ', $name);

        // Keep only a single space character
        $name = preg_replace('/[[:space:]]+/', ' ', $name);

        return trim($name);
    }

    /**
     * Compute scores of all sources according to the title searched
     *
     * Every source gets three additional keys:
     *  - identity: similarity percentage between the cleaned title and the
     *    beginning of the cleaned source name, +20 if the source name
     *    contains the year found at the end of the title
     *  - quality: sum of the scores of all matching quality rules, +10 if
     *    the size is above 500MB
     *  - score: 2 * identity + quality, or 0 unless identity >= 100 and quality > 80
     *
     * Sources are returned sorted by descending score, then seeds, then
     * leech, then link.
     *
     * @param string $title
     * @param array $sources
     *
     * @return array
     */
    public function computeScores($title, array $sources)
    {
        $rules = [
            // @see http://en.wikipedia.org/wiki/Pirated_movie_release_types
            '/\b(dvdrip|bdrip|brrip|blu-ray|bluray|bdr|bd5|bd9)\b/i' => 80, // Good sources
            '/\b(dvdr|dvd-full|full-rip|iso|dvd-5|dvd-9)\b/i' => 40, // Ok sources
            '/\b(dsr|dsrip|dthrip|dvbrip|hdtv|pdtv|tvrip|hdtvrip|vodrip|vodr)\b/i' => 10, // Soso sources
            '/\b(cam|camrip|ts|telesync|pdvd|wp|workprint|tc|telecine|ppv|ppvrip|scr|screener|dvdscr|dvdscreener|bdscr|ddc|r5)\b/i' => -20, // Low quality or unsure sources
            '/\b(maxspeed|axxo|dimension|fxg)\b/i' => 50, // Well known teams
            '/\b(swesub|nlt-release)\b/i' => -30, // Avoid teams specialized in foreign versions
            '/\b(french|fra|truefrench|italian|ita|russian|german)\b/i' => -30, // Avoid dubbed language
            '/\b1080p\b/i' => 20,
            '/\b720p\b/i' => 30, // Favor 720p instead of 1080p because of the filesize and better "compatibility" for low powered computer
            '/\b(x264|xvid)\b/i' => 20, // Good formats
            '/\b(uncut|unrated|extended|director\'s cut|director cut)\b/i' => 20, // Director's cut version is supposedly better
        ];

        $cleanTitle = $this->cleanName($title);
        $titleLength = mb_strlen($cleanTitle);

        // The year is expected at the very end of the title, eg: "Title (1999)".
        // Without a year there is nothing to look for in source names; an empty
        // year would otherwise yield a pattern matching almost any name.
        $yearPattern = null;
        if (preg_match('/((18|19|20)\d{2})(– )?\)$/', $title, $matches) === 1) {
            $yearPattern = '/\b' . $matches[1] . '\b/';
        }

        foreach ($sources as &$source) {
            $identity = 0;
            $quality = 0;

            // TODO: re-evaluate the alternate identity method (strip the year and
            // everything after it from the source name before comparing) with more
            // data or drop it entirely

            // Identity mostly is matching title in source name
            $cleanSource = mb_substr($this->cleanName($source['name']), 0, $titleLength);
            similar_text($cleanTitle, $cleanSource, $identity);

            // If the name contains the year of the movie, boost identity
            if ($yearPattern !== null && preg_match($yearPattern, $source['name'])) {
                $identity += 20;
            }

            // Apply all regexp based quality rules
            foreach ($rules as $pattern => $score) {
                if (preg_match($pattern, $source['name'])) {
                    $quality += $score;
                }
            }

            // File should be at the very minimum +500MB
            if ($source['size'] > 500 * 1024 * 1024) {
                $quality += 10;
            }

            $source['identity'] = $identity;
            $source['quality'] = $quality;
            $source['score'] = $identity >= 100 && $quality > 80 ? 2 * $identity + $quality : 0;
        }

        // Break the reference, so the last element cannot be altered by accident
        unset($source);

        // Sort by score, then seeds, then leech
        usort($sources, function ($source, $other) {
            foreach (['score', 'seeds', 'leech'] as $key) {
                if ($other[$key] != $source[$key]) {
                    // Scores are floats: returning their difference would be truncated
                    // to an integer, making scores less than 1 apart compare as equal
                    return $other[$key] > $source[$key] ? 1 : -1;
                }
            }

            return strcmp($other['link'], $source['link']);
        });

        return $sources;
    }
}
|
257
|
|
|
|