1 | <?php |
||
2 | |||
3 | namespace mQueue\Service; |
||
4 | |||
/**
 * Search Engine to find sources for movies, given its title.
 * It relies on Nova to query several popular websites.
 */
class SearchEngine
{
    /**
     * Returns the command for the appropriate version of Nova
     *
     * The interpreter version is read from the output of `python --version`.
     * When that output is missing or contains no x.y.z version number, the
     * legacy (non-3) Nova script is used.
     *
     * @return string
     */
    protected function getNovaCmd()
    {
        $novaDirectory = 'nova';

        $output = shell_exec('python --version 2>&1');
        if (is_string($output)
            && preg_match('/\\d+\\.\\d+\\.\\d+/', $output, $matches)
            && version_compare($matches[0], '3.0.0', '>=')
        ) {
            $novaDirectory = 'nova3';
        }

        return 'python ' . APPLICATION_PATH . '/../library/searchengine/' . $novaDirectory . '/nova2.py';
    }

    /**
     * Execute a shell command with a timeout
     *
     * @param string $cmd shell command
     * @param int $timeout seconds after which the process will be killed
     *
     * @return string the stdout of the command (empty string if the process could not be started)
     */
    protected function execute($cmd, $timeout)
    {
        $maximumTime = time() + $timeout;
        $stdout = '';

        $pipes = [];
        $process = proc_open(
            $cmd,
            [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']],
            $pipes
        );

        if (!is_resource($process)) {
            return $stdout;
        }

        $status = proc_get_status($process);
        $pid = $status['pid'];

        // Give group id to process (to later kill all its children)
        $canSignalGroup = function_exists('posix_setpgid') && function_exists('posix_kill');
        if ($canSignalGroup) {
            posix_setpgid($pid, $pid);
        }

        // The command gets no input; output pipes are polled without blocking
        fclose($pipes[0]);
        stream_set_blocking($pipes[1], false);
        stream_set_blocking($pipes[2], false);

        while (true) {
            $stdout .= stream_get_contents($pipes[1]);

            // Drain and discard stderr so a full pipe can never block the child
            stream_get_contents($pipes[2]);

            if (time() > $maximumTime) {
                if ($canSignalGroup) {
                    // sends SIGKILL to all processes inside group
                    posix_kill(-$pid, 9);
                }
                proc_terminate($process, 9);
            }

            $status = proc_get_status($process);
            if (!$status['running']) {
                // Collect whatever was written between the last read and the exit
                $stdout .= stream_get_contents($pipes[1]);
                break;
            }

            // Polling does not make an accurate timeout, but we don't really need accuracy
            usleep(100000);
        }

        fclose($pipes[1]);
        fclose($pipes[2]);
        proc_close($process);

        return $stdout;
    }

    /**
     * Parse string content and return an array of unique sources
     *
     * Each valid line holds exactly seven pipe-separated fields. Lines with a
     * different field count are ignored. Two lines with the same name and size
     * are considered duplicates and only the first one is kept.
     *
     * @param string $content
     *
     * @return array sources indexed by their link
     */
    public function parse($content)
    {
        $data = [];
        $keys = ['link', 'name', 'size', 'seeds', 'leech', 'engine_url', 'page'];
        $expectedCount = count($keys);
        $duplicates = [];

        // Accept any line ending, whatever the platform PHP runs on
        $lines = preg_split('/\r\n|\r|\n/', trim((string) $content));
        foreach ($lines as $line) {
            $values = explode('|', trim($line));
            if (count($values) !== $expectedCount) {
                continue;
            }

            $source = array_combine($keys, $values);

            // Fields cannot contain '|', so it is an unambiguous separator
            $duplicateKey = $source['name'] . '|' . $source['size'];
            if (isset($duplicates[$duplicateKey])) {
                continue;
            }

            $data[$source['link']] = $source;
            $duplicates[$duplicateKey] = true;
        }

        return $data;
    }

    /**
     * Search for the given title and return an array of sources
     *
     * The command is echoed, and the raw output is written to a file in the
     * system temporary directory, before being parsed.
     *
     * @param string $title
     *
     * @return array sources
     */
    public function search($title)
    {
        $cmd = $this->getNovaCmd() . ' all movies ' . escapeshellarg(str_replace(' ', '+', $title)) . ' 2>&1';
        echo $cmd . PHP_EOL;
        $content = $this->execute($cmd, 5 * 60); // 5 minutes to search

        file_put_contents($this->getDumpPath($title), $content);

        return $this->parse($content);
    }

    /**
     * Returns the path of the file receiving the raw search output for a title
     *
     * Directory separators and NUL bytes are replaced, so a title such as
     * "Face/Off" cannot point outside of the temporary directory.
     *
     * @param string $title
     *
     * @return string
     */
    private function getDumpPath($title)
    {
        $safeTitle = str_replace(['/', '\\', "\0"], '_', $title);

        return sys_get_temp_dir() . '/mqueue_' . $safeTitle;
    }

    /**
     * Returns the given name simplified as much as possible
     *
     * @param string $name
     *
     * @return string lowercase alphanumeric words separated by single spaces
     */
    protected function cleanName($name)
    {
        // Insert space before uppercase letters
        $name = preg_replace('/([A-Z])/', ' \1', $name);
        $name = mb_strtolower($name);

        // Get rid of all accents. iconv() may fail and return false, in which case
        // the name is kept as is (non-ASCII bytes are dropped by the alnum filter below)
        $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $name);
        if ($ascii !== false) {
            $name = $ascii;
        }

        // Replace special character with common representation
        $name = str_replace('&', 'and', $name);

        // Remove all grouped things (may include team name at the beginning of name)
        // Or remove incomplete grouped things at the end of string
        $name = preg_replace('/\(.*(\)|$)/U', ' ', $name);
        $name = preg_replace('/\[.*(\]|$)/U', ' ', $name);
        $name = preg_replace('/\{.*(\}|$)/U', ' ', $name);

        // Keep only alphanum character
        $name = preg_replace('/[^[:alnum:]]/', ' ', $name);

        // Keep only a single space character
        $name = preg_replace('/[[:space:]]+/', ' ', $name);

        return trim($name);
    }

    /**
     * Compute scores of all sources according to the title searched
     *
     * Each source receives three extra keys:
     *  - identity: similarity (in percent) between the cleaned title and the start
     *    of the cleaned source name, plus 20 if the name contains the movie year
     *  - quality: sum of all matching quality rules
     *  - score: 2 * identity + quality if identity >= 100 and quality > 80, else 0
     *
     * The year is taken from a title ending with "(YYYY)" or "(YYYY– )".
     *
     * @param string $title
     * @param array $sources
     *
     * @return array sources sorted by descending score, then seeds, then leech
     */
    public function computeScores($title, array $sources)
    {
        $rules = [
            // @see http://en.wikipedia.org/wiki/Pirated_movie_release_types
            '/\b(dvdrip|bdrip|brrip|blu-ray|bluray|bdr|bd5|bd9)\b/i' => 80, // Good sources
            '/\b(dvdr|dvd-full|full-rip|iso|dvd-5|dvd-9)\b/i' => 40, // Ok sources
            '/\b(dsr|dsrip|dthrip|dvbrip|hdtv|pdtv|tvrip|hdtvrip|vodrip|vodr)\b/i' => 10, // Soso sources
            '/\b(cam|camrip|ts|telesync|pdvd|wp|workprint|tc|telecine|ppv|ppvrip|scr|screener|dvdscr|dvdscreener|bdscr|ddc|r5)\b/i' => -20, // Low quality or unsure sources
            '/\b(maxspeed|axxo|dimension|fxg)\b/i' => 50, // Well known teams
            '/\b(swesub|nlt-release)\b/i' => -30, // Avoid teams specialized in foreign versions
            '/\b(french|fra|truefrench|italian|ita|russian|german)\b/i' => -30, // Avoid dubbed language
            '/\b1080p\b/i' => 20,
            '/\b720p\b/i' => 30, // Favor 720p instead of 1080p because of the filesize and better "compatibility" for low powered computer
            '/\b(x264|xvid)\b/i' => 20, // Good formats
            '/\b(uncut|unrated|extended|director\'s cut|director cut)\b/i' => 20, // Director's cut version is supposedly better
        ];

        $cleanTitle = $this->cleanName($title);
        $titleLength = mb_strlen($cleanTitle);

        // Without a year in the title there is nothing to look for in source names
        $yearPattern = null;
        if (preg_match('/((18|19|20)\d{2})(– )?\)$/', $title, $matches)) {
            $yearPattern = '/\b' . $matches[1] . '\b/';
        }

        foreach ($sources as &$source) {
            $identity = 0;
            $quality = 0;

            // TODO: re-evaluate the alternate identity method with more data or drop it entirely
            // (strip the year and everything after it from the source name before cleaning it,
            // instead of truncating the cleaned name to the length of the title)

            // Identity mostly is matching title in source name
            $cleanSource = mb_substr($this->cleanName($source['name']), 0, $titleLength);
            similar_text($cleanTitle, $cleanSource, $identity);

            // If the name contains the year of the movie, boost identity
            if ($yearPattern !== null && preg_match($yearPattern, $source['name'])) {
                $identity += 20;
            }

            // Apply all regexp based quality rules
            foreach ($rules as $pattern => $score) {
                if (preg_match($pattern, $source['name'])) {
                    $quality += $score;
                }
            }

            // File should be at the very minimum +500MB
            if ((float) $source['size'] > 500 * 1024 * 1024) {
                $quality += 10;
            }

            $source['identity'] = $identity;
            $source['quality'] = $quality;
            $source['score'] = $identity >= 100 && $quality > 80 ? 2 * $identity + $quality : 0;
        }
        // Break the reference, so later writes to $source cannot alter the last element
        unset($source);

        // Sort by score, then seeds, then leech
        usort($sources, static function ($source, $other) {
            foreach (['score', 'seeds', 'leech'] as $field) {
                $mine = (float) $source[$field];
                $theirs = (float) $other[$field];

                // Return a real integer: a float difference would be truncated
                // and scores less than 1 apart would be seen as equal
                if ($mine != $theirs) {
                    return $mine < $theirs ? 1 : -1;
                }
            }

            return strcmp($other['link'], $source['link']);
        });

        return $sources;
    }
}
||
257 |