PowerKiKi /
mqueue
| 1 | <?php |
||
| 2 | |||
| 3 | namespace mQueue\Service; |
||
| 4 | |||
/**
 * Search engine to find sources for movies, given their title.
 *
 * It relies on Nova (Python scripts shipped in the library directory) to query
 * several popular websites, then parses and scores what Nova prints.
 */
class SearchEngine
{
    /**
     * Returns the command for the appropriate version of Nova
     *
     * The installed Python version is detected from `python --version`. When the
     * version cannot be determined (python missing, unexpected output), the
     * Python 2 flavour of Nova is used.
     *
     * @return string
     */
    protected function getNovaCmd()
    {
        // shell_exec() may return null when nothing is printed or the command fails
        $output = (string) shell_exec('python --version 2>&1');

        $isPython3 = preg_match('/\d+\.\d+\.\d+/', $output, $matches) === 1
            && version_compare($matches[0], '3.0.0', '>=');

        $flavour = $isPython3 ? 'nova3' : 'nova';

        return 'python ' . APPLICATION_PATH . '/../library/searchengine/' . $flavour . '/nova2.py';
    }

    /**
     * Execute a shell command with a timeout
     *
     * stdout is polled about once per second until the process exits. Once the
     * timeout is exceeded, the process (and its process group) is sent SIGKILL.
     *
     * @param string $cmd shell command
     * @param int $timeout seconds after which the process will be killed
     *
     * @return string the stdout of the command, empty string if the command could not be started
     */
    protected function execute($cmd, $timeout)
    {
        $stdout = '';

        $pipes = [];
        $process = proc_open(
            $cmd, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipes
        );

        if (!is_resource($process)) {
            return $stdout;
        }

        $maximumTime = time() + $timeout;

        // Give group id to process (to later kill all its children)
        // NOTE(review): this is best effort; setpgid may be refused once the child has already exec'ed
        $status = proc_get_status($process);
        $pid = $status['pid'];
        posix_setpgid($pid, $pid);

        // Nothing is ever sent to the command
        fclose($pipes[0]);
        stream_set_blocking($pipes[1], 0);
        stream_set_blocking($pipes[2], 0);

        while (true) {
            $stdout .= stream_get_contents($pipes[1]);

            // Drain stderr (content is discarded) so the child never blocks on a full pipe
            stream_get_contents($pipes[2]);

            $status = proc_get_status($process);
            if (!$status['running']) {
                break;
            }

            if (time() > $maximumTime) {
                // sends SIGKILL to all processes inside group
                posix_kill(-$pid, 9);
                proc_terminate($process, 9);
            }

            // 1 second will not make accurate timeout, but we don't really need accuracy
            sleep(1);
        }

        // Collect whatever was written between the last read and the process exit
        $stdout .= stream_get_contents($pipes[1]);

        fclose($pipes[1]);
        fclose($pipes[2]);
        proc_close($process);

        return $stdout;
    }

    /**
     * Parse string content and return an array of unique sources
     *
     * Each line is expected to hold 7 values separated by `|`; any other line
     * is ignored. Two sources with the same name and size are considered
     * duplicates, only the first one is kept.
     *
     * @param string $content
     *
     * @return array sources indexed by their link
     */
    public function parse($content)
    {
        $data = [];
        $keys = ['link', 'name', 'size', 'seeds', 'leech', 'engine_url', 'page'];
        $duplicates = [];

        // Accept any end-of-line convention, whatever platform PHP runs on
        $lines = preg_split('/\r\n|\r|\n/', trim((string) $content));
        foreach ($lines as $line) {
            $values = explode('|', trim($line));
            if (count($keys) !== count($values)) {
                continue;
            }

            $source = array_combine($keys, $values);

            // Values cannot contain '|', so it is an unambiguous separator for the key
            $duplicateKey = $source['name'] . '|' . $source['size'];
            if (!array_key_exists($duplicateKey, $duplicates)) {
                $data[$source['link']] = $source;
                $duplicates[$duplicateKey] = true;
            }
        }

        return $data;
    }

    /**
     * Search for the given title and return an array of sources
     *
     * The command is echoed, and the raw output of Nova is dumped in a file
     * named after the title in the system temporary directory.
     *
     * @param string $title
     *
     * @return array sources
     */
    public function search($title)
    {
        $cmd = $this->getNovaCmd() . ' all movies ' . escapeshellarg(str_replace(' ', '+', $title)) . ' 2>&1';
        echo $cmd . PHP_EOL;
        $content = $this->execute($cmd, 5 * 60); // 5 minutes to search

        // A title may contain directory separators (eg: "Face/Off"), which must not end up in the file name
        $fileName = 'mqueue_' . str_replace(['/', '\\', "\0"], '_', $title);
        file_put_contents(sys_get_temp_dir() . '/' . $fileName, $content);

        return $this->parse($content);
    }

    /**
     * Returns the given name simplified as much as possible
     *
     * The result is lowercase, without accents, without grouped things
     * (parentheses, brackets, braces), and made only of alphanumeric words
     * separated by single spaces.
     *
     * @param string $name
     *
     * @return string
     */
    protected function cleanName($name)
    {
        // Insert space before uppercase letters
        $name = preg_replace('/([A-Z])/', ' \1', $name);
        $name = mb_strtolower($name);

        // Get rid of all accents, but keep the name as is if the conversion fails
        $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $name);
        if ($ascii !== false) {
            $name = $ascii;
        }

        // Replace special character with common representation
        $name = str_replace('&', 'and', $name);

        // Remove all grouped things (may include team name at the beginning of name)
        // Or remove incomplete grouped things at the end of string
        $name = preg_replace('/\(.*(\)|$)/U', ' ', $name);
        $name = preg_replace('/\[.*(\]|$)/U', ' ', $name);
        $name = preg_replace('/\{.*(\}|$)/U', ' ', $name);

        // Keep only alphanum character
        $name = preg_replace('/[^[:alnum:]]/', ' ', $name);

        // Keep only a single space character
        $name = preg_replace('/[[:space:]]+/', ' ', $name);

        return trim($name);
    }

    /**
     * Compute scores of all sources according to the title searched
     *
     * Each source gets three additional keys:
     *  - identity: how much the beginning of the source name looks like the title (percentage, +20 if the year of the title is found in the name)
     *  - quality: sum of all quality rules matching the source name (+10 for files bigger than 500MB)
     *  - score: 2 * identity + quality, or 0 if identity or quality is not good enough
     *
     * @param string $title
     * @param array $sources
     *
     * @return array sources sorted by score, then seeds, then leech, then link (all descending)
     */
    public function computeScores($title, array $sources)
    {
        $rules = [
            // @see http://en.wikipedia.org/wiki/Pirated_movie_release_types
            '/\b(dvdrip|bdrip|brrip|blu-ray|bluray|bdr|bd5|bd9)\b/i' => 80, // Good sources
            '/\b(dvdr|dvd-full|full-rip|iso|dvd-5|dvd-9)\b/i' => 40, // Ok sources
            '/\b(dsr|dsrip|dthrip|dvbrip|hdtv|pdtv|tvrip|hdtvrip|vodrip|vodr)\b/i' => 10, // Soso sources
            '/\b(cam|camrip|ts|telesync|pdvd|wp|workprint|tc|telecine|ppv|ppvrip|scr|screener|dvdscr|dvdscreener|bdscr|ddc|r5)\b/i' => -20, // Low quality or unsure sources
            '/\b(maxspeed|axxo|dimension|fxg)\b/i' => 50, // Well known teams
            '/\b(swesub|nlt-release)\b/i' => -30, // Avoid teams specialized in foreign versions
            '/\b(french|fra|truefrench|italian|ita|russian|german)\b/i' => -30, // Avoid dubbed language
            '/\b1080p\b/i' => 20,
            '/\b720p\b/i' => 30, // Favor 720p instead of 1080p because of the filesize and better "compatibility" for low powered computer
            '/\b(x264|xvid)\b/i' => 20, // Good formats
            '/\b(uncut|unrated|extended|director\'s cut|director cut)\b/i' => 20, // Director's cut version is supposedly better
        ];

        $cleanTitle = $this->cleanName($title);
        $titleLength = mb_strlen($cleanTitle);

        // The year is only known when the title ends with something like "(1999)" or "(1999– )".
        // Without a year there must be no pattern at all, otherwise every name would match it.
        $yearPattern = null;
        if (preg_match('/((18|19|20)\d{2})(– )?\)$/', $title, $matches)) {
            $yearPattern = '/\b' . $matches[1] . '\b/';
        }

        // TODO: re-evaluate an alternate identity method (based on what precedes the year in the name) with more data
        foreach ($sources as &$source) {
            $identity = 0;
            $quality = 0;

            // Identity mostly is matching title in source name
            $cleanSource = mb_substr($this->cleanName($source['name']), 0, $titleLength);
            similar_text($cleanTitle, $cleanSource, $identity);

            // If the name contains the year of the movie, boost identity
            if ($yearPattern !== null && preg_match($yearPattern, $source['name'])) {
                $identity += 20;
            }

            // Apply all regexp based quality rules
            foreach ($rules as $pattern => $score) {
                if (preg_match($pattern, $source['name'])) {
                    $quality += $score;
                }
            }

            // File should be at the very minimum +500MB
            if ($source['size'] > 500 * 1024 * 1024) {
                $quality += 10;
            }

            $source['identity'] = $identity;
            $source['quality'] = $quality;
            $source['score'] = $identity >= 100 && $quality > 80 ? 2 * $identity + $quality : 0;
        }
        // Break the reference, so that later use of $source cannot alter the last element
        unset($source);

        // Sort by score, then seeds, then leech
        usort($sources, static function ($source, $other) {
            foreach (['score', 'seeds', 'leech'] as $key) {
                if ($other[$key] != $source[$key]) {
                    // Return an explicit integer: scores may be floats and a fractional
                    // difference would be truncated to 0 (meaning "equal") by usort()
                    return $other[$key] > $source[$key] ? 1 : -1;
                }
            }

            return strcmp($other['link'], $source['link']);
        });

        return $sources;
    }
}
||
| 257 |