SearchEngine::parse() - Code Metrics - PowerKiKi/mqueue - Measure and Improve Code Quality continuously with Scrutinizer

SearchEngine::parse() A
last analyzed 2021-05-27 05:44 UTC

↳ Parent: SearchEngine

Complexity

Conditions	4
Paths	4

Size

Total Lines	18
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	20

Importance

Changes

Metric	Value
eloc	12
c	0
b	0
f	0
dl	0
loc	18
rs	9.8666
ccs	0
cts	17
cp	0
cc	4
nc	4
nop	1
crap	20

<?php

namespace mQueue\Service;

/**
 * Search Engine to find sources for movies, given its title.
 * It relies on Nova to query several popular websites.
 */
class SearchEngine
{
    /**
     * Returns the command for the appropriate version of Nova
     *
     * The installed Python version is detected by running `python --version`.
     * When no version can be detected (Python missing, unexpected output),
     * the legacy Nova for Python 2 is used.
     *
     * @return string
     */
    protected function getNovaCmd()
    {
        $output = (string) shell_exec('python --version 2>&1');

        // Only read the match when there actually is one
        $isPython3 = preg_match('/\\d+\\.\\d+\\.\\d+/', $output, $matches) === 1
            && version_compare($matches[0], '3.0.0', '>=');
        $novaDirectory = $isPython3 ? 'nova3' : 'nova';

        return 'python ' . APPLICATION_PATH . '/../library/searchengine/' . $novaDirectory . '/nova2.py';
    }

    /**
     * Execute a shell command with a timeout
     *
     * The process is polled once per second. When the timeout is exceeded,
     * SIGKILL is sent to the whole process group, then to the process itself.
     *
     * @param string $cmd shell command
     * @param int $timeout seconds after which the process will be killed
     *
     * @return string the stdout of the command, empty string if the process could not be started
     */
    protected function execute($cmd, $timeout)
    {
        $maximumTime = time() + $timeout;
        $stdout = '';

        $pipes = [];
        $process = proc_open(
            $cmd, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipes
        );

        if (!is_resource($process)) {
            return $stdout;
        }

        // Give group id to process (to later kill all its children)
        $status = proc_get_status($process);
        posix_setpgid($status['pid'], $status['pid']);

        // We never write to the process, and we must never block when reading from it
        fclose($pipes[0]);
        stream_set_blocking($pipes[1], false);
        stream_set_blocking($pipes[2], false);

        while (true) {
            $stdout .= stream_get_contents($pipes[1]);

            // Drain stderr so the process can never block on a full pipe
            stream_get_contents($pipes[2]);

            $status = proc_get_status($process);
            if (!$status['running']) {
                // Collect whatever was written between the last read and the exit
                $stdout .= stream_get_contents($pipes[1]);

                fclose($pipes[1]);
                fclose($pipes[2]);
                proc_close($process);

                break;
            }

            if (time() > $maximumTime) {
                // sends SIGKILL to all processes inside group
                posix_kill(-$status['pid'], 9);

                proc_terminate($process, 9);
            }

            // 1 second will not make accurate timeout, but we don't really need accuracy
            sleep(1);
        }

        return $stdout;
    }

    /**
     * Parse string content and return an array of unique sources
     *
     * Each line is expected to hold exactly 7 values separated by `|`,
     * any other line is ignored. Two sources with the same name and size
     * are considered duplicates and only the first one is kept.
     *
     * @param string $content
     *
     * @return array sources indexed by their link
     */
    public function parse($content)
    {
        $data = [];
        $keys = ['link', 'name', 'size', 'seeds', 'leech', 'engine_url', 'page'];
        $duplicates = [];

        // Accept any kind of line ending, whatever the platform we are running on
        foreach (preg_split('/\r\n|\r|\n/', trim((string) $content)) as $line) {
            $values = explode('|', trim($line));
            if (count($keys) !== count($values)) {
                continue;
            }

            $source = array_combine($keys, $values);

            // Values cannot contain `|`, so it is an unambiguous separator
            $duplicateKey = $source['name'] . '|' . $source['size'];
            if (!array_key_exists($duplicateKey, $duplicates)) {
                $data[$source['link']] = $source;
                $duplicates[$duplicateKey] = true;
            }
        }

        return $data;
    }

    /**
     * Search for the given title and return an array of sources
     *
     * The command is printed on standard output and the raw result of the
     * search is dumped in a file in the temporary directory.
     *
     * @param string $title
     *
     * @return array sources
     */
    public function search($title)
    {
        $cmd = $this->getNovaCmd() . ' all movies ' . escapeshellarg(str_replace(' ', '+', $title)) . ' 2>&1';
        echo $cmd . PHP_EOL;
        $content = $this->execute($cmd, 5 * 60); // 5 minutes to search

        // Title may contain directory separators (eg: "Face/Off"), which must not end up in the file name
        $path = sys_get_temp_dir() . '/mqueue_' . str_replace(['/', '\\', "\0"], '_', $title);
        file_put_contents($path, $content);

        return $this->parse($content);
    }

    /**
     * Returns the given name simplified as much as possible
     *
     * The result is lowercase, made of alphanumeric words separated by a single space.
     *
     * @param string $name
     *
     * @return string
     */
    protected function cleanName($name)
    {
        // Insert space before uppercase letters
        $name = preg_replace('/([A-Z])/', ' \1', $name);
        $name = mb_strtolower($name);

        // Get rid of all accents, but keep the name as is if conversion is not possible
        $ascii = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $name);
        if ($ascii !== false) {
            $name = $ascii;
        }

        // Replace special character with common representation
        $name = str_replace('&', 'and', $name);

        // Remove all grouped things (may include team name at the beginning of name)
        // Or remove incomplete grouped things at the end of string
        $name = preg_replace('/\(.*(\)|$)/U', ' ', $name);
        $name = preg_replace('/\[.*(\]|$)/U', ' ', $name);
        $name = preg_replace('/\{.*(\}|$)/U', ' ', $name);

        // Keep only alphanum character
        $name = preg_replace('/[^[:alnum:]]/', ' ', $name);

        // Keep only a single space character
        $name = preg_replace('/[[:space:]]+/', ' ', $name);
        $name = trim($name);

        return $name;
    }

    /**
     * Compute scores of all sources according to the title searched
     *
     * Each source gets three additional keys:
     *  - `identity`: how much the source name looks like the title
     *  - `quality`: sum of all quality rules matching the source name
     *  - `score`: combination of both, or 0 if either one is not good enough
     *
     * @param string $title title, optionally ending with the year between parentheses, eg: "Foo (2010)"
     * @param array $sources
     *
     * @return array sources sorted by score, then seeds, then leech, all descending
     */
    public function computeScores($title, array $sources)
    {
        $rules = [
            // @see http://en.wikipedia.org/wiki/Pirated_movie_release_types
            '/\b(dvdrip|bdrip|brrip|blu-ray|bluray|bdr|bd5|bd9)\b/i' => 80, // Good sources
            '/\b(dvdr|dvd-full|full-rip|iso|dvd-5|dvd-9)\b/i' => 40, // Ok sources
            '/\b(dsr|dsrip|dthrip|dvbrip|hdtv|pdtv|tvrip|hdtvrip|vodrip|vodr)\b/i' => 10, // Soso sources
            '/\b(cam|camrip|ts|telesync|pdvd|wp|workprint|tc|telecine|ppv|ppvrip|scr|screener|dvdscr|dvdscreener|bdscr|ddc|r5)\b/i' => -20, // Low quality or unsure sources
            '/\b(maxspeed|axxo|dimension|fxg)\b/i' => 50, // Well known teams
            '/\b(swesub|nlt-release)\b/i' => -30, // Avoid teams specialized in foreign versions
            '/\b(french|fra|truefrench|italian|ita|russian|german)\b/i' => -30, // Avoid dubbed language
            '/\b1080p\b/i' => 20,
            '/\b720p\b/i' => 30, // Favor 720p instead of 1080p because of the filesize and better "compatibility" for low powered computer
            '/\b(x264|xvid)\b/i' => 20, // Good formats
            '/\b(uncut|unrated|extended|director\'s cut|director cut)\b/i' => 20, // Director's cut version is supposedly better
        ];

        $cleanTitle = $this->cleanName($title);

        // Without a year in the title there is nothing to look for in source names.
        // An empty year would give a pattern matching almost any name.
        $yearPattern = null;
        if (preg_match('/((18|19|20)\d{2})(– )?\)$/', $title, $matches) === 1) {
            $yearPattern = '/\b' . $matches[1] . '\b/';
        }

        // TODO: re-evaluate an alternate identity method (strip year and everything after it from source name) or drop the idea
        foreach ($sources as &$source) {
            $identity = 0;
            $quality = 0;

            // Identity mostly is matching title in source name
            $cleanSource = mb_substr($this->cleanName($source['name']), 0, mb_strlen($cleanTitle));
            similar_text($cleanTitle, $cleanSource, $identity);

            // If the name contains the year of the movie, boost identity
            if ($yearPattern !== null && preg_match($yearPattern, $source['name'])) {
                $identity += 20;
            }

            // Apply all regexp based quality rules
            foreach ($rules as $pattern => $score) {
                if (preg_match($pattern, $source['name'])) {
                    $quality += $score;
                }
            }

            // File should be at the very minimum +500MB
            if ($source['size'] > 500 * 1024 * 1024) {
                $quality += 10;
            }

            $source['identity'] = $identity;
            $source['quality'] = $quality;
            $source['score'] = $identity >= 100 && $quality > 80 ? 2 * $identity + $quality : 0;
        }

        // Break the reference, so that later use of the variable cannot alter the last source
        unset($source);

        // Sort by score, then seeds, then leech
        usort($sources, static function ($source, $other) {
            foreach (['score', 'seeds', 'leech'] as $criterion) {
                // Loose comparison on purpose, seeds and leech are usually numeric strings
                if ($other[$criterion] != $source[$criterion]) {
                    // Never return the difference itself: scores are floats and
                    // a difference smaller than 1 would be truncated to 0
                    return $other[$criterion] > $source[$criterion] ? 1 : -1;
                }
            }

            return strcmp($other['link'], $source['link']);
        });

        return $sources;
    }
}


1			<?php
2
3			namespace mQueue\Service;
4
5			/**
6			* Search Engine to find sources for movies, given its title.
7			* It relies on Nova to query several popular websites.
8			*/
9			class SearchEngine
10			{
11			/**
12			* Returns the command for the appropriate version of Nova
13			*
14			* @return string
15			*/
16			protected function getNovaCmd()
17			{
18			$cmd = shell_exec('python --version 2>&1');
19			preg_match('/\\d+\\.\\d+\\.\\d+/', $cmd, $matches);
20			$version = $matches[0];
21			if (version_compare($version, '3.0.0', '>=')) {
22			return 'python ' . APPLICATION_PATH . '/../library/searchengine/nova3/nova2.py';
23			}
24
25			return 'python ' . APPLICATION_PATH . '/../library/searchengine/nova/nova2.py';
26			}
27
28			/**
29			* Execute a shell command with a timeout
30			*
31			* @param string $cmd shell command
32			* @param int $timeout seconds after which the process will be killed
33			*
34			* @return string the stdout of the command
35			*/
36			protected function execute($cmd, $timeout)
37			{
38			$maximumTime = time() + $timeout;
39			$stdout = null;
40
41			$pipes = [];
42			$process = proc_open(
43			$cmd, [['pipe', 'r'], ['pipe', 'w'], ['pipe', 'w']], $pipes
44			);
45
46			if (is_resource($process)) {
47			// Give group id to process (to later kill all its children)
48			$status = proc_get_status($process);
49			posix_setpgid($status['pid'], $status['pid']);
50
51			stream_set_blocking($pipes[0], 0);
52			stream_set_blocking($pipes[1], 0);
53			stream_set_blocking($pipes[2], 0);
54			fclose($pipes[0]);
55			}
56
57			while (is_resource($process)) {
58			$stdout .= stream_get_contents($pipes[1]);
59
60			if (time() > $maximumTime) {
61			// sends SIGKILL to all processes inside group
62			posix_kill(-$status['pid'], 9);
			0 ignored issues – show Comprehensibility Best Practice introduced 2019-06-13 09:00 UTC by Report Bug Copy Issue Report The variable `$status` does not seem to be defined for all execution paths leading up to this point. Loading history...
63			proc_terminate($process, 9);
64			}
65
66			$status = proc_get_status($process);
67			if (!$status['running']) {
68			fclose($pipes[1]);
69			fclose($pipes[2]);
70			proc_close($process);
71			}
72
73			// 1 second will not make accurate timeout, but we don't really need accuracy
74			sleep(1);
75			}
76
77			return $stdout;
78			}
79
80			/**
81			* Parse string content and return an array of unique sources
82			*
83			* @param string $content
84			*
85			* @return array
86			*/
87			public function parse($content)
88			{
89			$data = [];
90			$keys = ['link', 'name', 'size', 'seeds', 'leech', 'engine_url', 'page'];
91			$duplicates = [];
92			foreach (explode(PHP_EOL, trim($content)) as $line) {
93			$values = explode('|', trim($line));
94			if (count($keys) == count($values)) {
95			$source = array_combine($keys, $values);
96			$duplicateKey = $source['name'] . $source['size'];
97			if (!array_key_exists($duplicateKey, $duplicates)) {
98			$data[$source['link']] = $source;
99			$duplicates[$duplicateKey] = true;
100			}
101			}
102			}
103
104			return $data;
105			}
106
107			/**
108			* Search for the given title and return an array of sources
109			*
110			* @param string $title
111			*
112			* @return array sources
113			*/
114			public function search($title)
115			{
116			$cmd = $this->getNovaCmd() . ' all movies ' . escapeshellarg(str_replace(' ', '+', $title)) . ' 2>&1';
117			echo $cmd . PHP_EOL;
118			$content = $this->execute($cmd, 5 * 60); // 5 minutes to search
119
120			$path = sys_get_temp_dir() . '/mqueue_' . $title;
121			file_put_contents($path, $content);
122
123			return $this->parse($content);
124			}
125
126			/**
127			* Returns the given name simplified as much as possible
128			*
129			* @param string $name
130			*
131			* @return string
132			*/
133			protected function cleanName($name)
134			{
135			// Insert space before uppercase letters
136			$name = preg_replace('/([A-Z])/', ' \1', $name);
137			$name = mb_strtolower($name);
138
139			// Get rid of all accents
140			$name = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $name);
141
142			// Replace special character with common representation
143			$name = str_replace('&', 'and', $name);
144
145			// Remove all grouped things (may include team name at the beginning of name)
146			// Or remove incomplete grouped things at the end of string
147			$name = preg_replace('/\(.*(\)|$)/U', ' ', $name);
148			$name = preg_replace('/\[.*(\]|$)/U', ' ', $name);
149			$name = preg_replace('/\{.*(\}|$)/U', ' ', $name);
150
151			// Keep only alphanum character
152			$name = preg_replace('/[^[:alnum:]]/', ' ', $name);
153
154			// Keep only a single space character
155			$name = preg_replace('/[[:space:]]+/', ' ', $name);
156			$name = trim($name);
157
158			return $name;
159			}
160
161			/**
162			* Compute scores of all sources according to the title searched
163			*
164			* @param string $title
165			* @param array $sources
166			*
167			* @return array
168			*/
169			public function computeScores($title, array $sources)
170			{
171			$rules = [
172			// @see http://en.wikipedia.org/wiki/Pirated_movie_release_types
173			'/\b(dvdrip|bdrip|brrip|blu-ray|bluray|bdr|bd5|bd9)\b/i' => 80, // Good sources
174			'/\b(dvdr|dvd-full|full-rip|iso|dvd-5|dvd-9)\b/i' => 40, // Ok sources
175			'/\b(dsr|dsrip|dthrip|dvbrip|hdtv|pdtv|tvrip|hdtvrip|vodrip|vodr)\b/i' => 10, // Soso sources
176			'/\b(cam|camrip|ts|telesync|pdvd|wp|workprint|tc|telecine|ppv|ppvrip|scr|screener|dvdscr|dvdscreener|bdscr|ddc|r5)\b/i' => -20, // Low quality or unsure sources
177			'/\b(maxspeed|axxo|dimension|fxg)\b/i' => 50, // Well known teams
178			'/\b(swesub|nlt-release)\b/i' => -30, // Avoid teams specialized in foreign versions
179			'/\b(french|fra|truefrench|italian|ita|russian|german)\b/i' => -30, // Avoid dubbed language
180			'/\b1080p\b/i' => 20,
181			'/\b720p\b/i' => 30, // Favor 720p instead of 1080p because of the filesize and better "compatibility" for low powered computer
182			'/\b(x264|xvid)\b/i' => 20, // Good formats
183			'/\b(uncut|unrated|extended|director\'s cut|director cut)\b/i' => 20, // Director's cut version is supposedly better
184			];
185
186			$cleanTitle = $this->cleanName($title);
187			preg_match('/((18|19|20)\d{2})(– )?\)$/', $title, $matches);
188			$year = $matches[1];
189
190			foreach ($sources as &$source) {
191			$identity = 0;
192			$quality = 0;
193
194			// TODO: re-evaluate the alternate identity method with more data or drop it entirely
195			// $yearPattern = '/(\D)' . $year . '\D.*$/';
196			// if (preg_match($yearPattern, $source['name']))
197			// {
198			// $sourceWithoutYear = trim(preg_replace($yearPattern, '\1', $source['name']));
199			// $cleanSource = $this->cleanName($sourceWithoutYear);
200			//
201			// // Boost identity because we found the year
202			// $identity += 20;
203			// }
204			// else
205			// {
206			//
207			// $length = strlen($cleanTitle);
208			// $pattern = '/^(.{0,' . $length . '}\w*)/';
209			// preg_match($pattern, $this->cleanName($source['name']), $m);
210			// $cleanSource = $m[1];
211			// }
212			// v($cleanTitle, $year, $source['name'], $sourceWithoutYear, $cleanSource, $identity);
213			// Identity mostly is matching title in source name
214			$cleanSource = mb_substr($this->cleanName($source['name']), 0, mb_strlen($cleanTitle));
215			similar_text($cleanTitle, $cleanSource, $identity);
216
217			// If the name contains the year of the movie, boost identity
218			if (preg_match("/\b$year\b/", $source['name'])) {
219			$identity += 20;
220			}
221
222			// Apply all regexp based quality rules
223			foreach ($rules as $pattern => $score) {
224			if (preg_match($pattern, $source['name'], $matches)) {
225			$quality += $score;
226			}
227			}
228
229			// File should be at the very minimum +500MB
230			if ($source['size'] > 500 * 1024 * 1024) {
231			$quality += 10;
232			}
233
234			$source['identity'] = $identity;
235			$source['quality'] = $quality;
236			$source['score'] = $identity >= 100 && $quality > 80 ? 2 * $identity + $quality : 0;
237			}
238
239			// Sort by score, then seeds, then leech
240			usort($sources, function ($source, $other) {
241			if ($other['score'] != $source['score']) {
242			return $other['score'] - $source['score'];
243			}
244			if ($other['seeds'] != $source['seeds']) {
245			return $other['seeds'] - $source['seeds'];
246			}
247			if ($other['leech'] != $source['leech']) {
248			return $other['leech'] - $source['leech'];
249			}
250
251			return strcmp($other['link'], $source['link']);
252			});
253
254			return $sources;
255			}
256			}
257

PowerKiKi / mqueue

SearchEngine::parse() A last analyzed 2021-05-27 05:44 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

SearchEngine::parse() A
last analyzed 2021-05-27 05:44 UTC