titlesearch.bakaupdates.bakaupdates.BakaUpdates.get_similar_titles() - Code Metrics - DaRealFreak/TitleSearch - Measure and Improve Code Quality continuously with Scrutinizer

BakaUpdates.get_similar_titles() B
last analyzed 2018-04-07 14:58 UTC

↳ Parent: titlesearch.bakaupdates.bakaupdates

Complexity

Conditions

Size

Total Lines	32
Code Lines	19

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
eloc	19
nop	1
dl	0
loc	32
rs	8.5806
c	0
b	0
f	0

#!/usr/local/bin/python
# NOTE(review): this stub matches the docstring example quoted in the analyzer's
# "This module should have a docstring" hint further down the page; it looks like
# page residue rather than part of the module -- confirm before relying on it.
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
# coding: utf-8

import re
from typing import Tuple

import jellyfish
import requests
from bs4 import BeautifulSoup as Soup

from titlesearch.language.detection import matches_language
from titlesearch.language.language_settings import *



class BakaUpdates(object):
    """Scraper for alternative language titles listed on https://www.mangaupdates.com

    All methods are static, the class only bundles the search URL, the known
    languages and the scraping helpers.
    """

    SEARCH_URL = 'https://www.mangaupdates.com/series.html'
    KNOWN_LANGUAGES = [English, Japanese, Korean]

    # keywords added to titles by BakaUpdates, clean_title() removes them again
    ADDED_KEYWORDS = [' (Novel)']

    # seconds to wait for the site; without a timeout requests may block indefinitely
    REQUEST_TIMEOUT = 30

    @staticmethod
    def get_similar_titles(title: str) -> list:
        """Search BakaUpdates for the given title and rank the hits by similarity

        :type title: str
        :return: list of dicts with the keys 'title', 'link' and 'similarity',
            sorted by descending similarity to the searched title
        """
        payload = {
            'stype': 'title',
            'search': title
        }

        response = requests.get(url=BakaUpdates.SEARCH_URL, params=payload,
                                timeout=BakaUpdates.REQUEST_TIMEOUT)
        soup = Soup(response.text, 'html.parser')

        # NOTE(review): newer jellyfish releases name this function jaro_similarity,
        # older ones only ship jaro_distance; use whichever is available
        similarity = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance

        lowered_title = title.lower()
        seen_titles = set()
        results = []
        for cell in soup.find_all('td', attrs={"class": "text pad col1"}):
            search_result = BakaUpdates.clean_title(cell.text)
            # The search returns titles without the added keywords first, so a later hit with
            # the same cleaned title (most likely the novel) is skipped to prevent duplicates.
            if search_result in seen_titles:
                continue

            anchor = cell.find_next('a', href=True)
            if anchor is None:
                # no link follows this cell, so there is no page the hit could point to
                continue

            seen_titles.add(search_result)
            results.append({
                'title': search_result,
                'link': anchor['href'],
                'similarity': similarity(search_result.lower(), lowered_title)
            })

        results.sort(key=lambda item: item['similarity'], reverse=True)
        return results

    @staticmethod
    def get_alternative_titles(title: str = '', link: str = '') -> dict:
        """Get the alternative titles grouped by language, preferring link over title

        If no link is passed the best search hit for the title is used. Without a
        link and without any search hit only the passed title gets grouped.

        :type title: str
        :type link: str
        :return: grouped titles as returned by group_titles()
        """
        if not link:
            similar_titles = BakaUpdates.get_similar_titles(title) if title else []
            if not similar_titles:
                # nothing to fetch: neither a link nor a usable search result
                return BakaUpdates.group_titles(title, [])
            link = similar_titles[0]['link']

        response = requests.get(url=link, timeout=BakaUpdates.REQUEST_TIMEOUT)
        release_title, alternative_titles = BakaUpdates.extract_titles(response.text)
        return BakaUpdates.group_titles(release_title, alternative_titles)

    @staticmethod
    def extract_titles(html_content: str) -> Tuple[str, list]:
        """Extract the release title and the associated names from the HTML DOM tree

        :type html_content: str
        :return: tuple of the release title and the list of cleaned alternative titles,
            the list is empty if the page has no "Associated Names" section
        :raises AttributeError: if the page contains no release title element
        """
        # html.parser can't handle <br> tags instead of <br/> tags and will append all
        # titles as child to the previous title, html5lib is slower but works properly
        soup = Soup(html_content, 'html5lib')

        release_title = soup.find('span', attrs={'class': ['releasestitle', 'tabletitle']}).text

        associated_names_tag = soup.find('b', string=re.compile("Associated Names"))
        if associated_names_tag is None:
            return release_title, []

        names_container = associated_names_tag.parent.find_next_sibling(
            'div', attrs={'class': 'sContent'})
        if names_container is None:
            return release_title, []

        # the names are separated by <br> tags, turn them into newlines to split the text on
        for line_break in names_container.find_all("br"):
            line_break.replace_with("\n")

        alternative_titles = [BakaUpdates.clean_title(name)
                              for name in names_container.text.split('\n')
                              if name.strip()]

        return release_title, alternative_titles

    @staticmethod
    def group_titles(release_title: str, alternative_titles: list) -> dict:
        """Iterate through the known languages and group the titles by detected language

        :type release_title: str
        :type alternative_titles: list
        :return: dict mapping the lower-cased name of each known language to its titles
        """
        grouped_titles = {}
        for language in BakaUpdates.KNOWN_LANGUAGES:
            grouped_titles[language.__name__.lower()] = []

        # the release title is always filed under english
        grouped_titles['english'] = [release_title]

        for title in alternative_titles:
            # every language is checked, a title matching several languages is added to each
            for language in BakaUpdates.KNOWN_LANGUAGES:
                language_titles = grouped_titles[language.__name__.lower()]
                if matches_language(title, language) and title not in language_titles:
                    language_titles.append(title)

        return grouped_titles

    @staticmethod
    def clean_title(title: str) -> str:
        """Strip surrounding whitespace and remove the keywords added by BakaUpdates

        :type title: str
        :return: the cleaned title
        """
        title = title.strip()
        for keyword in BakaUpdates.ADDED_KEYWORDS:
            title = title.replace(keyword, '')
        return title


1		#!/usr/local/bin/python
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report This module should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods: class SomeClass: def some_method(self): """Do x and return foo.""" If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions. Loading history...
2		# coding: utf-8
3
4		import re
5		from typing import Tuple
6
7		import jellyfish
8		import requests
9		from bs4 import BeautifulSoup as Soup
10
11		from titlesearch.language.detection import matches_language
12		from titlesearch.language.language_settings import *
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report The usage of wildcard imports like `titlesearch.language.language_settings` should generally be avoided. Loading history... Unused Code introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report `np` was imported with wildcard, but is not used. Loading history... Unused Code introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report `LanguageTemplate` was imported with wildcard, but is not used. Loading history...
13
14
15		class BakaUpdates(object):
		0 ignored issues – show Unused Code introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
16		"""Module for extracting alternative language titles for titles from https://www.mangaupdates.com"""
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (104/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
17
18		SEARCH_URL = 'https://www.mangaupdates.com/series.html'
19		KNOWN_LANGUAGES = [English, Japanese, Korean]
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report The variable `Korean` does not seem to be defined. Loading history... Comprehensibility Best Practice introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report The variable `English` does not seem to be defined. Loading history... Comprehensibility Best Practice introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report The variable `Japanese` does not seem to be defined. Loading history...
20		ADDED_KEYWORDS = [' (Novel)']
21
22		@staticmethod
23		def get_similar_titles(title: str) -> list:
24		"""Main function for extracting alternate titles
25
26		:type title: str
27		:return:
28		"""
29		payload = {
30		'stype': 'title',
31		'search': title
32		}
33
34		link = requests.get(url=BakaUpdates.SEARCH_URL, params=payload)
35		soup = Soup(link.text, 'html.parser')
36
37		seen_titles = []
38		results = []
39		for s in soup.find_all('td', attrs={"class": "text pad col1"}):
		0 ignored issues – show Coding Style Naming introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report The name `s` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
40		search_result = BakaUpdates.clean_title(s.text)
41		# I decided to add a seen titles list to prevent duplicate titles in the output.
42		# BakaUpdates search returns titles without the added keywords first, so I'll skip the second result
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (112/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
43		# which is most likely a novel
44		if search_result not in seen_titles:
45		results.append({
46		'title': search_result,
47		'link': s.find_next('a', href=True)['href'],
48		'similarity': jellyfish.jaro_distance(search_result.lower(), title.lower())
49		})
50		seen_titles.append(search_result)
51
52		results.sort(key=lambda item: item['similarity'], reverse=True)
53		return results
54
55		@staticmethod
56		def get_alternative_titles(title: str = '', link: str = '') -> dict:
57		"""Get alternative titles for the given title. Preferring link over title argument
58
59		:type title: str
60		:type link: str
61		:return:
62		"""
63		if title and not link:
64		link = BakaUpdates.get_similar_titles(title)
65		if link:
66		link = link[0]['link']
67		else:
68		return BakaUpdates.group_titles(title, [])
69
70		link = requests.get(url=link)
71		release_title, alternative_titles = BakaUpdates.extract_titles(link.text)
72		return BakaUpdates.group_titles(release_title, alternative_titles)
73
74		@staticmethod
75		def extract_titles(html_content: str) -> Tuple[str, list]:
76		"""Extract the titles from the HTML DOM tree
77
78		:type html_content: str
79		:return:
80		"""
81		# html.parser can't handle <br> tags instead of <br/> tags and will append all titles as child
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (102/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
82		# to the previous title, html5lib is slower but works properly
83		soup = Soup(html_content, 'html5lib')
84
85		release_title = soup.find('span', attrs={'class': ['releasestitle', 'tabletitle']}).text
86		associated_names_tag = soup.find('b', string=re.compile("Associated Names"))
87		alternative_titles = associated_names_tag.parent.find_next_sibling('div', attrs={'class': 'sContent'})
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (110/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
88		for br in alternative_titles.find_all("br"):
		0 ignored issues – show Coding Style Naming introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report The name `br` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
89		br.replace_with("\n")
90		alternative_titles = [BakaUpdates.clean_title(title) for title in alternative_titles.text.split('\n')
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (109/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
91		if title.strip()]
92
93		return release_title, alternative_titles
94
95	View Code Duplication	@staticmethod
		0 ignored issues – show Duplication introduced 2018-04-05 12:08 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
96		def group_titles(release_title: str, alternative_titles: list) -> dict:
97		"""Iterate through the supported languages and group the titles according to the detected languages
		0 ignored issues – show Coding Style introduced 2018-04-05 12:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (107/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
98
99		:type release_title: str
100		:type alternative_titles: str
101		:return:
102		"""
103		grouped_titles = {}
104		for language in BakaUpdates.KNOWN_LANGUAGES:
105		grouped_titles[language.__name__.lower()] = []
106
107		grouped_titles['english'] = [release_title]
108
109		for title in alternative_titles:
110		for language in BakaUpdates.KNOWN_LANGUAGES:
111		if matches_language(title, language) and title not in grouped_titles[language.__name__.lower()]:
		0 ignored issues – show Coding Style introduced 2018-04-05 10:05 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (112/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
112		grouped_titles[language.__name__.lower()].append(title)
113		continue
114
115		return grouped_titles
116
117		@staticmethod
118		def clean_title(title: str) -> str:
119		"""Strip title from leftover spaces and remove keywords added by BakaUpdates
120
121		:type title: str
122		:return:
123		"""
124		title = title.strip()
125		for keyword in BakaUpdates.ADDED_KEYWORDS:
126		title = title.replace(keyword, '')
127		return title
128

DaRealFreak / TitleSearch

BakaUpdates.get_similar_titles() B last analyzed 2018-04-07 14:58 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

BakaUpdates.get_similar_titles() B
last analyzed 2018-04-07 14:58 UTC