| Total Complexity | 8 |
| Total Lines | 43 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
| 1 | #!/usr/local/bin/python |
||
|
|
|||
| 2 | # coding: utf-8 |
||
| 3 | |||
| 4 | import binascii |
||
| 5 | import re |
||
| 6 | from typing import Generator, Type |
||
| 7 | |||
| 8 | import numpy as np |
||
| 9 | |||
| 10 | from titlesearch.language import LanguageTemplate |
||
| 11 | |||
| 12 | |||
def extract_unicode_characters(string: str) -> Generator:
    """Yield the int code points of the non-Latin-1 BMP characters in a string.

    Characters are reported in order of appearance. Only code points in the
    range U+0100..U+FFFF are yielded; ASCII, Latin-1 (U+0000..U+00FF) and
    characters above U+FFFF are skipped. This is the same range the earlier
    escape-and-regex implementation could detect.

    The code points are read directly from the string instead of searching
    its ``unicode_escape`` encoding. The escaped form doubles literal
    backslashes, so plain ASCII text consisting of a backslash, the letter
    ``u`` and four hex digits used to be reported as a unicode character
    even though no such character was present.

    :param string: text to scan
    :type string: str
    :return: generator of int code points, one per matching character
    """
    for character in string:
        code_point = ord(character)
        # 0x100..0xFFFF is exactly what unicode_escape renders as a
        # four-hex-digit escape, i.e. what the previous regex matched.
        if 0x100 <= code_point <= 0xFFFF:
            yield code_point
||
| 23 | |||
| 24 | |||
def matches_language(title: str, language: Type[LanguageTemplate]) -> bool:
    """Check whether the unicode characters of a title fit a language template.

    The title is rejected when the language requires unicode characters and
    the title has none, or when the language forbids them and the title has
    some. Otherwise every extracted code point must fall inside at least one
    ``[lower, upper]`` pair taken from ``language.unicode_character_lowers``
    and ``language.unicode_character_uppers``. A title without any extracted
    unicode characters passes this last check trivially.

    :param title: title to test
    :type title: str
    :param language: language template supplying the flags and ranges
    :type language: LanguageTemplate
    :return: True if the title is compatible with the language, else False
    """
    code_points = list(extract_unicode_characters(title))
    has_unicode = bool(code_points)

    if language.requires_unicode_characters and not has_unicode:
        return False

    if language.forbids_unicode_characters and has_unicode:
        return False

    # Original author's note: not sure, but all titles found so far have a
    # clear character set that is not shared between languages.
    # NOTE(review): the element-wise comparison presumably relies on the
    # lower/upper bounds being numpy arrays of equal length - confirm against
    # LanguageTemplate.
    # noinspection PyTypeChecker
    for raw_value in code_points:
        code_point = int(raw_value)
        inside_any_range = np.any(
            (language.unicode_character_lowers <= code_point) &
            (code_point <= language.unicode_character_uppers)
        )
        if not inside_any_range:
            return False
    return True
||
| 43 |
The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.