|
1
|
|
|
#!/usr/local/bin/python |
|
2
|
|
|
# coding: utf-8 |
|
3
|
|
|
""" |
|
4
|
|
|
Configuration for the different languages. This is mostly useful for Asian languages,
since the characters they use differ greatly from one another.
|
6
|
|
|
Unicode ranges mostly taken from here: http://www.rikai.com/library/kanjitables/kanji_codes.unicode.shtml |
|
|
|
|
|
|
7
|
|
|
""" |
|
8
|
|
|
|
|
9
|
|
|
import numpy as np |
|
10
|
|
|
|
|
11
|
|
|
from titlesearch.language import LanguageTemplate |
|
12
|
|
|
|
|
13
|
|
|
|
|
14
|
|
|
class English(LanguageTemplate):
    """Language settings for English.

    No Unicode ranges are registered for English: both boundary arrays are
    empty. The flags mark Unicode characters as not required and as
    forbidden.
    """

    # Presumably interpreted by LanguageTemplate (titlesearch.language);
    # verify the exact semantics there.
    requires_unicode_characters = False
    forbids_unicode_characters = True

    # No ranges to describe, so both boundary arrays stay empty.
    unicode_character_lowers = np.array([])
    unicode_character_uppers = np.array([])
|
20
|
|
|
|
|
21
|
|
|
|
|
22
|
|
|
class Korean(LanguageTemplate):
    """Language settings for Korean (Hangul).

    Registered Unicode ranges (first-last code point, hexadecimal):

    Hangul Syllables            AC00-D7A3  (corresponds to 가-힣)
    Hangul Jamo                 1100-11FF
    Hangul Compatibility Jamo   3130-318F
    Hangul Jamo Extended-A      A960-A97F
    Hangul Jamo Extended-B      D7B0-D7FF

    Entry i of ``unicode_character_lowers`` is the first code point and
    entry i of ``unicode_character_uppers`` the last code point of the
    same range.
    """

    # First code point of each range.
    unicode_character_lowers = np.array([
        0xAC00,  # Hangul Syllables
        0x1100,  # Hangul Jamo
        0x3130,  # Hangul Compatibility Jamo
        0xA960,  # Hangul Jamo Extended-A
        0xD7B0,  # Hangul Jamo Extended-B
    ])
    # Last code point of each range, in the same order as above.
    unicode_character_uppers = np.array([
        0xD7A3,  # Hangul Syllables
        0x11FF,  # Hangul Jamo
        0x318F,  # Hangul Compatibility Jamo
        0xA97F,  # Hangul Jamo Extended-A
        0xD7FF,  # Hangul Jamo Extended-B
    ])

    # Presumably interpreted by LanguageTemplate (titlesearch.language);
    # verify the exact semantics there.
    requires_unicode_characters = True
    forbids_unicode_characters = False
|
35
|
|
|
|
|
36
|
|
|
|
|
37
|
|
|
class Japanese(LanguageTemplate):
    """Language settings for Japanese.

    Registered Unicode ranges (first-last code point, hexadecimal):

    Japanese punctuation                3000-303F
    Hiragana                            3040-309F
    Katakana                            30A0-30FF
    Roman / half-width katakana forms   FF00-FFEF
    Kanji                               4E00-9FAF
    Rare kanji                          3400-4DBF

    Entry i of ``unicode_character_lowers`` is the first code point and
    entry i of ``unicode_character_uppers`` the last code point of the
    same range.
    """

    # First code point of each range.
    unicode_character_lowers = np.array([
        0x3000,  # punctuation
        0x3040,  # hiragana
        0x30A0,  # katakana
        0xFF00,  # roman / half-width katakana forms
        0x4E00,  # kanji
        0x3400,  # rare kanji
    ])
    # Last code point of each range, in the same order as above.
    # NOTE(review): the Unicode "CJK Unified Ideographs" block runs up to
    # 9FFF; the kanji bound 9FAF is kept as-is here. Confirm whether the
    # remaining code points should be covered as well.
    unicode_character_uppers = np.array([
        0x303F,  # punctuation
        0x309F,  # hiragana
        0x30FF,  # katakana
        0xFFEF,  # roman / half-width katakana forms
        0x9FAF,  # kanji
        0x4DBF,  # rare kanji
    ])

    # Presumably interpreted by LanguageTemplate (titlesearch.language);
    # verify the exact semantics there.
    requires_unicode_characters = True
    forbids_unicode_characters = False
|
51
|
|
|
|
This check looks for lines that are too long. You can specify the maximum line length.