Passed
Pull Request — master (#507)
by Juho
03:09
created

annif.util.detect_language()   A

Complexity

Conditions 3

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 5
nop 1
dl 0
loc 9
rs 10
c 0
b 0
f 0
1
"""Utility functions for Annif"""
2
3
import glob
4
import os
5
import os.path
6
import tempfile
7
import numpy as np
8
import cld3
9
from annif import logger
10
from annif.suggestion import VectorSuggestionResult
11
12
13
def atomic_save(obj, dirname, filename, method=None):
    """Save the given object into the given directory with the given filename,
    using a temporary file and renaming the temporary file to the final name.

    The object must have a .save() method that takes the target path, unless
    the method parameter is given; in that case method(obj, path) is called
    instead. The save operation may create several files, as long as their
    names all start with the path it was given: each of them is renamed so
    that the temporary name is replaced by the final name and any extra
    trailing part of the file name is kept.

    If saving raises an exception, the temporary files created so far are
    removed (best effort) and the exception is re-raised."""

    prefix, suffix = os.path.splitext(filename)
    tempfd, tempfilename = tempfile.mkstemp(
        prefix=prefix, suffix=suffix, dir=dirname)
    os.close(tempfd)
    # Escape glob metacharacters so that a directory or file name containing
    # e.g. '[' or '*' still matches the temporary files literally.
    temp_pattern = glob.escape(tempfilename) + '*'
    temp_basename = os.path.basename(tempfilename)
    logger.debug('saving %s to temporary file %s', str(obj)[:90], tempfilename)
    try:
        if method is not None:
            method(obj, tempfilename)
        else:
            obj.save(tempfilename)
    except Exception:
        # Don't leave half-written temporary files behind in the directory.
        for leftover in glob.glob(temp_pattern):
            try:
                os.remove(leftover)
            except OSError:
                logger.debug('could not remove temporary file %s', leftover)
        raise
    for fn in glob.glob(temp_pattern):
        # Keep whatever the save operation appended after the temporary name.
        extra = os.path.basename(fn)[len(temp_basename):]
        newname = os.path.join(dirname, filename + extra)
        logger.debug('renaming temporary file %s to %s', fn, newname)
        # os.replace overwrites an existing target on every platform.
        os.replace(fn, newname)
32
33
34
def cleanup_uri(uri):
    """Strip one pair of enclosing angle brackets from a URI.

    The URI is returned unchanged unless it both starts with '<' and ends
    with '>'."""
    is_bracketed = uri.startswith('<') and uri.endswith('>')
    return uri[1:-1] if is_bracketed else uri
39
40
41
def merge_hits(weighted_hits, subject_index):
    """Merge hits from multiple sources into a single result.

    Each item of weighted_hits must provide a .weight attribute and a .hits
    attribute whose as_vector(subject_index) method gives its score vector.
    The input is iterated twice, so it has to be a re-iterable sequence.
    The score vectors are averaged along the first axis using the weights,
    and the averaged vector is returned wrapped in a VectorSuggestionResult
    object."""

    weights = []
    for weighted in weighted_hits:
        weights.append(weighted.weight)

    score_vectors = []
    for weighted in weighted_hits:
        score_vectors.append(weighted.hits.as_vector(subject_index))

    merged = np.average(score_vectors, axis=0, weights=weights)
    return VectorSuggestionResult(merged)
50
51
52
def parse_sources(sourcedef):
    """Parse a source definition such as 'src1:1.0,src2' into a list of
    (src_id, weight) tuples.

    Sources are separated by commas; an optional weight follows the source
    id after a colon and defaults to 1.0 when absent. The returned weights
    are normalized by dividing each one by the sum of all weights."""

    parsed = []
    weight_sum = 0.0
    for item in sourcedef.strip().split(','):
        fields = item.strip().split(':')
        # Only the first two fields are used; anything after a second colon
        # is ignored.
        weight = float(fields[1]) if len(fields) > 1 else 1.0
        parsed.append((fields[0], weight))
        weight_sum += weight

    normalized = []
    for src_id, weight in parsed:
        normalized.append((src_id, weight / weight_sum))
    return normalized
68
69
70
def parse_args(param_string):
    """Parse a string of comma separated arguments such as '42,43,key=abc' into
    a list of positional args ['42', '43'] and a dict of keyword args
    {'key': 'abc'}.

    All values are returned as strings; no type conversion or whitespace
    stripping is performed. An item is treated as a keyword argument when it
    contains '='. It is split at the first '=' only, so the value itself may
    contain further '=' characters. An empty or None input gives ([], {})."""

    if not param_string:
        return [], {}
    posargs = []
    kwargs = {}
    for p_string in param_string.split(','):
        # partition() splits at the first '=' only, so 'expr=x=1' is kept as
        # {'expr': 'x=1'} instead of being silently discarded.
        key, separator, value = p_string.partition('=')
        if separator:
            kwargs[key] = value
        else:
            posargs.append(p_string)
    return posargs, kwargs
86
87
88
def detect_language(text):
    """Try to detect the language of the given text using cld3.

    Returns a (language, probability) tuple taken from the cld3 result, where
    the language is a BCP-47-style code such as 'en'. When cld3 gives no
    result, or flags its result as unreliable, (None, None) is returned."""

    detected = cld3.get_language(text)
    if detected is None or not detected.is_reliable:
        return (None, None)
    return (detected.language, detected.probability)
97
98
99
def boolean(val):
    """Interpret the given value as a boolean and return True or False.

    The value is converted to a string and lowercased; the result is True
    when that string is '1', 'yes', 'true' or 'on', and False for anything
    else."""

    truthy = ('1', 'yes', 'true', 'on')
    normalized = str(val).lower()
    return normalized in truthy
105
106
107
def identity(x):
    """Return the argument exactly as it was passed in (a no-op function)."""
    return x
110