Passed
Pull Request — master (#600)
by Osma
02:56
created

annif.corpus.subject.SubjectFileTSV.languages()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 3

Duplication

Lines 4
Ratio 100 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 1
dl 4
loc 4
rs 10
c 0
b 0
f 0
1
"""Classes for supporting subject corpora expressed as directories or files"""
2
3
import annif.util
4
import numpy as np
5
from annif import logger
6
from .types import Subject
7
from .skos import serialize_subjects_to_skos
8
9
10 View Code Duplication
class SubjectFileTSV:
    """A subject vocabulary stored in a TSV file.

    Each line describes one subject using up to three tab-separated
    fields: the URI, an optional label and an optional notation.
    """

    def __init__(self, path):
        """Remember the path of the TSV file. The file is only opened
        when the subjects are iterated."""
        self.path = path

    def _parse_line(self, line):
        """Yield the Subject described by a single line of the TSV file.

        Missing label and notation fields are reported as None. Blank
        (whitespace-only) lines yield nothing, so that a stray empty line
        does not turn into a subject with an empty URI."""
        stripped = line.strip()
        if not stripped:
            return
        # At most three fields; any further tabs stay within the notation.
        vals = stripped.split('\t', 2)
        clean_uri = annif.util.cleanup_uri(vals[0])
        label = vals[1] if len(vals) >= 2 else None
        notation = vals[2] if len(vals) >= 3 else None
        yield Subject(uri=clean_uri, label=label, notation=notation, text=None)

    @property
    def languages(self):
        """Always None, because the file format does not record the
        language(s) of the labels."""
        # we don't have information about the language(s) of labels
        return None

    def subjects(self, language):
        """Iterate over the subjects in the file, in file order.

        The language argument is not used by this implementation."""
        # 'utf-8-sig' transparently drops a leading byte order mark, if any
        with open(self.path, encoding='utf-8-sig') as subjfile:
            for line in subjfile:
                yield from self._parse_line(line)

    def save_skos(self, path, language):
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name."""
        serialize_subjects_to_skos(self.subjects(language), language, path)
37
38
39 View Code Duplication
class SubjectIndex:
    """An index that remembers the associations between integers subject IDs
    and their URIs and labels.

    Subject IDs are positions in the internal lists, assigned in the order
    the subjects were added. A subject whose label is None is treated as
    deprecated."""

    def __init__(self):
        """Create an empty subject index."""
        self._uris = []
        self._labels = []
        self._notations = []
        self._uri_idx = {}
        self._label_idx = {}

    def load_subjects(self, corpus, language):
        """Initialize the subject index from a subject corpus using labels
        in the given language.

        Subjects are added after any that are already in the index, so the
        assigned IDs always match the positions in the internal lists."""

        for subject in corpus.subjects(language):
            # append() derives the ID from the current length; numbering
            # from zero here would corrupt the lookup tables of a
            # non-empty index.
            self.append(subject.uri, subject.label, subject.notation)

    def __len__(self):
        """return the number of subjects in the index"""
        return len(self._uris)

    def __getitem__(self, subject_id):
        """return the (uri, label, notation) tuple of the given subject ID"""
        return (self._uris[subject_id], self._labels[subject_id],
                self._notations[subject_id])

    def _append(self, subject_id, uri, label, notation):
        """Store a subject and register it in the lookup tables under the
        given ID. The caller must make sure that subject_id equals the
        current length of the index."""
        self._uris.append(uri)
        self._labels.append(label)
        self._notations.append(notation)
        # A repeated URI or label makes the lookup point to the latest entry
        self._uri_idx[uri] = subject_id
        self._label_idx[label] = subject_id

    def append(self, uri, label, notation):
        """Add a subject to the end of the index, giving it the next free
        subject ID."""
        subject_id = len(self._uris)
        self._append(subject_id, uri, label, notation)

    def contains_uri(self, uri):
        """return True if a subject with the given URI is in the index"""
        return uri in self._uri_idx

    def by_uri(self, uri, warnings=True):
        """return the subject index of a subject by its URI, or None if not
        found. If warnings=True, log a warning message if the URI cannot be
        found."""
        try:
            return self._uri_idx[uri]
        except KeyError:
            if warnings:
                logger.warning('Unknown subject URI <%s>', uri)
            return None

    def by_label(self, label, warnings=True):
        """return the subject index of a subject by its label, or None if
        not found. If warnings=True, log a warning message if the label
        cannot be found."""
        try:
            return self._label_idx[label]
        except KeyError:
            if warnings:
                logger.warning('Unknown subject label "%s"', label)
            return None

    def uris_to_labels(self, uris):
        """return a list of labels corresponding to the given URIs; unknown
        URIs are ignored"""

        subject_ids = (self.by_uri(uri) for uri in uris)
        return [self[subject_id][1]
                for subject_id in subject_ids
                if subject_id is not None]

    def labels_to_uris(self, labels):
        """return a list of URIs corresponding to the given labels; unknown
        labels are ignored"""

        subject_ids = (self.by_label(label) for label in labels)
        return [self[subject_id][0]
                for subject_id in subject_ids
                if subject_id is not None]

    def deprecated_ids(self):
        """return indices of deprecated subjects, i.e. those whose label
        is None"""

        return [subject_id for subject_id, label in enumerate(self._labels)
                if label is None]

    @property
    def active(self):
        """return a list of (subject_id, uri, label, notation) tuples of all
        subjects that are not deprecated"""

        return [(subj_id, uri, label, notation)
                for subj_id, (uri, label, notation)
                in enumerate(zip(self._uris, self._labels, self._notations))
                if label is not None]

    def save(self, path):
        """Save this subject index into a file.

        One line is written per subject: the URI in angle brackets,
        followed by the tab-separated label and notation when present."""

        with open(path, 'w', encoding='utf-8') as subjfile:
            # Iterating over self visits the subjects in ID order, using
            # __getitem__ until it raises IndexError.
            for uri, label, notation in self:
                fields = ["<{}>".format(uri)]
                if label is not None:
                    fields.append(label)
                    # The columns are positional, so a notation is only
                    # written when a label precedes it.
                    if notation is not None:
                        fields.append(notation)
                print('\t'.join(fields), file=subjfile)

    @classmethod
    def load(cls, path):
        """Load a subject index from a TSV file and return it."""

        corpus = SubjectFileTSV(path)
        subject_index = cls()
        subject_index.load_subjects(corpus, None)
        return subject_index
149
150
151 View Code Duplication
class SubjectSet:
    """Represents a set of subjects for a document."""

    def __init__(self, subj_data=None):
        """Create a SubjectSet and optionally initialize it from a tuple
        (URIs, labels)"""

        uris, labels = subj_data or ([], [])
        self.subject_uris = set(uris)
        self.subject_labels = set(labels)

    @classmethod
    def from_string(cls, subj_data):
        """Create a SubjectSet by parsing a string that contains one
        subject per line; see _parse_line for the accepted line format."""
        sset = cls()
        for line in subj_data.splitlines():
            sset._parse_line(line)
        return sset

    def _parse_line(self, line):
        """Add the subject described by one line of text to this set.

        The line is split on tabs. Values enclosed in angle brackets are
        added as URIs (without the brackets); the first other non-empty
        value is added as the label and ends the parsing of the line."""
        for raw_val in line.split("\t"):
            val = raw_val.strip()
            if val == '':
                continue
            # Guard the slice below: a token needs both an opening and a
            # closing bracket to be unwrapped as a URI.
            is_uri = (len(val) >= 2
                      and val.startswith('<') and val.endswith('>'))
            if is_uri:
                self.subject_uris.add(val[1:-1])
                continue
            self.subject_labels.add(val)
            return

    def has_uris(self):
        """returns True if the URIs for all subjects are known"""
        # There must be at least as many URIs as there are labels
        return len(self.subject_uris) >= len(self.subject_labels)

    def as_vector(self, subject_index, destination=None, warnings=True):
        """Return the hits as a one-dimensional NumPy array in sklearn
           multilabel indicator format, using a subject index as the source
           of subjects. Use destination array if given (not None), otherwise
           create and return a new one. If warnings=True, log warnings for
           unknown URIs."""

        if destination is None:
            destination = np.zeros(len(subject_index), dtype=bool)

        # Look the subjects up by URI when possible, otherwise by label.
        if self.has_uris():
            subject_ids = (subject_index.by_uri(uri, warnings=warnings)
                           for uri in self.subject_uris)
        else:
            # The warnings flag is only passed on to the URI lookups.
            subject_ids = (subject_index.by_label(label)
                           for label in self.subject_labels)

        for subject_id in subject_ids:
            # Subjects that the index does not know are skipped
            if subject_id is not None:
                destination[subject_id] = True
        return destination
207