annif.corpus.subject.SubjectIndex.load_subjects() - Code Metrics - Inspection of "Make vocabularies multilingual" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#600)

by Osma

created 2022-08-04 07:41 UTC

annif.corpus.subject.SubjectIndex.load_subjects() A

↳ Parent: annif.corpus.subject

Complexity

Conditions

Size

Total Lines	7
Code Lines	4

Duplication

Lines	7
Ratio	100 %

Importance

Changes

Metric	Value
cc	2
eloc	4
nop	3
dl	7
loc	7
rs	10
c	0
b	0
f	0

"""Classes for supporting subject corpora expressed as directories or files"""

import annif.util
import numpy as np
from annif import logger
from .types import Subject
from .skos import serialize_subjects_to_skos


class SubjectFileTSV:

    """Subject vocabulary backed by a tab-separated (TSV) file.

    Every line of the file holds a URI, optionally followed by a label and
    a notation, separated by tab characters."""

    def __init__(self, path):
        """Remember the path of the TSV file; the file is not opened yet."""
        self.path = path

    def _parse_line(self, line):
        """Yield the single Subject described by one line of the file."""
        columns = line.strip().split('\t', 2)
        # pad to exactly three columns so that missing ones become None
        columns.extend([None] * (3 - len(columns)))
        raw_uri, label, notation = columns
        yield Subject(uri=annif.util.cleanup_uri(raw_uri),
                      label=label,
                      notation=notation,
                      text=None)

    @property
    def languages(self):
        """Always None: a TSV file carries no information about the
        language(s) of its labels."""
        return None

    def subjects(self, language):
        """Generate the subjects of the file, one per line, in file order.

        The language argument is accepted but not used by this class."""
        with open(self.path, encoding='utf-8-sig') as tsv_file:
            for row in tsv_file:
                yield from self._parse_line(row)

    def save_skos(self, path, language):
        """Write the subjects of this vocabulary into a SKOS/Turtle file
        at the given path."""
        all_subjects = self.subjects(language)
        serialize_subjects_to_skos(all_subjects, language, path)


class SubjectIndex:

    """An index that remembers the associations between integer subject IDs
    and their URIs, labels and notations.

    The ID of a subject is its position in the index, i.e. subjects are
    numbered from zero in the order they were added. Subjects whose label
    is None are treated as deprecated."""

    def __init__(self):
        """Create an empty subject index."""
        # parallel lists, indexed by subject ID
        self._uris = []
        self._labels = []
        self._notations = []
        # reverse lookups from URI / label to subject ID
        self._uri_idx = {}
        self._label_idx = {}

    def load_subjects(self, corpus, language):
        """Add all subjects of a subject corpus to the index, using labels
        in the given language.

        The subjects are appended after any subjects already present in the
        index, so the ID of each subject always equals its position."""

        # Delegate to append(), which derives the ID from the current size
        # of the index; numbering from zero here would produce wrong IDs
        # whenever the index is not empty.
        for subject in corpus.subjects(language):
            self.append(subject.uri, subject.label, subject.notation)

    def __len__(self):
        """return the number of subjects in the index"""
        return len(self._uris)

    def __getitem__(self, subject_id):
        """return the (uri, label, notation) tuple of the subject with the
        given ID; raises IndexError if there is no such subject"""
        return (self._uris[subject_id], self._labels[subject_id],
                self._notations[subject_id])

    def _append(self, subject_id, uri, label, notation):
        """Store a subject at the end of the index and register subject_id
        in the URI and label lookups. The caller must pass the position the
        subject will get, i.e. the current length of the index."""
        self._uris.append(uri)
        self._labels.append(label)
        self._notations.append(notation)
        self._uri_idx[uri] = subject_id
        self._label_idx[label] = subject_id

    def append(self, uri, label, notation):
        """Add a subject to the end of the index; its ID is the number of
        subjects that were in the index before the call."""
        subject_id = len(self._uris)
        self._append(subject_id, uri, label, notation)

    def contains_uri(self, uri):
        """return True if a subject with the given URI is in the index"""
        return uri in self._uri_idx

    def by_uri(self, uri, warnings=True):
        """return the subject index of a subject by its URI, or None if not found.
        If warnings=True, log a warning message if the URI cannot be found."""
        try:
            return self._uri_idx[uri]
        except KeyError:
            if warnings:
                logger.warning('Unknown subject URI <%s>', uri)
            return None

    def by_label(self, label):
        """return the subject index of a subject by its label, or None if
        not found; a warning message is logged for an unknown label"""
        try:
            return self._label_idx[label]
        except KeyError:
            logger.warning('Unknown subject label "%s"', label)
            return None

    def uris_to_labels(self, uris):
        """return a list of labels corresponding to the given URIs; unknown
        URIs are ignored (after logging a warning)"""

        return [self[subject_id][1]
                for subject_id in (self.by_uri(uri) for uri in uris)
                if subject_id is not None]

    def labels_to_uris(self, labels):
        """return a list of URIs corresponding to the given labels; unknown
        labels are ignored (after logging a warning)"""

        return [self[subject_id][0]
                for subject_id in (self.by_label(label) for label in labels)
                if subject_id is not None]

    def deprecated_ids(self):
        """return indices of deprecated subjects, i.e. those without a
        label"""

        return [subject_id for subject_id, label in enumerate(self._labels)
                if label is None]

    @property
    def active(self):
        """return a list of (subject_id, uri, label, notation) tuples of all
        subjects that are not deprecated"""

        return [(subj_id, uri, label, notation)
                for subj_id, (uri, label, notation)
                in enumerate(zip(self._uris, self._labels, self._notations))
                if label is not None]

    def save(self, path):
        """Save this subject index into a file.

        One line is written per subject: the URI in angle brackets,
        followed by the tab-separated label and notation when present."""

        with open(path, 'w', encoding='utf-8') as subjfile:
            for uri, label, notation in self:
                line = "<{}>".format(uri)
                if label is not None:
                    line += ('\t' + label)
                    # the notation is only written together with a label,
                    # as the columns of a line are positional
                    if notation is not None:
                        line += ('\t' + notation)
                print(line, file=subjfile)

    @classmethod
    def load(cls, path):
        """Load a subject index from a TSV file and return it."""

        corpus = SubjectFileTSV(path)
        subject_index = cls()
        # a TSV file has no language information, hence language=None
        subject_index.load_subjects(corpus, None)
        return subject_index


class SubjectSet:

    """The subjects assigned to one document, kept as a set of URIs and a
    set of labels."""

    def __init__(self, subj_data=None):
        """Build a SubjectSet, optionally filling it from a (URIs, labels)
        tuple; without data both sets start out empty."""

        if not subj_data:
            subj_data = ([], [])
        uris, labels = subj_data
        self.subject_uris = set(uris)
        self.subject_labels = set(labels)

    @classmethod
    def from_string(cls, subj_data):
        """Build a SubjectSet by parsing a multi-line string, one subject
        per line."""
        instance = cls()
        for row in subj_data.splitlines():
            instance._parse_line(row)
        return instance

    def _parse_line(self, line):
        """Collect the subject described by one tab-separated line.

        Fields wrapped in angle brackets are stored as URIs. The first
        other non-empty field is stored as the label, and the rest of the
        line is then ignored."""
        for field in line.split("\t"):
            token = field.strip()
            if not token:
                continue
            is_uri = token.startswith('<') and token.endswith('>')
            if not is_uri:
                self.subject_labels.add(token)
                break
            # strip the surrounding angle brackets
            self.subject_uris.add(token[1:-1])

    def has_uris(self):
        """returns True if the URIs for all subjects are known, i.e. there
        are at least as many URIs as labels"""
        return len(self.subject_labels) <= len(self.subject_uris)

    def as_vector(self, subject_index, destination=None, warnings=True):
        """Return the subjects as a one-dimensional boolean NumPy array in
           sklearn multilabel indicator format, with subject IDs looked up
           from the given subject index. The destination array is filled
           and returned if given (not None); otherwise a new array is
           created. If warnings=True, log warnings for unknown URIs."""

        if destination is None:
            destination = np.zeros(len(subject_index), dtype=bool)

        # look subjects up by URI when possible, otherwise by label; the
        # lookups are lazy, so they interleave with the assignments below
        if self.has_uris():
            found_ids = (subject_index.by_uri(uri, warnings=warnings)
                         for uri in self.subject_uris)
        else:
            found_ids = (subject_index.by_label(label)
                         for label in self.subject_labels)

        for found_id in found_ids:
            if found_id is not None:
                destination[found_id] = True
        return destination


1		"""Classes for supporting subject corpora expressed as directories or files"""
2
3		import annif.util
4		import numpy as np
5		from annif import logger
6		from .types import Subject
7		from .skos import serialize_subjects_to_skos
8
9
10	View Code Duplication	class SubjectFileTSV:
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
11		"""A subject vocabulary stored in a TSV file."""
12
13		def __init__(self, path):
14		self.path = path
15
16		def _parse_line(self, line):
17		vals = line.strip().split('\t', 2)
18		clean_uri = annif.util.cleanup_uri(vals[0])
19		label = vals[1] if len(vals) >= 2 else None
20		notation = vals[2] if len(vals) >= 3 else None
21		yield Subject(uri=clean_uri, label=label, notation=notation, text=None)
22
23		@property
24		def languages(self):
25		# we don't have information about the language(s) of labels
26		return None
27
28		def subjects(self, language):
29		with open(self.path, encoding='utf-8-sig') as subjfile:
30		for line in subjfile:
31		yield from self._parse_line(line)
32
33		def save_skos(self, path, language):
34		"""Save the contents of the subject vocabulary into a SKOS/Turtle
35		file with the given path name."""
36		serialize_subjects_to_skos(self.subjects(language), language, path)
37
38
39	View Code Duplication	class SubjectIndex:
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
40		"""An index that remembers the associations between integers subject IDs
41		and their URIs and labels."""
42
43		def __init__(self):
44		self._uris = []
45		self._labels = []
46		self._notations = []
47		self._uri_idx = {}
48		self._label_idx = {}
49
50		def load_subjects(self, corpus, language):
51		"""Initialize the subject index from a subject corpus using labels
52		in the given language."""
53
54		for subject_id, subject in enumerate(corpus.subjects(language)):
55		self._append(subject_id, subject.uri, subject.label,
56		subject.notation)
57
58		def __len__(self):
59		return len(self._uris)
60
61		def __getitem__(self, subject_id):
62		return (self._uris[subject_id], self._labels[subject_id],
63		self._notations[subject_id])
64
65		def _append(self, subject_id, uri, label, notation):
66		self._uris.append(uri)
67		self._labels.append(label)
68		self._notations.append(notation)
69		self._uri_idx[uri] = subject_id
70		self._label_idx[label] = subject_id
71
72		def append(self, uri, label, notation):
73		subject_id = len(self._uris)
74		self._append(subject_id, uri, label, notation)
75
76		def contains_uri(self, uri):
77		return uri in self._uri_idx
78
79		def by_uri(self, uri, warnings=True):
80		"""return the subject index of a subject by its URI, or None if not found.
81		If warnings=True, log a warning message if the URI cannot be found."""
82		try:
83		return self._uri_idx[uri]
84		except KeyError:
85		if warnings:
86		logger.warning('Unknown subject URI <%s>', uri)
87		return None
88
89		def by_label(self, label):
90		"""return the subject index of a subject by its label"""
91		try:
92		return self._label_idx[label]
93		except KeyError:
94		logger.warning('Unknown subject label "%s"', label)
95		return None
96
97		def uris_to_labels(self, uris):
98		"""return a list of labels corresponding to the given URIs; unknown
99		URIs are ignored"""
100
101		return [self[subject_id][1]
102		for subject_id in (self.by_uri(uri) for uri in uris)
103		if subject_id is not None]
104
105		def labels_to_uris(self, labels):
106		"""return a list of URIs corresponding to the given labels; unknown
107		labels are ignored"""
108
109		return [self[subject_id][0]
110		for subject_id in (self.by_label(label) for label in labels)
111		if subject_id is not None]
112
113		def deprecated_ids(self):
114		"""return indices of deprecated subjects"""
115
116		return [subject_id for subject_id, label in enumerate(self._labels)
117		if label is None]
118
119		@property
120		def active(self):
121		"""return a list of (subject_id, uri, label, notation) tuples of all
122		subjects that are not deprecated"""
123
124		return [(subj_id, uri, label, notation)
125		for subj_id, (uri, label, notation)
126		in enumerate(zip(self._uris, self._labels, self._notations))
127		if label is not None]
128
129		def save(self, path):
130		"""Save this subject index into a file."""
131
132		with open(path, 'w', encoding='utf-8') as subjfile:
133		for uri, label, notation in self:
134		line = "<{}>".format(uri)
135		if label is not None:
136		line += ('\t' + label)
137		if notation is not None:
138		line += ('\t' + notation)
139		print(line, file=subjfile)
140
141		@classmethod
142		def load(cls, path):
143		"""Load a subject index from a TSV file and return it."""
144
145		corpus = SubjectFileTSV(path)
146		subject_index = cls()
147		subject_index.load_subjects(corpus, None)
148		return subject_index
149
150
151	View Code Duplication	class SubjectSet:
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
152		"""Represents a set of subjects for a document."""
153
154		def __init__(self, subj_data=None):
155		"""Create a SubjectSet and optionally initialize it from a tuple
156		(URIs, labels)"""
157
158		uris, labels = subj_data or ([], [])
159		self.subject_uris = set(uris)
160		self.subject_labels = set(labels)
161
162		@classmethod
163		def from_string(cls, subj_data):
164		sset = cls()
165		for line in subj_data.splitlines():
166		sset._parse_line(line)
167		return sset
168
169		def _parse_line(self, line):
170		vals = line.split("\t")
171		for val in vals:
172		val = val.strip()
173		if val == '':
174		continue
175		if val.startswith('<') and val.endswith('>'): # URI
176		self.subject_uris.add(val[1:-1])
177		continue
178		self.subject_labels.add(val)
179		return
180
181		def has_uris(self):
182		"""returns True if the URIs for all subjects are known"""
183		return len(self.subject_uris) >= len(self.subject_labels)
184
185		def as_vector(self, subject_index, destination=None, warnings=True):
186		"""Return the hits as a one-dimensional NumPy array in sklearn
187		multilabel indicator format, using a subject index as the source
188		of subjects. Use destination array if given (not None), otherwise
189		create and return a new one. If warnings=True, log warnings for
190		unknown URIs."""
191
192		if destination is None:
193		destination = np.zeros(len(subject_index), dtype=bool)
194
195		if self.has_uris():
196		for uri in self.subject_uris:
197		subject_id = subject_index.by_uri(
198		uri, warnings=warnings)
199		if subject_id is not None:
200		destination[subject_id] = True
201		else:
202		for label in self.subject_labels:
203		subject_id = subject_index.by_label(label)
204		if subject_id is not None:
205		destination[subject_id] = True
206		return destination
207

NatLibFi / Annif

Pull Request — master (#600)

annif.corpus.subject.SubjectIndex.load_subjects() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like