annif.corpus.subject.SubjectIndex.languages() - Code Metrics - Inspection of "Implement load-vocab and list-vocab commands" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#614)

by Osma

created 2022-09-02 07:05 UTC

annif.corpus.subject.SubjectIndex.languages() A

↳ Parent: annif.corpus.subject

Complexity

Conditions

Size

Total Lines	3
Code Lines	3

Duplication

Lines	3
Ratio	100 %

Importance

Changes

Metric	Value
cc	1
eloc	3
nop	1
dl	3
loc	3
rs	10
c	0
b	0
f	0

"""Classes for supporting subject corpora expressed as directories or files"""

import csv
import numpy as np
import annif.util
import os.path
from annif import logger
from .types import Subject, SubjectCorpus
from .skos import serialize_subjects_to_skos


class SubjectFileTSV(SubjectCorpus):

    """Subject vocabulary in a single language, read from a TSV file.

    Every line of the file describes one subject: its URI, optionally
    followed by a label and a notation, separated by tabs."""

    def __init__(self, path, language):
        """Store the path of the TSV file and the language that its labels
        are written in."""

        self.language = language
        self.path = path

    def _parse_line(self, line):
        """Yield the single Subject described by one line of the file."""

        # at most three tab-separated columns: URI, label, notation
        columns = line.strip().split('\t', 2)
        # pad with None so that missing trailing columns can be unpacked
        raw_uri, label, notation = (columns + [None, None])[:3]

        if label:
            labels = {self.language: label}
        else:
            # the label column is missing or empty
            labels = None

        yield Subject(uri=annif.util.cleanup_uri(raw_uri),
                      labels=labels,
                      notation=notation)

    @property
    def languages(self):
        """The languages of the vocabulary: a list holding only the language
        given to the constructor."""
        return [self.language]

    @property
    def subjects(self):
        """Iterate over the subjects of the vocabulary, reading the file line
        by line."""
        with open(self.path, encoding='utf-8-sig') as tsvfile:
            for row in tsvfile:
                for subject in self._parse_line(row):
                    yield subject

    def save_skos(self, path):
        """Write the subjects of this vocabulary as SKOS/Turtle into the file
        with the given path name."""
        serialize_subjects_to_skos(self.subjects, path)


class SubjectFileCSV(SubjectCorpus):

    """A multilingual subject vocabulary stored in a CSV file.

    The header row names the columns: ``uri``, optionally ``notation``, and
    one ``label_<language>`` column for every language of the vocabulary."""

    # prefix that marks a CSV column as holding labels in some language
    _LABEL_PREFIX = 'label_'

    def __init__(self, path):
        """initialize the SubjectFileCSV given a path to a CSV file"""
        self.path = path

    @classmethod
    def _label_language(cls, fname):
        """return the language of a label column given its name, or None if
        the column does not hold labels"""

        # csv.DictReader stores the surplus cells of an over-long row under
        # the key None, so a column "name" is not always a string
        if isinstance(fname, str) and fname.startswith(cls._LABEL_PREFIX):
            # strip only the leading prefix; str.replace would also remove
            # later occurrences and the name would no longer round-trip
            return fname[len(cls._LABEL_PREFIX):]
        return None

    def _parse_row(self, row):
        """yield the Subject described by one row (a dict keyed by column
        name) of the CSV file"""

        labels = {}
        for fname, value in row.items():
            lang = self._label_language(fname)
            if lang is not None:
                # empty and missing cells both mean "no label"
                labels[lang] = value or None

        # if there are no labels in any language, set labels to None
        # indicating a deprecated subject
        if set(labels.values()) == {None}:
            labels = None

        yield Subject(uri=annif.util.cleanup_uri(row['uri']),
                      labels=labels,
                      notation=row.get('notation', None) or None)

    @property
    def languages(self):
        """the languages of the vocabulary, inferred from the names of the
        label columns in the header row; an empty list if the file has no
        header row"""

        with open(self.path, encoding='utf-8-sig') as csvfile:
            # an empty file has no header row at all
            fieldnames = next(csv.reader(csvfile), None) or []

        candidates = (self._label_language(fname) for fname in fieldnames)
        return [lang for lang in candidates if lang is not None]

    @property
    def subjects(self):
        """iterate over the subjects of the vocabulary, one per data row"""

        with open(self.path, encoding='utf-8-sig') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                yield from self._parse_row(row)

    def save_skos(self, path):
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name."""
        serialize_subjects_to_skos(self.subjects, path)

    @staticmethod
    def is_csv_file(path):
        """return True if the path looks like a CSV file"""

        return os.path.splitext(path)[1].lower() == '.csv'


class SubjectIndex:

    """An index that remembers the associations between integer subject IDs
    and their URIs and labels.

    Subject IDs are positions in the order in which subjects were appended,
    starting from zero."""

    def __init__(self):
        self._subjects = []
        self._uri_idx = {}
        self._label_idx = {}
        # None until loaded from a corpus or inferred from the first
        # appended subject whose labels are not None
        self._languages = None

    def load_subjects(self, corpus):
        """Initialize the subject index from a subject corpus"""

        self._languages = corpus.languages
        for subject in corpus.subjects:
            self.append(subject)

    def __len__(self):
        return len(self._subjects)

    @property
    def languages(self):
        """the list of languages of the indexed labels, or None if they are
        not known yet"""
        return self._languages

    def __getitem__(self, subject_id):
        return self._subjects[subject_id]

    def append(self, subject):
        """Add a subject to the index, giving it the next free subject ID.
        If the languages of the index are not known yet, they are taken
        from the labels of the subject."""

        if self._languages is None and subject.labels is not None:
            self._languages = list(subject.labels.keys())

        subject_id = len(self._subjects)
        # a URI or label that was indexed before now points to this subject
        self._uri_idx[subject.uri] = subject_id
        if subject.labels:
            for lang, label in subject.labels.items():
                self._label_idx[(label, lang)] = subject_id
        self._subjects.append(subject)

    def contains_uri(self, uri):
        """return True if a subject with the given URI is in the index"""
        return uri in self._uri_idx

    def by_uri(self, uri, warnings=True):
        """return the subject ID of a subject by its URI, or None if not found.
        If warnings=True, log a warning message if the URI cannot be found."""
        try:
            return self._uri_idx[uri]
        except KeyError:
            if warnings:
                logger.warning('Unknown subject URI <%s>', uri)
            return None

    def by_label(self, label, language):
        """return the subject ID of a subject by its label in a given
        language, or None (after logging a warning) if not found"""
        try:
            return self._label_idx[(label, language)]
        except KeyError:
            logger.warning('Unknown subject label "%s"@%s', label, language)
            return None

    def deprecated_ids(self):
        """return indices of deprecated subjects"""

        return [subject_id for subject_id, subject in enumerate(self._subjects)
                if subject.labels is None]

    @property
    def active(self):
        """return a list of (subject_id, subject) tuples of all subjects that
        are not deprecated"""

        return [(subj_id, subject)
                for subj_id, subject
                in enumerate(self._subjects)
                if subject.labels is not None]

    def save(self, path):
        """Save this subject index into a file with the given path name.

        The file is a CSV file with the columns uri and notation, followed
        by one label_<language> column per language of the index."""

        # the languages are still unknown (None) for an index that has not
        # seen any labels; it is saved without label columns
        languages = self._languages or []
        fieldnames = ['uri', 'notation'] + \
            [f'label_{lang}' for lang in languages]

        with open(path, 'w', encoding='utf-8', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for subject in self._subjects:
                row = {'uri': subject.uri,
                       'notation': subject.notation or ''}
                # label columns of deprecated subjects are left empty
                if subject.labels:
                    for lang, label in subject.labels.items():
                        row[f'label_{lang}'] = label
                writer.writerow(row)

    @classmethod
    def load(cls, path):
        """Load a subject index from a CSV file and return it."""

        corpus = SubjectFileCSV(path)
        subject_index = cls()
        subject_index.load_subjects(corpus)
        return subject_index


class SubjectSet:

    """Represents a set of subjects for a document."""

    def __init__(self, subject_ids=None):
        """Create a SubjectSet and optionally initialize it from an iterable
        of subject IDs. Duplicate IDs and None values are dropped."""

        if subject_ids:
            # use set comprehension to eliminate possible duplicates
            self._subject_ids = list({subject_id
                                      for subject_id in subject_ids
                                      if subject_id is not None})
        else:
            self._subject_ids = []

    def __len__(self):
        return len(self._subject_ids)

    def __getitem__(self, idx):
        return self._subject_ids[idx]

    def __bool__(self):
        return bool(self._subject_ids)

    def __eq__(self, other):
        """Two SubjectSets are equal if they hold the same subject IDs; a
        SubjectSet is never equal to an object of another type."""

        if isinstance(other, SubjectSet):
            # compare as sets: the order of the internal list comes from set
            # iteration and is not part of the value
            return set(self._subject_ids) == set(other._subject_ids)

        return False

    @classmethod
    def from_string(cls, subj_data, subject_index, language):
        """Create a SubjectSet from text with one subject per line. A line
        that has a URI in angle brackets is looked up by URI, any other
        non-blank line by its label in the given language. Subjects that
        subject_index does not know are left out of the set."""

        subject_ids = set()
        for line in subj_data.splitlines():
            uri, label = cls._parse_line(line)
            if uri is not None:
                subject_ids.add(subject_index.by_uri(uri))
            elif label is not None:
                subject_ids.add(subject_index.by_label(label, language))
            # otherwise the line is blank: there is nothing to look up
        return cls(subject_ids)

    @staticmethod
    def _parse_line(line):
        """Split a tab-separated line into a (uri, label) tuple, where either
        part is None if the line does not have it."""

        uri = label = None
        vals = line.split("\t")
        for val in vals:
            val = val.strip()
            if val == '':
                continue
            if val.startswith('<') and val.endswith('>'):  # URI
                uri = val[1:-1]
                continue
            # the first value that is not a URI is the label; values after
            # it are ignored
            label = val
            break
        return uri, label

    def as_vector(self, size=None, destination=None):
        """Return the hits as a one-dimensional NumPy array in sklearn
           multilabel indicator format. Use destination array if given (not
           None), otherwise create and return a new one of the given size."""

        if destination is None:
            destination = np.zeros(size, dtype=bool)

        destination[self._subject_ids] = True

        return destination


1		"""Classes for supporting subject corpora expressed as directories or files"""
2
3		import csv
4		import numpy as np
5		import annif.util
6		import os.path
7		from annif import logger
8		from .types import Subject, SubjectCorpus
9		from .skos import serialize_subjects_to_skos
10
11
12	View Code Duplication	class SubjectFileTSV(SubjectCorpus):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
13		"""A monolingual subject vocabulary stored in a TSV file."""
14
15		def __init__(self, path, language):
16		"""initialize the SubjectFileTSV given a path to a TSV file and the
17		language of the vocabulary"""
18
19		self.path = path
20		self.language = language
21
22		def _parse_line(self, line):
23		vals = line.strip().split('\t', 2)
24		clean_uri = annif.util.cleanup_uri(vals[0])
25		label = vals[1] if len(vals) >= 2 else None
26		labels = {self.language: label} if label else None
27		notation = vals[2] if len(vals) >= 3 else None
28		yield Subject(uri=clean_uri,
29		labels=labels,
30		notation=notation)
31
32		@property
33		def languages(self):
34		return [self.language]
35
36		@property
37		def subjects(self):
38		with open(self.path, encoding='utf-8-sig') as subjfile:
39		for line in subjfile:
40		yield from self._parse_line(line)
41
42		def save_skos(self, path):
43		"""Save the contents of the subject vocabulary into a SKOS/Turtle
44		file with the given path name."""
45		serialize_subjects_to_skos(self.subjects, path)
46
47
48	View Code Duplication	class SubjectFileCSV(SubjectCorpus):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
49		"""A multilingual subject vocabulary stored in a CSV file."""
50
51		def __init__(self, path):
52		"""initialize the SubjectFileCSV given a path to a CSV file"""
53		self.path = path
54
55		def _parse_row(self, row):
56		labels = {
57		fname.replace('label_', ''): value or None
58		for fname, value in row.items()
59		if fname.startswith('label_')
60		}
61
62		# if there are no labels in any language, set labels to None
63		# indicating a deprecated subject
64		if set(labels.values()) == {None}:
65		labels = None
66
67		yield Subject(uri=annif.util.cleanup_uri(row['uri']),
68		labels=labels,
69		notation=row.get('notation', None) or None)
70
71		@property
72		def languages(self):
73		# infer the supported languages from the CSV column names
74		with open(self.path, encoding='utf-8-sig') as csvfile:
75		reader = csv.reader(csvfile)
76		fieldnames = next(reader, None)
77
78		return [fname.replace('label_', '')
79		for fname in fieldnames
80		if fname.startswith('label_')]
81
82		@property
83		def subjects(self):
84		with open(self.path, encoding='utf-8-sig') as csvfile:
85		reader = csv.DictReader(csvfile)
86		for row in reader:
87		yield from self._parse_row(row)
88
89		def save_skos(self, path):
90		"""Save the contents of the subject vocabulary into a SKOS/Turtle
91		file with the given path name."""
92		serialize_subjects_to_skos(self.subjects, path)
93
94		@staticmethod
95		def is_csv_file(path):
96		"""return True if the path looks like a CSV file"""
97
98		return os.path.splitext(path)[1].lower() == '.csv'
99
100
101	View Code Duplication	class SubjectIndex:
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
102		"""An index that remembers the associations between integers subject IDs
103		and their URIs and labels."""
104
105		def __init__(self):
106		self._subjects = []
107		self._uri_idx = {}
108		self._label_idx = {}
109		self._languages = None
110
111		def load_subjects(self, corpus):
112		"""Initialize the subject index from a subject corpus"""
113
114		self._languages = corpus.languages
115		for subject in corpus.subjects:
116		self.append(subject)
117
118		def __len__(self):
119		return len(self._subjects)
120
121		@property
122		def languages(self):
123		return self._languages
124
125		def __getitem__(self, subject_id):
126		return self._subjects[subject_id]
127
128		def append(self, subject):
129		if self._languages is None and subject.labels is not None:
130		self._languages = list(subject.labels.keys())
131
132		subject_id = len(self._subjects)
133		self._uri_idx[subject.uri] = subject_id
134		if subject.labels:
135		for lang, label in subject.labels.items():
136		self._label_idx[(label, lang)] = subject_id
137		self._subjects.append(subject)
138
139		def contains_uri(self, uri):
140		return uri in self._uri_idx
141
142		def by_uri(self, uri, warnings=True):
143		"""return the subject ID of a subject by its URI, or None if not found.
144		If warnings=True, log a warning message if the URI cannot be found."""
145		try:
146		return self._uri_idx[uri]
147		except KeyError:
148		if warnings:
149		logger.warning('Unknown subject URI <%s>', uri)
150		return None
151
152		def by_label(self, label, language):
153		"""return the subject ID of a subject by its label in a given
154		language"""
155		try:
156		return self._label_idx[(label, language)]
157		except KeyError:
158		logger.warning('Unknown subject label "%s"@%s', label, language)
159		return None
160
161		def deprecated_ids(self):
162		"""return indices of deprecated subjects"""
163
164		return [subject_id for subject_id, subject in enumerate(self._subjects)
165		if subject.labels is None]
166
167		@property
168		def active(self):
169		"""return a list of (subject_id, subject) tuples of all subjects that
170		are not deprecated"""
171
172		return [(subj_id, subject)
173		for subj_id, subject
174		in enumerate(self._subjects)
175		if subject.labels is not None]
176
177		def save(self, path):
178		"""Save this subject index into a file with the given path name."""
179
180		fieldnames = ['uri', 'notation'] + \
181		[f'label_{lang}' for lang in self._languages]
182
183		with open(path, 'w', encoding='utf-8', newline='') as csvfile:
184		writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
185		writer.writeheader()
186		for subject in self:
187		row = {'uri': subject.uri,
188		'notation': subject.notation or ''}
189		if subject.labels:
190		for lang, label in subject.labels.items():
191		row[f'label_{lang}'] = label
192		writer.writerow(row)
193
194		@classmethod
195		def load(cls, path):
196		"""Load a subject index from a CSV file and return it."""
197
198		corpus = SubjectFileCSV(path)
199		subject_index = cls()
200		subject_index.load_subjects(corpus)
201		return subject_index
202
203
204	View Code Duplication	class SubjectSet:
		0 ignored issues – show Duplication introduced 2022-08-15 05:52 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
205		"""Represents a set of subjects for a document."""
206
207		def __init__(self, subject_ids=None):
208		"""Create a SubjectSet and optionally initialize it from an iterable
209		of subject IDs"""
210
211		if subject_ids:
212		# use set comprehension to eliminate possible duplicates
213		self._subject_ids = list({subject_id
214		for subject_id in subject_ids
215		if subject_id is not None})
216		else:
217		self._subject_ids = []
218
219		def __len__(self):
220		return len(self._subject_ids)
221
222		def __getitem__(self, idx):
223		return self._subject_ids[idx]
224
225		def __bool__(self):
226		return bool(self._subject_ids)
227
228		def __eq__(self, other):
229		if isinstance(other, SubjectSet):
230		return self._subject_ids == other._subject_ids
231
232		return False
233
234		@classmethod
235		def from_string(cls, subj_data, subject_index, language):
236		subject_ids = set()
237		for line in subj_data.splitlines():
238		uri, label = cls._parse_line(line)
239		if uri is not None:
240		subject_ids.add(subject_index.by_uri(uri))
241		else:
242		subject_ids.add(subject_index.by_label(label, language))
243		return cls(subject_ids)
244
245		@staticmethod
246		def _parse_line(line):
247		uri = label = None
248		vals = line.split("\t")
249		for val in vals:
250		val = val.strip()
251		if val == '':
252		continue
253		if val.startswith('<') and val.endswith('>'): # URI
254		uri = val[1:-1]
255		continue
256		label = val
257		break
258		return uri, label
259
260		def as_vector(self, size=None, destination=None):
261		"""Return the hits as a one-dimensional NumPy array in sklearn
262		multilabel indicator format. Use destination array if given (not
263		None), otherwise create and return a new one of the given size."""
264
265		if destination is None:
266		destination = np.zeros(size, dtype=bool)
267
268		destination[list(self._subject_ids)] = True
269
270		return destination
271

NatLibFi / Annif

Pull Request — master (#614)

annif.corpus.subject.SubjectIndex.languages() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like