annif.corpus.subject.SubjectSet.has_uris() - Code Metrics - Inspection of "Initial support for online learning in vw_multi ba..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#257)

by Osma

created 2019-02-27 11:41 UTC

annif.corpus.subject.SubjectSet.has_uris() A

↳ Parent: annif.corpus.subject

Complexity

Conditions

Size

Total Lines	3
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	2
dl	0
loc	3
rs	10
c	0
b	0
f	0
cc	1
nop	1

"""Classes for supporting subject corpora expressed as directories or files"""

import glob
import os.path
import annif.util
import numpy as np
from annif import logger
from .types import Subject, SubjectCorpus
from .convert import SubjectToDocumentCorpusMixin


class SubjectDirectory(SubjectCorpus, SubjectToDocumentCorpusMixin):
    """Subject corpus backed by a directory that holds one .txt file per
    subject.

    The first line of each file is split at its first space into the
    subject URI and label; the remaining lines form the subject text."""

    def __init__(self, path):
        """Remember the directory path and collect the .txt files found
        in it, in sorted order."""
        self.path = path
        pattern = os.path.join(path, '*.txt')
        self._filenames = sorted(glob.glob(pattern))

    @property
    def subjects(self):
        """Lazily yield one Subject for each .txt file collected at
        construction time."""
        for txt_path in self._filenames:
            with open(txt_path) as infile:
                header = infile.readline().strip()
                # split only once so that the label may contain spaces
                uri, label = header.split(' ', 1)
                body_lines = infile.readlines()
                yield Subject(uri=uri, label=label,
                              text=' '.join(body_lines))


class SubjectFileTSV(SubjectCorpus, SubjectToDocumentCorpusMixin):
    """Subject corpus read from a single TSV file with one subject per
    line: a URI followed by whitespace and the label."""

    def __init__(self, path):
        """Remember the path of the file; it is only opened when the
        subjects are iterated."""
        self.path = path

    @property
    def subjects(self):
        """Lazily yield a Subject (without text) for every line of the
        file."""
        with open(self.path) as infile:
            for row in infile:
                # the first run of whitespace separates URI from label
                fields = row.strip().split(None, 1)
                uri, label = fields
                yield Subject(uri=annif.util.cleanup_uri(uri),
                              label=label, text=None)


class SubjectIndex:
    """Two-way mapping between integer subject IDs and the URIs and labels
    of the subjects they stand for."""

    def __init__(self, corpus):
        """Build the index from the subjects of a corpus, numbering them
        from zero in iteration order."""
        self._uris = []
        self._labels = []
        self._uri_idx = {}
        self._label_idx = {}
        for subject in corpus.subjects:
            # the next free ID equals the number of subjects stored so far
            next_id = len(self._uris)
            self._uris.append(subject.uri)
            self._labels.append(subject.label)
            self._uri_idx[subject.uri] = next_id
            self._label_idx[subject.label] = next_id

    def __len__(self):
        """return the number of subjects in the index"""
        return len(self._uris)

    def __getitem__(self, subject_id):
        """return the (URI, label) pair stored for a subject ID"""
        uri = self._uris[subject_id]
        label = self._labels[subject_id]
        return (uri, label)

    def by_uri(self, uri):
        """return the subject ID for a URI, or None (after logging a
        warning) if the URI is not in the index"""
        # stored IDs are always integers, so None can only mean "missing"
        subject_id = self._uri_idx.get(uri)
        if subject_id is None:
            logger.warning('Unknown subject URI <%s>', uri)
        return subject_id

    def by_label(self, label):
        """return the subject ID for a label, or None (after logging a
        warning) if the label is not in the index"""
        subject_id = self._label_idx.get(label)
        if subject_id is None:
            logger.warning('Unknown subject label "%s"', label)
        return subject_id

    def save(self, path):
        """Write the index to a file, one "<URI>", tab, label line per
        subject, in subject ID order."""

        with open(path, 'w') as outfile:
            for uri, label in zip(self._uris, self._labels):
                print("<{}>\t{}".format(uri, label), file=outfile)

    @classmethod
    def load(cls, path):
        """Create a subject index from a TSV file and return it."""

        return cls(SubjectFileTSV(path))


class SubjectSet:
    """The subjects assigned to a single document, held as a set of URIs
    and a set of labels."""

    def __init__(self, subj_data=None):
        """Create a SubjectSet, optionally filling it from a tuple
        (URIs, labels); any falsy value gives an empty set."""

        if subj_data:
            uris, labels = subj_data
        else:
            uris, labels = [], []
        self.subject_uris = set(uris)
        self.subject_labels = set(labels)

    @classmethod
    def from_string(cls, subj_data):
        """Create a SubjectSet by parsing a string that has one subject
        per line."""
        result = cls()
        lines = subj_data.splitlines()
        for entry in lines:
            result._parse_line(entry)
        return result

    def _parse_line(self, line):
        """Record the subject found on one tab-separated line: every
        <URI> field up to the first label, then that label; anything
        after the label is ignored."""
        for field in line.split("\t"):
            token = field.strip()
            if not token:
                continue
            is_uri = token.startswith('<') and token.endswith('>')
            if not is_uri:
                # the first non-URI field is the label and ends the line
                self.subject_labels.add(token)
                return
            self.subject_uris.add(token[1:-1])

    def has_uris(self):
        """returns True if there are at least as many URIs as labels,
        i.e. the URIs for all subjects are taken to be known"""
        uri_count = len(self.subject_uris)
        label_count = len(self.subject_labels)
        return not uri_count < label_count

    def as_vector(self, subject_index):
        """Return the subjects as a one-dimensional NumPy int8 array in
           sklearn multilabel indicator format, with a 1 at the ID of each
           subject that the given subject index recognizes."""

        vector = np.zeros(len(subject_index), dtype=np.int8)
        # resolve through URIs when they are complete, otherwise labels;
        # the generators keep the index lookups lazy, one per subject
        if self.has_uris():
            subject_ids = (subject_index.by_uri(uri)
                           for uri in self.subject_uris)
        else:
            subject_ids = (subject_index.by_label(label)
                           for label in self.subject_labels)
        for subject_id in subject_ids:
            if subject_id is not None:
                vector[subject_id] = 1
        return vector


1			"""Classes for supporting subject corpora expressed as directories or files"""
2
3			import glob
4			import os.path
5			import annif.util
6			import numpy as np
7			from annif import logger
8			from .types import Subject, SubjectCorpus
9			from .convert import SubjectToDocumentCorpusMixin
10
11
12			class SubjectDirectory(SubjectCorpus, SubjectToDocumentCorpusMixin):
13			"""A subject corpus in the form of a directory with .txt files."""
14
15			def __init__(self, path):
16			self.path = path
17			self._filenames = sorted(glob.glob(os.path.join(path, '*.txt')))
18
19			@property
20			def subjects(self):
21			for filename in self._filenames:
22			with open(filename) as subjfile:
23			uri, label = subjfile.readline().strip().split(' ', 1)
24			text = ' '.join(subjfile.readlines())
25			yield Subject(uri=uri, label=label, text=text)
26
27
28			class SubjectFileTSV(SubjectCorpus, SubjectToDocumentCorpusMixin):
29			"""A subject corpus stored in a TSV file."""
30
31			def __init__(self, path):
32			self.path = path
33
34			@property
35			def subjects(self):
36			with open(self.path) as subjfile:
37			for line in subjfile:
38			uri, label = line.strip().split(None, 1)
39			clean_uri = annif.util.cleanup_uri(uri)
40			yield Subject(uri=clean_uri, label=label, text=None)
41
42
43			class SubjectIndex:
44			"""An index that remembers the associations between integers subject IDs
45			and their URIs and labels."""
46
47			def __init__(self, corpus):
48			"""Initialize the subject index from a subject corpus."""
49			self._uris = []
50			self._labels = []
51			self._uri_idx = {}
52			self._label_idx = {}
53			for subject_id, subject in enumerate(corpus.subjects):
54			self._uris.append(subject.uri)
55			self._labels.append(subject.label)
56			self._uri_idx[subject.uri] = subject_id
57			self._label_idx[subject.label] = subject_id
58
59			def __len__(self):
60			return len(self._uris)
61
62			def __getitem__(self, subject_id):
63			return (self._uris[subject_id], self._labels[subject_id])
64
65			def by_uri(self, uri):
66			"""return the subject index of a subject by its URI"""
67			try:
68			return self._uri_idx[uri]
69			except KeyError:
70			logger.warning('Unknown subject URI <%s>', uri)
71			return None
72
73			def by_label(self, label):
74			"""return the subject index of a subject by its label"""
75			try:
76			return self._label_idx[label]
77			except KeyError:
78			logger.warning('Unknown subject label "%s"', label)
79			return None
80
81			def save(self, path):
82			"""Save this subject index into a file."""
83
84			with open(path, 'w') as subjfile:
85			for subject_id in range(len(self)):
86			line = "<{}>\t{}".format(
87			self._uris[subject_id], self._labels[subject_id])
88			print(line, file=subjfile)
89
90			@classmethod
91			def load(cls, path):
92			"""Load a subject index from a TSV file and return it."""
93
94			corpus = SubjectFileTSV(path)
95			return cls(corpus)
96
97
98			class SubjectSet:
99			"""Represents a set of subjects for a document."""
100
101			def __init__(self, subj_data=None):
102			"""Create a SubjectSet and optionally initialize it from a tuple
103			(URIs, labels)"""
104
105			uris, labels = subj_data or ([], [])
106			self.subject_uris = set(uris)
107			self.subject_labels = set(labels)
108
109			@classmethod
110			def from_string(cls, subj_data):
111			sset = cls()
112			for line in subj_data.splitlines():
113			sset._parse_line(line)
114			return sset
115
116			def _parse_line(self, line):
117			vals = line.split("\t")
118			for val in vals:
119			val = val.strip()
120			if val == '':
121			continue
122			if val.startswith('<') and val.endswith('>'): # URI
123			self.subject_uris.add(val[1:-1])
124			continue
125			self.subject_labels.add(val)
126			return
127
128			def has_uris(self):
129			"""returns True if the URIs for all subjects are known"""
130			return len(self.subject_uris) >= len(self.subject_labels)
131
132			def as_vector(self, subject_index):
133			"""Return the hits as a one-dimensional NumPy array in sklearn
134			multilabel indicator format, using a subject index as the source
135			of subjects."""
136
137			vector = np.zeros(len(subject_index), dtype=np.int8)
138			if self.has_uris():
139			for uri in self.subject_uris:
140			subject_id = subject_index.by_uri(uri)
141			if subject_id is not None:
142			vector[subject_id] = 1
143			else:
144			for label in self.subject_labels:
145			subject_id = subject_index.by_label(label)
146			if subject_id is not None:
147			vector[subject_id] = 1
148			return vector
149

NatLibFi / Annif

Pull Request — master (#257)

annif.corpus.subject.SubjectSet.has_uris() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like