annif.corpus.document.DocumentDirectory.documents() - Code Metrics - Inspection of "Initial support for online learning in vw_multi ba..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#257)

by Osma

created 2019-02-27 11:41 UTC

DocumentDirectory.documents() A

↳ Parent: annif.corpus.document

Complexity

Conditions

Size

Total Lines	9
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	9
dl	0
loc	9
rs	9.95
c	0
b	0
f	0
cc	4
nop	1

"""Classes for supporting document corpora"""

import glob
import os.path
import re
import gzip
import annif.util
from .types import Document, DocumentCorpus
from .convert import DocumentToSubjectCorpusMixin
from .subject import SubjectSet


class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
    """A directory of files as a full text document corpus.

    Every ``*.txt`` file in the directory is treated as one document. Its
    subjects are read from a file with the same name but a ``.tsv``
    extension or, if that does not exist, a ``.key`` extension.
    """

    def __init__(self, path, require_subjects=False):
        """Store the corpus settings.

        path -- directory that is scanned for ``*.txt`` files
        require_subjects -- if True, documents that have no subject file
            are left out when iterating
        """
        self.path = path
        self.require_subjects = require_subjects

    def __iter__(self):
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If there is no subject file and
        require_subjects is False, the subjectfile will be returned as None.
        Documents are visited in sorted file name order."""

        for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
            # a .tsv subject file takes precedence over a .key file
            for extension in ('.tsv', '.key'):
                subjectfilename = re.sub(r'\.txt$', extension, filename)
                if os.path.exists(subjectfilename):
                    yield (filename, subjectfilename)
                    break
            else:
                # neither subject file exists
                if not self.require_subjects:
                    yield (filename, None)

    @property
    def documents(self):
        """Generate a Document object for each document file in the
        directory.

        The document text is read with undecodable bytes replaced. The
        subject file contents are parsed with SubjectSet.from_string. A
        document without a subject file (only possible when
        require_subjects is False) is parsed from an empty string instead
        of failing on a missing file name."""

        for docfilename, keyfilename in self:
            with open(docfilename, errors='replace') as docfile:
                text = docfile.read()
            if keyfilename is None:
                # __iter__ returns None when no subject file exists;
                # open(None) would raise TypeError, so parse empty data.
                # NOTE(review): assumes SubjectSet.from_string('') gives an
                # empty subject set -- confirm against SubjectSet.
                subjectdata = ''
            else:
                with open(keyfilename) as keyfile:
                    subjectdata = keyfile.read()
            subjects = SubjectSet.from_string(subjectdata)
            yield Document(text=text, uris=subjects.subject_uris,
                           labels=subjects.subject_labels)


class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
    """A TSV file as a corpus of documents with subjects.

    Each line holds the document text, a tab character, and then a
    whitespace separated list of subject URIs. A path ending in ``.gz`` is
    read as a gzip compressed file.
    """

    def __init__(self, path):
        """Store the path of the (optionally gzip compressed) TSV file."""
        self.path = path

    @property
    def documents(self):
        """Generate a Document object for each valid line of the TSV file.

        Lines that contain no tab character cannot be split into text and
        URIs. They are skipped (with a logged warning unless the line is
        blank) so that one malformed line does not abort the whole
        corpus."""

        # imported locally so that this block does not depend on a
        # module-level import being added
        import logging

        if self.path.endswith('.gz'):
            def opener(path):
                """open a gzip compressed file in text mode"""
                return gzip.open(path, mode='rt')
        else:
            opener = open

        with opener(self.path) as tsvfile:
            for line in tsvfile:
                if '\t' not in line:
                    # unpacking the split result would raise ValueError here
                    if line.strip():
                        logging.getLogger(__name__).warning(
                            'Skipping invalid line (missing tab): "%s"',
                            line.rstrip())
                    continue
                text, uris = line.split('\t', maxsplit=1)
                # uris.split() also discards the trailing newline
                subjects = [annif.util.cleanup_uri(uri)
                            for uri in uris.split()]
                yield Document(text=text, uris=subjects, labels=[])


class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
    """A document corpus backed by a list or any other iterable of Document
    objects"""

    def __init__(self, documents):
        """Keep a reference to the given iterable; it is not copied or
        consumed here."""
        self._documents = documents

    @property
    def documents(self):
        """Generate the wrapped Document objects one by one, in the order
        the underlying iterable provides them."""
        for document in self._documents:
            yield document


1			"""Clases for supporting document corpora"""
2
3			import glob
4			import os.path
5			import re
6			import gzip
7			import annif.util
8			from .types import Document, DocumentCorpus
9			from .convert import DocumentToSubjectCorpusMixin
10			from .subject import SubjectSet
11
12
13			class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
14			"""A directory of files as a full text document corpus"""
15
16			def __init__(self, path, require_subjects=False):
17			self.path = path
18			self.require_subjects = require_subjects
19
20			def __iter__(self):
21			"""Iterate through the directory, yielding tuples of (docfile,
22			subjectfile) containing file paths. If there is no key file and
23			require_subjects is False, the subjectfile will be returned as None."""
24
25			for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
26			tsvfilename = re.sub(r'\.txt$', '.tsv', filename)
27			if os.path.exists(tsvfilename):
28			yield (filename, tsvfilename)
29			continue
30			keyfilename = re.sub(r'\.txt$', '.key', filename)
31			if os.path.exists(keyfilename):
32			yield (filename, keyfilename)
33			continue
34			if not self.require_subjects:
35			yield (filename, None)
36
37			@property
38			def documents(self):
39			for docfilename, keyfilename in self:
40			with open(docfilename, errors='replace') as docfile:
41			text = docfile.read()
42			with open(keyfilename) as keyfile:
43			subjects = SubjectSet.from_string(keyfile.read())
44			yield Document(text=text, uris=subjects.subject_uris,
45			labels=subjects.subject_labels)
46
47
48			class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
49			"""A TSV file as a corpus of documents with subjects"""
50
51			def __init__(self, path):
52			self.path = path
53
54			@property
55			def documents(self):
56			if self.path.endswith('.gz'):
57			def opener(path):
58			"""open a gzip compressed file in text mode"""
59			return gzip.open(path, mode='rt')
60			else:
61			opener = open
62
63			with opener(self.path) as tsvfile:
64			for line in tsvfile:
65			text, uris = line.split('\t', maxsplit=1)
66			subjects = [annif.util.cleanup_uri(uri)
67			for uri in uris.split()]
68			yield Document(text=text, uris=subjects, labels=[])
69
70
71			class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
72			"""A document corpus based on a list of other iterable of Document
73			objects"""
74
75			def __init__(self, documents):
76			self._documents = documents
77
78			@property
79			def documents(self):
80			yield from self._documents
81

NatLibFi / Annif

Pull Request — master (#257)

DocumentDirectory.documents() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like