annif.corpus.document.TruncatingDocumentCorpus.__init__() - Code Metrics - Inspection of "Apply input_limit to texts when reading corpus doc..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#452)

unknown

created 2020-11-18 15:52 UTC

TruncatingDocumentCorpus.__init__() A

↳ Parent: annif.corpus.document

Complexity

Conditions

Size

Total Lines	3
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	3
dl	0
loc	3
rs	10
c	0
b	0
f	0
cc	1
nop	3

"""Classes for supporting document corpora"""

import glob
import os.path
import re
import gzip
import annif.util
from .types import DocumentCorpus
from .subject import SubjectSet

# Module-level shorthand for the logger object exposed by the annif package
logger = annif.logger


class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus.

    Every ``*.txt`` file in the directory is treated as a document. Its
    subjects are read from a file with the same name but a ``.tsv``
    extension or, if that does not exist, a ``.key`` extension."""

    def __init__(self, path, require_subjects=False):
        """Store the directory path. If require_subjects is True, text files
        without an accompanying subject file are skipped on iteration."""
        self.path = path
        self.require_subjects = require_subjects

    def __iter__(self):
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If there is no key file and
        require_subjects is False, the subjectfile will be returned as None."""

        for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
            # .tsv takes precedence over .key when both exist
            for suffix in ('.tsv', '.key'):
                subjectfilename = re.sub(r'\.txt$', suffix, filename)
                if os.path.exists(subjectfilename):
                    yield (filename, subjectfilename)
                    break
            else:
                # neither kind of subject file was found
                if not self.require_subjects:
                    yield (filename, None)

    @property
    def documents(self):
        """Yield a document for each (docfile, subjectfile) pair produced by
        iterating this directory. A document whose subject file is missing
        (only possible when require_subjects is False) is yielded with empty
        subject URIs and labels."""
        for docfilename, keyfilename in self:
            with open(docfilename, errors='replace',
                      encoding='utf-8-sig') as docfile:
                text = docfile.read()
            if keyfilename is None:
                # __iter__ reports a missing subject file as None; passing
                # that to open() would raise TypeError, so emit the document
                # without subjects instead.
                yield self._create_document(text=text,
                                            uris=[],
                                            labels=[])
                continue
            with open(keyfilename, encoding='utf-8-sig') as keyfile:
                subjects = SubjectSet.from_string(keyfile.read())
            yield self._create_document(text=text,
                                        uris=subjects.subject_uris,
                                        labels=subjects.subject_labels)


class DocumentFile(DocumentCorpus):
    """A corpus of documents with subjects, read from a TSV file.

    Each line consists of the document text, a tab character, and the
    subject URIs separated by whitespace. A path ending in ``.gz`` is read
    through gzip."""

    def __init__(self, path):
        self.path = path

    @property
    def documents(self):
        """Yield a document for every line of the file that contains a
        tab; other lines are logged and skipped."""
        open_fn = gzip.open if self.path.endswith('.gz') else open
        with open_fn(self.path, mode='rt', encoding='utf-8-sig') as handle:
            for row in handle:
                yield from self._parse_tsv_line(row)

    def _parse_tsv_line(self, line):
        """Yield the document described by one TSV line, or nothing (after
        logging a warning) if the line has no tab separator."""
        if '\t' not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"',
                           line.rstrip())
            return
        # split on the first tab only; later tabs belong to the URI field
        text, _, uri_field = line.partition('\t')
        cleaned_uris = [annif.util.cleanup_uri(uri)
                        for uri in uri_field.split()]
        yield self._create_document(text=text,
                                    uris=cleaned_uris,
                                    labels=[])


class DocumentList(DocumentCorpus):
    """A document corpus backed by a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped Document objects one by one, unchanged."""
        for document in self._documents:
            yield document


class TruncatingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another document corpus but truncates the
    documents to a given length"""

    def __init__(self, corpus, limit):
        """Wrap the given corpus; limit is the maximum number of characters
        of text kept for each document."""
        # Keep the corpus itself rather than corpus.documents: that property
        # may return a one-shot generator, which would leave this wrapper
        # empty after the first pass over its documents.
        self._orig_corpus = corpus
        self._limit = limit

    @property
    def documents(self):
        """Yield the documents of the wrapped corpus with their text cut to
        at most the configured limit; URIs and labels are passed through
        unchanged. The wrapped corpus is re-read on every access."""
        for doc in self._orig_corpus.documents:
            yield self._create_document(text=doc.text[:self._limit],
                                        uris=doc.uris,
                                        labels=doc.labels)


1			"""Classes for supporting document corpora"""
2
3			import glob
4			import os.path
5			import re
6			import gzip
7			import annif.util
8			from .types import DocumentCorpus
9			from .subject import SubjectSet
10
11			logger = annif.logger
12
13
14			class DocumentDirectory(DocumentCorpus):
15			"""A directory of files as a full text document corpus"""
16
17			def __init__(self, path, require_subjects=False):
18			self.path = path
19			self.require_subjects = require_subjects
20
21			def __iter__(self):
22			"""Iterate through the directory, yielding tuples of (docfile,
23			subjectfile) containing file paths. If there is no key file and
24			require_subjects is False, the subjectfile will be returned as None."""
25
26			for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
27			tsvfilename = re.sub(r'\.txt$', '.tsv', filename)
28			if os.path.exists(tsvfilename):
29			yield (filename, tsvfilename)
30			continue
31			keyfilename = re.sub(r'\.txt$', '.key', filename)
32			if os.path.exists(keyfilename):
33			yield (filename, keyfilename)
34			continue
35			if not self.require_subjects:
36			yield (filename, None)
37
38			@property
39			def documents(self):
40			for docfilename, keyfilename in self:
41			with open(docfilename, errors='replace',
42			encoding='utf-8-sig') as docfile:
43			text = docfile.read()
44			with open(keyfilename, encoding='utf-8-sig') as keyfile:
45			subjects = SubjectSet.from_string(keyfile.read())
46			yield self._create_document(text=text,
47			uris=subjects.subject_uris,
48			labels=subjects.subject_labels)
49
50
51			class DocumentFile(DocumentCorpus):
52			"""A TSV file as a corpus of documents with subjects"""
53
54			def __init__(self, path):
55			self.path = path
56
57			@property
58			def documents(self):
59			if self.path.endswith('.gz'):
60			opener = gzip.open
61			else:
62			opener = open
63			with opener(self.path, mode='rt', encoding='utf-8-sig') as tsvfile:
64			for line in tsvfile:
65			yield from self._parse_tsv_line(line)
66
67			def _parse_tsv_line(self, line):
68			if '\t' in line:
69			text, uris = line.split('\t', maxsplit=1)
70			subjects = [annif.util.cleanup_uri(uri)
71			for uri in uris.split()]
72			yield self._create_document(text=text,
73			uris=subjects,
74			labels=[])
75			else:
76			logger.warning('Skipping invalid line (missing tab): "%s"',
77			line.rstrip())
78
79
80			class DocumentList(DocumentCorpus):
81			"""A document corpus based on a list of other iterable of Document
82			objects"""
83
84			def __init__(self, documents):
85			self._documents = documents
86
87			@property
88			def documents(self):
89			yield from self._documents
90
91
92			class TruncatingDocumentCorpus(DocumentCorpus):
93			"""A document corpus that wraps another document corpus but truncates the
94			documents to a given length"""
95
96			def __init__(self, corpus, limit):
97			self._documents = corpus.documents
98			self._limit = limit
99
100			@property
101			def documents(self):
102			for doc in self._documents:
103			yield self._create_document(text=doc.text[:self._limit],
104			uris=doc.uris,
105			labels=doc.labels)
106

NatLibFi / Annif

Pull Request — master (#452)

TruncatingDocumentCorpus.__init__() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

TruncatingDocumentCorpus.__init__() A