annif.corpus.document.TransformingDocumentCorpus.documents() - Code Metrics - Inspection of "Support for adding input-transformation operations" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#496)

by Juho

created 2021-08-09 14:32 UTC

TransformingDocumentCorpus.documents() A

↳ Parent: annif.corpus.document

Complexity

Conditions

Size

Total Lines	7
Code Lines	7

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	7
nop	1
dl	0
loc	7
rs	10
c	0
b	0
f	0

"""Classes for supporting document corpora"""

import glob
import os.path
import re
import gzip
import annif.util
from itertools import islice
from .types import DocumentCorpus
from .subject import SubjectSet

# Reuse the logger object exposed as annif.logger for messages emitted by
# the corpus classes in this module.
logger = annif.logger


class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus.

    Every ``*.txt`` file in the directory is treated as one document. Its
    subjects are read from a file with the same name but a ``.tsv``
    extension or, if that does not exist, a ``.key`` extension.
    """

    def __init__(self, path, require_subjects=False):
        """Remember the directory to read.

        path -- directory containing the ``*.txt`` document files
        require_subjects -- if true, documents that have no subject file
            are left out of the iteration
        """
        self.path = path
        self.require_subjects = require_subjects

    def __iter__(self):
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If there is no key file and
        require_subjects is False, the subjectfile will be returned as None."""

        for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
            # A .tsv subject file takes precedence over a .key file.
            for suffix in ('.tsv', '.key'):
                subjectfilename = re.sub(r'\.txt$', suffix, filename)
                if os.path.exists(subjectfilename):
                    yield (filename, subjectfilename)
                    break
            else:
                # Neither kind of subject file exists for this document.
                if not self.require_subjects:
                    yield (filename, None)

    @property
    def documents(self):
        """Generate one document per text file found by iterating over self.

        The document text is read with undecodable bytes replaced. When a
        document has no subject file (possible only when require_subjects
        is False) it is yielded with empty subject URIs and labels rather
        than failing on an attempt to open a missing file.
        """
        for docfilename, keyfilename in self:
            with open(docfilename, errors='replace',
                      encoding='utf-8-sig') as docfile:
                text = docfile.read()
            if keyfilename is None:
                # open(None) would raise TypeError; emit the document
                # without subjects instead. Empty lists mirror the
                # labels=[] used by DocumentFile in this module.
                # NOTE(review): confirm consumers accept lists here.
                yield self._create_document(text=text,
                                            uris=[],
                                            labels=[])
                continue
            with open(keyfilename, encoding='utf-8-sig') as keyfile:
                subjects = SubjectSet.from_string(keyfile.read())
            yield self._create_document(text=text,
                                        uris=subjects.subject_uris,
                                        labels=subjects.subject_labels)


class DocumentFile(DocumentCorpus):
    """Document corpus read from a single TSV file, optionally
    gzip-compressed, holding one document and its subject URIs per line"""

    def __init__(self, path):
        self.path = path

    @property
    def documents(self):
        """Generate a document for every well-formed line of the file. A
        path ending in '.gz' is read through gzip, any other path as a
        plain text file."""

        open_fn = gzip.open if self.path.endswith('.gz') else open
        with open_fn(self.path, mode='rt', encoding='utf-8-sig') as infile:
            for row in infile:
                yield from self._parse_tsv_line(row)

    def _parse_tsv_line(self, line):
        """Yield the document described by one line of the file. The part
        before the first tab is the text; the rest is split on whitespace
        into URIs, each passed through annif.util.cleanup_uri. A line
        without a tab is logged as a warning and yields nothing."""

        if '\t' not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"',
                           line.rstrip())
            return

        text, _, uri_field = line.partition('\t')
        cleaned_uris = [annif.util.cleanup_uri(token)
                        for token in uri_field.split()]
        yield self._create_document(text=text,
                                    uris=cleaned_uris,
                                    labels=[])


class DocumentList(DocumentCorpus):
    """A document corpus backed by a list or other iterable of Document
    objects that is already available in memory"""

    def __init__(self, documents):
        self._documents = documents

    @property
    def documents(self):
        """Generate the wrapped documents one by one, unchanged."""
        wrapped = iter(self._documents)
        yield from wrapped


class TransformingDocumentCorpus(DocumentCorpus):
    """Wrapper corpus that yields the documents of another corpus with
    their text passed through a given transform function"""

    def __init__(self, corpus, transform_fn):
        self._transform_fn = transform_fn
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Generate a new document for each document of the wrapped corpus.
        Only the text is transformed; URIs and labels are carried over."""

        for original in self._orig_corpus.documents:
            new_text = self._transform_fn(original.text)
            yield self._create_document(text=new_text,
                                        uris=original.uris,
                                        labels=original.labels)


class LimitingDocumentCorpus(DocumentCorpus):
    """Wrapper corpus that yields at most a fixed number of documents
    from another document corpus"""

    def __init__(self, corpus, docs_limit):
        self.docs_limit = docs_limit
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Generate re-created copies of the first docs_limit documents of
        the wrapped corpus, in their original order."""

        limited = islice(self._orig_corpus.documents, self.docs_limit)
        for original in limited:
            yield self._create_document(text=original.text,
                                        uris=original.uris,
                                        labels=original.labels)


1			"""Classes for supporting document corpora"""
2
3			import glob
4			import os.path
5			import re
6			import gzip
7			import annif.util
8			from itertools import islice
9			from .types import DocumentCorpus
10			from .subject import SubjectSet
11
12			logger = annif.logger
13
14
15			class DocumentDirectory(DocumentCorpus):
16			"""A directory of files as a full text document corpus"""
17
18			def __init__(self, path, require_subjects=False):
19			self.path = path
20			self.require_subjects = require_subjects
21
22			def __iter__(self):
23			"""Iterate through the directory, yielding tuples of (docfile,
24			subjectfile) containing file paths. If there is no key file and
25			require_subjects is False, the subjectfile will be returned as None."""
26
27			for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
28			tsvfilename = re.sub(r'\.txt$', '.tsv', filename)
29			if os.path.exists(tsvfilename):
30			yield (filename, tsvfilename)
31			continue
32			keyfilename = re.sub(r'\.txt$', '.key', filename)
33			if os.path.exists(keyfilename):
34			yield (filename, keyfilename)
35			continue
36			if not self.require_subjects:
37			yield (filename, None)
38
39			@property
40			def documents(self):
41			for docfilename, keyfilename in self:
42			with open(docfilename, errors='replace',
43			encoding='utf-8-sig') as docfile:
44			text = docfile.read()
45			with open(keyfilename, encoding='utf-8-sig') as keyfile:
46			subjects = SubjectSet.from_string(keyfile.read())
47			yield self._create_document(text=text,
48			uris=subjects.subject_uris,
49			labels=subjects.subject_labels)
50
51
52			class DocumentFile(DocumentCorpus):
53			"""A TSV file as a corpus of documents with subjects"""
54
55			def __init__(self, path):
56			self.path = path
57
58			@property
59			def documents(self):
60			if self.path.endswith('.gz'):
61			opener = gzip.open
62			else:
63			opener = open
64			with opener(self.path, mode='rt', encoding='utf-8-sig') as tsvfile:
65			for line in tsvfile:
66			yield from self._parse_tsv_line(line)
67
68			def _parse_tsv_line(self, line):
69			if '\t' in line:
70			text, uris = line.split('\t', maxsplit=1)
71			subjects = [annif.util.cleanup_uri(uri)
72			for uri in uris.split()]
73			yield self._create_document(text=text,
74			uris=subjects,
75			labels=[])
76			else:
77			logger.warning('Skipping invalid line (missing tab): "%s"',
78			line.rstrip())
79
80
81			class DocumentList(DocumentCorpus):
82			"""A document corpus based on a list of other iterable of Document
83			objects"""
84
85			def __init__(self, documents):
86			self._documents = documents
87
88			@property
89			def documents(self):
90			yield from self._documents
91
92
93			class TransformingDocumentCorpus(DocumentCorpus):
94			"""A document corpus that wraps another document corpus but transforms the
95			documents using a given transform function"""
96
97			def __init__(self, corpus, transform_fn):
98			self._orig_corpus = corpus
99			self._transform_fn = transform_fn
100
101			@property
102			def documents(self):
103			for doc in self._orig_corpus.documents:
104			yield self._create_document(
105			text=self._transform_fn(doc.text),
106			uris=doc.uris,
107			labels=doc.labels)
108
109
110			class LimitingDocumentCorpus(DocumentCorpus):
111			"""A document corpus that wraps another document corpus but limits the
112			number of documents to a given limit"""
113
114			def __init__(self, corpus, docs_limit):
115			self._orig_corpus = corpus
116			self.docs_limit = docs_limit
117
118			@property
119			def documents(self):
120			for doc in islice(self._orig_corpus.documents, self.docs_limit):
121			yield self._create_document(text=doc.text,
122			uris=doc.uris,
123			labels=doc.labels)
124

NatLibFi / Annif

Pull Request — master (#496)

TransformingDocumentCorpus.documents() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like