annif.corpus.document.DocumentDirectory.__iter__() - Code Metrics - Inspection of "Support for batch suggest operations in suggest an..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#663)

by Juho

created 2023-02-01 09:19 UTC

annif.corpus.document.DocumentDirectory.__iter__() A

↳ Parent: annif.corpus.document

Complexity

Conditions

Size

Total Lines	16
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	12
dl	0
loc	16
rs	9.3333
c	0
b	0
f	0
cc	5
nop	1

"""Classes for supporting document corpora"""

import glob
import gzip
import os.path
import re
from itertools import islice

import annif.util

from .subject import SubjectSet
from .types import Document, DocumentCorpus

# Module-level alias for the logger object exposed as annif.logger; used below
# to warn about TSV lines that cannot be parsed.
logger = annif.logger


class DocumentDirectory(DocumentCorpus):
    """Full text document corpus read from the ``*.txt`` files of a directory.

    A text file may be accompanied by a subject file that has the same name
    but a ``.tsv`` or ``.key`` extension instead of ``.txt``.
    """

    def __init__(self, path, subject_index, language, require_subjects=False):
        # Directory that is scanned for "*.txt" files.
        self.path = path
        # Both are handed to SubjectSet.from_string when a subject file is read.
        self.subject_index = subject_index
        self.language = language
        # When true, text files that have no subject file are left out.
        self.require_subjects = require_subjects

    def __iter__(self):
        """Yield (docfile, subjectfile) path tuples in sorted file name order.

        For every text file a ``.tsv`` subject file is looked up first and a
        ``.key`` file second. When neither exists the subjectfile is None,
        and the tuple is produced only if require_subjects is False."""

        pattern = os.path.join(self.path, "*.txt")
        for docpath in sorted(glob.glob(pattern)):
            subjectpath = None
            # The first extension whose file exists wins; .tsv has priority.
            for extension in (".tsv", ".key"):
                candidate = re.sub(r"\.txt$", extension, docpath)
                if os.path.exists(candidate):
                    subjectpath = candidate
                    break
            if subjectpath is not None or not self.require_subjects:
                yield (docpath, subjectpath)

    @property
    def documents(self):
        """Yield one Document per (docfile, subjectfile) pair of the directory.

        The subject_set of a document is None when it has no subject file."""

        for docpath, subjectpath in self:
            # Undecodable bytes in the text are replaced instead of raising.
            with open(docpath, errors="replace", encoding="utf-8-sig") as infile:
                text = infile.read()
            subject_set = None
            if subjectpath is not None:
                with open(subjectpath, encoding="utf-8-sig") as infile:
                    subject_set = SubjectSet.from_string(
                        infile.read(), self.subject_index, self.language
                    )
            yield Document(text=text, subject_set=subject_set)


class DocumentFile(DocumentCorpus):
    """Corpus of documents with subjects, stored one per line in a TSV file.

    Each line holds the document text, a tab, and then whitespace separated
    subject URIs. Files whose path ends in ``.gz`` are read through gzip.
    """

    def __init__(self, path, subject_index):
        # Path of the TSV file (optionally gzip compressed).
        self.path = path
        # Used to look up each subject URI found in the file (via by_uri).
        self.subject_index = subject_index

    @property
    def documents(self):
        """Yield a Document for every parseable line of the file."""
        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
            for line in tsvfile:
                yield from self._parse_tsv_line(line)

    def _parse_tsv_line(self, line):
        """Yield the Document described by one TSV line.

        A line without a tab yields nothing and is reported with a warning."""
        if "\t" not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
            return
        # Only the first tab separates text from URIs; later tabs are treated
        # as ordinary whitespace between URIs.
        text, _, uris = line.partition("\t")
        subject_ids = set()
        for uri in uris.split():
            subject_ids.add(self.subject_index.by_uri(annif.util.cleanup_uri(uri)))
        yield Document(text=text, subject_set=SubjectSet(subject_ids))


class DocumentList(DocumentCorpus):
    """Document corpus that wraps a list, or any other iterable, of Document
    objects that already exist in memory or are produced elsewhere"""

    def __init__(self, documents):
        # Kept as given; it is iterated anew on every access to `documents`,
        # so a one-shot iterator can be consumed only once.
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped Document objects in their original order."""
        yield from self._documents


class TransformingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that passes the text of every
    document through a transform function; subject sets are kept as they are"""

    def __init__(self, corpus, transform_fn):
        # Corpus whose documents are transformed.
        self._orig_corpus = corpus
        # Callable that receives a document text and returns the new text.
        self._transform_fn = transform_fn

    @property
    def documents(self):
        """Yield a new Document, with transformed text, for each original."""
        for original in self._orig_corpus.documents:
            new_text = self._transform_fn(original.text)
            yield Document(text=new_text, subject_set=original.subject_set)


class LimitingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that stops after a given
    number of documents"""

    def __init__(self, corpus, docs_limit):
        # Corpus whose documents are passed through.
        self._orig_corpus = corpus
        # Stop argument given to itertools.islice.
        self.docs_limit = docs_limit

    @property
    def documents(self):
        """Yield at most docs_limit documents from the wrapped corpus."""
        yield from islice(self._orig_corpus.documents, self.docs_limit)


class BatchingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that can additionally hand out
    its documents in batches; each batch is a list of documents."""

    def __init__(self, corpus):
        # Corpus whose documents are passed through and batched.
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Yield the documents of the wrapped corpus unchanged."""
        yield from self._orig_corpus.documents

    def doc_batches(self, batch_size):
        """Yield lists of up to batch_size consecutive documents.

        Only the last batch can be shorter than batch_size. Iteration ends
        when the wrapped corpus has no more documents; an empty batch is
        never yielded."""
        remaining = iter(self.documents)

        def next_batch():
            return list(islice(remaining, batch_size))

        # Two-argument iter() calls next_batch until it returns the sentinel,
        # i.e. an empty list, which means the corpus is exhausted.
        for docs_batch in iter(next_batch, []):
            yield docs_batch


1			"""Clases for supporting document corpora"""
2
3			import glob
4			import gzip
5			import os.path
6			import re
7			from itertools import islice
8
9			import annif.util
10
11			from .subject import SubjectSet
12			from .types import Document, DocumentCorpus
13
14			logger = annif.logger
15
16
17			class DocumentDirectory(DocumentCorpus):
18			"""A directory of files as a full text document corpus"""
19
20			def __init__(self, path, subject_index, language, require_subjects=False):
21			self.path = path
22			self.subject_index = subject_index
23			self.language = language
24			self.require_subjects = require_subjects
25
26			def __iter__(self):
27			"""Iterate through the directory, yielding tuples of (docfile,
28			subjectfile) containing file paths. If there is no key file and
29			require_subjects is False, the subjectfile will be returned as None."""
30
31			for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
32			tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
33			if os.path.exists(tsvfilename):
34			yield (filename, tsvfilename)
35			continue
36			keyfilename = re.sub(r"\.txt$", ".key", filename)
37			if os.path.exists(keyfilename):
38			yield (filename, keyfilename)
39			continue
40			if not self.require_subjects:
41			yield (filename, None)
42
43			@property
44			def documents(self):
45			for docfilename, keyfilename in self:
46			with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
47			text = docfile.read()
48			if keyfilename is None:
49			yield Document(text=text, subject_set=None)
50			continue
51			with open(keyfilename, encoding="utf-8-sig") as keyfile:
52			subjects = SubjectSet.from_string(
53			keyfile.read(), self.subject_index, self.language
54			)
55			yield Document(text=text, subject_set=subjects)
56
57
58			class DocumentFile(DocumentCorpus):
59			"""A TSV file as a corpus of documents with subjects"""
60
61			def __init__(self, path, subject_index):
62			self.path = path
63			self.subject_index = subject_index
64
65			@property
66			def documents(self):
67			if self.path.endswith(".gz"):
68			opener = gzip.open
69			else:
70			opener = open
71			with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
72			for line in tsvfile:
73			yield from self._parse_tsv_line(line)
74
75			def _parse_tsv_line(self, line):
76			if "\t" in line:
77			text, uris = line.split("\t", maxsplit=1)
78			subject_ids = {
79			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
80			for uri in uris.split()
81			}
82			yield Document(text=text, subject_set=SubjectSet(subject_ids))
83			else:
84			logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
85
86
87			class DocumentList(DocumentCorpus):
88			"""A document corpus based on a list of other iterable of Document
89			objects"""
90
91			def __init__(self, documents):
92			self._documents = documents
93
94			@property
95			def documents(self):
96			yield from self._documents
97
98
99			class TransformingDocumentCorpus(DocumentCorpus):
100			"""A document corpus that wraps another document corpus but transforms the
101			documents using a given transform function"""
102
103			def __init__(self, corpus, transform_fn):
104			self._orig_corpus = corpus
105			self._transform_fn = transform_fn
106
107			@property
108			def documents(self):
109			for doc in self._orig_corpus.documents:
110			yield Document(
111			text=self._transform_fn(doc.text), subject_set=doc.subject_set
112			)
113
114
115			class LimitingDocumentCorpus(DocumentCorpus):
116			"""A document corpus that wraps another document corpus but limits the
117			number of documents to a given limit"""
118
119			def __init__(self, corpus, docs_limit):
120			self._orig_corpus = corpus
121			self.docs_limit = docs_limit
122
123			@property
124			def documents(self):
125			for doc in islice(self._orig_corpus.documents, self.docs_limit):
126			yield doc
127
128
129			class BatchingDocumentCorpus(DocumentCorpus):
130			"""A document corpus that wraps another document corpus to allow iterating over the
131			documents in batches of a given size; a batch is a list of documents."""
132
133			def __init__(self, corpus):
134			self._orig_corpus = corpus
135
136			@property
137			def documents(self):
138			yield from self._orig_corpus.documents
139
140			def doc_batches(self, batch_size):
141			it = iter(self.documents)
142			while True:
143			docs_batch = list(islice(it, batch_size))
144			if not docs_batch:
145			return
146			yield docs_batch
147

NatLibFi / Annif

Pull Request — master (#663)

annif.corpus.document.DocumentDirectory.__iter__() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

annif.corpus.document.DocumentDirectory.__iter__() A