annif.vocab.subject_file   A
last analyzed

Complexity

Total Complexity 20

Size/Duplication

Total Lines 108
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 67
dl 0
loc 108
rs 10
c 0
b 0
f 0
wmc 20

11 Methods

Rating   Name   Duplication   Size   Complexity  
A VocabFileCSV.languages() 0 11 2
A VocabFileCSV.subjects() 0 6 3
A VocabFileCSV.save_skos() 0 4 1
A VocabFileTSV.languages() 0 3 1
A VocabFileCSV.__init__() 0 3 1
A VocabFileCSV.is_csv_file() 0 5 1
A VocabFileTSV.save_skos() 0 4 1
A VocabFileTSV._parse_line() 0 7 4
A VocabFileTSV.subjects() 0 5 3
A VocabFileTSV.__init__() 0 6 1
A VocabFileCSV._parse_row() 0 16 2
1
"""Classes for supporting vocabulary files in CSV or TSV format"""
2
3
from __future__ import annotations
4
5
import csv
6
import os.path
7
from typing import TYPE_CHECKING
8
9
import annif
10
import annif.util
11
12
from .skos import serialize_subjects_to_skos
13
from .types import Subject, VocabSource
14
15
if TYPE_CHECKING:
16
    from collections.abc import Generator, Iterator
17
18
19
class VocabFileTSV(VocabSource):
    """A monolingual subject vocabulary stored in a TSV file.

    Each non-blank line holds up to three tab-separated fields: the subject
    URI, an optional label (in the language given to the constructor) and
    an optional notation.
    """

    def __init__(self, path: str, language: str) -> None:
        """initialize the VocabFileTSV given a path to a TSV file and the
        language of the vocabulary"""

        self.path = path
        self.language = language

    def _parse_line(self, line: str) -> Iterator[Subject]:
        """Parse a single line of the TSV file, yielding at most one Subject.

        Blank or whitespace-only lines yield nothing, so stray empty lines
        in the file do not turn into subjects with an empty URI.
        """
        stripped = line.strip()
        if not stripped:
            return

        # split into at most three fields; any further tabs stay in the
        # third (notation) field
        vals = stripped.split("\t", 2)
        clean_uri = annif.util.cleanup_uri(vals[0])
        label = vals[1] if len(vals) >= 2 else None
        # a missing or empty label leaves the subject without any labels
        labels = {self.language: label} if label else None
        notation = vals[2] if len(vals) >= 3 else None
        yield Subject(uri=clean_uri, labels=labels, notation=notation)

    @property
    def languages(self) -> list[str]:
        """The languages of the vocabulary: only the one given to the
        constructor."""
        return [self.language]

    @property
    def subjects(self) -> Generator:
        """Generate the subjects of the vocabulary by reading the TSV file
        line by line. The file is opened as UTF-8; a leading byte order
        mark is ignored."""
        with open(self.path, encoding="utf-8-sig") as subjfile:
            for line in subjfile:
                yield from self._parse_line(line)

    def save_skos(self, path: str) -> None:
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name."""
        serialize_subjects_to_skos(self.subjects, path)
51
52
53
class VocabFileCSV(VocabSource):
    """A multilingual subject vocabulary stored in a CSV file.

    The first row of the file is a header naming the columns. The "uri"
    column holds the subject URI, the optional "notation" column holds the
    notation, and every column whose name starts with "label_" holds the
    labels in the language given by the rest of the column name.
    """

    # prefix of the CSV columns that hold labels; the remainder of the
    # column name is the language of the labels
    _LABEL_PREFIX = "label_"

    def __init__(self, path: str) -> None:
        """initialize the VocabFileCSV given a path to a CSV file"""
        self.path = path

    @classmethod
    def _label_language(cls, fname: str) -> str:
        """Return the language part of a label column name.

        Only the leading prefix is removed, so a later occurrence of the
        prefix text inside the column name is left untouched.
        """
        return fname[len(cls._LABEL_PREFIX) :]

    def _parse_row(self, row: dict[str, str]) -> Iterator[Subject]:
        """Convert one row of the CSV file (as a dict keyed by column name)
        into a Subject. Empty cells become None."""
        labels = {
            self._label_language(fname): value or None
            for fname, value in row.items()
            if fname.startswith(self._LABEL_PREFIX)
        }

        # if there are no labels in any language, set labels to None
        # indicating a deprecated subject
        if labels and all(value is None for value in labels.values()):
            labels = None

        yield Subject(
            uri=annif.util.cleanup_uri(row["uri"]),
            labels=labels,
            notation=row.get("notation", None) or None,
        )

    @property
    def languages(self) -> list[str]:
        """The languages of the vocabulary, inferred from the names of the
        label columns in the header row. An empty file has no languages."""
        with open(self.path, encoding="utf-8-sig") as csvfile:
            # an empty file has no header row; treat it as having no
            # columns instead of failing on a missing header
            fieldnames = next(csv.reader(csvfile), [])

        return [
            self._label_language(fname)
            for fname in fieldnames
            if fname.startswith(self._LABEL_PREFIX)
        ]

    @property
    def subjects(self) -> Generator:
        """Generate the subjects of the vocabulary by reading the CSV file
        row by row. The file is opened as UTF-8; a leading byte order mark
        is ignored."""
        with open(self.path, encoding="utf-8-sig") as csvfile:
            for row in csv.DictReader(csvfile):
                yield from self._parse_row(row)

    def save_skos(self, path: str) -> None:
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name."""
        serialize_subjects_to_skos(self.subjects, path)

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        return os.path.splitext(path)[1].lower() == ".csv"
108