annif.corpus.json.json_file_to_document()   A
last analyzed

Complexity

Conditions 3

Size

Total Lines 15
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 12
nop 4
dl 0
loc 15
rs 9.8
c 0
b 0
f 0
1
"""Support for document corpora in JSON format"""
2
3
import functools
4
import json
5
import os.path
6
from importlib.resources import files
7
8
import annif
9
from annif.vocab import SubjectIndex
10
11
from .types import Document, SubjectSet
12
13
# Module-level alias for the logger object exposed by the annif package;
# the functions in this module report skipped/invalid documents through it.
logger = annif.logger
14
15
16
@functools.lru_cache(maxsize=1)
def _get_json_schema(schema_name: str):
    """Load and parse a JSON schema shipped in the ``annif.schemas`` package.

    The result is memoized by an LRU cache holding a single entry, so
    repeated requests for the same ``schema_name`` read and parse the
    resource only once.

    :param schema_name: file name of the schema resource inside
        ``annif.schemas`` (for example ``"document.json"``)
    :returns: the schema as decoded by ``json.load``
    """
    schema_resource = files("annif.schemas").joinpath(schema_name)
    with schema_resource.open("r", encoding="utf-8") as schema_file:
        return json.load(schema_file)
21
22
23
def _subjects_to_subject_set(subjects, subject_index, language):
    """Convert the subject entries of a JSON document into a SubjectSet.

    Every entry in ``subjects`` is resolved through ``subject_index``: by
    its ``"uri"`` when that key is present, otherwise by its ``"label"`` in
    the given ``language``. The lookup results are passed to ``SubjectSet``
    in the same order as the entries.

    :param subjects: iterable of mappings, each with a ``"uri"`` or a
        ``"label"`` key
    :param subject_index: object providing ``by_uri`` and ``by_label``
    :param language: language passed to ``by_label`` lookups
    :returns: a ``SubjectSet`` built from the looked-up subject IDs
    """
    subject_ids = [
        subject_index.by_uri(subj["uri"])
        if "uri" in subj
        else subject_index.by_label(subj["label"], language)
        for subj in subjects
    ]
    return SubjectSet(subject_ids)
31
32
33
def json_to_document(
    filename: str,
    json_data: str,
    subject_index: SubjectIndex | None,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Parse a JSON string, validate it and build a Document from it.

    The string is decoded as JSON and validated against the
    ``document.json`` schema. A decoding or validation failure is logged as
    a warning (mentioning ``filename``) and ``None`` is returned.

    Subjects are resolved when ``require_subjects`` is true, or when a
    ``subject_index`` is given and the data has a ``"subjects"`` key. If
    subjects are resolved and the resulting subject set is empty (falsy),
    ``None`` is returned without logging. Otherwise the document is created
    with ``subject_set=None``.

    NOTE(review): with ``require_subjects`` true and ``subject_index`` set
    to ``None``, a non-empty ``"subjects"`` list would be looked up on
    ``None``; callers presumably always pass an index in that case --
    confirm.

    :param filename: name used in log messages and stored as the
        document's ``file_path``
    :param json_data: the JSON document as a string
    :param subject_index: index used to resolve subject URIs/labels, or None
    :param language: language used for label-based subject lookups
    :param require_subjects: whether a document without subjects is skipped
    :returns: the resulting ``Document``, or ``None`` if the input was
        invalid or lacked required subjects
    """
    # Function-scope import, kept as in the original code; presumably it
    # defers loading jsonschema until a document is processed -- TODO confirm.
    import jsonschema

    try:
        data = json.loads(json_data)
    except json.JSONDecodeError as err:
        logger.warning(f"JSON parsing failed for file {filename}: {err}")
        return None

    try:
        jsonschema.validate(instance=data, schema=_get_json_schema("document.json"))
    except jsonschema.ValidationError as err:
        logger.warning(f"JSON validation failed for file {filename}: {err.message}")
        return None

    subject_set = None
    if require_subjects or (subject_index is not None and "subjects" in data):
        subject_set = _subjects_to_subject_set(
            data.get("subjects", []), subject_index, language
        )
        if not subject_set:
            # Subjects were requested/present but none remained: skip document.
            return None

    return Document(
        text=data.get("text", ""),
        metadata=data.get("metadata", {}),
        subject_set=subject_set,
        file_path=filename,
    )
70
71
72
def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex | None,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read a JSON file from disk and convert it into a Document.

    A zero-byte file is skipped with a warning and ``None`` is returned.
    Otherwise the file is read as UTF-8 text and handed to
    ``json_to_document`` together with the remaining arguments. Errors
    raised by ``os.path.getsize`` or ``open`` are not caught here.

    :param filename: path of the JSON file to read
    :param subject_index: forwarded to ``json_to_document``
    :param language: forwarded to ``json_to_document``
    :param require_subjects: forwarded to ``json_to_document``
    :returns: whatever ``json_to_document`` returns, or ``None`` for an
        empty file
    """
    if os.path.getsize(filename) == 0:
        logger.warning(f"Skipping empty file {filename}")
        return None

    with open(filename, "r", encoding="utf-8") as jsonfile:
        json_data = jsonfile.read()

    return json_to_document(
        filename, json_data, subject_index, language, require_subjects
    )
88