annif.corpus.json.json_to_document()   B
last analyzed

Complexity

Conditions 7

Size

Total Lines 34
Code Lines 27

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 27
nop 5
dl 0
loc 34
rs 7.8319
c 0
b 0
f 0
1
"""Support for document corpora in JSON format"""
2
3
import functools
4
import json
5
import os.path
6
from importlib.resources import files
7
8
import jsonschema
9
10
import annif
11
from annif.vocab import SubjectIndex
12
13
from .types import Document, SubjectSet
14
15
# Module-level alias for the logger object exposed by the top-level annif
# package; the warnings emitted in this module go through it.
logger = annif.logger
16
17
18
@functools.lru_cache(maxsize=1)
def _get_json_schema(schema_name):
    """Return the parsed JSON schema named *schema_name*.

    The schema is read as UTF-8 text from the ``annif.schemas`` package
    resources and decoded with :mod:`json`. The result for the most recently
    requested schema name is memoized (cache size 1), so repeated requests
    for the same name return the same object without re-reading the resource.
    """
    schema_text = (
        files("annif.schemas").joinpath(schema_name).read_text(encoding="utf-8")
    )
    return json.loads(schema_text)
23
24
25
def _subjects_to_subject_set(subjects, subject_index, language):
    """Build a SubjectSet from a sequence of subject entries.

    Each entry is looked up in *subject_index*: by its ``"uri"`` key when the
    entry has one, otherwise by its ``"label"`` key together with *language*.
    The lookup results are collected in input order and passed to SubjectSet.
    """

    def _resolve(entry):
        # A URI, when given, takes precedence over the label.
        if "uri" in entry:
            return subject_index.by_uri(entry["uri"])
        return subject_index.by_label(entry["label"], language)

    return SubjectSet([_resolve(entry) for entry in subjects])
33
34
35
def json_to_document(
    filename: str,
    json_data: str,
    subject_index: SubjectIndex | None,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Convert a JSON string into a Document.

    *json_data* is parsed and validated against the ``document.json`` schema.
    Subjects are resolved when *require_subjects* is true, or when a
    *subject_index* is given and the data has a ``"subjects"`` key; otherwise
    the document is created with ``subject_set=None``.

    Returns None when the JSON cannot be parsed or fails schema validation
    (a warning mentioning *filename* is logged in both cases), or when
    subjects were resolved but the resulting subject set is falsy.
    Otherwise returns a Document carrying the ``"text"`` (default ``""``) and
    ``"metadata"`` (default ``{}``) values, the subject set, and *filename*
    as its file path.
    """
    try:
        payload = json.loads(json_data)
    except json.JSONDecodeError as err:
        logger.warning(f"JSON parsing failed for file {filename}: {err}")
        return None

    try:
        jsonschema.validate(
            instance=payload, schema=_get_json_schema("document.json")
        )
    except jsonschema.ValidationError as err:
        logger.warning(f"JSON validation failed for file {filename}: {err.message}")
        return None

    resolve_subjects = require_subjects or (
        subject_index is not None and "subjects" in payload
    )

    subject_set = None
    if resolve_subjects:
        raw_subjects = payload.get("subjects", [])
        subject_set = _subjects_to_subject_set(raw_subjects, subject_index, language)
        # Subjects were asked for but none came out: drop the document.
        if not subject_set:
            return None

    text = payload.get("text", "")
    metadata = payload.get("metadata", {})
    return Document(
        text=text,
        metadata=metadata,
        subject_set=subject_set,
        file_path=filename,
    )
70
71
72
def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex | None,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read a JSON file and convert its contents into a Document.

    A file whose size is zero bytes is skipped: a warning is logged and None
    is returned. Otherwise the file is read as UTF-8 text and the conversion
    is delegated to json_to_document, whose result is returned unchanged.
    """
    if not os.path.getsize(filename):
        logger.warning(f"Skipping empty file {filename}")
        return None

    with open(filename, encoding="utf-8") as handle:
        contents = handle.read()

    return json_to_document(
        filename=filename,
        json_data=contents,
        subject_index=subject_index,
        language=language,
        require_subjects=require_subjects,
    )
88