annif.corpus.json.json_file_to_document()   B
last analyzed

Complexity

Conditions 7

Size

Total Lines 34
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 28
nop 4
dl 0
loc 34
rs 7.808
c 0
b 0
f 0
1
"""Support for document corpora in JSON format"""
2
3
import functools
4
import json
5
import os.path
6
from importlib.resources import files
7
8
import jsonschema
9
10
import annif
11
from annif.vocab import SubjectIndex
12
13
from .types import Document, SubjectSet
14
15
logger = annif.logger
16
17
18
@functools.lru_cache(maxsize=1)
def _get_json_schema(schema_name):
    """Load a JSON schema shipped in the ``annif.schemas`` package.

    The file named ``schema_name`` is read as UTF-8 and parsed with
    ``json.load``. The result is memoized by ``functools.lru_cache`` with
    ``maxsize=1``, so only the most recently requested schema is kept and
    repeated requests for the same name do not re-read the file.
    """
    schema_resource = files("annif.schemas").joinpath(schema_name)
    with schema_resource.open("r", encoding="utf-8") as schema_file:
        return json.load(schema_file)
23
24
25
def _subjects_to_subject_set(subjects, subject_index, language):
    """Build a SubjectSet from an iterable of subject entries.

    Each entry is resolved through ``subject_index``: by its ``"uri"`` value
    when that key is present, otherwise by its ``"label"`` value together
    with ``language``. The lookup results are passed to ``SubjectSet`` in
    the same order as the entries.
    """
    subject_ids = [
        (
            subject_index.by_uri(subj["uri"])
            if "uri" in subj
            else subject_index.by_label(subj["label"], language)
        )
        for subj in subjects
    ]
    return SubjectSet(subject_ids)
33
34
35
def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read one JSON file and convert it into a Document.

    Args:
        filename: path of the JSON file to read (opened as UTF-8).
        subject_index: index used to resolve the entries under "subjects".
        language: language passed on for label-based subject lookups.
        require_subjects: when true, a file whose resolved subject set is
            empty yields no document.

    Returns:
        A Document built from the "text", "metadata" and "subjects" keys
        (missing keys default to "", {} and [] respectively), or None when
        the file is empty, is not valid JSON, fails validation against the
        "document.json" schema, or has no subjects while they are required.
        The first three cases log a warning; the last one is silent.
    """
    if os.path.getsize(filename) == 0:
        logger.warning(f"Skipping empty file {filename}")
        return None

    with open(filename, "r", encoding="utf-8") as jsonfile:
        try:
            data = json.load(jsonfile)
        except json.JSONDecodeError as err:
            logger.warning(f"JSON parsing failed for file {filename}: {err}")
            return None

    try:
        jsonschema.validate(instance=data, schema=_get_json_schema("document.json"))
    except jsonschema.ValidationError as err:
        logger.warning(f"JSON validation failed for file {filename}: {err.message}")
        return None

    subject_set = _subjects_to_subject_set(
        data.get("subjects", []), subject_index, language
    )
    if require_subjects and not subject_set:
        return None

    return Document(
        text=data.get("text", ""),
        metadata=data.get("metadata", {}),
        subject_set=subject_set,
        file_path=filename,
    )
70