Passed
Push — issue868-json-corpus-format ( 86235d...24035c )
by Osma
03:08
created

annif.corpus.json.json_file_to_document()   B

Complexity

Conditions 6

Size

Total Lines 27
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 22
nop 4
dl 0
loc 27
rs 8.4186
c 0
b 0
f 0
1
"""Support for document corpora in JSON format"""
2
3
import json
4
import os.path
5
6
import annif
7
from annif.vocab import SubjectIndex
8
9
from .types import Document, SubjectSet
10
11
# Module-level shorthand for the logger object exposed by the annif package.
logger = annif.logger
12
13
14
def _subjects_to_subject_set(subjects, subject_index, language):
    """Build a SubjectSet from an iterable of subject entries.

    Each entry is looked up through ``subject_index``: via ``by_uri`` using
    its "uri" key when that key is present, otherwise via ``by_label`` using
    its "label" key together with ``language``. The lookup results are
    collected in input order and passed to ``SubjectSet``.
    """
    resolved = [
        (
            subject_index.by_uri(entry["uri"])
            if "uri" in entry
            else subject_index.by_label(entry["label"], language)
        )
        for entry in subjects
    ]
    return SubjectSet(resolved)
22
23
24
def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read one JSON file and convert its contents into a Document.

    The parsed JSON value is read through its "text", "metadata" and
    "subjects" keys; missing keys default to an empty string, an empty dict
    and an empty list respectively.

    Args:
        filename: path of the JSON file to read.
        subject_index: index used to resolve the entries under "subjects".
        language: language passed on for label-based subject lookups.
        require_subjects: when true, a file whose resolved subject set is
            empty (falsy) yields no document.

    Returns:
        The Document, or None when the file is empty (zero bytes), cannot be
        decoded or parsed as JSON, or has no subjects while
        ``require_subjects`` is set.
    """
    if os.path.getsize(filename) == 0:
        logger.warning(f"Skipping empty file {filename}")
        return None

    # Decode explicitly instead of relying on the platform's locale encoding:
    # JSON interchange text is UTF-8. The "-sig" variant additionally strips a
    # leading byte order mark, which json.load would otherwise reject.
    with open(filename, encoding="utf-8-sig") as jsonfile:
        try:
            data = json.load(jsonfile)
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            # An undecodable file is treated like a syntactically broken one:
            # warn and skip it rather than aborting the caller.
            logger.warning(f"JSON parsing failed for file {filename}: {err}")
            return None

    subject_set = _subjects_to_subject_set(
        data.get("subjects", []), subject_index, language
    )
    if require_subjects and not subject_set:
        return None

    return Document(
        text=data.get("text", ""),
        metadata=data.get("metadata", {}),
        subject_set=subject_set,
    )
52