annif.corpus.json.json_file_to_document() - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

annif.corpus.json.json_file_to_document() A
last analyzed 2025-08-29 08:28 UTC

↳ Parent: annif.corpus.json

Complexity

Conditions

Size

Total Lines	15
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	12
nop	4
dl	0
loc	15
rs	9.8
c	0
b	0
f	0

"""Support for document corpora in JSON format"""

import functools
import json
import os.path
from importlib.resources import files

import annif
from annif.vocab import SubjectIndex

from .types import Document, SubjectSet

# Module-level alias for the logger object exposed by the annif package;
# the functions below use it to emit warnings about skipped documents.
logger = annif.logger


@functools.lru_cache(maxsize=1)
def _get_json_schema(schema_name):
    """Return the parsed JSON schema stored as a resource in ``annif.schemas``.

    The resource named ``schema_name`` is read as UTF-8 text and decoded
    with the ``json`` module. The most recently requested schema is kept
    in a one-entry LRU cache, so repeated requests for the same name do
    not re-read the resource.
    """
    schema_resource = files("annif.schemas") / schema_name
    return json.loads(schema_resource.read_text(encoding="utf-8"))


def _subjects_to_subject_set(subjects, subject_index, language):
    """Build a SubjectSet from the subject entries of a JSON document.

    Each entry is a mapping. When it has a "uri" key the subject is looked
    up with ``subject_index.by_uri``; otherwise its "label" value is looked
    up with ``subject_index.by_label`` in the given language. The lookup
    results are passed, in input order, to ``SubjectSet``.
    """

    def resolve(entry):
        # A URI, when present, takes precedence over the label.
        if "uri" in entry:
            return subject_index.by_uri(entry["uri"])
        return subject_index.by_label(entry["label"], language)

    return SubjectSet([resolve(entry) for entry in subjects])


def json_to_document(
    filename: str,
    json_data: str,
    subject_index: SubjectIndex | None,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Parse a JSON string and turn it into a Document.

    The string is decoded as JSON and validated against the
    "document.json" schema. Subjects are converted to a subject set when
    ``require_subjects`` is true, or when a subject index is given and the
    data has a "subjects" key; otherwise the document gets no subject set.

    Returns None (after logging a warning that mentions ``filename``) when
    the JSON cannot be parsed or fails schema validation. Also returns
    None, without logging, when subjects are converted and the resulting
    subject set is falsy.
    """

    # Imported inside the function, as in the rest of this module's design,
    # so the dependency is only loaded when a document is converted.
    import jsonschema

    try:
        parsed = json.loads(json_data)
    except json.JSONDecodeError as err:
        logger.warning(f"JSON parsing failed for file {filename}: {err}")
        return None

    document_schema = _get_json_schema("document.json")
    try:
        jsonschema.validate(instance=parsed, schema=document_schema)
    except jsonschema.ValidationError as err:
        logger.warning(f"JSON validation failed for file {filename}: {err.message}")
        return None

    convert_subjects = require_subjects or (
        subject_index is not None and "subjects" in parsed
    )

    subject_set = None
    if convert_subjects:
        raw_subjects = parsed.get("subjects", [])
        subject_set = _subjects_to_subject_set(raw_subjects, subject_index, language)
        if not subject_set:
            # Subjects were wanted but none were obtained: skip the document.
            return None

    return Document(
        text=parsed.get("text", ""),
        metadata=parsed.get("metadata", {}),
        subject_set=subject_set,
        file_path=filename,
    )


def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex | None,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read a JSON file from disk and convert it with ``json_to_document``.

    A zero-byte file is skipped: a warning is logged and None is returned
    without opening the file. Otherwise the file is read as UTF-8 text and
    the result of ``json_to_document`` is returned unchanged.
    """
    if not os.path.getsize(filename):
        logger.warning(f"Skipping empty file {filename}")
        return None

    with open(filename, "r", encoding="utf-8") as source:
        contents = source.read()

    return json_to_document(
        filename,
        contents,
        subject_index,
        language,
        require_subjects,
    )


1			"""Support for document corpora in JSON format"""
2
3			import functools
4			import json
5			import os.path
6			from importlib.resources import files
7
8			import annif
9			from annif.vocab import SubjectIndex
10
11			from .types import Document, SubjectSet
12
13			logger = annif.logger
14
15
16			@functools.lru_cache(maxsize=1)
17			def _get_json_schema(schema_name):
18			schema_path = files("annif.schemas").joinpath(schema_name)
19			with schema_path.open("r", encoding="utf-8") as schema_file:
20			return json.load(schema_file)
21
22
23			def _subjects_to_subject_set(subjects, subject_index, language):
24			subject_ids = []
25			for subj in subjects:
26			if "uri" in subj:
27			subject_ids.append(subject_index.by_uri(subj["uri"]))
28			else:
29			subject_ids.append(subject_index.by_label(subj["label"], language))
30			return SubjectSet(subject_ids)
31
32
33			def json_to_document(
34			filename: str,
35			json_data: str,
36			subject_index: SubjectIndex | None,
37			language: str,
38			require_subjects: bool,
39			) -> Document | None:
40
41			import jsonschema
42
43			try:
44			data = json.loads(json_data)
45			except json.JSONDecodeError as err:
46			logger.warning(f"JSON parsing failed for file {filename}: {err}")
47			return None
48
49			try:
50			jsonschema.validate(instance=data, schema=_get_json_schema("document.json"))
51			except jsonschema.ValidationError as err:
52			logger.warning(f"JSON validation failed for file {filename}: {err.message}")
53			return None
54
55			if require_subjects or (subject_index is not None and "subjects" in data):
56			subject_set = _subjects_to_subject_set(
57			data.get("subjects", []), subject_index, language
58			)
59			if not subject_set:
60			return None
61			else:
62			subject_set = None
63
64			return Document(
65			text=data.get("text", ""),
66			metadata=data.get("metadata", {}),
67			subject_set=subject_set,
68			file_path=filename,
69			)
70
71
72			def json_file_to_document(
73			filename: str,
74			subject_index: SubjectIndex | None,
75			language: str,
76			require_subjects: bool,
77			) -> Document | None:
78			if os.path.getsize(filename) == 0:
79			logger.warning(f"Skipping empty file {filename}")
80			return None
81
82			with open(filename, "r", encoding="utf-8") as jsonfile:
83			json_data = jsonfile.read()
84
85			return json_to_document(
86			filename, json_data, subject_index, language, require_subjects
87			)
88

NatLibFi / Annif

annif.corpus.json.json_file_to_document() A last analyzed 2025-08-29 08:28 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

annif.corpus.json.json_file_to_document() A
last analyzed 2025-08-29 08:28 UTC