annif.corpus.json.json_file_to_document() - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

annif.corpus.json.json_file_to_document() B
last analyzed 2025-08-15 13:39 UTC

↳ Parent: annif.corpus.json

Complexity

Conditions

Size

Total Lines	34
Code Lines	28

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	7
eloc	28
nop	4
dl	0
loc	34
rs	7.808
c	0
b	0
f	0

"""Support for document corpora in JSON format"""

import functools
import json
import os.path
from importlib.resources import files

import jsonschema

import annif
from annif.vocab import SubjectIndex

from .types import Document, SubjectSet

logger = annif.logger


@functools.lru_cache(maxsize=1)
def _get_json_schema(schema_name):
    """Load and parse a JSON schema shipped in the ``annif.schemas`` package.

    The parsed schema is memoized by ``lru_cache`` (a single entry), so
    repeated requests for the same schema name do not re-read the resource.
    """
    resource = files("annif.schemas").joinpath(schema_name)
    # Read the whole resource as UTF-8 text, then parse it in one step.
    return json.loads(resource.read_text(encoding="utf-8"))


def _subjects_to_subject_set(subjects, subject_index, language):
    """Resolve subject entries against the index and wrap them in a SubjectSet.

    Each entry is looked up by its "uri" key when present; otherwise it is
    looked up by its "label" key in the given language. Results are collected
    in the same order as the input entries.
    """

    def _resolve(entry):
        if "uri" not in entry:
            return subject_index.by_label(entry["label"], language)
        return subject_index.by_uri(entry["uri"])

    return SubjectSet([_resolve(entry) for entry in subjects])


def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read a single JSON file and convert it into a Document.

    Args:
        filename: path of the JSON file to read.
        subject_index: index used to resolve the file's subject entries
            (by URI, or by label in ``language``).
        language: language used for label-based subject lookups.
        require_subjects: when true, a file whose resolved subject set is
            empty yields no document.

    Returns:
        A Document built from the file's "text", "metadata" and "subjects"
        fields, or None when the file is empty, cannot be decoded or parsed,
        fails validation against the "document.json" schema, or has no
        subjects while ``require_subjects`` is set.
    """
    if os.path.getsize(filename) == 0:
        logger.warning(f"Skipping empty file {filename}")
        return None

    with open(filename, "r", encoding="utf-8") as jsonfile:
        try:
            data = json.load(jsonfile)
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            # A file that is not valid UTF-8 raises UnicodeDecodeError rather
            # than JSONDecodeError; skip it like any other unparsable file
            # instead of letting one bad file abort the caller.
            logger.warning(f"JSON parsing failed for file {filename}: {err}")
            return None

    try:
        jsonschema.validate(instance=data, schema=_get_json_schema("document.json"))
    except jsonschema.ValidationError as err:
        logger.warning(f"JSON validation failed for file {filename}: {err.message}")
        return None

    subject_set = _subjects_to_subject_set(
        data.get("subjects", []), subject_index, language
    )
    if require_subjects and not subject_set:
        return None

    return Document(
        text=data.get("text", ""),
        metadata=data.get("metadata", {}),
        subject_set=subject_set,
        file_path=filename,
    )


1			"""Support for document corpora in JSON format"""
2
3			import functools
4			import json
5			import os.path
6			from importlib.resources import files
7
8			import jsonschema
9
10			import annif
11			from annif.vocab import SubjectIndex
12
13			from .types import Document, SubjectSet
14
15			logger = annif.logger
16
17
18			@functools.lru_cache(maxsize=1)
19			def _get_json_schema(schema_name):
20			schema_path = files("annif.schemas").joinpath(schema_name)
21			with schema_path.open("r", encoding="utf-8") as schema_file:
22			return json.load(schema_file)
23
24
25			def _subjects_to_subject_set(subjects, subject_index, language):
26			subject_ids = []
27			for subj in subjects:
28			if "uri" in subj:
29			subject_ids.append(subject_index.by_uri(subj["uri"]))
30			else:
31			subject_ids.append(subject_index.by_label(subj["label"], language))
32			return SubjectSet(subject_ids)
33
34
35			def json_file_to_document(
36			filename: str,
37			subject_index: SubjectIndex,
38			language: str,
39			require_subjects: bool,
40			) -> Document | None:
41			if os.path.getsize(filename) == 0:
42			logger.warning(f"Skipping empty file {filename}")
43			return None
44
45			with open(filename, "r", encoding="utf-8") as jsonfile:
46			try:
47			data = json.load(jsonfile)
48			except json.JSONDecodeError as err:
49			logger.warning(f"JSON parsing failed for file {filename}: {err}")
50			return None
51
52			try:
53			jsonschema.validate(instance=data, schema=_get_json_schema("document.json"))
54			except jsonschema.ValidationError as err:
55			logger.warning(f"JSON validation failed for file {filename}: {err.message}")
56			return None
57
58			subject_set = _subjects_to_subject_set(
59			data.get("subjects", []), subject_index, language
60			)
61			if require_subjects and not subject_set:
62			return None
63
64			return Document(
65			text=data.get("text", ""),
66			metadata=data.get("metadata", {}),
67			subject_set=subject_set,
68			file_path=filename,
69			)
70

NatLibFi / Annif

annif.corpus.json.json_file_to_document() B last analyzed 2025-08-15 13:39 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

annif.corpus.json.json_file_to_document() B
last analyzed 2025-08-15 13:39 UTC