annif.corpus.json - Code Metrics - Inspection of "Add JSON fulltext corpus format" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — main (#872)

by Osma

created 2025-08-14 08:51 UTC

annif.corpus.json A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	51
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	36
dl	0
loc	51
rs	10
c	0
b	0
f	0
wmc	9

2 Functions

Rating	Name	Duplication	Size	Complexity
B	json_file_to_document()	0	27	6
A	_subjects_to_subject_set()	0	8	3

"""Support for document corpora in JSON format"""

import json
import os.path

import annif
from annif.vocab import SubjectIndex

from .types import Document, SubjectSet

logger = annif.logger


def _subjects_to_subject_set(subjects, subject_index, language):
    """Build a SubjectSet from a sequence of subject entries.

    Each entry is resolved through ``subject_index``: by its ``"uri"`` key
    when the entry has one, otherwise by its ``"label"`` key together with
    ``language``. The lookup results are passed to ``SubjectSet`` in the
    same order as the entries.
    """
    return SubjectSet(
        [
            (
                subject_index.by_uri(entry["uri"])
                if "uri" in entry
                else subject_index.by_label(entry["label"], language)
            )
            for entry in subjects
        ]
    )


def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read one JSON file and convert it into a Document.

    The file is expected to contain a JSON object. Its optional ``"text"``,
    ``"metadata"`` and ``"subjects"`` members supply the document text,
    metadata and subject entries; missing members default to an empty
    string, an empty dict and an empty list respectively.

    Args:
        filename: Path of the JSON file to read.
        subject_index: Index used to resolve subject URIs or labels.
        language: Language used when resolving subjects by label.
        require_subjects: When true, a document whose resulting subject set
            is falsy is skipped.

    Returns:
        The parsed Document, or None when the file is empty, cannot be
        decoded or parsed as JSON, does not hold a JSON object, or lacks
        subjects while ``require_subjects`` is true.
    """
    if os.path.getsize(filename) == 0:
        logger.warning(f"Skipping empty file {filename}")
        return None

    # Decode explicitly instead of relying on the platform's locale encoding;
    # "utf-8-sig" reads UTF-8 and drops a leading byte order mark if present.
    with open(filename, encoding="utf-8-sig") as jsonfile:
        try:
            data = json.load(jsonfile)
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            # Undecodable bytes are treated like malformed JSON: warn and skip
            logger.warning(f"JSON parsing failed for file {filename}: {err}")
            return None

    # Valid JSON may still be a list, string, number etc., which has no .get()
    if not isinstance(data, dict):
        logger.warning(
            f"JSON parsing failed for file {filename}: "
            "top-level value is not an object"
        )
        return None

    subject_set = _subjects_to_subject_set(
        data.get("subjects", []), subject_index, language
    )
    if require_subjects and not subject_set:
        return None

    return Document(
        text=data.get("text", ""),
        metadata=data.get("metadata", {}),
        subject_set=subject_set,
    )


1			"""Support for document corpora in JSON format"""
2
3			import json
4			import os.path
5
6			import annif
7			from annif.vocab import SubjectIndex
8
9			from .types import Document, SubjectSet
10
11			logger = annif.logger
12
13
14			def _subjects_to_subject_set(subjects, subject_index, language):
15			subject_ids = []
16			for subj in subjects:
17			if "uri" in subj:
18			subject_ids.append(subject_index.by_uri(subj["uri"]))
19			else:
20			subject_ids.append(subject_index.by_label(subj["label"], language))
21			return SubjectSet(subject_ids)
22
23
24			def json_file_to_document(
25			filename: str,
26			subject_index: SubjectIndex,
27			language: str,
28			require_subjects: bool,
29			) -> Document | None:
30			if os.path.getsize(filename) == 0:
31			logger.warning(f"Skipping empty file {filename}")
32			return None
33
34			with open(filename) as jsonfile:
35			try:
36			data = json.load(jsonfile)
37			except json.JSONDecodeError as err:
38			logger.warning(f"JSON parsing failed for file {filename}: {err}")
39			return None
40
41			subject_set = _subjects_to_subject_set(
42			data.get("subjects", []), subject_index, language
43			)
44			if require_subjects and not subject_set:
45			return None
46
47			return Document(
48			text=data.get("text", ""),
49			metadata=data.get("metadata", {}),
50			subject_set=subject_set,
51			)
52

NatLibFi / Annif

Pull Request — main (#872)

annif.corpus.json A

Complexity

Size/Duplication

Importance

2 Functions

Duplication Side-by-Side

Filter issues like