Total Complexity | 9 |
Total Lines | 51 |
Duplicated Lines | 0 % |
Changes | 0 |
1 | """Support for document corpora in JSON format""" |
||
2 | |||
3 | import json |
||
4 | import os.path |
||
5 | |||
6 | import annif |
||
7 | from annif.vocab import SubjectIndex |
||
8 | |||
9 | from .types import Document, SubjectSet |
||
10 | |||
11 | logger = annif.logger |
||
12 | |||
13 | |||
def _subjects_to_subject_set(subjects, subject_index, language):
    """Build a SubjectSet from a sequence of JSON subject entries.

    Each entry is expected to support ``"uri" in entry`` and key lookup.
    An entry containing a ``"uri"`` key is resolved with
    ``subject_index.by_uri``; any other entry is resolved with
    ``subject_index.by_label`` using its ``"label"`` value and the given
    language. The lookup results are passed, in input order, to SubjectSet.
    """
    resolved = [
        (
            subject_index.by_uri(entry["uri"])
            if "uri" in entry
            else subject_index.by_label(entry["label"], language)
        )
        for entry in subjects
    ]
    return SubjectSet(resolved)
||
22 | |||
23 | |||
def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Read a single JSON file and convert it into a Document.

    Args:
        filename: Path of the JSON file to read.
        subject_index: Index used to resolve the entries under the
            ``"subjects"`` key into a subject set.
        language: Language passed on for label-based subject lookups.
        require_subjects: When true, a file whose resolved subject set is
            falsy is skipped.

    Returns:
        A Document built from the ``"text"``, ``"metadata"`` and
        ``"subjects"`` keys of the file (missing keys default to ``""``,
        ``{}`` and no subjects respectively), or None when the file is
        empty, cannot be decoded/parsed as JSON, or lacks subjects while
        ``require_subjects`` is set.

    Raises:
        OSError: If the file cannot be inspected or opened.
    """
    if os.path.getsize(filename) == 0:
        logger.warning(f"Skipping empty file {filename}")
        return None

    # JSON text is UTF-8 by specification; decode explicitly rather than
    # relying on the platform's locale encoding.
    with open(filename, encoding="utf-8") as jsonfile:
        try:
            data = json.load(jsonfile)
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            # Undecodable bytes are treated like malformed JSON: warn and
            # skip this file instead of aborting the caller.
            logger.warning(f"JSON parsing failed for file {filename}: {err}")
            return None

    subject_set = _subjects_to_subject_set(
        data.get("subjects", []), subject_index, language
    )
    if require_subjects and not subject_set:
        return None

    return Document(
        text=data.get("text", ""),
        metadata=data.get("metadata", {}),
        subject_set=subject_set,
    )
||
52 |