| Conditions | 6 |
| Total Lines | 27 |
| Code Lines | 22 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
| 1 | """Support for document corpora in JSON format""" |
||
def json_file_to_document(
    filename: str,
    subject_index: SubjectIndex,
    language: str,
    require_subjects: bool,
) -> Document | None:
    """Load a single JSON file and convert it into a Document.

    The file is expected to contain one JSON object. Its "text" and
    "metadata" keys (defaulting to "" and {} when absent) are copied into
    the Document, and its "subjects" key (defaulting to []) is converted
    with _subjects_to_subject_set() using the given subject index and
    language.

    Args:
        filename: path of the JSON file to read.
        subject_index: passed through to _subjects_to_subject_set().
        language: passed through to _subjects_to_subject_set().
        require_subjects: when true, a file whose resulting subject set is
            empty (falsy) is skipped.

    Returns:
        The Document, or None when the file is skipped: it is empty
        (zero bytes), cannot be decoded or parsed as JSON, does not contain
        a JSON object at the top level, or has no subjects while
        require_subjects is set. All skip cases except the last one log a
        warning.
    """
    if os.path.getsize(filename) == 0:
        logger.warning(f"Skipping empty file {filename}")
        return None

    # JSON text is UTF-8 by specification; be explicit so that decoding does
    # not depend on the platform's locale encoding.
    with open(filename, encoding="utf-8") as jsonfile:
        try:
            data = json.load(jsonfile)
        except (json.JSONDecodeError, UnicodeDecodeError) as err:
            # Invalid bytes are reported the same way as invalid syntax
            # instead of aborting the caller with an unhandled exception.
            logger.warning(f"JSON parsing failed for file {filename}: {err}")
            return None

    # Valid JSON may still be a list, string, number, etc.; the .get() calls
    # below only make sense for an object (dict).
    if not isinstance(data, dict):
        logger.warning(
            f"Skipping file {filename}: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        return None

    subject_set = _subjects_to_subject_set(
        data.get("subjects", []), subject_index, language
    )
    if require_subjects and not subject_set:
        return None

    return Document(
        text=data.get("text", ""),
        metadata=data.get("metadata", {}),
        subject_set=subject_set,
    )
||
| 52 |