| 1 |  |  | """Support for document corpora in JSON format""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | import functools | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import json | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import os.path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | from importlib.resources import files | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | import annif | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from annif.vocab import SubjectIndex | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | from .types import Document, SubjectSet | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | logger = annif.logger | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | @functools.lru_cache(maxsize=1) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | def _get_json_schema(schema_name): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |     schema_path = files("annif.schemas").joinpath(schema_name) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |     with schema_path.open("r", encoding="utf-8") as schema_file: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |         return json.load(schema_file) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | def _subjects_to_subject_set(subjects, subject_index, language): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |     subject_ids = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |     for subj in subjects: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |         if "uri" in subj: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |             subject_ids.append(subject_index.by_uri(subj["uri"])) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |             subject_ids.append(subject_index.by_label(subj["label"], language)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |     return SubjectSet(subject_ids) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 32 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 33 |  |  | def json_to_document( | 
            
                                                                        
                            
            
                                    
            
            
                | 34 |  |  |     filename: str, | 
            
                                                                        
                            
            
                                    
            
            
                | 35 |  |  |     json_data: str, | 
            
                                                                        
                            
            
                                    
            
            
                | 36 |  |  |     subject_index: SubjectIndex | None, | 
            
                                                                        
                            
            
                                    
            
            
                | 37 |  |  |     language: str, | 
            
                                                                        
                            
            
                                    
            
            
                | 38 |  |  |     require_subjects: bool, | 
            
                                                                        
                            
            
                                    
            
            
                | 39 |  |  | ) -> Document | None: | 
            
                                                                        
                            
            
                                    
            
            
                | 40 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 41 |  |  |     import jsonschema | 
            
                                                                        
                            
            
                                    
            
            
                | 42 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 43 |  |  |     try: | 
            
                                                                        
                            
            
                                    
            
            
                | 44 |  |  |         data = json.loads(json_data) | 
            
                                                                        
                            
            
                                    
            
            
                | 45 |  |  |     except json.JSONDecodeError as err: | 
            
                                                                        
                            
            
                                    
            
            
                | 46 |  |  |         logger.warning(f"JSON parsing failed for file {filename}: {err}") | 
            
                                                                        
                            
            
                                    
            
            
                | 47 |  |  |         return None | 
            
                                                                        
                            
            
                                    
            
            
                | 48 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 49 |  |  |     try: | 
            
                                                                        
                            
            
                                    
            
            
                | 50 |  |  |         jsonschema.validate(instance=data, schema=_get_json_schema("document.json")) | 
            
                                                                        
                            
            
                                    
            
            
                | 51 |  |  |     except jsonschema.ValidationError as err: | 
            
                                                                        
                            
            
                                    
            
            
                | 52 |  |  |         logger.warning(f"JSON validation failed for file {filename}: {err.message}") | 
            
                                                                        
                            
            
                                    
            
            
                | 53 |  |  |         return None | 
            
                                                                        
                            
            
                                    
            
            
                | 54 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 55 |  |  |     if require_subjects or (subject_index is not None and "subjects" in data): | 
            
                                                                        
                            
            
                                    
            
            
                | 56 |  |  |         subject_set = _subjects_to_subject_set( | 
            
                                                                        
                            
            
                                    
            
            
                | 57 |  |  |             data.get("subjects", []), subject_index, language | 
            
                                                                        
                            
            
                                    
            
            
                | 58 |  |  |         ) | 
            
                                                                        
                            
            
                                    
            
            
                | 59 |  |  |         if not subject_set: | 
            
                                                                        
                            
            
                                    
            
            
                | 60 |  |  |             return None | 
            
                                                                        
                            
            
                                    
            
            
                | 61 |  |  |     else: | 
            
                                                                        
                            
            
                                    
            
            
                | 62 |  |  |         subject_set = None | 
            
                                                                        
                            
            
                                    
            
            
                | 63 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 64 |  |  |     return Document( | 
            
                                                                        
                            
            
                                    
            
            
                | 65 |  |  |         text=data.get("text", ""), | 
            
                                                                        
                            
            
                                    
            
            
                | 66 |  |  |         metadata=data.get("metadata", {}), | 
            
                                                                        
                            
            
                                    
            
            
                | 67 |  |  |         subject_set=subject_set, | 
            
                                                                        
                            
            
                                    
            
            
                | 68 |  |  |         file_path=filename, | 
            
                                                                        
                            
            
                                    
            
            
                | 69 |  |  |         document_id=data.get("document_id", None), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |     ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  | def json_file_to_document( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |     filename: str, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |     subject_index: SubjectIndex | None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |     language: str, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |     require_subjects: bool, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  | ) -> Document | None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |     if os.path.getsize(filename) == 0: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |         logger.warning(f"Skipping empty file {filename}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |         return None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |     with open(filename, "r", encoding="utf-8") as jsonfile: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |         json_data = jsonfile.read() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |     return json_to_document( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         filename, json_data, subject_index, language, require_subjects | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 88 |  |  |     ) | 
            
                                                        
            
                                    
            
            
                | 89 |  |  |  |