| 1 |  |  | """Support for document corpora in JSON format""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | import functools | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import json | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import os.path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | from importlib.resources import files | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | import jsonschema | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | import annif | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | from annif.vocab import SubjectIndex | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | from .types import Document, SubjectSet | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | logger = annif.logger | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | @functools.lru_cache(maxsize=1) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | def _get_json_schema(schema_name): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |     schema_path = files("annif.schemas").joinpath(schema_name) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |     with schema_path.open("r", encoding="utf-8") as schema_file: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |         return json.load(schema_file) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _subjects_to_subject_set(subjects, subject_index, language):
                    """Build a SubjectSet from an iterable of subject mappings.

                    Each entry that has a "uri" key is looked up with
                    ``subject_index.by_uri``; every other entry is looked up with
                    ``subject_index.by_label`` using its "label" value and *language*.
                    The lookup results are collected in input order and passed to
                    ``SubjectSet`` unchanged.
                    """

                    def _resolve(entry):
                        # A URI, when present, takes precedence over the label.
                        if "uri" in entry:
                            return subject_index.by_uri(entry["uri"])
                        return subject_index.by_label(entry["label"], language)

                    return SubjectSet([_resolve(entry) for entry in subjects])
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 34 |  |  |  | 
            
                                                        
            
                                    
            
            
                def json_file_to_document(
                    filename: str,
                    subject_index: SubjectIndex,
                    language: str,
                    require_subjects: bool,
                ) -> Document | None:
                    """Read one JSON file and convert it into a Document.

                    The file is skipped (a warning is logged and None is returned) when it
                    is empty, cannot be decoded as UTF-8 or parsed as JSON, or fails
                    validation against the "document.json" schema. None is also returned,
                    without a warning, when *require_subjects* is true and the resolved
                    subject set is falsy.

                    Args:
                        filename: Path of the JSON file to read.
                        subject_index: Index used to resolve the entries of the "subjects"
                            list by URI or by label.
                        language: Language passed to the label lookup for subject entries
                            that have no "uri" key.
                        require_subjects: If true, documents whose subject set is falsy
                            are dropped.

                    Returns:
                        A Document built from the "text" (default ""), "metadata"
                        (default {}) and "subjects" (default []) keys of the file, or
                        None if the file was skipped.
                    """
                    if os.path.getsize(filename) == 0:
                        logger.warning(f"Skipping empty file {filename}")
                        return None

                    # Read explicitly as UTF-8 (as the schema file is) instead of relying on
                    # the platform's locale encoding, which would corrupt or reject
                    # non-ASCII documents on systems whose default is not UTF-8.
                    with open(filename, encoding="utf-8") as jsonfile:
                        try:
                            data = json.load(jsonfile)
                        except (json.JSONDecodeError, UnicodeDecodeError) as err:
                            # Undecodable bytes are treated like any other malformed input:
                            # warn and skip the file rather than abort the caller.
                            logger.warning(f"JSON parsing failed for file {filename}: {err}")
                            return None

                    try:
                        jsonschema.validate(instance=data, schema=_get_json_schema("document.json"))
                    except jsonschema.ValidationError as err:
                        logger.warning(f"JSON validation failed for file {filename}: {err}")
                        return None

                    subject_set = _subjects_to_subject_set(
                        data.get("subjects", []), subject_index, language
                    )
                    if require_subjects and not subject_set:
                        return None

                    return Document(
                        text=data.get("text", ""),
                        metadata=data.get("metadata", {}),
                        subject_set=subject_set,
                    )
            
                                                        
            
                                    
            
            
                | 69 |  |  |  |