| 
                    1
                 | 
                                    
                                                     | 
                
                 | 
                """Support for document corpora in JSON format"""  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    2
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    3
                 | 
                                    
                                                     | 
                
                 | 
                import functools  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    4
                 | 
                                    
                                                     | 
                
                 | 
                import json  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    5
                 | 
                                    
                                                     | 
                
                 | 
                import os.path  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    6
                 | 
                                    
                                                     | 
                
                 | 
                from importlib.resources import files  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    7
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    8
                 | 
                                    
                                                     | 
                
                 | 
                import annif  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                from annif.vocab import SubjectIndex  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    11
                 | 
                                    
                                                     | 
                
                 | 
                from .types import Document, SubjectSet  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    12
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    13
                 | 
                                    
                                                     | 
                
                 | 
                logger = annif.logger  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    14
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    15
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    16
                 | 
                                    
                                                     | 
                
                 | 
                @functools.lru_cache(maxsize=1)  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                def _get_json_schema(schema_name):  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    18
                 | 
                                    
                                                     | 
                
                 | 
                    schema_path = files("annif.schemas").joinpath(schema_name) | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    19
                 | 
                                    
                                                     | 
                
                 | 
                    with schema_path.open("r", encoding="utf-8") as schema_file: | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                        return json.load(schema_file)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    21
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    23
                 | 
                                    
                                                     | 
                
                 | 
                def _subjects_to_subject_set(subjects, subject_index, language):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    24
                 | 
                                    
                                                     | 
                
                 | 
                    subject_ids = []  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    25
                 | 
                                    
                                                     | 
                
                 | 
                    for subj in subjects:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    26
                 | 
                                    
                                                     | 
                
                 | 
                        if "uri" in subj:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    27
                 | 
                                    
                                                     | 
                
                 | 
                            subject_ids.append(subject_index.by_uri(subj["uri"]))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    28
                 | 
                                    
                                                     | 
                
                 | 
                        else:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    29
                 | 
                                    
                                                     | 
                
                 | 
                            subject_ids.append(subject_index.by_label(subj["label"], language))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    30
                 | 
                                    
                                                     | 
                
                 | 
                    return SubjectSet(subject_ids)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    31
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    32
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    33
                 | 
                                    
                                                     | 
                
                 | 
                def json_to_document(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    34
                 | 
                                    
                                                     | 
                
                 | 
                    filename: str,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    35
                 | 
                                    
                                                     | 
                
                 | 
                    json_data: str,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    36
                 | 
                                    
                                                     | 
                
                 | 
                    subject_index: SubjectIndex | None,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    37
                 | 
                                    
                                                     | 
                
                 | 
                    language: str,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    38
                 | 
                                    
                                                     | 
                
                 | 
                    require_subjects: bool,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    39
                 | 
                                    
                                                     | 
                
                 | 
                ) -> Document | None:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    41
                 | 
                                    
                                                     | 
                
                 | 
                    import jsonschema  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    42
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    43
                 | 
                                    
                                                     | 
                
                 | 
                    try:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    44
                 | 
                                    
                                                     | 
                
                 | 
                        data = json.loads(json_data)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    45
                 | 
                                    
                                                     | 
                
                 | 
                    except json.JSONDecodeError as err:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    46
                 | 
                                    
                                                     | 
                
                 | 
                        logger.warning(f"JSON parsing failed for file {filename}: {err}") | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    47
                 | 
                                    
                                                     | 
                
                 | 
                        return None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    48
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    49
                 | 
                                    
                                                     | 
                
                 | 
                    try:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    50
                 | 
                                    
                                                     | 
                
                 | 
                        jsonschema.validate(instance=data, schema=_get_json_schema("document.json")) | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    51
                 | 
                                    
                                                     | 
                
                 | 
                    except jsonschema.ValidationError as err:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    52
                 | 
                                    
                                                     | 
                
                 | 
                        logger.warning(f"JSON validation failed for file {filename}: {err.message}") | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    53
                 | 
                                    
                                                     | 
                
                 | 
                        return None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    54
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    55
                 | 
                                    
                                                     | 
                
                 | 
                    if require_subjects or (subject_index is not None and "subjects" in data):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    56
                 | 
                                    
                                                     | 
                
                 | 
                        subject_set = _subjects_to_subject_set(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    57
                 | 
                                    
                                                     | 
                
                 | 
                            data.get("subjects", []), subject_index, language | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    58
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    59
                 | 
                                    
                                                     | 
                
                 | 
                        if not subject_set:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    60
                 | 
                                    
                                                     | 
                
                 | 
                            return None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    61
                 | 
                                    
                                                     | 
                
                 | 
                    else:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    62
                 | 
                                    
                                                     | 
                
                 | 
                        subject_set = None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    63
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    64
                 | 
                                    
                                                     | 
                
                 | 
                    return Document(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    65
                 | 
                                    
                                                     | 
                
                 | 
                        text=data.get("text", ""), | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    66
                 | 
                                    
                                                     | 
                
                 | 
                        metadata=data.get("metadata", {}), | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    67
                 | 
                                    
                                                     | 
                
                 | 
                        subject_set=subject_set,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    68
                 | 
                                    
                                                     | 
                
                 | 
                        file_path=filename,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    69
                 | 
                                    
                                                     | 
                
                 | 
                        document_id=data.get("document_id", None), | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    70
                 | 
                                    
                                                     | 
                
                 | 
                    )  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    71
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    72
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    73
                 | 
                                    
                                                     | 
                
                 | 
                def json_file_to_document(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    74
                 | 
                                    
                                                     | 
                
                 | 
                    filename: str,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    75
                 | 
                                    
                                                     | 
                
                 | 
                    subject_index: SubjectIndex | None,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    76
                 | 
                                    
                                                     | 
                
                 | 
                    language: str,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    77
                 | 
                                    
                                                     | 
                
                 | 
                    require_subjects: bool,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    78
                 | 
                                    
                                                     | 
                
                 | 
                ) -> Document | None:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    79
                 | 
                                    
                                                     | 
                
                 | 
                    if os.path.getsize(filename) == 0:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    80
                 | 
                                    
                                                     | 
                
                 | 
                        logger.warning(f"Skipping empty file {filename}") | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    81
                 | 
                                    
                                                     | 
                
                 | 
                        return None  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    82
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    83
                 | 
                                    
                                                     | 
                
                 | 
                    with open(filename, "r", encoding="utf-8") as jsonfile:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    84
                 | 
                                    
                                                     | 
                
                 | 
                        json_data = jsonfile.read()  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    85
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    86
                 | 
                                    
                                                     | 
                
                 | 
                    return json_to_document(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    87
                 | 
                                    
                                                     | 
                
                 | 
                        filename, json_data, subject_index, language, require_subjects  | 
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    88
                 | 
                                    
                                                     | 
                
                 | 
                    )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    89
                 | 
                                    
                                                     | 
                
                 | 
                 |