| 
                    1
                 | 
                                    
                                                     | 
                
                 | 
                """Neural network based ensemble backend that combines results from multiple  | 
            
            
                                                        
            
                                    
            
            
                | 
                    2
                 | 
                                    
                                                     | 
                
                 | 
                projects."""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    3
                 | 
                                    
                                                     | 
                
                 | 
                from __future__ import annotations  | 
            
            
                                                        
            
                                    
            
            
                | 
                    4
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    5
                 | 
                                    
                                                     | 
                
                 | 
                import os.path  | 
            
            
                                                        
            
                                    
            
            
                | 
                    6
                 | 
                                    
                                                     | 
                
                 | 
                import shutil  | 
            
            
                                                        
            
                                    
            
            
                | 
                    7
                 | 
                                    
                                                     | 
                
                 | 
                from io import BytesIO  | 
            
            
                                                        
            
                                    
            
            
                | 
                    8
                 | 
                                    
                                                     | 
                
                 | 
                from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union  | 
            
            
                                                        
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                import joblib  | 
            
            
                                                        
            
                                    
            
            
                | 
                    11
                 | 
                                    
                                                     | 
                
                 | 
                import lmdb  | 
            
            
                                                        
            
                                    
            
            
                | 
                    12
                 | 
                                    
                                                     | 
                
                 | 
                import numpy as np  | 
            
            
                                                        
            
                                    
            
            
                | 
                    13
                 | 
                                    
                                                     | 
                
                 | 
                import tensorflow.keras.backend as K  | 
            
            
                                                        
            
                                    
            
            
                | 
                    14
                 | 
                                    
                                                     | 
                
                 | 
                from scipy.sparse import csc_matrix, csr_matrix  | 
            
            
                                                        
            
                                    
            
            
                | 
                    15
                 | 
                                    
                                                     | 
                
                 | 
                from tensorflow.keras.layers import Add, Dense, Dropout, Flatten, Input, Layer  | 
            
            
                                                        
            
                                    
            
            
                | 
                    16
                 | 
                                    
                                                     | 
                
                 | 
                from tensorflow.keras.models import Model, load_model  | 
            
            
                                                        
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                from tensorflow.keras.utils import Sequence  | 
            
            
                                                        
            
                                    
            
            
                | 
                    18
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    19
                 | 
                                    
                                                     | 
                
                 | 
                import annif.corpus  | 
            
            
                                                        
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                import annif.parallel  | 
            
            
                                                        
            
                                    
            
            
                | 
                    21
                 | 
                                    
                                                     | 
                
                 | 
                import annif.util  | 
            
            
                                                        
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                from annif.exception import NotInitializedException, NotSupportedException  | 
            
            
                                                        
            
                                    
            
            
                | 
                    23
                 | 
                                    
                                                     | 
                
                 | 
                from annif.suggestion import SuggestionBatch, vector_to_suggestions  | 
            
            
                                                        
            
                                    
            
            
                | 
                    24
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    25
                 | 
                                    
                                                     | 
                
                 | 
                from . import backend, ensemble  | 
            
            
                                                        
            
                                    
            
            
                | 
                    26
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    27
                 | 
                                    
                                                     | 
                
                 | 
                if TYPE_CHECKING:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    28
                 | 
                                    
                                                     | 
                
                 | 
                    from tensorflow.python.framework.ops import EagerTensor  | 
            
            
                                                        
            
                                    
            
            
                | 
                    29
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    30
                 | 
                                    
                                                     | 
                
                 | 
                    from annif.corpus.document import DocumentCorpus  | 
            
            
                                                        
            
                                    
            
            
                | 
                    31
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    32
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    33
                 | 
                                    
                                                     | 
                
                 | 
                def idx_to_key(idx: int) -> bytes:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    34
                 | 
                                    
                                                     | 
                
                 | 
                    """convert an integer index to a binary key for use in LMDB"""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    35
                 | 
                                    
                                                     | 
                
                 | 
                    return b"%08d" % idx  | 
            
            
                                                        
            
                                    
            
            
                | 
                    36
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    37
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    38
                 | 
                                    
                                                     | 
                
                 | 
                def key_to_idx(key: Union[memoryview, bytes]) -> int:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    39
                 | 
                                    
                                                     | 
                
                 | 
                    """convert a binary LMDB key to an integer index"""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                    return int(key)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    41
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    42
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    43
                 | 
                                    
                                                     | 
                
                 | 
                class LMDBSequence(Sequence):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    44
                 | 
                                    
                                                     | 
                
                 | 
                    """A sequence of samples stored in a LMDB database."""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    45
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    46
                 | 
                                    
                                                     | 
                
                 | 
                    def __init__(self, txn, batch_size):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    47
                 | 
                                    
                                                     | 
                
                 | 
                        self._txn = txn  | 
            
            
                                                        
            
                                    
            
            
                | 
                    48
                 | 
                                    
                                                     | 
                
                 | 
                        cursor = txn.cursor()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    49
                 | 
                                    
                                                     | 
                
                 | 
                        if cursor.last():  | 
            
            
                                                        
            
                                    
            
            
                | 
                    50
                 | 
                                    
                                                     | 
                
                 | 
                            # Counter holds the number of samples in the database  | 
            
            
                                                        
            
                                    
            
            
                | 
                    51
                 | 
                                    
                                                     | 
                
                 | 
                            self._counter = key_to_idx(cursor.key()) + 1  | 
            
            
                                                        
            
                                    
            
            
                | 
                    52
                 | 
                                    
                                                     | 
                
                 | 
                        else:  # empty database  | 
            
            
                                                        
            
                                    
            
            
                | 
                    53
                 | 
                                    
                                                     | 
                
                 | 
                            self._counter = 0  | 
            
            
                                                        
            
                                    
            
            
                | 
                    54
                 | 
                                    
                                                     | 
                
                 | 
                        self._batch_size = batch_size  | 
            
            
                                                        
            
                                    
            
            
                | 
                    55
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    56
                 | 
                                    
                                                     | 
                
                 | 
                    def add_sample(self, inputs: np.ndarray, targets: np.ndarray) -> None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    57
                 | 
                                    
                                                     | 
                
                 | 
                        # use zero-padded 8-digit key  | 
            
            
                                                        
            
                                    
            
            
                | 
                    58
                 | 
                                    
                                                     | 
                
                 | 
                        key = idx_to_key(self._counter)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    59
                 | 
                                    
                                                     | 
                
                 | 
                        self._counter += 1  | 
            
            
                                                        
            
                                    
            
            
                | 
                    60
                 | 
                                    
                                                     | 
                
                 | 
                        # convert the sample into a sparse matrix and serialize it as bytes  | 
            
            
                                                        
            
                                    
            
            
                | 
                    61
                 | 
                                    
                                                     | 
                
                 | 
                        sample = (csc_matrix(inputs), csr_matrix(targets))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    62
                 | 
                                    
                                                     | 
                
                 | 
                        buf = BytesIO()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    63
                 | 
                                    
                                                     | 
                
                 | 
                        joblib.dump(sample, buf)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    64
                 | 
                                    
                                                     | 
                
                 | 
                        buf.seek(0)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    65
                 | 
                                    
                                                     | 
                
                 | 
                        self._txn.put(key, buf.read())  | 
            
            
                                                        
            
                                    
            
            
                | 
                    66
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    67
                 | 
                                    
                                                     | 
                
                 | 
                    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    68
                 | 
                                    
                                                     | 
                
                 | 
                        """get a particular batch of samples"""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    69
                 | 
                                    
                                                     | 
                
                 | 
                        cursor = self._txn.cursor()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    70
                 | 
                                    
                                                     | 
                
                 | 
                        first_key = idx * self._batch_size  | 
            
            
                                                        
            
                                    
            
            
                | 
                    71
                 | 
                                    
                                                     | 
                
                 | 
                        cursor.set_key(idx_to_key(first_key))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    72
                 | 
                                    
                                                     | 
                
                 | 
                        input_arrays = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    73
                 | 
                                    
                                                     | 
                
                 | 
                        target_arrays = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    74
                 | 
                                    
                                                     | 
                
                 | 
                        for key, value in cursor.iternext():  | 
            
            
                                                        
            
                                    
            
            
                | 
                    75
                 | 
                                    
                                                     | 
                
                 | 
                            if key_to_idx(key) >= (first_key + self._batch_size):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    76
                 | 
                                    
                                                     | 
                
                 | 
                                break  | 
            
            
                                                        
            
                                    
            
            
                | 
                    77
                 | 
                                    
                                                     | 
                
                 | 
                            input_csr, target_csr = joblib.load(BytesIO(value))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    78
                 | 
                                    
                                                     | 
                
                 | 
                            input_arrays.append(input_csr.toarray())  | 
            
            
                                                        
            
                                    
            
            
                | 
                    79
                 | 
                                    
                                                     | 
                
                 | 
                            target_arrays.append(target_csr.toarray().flatten())  | 
            
            
                                                        
            
                                    
            
            
                | 
                    80
                 | 
                                    
                                                     | 
                
                 | 
                        return np.array(input_arrays), np.array(target_arrays)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    81
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    82
                 | 
                                    
                                                     | 
                
                 | 
                    def __len__(self) -> int:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    83
                 | 
                                    
                                                     | 
                
                 | 
                        """return the number of available batches"""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    84
                 | 
                                    
                                                     | 
                
                 | 
                        return int(np.ceil(self._counter / self._batch_size))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    85
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    86
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    87
                 | 
                                    
                                                     | 
                
                 | 
                class MeanLayer(Layer):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    88
                 | 
                                    
                                                     | 
                
                 | 
                    """Custom Keras layer that calculates mean values along the 2nd axis."""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    89
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    90
                 | 
                                    
                                                     | 
                
                 | 
                    def call(self, inputs: EagerTensor) -> EagerTensor:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    91
                 | 
                                    
                                                     | 
                
                 | 
                        return K.mean(inputs, axis=2)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    92
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    93
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    94
                 | 
                                    
                                                     | 
                
                 | 
                class NNEnsembleBackend(backend.AnnifLearningBackend, ensemble.BaseEnsembleBackend):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    95
                 | 
                                    
                                                     | 
                
                 | 
                    """Neural network ensemble backend that combines results from multiple  | 
            
            
                                                        
            
                                    
            
            
                | 
                    96
                 | 
                                    
                                                     | 
                
                 | 
                    projects"""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    97
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    98
                 | 
                                    
                                                     | 
                
                 | 
                    name = "nn_ensemble"  | 
            
            
                                                        
            
                                    
            
            
                | 
                    99
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    100
                 | 
                                    
                                                     | 
                
                 | 
                    MODEL_FILE = "nn-model.h5"  | 
            
            
                                                        
            
                                    
            
            
                | 
                    101
                 | 
                                    
                                                     | 
                
                 | 
                    LMDB_FILE = "nn-train.mdb"  | 
            
            
                                                        
            
                                    
            
            
                | 
                    102
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    103
                 | 
                                    
                                                     | 
                
                 | 
                    DEFAULT_PARAMETERS = { | 
            
            
                                                        
            
                                    
            
            
                | 
                    104
                 | 
                                    
                                                     | 
                
                 | 
                        "nodes": 100,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    105
                 | 
                                    
                                                     | 
                
                 | 
                        "dropout_rate": 0.2,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    106
                 | 
                                    
                                                     | 
                
                 | 
                        "optimizer": "adam",  | 
            
            
                                                        
            
                                    
            
            
                | 
                    107
                 | 
                                    
                                                     | 
                
                 | 
                        "epochs": 10,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    108
                 | 
                                    
                                                     | 
                
                 | 
                        "learn-epochs": 1,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    109
                 | 
                                    
                                                     | 
                
                 | 
                        "lmdb_map_size": 1024 * 1024 * 1024,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    110
                 | 
                                    
                                                     | 
                
                 | 
                    }  | 
            
            
                                                        
            
                                    
            
            
                | 
                    111
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    112
                 | 
                                    
                                                     | 
                
                 | 
                    # defaults for uninitialized instances  | 
            
            
                                                        
            
                                    
            
            
                | 
                    113
                 | 
                                    
                                                     | 
                
                 | 
                    _model = None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    114
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    115
                 | 
                                    
                                                     | 
                
                 | 
                    def default_params(self) -> Dict[str, Any]:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    116
                 | 
                                    
                                                     | 
                
                 | 
                        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    117
                 | 
                                    
                                                     | 
                
                 | 
                        params.update(self.DEFAULT_PARAMETERS)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    118
                 | 
                                    
                                                     | 
                
                 | 
                        return params  | 
            
            
                                                        
            
                                    
            
            
                | 
                    119
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    120
                 | 
                                    
                                                     | 
                
                 | 
                    def initialize(self, parallel: bool = False) -> None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    121
                 | 
                                    
                                                     | 
                
                 | 
                        super().initialize(parallel)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    122
                 | 
                                    
                                                     | 
                
                 | 
                        if self._model is not None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    123
                 | 
                                    
                                                     | 
                
                 | 
                            return  # already initialized  | 
            
            
                                                        
            
                                    
            
            
                | 
                    124
                 | 
                                    
                                                     | 
                
                 | 
                        if parallel:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    125
                 | 
                                    
                                                     | 
                
                 | 
                            # Don't load TF model just before parallel execution,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    126
                 | 
                                    
                                                     | 
                
                 | 
                            # since it won't work after forking worker processes  | 
            
            
                                                        
            
                                    
            
            
                | 
                    127
                 | 
                                    
                                                     | 
                
                 | 
                            return  | 
            
            
                                                        
            
                                    
            
            
                | 
                    128
                 | 
                                    
                                                     | 
                
                 | 
                        model_filename = os.path.join(self.datadir, self.MODEL_FILE)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    129
                 | 
                                    
                                                     | 
                
                 | 
                        if not os.path.exists(model_filename):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    130
                 | 
                                    
                                                     | 
                
                 | 
                            raise NotInitializedException(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    131
                 | 
                                    
                                                     | 
                
                 | 
                                "model file {} not found".format(model_filename), | 
            
            
                                                        
            
                                    
            
            
                | 
                    132
                 | 
                                    
                                                     | 
                
                 | 
                                backend_id=self.backend_id,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    133
                 | 
                                    
                                                     | 
                
                 | 
                            )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    134
                 | 
                                    
                                                     | 
                
                 | 
                        self.debug("loading Keras model from {}".format(model_filename)) | 
            
            
                                                        
            
                                    
            
            
                | 
                    135
                 | 
                                    
                                                     | 
                
                 | 
                        self._model = load_model(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    136
                 | 
                                    
                                                     | 
                
                 | 
                            model_filename, custom_objects={"MeanLayer": MeanLayer} | 
            
            
                                                        
            
                                    
            
            
                | 
                    137
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    138
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    139
                 | 
                                    
                                                     | 
                
                 | 
                    def _merge_source_batches(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    140
                 | 
                                    
                                                     | 
                
                 | 
                        self,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    141
                 | 
                                    
                                                     | 
                
                 | 
                        batch_by_source: Dict[str, SuggestionBatch],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    142
                 | 
                                    
                                                     | 
                
                 | 
                        sources: List[Tuple[str, float]],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    143
                 | 
                                    
                                                     | 
                
                 | 
                        params: Dict[str, Any],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    144
                 | 
                                    
                                                     | 
                
                 | 
                    ) -> SuggestionBatch:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    145
                 | 
                                    
                                                     | 
                
                 | 
                        src_weight = dict(sources)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    146
                 | 
                                    
                                                     | 
                
                 | 
                        score_vectors = np.array(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    147
                 | 
                                    
                                                     | 
                
                 | 
                            [  | 
            
            
                                                        
            
                                    
            
            
                | 
                    148
                 | 
                                    
                                                     | 
                
                 | 
                                [  | 
            
            
                                                        
            
                                    
            
            
                | 
                    149
                 | 
                                    
                                                     | 
                
                 | 
                                    np.sqrt(suggestions.as_vector())  | 
            
            
                                                        
            
                                    
            
            
                | 
                    150
                 | 
                                    
                                                     | 
                
                 | 
                                    * src_weight[project_id]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    151
                 | 
                                    
                                                     | 
                
                 | 
                                    * len(batch_by_source)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    152
                 | 
                                    
                                                     | 
                
                 | 
                                    for suggestions in batch  | 
            
            
                                                        
            
                                    
            
            
                | 
                    153
                 | 
                                    
                                                     | 
                
                 | 
                                ]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    154
                 | 
                                    
                                                     | 
                
                 | 
                                for project_id, batch in batch_by_source.items()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    155
                 | 
                                    
                                                     | 
                
                 | 
                            ],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    156
                 | 
                                    
                                                     | 
                
                 | 
                            dtype=np.float32,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    157
                 | 
                                    
                                                     | 
                
                 | 
                        ).transpose(1, 2, 0)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    158
                 | 
                                    
                                                     | 
                
                 | 
                        prediction = self._model(score_vectors).numpy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    159
                 | 
                                    
                                                     | 
                
                 | 
                        return SuggestionBatch.from_sequence(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    160
                 | 
                                    
                                                     | 
                
                 | 
                            [  | 
            
            
                                                        
            
                                    
            
            
                | 
                    161
                 | 
                                    
                                                     | 
                
                 | 
                                vector_to_suggestions(row, limit=int(params["limit"]))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    162
                 | 
                                    
                                                     | 
                
                 | 
                                for row in prediction  | 
            
            
                                                        
            
                                    
            
            
                | 
                    163
                 | 
                                    
                                                     | 
                
                 | 
                            ],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    164
                 | 
                                    
                                                     | 
                
                 | 
                            self.project.subjects,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    165
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    166
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    167
                 | 
                                    
                                                     | 
                
                 | 
                    def _create_model(self, sources: List[Tuple[str, float]]) -> None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    168
                 | 
                                    
                                                     | 
                
                 | 
                        self.info("creating NN ensemble model") | 
            
            
                                                        
            
                                    
            
            
                | 
                    169
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    170
                 | 
                                    
                                                     | 
                
                 | 
                        inputs = Input(shape=(len(self.project.subjects), len(sources)))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    171
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    172
                 | 
                                    
                                                     | 
                
                 | 
                        flat_input = Flatten()(inputs)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    173
                 | 
                                    
                                                     | 
                
                 | 
                        drop_input = Dropout(rate=float(self.params["dropout_rate"]))(flat_input)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    174
                 | 
                                    
                                                     | 
                
                 | 
                        hidden = Dense(int(self.params["nodes"]), activation="relu")(drop_input)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    175
                 | 
                                    
                                                     | 
                
                 | 
                        drop_hidden = Dropout(rate=float(self.params["dropout_rate"]))(hidden)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    176
                 | 
                                    
                                                     | 
                
                 | 
                        delta = Dense(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    177
                 | 
                                    
                                                     | 
                
                 | 
                            len(self.project.subjects),  | 
            
            
                                                        
            
                                    
            
            
                | 
                    178
                 | 
                                    
                                                     | 
                
                 | 
                            kernel_initializer="zeros",  | 
            
            
                                                        
            
                                    
            
            
                | 
                    179
                 | 
                                    
                                                     | 
                
                 | 
                            bias_initializer="zeros",  | 
            
            
                                                        
            
                                    
            
            
                | 
                    180
                 | 
                                    
                                                     | 
                
                 | 
                        )(drop_hidden)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    181
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    182
                 | 
                                    
                                                     | 
                
                 | 
                        mean = MeanLayer()(inputs)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    183
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    184
                 | 
                                    
                                                     | 
                
                 | 
                        predictions = Add()([mean, delta])  | 
            
            
                                                        
            
                                    
            
            
                | 
                    185
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    186
                 | 
                                    
                                                     | 
                
                 | 
                        self._model = Model(inputs=inputs, outputs=predictions)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    187
                 | 
                                    
                                                     | 
                
                 | 
                        self._model.compile(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    188
                 | 
                                    
                                                     | 
                
                 | 
                            optimizer=self.params["optimizer"],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    189
                 | 
                                    
                                                     | 
                
                 | 
                            loss="binary_crossentropy",  | 
            
            
                                                        
            
                                    
            
            
                | 
                    190
                 | 
                                    
                                                     | 
                
                 | 
                            metrics=["top_k_categorical_accuracy"],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    191
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    192
                 | 
                                    
                                                     | 
                
                 | 
                        if "lr" in self.params:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    193
                 | 
                                    
                                                     | 
                
                 | 
                            self._model.optimizer.learning_rate.assign(float(self.params["lr"]))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    194
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    195
                 | 
                                    
                                                     | 
                
                 | 
                        summary = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    196
                 | 
                                    
                                                     | 
                
                 | 
                        self._model.summary(print_fn=summary.append)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    197
                 | 
                                    
                                                     | 
                
                 | 
                        self.debug("Created model: \n" + "\n".join(summary)) | 
            
            
                                                        
            
                                    
            
            
                | 
                    198
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    199
                 | 
                                    
                                                     | 
                
                 | 
                    def _train(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    200
                 | 
                                    
                                                     | 
                
                 | 
                        self,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    201
                 | 
                                    
                                                     | 
                
                 | 
                        corpus: DocumentCorpus,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    202
                 | 
                                    
                                                     | 
                
                 | 
                        params: Dict[str, Any],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    203
                 | 
                                    
                                                     | 
                
                 | 
                        jobs: int = 0,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    204
                 | 
                                    
                                                     | 
                
                 | 
                    ) -> None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    205
                 | 
                                    
                                                     | 
                
                 | 
                        sources = annif.util.parse_sources(self.params["sources"])  | 
            
            
                                                        
            
                                    
            
            
                | 
                    206
                 | 
                                    
                                                     | 
                
                 | 
                        self._create_model(sources)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    207
                 | 
                                    
                                                     | 
                
                 | 
                        self._fit_model(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    208
                 | 
                                    
                                                     | 
                
                 | 
                            corpus,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    209
                 | 
                                    
                                                     | 
                
                 | 
                            epochs=int(params["epochs"]),  | 
            
            
                                                        
            
                                    
            
            
                | 
                    210
                 | 
                                    
                                                     | 
                
                 | 
                            lmdb_map_size=int(params["lmdb_map_size"]),  | 
            
            
                                                        
            
                                    
            
            
                | 
                    211
                 | 
                                    
                                                     | 
                
                 | 
                            n_jobs=jobs,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    212
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    213
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    214
                 | 
                                    
                                                     | 
                
                 | 
                    def _corpus_to_vectors(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    215
                 | 
                                    
                                                     | 
                
                 | 
                        self,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    216
                 | 
                                    
                                                     | 
                
                 | 
                        corpus: DocumentCorpus,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    217
                 | 
                                    
                                                     | 
                
                 | 
                        seq: LMDBSequence,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    218
                 | 
                                    
                                                     | 
                
                 | 
                        n_jobs: int,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    219
                 | 
                                    
                                                     | 
                
                 | 
                    ) -> None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    220
                 | 
                                    
                                                     | 
                
                 | 
                        # pass corpus through all source projects  | 
            
            
                                                        
            
                                    
            
            
                | 
                    221
                 | 
                                    
                                                     | 
                
                 | 
                        sources = dict(annif.util.parse_sources(self.params["sources"]))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    222
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    223
                 | 
                                    
                                                     | 
                
                 | 
                        # initialize the source projects before forking, to save memory  | 
            
            
                                                        
            
                                    
            
            
                | 
                    224
                 | 
                                    
                                                     | 
                
                 | 
                        self.info(f"Initializing source projects: {', '.join(sources.keys())}") | 
            
            
                                                        
            
                                    
            
            
                | 
                    225
                 | 
                                    
                                                     | 
                
                 | 
                        for project_id in sources.keys():  | 
            
            
                                                        
            
                                    
            
            
                | 
                    226
                 | 
                                    
                                                     | 
                
                 | 
                            project = self.project.registry.get_project(project_id)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    227
                 | 
                                    
                                                     | 
                
                 | 
                            project.initialize(parallel=True)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    228
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    229
                 | 
                                    
                                                     | 
                
                 | 
                        psmap = annif.parallel.ProjectSuggestMap(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    230
                 | 
                                    
                                                     | 
                
                 | 
                            self.project.registry,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    231
                 | 
                                    
                                                     | 
                
                 | 
                            list(sources.keys()),  | 
            
            
                                                        
            
                                    
            
            
                | 
                    232
                 | 
                                    
                                                     | 
                
                 | 
                            backend_params=None,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    233
                 | 
                                    
                                                     | 
                
                 | 
                            limit=None,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    234
                 | 
                                    
                                                     | 
                
                 | 
                            threshold=0.0,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    235
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    236
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    237
                 | 
                                    
                                                     | 
                
                 | 
                        jobs, pool_class = annif.parallel.get_pool(n_jobs)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    238
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    239
                 | 
                                    
                                                     | 
                
                 | 
                        self.info("Processing training documents...") | 
            
            
                                                        
            
                                    
            
            
                | 
                    240
                 | 
                                    
                                                     | 
                
                 | 
                        with pool_class(jobs) as pool:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    241
                 | 
                                    
                                                     | 
                
                 | 
                            for hits, subject_set in pool.imap_unordered(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    242
                 | 
                                    
                                                     | 
                
                 | 
                                psmap.suggest, corpus.documents  | 
            
            
                                                        
            
                                    
            
            
                | 
                    243
                 | 
                                    
                                                     | 
                
                 | 
                            ):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    244
                 | 
                                    
                                                     | 
                
                 | 
                                doc_scores = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    245
                 | 
                                    
                                                     | 
                
                 | 
                                for project_id, p_hits in hits.items():  | 
            
            
                                                        
            
                                    
            
            
                | 
                    246
                 | 
                                    
                                                     | 
                
                 | 
                                    vector = p_hits.as_vector()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    247
                 | 
                                    
                                                     | 
                
                 | 
                                    doc_scores.append(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    248
                 | 
                                    
                                                     | 
                
                 | 
                                        np.sqrt(vector) * sources[project_id] * len(sources)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    249
                 | 
                                    
                                                     | 
                
                 | 
                                    )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    250
                 | 
                                    
                                                     | 
                
                 | 
                                score_vector = np.array(doc_scores, dtype=np.float32).transpose()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    251
                 | 
                                    
                                                     | 
                
                 | 
                                true_vector = subject_set.as_vector(len(self.project.subjects))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    252
                 | 
                                    
                                                     | 
                
                 | 
                                seq.add_sample(score_vector, true_vector)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    253
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    254
                 | 
                                    
                                                     | 
                
                 | 
                    def _open_lmdb(self, cached, lmdb_map_size):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    255
                 | 
                                    
                                                     | 
                
                 | 
                        lmdb_path = os.path.join(self.datadir, self.LMDB_FILE)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    256
                 | 
                                    
                                                     | 
                
                 | 
                        if not cached and os.path.exists(lmdb_path):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    257
                 | 
                                    
                                                     | 
                
                 | 
                            shutil.rmtree(lmdb_path)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    258
                 | 
                                    
                                                     | 
                
                 | 
                        return lmdb.open(lmdb_path, map_size=lmdb_map_size, writemap=True)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    259
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    260
                 | 
                                    
                                                     | 
                
                 | 
                    def _fit_model(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    261
                 | 
                                    
                                                     | 
                
                 | 
                        self,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    262
                 | 
                                    
                                                     | 
                
                 | 
                        corpus: DocumentCorpus,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    263
                 | 
                                    
                                                     | 
                
                 | 
                        epochs: int,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    264
                 | 
                                    
                                                     | 
                
                 | 
                        lmdb_map_size: int,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    265
                 | 
                                    
                                                     | 
                
                 | 
                        n_jobs: int = 1,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    266
                 | 
                                    
                                                     | 
                
                 | 
                    ) -> None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    267
                 | 
                                    
                                                     | 
                
                 | 
                        env = self._open_lmdb(corpus == "cached", lmdb_map_size)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    268
                 | 
                                    
                                                     | 
                
                 | 
                        if corpus != "cached":  | 
            
            
                                                        
            
                                    
            
            
                | 
                    269
                 | 
                                    
                                                     | 
                
                 | 
                            if corpus.is_empty():  | 
            
            
                                                        
            
                                    
            
            
                | 
                    270
                 | 
                                    
                                                     | 
                
                 | 
                                raise NotSupportedException(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    271
                 | 
                                    
                                                     | 
                
                 | 
                                    "Cannot train nn_ensemble project with no documents"  | 
            
            
                                                        
            
                                    
            
            
                | 
                    272
                 | 
                                    
                                                     | 
                
                 | 
                                )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    273
                 | 
                                    
                                                     | 
                
                 | 
                            with env.begin(write=True, buffers=True) as txn:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    274
                 | 
                                    
                                                     | 
                
                 | 
                                seq = LMDBSequence(txn, batch_size=32)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    275
                 | 
                                    
                                                     | 
                
                 | 
                                self._corpus_to_vectors(corpus, seq, n_jobs)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    276
                 | 
                                    
                                                     | 
                
                 | 
                        else:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    277
                 | 
                                    
                                                     | 
                
                 | 
                            self.info("Reusing cached training data from previous run.") | 
            
            
                                                        
            
                                    
            
            
                | 
                    278
                 | 
                                    
                                                     | 
                
                 | 
                        # fit the model using a read-only view of the LMDB  | 
            
            
                                                        
            
                                    
            
            
                | 
                    279
                 | 
                                    
                                                     | 
                
                 | 
                        self.info("Training neural network model...") | 
            
            
                                                        
            
                                    
            
            
                | 
                    280
                 | 
                                    
                                                     | 
                
                 | 
                        with env.begin(buffers=True) as txn:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    281
                 | 
                                    
                                                     | 
                
                 | 
                            seq = LMDBSequence(txn, batch_size=32)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    282
                 | 
                                    
                                                     | 
                
                 | 
                            self._model.fit(seq, verbose=True, epochs=epochs)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    283
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    284
                 | 
                                    
                                                     | 
                
                 | 
                        annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    285
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    286
                 | 
                                    
                                                     | 
                
                 | 
                    def _learn(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    287
                 | 
                                    
                                                     | 
                
                 | 
                        self,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    288
                 | 
                                    
                                                     | 
                
                 | 
                        corpus: DocumentCorpus,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    289
                 | 
                                    
                                                     | 
                
                 | 
                        params: Dict[str, Any],  | 
            
            
                                                        
            
                                    
            
            
                | 
                    290
                 | 
                                    
                                                     | 
                
                 | 
                    ) -> None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    291
                 | 
                                    
                                                     | 
                
                 | 
                        self.initialize()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    292
                 | 
                                    
                                                     | 
                
                 | 
                        self._fit_model(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    293
                 | 
                                    
                                                     | 
                
                 | 
                            corpus, int(params["learn-epochs"]), int(params["lmdb_map_size"])  | 
            
            
                                                        
            
                                    
            
            
                | 
                    294
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                        
            
                                    
            
            
                | 
                    295
                 | 
                                    
                                                     | 
                
                 | 
                 |