annif.lexical.util   A
last analyzed

Complexity

Total Complexity 9

Size/Duplication

Total Lines 59
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 38
dl 0
loc 59
rs 10
c 0
b 0
f 0
wmc 9

3 Functions

Rating   Name   Duplication   Size   Complexity  
A get_subject_labels() 0 8 1
A make_relation_matrix() 0 13 4
A make_collection_matrix() 0 15 4
1
"""Utility methods for lexical algorithms"""
2
3
from __future__ import annotations
4
5
import collections
6
from typing import TYPE_CHECKING
7
8
from rdflib import URIRef
9
from rdflib.namespace import SKOS
10
from scipy.sparse import csc_matrix, lil_matrix
11
12
if TYPE_CHECKING:
13
    from rdflib.graph import Graph
14
15
    from annif.vocab import AnnifVocabulary
16
17
18
def get_subject_labels(
    graph: Graph, uri: str, properties: list[URIRef], language: str
) -> list[str]:
    """Return the labels of a subject for the given properties and language.

    For each property in ``properties`` (in order), the objects of the
    triples ``(URIRef(uri), property, object)`` found in ``graph`` are
    inspected, and those whose ``language`` attribute equals ``language``
    are returned converted to ``str``.

    Args:
        graph: graph queried through its ``objects()`` method.
        uri: URI of the subject whose labels are looked up.
        properties: label properties to query, in the order they should
            appear in the result.
        language: language tag a label must carry to be included.

    Returns:
        The matching labels as plain strings; empty if nothing matches.
    """
    # The subject node is the same for every property, so build it once.
    subject = URIRef(uri)
    return [
        str(label)
        for prop in properties
        for label in graph.objects(subject, prop)
        if label.language == language
    ]
27
28
29
def make_relation_matrix(
    graph: Graph, vocab: AnnifVocabulary, property: URIRef
) -> csc_matrix:
    """Build a boolean subject-by-subject matrix for one relation.

    Every ``(subject, object)`` pair that ``graph`` reports for
    ``property`` is looked up in ``vocab.subjects`` by URI. When both ends
    resolve to a subject id, the cell ``[subject_id, object_id]`` is set to
    ``True``. Pairs where either end is unknown to the vocabulary are
    skipped.

    Args:
        graph: graph queried through its ``subject_objects()`` method.
        vocab: vocabulary whose ``subjects`` index supplies the matrix
            size (``len``) and the URI -> id lookup (``by_uri``).
        property: the relation (predicate) to extract from the graph.

    Returns:
        A square ``csc_matrix`` of dtype ``bool`` with one row and one
        column per subject in the vocabulary.
    """
    n_subj = len(vocab.subjects)
    # LIL format supports cheap incremental assignment; the result is
    # converted to CSC once all cells are filled in.
    matrix = lil_matrix((n_subj, n_subj), dtype=bool)

    for subj, obj in graph.subject_objects(property):
        # warnings=False: URIs missing from the vocabulary are expected
        # here and are simply skipped below.
        subj_id = vocab.subjects.by_uri(str(subj), warnings=False)
        obj_id = vocab.subjects.by_uri(str(obj), warnings=False)
        if subj_id is not None and obj_id is not None:
            matrix[subj_id, obj_id] = True

    return csc_matrix(matrix)
42
43
44
def make_collection_matrix(graph: Graph, vocab: AnnifVocabulary) -> csc_matrix:
    """Build a boolean collection-by-subject membership matrix.

    All ``skos:member`` pairs in ``graph`` are grouped by collection.
    Members whose URI is not found in ``vocab.subjects`` are ignored, so a
    collection with no known members gets no row at all.

    Args:
        graph: graph queried through its ``subject_objects()`` method.
        vocab: vocabulary whose ``subjects`` index supplies the number of
            columns (``len``) and the URI -> id lookup (``by_uri``).

    Returns:
        A ``csc_matrix`` of dtype ``bool`` with one row per collection
        (in the order collections were first encountered) and one column
        per subject; a cell is ``True`` when that subject is a member of
        that collection.
    """
    # make an index with all collection members
    c_members = collections.defaultdict(list)
    for coll, member in graph.subject_objects(SKOS.member):
        member_id = vocab.subjects.by_uri(str(member), warnings=False)
        if member_id is not None:
            c_members[str(coll)].append(member_id)

    c_matrix = lil_matrix((len(c_members), len(vocab.subjects)), dtype=bool)

    # populate the matrix for collection -> subject_id; the row index is
    # the collection's position in the (insertion-ordered) index
    for c_id, members in enumerate(c_members.values()):
        c_matrix[c_id, members] = True

    return csc_matrix(c_matrix)
59