Passed
Push — dependabot/pip/sphinx-copybutt... ( c72176 )
by
unknown
18:24 queued 16:24
created

mandos.model.hits.HitUtils.hits_to_df()   A

Complexity

Conditions 2

Size

Total Lines 9
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 9
nop 2
dl 0
loc 9
rs 9.95
c 0
b 0
f 0
1
import dataclasses
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
import html
3
from dataclasses import dataclass
4
from datetime import datetime
5
from typing import Optional, Sequence
6
7
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
8
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
9
10
from mandos.model import ReflectionUtils
11
12
# Immutable set of Python types; presumably the value types a hit field may hold
# (not enforced anywhere in this module -- confirm against callers).
HIT_FIELD_TYPE = frozenset({str, int, float, datetime})
13
14
15
@dataclass(order=True, repr=True, frozen=True)
class Pair:
    """
    An immutable (predicate, object) pair.

    Instances compare and sort field by field, in declaration order.
    """

    # the predicate of the statement
    pred: str
    # the object of the statement
    obj: str
23
24
25
@dataclass(frozen=True, repr=True, order=True)
class Triple:
    """
    A subject, predicate, object statement (usually compound, predicate, object).

    Instances are immutable and compare field by field, in declaration order.
    """

    sub: str
    pred: str
    obj: str

    @property
    def n_triples(self) -> str:
        """
        Returns a simple text statement in n-triples format.

        All three terms are passed through ``html.escape(..., quote=True)`` so
        that a double quote inside any term (the subject included) cannot end
        the surrounding quoted term early.

        Returns:
            The three escaped terms, each wrapped in double quotes, separated
            by single spaces and followed by `` .``
        """
        sub, pred, obj = (
            html.escape(term, quote=True) for term in (self.sub, self.pred, self.obj)
        )
        return f'"{sub}" "{pred}" "{obj}" .'
44
45
46
@dataclass(frozen=True, order=True, repr=True)
class AbstractHit:
    """
    An abstract annotation (statement type), which may support additional fields.

    Instances are immutable and compare field by field, in declaration order.
    Hashing is based on ``record_id`` alone (see ``__hash__``).
    """

    record_id: Optional[str]
    origin_inchikey: str
    matched_inchikey: str
    compound_id: str
    compound_name: str
    predicate: str
    statement: str
    object_id: str
    object_name: str
    value: float
    search_key: str
    search_class: str
    data_source: str
    run_date: datetime
    cache_date: Optional[datetime]
    # Not fields yet; kept from the original as placeholders:
    # is_hit: Optional[bool] = None
    # score: Optional[float] = None
    # x_score_1: Optional[float] = None
    # x_score_2: Optional[float] = None

    @property
    def hit_class(self) -> str:
        """
        Returns the unqualified name of this hit's concrete class.
        """
        return self.__class__.__name__

    @property
    def to_triple(self) -> Triple:
        """
        Returns the (``origin_inchikey``, ``predicate``, ``object_name``) triple.
        """
        return Triple(sub=self.origin_inchikey, pred=self.predicate, obj=self.object_name)

    @property
    def to_pair(self) -> Pair:
        """
        Returns the (``predicate``, ``object_name``) pair.
        """
        return Pair(pred=self.predicate, obj=self.object_name)

    def __hash__(self):
        """
        Hashes on ``record_id`` only.
        """
        return hash(self.record_id)

    @property
    def universal_id(self) -> str:
        """
        Gets an identifier (a hex key) that uniquely identifies the record by its unique attributes.
        Does **NOT** distinguish between hits with duplicate information and does **NOT**
        include ``record_id``.

        The key is a truncated SHA-256 digest of the ``repr`` of the included
        field values, taken in field declaration order. Unlike the built-in
        ``hash``, it does not depend on per-process hash randomization, so the
        same hit yields the same key in every interpreter run.

        Returns:
            A 16-character hexadecimal string
        """
        # function-scope import so this class does not rely on a module-level import
        import hashlib

        # excluding record_id only because it's not available for some hit types
        # we'd rather immediately see duplicates if they exist
        excluded = {"record_id", "origin_inchikey", "compound_name", "search_key", "search_class"}
        # a tuple built in declaration order keeps the digest input deterministic
        values = tuple(getattr(self, name) for name in self.fields() if name not in excluded)
        digest = hashlib.sha256(repr(values).encode("utf-8")).hexdigest()
        return digest[:16]

    @classmethod
    def fields(cls) -> Sequence[str]:
        """
        Finds the list of fields in this class by reflection.

        Returns:
            The dataclass field names of ``cls``, in the order
            ``dataclasses.fields`` reports them
        """
        return [f.name for f in dataclasses.fields(cls)]
115
116
117
# Typed DataFrame class for tabular hits, assembled with the typeddfs builder.
# The ``require`` calls list string-typed columns; the ``reserve`` calls list
# ``is_hit`` (bool) and ``score`` / ``x_score_1`` .. ``x_score_9`` (float).
# NOTE(review): "inchikey" is listed here, but AbstractHit declares
# ``origin_inchikey`` and ``matched_inchikey`` instead -- confirm which is intended.
HitFrame = (
    TypedDfs.typed("HitFrame")
    .require("record_id", dtype=str)
    .require("inchikey", "compound_id", "compound_name", dtype=str)
    .require("predicate", "statement", dtype=str)
    .require("object_id", "object_name", dtype=str)
    .require("search_key", "search_class", "data_source", dtype=str)
    .require("hit_class", dtype=str)
    .require("cache_date", "run_date", dtype=str)
    .reserve("is_hit", dtype=bool)
    .reserve("score", *(f"x_score_{n}" for n in range(1, 10)), dtype=float)
    .build()
)
129
130
131
class HitUtils:
    """
    Conversions between sequences of hits and ``HitFrame`` tables.
    """

    @classmethod
    def hits_to_df(cls, hits: Sequence[AbstractHit]) -> HitFrame:
        """
        Builds a ``HitFrame`` with one row per hit.

        Each row holds every dataclass field of the hit's own class, plus
        ``universal_id`` and ``hit_class``.

        Args:
            hits: The hits to tabulate

        Returns:
            A ``HitFrame`` constructed from one ``pd.Series`` per hit
        """
        rows = []
        for hit in hits:
            record = {name: getattr(hit, name) for name in hit.__class__.fields()}
            record["universal_id"] = hit.universal_id
            record["hit_class"] = hit.hit_class
            rows.append(pd.Series(record))
        return HitFrame(rows)

    @classmethod
    def df_to_hits(cls, self: HitFrame) -> Sequence[AbstractHit]:
        """
        Rebuilds hit objects from the rows of a ``HitFrame``.

        Args:
            self: The frame to convert; it must have a ``hit_class`` column

        Returns:
            One hit per row, in row order
        """
        available = set(self.columns)
        hits = []
        # iterrows() yields (index, Series) tuples; only the Series is needed
        for _, row in self.iterrows():
            # presumably resolves the stored class name to an AbstractHit subclass;
            # verify against ReflectionUtils.injection
            clazz = ReflectionUtils.injection(row["hit_class"], AbstractHit)
            # ignore extra columns (e.g. hit_class, universal_id)
            # if cols are missing, let it fail on clazz.__init__
            data = {name: row[name] for name in clazz.fields() if name in available}
            # noinspection PyArgumentList
            hits.append(clazz(**data))
        return hits
154
155
156
# Names exported by ``from <this module> import *``.
__all__ = [
    "AbstractHit",
    "HitFrame",
    "Pair",
    "Triple",
    "HIT_FIELD_TYPE",
    "HitUtils",
]
157