Passed
Push — main ( bfa577...eb6882 )
by Douglas
04:37
created

mandos.model.chembl_support.chembl_targets   A

Complexity

Total Complexity 31

Size/Duplication

Total Lines 321
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 186
dl 0
loc 321
rs 9.92
c 0
b 0
f 0
wmc 31

15 Methods

Rating   Name   Duplication   Size   Complexity  
A TargetType.protein_types() 0 3 1
A DagTargetLinkType.cross() 0 13 4
A TargetType.of() 0 8 2
A ChemblTarget.traverse() 0 16 1
A TargetType.all_types() 0 3 1
A TargetType.is_protein() 0 7 1
A ChemblTarget.api() 0 8 1
A TargetRelationshipType.of() 0 3 1
A ChemblTarget.find() 0 20 2
A DagTargetLinkType.matches() 0 20 4
A TargetType.is_traversable() 0 8 1
A TargetType.is_unknown() 0 3 1
B ChemblTarget._traverse() 0 46 7
A ChemblTarget.links() 0 21 3
A TargetFactory.find() 0 20 1
1
"""
2
Model of ChEMBL targets and a hierarchy between them as a directed acyclic graph (DAG).
3
"""
4
from __future__ import annotations
5
6
import abc
7
import enum
8
import logging
9
import re
10
from dataclasses import dataclass
11
from typing import Optional, Sequence, Set
12
from typing import Tuple as Tup
13
14
from urllib3.util.retry import MaxRetryError
0 ignored issues
show
introduced by
Unable to import 'urllib3.util.retry'
Loading history...
15
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
16
17
from mandos.model.chembl_api import ChemblApi
18
19
# Shared logger for this module, named after the enclosing package (``__package__``)
# rather than after the module itself.
logger = logging.getLogger(__package__)
20
21
22
class TargetNotFoundError(ValueError):
    """
    Raised when a target could not be fetched from ChEMBL.
    """
24
25
26
class TargetType(enum.Enum):
    """
    Enum corresponding to the ChEMBL API field ``target.target_type``.
    """

    single_protein = enum.auto()
    protein_family = enum.auto()
    protein_complex = enum.auto()
    protein_complex_group = enum.auto()
    selectivity_group = enum.auto()
    protein_protein_interaction = enum.auto()
    nucleic_acid = enum.auto()
    chimeric_protein = enum.auto()
    protein_nucleic_acid_complex = enum.auto()
    metal = enum.auto()
    small_molecule = enum.auto()
    subcellular = enum.auto()
    unknown = enum.auto()

    @classmethod
    def of(cls, s: str) -> TargetType:
        """
        Looks up a member from a target-type string.

        Spaces and hyphens are replaced with underscores and the result is lowercased
        before the lookup, so ``"SINGLE PROTEIN"`` maps to ``single_protein``.

        Args:
            s: The target-type string

        Returns:
            The matching member, or ``TargetType.unknown`` (after logging an error)
            if no member has that name
        """
        key = s.replace(" ", "_").replace("-", "_").lower()
        try:
            return TargetType[key]
        except KeyError:
            # pass the key as a lazy %-argument so the message is only built when emitted
            logger.error("Target type %s not found. Using TargetType.unknown.", key)
            return TargetType.unknown

    @classmethod
    def protein_types(cls) -> Set[TargetType]:
        """
        Returns the members for which ``is_protein`` is true.
        """
        return {s for s in cls if s.is_protein}

    @classmethod
    def all_types(cls) -> Set[TargetType]:
        """
        Returns every member of this enum as a set.
        """
        return set(TargetType)  # here for symmetry

    @property
    def is_traversable(self) -> bool:
        """
        Whether this is one of the four protein types or ``selectivity_group``.
        """
        return self in {
            TargetType.single_protein,
            TargetType.protein_family,
            TargetType.protein_complex,
            TargetType.protein_complex_group,
            TargetType.selectivity_group,
        }

    @property
    def is_protein(self) -> bool:
        """
        Whether this is a single protein, protein family, protein complex, or protein complex group.
        """
        return self in {
            TargetType.single_protein,
            TargetType.protein_family,
            TargetType.protein_complex,
            TargetType.protein_complex_group,
        }

    @property
    def is_unknown(self) -> bool:
        """
        Whether this is ``TargetType.unknown``.
        """
        return self == TargetType.unknown
84
85
86
class TargetRelationshipType(enum.Enum):
    """
    Enum of the relationship kinds that can link one target to another.
    """

    subset_of = enum.auto()
    superset_of = enum.auto()
    overlaps_with = enum.auto()
    equivalent_to = enum.auto()

    @classmethod
    def of(cls, s: str) -> TargetRelationshipType:
        """
        Looks up a member from a relationship string such as ``"SUBSET OF"``.

        Spaces and hyphens are treated as underscores and case is ignored.

        Args:
            s: The relationship string

        Returns:
            The member with the normalized name

        Raises:
            KeyError: If no member has the normalized name
        """
        normalized = s
        for separator in (" ", "-"):
            normalized = normalized.replace(separator, "_")
        normalized = normalized.lower()
        return TargetRelationshipType[normalized]
95
96
97
@dataclass(frozen=True, order=True, repr=True)
class DagTargetLinkType:
    """
    A kind of link between two targets that a traversal is permitted to follow.

    Attributes:
        source_type: Type of the target the link starts from
        rel_type: Relationship between the two targets
        dest_type: Type of the target the link leads to
        words: If not None, the linked target's name must contain at least one of these
               as a whole word (see ``matches``).
               NOTE(review): instances are stored in sets elsewhere in this module, and the
               generated hash covers this field, so a hashable collection such as a
               ``frozenset`` is presumably expected here -- confirm against callers.
    """

    source_type: TargetType
    rel_type: TargetRelationshipType
    dest_type: TargetType
    words: Optional[Set[str]]

    @classmethod
    def cross(
        cls,
        source_types: Set[TargetType],
        rel_types: Set[TargetRelationshipType],
        dest_types: Set[TargetType],
    ) -> Set[DagTargetLinkType]:
        """
        Builds every combination of the given source types, relationships, and destination types.

        Args:
            source_types: Allowed types for the starting target
            rel_types: Allowed relationships
            dest_types: Allowed types for the linked target

        Returns:
            One link type per (source, relationship, destination) combination, each with
            ``words`` set to None
        """
        return {
            DagTargetLinkType(source, rel, dest, None)
            for source in source_types
            for rel in rel_types
            for dest in dest_types
        }

    def matches(
        self,
        source: TargetType,
        rel: TargetRelationshipType,
        target: TargetType,
        target_name: Optional[str],
    ) -> bool:
        """
        Checks whether a concrete link is of this link type.

        Args:
            source: Type of the target the link starts from
            rel: Relationship of the link
            target: Type of the linked target
            target_name: Name of the linked target, if it has one

        Returns:
            True if the three types equal this instance's, and either ``words`` is None or
            ``target_name`` contains one of ``words`` after splitting on runs of spaces,
            hyphens, and underscores (exact, case-sensitive comparison)
        """
        types_match = (
            self.source_type == source and self.rel_type == rel and self.dest_type == target
        )
        if not types_match:
            return False
        if self.words is None:
            return True
        if target_name is None:
            # a nameless target cannot contain any of the required words
            return False
        # split once, rather than once per candidate word
        name_words = set(re.split(r"[ \-_]+", target_name))
        return any(choice in name_words for choice in self.words)
139
140
141
@dataclass(frozen=True, order=True, repr=True)
class DagTarget:
    """
    A target visited during a traversal of the DAG, along with how it was reached.

    Attributes:
        depth: Number of links followed from the starting target (0 for the start itself)
        is_end: Whether no permitted link led onward from this target.
                ``None`` is used as a placeholder while a traversal is still in progress.
        target: The target itself
        link_type: The link type followed to reach this target; None for the starting target
    """

    depth: int
    is_end: Optional[bool]
    target: ChemblTarget
    link_type: Optional[DagTargetLinkType]
147
148
149
@dataclass(frozen=True, order=True, repr=True)
class ChemblTarget(metaclass=abc.ABCMeta):
    """
    A target from ChEMBL, from the ``target`` table.
    ChEMBL targets form a DAG via the ``target_relation`` table using links of type
    "SUPERSET OF" and "SUBSET OF".
    (There are additional link types ("OVERLAPS WITH", for ex), which we are ignoring.)
    For some receptors the DAG happens to be a tree. This is not true in general.
    See the GABAA receptor, for example.
    To fetch a target, use the ``find`` factory method.

    Attributes:
        chembl: The CHEMBL ID, starting with 'CHEMBL'
        name: The preferred name (``pref_target_name``)
        type: From the ``target_type`` ChEMBL field
    """

    chembl: str
    name: Optional[str]
    type: TargetType

    @classmethod
    def api(cls) -> ChemblApi:
        """
        Returns the ChEMBL API that ``find`` and ``links`` query.

        Subclasses are expected to override this; the base implementation always raises.

        Returns:
            The API to query

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError()

    @classmethod
    def find(cls, chembl: str) -> ChemblTarget:
        """
        Fetches a single target by its CHEMBL ID.

        Args:
            chembl: The CHEMBL ID of the target

        Returns:
            An instance of this class built from the ``target_chembl_id``, ``pref_name``,
            and ``target_type`` fields of the matching record

        Raises:
            TargetNotFoundError: If the query fails with a ``MaxRetryError``
            AssertionError: If the query does not return exactly one record
        """
        try:
            targets = cls.api().target.filter(target_chembl_id=chembl)
        except MaxRetryError as err:
            # keep the underlying failure attached as the cause
            raise TargetNotFoundError(f"Failed to find target {chembl}") from err
        assert len(targets) == 1, f"Found {len(targets)} targets for {chembl}"
        target = NestedDotDict(targets[0])
        return cls(
            chembl=target["target_chembl_id"],
            name=target.get("pref_name"),
            type=TargetType.of(target["target_type"]),
        )

    def links(
        self, rel_types: Set[TargetRelationshipType]
    ) -> Sequence[Tup[ChemblTarget, TargetRelationshipType]]:
        """
        Gets adjacent targets in the DAG.

        Args:
            rel_types: Only relationships of these types are followed

        Returns:
            ``(linked target, relationship type)`` pairs, sorted by the linked target's
            CHEMBL ID and then by relationship type
        """
        relations = self.__class__.api().target_relation.filter(target_chembl_id=self.chembl)
        links = []
        # "subset" means "up" (it's reversed from what's on the website)
        for relation in relations:
            linked_id = relation["related_target_chembl_id"]
            rel_type = TargetRelationshipType.of(relation["relationship"])
            if rel_type in rel_types:
                linked_target = self.find(linked_id)
                links.append((linked_target, rel_type))
        # Sort on an explicit key: comparing the raw tuples would fall through to comparing
        # enum members (which define no ordering) whenever the same target is linked by
        # more than one relationship type.
        return sorted(links, key=lambda link: (link[0].chembl, link[1].value))

    def traverse(self, permitting: Set[DagTargetLinkType]) -> Set[DagTarget]:
        """
        Traverses the DAG from this node, following only links that match a permitted link type.

        Args:
            permitting: The link types we're allowed to follow

        Returns:
            The visited targets, including this one, each wrapped in a ``DagTarget``.
            ``depth`` is 0 for this target and grows by 1 with every link followed;
            ``is_end`` is True for targets from which no permitted link was found.
        """
        results = set()
        # purposely use the invalid value None for is_end; _traverse fills in the real value
        self._traverse(DagTarget(0, None, self, None), permitting, results)
        assert not any((x.is_end is None for x in results))
        return results

    @classmethod
    def _traverse(
        cls, source: DagTarget, permitting: Set[DagTargetLinkType], results: Set[DagTarget]
    ) -> None:
        """
        Recursive helper for ``traverse``; adds ``source`` and everything reachable to ``results``.

        For each link to another target, the permitted link types are tried in turn, and the
        first one that matches is recorded for the linked target.

        Args:
            source: The node to visit; its ``is_end`` is ignored and recomputed
            permitting: The link types we're allowed to follow
            results: The targets visited so far; modified in place
        """
        # all good if we've already traversed this
        if source.target.chembl in {s.target.chembl for s in results}:
            return
        # find all links from ChEMBL, then filter to only the valid links
        # do not traverse yet -- we just want to find these links
        link_candidates = source.target.links({q.rel_type for q in permitting})
        links = []
        for linked_target, rel_type in link_candidates:
            # try out all of the link types that could match
            # getting to the linked_target by way of any of them is fine
            # although the DagTarget takes the link_type, we'll just go ahead and break
            # if we find one acceptable link
            # the links are already sorted, so that should be fine
            # (otherwise, we just end up with redundant targets)
            for permitted in permitting:
                if permitted.matches(
                    source.target.type, rel_type, linked_target.type, linked_target.name
                ):
                    link_type = DagTargetLinkType(
                        source.target.type, rel_type, linked_target.type, permitted.words
                    )
                    # purposely use the invalid value None for is_end
                    linked = DagTarget(source.depth + 1, None, linked_target, link_type)
                    links.append(linked)
                    break
        # now, we'll add our own node, before descending into the links
        # we know whether we're at an "end" node by whether we found any links
        # note that this is an invariant of the node (and permitted link types):
        # it doesn't depend on traversal order
        is_at_end = not links
        results.add(DagTarget(source.depth, is_at_end, source.target, source.link_type))
        # alright! now traverse on the links
        for link in links:
            # guard against cycles such as superset --- subset --- superset ---
            # or just --- overlaps with --- overlaps with ---
            # (the CHEMBL ID check at the top of this method also stops revisits)
            if link not in results:
                cls._traverse(link, permitting, results)
284
285
286
class TargetFactory:
    """
    Factory for ``Target`` that injects a ``ChemblApi``.
    """

    @classmethod
    def find(cls, chembl: str, api: ChemblApi) -> ChemblTarget:
        """
        Fetches a target through the supplied API.

        A new ``ChemblTarget`` subclass is declared on every call; its ``api`` classmethod
        returns the ``api`` argument, and its ``__name__`` is set to ``"Target:"`` followed
        by the CHEMBL ID.

        Args:
            chembl: The CHEMBL ID of the target to fetch
            api: The API that the new subclass will query

        Returns:
            A ``Target`` instance from a newly created subclass of that class
        """
        injected_api = api

        @dataclass(frozen=True, order=True, repr=True)
        class _Target(ChemblTarget):
            @classmethod
            def api(cls) -> ChemblApi:
                return injected_api

        subclass_name = "Target:" + chembl
        _Target.__name__ = subclass_name
        return _Target.find(chembl)
311
312
313
# Names exported by ``from ... import *``.
__all__ = [
    "TargetType",
    "TargetRelationshipType",
    "ChemblTarget",
    "DagTarget",
    "TargetFactory",
    "DagTargetLinkType",
    "TargetNotFoundError",
]
322