mandos.model.hits.HitUtils.hits_to_df() - Code Metrics - Inspection of "build(deps-dev): bump sphinx-copybutton from 0.3.3..." - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — dependabot/pip/sphinx-copybutt... ( c72176 )

unknown

created 2021-07-05 03:05 UTC

mandos.model.hits.HitUtils.hits_to_df() A

↳ Parent: mandos.model.hits

Complexity

Conditions

Size

Total Lines	9
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	9
nop	2
dl	0
loc	9
rs	9.95
c	0
b	0
f	0

import dataclasses

import html
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Sequence

import pandas as pd

from typeddfs import TypedDfs


from mandos.model import ReflectionUtils

# Immutable set of Python types; judging by the name, the types allowed for hit fields.
# NOTE(review): nothing in the visible code checks against this set -- confirm where it is used.
HIT_FIELD_TYPE = frozenset({str, int, float, datetime})


@dataclass(order=True, frozen=True)
class Pair:
    """
    A (predicate, object) pair.

    Instances are immutable and compare/sort field by field (``pred``, then ``obj``).
    """

    pred: str
    obj: str


@dataclass(order=True, frozen=True)
class Triple:
    """
    A (subject, predicate, object) statement; the subject is usually a compound.

    Instances are immutable and compare/sort field by field.
    """

    sub: str
    pred: str
    obj: str

    @property
    def n_triples(self) -> str:
        """
        Returns a simple one-line text statement in n-triples format.

        The line consists of the three terms in double quotes, separated by spaces and
        followed by `` .``. The predicate and object are HTML-escaped (quotes included);
        the subject is inserted verbatim.
        """
        pred, obj = (html.escape(term, quote=True) for term in (self.pred, self.obj))
        return f'"{self.sub}" "{pred}" "{obj}" .'


@dataclass(frozen=True, order=True, repr=True)
class AbstractHit:
    """
    An abstract annotation (statement type), which may support additional fields.

    Instances are immutable. Equality and ordering are the dataclass-generated ones
    (field by field, in declaration order), while hashing uses ``record_id`` only.
    """

    record_id: Optional[str]
    origin_inchikey: str
    matched_inchikey: str
    compound_id: str
    compound_name: str
    predicate: str
    statement: str
    object_id: str
    object_name: str
    value: float
    search_key: str
    search_class: str
    data_source: str
    run_date: datetime
    cache_date: Optional[datetime]
    # is_hit: Optional[bool] = None
    # score: Optional[float] = None
    # x_score_1: Optional[float] = None
    # x_score_2: Optional[float] = None

    @property
    def hit_class(self) -> str:
        """
        Returns the name of the concrete class of this hit.
        """
        return self.__class__.__name__

    @property
    def to_triple(self) -> "Triple":
        """
        Returns a ``Triple`` of ``origin_inchikey``, ``predicate``, and ``object_name``.
        """
        return Triple(sub=self.origin_inchikey, pred=self.predicate, obj=self.object_name)

    @property
    def to_pair(self) -> "Pair":
        """
        Returns a ``Pair`` of ``predicate`` and ``object_name``.
        """
        return Pair(pred=self.predicate, obj=self.object_name)

    def __hash__(self):
        # Hash on record_id alone. This stays consistent with the generated __eq__:
        # hits that compare equal necessarily share the same record_id.
        return hash(self.record_id)

    @property
    def universal_id(self) -> str:
        """
        Gets an identifier (a hex key) that identifies the record by its unique attributes.
        Does **NOT** distinguish between hits with duplicate information and does **NOT**
        include ``record_id``.

        The key is a truncated SHA-256 digest over the names and values of the included
        fields, taken in declaration order, so the same hit yields the same key in every
        Python process. (The builtin ``hash`` is salted per process for strings, and
        iterating over a ``set`` of field names has no fixed order, so neither is suitable
        for an identifier that gets written to a file.)

        Returns:
            A 16-character hexadecimal string
        """
        import hashlib  # local import keeps this class independent of the module header

        # excluding record_id only because it's not available for some hit types
        # we'd rather immediately see duplicates if they exist
        excluded = {"record_id", "origin_inchikey", "compound_name", "search_key", "search_class"}
        keyed = tuple(
            (name, getattr(self, name)) for name in self.fields() if name not in excluded
        )
        # repr() is deterministic for the str, float, and datetime values stored here
        digest = hashlib.sha256(repr(keyed).encode("utf-8")).hexdigest()
        return digest[:16]

    @classmethod
    def fields(cls) -> Sequence[str]:
        """
        Finds the list of fields in this class by reflection.

        Returns:
            The dataclass field names of ``cls``, in declaration order
        """
        return [f.name for f in dataclasses.fields(cls)]


# Data frame type for tabulated hits, assembled with the typeddfs builder.
# ``require`` and ``reserve`` each declare column names with a dtype (presumably mandatory
# vs. optional columns -- confirm against the typeddfs documentation).
# NOTE(review): ``inchikey`` is declared here, but ``AbstractHit`` has ``origin_inchikey`` and
# ``matched_inchikey`` instead, and ``HitUtils.hits_to_df`` also writes ``value`` and
# ``universal_id`` columns that are not declared -- confirm the intended schema.
# NOTE(review): ``cache_date`` and ``run_date`` are declared as ``str`` while the dataclass
# fields are ``datetime`` -- confirm whether a conversion is expected.
HitFrame = (
    TypedDfs.typed("HitFrame")
    .require("record_id", dtype=str)
    .require("inchikey", "compound_id", "compound_name", dtype=str)
    .require("predicate", "statement", dtype=str)
    .require("object_id", "object_name", dtype=str)
    .require("search_key", "search_class", "data_source", dtype=str)
    .require("hit_class", dtype=str)
    .require("cache_date", "run_date", dtype=str)
    .reserve("is_hit", dtype=bool)
    .reserve("score", *[f"x_score_{i}" for i in range(1, 10)], dtype=float)
).build()


class HitUtils:
    """
    Conversions between sequences of hits and ``HitFrame`` data frames.
    """

    @classmethod
    def hits_to_df(cls, hits: Sequence[AbstractHit]) -> HitFrame:
        """
        Builds a ``HitFrame`` with one row per hit.

        Each row holds every dataclass field of the hit's own class, plus
        ``universal_id`` and ``hit_class`` columns.

        Args:
            hits: The hits to tabulate

        Returns:
            A ``HitFrame`` constructed from one ``pd.Series`` per hit
        """
        rows = []
        for hit in hits:
            row = {f: getattr(hit, f) for f in hit.__class__.fields()}
            row["universal_id"] = hit.universal_id
            row["hit_class"] = hit.hit_class
            rows.append(pd.Series(row))
        return HitFrame(rows)

    @classmethod
    def df_to_hits(cls, self: HitFrame) -> Sequence[AbstractHit]:
        """
        Rebuilds hit objects from the rows of a ``HitFrame``.

        The class for each row is looked up from its ``hit_class`` value with
        ``ReflectionUtils.injection``; only columns that are fields of that class are
        passed to its constructor.

        Args:
            self: The data frame to convert (parameter name kept for backward compatibility)

        Returns:
            One hit per row, in row order

        Raises:
            TypeError: If a column needed by the hit class is missing (from its ``__init__``)
        """
        columns = set(self.columns)
        hits = []
        # iterrows() yields (index, row) pairs; only the row is needed
        for _, row in self.iterrows():
            # expected to resolve to AbstractHit or a subclass, so ``fields()`` is available
            clazz = ReflectionUtils.injection(row["hit_class"], AbstractHit)
            # ignore extra columns (such as universal_id and hit_class, which are not fields)
            # if cols are missing, let it fail on clazz.__init__
            data = {f: row[f] for f in clazz.fields() if f in columns}
            # noinspection PyArgumentList
            hits.append(clazz(**data))
        return hits


# Names exported by a star-import of this module.
__all__ = ["AbstractHit", "HitFrame", "Pair", "Triple", "HIT_FIELD_TYPE", "HitUtils"]


1			import dataclasses
			0 ignored issues – show introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Missing module docstring Loading history...
2			import html
3			from dataclasses import dataclass
4			from datetime import datetime
5			from typing import Optional, Sequence
6
7			import pandas as pd
			0 ignored issues – show introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
8			from typeddfs import TypedDfs
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
9
10			from mandos.model import ReflectionUtils
11
12			HIT_FIELD_TYPE = frozenset([str, int, float, datetime])
13
14
15			@dataclass(frozen=True, repr=True, order=True)
16			class Pair:
17			"""
18			Predicate, object pairs.
19			"""
20
21			pred: str
22			obj: str
23
24
25			@dataclass(frozen=True, repr=True, order=True)
26			class Triple:
27			"""
28			Usually compound, predicate, object.
29			"""
30
31			sub: str
32			pred: str
33			obj: str
34
35			@property
36			def n_triples(self) -> str:
37			"""
38			Returns a simple text statement in n-triples format.
39			"""
40			s = self.sub
			0 ignored issues – show Coding Style Naming introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Variable name "s" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
41			p = html.escape(self.pred, quote=True)
			0 ignored issues – show Coding Style Naming introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Variable name "p" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
42			o = html.escape(self.obj, quote=True)
			0 ignored issues – show Coding Style Naming introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Variable name "o" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
43			return f'"{s}" "{p}" "{o}" .'
44
45
46			@dataclass(frozen=True, order=True, repr=True)
			0 ignored issues – show best-practice introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Too many instance attributes (15/7) Loading history...
47			class AbstractHit:
48			"""
49			An abstract annotation (statement type), which may support additional fields.
50			"""
51
52			record_id: Optional[str]
53			origin_inchikey: str
54			matched_inchikey: str
55			compound_id: str
56			compound_name: str
57			predicate: str
58			statement: str
59			object_id: str
60			object_name: str
61			value: float
62			search_key: str
63			search_class: str
64			data_source: str
65			run_date: datetime
66			cache_date: Optional[datetime]
67			# is_hit: Optional[bool] = None
68			# score: Optional[float] = None
69			# x_score_1: Optional[float] = None
70			# x_score_2: Optional[float] = None
71
72			@property
73			def hit_class(self) -> str:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
74			return self.__class__.__name__
75
76			@property
77			def to_triple(self) -> Triple:
			0 ignored issues – show introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
78			return Triple(sub=self.origin_inchikey, pred=self.predicate, obj=self.object_name)
79
80			@property
81			def to_pair(self) -> Pair:
			0 ignored issues – show introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
82			return Pair(pred=self.predicate, obj=self.object_name)
83
84			def __hash__(self):
85			return hash(self.record_id)
86
87			@property
88			def universal_id(self) -> str:
89			"""
90			Gets an identifier (a hex key) that uniquely identifies the record by its unique attributes.
91			Does NOT distinguish between hits with duplicate information and does NOT
92			include ``record_id``.
93
94			Returns:
95			A 16-character hexadecimal string
96			"""
97			# excluding record_id only because it's not available for some hit types
98			# we'd rather immediately see duplicates if they exist
99			fields = {
100			field
101			for field in self.fields()
102			if field
103			not in {"record_id", "origin_inchikey", "compound_name", "search_key", "search_class"}
104			}
105			hexed = hex(hash(tuple([getattr(self, f) for f in fields])))
106			# remove negative signs -- still unique
107			return hexed.replace("-", "").replace("0x", "")
108
109			@classmethod
110			def fields(cls) -> Sequence[str]:
111			"""
112			Finds the list of fields in this class by reflection.
113			"""
114			return [f.name for f in dataclasses.fields(cls)]
115
116
117			HitFrame = (
118			TypedDfs.typed("HitFrame")
119			.require("record_id", dtype=str)
120			.require("inchikey", "compound_id", "compound_name", dtype=str)
121			.require("predicate", "statement", dtype=str)
122			.require("object_id", "object_name", dtype=str)
123			.require("search_key", "search_class", "data_source", dtype=str)
124			.require("hit_class", dtype=str)
125			.require("cache_date", "run_date", dtype=str)
126			.reserve("is_hit", dtype=bool)
127			.reserve("score", *[f"x_score_{i}" for i in range(1, 10)], dtype=float)
128			).build()
129
130
131			class HitUtils:
			0 ignored issues – show introduced 2021-07-05 03:20 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
132			@classmethod
133			def hits_to_df(cls, hits: Sequence[AbstractHit]) -> HitFrame:
			0 ignored issues – show introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
134			data = []
135			for hit in hits:
136			x = {f: getattr(hit, f) for f in hit.__class__.fields()}
			0 ignored issues – show Coding Style Naming introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Variable name "x" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
137			x["universal_id"] = hit.universal_id
138			x["hit_class"] = hit.hit_class
139			data.append(x)
140			return HitFrame([pd.Series(x) for x in data])
141
142			@classmethod
143			def df_to_hits(cls, self: HitFrame) -> Sequence[AbstractHit]:
			0 ignored issues – show introduced 2021-07-05 03:20 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
144			hits = []
145			for row in self.iterrows():
146			clazz = ReflectionUtils.injection(row.hit_class, AbstractHit)
147			# ignore extra columns
148			# if cols are missing, let it fail on clazz.__init__
149			data = {f: getattr(row, f) for f in self.columns if f in row.__dict__}
150			# noinspection PyArgumentList
151			hit = clazz(**data)
152			hits.append(hit)
153			return hits
154
155
156			__all__ = ["AbstractHit", "HitFrame", "Pair", "Triple", "HIT_FIELD_TYPE", "HitUtils"]
157

dmyersturnbull / mandos

Push — dependabot/pip/sphinx-copybutt... ( c72176 )

mandos.model.hits.HitUtils.hits_to_df() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like