pocketutils.biochem.uniprot_go.FlatGoTerm.parse() - Code Metrics - Inspection of "fix: broaden vr ranges" - dmyersturnbull/pocketutils - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( 6e4731...702ebc )

by Douglas

created 2022-08-11 21:03 UTC

pocketutils.biochem.uniprot_go.FlatGoTerm.parse() A

↳ Parent: pocketutils.biochem.uniprot_go

Complexity

Conditions

Size

Total Lines	20
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	14
nop	2
dl	0
loc	20
rs	9.7
c	0
b	0
f	0

import dataclasses

import logging
import os
from dataclasses import dataclass
from typing import Collection, Iterable, List, Mapping, Optional, Union
from urllib import request

import pandas as pd

import regex

import uniprot


# uses https://github.com/tanghaibao/goatools
from goatools import obo_parser


# NOT the same as FlatGoTerm, which has no knowledge of hierarchy
from goatools.obo_parser import GOTerm


from pocketutils.core.exceptions import MultipleMatchesError, StringPatternError


# noinspection PyProtectedMember
from pocketutils.core.input_output import silenced


# Pattern for one GO annotation string. The five capture groups are consumed
# in order by FlatGoTerm.parse: numeric GO id, kind (one of C/F/P),
# description, source ID, and source name. The pattern requires a trailing ".".
go_pattern = regex.compile(
    r"GO:(\d+); ([CFP]):([\dA-Za-z- ,()]+); ([A-Z]+):([A-Za-z-_]+)\.", flags=regex.V1
)
# URL the Gene Ontology OBO file is downloaded from when no local copy exists.
GO_OBO_URL = "http://current.geneontology.org/ontology/go.obo"  # nosec
# Bare file name, so the OBO file is looked up relative to the working directory.
GO_OBO_FILENAME = "go.obo"
# Shared package-level logger (named "pocketutils", not after this module).
logger = logging.getLogger("pocketutils")


@dataclass(frozen=True, repr=True)
class FlatGoTerm:
    """
    A Gene Ontology term.
    Not to be confused with GOTerm in goatools: obo_parser.GOTerm

    Attributes:
        - identifier: (str); ex: GO:0005737
        - kind: (str: 'P'==process, 'C'==component, 'F'==function)
        - description: (str)
        - source_id: (str); ex: IDA
        - source_name: (str); ex: UniProtKB
    """

    identifier: str
    kind: str
    description: str
    source_id: str
    source_name: str

    @classmethod
    def parse(cls, stwing: str) -> "FlatGoTerm":
        """
        Builds a GO term from a string from uniprot_obj['go'].

        Args:
            stwing: The annotation text; it is searched (not full-matched)
                    against the module-level ``go_pattern``

        Returns:
            A new instance of ``cls`` (so subclasses get their own type)

        Raises:
            StringPatternError: if the syntax is wrong.
        """
        match = go_pattern.search(stwing)
        if match is None:
            raise StringPatternError(
                f"String didn't match GO term pattern: {stwing}",
                value=stwing,
                pattern=go_pattern,
            )
        # The pattern has exactly five groups, in field order; only the
        # identifier needs its "GO:" prefix restored.
        go_number, kind, description, source_id, source_name = match.groups()
        return cls("GO:" + go_number, kind, description, source_id, source_name)

    def to_series(self) -> pd.Series:
        """
        Converts this term to a Pandas Series.

        Returns:
            A Series indexed by the dataclass field names
            (identifier, kind, description, source_id, source_name)
        """
        return pd.Series(dataclasses.asdict(self))


class UniprotGoTerms:
    """
    Fetches UniProt metadata and the GO terms listed under its ``"go"`` key.
    """

    def fetch_uniprot_data(self, uniprot_ids: Union[str, List[str]]) -> List[Mapping[str, str]]:
        """
        Fetches a list of dicts of UniProt metadata, one per UniProt ID.

        Args:
            uniprot_ids: A single UniProt ID, or a list of distinct IDs

        Returns:
            The values of the mapping returned by ``uniprot.fetch_uniprot_metadata``

        Raises:
            MultipleMatchesError: If ``uniprot_ids`` contains duplicates.
            LookupError: If a UniProt ID wasn't found.
        """
        if isinstance(uniprot_ids, str):  # not a list type
            uniprot_ids = [uniprot_ids]
        # if we don't prevent these here, we'll get a LookupError from below, which is confusing
        # That's because uniprot.fetch_uniprot_metadata will only return one per unique ID
        if len(set(uniprot_ids)) != len(uniprot_ids):
            raise MultipleMatchesError("Set of UniProt IDs cannot contain duplicates")
        with silenced(no_stderr=False):
            uniprot_data = uniprot.fetch_uniprot_metadata(uniprot_ids)
        # ``not uniprot_data`` covers both None and an empty mapping
        if not uniprot_data or len(uniprot_data) != len(uniprot_ids):
            raise LookupError(f"At least one UniProt ID not found in {uniprot_ids}")
        return list(uniprot_data.values())

    def go_terms_for_uniprot_id(self, uniprot_id: str) -> List[FlatGoTerm]:
        """
        Returns a list of FlatGoTerm objects from a UniProt ID.

        Raises:
            StringPatternError: If one of the GO strings cannot be parsed.
            LookupError: If the UniProt ID wasn't found.
        """
        term_strings = (self.fetch_uniprot_data(uniprot_id)[0])["go"]
        # FlatGoTerm takes five fields; raw strings must go through parse()
        return [FlatGoTerm.parse(s) for s in term_strings]

    def go_terms_for_uniprot_id_as_df(self, uniprot_id: str) -> pd.DataFrame:
        """
        Returns a Pandas DataFrame of GO terms from a UniProt ID.

        Returns:
            A DataFrame indexed by ``ID`` with columns
            ``kind``, ``description``, ``sourceId``, and ``sourceName``
        """
        # Build positional rows so the snake_case dataclass fields land in the
        # differently named columns instead of relying on label alignment.
        rows = [
            (term.identifier, term.kind, term.description, term.source_id, term.source_name)
            for term in self.go_terms_for_uniprot_id(uniprot_id)
        ]
        df = pd.DataFrame(rows, columns=["ID", "kind", "description", "sourceId", "sourceName"])
        return df.set_index("ID")


class GoTermsAtLevel:
    """
    Gene ontology terms organized by level.

    Example:
        .. code-block::

            GoTermsAtLevel().go_term_ancestors_for_uniprot_id_as_df('P42681', 2)
    """

    def __init__(self) -> None:
        """
        Loads the GO DAG from ``GO_OBO_FILENAME``, downloading it first if the
        file does not exist in the working directory.
        """
        if not os.path.exists(GO_OBO_FILENAME):
            logger.info("Downloading Gene Ontology OBO...")
            # Save under the name GODag reads below; without a target name
            # urlretrieve would write to a temporary file instead.
            request.urlretrieve(GO_OBO_URL, GO_OBO_FILENAME)  # nosec
            logger.info("Done downloading OBO.")
        # This will be used in query_obo_term
        self.obo = obo_parser.GODag(GO_OBO_FILENAME)
        self.substruct = UniprotGoTerms()

    def query_obo_term(self, term_id: str) -> GOTerm:
        """
        Queries a term through the loaded obo.
        This function wraps the call to raise a LookupError if the term is not found.

        Args:
            term_id: The term, in the form 'GO:0007344'

        Raises:
            LookupError: If ``query_term`` returns None for ``term_id``.
        """
        found = self.obo.query_term(term_id)
        if found is None:
            raise LookupError(f"Term ID {term_id} not found")
        return found

    def get_ancestors_of_go_term(self, term_id: str, level: int) -> Iterable[GOTerm]:
        """
        From a GO term in the form 'GO:0007344', returns a set of ancestor GOTerm
        objects at the specified level.

        The starting term itself is included if it is at that level.
        The traversal follows each term's ``parents``.
        Note that the level is the minimum number of steps to the root.

        Args:
            term_id: The term
            level: starting at 0 (root)

        Raises:
            LookupError: If ``term_id`` is not found.
        """
        matches = set()
        visited = set()
        pending = [self.query_obo_term(term_id)]
        # Iterative walk with a visited set: ancestors shared by several
        # paths are expanded once rather than once per path.
        while pending:
            term = pending.pop()
            if term in visited:
                continue
            visited.add(term)
            if term.level == level:
                matches.add(term)
            pending.extend(term.parents)
        return matches

    def go_term_ancestors_for_uniprot_id(
        self, uniprot_id: str, level: int, kinds_allowed: Optional[Collection[str]] = None
    ) -> Iterable[GOTerm]:
        """
        Gets the GO terms associated with a UniProt ID and returns a set of their
        ancestors at the specified level.

        Note that the level is the minimum number of steps to the root.

        Args:
            uniprot_id: ID
            level: starting at 0 (root)
            kinds_allowed: a set containing any combination of 'P', 'F', or 'C';
                           None means all three

        Returns:
            A set of GOTerm objects, or an empty list if ``kinds_allowed`` is empty
        """
        if kinds_allowed is None:
            kinds_allowed = ["P", "F", "C"]
        if len(kinds_allowed) == 0:
            return []
        ancestor_terms = set()
        for term in self.substruct.go_terms_for_uniprot_id(uniprot_id):
            if term.kind in kinds_allowed:
                ancestor_terms.update(self.get_ancestors_of_go_term(term.identifier, level))
        return ancestor_terms

    def go_term_ancestors_for_uniprot_id_as_df(
        self, uniprot_id: str, level: int, kinds_allowed: Optional[Collection[str]] = None
    ) -> pd.DataFrame:
        """
        See go_term_ancestors_for_uniprot_id.

        Args:
            uniprot_id: ID
            level: Level
            kinds_allowed: Can include 'P', 'F', and/or 'C'

        Returns:
             Pandas DataFrame indexed by ID with a single column, name.
        """
        if kinds_allowed is None:
            kinds_allowed = ["P", "F", "C"]
        # Only ID and name are columns of the result, so only those are collected.
        rows = [
            (term.id, term.name)
            for term in self.go_term_ancestors_for_uniprot_id(uniprot_id, level, kinds_allowed)
        ]
        return pd.DataFrame(rows, columns=["ID", "name"]).set_index("ID")


# Names exported by ``from <this module> import *``.
__all__ = ["FlatGoTerm", "UniprotGoTerms", "GoTermsAtLevel"]


1			import dataclasses
			0 ignored issues – show introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Missing module docstring Loading history...
2			import logging
3			import os
4			from dataclasses import dataclass
5			from typing import Collection, Iterable, List, Mapping, Optional, Union
6			from urllib import request
7
8			import pandas as pd
			0 ignored issues – show introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
9			import regex
			0 ignored issues – show introduced 2021-09-09 00:35 UTC by Report Bug Copy Issue Report Unable to import 'regex' Loading history...
10			import uniprot
			0 ignored issues – show introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Unable to import 'uniprot' Loading history...
11
12			# uses https://github.com/tanghaibao/goatools
13			from goatools import obo_parser
			0 ignored issues – show introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Unable to import 'goatools' Loading history...
14
15			# NOT the same as FlatGoTerm, which has no knowledge of hierarchy
16			from goatools.obo_parser import GOTerm
			0 ignored issues – show introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Unable to import 'goatools.obo_parser' Loading history...
17
18			from pocketutils.core.exceptions import MultipleMatchesError, StringPatternError
			0 ignored issues – show Bug introduced 2021-10-28 02:10 UTC by Report Bug Copy Issue Report The name `core` does not seem to exist in module `pocketutils`. Loading history...
19
20			# noinspection PyProtectedMember
21			from pocketutils.core.input_output import silenced
			0 ignored issues – show Bug introduced 2021-10-28 02:10 UTC by Report Bug Copy Issue Report The name `core` does not seem to exist in module `pocketutils`. Loading history...
22
23			go_pattern = regex.compile(
24			r"GO:(\d+); ([CFP]):([\dA-Za-z- ,()]+); ([A-Z]+):([A-Za-z-_]+)\.", flags=regex.V1
25			)
26			GO_OBO_URL = "http://current.geneontology.org/ontology/go.obo" # nosec
27			GO_OBO_FILENAME = "go.obo"
28			logger = logging.getLogger("pocketutils")
29
30
31			@dataclass(frozen=True, repr=True)
32			class FlatGoTerm:
33			"""
34			A Gene Ontology term.
35			Not to be confused with GOTerm in goatools: obo_parser.GOTerm
36
37			Attributes:
38			- identifier: (str); ex: GO:0005737
39			- kind: (str: 'P'==process, 'C'==component, 'F'==function)
40			- description: (str)
41			- sourceId: (str); ex: IDA
42			- sourceName: (str); ex: UniProtKB
43			"""
44
45			identifier: str
46			kind: str
47			description: str
48			source_id: str
49			source_name: str
50
51			@classmethod
52			def parse(cls, stwing: str):
53			"""
54			Builds a GO term from a string from uniprot_obj['go'].
55			Raises:
56			ValueError: if the syntax is wrong.
57			"""
58			match = go_pattern.search(stwing)
59			if match is None:
60			raise StringPatternError(
61			f"String didn't match GO term pattern: {stwing}",
62			value=stwing,
63			pattern=go_pattern,
64			)
65			return FlatGoTerm(
66			"GO:" + match.group(1),
67			match.group(2),
68			match.group(3),
69			match.group(4),
70			match.group(5),
71			)
72
73			def to_series(self) -> pd.Series:
			0 ignored issues – show introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
74			return pd.Series(dataclasses.asdict(self))
75
76
77			class UniprotGoTerms:
			0 ignored issues – show introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
78			def fetch_uniprot_data(self, uniprot_ids: Union[str, List[str]]) -> List[Mapping[str, str]]:
			0 ignored issues – show Coding Style introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report This method could be written as a function/class method. If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example class Foo: def some_method(self, x, y): return x + y; could be written as class Foo: @classmethod def some_method(cls, x, y): return x + y; Loading history...
79			"""
80			Fetches a list of dicts of UniProt metadata, one per UniProt ID.
81
82			Raises:
83			ValueError: If a UniProt ID wasn't found.
84			"""
85			if isinstance(uniprot_ids, str): # not a list type
86			uniprot_ids = [uniprot_ids]
87			# if we don't prevent these here, we'll get a ValueError from below, which is confusing
88			# That's because uniprot.fetch_uniprot_metadata will only return one per unique ID
89			if len(set(uniprot_ids)) != len(uniprot_ids):
90			raise MultipleMatchesError("Set of UniProt IDs cannot contain duplicates")
91			with silenced(no_stderr=False):
92			uniprot_data = uniprot.fetch_uniprot_metadata(uniprot_ids)
93			if uniprot_data is None or uniprot_data == {} or len(uniprot_data) != len(uniprot_ids):
94			raise LookupError(f"At least one UniProt ID not found in {uniprot_ids}")
95			return list(uniprot_data.values())
96
97			def go_terms_for_uniprot_id(self, uniprot_id: str) -> List[FlatGoTerm]:
98			"""Returns a list of FlatGoTerm objects from a UniProt ID."""
99			term_strings = (self.fetch_uniprot_data(uniprot_id)[0])["go"]
100			return [FlatGoTerm(s) for s in term_strings]
101
102			def go_terms_for_uniprot_id_as_df(self, uniprot_id: str) -> pd.DataFrame:
103			"""Returns a Pandas DataFrame of GO terms from a UniProt ID."""
104			df = pd.DataFrame(columns=["ID", "kind", "description", "sourceId", "sourceName"])
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
105			for term in self.go_terms_for_uniprot_id(uniprot_id):
106			df.loc[len(df)] = term.to_series()
			0 ignored issues – show Comprehensibility Best Practice introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report The variable `len` does not seem to be defined. Loading history...
107			return df.set_index("ID")
108
109
110			class GoTermsAtLevel:
111			"""
112			Gene ontology terms organized by level.
113
114			Example:
115			.. code-block::
116
117			go_term_ancestors_for_uniprot_id_as_df('P42681', 2)
118			"""
119
120			def __init__(self) -> None:
121			if os.path.exists(GO_OBO_FILENAME):
122			self.obo = obo_parser.GODag(GO_OBO_FILENAME)
123			else:
124			logger.info("Downloading Gene Ontology OBO...")
125			request.urlretrieve(GO_OBO_URL) # nosec
126			# This will be used in query_obo_term
127			self.obo = obo_parser.GODag(GO_OBO_FILENAME)
128			logger.info("Done downloading OBO.")
129			self.substruct = UniprotGoTerms()
130
131			def query_obo_term(self, term_id: str) -> GOTerm:
132			"""
133			Queries a term through the global obo.
134			This function wraps the call to raise a ValueError if the term is not found;
135			otherwise it only logs a warning.
136			"""
137			x = self.obo.query_term(term_id)
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Variable name "x" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
138			if x is None:
139			raise LookupError(f"Term ID {x} not found")
140			return x
141
142			def get_ancestors_of_go_term(self, term_id: str, level: int) -> Iterable[GOTerm]:
143			"""
144			From a GO term in the form 'GO:0007344', returns a set of ancestor GOTerm objects at the specified level.
			0 ignored issues – show Coding Style introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (113/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
145			The traversal is restricted to is-a relationships.
146			Note that the level is the minimum number of steps to the root.
147
148			Args:
149			term_id: The term
150			level: starting at 0 (root)
151			"""
152
153			def traverse_up(term, buildup_set, lvl):
154			if term.level == lvl:
155			buildup_set.add(term)
156			if term.has_parent:
157			return [traverse_up(p, buildup_set, lvl) for p in term.parents]
158			return None
159
160			terms = set()
161			traverse_up(self.query_obo_term(term_id), terms, level)
162			return terms
163
164			def go_term_ancestors_for_uniprot_id(
165			self, uniprot_id: str, level: int, kinds_allowed: Optional[Collection[str]] = None
			0 ignored issues – show Coding Style introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
166			) -> Iterable[GOTerm]:
167			"""
168			Gets the GO terms associated with a UniProt ID and returns a set of their ancestors at the specified level.
			0 ignored issues – show Coding Style introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (115/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
169			The traversal is restricted to is-a relationships.
170			Note that the level is the minimum number of steps to the root.
171
172			Args:
173			level: starting at 0 (root)
174			uniprot_id: ID
175			kinds_allowed: a set containing any combination of 'P', 'F', or 'C'
176			"""
177			if kinds_allowed is None:
178			kinds_allowed = ["P", "F", "C"]
179			if len(kinds_allowed) == 0:
180			return []
181			terms = [
182			term
183			for term in self.substruct.go_terms_for_uniprot_id(uniprot_id)
184			if term.kind in kinds_allowed
185			]
186			ancestor_terms = set()
187			for term_id in [t.identifier for t in terms]:
188			ancestor_terms.update(self.get_ancestors_of_go_term(term_id, level))
189			return ancestor_terms
190
191			def go_term_ancestors_for_uniprot_id_as_df(
192			self, uniprot_id: str, level: int, kinds_allowed: Optional[Collection[str]] = None
			0 ignored issues – show Coding Style introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
193			) -> pd.DataFrame:
194			"""
195			See go_term_ancestors_for_uniprot_id.
196
197			Args:
198			uniprot_id: ID
199			level: Level
200			kinds_allowed: Can include 'P', 'F', and/or 'C'
201
202			Returns:
203			Pandas DataFrame with columns IDand name.
204			"""
205			if kinds_allowed is None:
206			kinds_allowed = ["P", "F", "C"]
207			df = pd.DataFrame(columns=["ID", "name"])
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
208			for term in self.go_term_ancestors_for_uniprot_id(uniprot_id, level, kinds_allowed):
209			df.loc[len(df)] = pd.Series({"ID": term.id, "name": term.name, "level": term.level})
			0 ignored issues – show Comprehensibility Best Practice introduced 2021-01-25 05:25 UTC by Report Bug Copy Issue Report The variable `len` does not seem to be defined. Loading history...
210			return df.set_index("ID")
211
212
213			__all__ = ["FlatGoTerm", "UniprotGoTerms", "GoTermsAtLevel"]
214

dmyersturnbull / pocketutils

Push — main ( 6e4731...702ebc )

pocketutils.biochem.uniprot_go.FlatGoTerm.parse() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like