DependencyUtils.simplify_tag() - Code Metrics - myedibleenso/py-processors - Measure and Improve Code Quality continuously with Scrutinizer

DependencyUtils.simplify_tag() F
last analyzed 2018-06-14 22:58 UTC

↳ Parent: DependencyUtils

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	9
dl	0
loc	40
rs	3
c	0
b	0
f	0

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from processors.utils import LabelManager
from collections import Counter
import networkx as nx
import collections
import re


class DependencyUtils(object):
    """
    A set of utilities for analyzing syntactic dependency graphs.

    Methods
    -------
    build_networkx_graph(roots, edges, name)
        Constructs a networkx.Graph

    shortest_path(g, start, end)
        Finds the shortest path in a `networkx.Graph` between any element in a list of start nodes and any element in a list of end nodes.

    retrieve_edges(dep_graph, path)
        Converts output of `shortest_path` into a list of triples that include the grammatical relation (and direction) for each node-node "hop" in the syntactic dependency graph.

    simplify_tag(tag)
        Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.

    lexicalize_path(sentence, path, words=False, lemmas=False, tags=False, simple_tags=False, entities=False, limit_to=None)
        Lexicalizes path in syntactic dependency graph using Odin-style token constraints.

    pagerank(networkx_graph, alpha=0.85, personalization=None, max_iter=1000, tol=1e-06, nstart=None, weight='weight', dangling=None)
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.
    """

    UNKNOWN = LabelManager.UNKNOWN

    @staticmethod
    def build_networkx_graph(roots, edges, name, reverse=False):
        """
        Converts a `processors` dependency graph into a networkx graph
        """
        G = nx.DiGraph()
        graph_name = name
        # store roots
        G.graph["roots"] = roots
        # reversing the graph is useful if you want to run pagerank to highlight predicate and argument nodes
        if reverse:
            edges = [(edge.destination, edge.source, {"relation": edge.relation}) for edge in edges]
        else:
            edges = [(edge.source, edge.destination, {"relation": edge.relation}) for edge in edges]
        G.add_edges_from(edges)
        return G

    @staticmethod
    def shortest_paths(g, start, end):
        """
        Find the shortest paths between two nodes.
        Note that if `g` is a directed graph, a path will not be found.

        Parameters
        ----------
        g : a networkx graph
            The networkx graph to explore.

        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        Returns
        -------
        None or [[(int, int)]]
            None if no paths are found.  Otherwise, a list of lists of (source index, target index) tuples representing path segments.
        """
        # converts single int to [int]
        start = start if isinstance(start, collections.Iterable) else [start]
        end = end if isinstance(end, collections.Iterable) else [end]
        # node list -> edges (i.e., (source, dest) pairs)
        def path_to_edges(g, path):
            return [(path[i], path[i+1]) for i in range(len(path) - 1)]

        shortest_paths = []
        # pathfinding b/w pairs of nodes
        for s in start:
            for e in end:
                try:
                    paths = nx.algorithms.all_shortest_paths(g, s, e)
                    for path in paths:
                        shortest_paths.append(path_to_edges(g, path))
                # no path found...
                except:
                    #print("No path found between '{}' and '{}'".format(s, e))
                    continue
        return None if len(shortest_paths) == 0 else shortest_paths

    @staticmethod
    def shortest_path(g, start, end, scoring_func=lambda path: -len(path)):
        """
        Find the shortest path between two nodes.
        Note that pathfinding is sensitive to direction.  If you want to ignore direction, convert your networkx.Digraph to a networkx.Graph.

        Parameters
        ----------
        g : a networkx graph
            The networkx graph to explore.

        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        scoring_func : function
            A function that scores each path in a list of paths.  Each path has the form [(source index, relation, destination index)].
            The path with the maximum score will be returned.

        Returns
        -------
        None or [(int, int)]
            None if no paths are found.  Otherwise, a list of (source index, target index) tuples representing path segments.
        """
        paths = DependencyUtils.shortest_paths(g, start, end)
        return None if len(shortest_paths) == 0 else max(paths, key=scoring_func)

    @staticmethod
    def directed_relation(source_idx, destination_idx, relation, deps):
        """
        Converts relation to a directed relation (incoming v. outgoing)
        if such a relation links `source_idx` and `destination_idx` in `deps`.

        Parameters
        ----------
        source_idx : int
            The token index for the source node

        destination_idx : int
            The token index for the destination node

        relation : str
            The undirected relation (i.e., the grammatical/semantic relation that connects the two nodes)

        deps : processors.ds.DirectedGraph
            The directed graph to be referenced

        Returns
        -------
        str or None
            The directed relation that connects the `source_idx` to the `destination_idx` in `deps`.
        """
        matches = [">{}".format(rel) for d, rel in deps.outgoing[source_idx] if d == destination_idx and rel == relation] + \
        ["<{}".format(rel) for d, rel in deps.incoming[source_idx] if d == destination_idx and rel == relation]
        return None if len(matches) == 0 else matches[0]

    @staticmethod
    def retrieve_edges(dep_graph, path):
        """
        Converts output of `DependencyUtils.shortest_path`
        into a list of triples that include the grammatical relation (and direction)
        for each node-node "hop" in the syntactic dependency graph.

        Parameters
        ----------
        dep_graph : processors.ds.DirectedGraph
            The `DirectedGraph` used to retrieve the grammatical relations for each edge in the `path`.

        path : [(int, int)]
            A list of tuples representing the shortest path from A to B in `dep_graph`.

        Returns
        -------
        [(int, str, int)]
            the shortest path (`path`) enhanced with the directed grammatical relations
            (ex. `>nsubj` for `predicate` to `subject` vs. `<nsubj` for `subject` to `predicate`).
        """

        shortest_path = []
        for (s, d) in path:
            # build dictionaries from incoming/outgoing
            outgoing = {dest_idx:">{}".format(rel) for (dest_idx, rel) in dep_graph.outgoing[s]}
            incoming = {source_idx:"<{}".format(rel) for (source_idx, rel) in dep_graph.incoming[s]}
            relation = outgoing[d] if d in outgoing else incoming[d]
            shortest_path.append((s, relation, d))
        return shortest_path

    @staticmethod
    def simplify_tag(tag):
        """
        Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.

        Parameters
        ----------
        tag : str
            The Penn-style PoS tag to be mapped to a simplified form.

        Returns
        -------
        str
            A simplified form of `tag`.  In some cases, the returned form may be identical to `tag`.
        """
        simple_tag = "\"{}\"".format(tag)
        # collapse plurals
        if tag.startswith("NNP"):
            simple_tag = "/^NNP/"
        # collapse plurals
        elif tag.startswith("NN"):
            simple_tag = "/^N/"
        elif tag.startswith("VB"):
            simple_tag = "/^V/"
        # collapse comparative, superlatives, etc.
        elif tag.startswith("JJ"):
            simple_tag = "/^J/"
        # collapse comparative, superlatives, etc.
        elif tag.startswith("RB"):
            simple_tag = "/^RB/"
        # collapse possessive/non-possesive pronouns
        elif tag.startswith("PRP"):
            simple_tag = "/^PRP/"
        # treat WH determiners as DT
        elif tag == "WDT":
            simple_tag = "/DT$/"
        # treat DT the same as WDT
        elif tag == "DT":
            simple_tag = "/DT$/"
        return simple_tag

    @staticmethod
    def lexicalize_path(sentence,
                        path,
                        words=False,
                        lemmas=False,
                        tags=False,
                        simple_tags=False,
                        entities=False,
                        limit_to=None,
                        ):
        """
        Lexicalizes path in syntactic dependency graph using Odin-style token constraints.  Operates on output of `DependencyUtils.retrieve_edges`

        Parameters
        ----------
        sentence : processors.ds.Sentence
            The `Sentence` from which the `path` was found.  Used to lexicalize the `path`.

        path : list
            A list of (source index, relation, target index) triples.

        words : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.words`

        lemmas : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.lemmas`

        tags : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.tags`

        simple_tags : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `DependencyUtils.simplify_tag` applied to `Sentence.tags`

        entities : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence._entities`

        limit_to : [int] or None
            Selectively apply lexicalization only to the this list of token indices.  None means apply the specified lexicalization to all token indices in the path.
        Returns
        -------
        [str]
            The lexicalized form of `path`, encoded according to the specified parameters.
        """
        UNKNOWN = LabelManager.UNKNOWN
        lexicalized_path = []
        relations = []
        nodes = []
        # gather edges and nodes
        for edge in path:
            relations.append(edge[1])
            nodes.append(edge[0])
        nodes.append(path[-1][-1])

        for (i, node) in enumerate(nodes):
            if not limit_to or node in limit_to:
                # build token constraints
                token_constraints = []
                # words
                if words:
                    token_constraints.append("word=\"{}\"".format(sentence.words[node]))
                # PoS tags
                if tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag=\"{}\"".format(sentence.tags[node]))
                # lemmas
                if lemmas and sentence.lemmas[node] != UNKNOWN:
                    token_constraints.append("lemma=\"{}\"".format(sentence.lemmas[node]))
                # NE labels
                if entities and sentence._entities[node] != UNKNOWN:
                    token_constraints.append("entity=\"{}\"".format(sentence.entity[node]))
                # simple tags
                if simple_tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag={}".format(DependencyUtils.simplify_tag(sentence.tags[node])))
                # build node pattern
                if len(token_constraints) > 0:
                    node_pattern = "[{}]".format(" & ".join(token_constraints))
                    # store lexicalized representation of node
                    lexicalized_path.append(node_pattern)
            # append next edge
            if i < len(relations):
                lexicalized_path.append(relations[i])
        return lexicalized_path

    @staticmethod
    def pagerank(networkx_graph,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.

        Parameters
        ----------
        networkx_graph : networkx.Graph
            Corresponds to `G` parameter of `networkx.algorithms.link_analysis.pagerank`.

        See Also
        --------
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)

        Returns
        -------
        collections.Counter
            A collections.Counter of node -> pagerank weights
        """
        pg_res = nx.algorithms.link_analysis.pagerank(G=networkx_graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
        return Counter(pg_res)



class HeadFinder(object):

    import processors

    @staticmethod
    def semantic_head(sentence, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
        """
        Finds the token with the highest pagerank score that meets the filtering criteria.

        Parameters
        ----------
        sentence : processors.ds.Sentence
            The Sentence to be analyzed.

        graph_name : str
            The name of the graph upon which to run the algorithm.  Default is "stanford-collapsed".

        valid_tags : set or None
            An optional set of str or regexes representing valid tokens.

        valid_indices : list or None
            A optional list of int representing the indices that should be considered.

        Returns
        -------
        int or None
            The index of the highest scoring token meeting the criteria.
        """

        from processors.ds import Sentence as Sent

        def is_valid_tag(tag):
            return True if not valid_tags else any(re.match(tag_pattern, tag) for tag_pattern in valid_tags)

        # ensure we're dealing with a Sentence
        if not isinstance(sentence, Sent): return None

        valid_indices = valid_indices if valid_indices else list(range(sentence.length))

        # corner case: if the sentence is a single token, pagerank doesn't apply.
        # check tag and index
        if sentence.length == 1:
            return 0 if is_valid_tag(sentence.tags[0]) and 0 in valid_indices else None

        dependencies = sentence.graphs.get(graph_name, None)

        if not dependencies: return None

        scored_toks = dependencies.pagerank().most_common()

        remaining = [i for (i, score) in scored_toks \
                    if i in valid_indices and
                    is_valid_tag(sentence.tags[i])]
        # take token with the highest pagerank score
        return remaining[0] if len(remaining) > 0 else None


1			#!/usr/bin/env python
2			# -- coding: utf-8 --
3			from __future__ import unicode_literals
4			from processors.utils import LabelManager
5			from collections import Counter
6			import networkx as nx
7			import collections
8			import re
9
10
11			class DependencyUtils(object):
12			"""
13			A set of utilities for analyzing syntactic dependency graphs.
14
15			Methods
16			-------
17			build_networkx_graph(roots, edges, name)
18			Constructs a networkx.Graph
19
20			shortest_path(g, start, end)
21			Finds the shortest path in a `networkx.Graph` between any element in a list of start nodes and any element in a list of end nodes.
22
23			retrieve_edges(dep_graph, path)
24			Converts output of `shortest_path` into a list of triples that include the grammatical relation (and direction) for each node-node "hop" in the syntactic dependency graph.
25
26			simplify_tag(tag)
27			Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.
28
29			lexicalize_path(sentence, path, words=False, lemmas=False, tags=False, simple_tags=False, entities=False, limit_to=None)
30			Lexicalizes path in syntactic dependency graph using Odin-style token constraints.
31
32			pagerank(networkx_graph, alpha=0.85, personalization=None, max_iter=1000, tol=1e-06, nstart=None, weight='weight', dangling=None)
33			Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`). Use with `processors.ds.DirectedGraph.graph`.
34			"""
35
36			UNKNOWN = LabelManager.UNKNOWN
37
38			@staticmethod
39			def build_networkx_graph(roots, edges, name, reverse=False):
40			"""
41			Converts a `processors` dependency graph into a networkx graph
42			"""
43			G = nx.DiGraph()
44			graph_name = name
45			# store roots
46			G.graph["roots"] = roots
47			# reversing the graph is useful if you want to run pagerank to highlight predicate and argument nodes
48			if reverse:
49			edges = [(edge.destination, edge.source, {"relation": edge.relation}) for edge in edges]
50			else:
51			edges = [(edge.source, edge.destination, {"relation": edge.relation}) for edge in edges]
52			G.add_edges_from(edges)
53			return G
54
55			@staticmethod
56			def shortest_paths(g, start, end):
57			"""
58			Find the shortest paths between two nodes.
59			Note that if `g` is a directed graph, a path will not be found.
60
61			Parameters
62			----------
63			g : a networkx graph
64			The networkx graph to explore.
65
66			start : int or [int]
67			A single token index or list of token indices serving as the start of the graph traversal.
68
69			end : int or [int]
70			A single token index or list of token indices serving as the end of the graph traversal.
71
72			Returns
73			-------
74			None or [[(int, int)]]
75			None if no paths are found. Otherwise, a list of lists of (source index, target index) tuples representing path segments.
76			"""
77			# converts single int to [int]
78			start = start if isinstance(start, collections.Iterable) else [start]
79			end = end if isinstance(end, collections.Iterable) else [end]
80			# node list -> edges (i.e., (source, dest) pairs)
81			def path_to_edges(g, path):
82			return [(path[i], path[i+1]) for i in range(len(path) - 1)]
83
84			shortest_paths = []
85			# pathfinding b/w pairs of nodes
86			for s in start:
87			for e in end:
88			try:
89			paths = nx.algorithms.all_shortest_paths(g, s, e)
90			for path in paths:
91			shortest_paths.append(path_to_edges(g, path))
92			# no path found...
93			except:
94			#print("No path found between '{}' and '{}'".format(s, e))
95			continue
96			return None if len(shortest_paths) == 0 else shortest_paths
97
98			@staticmethod
99			def shortest_path(g, start, end, scoring_func=lambda path: -len(path)):
100			"""
101			Find the shortest path between two nodes.
102			Note that pathfinding is sensitive to direction. If you want to ignore direction, convert your networkx.Digraph to a networkx.Graph.
103
104			Parameters
105			----------
106			g : a networkx graph
107			The networkx graph to explore.
108
109			start : int or [int]
110			A single token index or list of token indices serving as the start of the graph traversal.
111
112			end : int or [int]
113			A single token index or list of token indices serving as the end of the graph traversal.
114
115			scoring_func : function
116			A function that scores each path in a list of paths. Each path has the form [(source index, relation, destination index)].
117			The path with the maximum score will be returned.
118
119			Returns
120			-------
121			None or [(int, int)]
122			None if no paths are found. Otherwise, a list of (source index, target index) tuples representing path segments.
123			"""
124			paths = DependencyUtils.shortest_paths(g, start, end)
125			return None if len(shortest_paths) == 0 else max(paths, key=scoring_func)
126
127			@staticmethod
128			def directed_relation(source_idx, destination_idx, relation, deps):
129			"""
130			Converts relation to a directed relation (incoming v. outgoing)
131			if such a relation links `source_idx` and `destination_idx` in `deps`.
132
133			Parameters
134			----------
135			source_idx : int
136			The token index for the source node
137
138			destination_idx : int
139			The token index for the destination node
140
141			relation : str
142			The undirected relation (i.e., the grammatical/semantic relation that connects the two nodes)
143
144			deps : processors.ds.DirectedGraph
145			The directed graph to be referenced
146
147			Returns
148			-------
149			str or None
150			The directed relation that connects the `source_idx` to the `destination_idx` in `deps`.
151			"""
152			matches = [">{}".format(rel) for d, rel in deps.outgoing[source_idx] if d == destination_idx and rel == relation] + \
153			["<{}".format(rel) for d, rel in deps.incoming[source_idx] if d == destination_idx and rel == relation]
154			return None if len(matches) == 0 else matches[0]
155
156			@staticmethod
157			def retrieve_edges(dep_graph, path):
158			"""
159			Converts output of `DependencyUtils.shortest_path`
160			into a list of triples that include the grammatical relation (and direction)
161			for each node-node "hop" in the syntactic dependency graph.
162
163			Parameters
164			----------
165			dep_graph : processors.ds.DirectedGraph
166			The `DirectedGraph` used to retrieve the grammatical relations for each edge in the `path`.
167
168			path : [(int, int)]
169			A list of tuples representing the shortest path from A to B in `dep_graph`.
170
171			Returns
172			-------
173			[(int, str, int)]
174			the shortest path (`path`) enhanced with the directed grammatical relations
175			(ex. `>nsubj` for `predicate` to `subject` vs. `<nsubj` for `subject` to `predicate`).
176			"""
177
178			shortest_path = []
179			for (s, d) in path:
180			# build dictionaries from incoming/outgoing
181			outgoing = {dest_idx:">{}".format(rel) for (dest_idx, rel) in dep_graph.outgoing[s]}
182			incoming = {source_idx:"<{}".format(rel) for (source_idx, rel) in dep_graph.incoming[s]}
183			relation = outgoing[d] if d in outgoing else incoming[d]
184			shortest_path.append((s, relation, d))
185			return shortest_path
186
187			@staticmethod
188			def simplify_tag(tag):
189			"""
190			Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.
191
192			Parameters
193			----------
194			tag : str
195			The Penn-style PoS tag to be mapped to a simplified form.
196
197			Returns
198			-------
199			str
200			A simplified form of `tag`. In some cases, the returned form may be identical to `tag`.
201			"""
202			simple_tag = "\"{}\"".format(tag)
203			# collapse plurals
204			if tag.startswith("NNP"):
205			simple_tag = "/^NNP/"
206			# collapse plurals
207			elif tag.startswith("NN"):
208			simple_tag = "/^N/"
209			elif tag.startswith("VB"):
210			simple_tag = "/^V/"
211			# collapse comparative, superlatives, etc.
212			elif tag.startswith("JJ"):
213			simple_tag = "/^J/"
214			# collapse comparative, superlatives, etc.
215			elif tag.startswith("RB"):
216			simple_tag = "/^RB/"
217			# collapse possessive/non-possesive pronouns
218			elif tag.startswith("PRP"):
219			simple_tag = "/^PRP/"
220			# treat WH determiners as DT
221			elif tag == "WDT":
222			simple_tag = "/DT$/"
223			# treat DT the same as WDT
224			elif tag == "DT":
225			simple_tag = "/DT$/"
226			return simple_tag
227
228			@staticmethod
229			def lexicalize_path(sentence,
230			path,
231			words=False,
232			lemmas=False,
233			tags=False,
234			simple_tags=False,
235			entities=False,
236			limit_to=None,
237			):
238			"""
239			Lexicalizes path in syntactic dependency graph using Odin-style token constraints. Operates on output of `DependencyUtils.retrieve_edges`
240
241			Parameters
242			----------
243			sentence : processors.ds.Sentence
244			The `Sentence` from which the `path` was found. Used to lexicalize the `path`.
245
246			path : list
247			A list of (source index, relation, target index) triples.
248
249			words : bool
250			Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.words`
251
252			lemmas : bool
253			Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.lemmas`
254
255			tags : bool
256			Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.tags`
257
258			simple_tags : bool
259			Whether or not to encode nodes in the `path` with a token constraint constructed from `DependencyUtils.simplify_tag` applied to `Sentence.tags`
260
261			entities : bool
262			Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence._entities`
263
264			limit_to : [int] or None
265			Selectively apply lexicalization only to the this list of token indices. None means apply the specified lexicalization to all token indices in the path.
266			Returns
267			-------
268			[str]
269			The lexicalized form of `path`, encoded according to the specified parameters.
270			"""
271			UNKNOWN = LabelManager.UNKNOWN
272			lexicalized_path = []
273			relations = []
274			nodes = []
275			# gather edges and nodes
276			for edge in path:
277			relations.append(edge[1])
278			nodes.append(edge[0])
279			nodes.append(path[-1][-1])
280
281			for (i, node) in enumerate(nodes):
282			if not limit_to or node in limit_to:
283			# build token constraints
284			token_constraints = []
285			# words
286			if words:
287			token_constraints.append("word=\"{}\"".format(sentence.words[node]))
288			# PoS tags
289			if tags and sentence.tags[node] != UNKNOWN:
290			token_constraints.append("tag=\"{}\"".format(sentence.tags[node]))
291			# lemmas
292			if lemmas and sentence.lemmas[node] != UNKNOWN:
293			token_constraints.append("lemma=\"{}\"".format(sentence.lemmas[node]))
294			# NE labels
295			if entities and sentence._entities[node] != UNKNOWN:
296			token_constraints.append("entity=\"{}\"".format(sentence.entity[node]))
297			# simple tags
298			if simple_tags and sentence.tags[node] != UNKNOWN:
299			token_constraints.append("tag={}".format(DependencyUtils.simplify_tag(sentence.tags[node])))
300			# build node pattern
301			if len(token_constraints) > 0:
302			node_pattern = "[{}]".format(" & ".join(token_constraints))
303			# store lexicalized representation of node
304			lexicalized_path.append(node_pattern)
305			# append next edge
306			if i < len(relations):
307			lexicalized_path.append(relations[i])
308			return lexicalized_path
309
310			@staticmethod
311			def pagerank(networkx_graph,
312			alpha=0.85,
313			personalization=None,
314			max_iter=1000,
315			tol=1e-06,
316			nstart=None,
317			weight='weight',
318			dangling=None):
319			"""
320			Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`). Use with `processors.ds.DirectedGraph.graph`.
321
322			Parameters
323			----------
324			networkx_graph : networkx.Graph
325			Corresponds to `G` parameter of `networkx.algorithms.link_analysis.pagerank`.
326
327			See Also
328			--------
329			Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
330
331			Returns
332			-------
333			collections.Counter
334			A collections.Counter of node -> pagerank weights
335			"""
336			pg_res = nx.algorithms.link_analysis.pagerank(G=networkx_graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
337			return Counter(pg_res)
338
339
340
341			class HeadFinder(object):
342
343			import processors
344
345			@staticmethod
346			def semantic_head(sentence, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
347			"""
348			Finds the token with the highest pagerank score that meets the filtering criteria.
349
350			Parameters
351			----------
352			sentence : processors.ds.Sentence
353			The Sentence to be analyzed.
354
355			graph_name : str
356			The name of the graph upon which to run the algorithm. Default is "stanford-collapsed".
357
358			valid_tags : set or None
359			An optional set of str or regexes representing valid tokens.
360
361			valid_indices : list or None
362			A optional list of int representing the indices that should be considered.
363
364			Returns
365			-------
366			int or None
367			The index of the highest scoring token meeting the criteria.
368			"""
369
370			from processors.ds import Sentence as Sent
371
372			def is_valid_tag(tag):
373			return True if not valid_tags else any(re.match(tag_pattern, tag) for tag_pattern in valid_tags)
374
375			# ensure we're dealing with a Sentence
376			if not isinstance(sentence, Sent): return None
377
378			valid_indices = valid_indices if valid_indices else list(range(sentence.length))
379
380			# corner case: if the sentence is a single token, pagerank doesn't apply.
381			# check tag and index
382			if sentence.length == 1:
383			return 0 if is_valid_tag(sentence.tags[0]) and 0 in valid_indices else None
384
385			dependencies = sentence.graphs.get(graph_name, None)
386
387			if not dependencies: return None
388
389			scored_toks = dependencies.pagerank().most_common()
390
391			remaining = [i for (i, score) in scored_toks \
392			if i in valid_indices and
393			is_valid_tag(sentence.tags[i])]
394			# take token with the highest pagerank score
395			return remaining[0] if len(remaining) > 0 else None
396

myedibleenso / py-processors

DependencyUtils.simplify_tag() F last analyzed 2018-06-14 22:58 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

DependencyUtils.simplify_tag() F
last analyzed 2018-06-14 22:58 UTC