e2edutch.stanza — overall rating: A

Complexity

    Total Complexity: 10

Size/Duplication

    Total Lines: 135
    Duplicated Lines: 0%

Importance

    Changes: 0

Metric   Value
wmc      10
eloc     72
dl       0
loc      135
rs       10
c        0
b        0
f        0

3 Methods

Rating  Name                            Duplication  Size  Complexity
A       CorefProcessor._set_up_model()  0            3     1
A       CorefProcessor.__init__()       0            22    1
B       CorefProcessor.process()        0            64    6

1 Function

Rating  Name             Duplication  Size  Complexity
A       clusterSetter()  0            5     2
import tensorflow.compat.v1 as tf
    - Missing module docstring
    - Unable to import 'tensorflow.compat.v1'
    - Unused Code: Unused tensorflow.compat.v1 imported as tf
import os
    - Unused Code: The import os seems to be unused.
    - standard import "import os" should be placed before "import tensorflow.compat.v1 as tf"
import stanza
import logging
    - standard import "import logging" should be placed before "import tensorflow.compat.v1 as tf"

from pathlib import Path
    - standard import "from pathlib import Path" should be placed before "import tensorflow.compat.v1 as tf"

from e2edutch import util
from e2edutch import coref_model as cm
    - Unused Code: Unused coref_model imported from e2edutch as cm
from e2edutch.download import download_data
from e2edutch.predict import Predictor

from stanza.pipeline.processor import Processor, register_processor
    - third party import "from stanza.pipeline.processor import Processor, register_processor" should be placed before "from e2edutch import util"
from stanza.models.common.doc import Document, Span
    - Unused Code: Unused Document imported from stanza.models.common.doc
    - third party import "from stanza.models.common.doc import Document, Span" should be placed before "from e2edutch import util"


# Add a Clusters property to documents as a List of List of Span:
# Clusters is a List of cluster, cluster is a List of Span
def clusterSetter(self, value):
    - Coding Style Naming: Function name "clusterSetter" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements; if your project includes a Pylint configuration file, its settings take precedence.
    - Missing function or method docstring
    if isinstance(value, type([])):
        self._clusters = value
    - Coding Style Best Practice: It seems like _clusters was declared protected and should not be accessed from this context. Prefixing a member variable with _ is usually regarded as the equivalent of the protected visibility that exists in other languages; such a member should only be accessed from the same class or a child class:

          class MyParent:
              def __init__(self):
                  self._x = 1
                  self.y = 2

          class MyChild(MyParent):
              def some_method(self):
                  return self._x    # Ok, since accessed from a child class

          class AnotherClass:
              def some_method(self, instance_of_my_child):
                  return instance_of_my_child._x   # Flagged, as AnotherClass is not
                                                   # a child class of MyParent
    else:
        logger.error('Clusters must be a List')


stanza.models.common.doc.Document.add_property('clusters', default='[]', setter=clusterSetter)


logger = logging.getLogger('e2edutch')
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

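The add_property call above installs clusterSetter as the setter of a new clusters attribute on Stanza's Document. Its contract can be sketched with a plain Python property on a stand-in class (FakeDocument and the span strings below are hypothetical; Stanza's real Document class is not used here):

```python
import logging

logger = logging.getLogger('e2edutch')


class FakeDocument:
    """Hypothetical stand-in for Stanza's Document, illustrating the
    setter contract only: a list is stored, anything else is rejected
    with an error log and the previous value is kept."""

    def __init__(self):
        self._clusters = []

    @property
    def clusters(self):
        return self._clusters

    @clusters.setter
    def clusters(self, value):
        # Mirrors clusterSetter: only accept a List
        if isinstance(value, list):
            self._clusters = value
        else:
            logger.error('Clusters must be a List')


doc = FakeDocument()
doc.clusters = [['span_a', 'span_b']]  # accepted: value is a list
doc.clusters = 'not-a-list'            # rejected: error logged, value unchanged
```

Note that `isinstance(value, type([]))` in the original is equivalent to `isinstance(value, list)`; the latter is the idiomatic spelling.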
@register_processor('coref')
class CorefProcessor(Processor):
    ''' Processor that appends coreference information
    Coreferences are added similarly to Stanza's entities:
    * a Document has an attribute clusters that is a List of coreference clusters;
    * a coreference cluster is a List of Stanza Spans.
    '''
    _requires = set(['tokenize'])
    _provides = set(['coref'])

    def __init__(self, config, pipeline, use_gpu):
    - Bug: The __init__ method of the super-class Processor is not called. It is generally advisable to initialize the super-class by calling its __init__ method:

          class SomeParent:
              def __init__(self):
                  self.x = 1

          class SomeChild(SomeParent):
              def __init__(self):
                  # Initialize the super class
                  SomeParent.__init__(self)
        # Make e2edutch follow Stanza's GPU settings:
        # set the environment value for GPU, so that initialize_from_env picks it up.
        # if use_gpu:
        #    os.environ['GPU'] = ' '.join(tf.config.experimental.list_physical_devices('GPU'))
        # else:
        #    if 'GPU' in os.environ['GPU'] :
        #        os.environ.pop('GPU')

        self.e2econfig = util.initialize_from_env(model_name='final')

        # Override datapath and log_root:
        # store e2edata with the Stanza resources, i.e. in a 'stanza_resources/nl/coref' directory
        self.e2econfig['datapath'] = Path(config['model_path']).parent
        self.e2econfig['log_root'] = Path(config['model_path']).parent

        # Download data files if not present
        download_data(self.e2econfig)

        # Start and stop a session to cache all models
        predictor = Predictor(config=self.e2econfig)
        predictor.end_session()

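The datapath/log_root override places the e2e data in the directory that contains the Stanza model file. Path.parent does exactly that truncation (the model path below is a hypothetical example, not one taken from the project):

```python
from pathlib import Path

# Hypothetical value of config['model_path'] as Stanza might supply it
model_path = 'stanza_resources/nl/coref/model.pt'

# Both datapath and log_root become the directory containing the model file
datapath = Path(model_path).parent.as_posix()
```

Using `.as_posix()` here only normalizes the separator for display; the original code keeps the Path object itself.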
    def _set_up_model(self, *args):
    - Unused Code: The argument args seems to be unused.
    - Coding Style: This method could be written as a function/class method. If a method does not access any attributes of the class, it can also be implemented as a function or a class/static method, which can help improve readability. For example

          class Foo:
              def some_method(self, x, y):
                  return x + y

      could be written as

          class Foo:
              @classmethod
              def some_method(cls, x, y):
                  return x + y

        print('_set_up_model')
        pass
    - Unused Code: Unnecessary pass statement

    def process(self, doc):
    - Comprehensibility: This function exceeds the maximum number of variables (18/15).

        predictor = Predictor(config=self.e2econfig)

        # build the example argument for predict:
        #   example (dict): dict with the following fields:
        #                     sentences ([[str]])
        #                     doc_id (str)
        #                     clusters ([[(int, int)]]) (optional)
        example = {}
        example['sentences'] = []
        example['doc_id'] = 'document_from_stanza'  # TODO check what this should be
    - Coding Style: TODO and FIXME comments should generally be avoided.
        example['doc_key'] = 'undocumented'  # TODO check what this should be
    - Coding Style: TODO and FIXME comments should generally be avoided.

        for sentence in doc.sentences:
            s = []
    - Coding Style Naming: Variable name "s" doesn't conform to snake_case naming style.
            for word in sentence.words:
                s.append(word.text)
            example['sentences'].append(s)

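The loop above flattens each Stanza sentence into a list of its word texts. Assuming sentences of tokenized words, the example dict handed to Predictor.predict takes this shape (the Dutch tokens are invented for illustration):

```python
# Hypothetical token lists standing in for the word texts of two
# Stanza sentences; each inner list plays the role of one sentence.
tokenized = [['Jan', 'ziet', 'Marie', '.'], ['Zij', 'zwaait', '.']]

# Shape of the example dict that process() builds before prediction
example = {
    'doc_id': 'document_from_stanza',
    'doc_key': 'undocumented',
    'sentences': [list(words) for words in tokenized],
}
```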
        predicted_clusters = predictor.predict(example)  # a list of tuples

        # Add the predicted clusters back to the Stanza document

        clusters = []
        for predicted_cluster in predicted_clusters:  # a tuple of entities
            cluster = []
            for predicted_reference in predicted_cluster:  # a tuple of (start, end) word
                start, end = predicted_reference

                # find the sentence_id of the sentence containing this reference
                sentence_id = 0
                sentence = doc.sentences[0]
                sentence_start_word = 0
                sentence_end_word = len(sentence.words) - 1

                while sentence_end_word < start:
                    sentence_start_word = sentence_end_word + 1

                    # move to the next sentence
                    sentence_id += 1
                    sentence = doc.sentences[sentence_id]

                    sentence_end_word = sentence_start_word + len(sentence.words) - 1

                # start counting words from the start of this sentence
                start -= sentence_start_word
                end -= sentence_start_word

                span = Span(  # a list of Tokens
                    tokens=[word.parent for word in sentence.words[start:end + 1]],
                    doc=doc,
                    type='COREF',
                    sent=doc.sentences[sentence_id]
                )
                cluster.append(span)

            clusters.append(cluster)

        doc.clusters = clusters

        predictor.end_session()

        return doc
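The while loop in process() converts a document-wide word span into a sentence-local one. Isolated from Stanza, the same arithmetic can be written as a small function (to_sentence_local is a name invented here; like the original loop, it assumes a span never crosses a sentence boundary):

```python
def to_sentence_local(sentence_lengths, start, end):
    """Map a document-wide word span (start, end) to
    (sentence_id, local_start, local_end), mirroring the while loop
    in CorefProcessor.process()."""
    sentence_id = 0
    sentence_start_word = 0
    sentence_end_word = sentence_lengths[0] - 1

    # Advance sentence by sentence until this one contains `start`
    while sentence_end_word < start:
        sentence_start_word = sentence_end_word + 1
        sentence_id += 1
        sentence_end_word = sentence_start_word + sentence_lengths[sentence_id] - 1

    # Re-base the span on the start of the containing sentence
    return sentence_id, start - sentence_start_word, end - sentence_start_word


# Sentences of 4 and 3 words: global words 0-3 are sentence 0,
# global words 4-6 are sentence 1, so global word 5 is local word 1.
print(to_sentence_local([4, 3], 5, 6))
```

A precomputed prefix sum of sentence lengths would allow the same lookup without rescanning sentences for every reference, which may matter for long documents.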