Completed
Push — master ( cc3c01...072717 )
by Dafne van
14s queued 11s
created

e2edutch.stanza.clusterSetter()   A

Complexity

Conditions 2

Size

Total Lines 5
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 5
rs 10
c 0
b 0
f 0
cc 2
nop 2
1
import os
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
Unused Code introduced by
The import os seems to be unused.
Loading history...
2
import stanza
3
import logging
0 ignored issues
show
introduced by
standard import "import logging" should be placed before "import stanza"
Loading history...
4
5
from pathlib import Path
0 ignored issues
show
introduced by
standard import "from pathlib import Path" should be placed before "import stanza"
Loading history...
6
7
from e2edutch import util
8
from e2edutch import coref_model as cm
0 ignored issues
show
Unused Code introduced by
Unused coref_model imported from e2edutch as cm
Loading history...
9
from e2edutch.download import download_data
10
from e2edutch.predict import Predictor
11
12
from stanza.pipeline.processor import Processor, register_processor
0 ignored issues
show
introduced by
third party import "from stanza.pipeline.processor import Processor, register_processor" should be placed before "from e2edutch import util"
Loading history...
13
from stanza.models.common.doc import Document, Span
0 ignored issues
show
Unused Code introduced by
Unused Document imported from stanza.models.common.doc
Loading history...
introduced by
third party import "from stanza.models.common.doc import Document, Span" should be placed before "from e2edutch import util"
Loading history...
14
15
16
# Add a Clusters property to documents as a List of List of Span:
17
# Clusters is a List of cluster, cluster is a List of Span
18
def clusterSetter(self, value):
0 ignored issues
show
Coding Style Naming introduced by
Function name "clusterSetter" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
introduced by
Missing function or method docstring
Loading history...
19
    if isinstance(value, type([])):
20
        self._clusters = value
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like _clusters was declared protected and should not be accessed from this context.

Prefixing a member variable with _ is usually regarded as the equivalent of declaring it with the protected visibility that exists in other languages. Consequently, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1;
        self.y = 2;

class MyChild(MyParent):
    def some_method(self):
        return self._x    # Ok, since accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged as AnotherClass is not
                                         # a child class of MyParent
Loading history...
21
    else:
22
        logger.error('Clusters must be a List')
23
24
stanza.models.common.doc.Document.add_property('clusters', default='[]', setter=clusterSetter)
25
26
27
import tensorflow.compat.v1 as tf
0 ignored issues
show
introduced by
Unable to import 'tensorflow.compat.v1'
Loading history...
introduced by
Import "import tensorflow.compat.v1 as tf" should be placed at the top of the module
Loading history...
Unused Code introduced by
Unused tensorflow.compat.v1 imported as tf
Loading history...
introduced by
third party import "import tensorflow.compat.v1 as tf" should be placed before "from e2edutch import util"
Loading history...
28
29
logger = logging.getLogger('e2edutch')
30
logger.setLevel(logging.INFO)
31
logger.addHandler(logging.StreamHandler())
32
33
34
@register_processor('coref')
35
class CorefProcessor(Processor):
36
    ''' Processor that appends coreference information '''
37
    _requires = set(['tokenize'])
38
    _provides = set(['coref'])
39
    
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
40
    def __init__(self, config, pipeline, use_gpu):
0 ignored issues
show
Bug introduced by
The __init__ method of the super-class Processor is not called.

It is generally advisable to initialize the super-class by calling its __init__ method:

class SomeParent:
    def __init__(self):
        self.x = 1

class SomeChild(SomeParent):
    def __init__(self):
        # Initialize the super class
        SomeParent.__init__(self)
Loading history...
41
        # Make e2edutch follow Stanza's GPU settings:
42
        # set the environment value for GPU, so that initialize_from_env picks it up.
43
        #if use_gpu:
44
        #    os.environ['GPU'] = ' '.join(tf.config.experimental.list_physical_devices('GPU'))
45
        #else:
46
        #    if 'GPU' in os.environ['GPU'] :
47
        #        os.environ.pop('GPU')
48
49
        self.e2econfig = util.initialize_from_env(model_name='final')
50
51
        # Override datapath and log_root:
52
        # store e2edata with the Stanza resources, ie. a 'stanza_resources/nl/coref' directory
53
        self.e2econfig['datapath'] = Path(config['model_path']).parent
54
        self.e2econfig['log_root'] = Path(config['model_path']).parent
55
56
        # Download data files if not present
57
        download_data(self.e2econfig)
58
59
        # Start and stop a session to cache all models
60
        predictor = Predictor(config=self.e2econfig)
61
        predictor.end_session()
62
63
    def _set_up_model(self, *args):
0 ignored issues
show
Unused Code introduced by
The argument args seems to be unused.
Loading history...
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example:

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
64
        print ('_set_up_model')
0 ignored issues
show
Coding Style introduced by
No space allowed before bracket
Loading history...
65
        pass
0 ignored issues
show
Unused Code introduced by
Unnecessary pass statement
Loading history...
66
67
    def process(self, doc):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (18/15).
Loading history...
68
69
        predictor = Predictor(config=self.e2econfig)
70
71
        # build the example argument for predict:
72
        #   example (dict): dict with the following fields:
73
        #                     sentences ([[str]])
74
        #                     doc_id (str)
75
        #                     clusters ([[(int, int)]]) (optional)
76
        example = {}
77
        example['sentences'] = []
78
        example['doc_id'] = 'document_from_stanza'  # TODO check what this should be
0 ignored issues
show
Coding Style introduced by
TODO and FIXME comments should generally be avoided.
Loading history...
79
        example['doc_key'] = 'undocumented'  # TODO check what this should be
0 ignored issues
show
Coding Style introduced by
TODO and FIXME comments should generally be avoided.
Loading history...
80
81
        for sentence in doc.sentences:
82
            s = []
0 ignored issues
show
Coding Style Naming introduced by
Variable name "s" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
83
            for word in sentence.words:
84
                s.append(word.text)
85
            example['sentences'].append(s)
86
87
        predicted_clusters = predictor.predict(example)  # a list of tuples
88
89
        # Add the predicted clusters back to the Stanza document
90
91
        clusters = []
92
        for predicted_cluster in predicted_clusters:  # a tuple of entities
93
            cluster = []
94
            for predicted_reference in predicted_cluster:  # a tuple of (start, end) word
95
                start, end = predicted_reference
96
97
                # find the sentence_id of the sentence containing this reference
98
                sentence_id = 0
99
                sentence = doc.sentences[0]
100
                sentence_start_word = 0
101
                sentence_end_word = len(sentence.words) - 1
102
103
                while sentence_end_word < start:
104
                    sentence_start_word = sentence_end_word + 1
105
106
                    # move to the next sentence
107
                    sentence_id += 1
108
                    sentence = doc.sentences[sentence_id]
109
110
                    sentence_end_word = sentence_start_word + len(sentence.words) - 1
111
112
                # start counting words from the start of this sentence
113
                start -= sentence_start_word
114
                end -= sentence_start_word
115
116
                span = Span(  # a list of Tokens
117
                        tokens=[word.parent for word in sentence.words[start:end + 1]],
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation (remove 4 spaces).
Loading history...
118
                        doc=doc,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation (remove 4 spaces).
Loading history...
119
                        type='COREF',
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation (remove 4 spaces).
Loading history...
120
                        sent=doc.sentences[sentence_id]
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation (remove 4 spaces).
Loading history...
121
                        )
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation.
Loading history...
122
                cluster.append(span)
123
124
            clusters.append(cluster)
125
126
        doc.clusters = clusters
127
128
        predictor.end_session()
129
130
        return doc
131