e2edutch.naf   A
last analyzed

Complexity

Total Complexity 21

Size/Duplication

Total Lines 141
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 21
eloc 118
dl 0
loc 141
rs 10
c 0
b 0
f 0

5 Functions

Rating   Name   Duplication   Size   Complexity  
A add_linguistic_processors() 0 10 1
A get_naf() 0 17 5
A create_coref_layer() 0 18 5
B get_naf_from_sentences() 0 50 5
B get_jsonlines() 0 23 5
1
from . import __version__
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
import logging
0 ignored issues
show
introduced by
standard import "import logging" should be placed before "from . import __version__"
Loading history...
3
4
import KafNafParserPy
0 ignored issues
show
introduced by
third party import "import KafNafParserPy" should be placed before "from . import __version__"
Loading history...
5
from KafNafParserPy import KafNafParser
0 ignored issues
show
introduced by
third party import "from KafNafParserPy import KafNafParser" should be placed before "from . import __version__"
Loading history...
6
from lxml.etree import XMLSyntaxError
0 ignored issues
show
Bug introduced by
The name XMLSyntaxError does not seem to exist in module lxml.etree.
Loading history...
introduced by
third party import "from lxml.etree import XMLSyntaxError" should be placed before "from . import __version__"
Loading history...
7
import itertools
0 ignored issues
show
introduced by
standard import "import itertools" should be placed before "import KafNafParserPy"
Loading history...
8
import tensorflow.compat.v1 as tf
0 ignored issues
show
introduced by
Unable to import 'tensorflow.compat.v1'
Loading history...
introduced by
third party import "import tensorflow.compat.v1 as tf" should be placed before "from . import __version__"
Loading history...
9
tf.disable_v2_behavior()
10
11
logger = logging.getLogger('e2edutch')
12
this_name = 'Coreference resolution based on e2e model'
0 ignored issues
show
Coding Style Naming introduced by
Constant name "this_name" doesn't conform to UPPER_CASE naming style ('([^\\W\\da-z][^\\Wa-z]*|__.*__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
13
14
15
def get_naf(input_filename):
    """Read ``input_filename`` and return a :class:`KafNafParser` object.

    The file is first parsed as NAF/KAF XML.  If that raises an
    ``XMLSyntaxError``, the content is treated as plain text and wrapped
    in a fresh NAF document (version "3.0", language "nl") — unless the
    text contains NAF markup, in which case the file was presumably
    meant to be NAF but is malformed, and the parse error is re-raised.
    """
    try:
        naf = KafNafParser(input_filename)
    except XMLSyntaxError:
        # ``content`` instead of ``input``: don't shadow the built-in.
        with open(input_filename) as input_file:
            content = input_file.read()
        if "<NAF" in content and "</NAF>" in content:
            # Looks like a broken NAF file; surface the original parse
            # error instead of silently treating the XML as raw text.
            logger.exception("Error parsing NAF file")
            raise
        naf = KafNafParser(type="NAF")
        naf.set_version("3.0")
        naf.set_language("nl")
        naf.lang = "nl"
        naf.raw = content
        naf.set_raw(naf.raw)
    return naf
32
33
34
def get_naf_from_sentences(sentences):
    """Build a NAF object with text and term layers from tokenized text.

    Parameters
    ----------
    sentences : list of list of str
        One list of token strings per sentence.

    Returns
    -------
    tuple
        ``(naf_obj, term_ids)`` where ``term_ids`` is a per-sentence
        list of term-identifier lists, parallel to the input tokens.
    """
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    naf_obj.lang = "nl"
    naf_obj.raw = '\n'.join(' '.join(sentence) for sentence in sentences)
    naf_obj.set_raw(naf_obj.raw)

    token_ids = _add_text_layer(naf_obj, sentences)
    term_ids = _add_term_layer(naf_obj, token_ids)
    return naf_obj, term_ids


def _add_text_layer(naf_obj, sentences):
    """Add one <wf> element per token; return token ids per sentence."""
    txt = naf_obj.get_raw()
    wcount = 1
    # Position in ``txt`` from which the next token may start.  Advancing
    # it past each found token fixes a bug where repeated tokens (e.g.
    # "de ... de") all resolved to the offset of the first occurrence,
    # because the search restarted at the *previous token's start*.
    search_from = 0
    token_ids = []
    for sid, sentence in enumerate(sentences):
        token_ids_sub = []
        for token in sentence:
            offset = txt.find(token, search_from)
            search_from = offset + len(token)
            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_obj.set_id(token_id)
            token_obj.set_length(str(len(token)))
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            token_obj.set_offset(str(offset))
            naf_obj.add_wf(token_obj)
            token_ids_sub.append(token_id)
            wcount += 1
        token_ids.append(token_ids_sub)
    return token_ids


def _add_term_layer(naf_obj, token_ids):
    """Add one <term> per token, each spanning exactly that token."""
    logger.info('Creating the term layer...')
    term_ids = []
    count_terms = 0
    for token_ids_sub in token_ids:
        term_ids_sub = []
        for token_id in token_ids_sub:
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
            term_ids_sub.append(new_term_id)
        term_ids.append(term_ids_sub)
    return term_ids
84
85
86
def create_coref_layer(knaf_obj, clusters, term_ids):
    """Add a coreference layer to ``knaf_obj`` from predicted clusters.

    Each cluster is a list of ``(start, end)`` mention spans, expressed
    as inclusive indices into the flattened term-id list.  Returns the
    modified ``knaf_obj``.
    """
    flat_term_ids = list(itertools.chain.from_iterable(term_ids))
    for cluster_index, mentions in enumerate(clusters, start=1):
        coref_obj = KafNafParserPy.Ccoreference(type=knaf_obj.get_type())
        coref_obj.set_id('co{}'.format(cluster_index))
        coref_obj.set_type('entity')
        for start, end in mentions:
            mention_term_ids = flat_term_ids[start:end + 1]
            coref_obj.add_span(mention_term_ids)
            # Collect the surface words of this mention for debugging.
            words = [
                knaf_obj.get_token(word_id).get_text()
                for term_id in mention_term_ids
                for word_id in knaf_obj.get_term(term_id).get_span_ids()
            ]
            span_text = ' '.join(words)
            # TODO: output span_text as comment
        knaf_obj.add_coreference(coref_obj)
    return knaf_obj
104
105
106
def add_linguistic_processors(in_obj):
    """Register this tool in the NAF linguistic-processors header.

    Adds a 'coreferences' processor entry carrying the module name,
    package version and a timestamp, then returns ``in_obj``.
    """
    processor = KafNafParserPy.Clp()
    processor.set_name(this_name)
    processor.set_version(__version__)
    processor.set_timestamp()
    in_obj.add_linguistic_processor('coreferences', processor)
    return in_obj
116
117
118
def get_jsonlines(knaf_obj):
    """Convert a NAF object into the jsonlines dict the e2e model expects.

    Returns
    -------
    tuple
        ``(jsonlines_obj, term_ids, tok_ids)`` — the jsonlines dict
        (with empty ``clusters``), plus per-sentence lists of term and
        token identifiers parallel to ``jsonlines_obj['sentences']``.
    """
    # One (sentence id, term id, token id, token text) record per token,
    # in document order.
    sent_term_tok = []
    for term in knaf_obj.get_terms():
        for tok_id in term.get_span_ids():
            tok = knaf_obj.get_token(tok_id)
            sent_term_tok.append(
                (tok.get_sent(), term.get_id(), tok_id, tok.get_text()))

    sentences = []
    term_ids = []
    tok_ids = []
    # groupby only merges *consecutive* equal keys; that is sufficient
    # here because the records above are built in document order.
    for _, group in itertools.groupby(sent_term_tok, lambda t: t[0]):
        records = list(group)
        sentences.append([rec[3] for rec in records])
        term_ids.append([rec[1] for rec in records])
        tok_ids.append([rec[2] for rec in records])

    jsonlines_obj = {'doc_key': str(knaf_obj.get_filename()),
                     'sentences': sentences,
                     'clusters': []
                     }
    return jsonlines_obj, term_ids, tok_ids
141