1
|
|
|
from . import __version__ |
|
|
|
|
2
|
|
|
import logging |
|
|
|
|
3
|
|
|
|
4
|
|
|
import KafNafParserPy |
|
|
|
|
5
|
|
|
from KafNafParserPy import KafNafParser |
|
|
|
|
6
|
|
|
from lxml.etree import XMLSyntaxError |
|
|
|
|
7
|
|
|
import itertools |
|
|
|
|
8
|
|
|
import tensorflow.compat.v1 as tf |
|
|
|
|
9
|
|
|
tf.disable_v2_behavior() |
10
|
|
|
|
11
|
|
|
logger = logging.getLogger('e2edutch') |
12
|
|
|
this_name = 'Coreference resolution based on e2e model' |
|
|
|
|
13
|
|
|
|
14
|
|
|
|
15
|
|
|
def get_naf(input_filename):
    """Read a NAF document from *input_filename*.

    First tries to parse the file as NAF/KAF XML. If XML parsing fails
    but the content looks like a NAF document ("<NAF" ... "</NAF>"),
    the parse error is logged and re-raised. Otherwise the content is
    treated as raw text and wrapped in a fresh NAF object
    (version "3.0", language "nl").

    Parameters
    ----------
    input_filename : str
        Path to a NAF XML file or a plain-text file.

    Returns
    -------
    KafNafParser

    Raises
    ------
    XMLSyntaxError
        When the file looks like NAF but the XML is malformed.
    """
    try:
        naf = KafNafParser(input_filename)
    except XMLSyntaxError:
        # Renamed from `input` to avoid shadowing the builtin.
        with open(input_filename) as input_file:
            raw_text = input_file.read()
        if "<NAF" in raw_text and "</NAF>" in raw_text:
            # Looks like it should be a NAF file but something is wrong.
            logger.exception("Error parsing NAF file")
            raise
        # Plain text: wrap it in a minimal NAF document.
        naf = KafNafParser(type="NAF")
        naf.set_version("3.0")
        naf.set_language("nl")
        naf.lang = "nl"
        naf.raw = raw_text
        naf.set_raw(naf.raw)
    return naf
32
|
|
|
|
33
|
|
|
|
34
|
|
|
def get_naf_from_sentences(sentences):
    """Build a NAF object from pre-tokenized sentences.

    Parameters
    ----------
    sentences : list of list of str
        Each inner list is one sentence's tokens.

    Creates a NAF document (version "3.0", language "nl") whose raw text
    joins tokens with spaces and sentences with newlines, then adds a
    text layer (one Cwf per token, with character offsets into the raw
    text) and a term layer (one Cterm per token).

    Returns
    -------
    (KafNafParser, list of list of str)
        The NAF object and per-sentence lists of term identifiers.
    """
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    naf_obj.lang = "nl"
    naf_obj.raw = '\n'.join([' '.join(s) for s in sentences])
    naf_obj.set_raw(naf_obj.raw)

    # Create text layer
    wcount = 1
    offsets = {}
    txt = naf_obj.get_raw()
    # Position in txt just past the previously placed token. The original
    # code searched from the previous token's *start* offset, which gave
    # consecutive identical tokens (e.g. "the the") the same offset.
    search_pos = 0
    token_ids = []
    for sid, sentence in enumerate(sentences):
        token_ids_sub = []
        for token in sentence:
            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_length = len(token)
            offsets[wcount] = txt.find(token, search_pos)
            search_pos = offsets[wcount] + token_length
            token_obj.set_id(token_id)
            token_obj.set_length(str(token_length))
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            token_obj.set_offset(str(offsets[wcount]))
            token_ids_sub.append(token_id)
            wcount += 1
            naf_obj.add_wf(token_obj)
        token_ids.append(token_ids_sub)

    # Create term layer: one term per token, spanning exactly that token.
    logger.info('Creating the term layer...')
    term_ids = []
    count_terms = 0
    for sentence, token_ids_sub in zip(sentences, token_ids):
        term_ids_sub = []
        for token, token_id in zip(sentence, token_ids_sub):
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_ids_sub.append(new_term_id)
            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
        term_ids.append(term_ids_sub)

    return naf_obj, term_ids
84
|
|
|
|
85
|
|
|
|
86
|
|
|
def create_coref_layer(knaf_obj, clusters, term_ids):
    """Add a coreference layer to *knaf_obj* from model clusters.

    Parameters
    ----------
    knaf_obj : KafNafParser
        The NAF object to extend (modified in place).
    clusters : iterable
        Each cluster is a list of (start, end) index pairs into the
        flattened term list; *end* is inclusive.
    term_ids : list of list of str
        Per-sentence term identifiers in document order.

    Returns
    -------
    KafNafParser
        The same object, with one Ccoreference entry per cluster.
    """
    term_ids_list = list(itertools.chain.from_iterable(term_ids))
    for cluster_id, cluster in enumerate(clusters):
        coref_obj = KafNafParserPy.Ccoreference(type=knaf_obj.get_type())
        coref_obj.set_id('co{}'.format(cluster_id + 1))
        coref_obj.set_type('entity')
        for start, end in cluster:
            span_term_ids = term_ids_list[start:end + 1]
            coref_obj.add_span(span_term_ids)
            # Surface text of the span (idiomatic join, replacing the
            # original nested append loops). Currently unused:
            # TODO: output span_text as comment
            span_text = ' '.join(
                knaf_obj.get_token(word_id).get_text()
                for term_id in span_term_ids
                for word_id in knaf_obj.get_term(term_id).get_span_ids())
        knaf_obj.add_coreference(coref_obj)
    return knaf_obj
104
|
|
|
|
105
|
|
|
|
106
|
|
|
def add_linguistic_processors(in_obj):
    """Register this tool in the NAF header's linguistic processors.

    Attaches a Clp record (tool name, package version, current
    timestamp) under the 'coreferences' layer.

    Returns the same NAF object, modified in place.
    """
    processor = KafNafParserPy.Clp()
    processor.set_name(this_name)
    processor.set_version(__version__)
    processor.set_timestamp()
    in_obj.add_linguistic_processor('coreferences', processor)
    return in_obj
116
|
|
|
|
117
|
|
|
|
118
|
|
|
def get_jsonlines(knaf_obj):
    """Convert a NAF object into a jsonlines-style dict for the e2e model.

    Walks the term layer, collecting for every token a
    (sentence id, term id, token id, token text) tuple, then groups
    consecutive tuples by sentence id (itertools.groupby only merges
    adjacent runs, so terms are assumed to appear in document order).

    Returns
    -------
    (dict, list, list)
        A dict with keys 'doc_key', 'sentences' and an empty
        'clusters' list, plus per-sentence term-id and token-id lists
        aligned with 'sentences'.
    """
    rows = []
    for term in knaf_obj.get_terms():
        for tok_id in term.get_span_ids():
            tok = knaf_obj.get_token(tok_id)
            rows.append(
                (tok.get_sent(), term.get_id(), tok_id, tok.get_text()))

    sentences = []
    term_ids = []
    tok_ids = []
    for _sent_id, group in itertools.groupby(rows, key=lambda row: row[0]):
        group = list(group)
        sentences.append([text for _, _, _, text in group])
        term_ids.append([tid for _, tid, _, _ in group])
        tok_ids.append([wid for _, _, wid, _ in group])

    jsonlines_obj = {
        'doc_key': str(knaf_obj.get_filename()),
        'sentences': sentences,
        'clusters': [],
    }
    return jsonlines_obj, term_ids, tok_ids
141
|
|
|
|