features_csv_to_dict.main() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

features_csv_to_dict.main() F
last analyzed 2020-12-31 20:10 UTC

↳ Parent: features_csv_to_dict

Complexity

Conditions

Size

Total Lines	284
Code Lines	176

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	176
dl	0
loc	284
rs	0
c	0
b	0
f	0
cc	42
nop	1

How to fix Long Method Complexity

#!/usr/bin/env python3
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""features_csv_to_dict.py.

This script converts a CSV document of feature values to a Python dict.

The CSV document is of the format
<phonetic symbol(s)>,<variant number>,<segmental>,<feature>+

    Phonetic symbols are IPA (or other system) symbols.
    Variant number refers to one of the following codes:
         0 = IPA
         1 = Americanist
         2 = IPA variant
         3 = Americanist variant
         9 = other
    Segmental must be either 1 (segmental) or 0 (featural). Featural symbols
    replace features on the preceding segmental symbol.
    Features may be 1 (+), -1 (-), 0 (0), or 2 (±) to indicate feature values.

Lines beginning with # are interpreted as comments and are copied to the
output; a line beginning with #### ends processing of the input file.
"""

import codecs
import getopt
import sys
import unicodedata


def main(argv):
    """Convert a feature CSV file into Python dict literals.

    Parses the command-line options, loads the term definitions from
    ``features_terms.csv`` (resolved against the current working directory),
    validates every data row of the input CSV and writes ``PHONETIC_FEATURES``
    and ``FEATURE_MASK`` dict literals to the output file, or to stdout when
    no output file was requested. Validation diagnostics always go to stdout.

    Parameters
    ----------
    argv : list
        Arguments to the script

    """
    # Feature values occupy the columns between the three leading fields
    # (symbol, variant, segmental) and the trailing name field.
    first_col = 3
    last_col = -1

    bit_codes = {'0': '00', '-1': '10', '1': '01'}

    def print_usage():
        """Print usage statement and exit with status 2."""
        sys.stdout.write(
            'features_csv_to_dict.py -i <inputfile> [-o <outputfile>]\n'
        )
        sys.exit(2)

    def binarize(num):
        """Replace 0, -1, 1, 2 with 00, 10, 01, 11.

        Parameters
        ----------
        num : str
            The number to binarize

        Returns
        -------
        str
            A binarized number

        """
        # Anything that is not '0', '-1' or '1' is treated like '2':
        # ± (segmental) or copy from base (non-segmental).
        return bit_codes.get(num, '11')

    def bit_string(values):
        """Return the '0b…' literal for a sequence of feature values."""
        return '0b' + ''.join(binarize(val) for val in values)

    def mask_at(index, count):
        """Return the two-bit mask of feature ``index`` out of ``count``.

        The first feature occupies the most significant bit pair.
        """
        return 0b11 << (2 * (count - index - 1))

    def init_termdicts():
        """Initialize the terms dict.

        Returns
        -------
        (dict, dict)
            Term & feature mask dictionaries

        """
        # The context manager closes the file even if a row is malformed.
        with codecs.open('features_terms.csv', 'r', 'utf-8') as term_file:
            keyline = (
                term_file.readline().strip().split(',')[first_col:last_col]
            )
            feature_mask = {
                key: mask_at(i, len(keyline)) for i, key in enumerate(keyline)
            }

            termdict = {}
            for line in term_file:
                line = line.strip().rstrip(',')
                if '#' in line:
                    line = line[: line.find('#')].strip()
                if line:
                    fields = line.split(',')
                    termdict[fields[last_col]] = int(
                        bit_string(fields[first_col:last_col]), 2
                    )

        return termdict, feature_mask

    def check_terms(sym, features, name, termdict):
        """Check terms.

        Check each term of the phone name to confirm that it matches
        the expected features implied by that feature.

        Parameters
        ----------
        sym : str
            Symbol to check
        features : int
            Phone features
        name : str
            Phone name
        termdict : dict
            Dictionary of terms

        """
        if '#' in name:
            name = name[: name.find('#')].strip()
        for term in name.split():
            if term not in termdict:
                sys.stdout.write(
                    'Unknown term "' + term + '" in ' + name + ' : ' + sym
                    + '\n'
                )
            elif (termdict[term] & features) != termdict[term]:
                sys.stdout.write(
                    'Feature mismatch for term "' + term + '" in   ' + sym
                    + '\n'
                )

    def check_entailments(sym, features, feature_mask):
        """Check entailments.

        Check for necessary feature assignments (entailments)
        For example, [+round] necessitates [+labial].

        Parameters
        ----------
        sym : str
            Symbol to check
        features : int
            Phone features
        feature_mask : dict
            The feature mask

        Raises
        ------
        ValueError
            If an entailment uses a prefix other than +, -, 0 or ±

        """
        entailments = {
            '+labial': ('±round', '±protruded', '±compressed', '±labiodental'),
            '-labial': ('0round', '0protruded', '0compressed', '0labiodental'),
            '+coronal': ('±anterior', '±distributed'),
            '-coronal': ('0anterior', '0distributed'),
            '+dorsal': ('±high', '±low', '±front', '±back', '±tense'),
            '-dorsal': ('0high', '0low', '0front', '0back', '0tense'),
            '+pharyngeal': ('±atr', '±rtr'),
            '-pharyngeal': ('0atr', '0rtr'),
            '+protruded': ('+labial', '+round', '-compressed'),
            '+compressed': ('+labial', '+round', '-protruded'),
            '+glottalic_suction': ('-velaric_suction',),
            '+velaric_suction': ('-glottalic_suction',),
        }

        def plus_bit(mask):
            """Return the low bit of a feature's pair (set for '+')."""
            return (mask >> 1) & mask

        def minus_bit(mask):
            """Return the high bit of a feature's pair (set for '-')."""
            return (mask << 1) & mask

        for feature, entailed in entailments.items():
            fname = feature[1:]
            if feature[0] == '+':
                trigger = plus_bit(feature_mask[fname])
            else:
                trigger = minus_bit(feature_mask[fname])
            if (features & trigger) != trigger:
                continue

            for ent in entailed:
                prefix, ename = ent[0], ent[1:]
                emask = feature_mask[ename]
                if prefix == '±':
                    # Any specified value (+, - or ±) satisfies the rule.
                    satisfied = (features & emask) != 0
                elif prefix == '0':
                    # The entailed feature must be unspecified (bits 00).
                    # Masking with 0, as was done before, could never fail.
                    satisfied = (features & emask) == 0
                elif prefix == '+':
                    satisfied = (features & plus_bit(emask)) == plus_bit(emask)
                elif prefix == '-':
                    satisfied = (
                        features & minus_bit(emask)
                    ) == minus_bit(emask)
                else:
                    raise ValueError(
                        'Unknown entailment prefix in "' + ent + '"'
                    )

                if not satisfied:
                    sys.stdout.write(
                        'Incorrect entailment for '
                        + sym
                        + ' for feature '
                        + fname
                        + ' and entailment '
                        + ename
                        + '\n'
                    )

    def convert(ifile, ofile, termdict, feature_mask):
        """Validate the rows of ``ifile`` and write both dicts to ``ofile``."""
        seen_symbols = set()  # symbols seen so far
        primary_symbol = {}  # feature value -> first primary symbol using it

        # Entries are aligned with the character after the opening brace.
        features_indent = ' ' * len('PHONETIC_FEATURES = {')
        mask_indent = ' ' * len('FEATURE_MASK = {')

        ofile.write('PHONETIC_FEATURES = {' + '\n')

        keyline = ifile.readline().strip().split(',')[first_col:last_col]
        for line in ifile:
            line = line.strip().rstrip(',')

            if line.startswith('####'):
                break

            line = unicodedata.normalize('NFC', line)

            if not line or line.startswith('#'):
                # Blank lines and comments are echoed into the output.
                oline = features_indent + line
            else:
                fields = line.strip().split(',')
                if '#' in fields:
                    # Drop a bare '#' field and everything after it.
                    fields = fields[: fields.index('#')]
                symbol = fields[0]
                variant = int(fields[1])
                # NOTE(review): bool() of any non-empty string is True, so a
                # segmental column of '0' is still treated as segmental and
                # only an empty column negates the value. Left unchanged
                # because fixing it alters the generated data; confirm the
                # intended encoding of featural symbols before changing it.
                segmental = bool(fields[2])
                features = bit_string(fields[first_col:last_col])
                name = fields[-1].strip()
                if not segmental:
                    features = '-' + features

                featint = int(features, 2)
                check_terms(symbol, featint, name, termdict)
                check_entailments(symbol, featint, feature_mask)
                if symbol in seen_symbols:
                    sys.stdout.write(
                        'Symbol ' + symbol + ' appears twice in CSV.\n'
                    )
                else:
                    seen_symbols.add(symbol)

                if variant < 2:
                    if featint in primary_symbol:
                        sys.stdout.write(
                            'Feature set '
                            + str(featint)
                            + ' appears in CSV for two primary IPA '
                            + 'symbols: '
                            + symbol
                            + ' and '
                            + primary_symbol[featint]
                            + '\n'
                        )
                    else:
                        primary_symbol[featint] = symbol

                if variant < 5:
                    oline = features_indent + "'{}': {},".format(
                        symbol, featint
                    )
                else:
                    oline = ''

            if oline:
                ofile.write(oline + '\n')

        ofile.write(features_indent[:-1] + '}\n\nFEATURE_MASK = {')

        for i, key in enumerate(keyline):
            ofile.write(
                mask_indent
                + "'{}': {},".format(key, mask_at(i, len(keyline)))
                + '\n'
            )

        ofile.write(mask_indent[:-1] + '}\n')

    # Parse the options before touching any file so that -h and usage errors
    # work even when features_terms.csv is not present.
    in_path = ''
    out_path = ''
    try:
        opts = getopt.getopt(argv, 'hi:o:', ['ifile=', 'ofile='])[0]
    except getopt.GetoptError:
        print_usage()
    for opt, arg in opts:
        if opt == '-h':
            print_usage()
        elif opt in ('-i', '--ifile'):
            in_path = arg
        elif opt in ('-o', '--ofile'):
            out_path = arg
    if not in_path:
        print_usage()

    termdict, feature_mask = init_termdicts()

    with codecs.open(in_path, 'r', 'utf-8') as ifile:
        if out_path:
            ofile = codecs.open(out_path, 'w', 'utf-8')
        else:
            ofile = sys.stdout
        try:
            convert(ifile, ofile, termdict, feature_mask)
        finally:
            # Never close the interpreter's stdout, only files opened here.
            if ofile is not sys.stdout:
                ofile.close()


if __name__ == '__main__':
    # Run the converter with every command-line argument except the
    # program name.
    main(argv=sys.argv[1:])


1			#!/usr/bin/env python3
2			# Copyright 2014-2020 by Christopher C. Little.
3			# This file is part of Abydos.
4			#
5			# Abydos is free software: you can redistribute it and/or modify
6			# it under the terms of the GNU General Public License as published by
7			# the Free Software Foundation, either version 3 of the License, or
8			# (at your option) any later version.
9			#
10			# Abydos is distributed in the hope that it will be useful,
11			# but WITHOUT ANY WARRANTY; without even the implied warranty of
12			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13			# GNU General Public License for more details.
14			#
15			# You should have received a copy of the GNU General Public License
16			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
17
18			"""features_csv_to_dict.py.
19
20			This script converts a CSV document of feature values to a Python dict.
21
22			The CSV document is of the format
23			<phonetic symbol(s)>,<variant number>,<segmental>,<feature>+
24
25			Phonetic symbols are IPA (or other system) symbols.
26			Variant number refers to one of the following codes:
27			0 = IPA
28			1 = Americanist
29			2 = IPA variant
30			3 = Americanist variant
31			9 = other
32			Segmental must be either 1 (segmental) or 0 (featural). Featural symbols
33			replace features on the preceding segmental symbol.
34			Features may be 1 (+), -1 (-), or 0 (0) to indicate feature values.
35
36			Lines beginning with # are interpreted as comments
37			"""
38
39			import codecs
40			import getopt
41			import sys
42			import unicodedata
43
44
45			def main(argv):
46			"""Read input file and write to output.
47
48			Parameters
49			----------
50			argv : list
51			Arguments to the script
52
53			"""
54			first_col = 3
55			last_col = -1
56
57			def print_usage():
58			"""Print usage statement."""
59			sys.stdout.write(
60			'features_csv_to_dict.py -i <inputfile> ' + '[-o <outputfile>]\n'
61			)
62			sys.exit(2)
63
64			def binarize(num):
65			"""Replace 0, -1, 1, 2 with 00, 10, 01, 11.
66
67			Parameters
68			----------
69			num : str
70			The number to binarize
71
72			Returns
73			-------
74			str
75			A binarized number
76
77			"""
78			if num == '0': # 0
79			return '00'
80			elif num == '-1': # -
81			return '10'
82			elif num == '1': # +
83			return '01'
84			# '2' -> ± (segmental) or copy from base (non-segmental)
85			return '11'
86
87			def init_termdicts():
88			"""Initialize the terms dict.
89
90			Returns
91			-------
92			(dict, dict)
93			Term & feature mask dictionaries
94
95			"""
96			ifile = codecs.open('features_terms.csv', 'r', 'utf-8')
97
98			feature_mask = {}
99			keyline = ifile.readline().strip().split(',')[first_col:last_col]
100			mag = len(keyline)
101			for i in range(len(keyline)):
102			features = '0b' + ('00' * i) + '11' + ('00' * (mag - i - 1))
103			feature_mask[keyline[i]] = int(features, 2)
104
105			termdict = {}
106			for line in ifile:
107			line = line.strip().rstrip(',')
108			if '#' in line:
109			line = line[: line.find('#')].strip()
110			if line:
111			line = line.split(',')
112			term = line[last_col]
113			features = '0b' + ''.join(
114			[binarize(val) for val in line[first_col:last_col]]
115			)
116			termdict[term] = int(features, 2)
117
118			return termdict, feature_mask
119
120			def check_terms(sym, features, name, termdict):
121			"""Check terms.
122
123			Check each term of the phone name to confirm that it matches
124			the expected features implied by that feature.
125
126			Parameters
127			----------
128			sym : str
129			Symbol to check
130			features : int
131			Phone features
132			name : str
133			Phone name
134			termdict : dict
135			Dictionary of terms
136
137			"""
138			if '#' in name:
139			name = name[: name.find('#')].strip()
140			for term in name.split():
141			if term in termdict:
142			if termdict[term] & features != termdict[term]:
143			sys.stdout.write(
144			'Feature mismatch for term "'
145			+ term
146			+ '" in '
147			+ sym
148			+ '\n'
149			)
150			else:
151			sys.stdout.write(
152			'Unknown term "'
153			+ term
154			+ '" in '
155			+ name
156			+ ' : '
157			+ sym
158			+ '\n'
159			)
160
161			def check_entailments(sym, features, feature_mask):
162			"""Check entailments.
163
164			Check for necessary feature assignments (entailments)
165			For example, [+round] necessitates [+labial].
166
167			Parameters
168			----------
169			sym : str
170			Symbol to check
171			features : int
172			Phone features
173			feature_mask : dict
174			The feature mask
175
176			"""
177			entailments = {
178			'+labial': ('±round', '±protruded', '±compressed', '±labiodental'),
179			'-labial': ('0round', '0protruded', '0compressed', '0labiodental'),
180			'+coronal': ('±anterior', '±distributed'),
181			'-coronal': ('0anterior', '0distributed'),
182			'+dorsal': ('±high', '±low', '±front', '±back', '±tense'),
183			'-dorsal': ('0high', '0low', '0front', '0back', '0tense'),
184			'+pharyngeal': ('±atr', '±rtr'),
185			'-pharyngeal': ('0atr', '0rtr'),
186			'+protruded': ('+labial', '+round', '-compressed'),
187			'+compressed': ('+labial', '+round', '-protruded'),
188			'+glottalic_suction': ('-velaric_suction',),
189			'+velaric_suction': ('-glottalic_suction',),
190			}
191
192			for feature in entailments:
193			fname = feature[1:]
194			if feature[0] == '+':
195			fm = (feature_mask[fname] >> 1) & feature_mask[fname]
196			else:
197			fm = (feature_mask[fname] << 1) & feature_mask[fname]
198			if (features & fm) == fm:
199			for ent in entailments[feature]:
200			ename = ent[1:]
201			if ent[0] == '+':
202			efm = (feature_mask[ename] >> 1) & feature_mask[ename]
203			elif ent[0] == '-':
204			efm = (feature_mask[ename] << 1) & feature_mask[ename]
205			elif ent[0] == '0':
206			efm = 0
207			elif ent[0] == '±':
208			efm = feature_mask[ename]
209
210			if ent[0] == '±':
211			if (features & efm) == 0:
			Issue (introduced 2018-10-24 06:00 UTC): The variable `efm` does not seem to be defined for all execution paths.
212			sys.stdout.write(
213			'Incorrect entailment for '
214			+ sym
215			+ ' for feature '
216			+ fname
217			+ ' and entailment '
218			+ ename
219			)
220			else:
221			if (features & efm) != efm:
222			sys.stdout.write(
223			'Incorrect entailment for '
224			+ sym
225			+ ' for feature '
226			+ fname
227			+ ' and entailment '
228			+ ename
229			)
230
231			checkdict = {} # a mapping of symbol to feature
232			checkset_s = set() # a set of the symbols seen
233			checkset_f = set() # a set of the feature values seen
234
235			termdict, feature_mask = init_termdicts()
236
237			ifile = ''
238			ofile = ''
239			try:
240			opts = getopt.getopt(argv, 'hi:o:', ['ifile=', 'ofile='])[0]
241			except getopt.GetoptError:
242			print_usage()
243			for opt, arg in opts:
244			if opt == '-h':
245			print_usage()
246			elif opt in ('-i', '--ifile'):
247			ifile = codecs.open(arg, 'r', 'utf-8')
248			elif opt in ('-o', '--ofile'):
249			ofile = codecs.open(arg, 'w', 'utf-8')
250			if not ifile:
251			print_usage()
252
253			oline = 'PHONETIC_FEATURES = {'
254			if not ofile:
255			ofile = sys.stdout
256
257			ofile.write(oline + '\n')
258
259			keyline = ifile.readline().strip().split(',')[first_col:last_col]
260			for line in ifile:
261			line = line.strip().rstrip(',')
262
263			if line.startswith('####'):
264			break
265
266			line = unicodedata.normalize('NFC', line)
267
268			if not line or line.startswith('#'):
269			oline = ' ' + line
270
271			else:
272			line = line.strip().split(',')
273			if '#' in line:
274			line = line[: line.find('#')]
275			symbol = line[0]
276			variant = int(line[1])
277			segmental = bool(line[2])
278			features = '0b' + ''.join(
279			[binarize(val) for val in line[first_col:last_col]]
280			)
281			name = line[-1].strip()
282			if not segmental:
283			features = '-' + features
284
285			featint = int(features, 2)
286			check_terms(symbol, featint, name, termdict)
287			check_entailments(symbol, featint, feature_mask)
288			if symbol in checkset_s:
289			sys.stdout.write(
290			'Symbol ' + symbol + ' appears twice in CSV.\n'
291			)
292			else:
293			checkset_s.add(symbol)
294
295			if variant < 2:
296			if featint in checkset_f:
297			sys.stdout.write(
298			'Feature set '
299			+ str(featint)
300			+ ' appears in CSV for two primary IPA '
301			+ 'symbols: '
302			+ symbol
303			+ ' and '
304			+ checkdict[featint]
305			)
306			else:
307			checkdict[featint] = symbol
308			checkset_f.add(featint)
309
310			if variant < 5:
311			oline = " '{}': {},".format(
312			symbol, featint
313			)
314			else:
315			oline = ''
316
317			if oline:
318			ofile.write(oline + '\n')
319
320			ofile.write(' }\n\nFEATURE_MASK = {')
321
322			mag = len(keyline)
323			for i in range(len(keyline)):
324			features = int('0b' + ('00' * i) + '11' + ('00' * (mag - i - 1)), 2)
325			oline = " '{}': {},".format(keyline[i], features)
326			ofile.write(oline + '\n')
327
328			ofile.write(' }\n')
329
330
331			if __name__ == '__main__':
332			main(sys.argv[1:])
333

chrislit / abydos

features_csv_to_dict.main() F last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

features_csv_to_dict.main() F
last analyzed 2020-12-31 20:10 UTC