Code Duplication - mila-udem/fuel - Measure and Improve Code Quality continuously with Scrutinizer

Code Duplication Length = 64-70 lines in 2 locations

fuel/converters/ilsvrc2010.py 1 location


    return synsets, cost_matrix, raw_valid_groundtruth


def read_metadata_mat_file(meta_mat):
    """Read ILSVRC2010 metadata from the distributed MAT file.

    Parameters
    ----------
    meta_mat : str or file-like object
        The filename or file-handle for `meta.mat` from the
        ILSVRC2010 development kit.

    Returns
    -------
    synsets : ndarray, 1-dimensional, compound dtype
        A table containing ILSVRC2010 metadata for the "synonym sets"
        or "synsets" that comprise the classes and superclasses,
        including the following fields:
         * `ILSVRC2010_ID`: the integer ID used in the original
           competition data.
         * `WNID`: A string identifier that uniquely identifies
           a synset in ImageNet and WordNet.
         * `wordnet_height`: The length of the longest path to
           a leaf node in the FULL ImageNet/WordNet hierarchy
           (leaf nodes in the FULL ImageNet/WordNet hierarchy
           have `wordnet_height` 0).
         * `gloss`: A string representation of an English
           textual description of the concept represented by
           this synset.
         * `num_children`: The number of children in the hierarchy
           for this synset.
         * `words`: A string representation, comma separated,
           of different synonym words or phrases for the concept
           represented by this synset.
         * `children`: A vector of `ILSVRC2010_ID`s of children
           of this synset, padded with -1. Note that these refer
           to `ILSVRC2010_ID`s from the original data and *not*
           the zero-based index in the table. Stored as int16,
           the same width as `ILSVRC2010_ID`.
         * `num_train_images`: The number of training images for
           this synset.
    cost_matrix : ndarray, 2-dimensional, uint8
        A 1000x1000 matrix containing the precomputed pairwise
        cost (based on distance in the hierarchy) for all
        low-level synsets (i.e. the thousand possible output
        classes with training data associated).

    """
    mat = loadmat(meta_mat, squeeze_me=True)
    synsets = mat['synsets']
    cost_matrix = mat['cost_matrix']
    # Width of the padded `children` vector: the largest child count.
    max_children = int(max(synsets['num_children']))
    new_dtype = numpy.dtype([
        ('ILSVRC2010_ID', numpy.int16),
        ('WNID', ('S', max(map(len, synsets['WNID'])))),
        ('wordnet_height', numpy.int8),
        ('gloss', ('S', max(map(len, synsets['gloss'])))),
        ('num_children', numpy.int8),
        ('words', ('S', max(map(len, synsets['words'])))),
        # Children hold `ILSVRC2010_ID` values, so they must be as wide
        # as that field; int8 would silently wrap any ID above 127.
        # The shape is an explicit tuple so that a maximum of one child
        # still produces a 1-dimensional sub-array field.
        ('children', (numpy.int16, (max_children,))),
        ('num_train_images', numpy.uint16)
    ])
    new_synsets = numpy.empty(synsets.shape, dtype=new_dtype)
    for attr in ['ILSVRC2010_ID', 'WNID', 'wordnet_height', 'gloss',
                 'num_children', 'words', 'num_train_images']:
        new_synsets[attr] = synsets[attr]
    # `squeeze_me=True` collapses single-child entries to scalars, so
    # promote every entry back to a 1-d vector before padding.
    children = [numpy.atleast_1d(ch) for ch in synsets['children']]
    # Start from an all -1 table and overwrite the leading slots of
    # each row with the actual child IDs.
    padded_children = numpy.full((len(children), max_children), -1,
                                 dtype=numpy.int16)
    for row, child_ids in zip(padded_children, children):
        row[:len(child_ids)] = child_ids
    new_synsets['children'] = padded_children
    return new_synsets, cost_matrix


def extract_patch_images(f, which_set):

fuel/converters/ilsvrc2012.py 1 location


    return synsets, raw_valid_groundtruth


def read_metadata_mat_file(meta_mat):
    """Read ILSVRC2012 metadata from the distributed MAT file.

    Parameters
    ----------
    meta_mat : str or file-like object
        The filename or file-handle for `meta.mat` from the
        ILSVRC2012 development kit.

    Returns
    -------
    synsets : ndarray, 1-dimensional, compound dtype
        A table containing ILSVRC2012 metadata for the "synonym sets"
        or "synsets" that comprise the classes and superclasses,
        including the following fields:
         * `ILSVRC2012_ID`: the integer ID used in the original
           competition data.
         * `WNID`: A string identifier that uniquely identifies
           a synset in ImageNet and WordNet.
         * `wordnet_height`: The length of the longest path to
           a leaf node in the FULL ImageNet/WordNet hierarchy
           (leaf nodes in the FULL ImageNet/WordNet hierarchy
           have `wordnet_height` 0).
         * `gloss`: A string representation of an English
           textual description of the concept represented by
           this synset.
         * `num_children`: The number of children in the hierarchy
           for this synset.
         * `words`: A string representation, comma separated,
           of different synonym words or phrases for the concept
           represented by this synset.
         * `children`: A vector of `ILSVRC2012_ID`s of children
           of this synset, padded with -1. Note that these refer
           to `ILSVRC2012_ID`s from the original data and *not*
           the zero-based index in the table. Stored as int16,
           the same width as `ILSVRC2012_ID`.
         * `num_train_images`: The number of training images for
           this synset.

    """
    mat = loadmat(meta_mat, squeeze_me=True)
    synsets = mat['synsets']
    # Width of the padded `children` vector: the largest child count.
    max_children = int(max(synsets['num_children']))
    new_dtype = numpy.dtype([
        ('ILSVRC2012_ID', numpy.int16),
        ('WNID', ('S', max(map(len, synsets['WNID'])))),
        ('wordnet_height', numpy.int8),
        ('gloss', ('S', max(map(len, synsets['gloss'])))),
        ('num_children', numpy.int8),
        ('words', ('S', max(map(len, synsets['words'])))),
        # Children hold `ILSVRC2012_ID` values, so they must be as wide
        # as that field; int8 would silently wrap any ID above 127.
        # The shape is an explicit tuple so that a maximum of one child
        # still produces a 1-dimensional sub-array field.
        ('children', (numpy.int16, (max_children,))),
        ('num_train_images', numpy.uint16)
    ])
    new_synsets = numpy.empty(synsets.shape, dtype=new_dtype)
    for attr in ['ILSVRC2012_ID', 'WNID', 'wordnet_height', 'gloss',
                 'num_children', 'words', 'num_train_images']:
        new_synsets[attr] = synsets[attr]
    # `squeeze_me=True` collapses single-child entries to scalars, so
    # promote every entry back to a 1-d vector before padding.
    children = [numpy.atleast_1d(ch) for ch in synsets['children']]
    # Start from an all -1 table and overwrite the leading slots of
    # each row with the actual child IDs.
    padded_children = numpy.full((len(children), max_children), -1,
                                 dtype=numpy.int16)
    for row, child_ids in zip(padded_children, children):
        row[:len(child_ids)] = child_ids
    new_synsets['children'] = padded_children
    return new_synsets


		@@ 461-530 (lines=70) @@
458		return synsets, cost_matrix, raw_valid_groundtruth
459
460
461		def read_metadata_mat_file(meta_mat):
462		"""Read ILSVRC2010 metadata from the distributed MAT file.
463
464		Parameters
465		----------
466		meta_mat : str or file-like object
467		The filename or file-handle for `meta.mat` from the
468		ILSVRC2010 development kit.
469
470		Returns
471		-------
472		synsets : ndarray, 1-dimensional, compound dtype
473		A table containing ILSVRC2010 metadata for the "synonym sets"
474		or "synsets" that comprise the classes and superclasses,
475		including the following fields:
476		* `ILSVRC2010_ID`: the integer ID used in the original
477		competition data.
478		* `WNID`: A string identifier that uniquely identifies
479		a synset in ImageNet and WordNet.
480		* `wordnet_height`: The length of the longest path to
481		a leaf node in the FULL ImageNet/WordNet hierarchy
482		(leaf nodes in the FULL ImageNet/WordNet hierarchy
483		have `wordnet_height` 0).
484		* `gloss`: A string representation of an English
485		textual description of the concept represented by
486		this synset.
487		* `num_children`: The number of children in the hierarchy
488		for this synset.
489		* `words`: A string representation, comma separated,
490		of different synonym words or phrases for the concept
491		represented by this synset.
492		* `children`: A vector of `ILSVRC2010_ID`s of children
493		of this synset, padded with -1. Note that these refer
494		to `ILSVRC2010_ID`s from the original data and not
495		the zero-based index in the table.
496		* `num_train_images`: The number of training images for
497		this synset.
498		cost_matrix : ndarray, 2-dimensional, uint8
499		A 1000x1000 matrix containing the precomputed pairwise
500		cost (based on distance in the hierarchy) for all
501		low-level synsets (i.e. the thousand possible output
502		classes with training data associated).
503
504		"""
505		mat = loadmat(meta_mat, squeeze_me=True)
506		synsets = mat['synsets']
507		cost_matrix = mat['cost_matrix']
508		new_dtype = numpy.dtype([
509		('ILSVRC2010_ID', numpy.int16),
510		('WNID', ('S', max(map(len, synsets['WNID'])))),
511		('wordnet_height', numpy.int8),
512		('gloss', ('S', max(map(len, synsets['gloss'])))),
513		('num_children', numpy.int8),
514		('words', ('S', max(map(len, synsets['words'])))),
515		('children', (numpy.int8, max(synsets['num_children']))),
516		('num_train_images', numpy.uint16)
517		])
518		new_synsets = numpy.empty(synsets.shape, dtype=new_dtype)
519		for attr in ['ILSVRC2010_ID', 'WNID', 'wordnet_height', 'gloss',
520		'num_children', 'words', 'num_train_images']:
521		new_synsets[attr] = synsets[attr]
522		children = [numpy.atleast_1d(ch) for ch in synsets['children']]
523		padded_children = [
524		numpy.concatenate((c,
525		-numpy.ones(new_dtype['children'].shape[0] - len(c),
526		dtype=numpy.int16)))
527		for c in children
528		]
529		new_synsets['children'] = padded_children
530		return new_synsets, cost_matrix
531
532
533		def extract_patch_images(f, which_set):

		@@ 231-294 (lines=64) @@
228		return synsets, raw_valid_groundtruth
229
230
231		def read_metadata_mat_file(meta_mat):
232		"""Read ILSVRC2012 metadata from the distributed MAT file.
233
234		Parameters
235		----------
236		meta_mat : str or file-like object
237		The filename or file-handle for `meta.mat` from the
238		ILSVRC2012 development kit.
239
240		Returns
241		-------
242		synsets : ndarray, 1-dimensional, compound dtype
243		A table containing ILSVRC2012 metadata for the "synonym sets"
244		or "synsets" that comprise the classes and superclasses,
245		including the following fields:
246		* `ILSVRC2012_ID`: the integer ID used in the original
247		competition data.
248		* `WNID`: A string identifier that uniquely identifies
249		a synset in ImageNet and WordNet.
250		* `wordnet_height`: The length of the longest path to
251		a leaf node in the FULL ImageNet/WordNet hierarchy
252		(leaf nodes in the FULL ImageNet/WordNet hierarchy
253		have `wordnet_height` 0).
254		* `gloss`: A string representation of an English
255		textual description of the concept represented by
256		this synset.
257		* `num_children`: The number of children in the hierarchy
258		for this synset.
259		* `words`: A string representation, comma separated,
260		of different synonym words or phrases for the concept
261		represented by this synset.
262		* `children`: A vector of `ILSVRC2012_ID`s of children
263		of this synset, padded with -1. Note that these refer
264		to `ILSVRC2012_ID`s from the original data and not
265		the zero-based index in the table.
266		* `num_train_images`: The number of training images for
267		this synset.
268
269		"""
270		mat = loadmat(meta_mat, squeeze_me=True)
271		synsets = mat['synsets']
272		new_dtype = numpy.dtype([
273		('ILSVRC2012_ID', numpy.int16),
274		('WNID', ('S', max(map(len, synsets['WNID'])))),
275		('wordnet_height', numpy.int8),
276		('gloss', ('S', max(map(len, synsets['gloss'])))),
277		('num_children', numpy.int8),
278		('words', ('S', max(map(len, synsets['words'])))),
279		('children', (numpy.int8, max(synsets['num_children']))),
280		('num_train_images', numpy.uint16)
281		])
282		new_synsets = numpy.empty(synsets.shape, dtype=new_dtype)
283		for attr in ['ILSVRC2012_ID', 'WNID', 'wordnet_height', 'gloss',
284		'num_children', 'words', 'num_train_images']:
285		new_synsets[attr] = synsets[attr]
286		children = [numpy.atleast_1d(ch) for ch in synsets['children']]
287		padded_children = [
288		numpy.concatenate((c,
289		-numpy.ones(new_dtype['children'].shape[0] - len(c),
290		dtype=numpy.int16)))
291		for c in children
292		]
293		new_synsets['children'] = padded_children
294		return new_synsets
295

mila-udem / fuel

Code Duplication Length = 64-70 lines in 2 locations

fuel/converters/ilsvrc2010.py 1 location

fuel/converters/ilsvrc2012.py 1 location