Completed
Push to master (3e1d4c...f31f72) by Bart, 27s

train_set_producer()   Rating: F

Complexity
    Conditions   11
Size
    Total Lines  38
Duplication
    Lines        0
    Ratio        0 %

Metric   Value
cc       11
dl       0
loc      38
rs       3.1764

How to fix: Complexity

Complex functions and classes like train_set_producer() often do a lot of different things. To break such code down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields or methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
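
Applied to this report's target, which is a function rather than a class, the same idea corresponds to an Extract Function refactoring. The sketch below is illustrative only and is not the project's actual fix; the helper name _produce_inner_tar is hypothetical, and it assumes the helpers shown later in the listing (tar_open, load_from_tar_or_patch, extract_patch_images) behave as documented there:

    def _produce_inner_tar(socket, inner, class_index, patch_images):
        # Hypothetical helper: push every image found in one per-class
        # inner TAR, returning how many patch images were substituted.
        filenames = sorted(info.name for info in inner if info.isfile())
        num_patched = 0
        for filename in filenames:
            image_data, patched = load_from_tar_or_patch(inner, filename,
                                                         patch_images)
            num_patched += int(patched)
            socket.send_pyobj((os.path.basename(filename), class_index),
                              zmq.SNDMORE)
            socket.send(image_data)
        return num_patched


    def train_set_producer(socket, train_archive, patch_archive, wnid_map):
        # The outer function now only walks the outer TAR and delegates
        # per-class work, which removes several nested conditions.
        patch_images = extract_patch_images(patch_archive, 'train')
        num_patched = 0
        with tar_open(train_archive) as tar:
            for inner_tar_info in tar:
                with tar_open(tar.extractfile(inner_tar_info.name)) as inner:
                    wnid = inner_tar_info.name.split('.')[0]
                    num_patched += _produce_inner_tar(socket, inner,
                                                      wnid_map[wnid],
                                                      patch_images)
        if num_patched != len(patch_images):
            raise ValueError('not all patch images were used')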

from __future__ import division
from collections import OrderedDict
from functools import partial
import gzip
import io
import os
import logging
import os.path

import h5py
import numpy
from picklable_itertools.extras import equizip
from PIL import Image
from scipy.io.matlab import loadmat
from six.moves import zip, xrange

[Issue: Bug / Best Practice] This seems to re-define the built-in zip. It is generally discouraged to redefine built-ins as this makes code very hard to read.
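
One possible way to address this (a suggestion only, not part of the file under review) is to import the six.moves version under an alias and update its uses below accordingly, e.g.:

    from six.moves import zip as zip_, xrange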

import zmq

from fuel.converters.base import check_exists, progress_bar
from fuel.datasets import H5PYDataset
from fuel.utils.formats import tar_open
from fuel.utils.parallel import producer_consumer
from fuel import config

log = logging.getLogger(__name__)

[Issue: Coding Style / Naming] The name log does not conform to the constant naming conventions ((([A-Z_][A-Z0-9_]*)|(__.*__))$). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements; if your project includes a Pylint configuration file, the settings contained in that file take precedence.

DEVKIT_ARCHIVE = 'ILSVRC2010_devkit-1.0.tar.gz'
DEVKIT_META_PATH = 'devkit-1.0/data/meta.mat'
DEVKIT_VALID_GROUNDTRUTH_PATH = ('devkit-1.0/data/'
                                 'ILSVRC2010_validation_ground_truth.txt')
PATCH_IMAGES_TAR = 'patch_images.tar'
TEST_GROUNDTRUTH = 'ILSVRC2010_test_ground_truth.txt'
TRAIN_IMAGES_TAR = 'ILSVRC2010_images_train.tar'
VALID_IMAGES_TAR = 'ILSVRC2010_images_val.tar'
TEST_IMAGES_TAR = 'ILSVRC2010_images_test.tar'
IMAGE_TARS = (TRAIN_IMAGES_TAR, VALID_IMAGES_TAR, TEST_IMAGES_TAR,
              PATCH_IMAGES_TAR)
PUBLIC_FILES = TEST_GROUNDTRUTH, DEVKIT_ARCHIVE
ALL_FILES = PUBLIC_FILES + IMAGE_TARS

@check_exists(required_files=ALL_FILES)
def convert_ilsvrc2010(directory, output_directory,
                       output_filename='ilsvrc2010.hdf5',
                       shuffle_seed=config.default_seed):
    """Converter for data from the ILSVRC 2010 competition.

    Source files for this dataset can be obtained by registering at
    [ILSVRC2010WEB].

    Parameters
    ----------
    input_directory : str
        Path from which to read raw data files.
    output_directory : str
        Path to which to save the HDF5 file.
    output_filename : str, optional
        The output filename for the HDF5 file. Default: 'ilsvrc2010.hdf5'.
    shuffle_seed : int or sequence, optional
        Seed for a random number generator used to shuffle the order
        of the training set on disk, so that sequential reads will not
        be ordered by class.

    .. [ILSVRC2010WEB] http://image-net.org/challenges/LSVRC/2010/index

    """
    devkit_path = os.path.join(directory, DEVKIT_ARCHIVE)
    test_groundtruth_path = os.path.join(directory, TEST_GROUNDTRUTH)
    train, valid, test, patch = [os.path.join(directory, fn)
                                 for fn in IMAGE_TARS]
    n_train, valid_groundtruth, test_groundtruth, wnid_map = \
        prepare_metadata(devkit_path, test_groundtruth_path)
    n_valid, n_test = len(valid_groundtruth), len(test_groundtruth)
    output_path = os.path.join(output_directory, output_filename)

    with h5py.File(output_path, 'w') as f:
        log.info('Creating HDF5 datasets...')
        prepare_hdf5_file(f, n_train, n_valid, n_test)
        log.info('Processing training set...')
        process_train_set(f, train, patch, n_train, wnid_map, shuffle_seed)
        log.info('Processing validation set...')
        process_other_set(f, 'valid', valid, patch, valid_groundtruth, n_train)
        log.info('Processing test set...')
        process_other_set(f, 'test', test, patch, test_groundtruth,
                          n_train + n_valid)
        log.info('Done.')

    return (output_path,)

def fill_subparser(subparser):
    """Sets up a subparser to convert the ILSVRC2010 dataset files.

    Parameters
    ----------
    subparser : :class:`argparse.ArgumentParser`
        Subparser handling the `ilsvrc2010` command.

    """
    subparser.add_argument(
        "--shuffle-seed", help="Seed to use for randomizing order of the "
                               "training set on disk.",
        default=config.default_seed, type=int, required=False)
    return convert_ilsvrc2010

def prepare_metadata(devkit_archive, test_groundtruth_path):
    """Extract dataset metadata required for HDF5 file setup.

    Parameters
    ----------
    devkit_archive : str or file-like object
        The filename or file-handle for the gzipped TAR archive
        containing the ILSVRC2010 development kit.
    test_groundtruth_path : str or file-like object
        The filename or file-handle for the text file containing
        the ILSVRC2010 test set ground truth.

    Returns
    -------
    n_train : int
        The number of examples in the training set.
    valid_groundtruth : ndarray, 1-dimensional
        An ndarray containing the validation set groundtruth in terms of
        0-based class indices.
    test_groundtruth : ndarray, 1-dimensional
        An ndarray containing the test groundtruth in terms of 0-based
        class indices.
    wnid_map : dict
        A dictionary that maps WordNet IDs to 0-based class indices.

    """
    # Read what's necessary from the development kit.
    synsets, cost_matrix, raw_valid_groundtruth = read_devkit(devkit_archive)

[Issue: Unused Code] The variable cost_matrix seems to be unused.

    # Mapping to take WordNet IDs to our internal 0-999 encoding.
    wnid_map = dict(zip((s.decode('utf8') for s in synsets['WNID']),
                        xrange(1000)))

    # Map the 'ILSVRC2010 ID' to our zero-based ID.
    ilsvrc_id_to_zero_based = dict(zip(synsets['ILSVRC2010_ID'],
                                   xrange(len(synsets))))

    # Map the validation set groundtruth to 0-999 labels.
    valid_groundtruth = [ilsvrc_id_to_zero_based[id_]
                         for id_ in raw_valid_groundtruth]

    # Raw test data groundtruth, ILSVRC2010 IDs.
    raw_test_groundtruth = numpy.loadtxt(test_groundtruth_path,
                                         dtype=numpy.int16)

    # Map the test set groundtruth to 0-999 labels.
    test_groundtruth = [ilsvrc_id_to_zero_based[id_]
                        for id_ in raw_test_groundtruth]

    # Ascertain the number of filenames to prepare appropriate sized
    # arrays.
    n_train = int(synsets['num_train_images'].sum())
    log.info('Training set: {} images'.format(n_train))
    log.info('Validation set: {} images'.format(len(valid_groundtruth)))
    log.info('Test set: {} images'.format(len(test_groundtruth)))
    n_total = n_train + len(valid_groundtruth) + len(test_groundtruth)
    log.info('Total (train/valid/test): {} images'.format(n_total))
    return n_train, valid_groundtruth, test_groundtruth, wnid_map

def create_splits(n_train, n_valid, n_test):
    n_total = n_train + n_valid + n_test
    tuples = {}
    tuples['train'] = (0, n_train)
    tuples['valid'] = (n_train, n_train + n_valid)
    tuples['test'] = (n_train + n_valid, n_total)
    sources = ['encoded_images', 'targets', 'filenames']
    return OrderedDict(
        (split, OrderedDict((source, tuples[split]) for source in sources))
        for split in ('train', 'valid', 'test')
    )

def prepare_hdf5_file(hdf5_file, n_train, n_valid, n_test):
    """Create datasets within a given HDF5 file.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write.
    n_train : int
        The number of training set examples.
    n_valid : int
        The number of validation set examples.
    n_test : int
        The number of test set examples.

    """
    n_total = n_train + n_valid + n_test
    splits = create_splits(n_train, n_valid, n_test)
    hdf5_file.attrs['split'] = H5PYDataset.create_split_array(splits)
    vlen_dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    hdf5_file.create_dataset('encoded_images', shape=(n_total,),
                             dtype=vlen_dtype)
    hdf5_file.create_dataset('targets', shape=(n_total, 1), dtype=numpy.int16)
    hdf5_file.create_dataset('filenames', shape=(n_total, 1), dtype='S32')

def process_train_set(hdf5_file, train_archive, patch_archive, n_train,
                      wnid_map, shuffle_seed=None):
    """Process the ILSVRC2010 training set.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write. Assumes `features`, `targets`
        and `filenames` already exist and have first dimension larger than
        `n_train`.
    train_archive :  str or file-like object
        Filename or file handle for the TAR archive of training images.
    patch_archive :  str or file-like object
        Filename or file handle for the TAR archive of patch images.
    n_train : int
        The number of items in the training set.
    wnid_map : dict
        A dictionary mapping WordNet IDs to class indices.
    shuffle_seed : int or sequence, optional
        Seed for a NumPy random number generator that permutes the
        training set on disk. If `None`, no permutation is performed
        (this is the default).

    """
    producer = partial(train_set_producer, train_archive=train_archive,
                       patch_archive=patch_archive, wnid_map=wnid_map)
    consumer = partial(image_consumer, hdf5_file=hdf5_file,
                       num_expected=n_train, shuffle_seed=shuffle_seed)
    producer_consumer(producer, consumer)

def _write_to_hdf5(hdf5_file, index, image_filename, image_data,
                   class_index):
    hdf5_file['filenames'][index] = image_filename.encode('ascii')
    hdf5_file['encoded_images'][index] = image_data
    hdf5_file['targets'][index] = class_index

def train_set_producer(socket, train_archive, patch_archive, wnid_map):
    """Load/send images from the training set TAR file or patch images.

    Parameters
    ----------
    socket : :class:`zmq.Socket`
        PUSH socket on which to send loaded images.
    train_archive :  str or file-like object
        Filename or file handle for the TAR archive of training images.
    patch_archive :  str or file-like object
        Filename or file handle for the TAR archive of patch images.
    wnid_map : dict
        A dictionary that maps WordNet IDs to 0-based class indices.
        Used to decode the filenames of the inner TAR files.

    """
    patch_images = extract_patch_images(patch_archive, 'train')
    num_patched = 0
    with tar_open(train_archive) as tar:
        for inner_tar_info in tar:
            with tar_open(tar.extractfile(inner_tar_info.name)) as inner:
                wnid = inner_tar_info.name.split('.')[0]
                class_index = wnid_map[wnid]
                filenames = sorted(info.name for info in inner
                                   if info.isfile())
                images_gen = (load_from_tar_or_patch(inner, filename,
                                                     patch_images)
                              for filename in filenames)
                pathless_filenames = (os.path.split(fn)[-1]
                                      for fn in filenames)
                stream = equizip(pathless_filenames, images_gen)
                for image_fn, (image_data, patched) in stream:
                    if patched:
                        num_patched += 1
                    socket.send_pyobj((image_fn, class_index), zmq.SNDMORE)
                    socket.send(image_data)
    if num_patched != len(patch_images):
        raise ValueError('not all patch images were used')

def image_consumer(socket, hdf5_file, num_expected, shuffle_seed=None,
                   offset=0):
    """Fill an HDF5 file with incoming images from a socket.

    Parameters
    ----------
    socket : :class:`zmq.Socket`
        PULL socket on which to receive images.
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write. Assumes `features`, `targets`
        and `filenames` already exist and have first dimension larger than
        `sum(images_per_class)`.
    num_expected : int
        The number of items we expect to be sent over the socket.
    shuffle_seed : int or sequence, optional
        Seed for a NumPy random number generator that permutes the
        images on disk.
    offset : int, optional
        The offset in the HDF5 datasets at which to start writing
        received examples. Defaults to 0.

    """
    with progress_bar('images', maxval=num_expected) as pb:
        if shuffle_seed is None:
            index_gen = iter(xrange(num_expected))
        else:
            rng = numpy.random.RandomState(shuffle_seed)
            index_gen = iter(rng.permutation(num_expected))
        for i, num in enumerate(index_gen):
            image_filename, class_index = socket.recv_pyobj(zmq.SNDMORE)
            image_data = numpy.fromstring(socket.recv(), dtype='uint8')
            _write_to_hdf5(hdf5_file, num + offset, image_filename,
                           image_data, class_index)
            pb.update(i + 1)

def process_other_set(hdf5_file, which_set, image_archive, patch_archive,
                      groundtruth, offset):
    """Process the validation or test set.

    Parameters
    ----------
    hdf5_file : :class:`h5py.File` instance
        HDF5 file handle to which to write. Assumes `features`, `targets`
        and `filenames` already exist and have first dimension larger than
        `sum(images_per_class)`.
    which_set : str
        Which set of images is being processed. One of 'train', 'valid',
        'test'.  Used for extracting the appropriate images from the patch
        archive.
    image_archive : str or file-like object
        The filename or file-handle for the TAR archive containing images.
    patch_archive : str or file-like object
        Filename or file handle for the TAR archive of patch images.
    groundtruth : iterable
        Iterable container containing scalar 0-based class index for each
        image, sorted by filename.
    offset : int
        The offset in the HDF5 datasets at which to start writing.

    """
    producer = partial(other_set_producer, image_archive=image_archive,
                       patch_archive=patch_archive,
                       groundtruth=groundtruth, which_set=which_set)
    consumer = partial(image_consumer, hdf5_file=hdf5_file,
                       num_expected=len(groundtruth), offset=offset)
    producer_consumer(producer, consumer)

def other_set_producer(socket, which_set, image_archive, patch_archive,
                       groundtruth):
    """Push image files read from the valid/test set TAR to a socket.

    Parameters
    ----------
    socket : :class:`zmq.Socket`
        PUSH socket on which to send images.
    which_set : str
        Which set of images is being processed. One of 'train', 'valid',
        'test'.  Used for extracting the appropriate images from the patch
        archive.
    image_archive : str or file-like object
        The filename or file-handle for the TAR archive containing images.
    patch_archive : str or file-like object
        Filename or file handle for the TAR archive of patch images.
    groundtruth : iterable
        Iterable container containing scalar 0-based class index for each
        image, sorted by filename.

    """
    patch_images = extract_patch_images(patch_archive, which_set)
    num_patched = 0
    with tar_open(image_archive) as tar:
        filenames = sorted(info.name for info in tar if info.isfile())
        images = (load_from_tar_or_patch(tar, filename, patch_images)
                  for filename in filenames)
        pathless_filenames = (os.path.split(fn)[-1] for fn in filenames)
        image_iterator = equizip(images, pathless_filenames, groundtruth)
        for (image_data, patched), filename, class_index in image_iterator:
            if patched:
                num_patched += 1
            socket.send_pyobj((filename, class_index), zmq.SNDMORE)
            socket.send(image_data, copy=False)
    if num_patched != len(patch_images):
        raise Exception

def load_from_tar_or_patch(tar, image_filename, patch_images):
    """Do everything necessary to process an image inside a TAR.

    Parameters
    ----------
    tar : `TarFile` instance
        The tar from which to read `image_filename`.
    image_filename : str
        Fully-qualified path inside of `tar` from which to read an
        image file.
    patch_images : dict
        A dictionary containing filenames (without path) of replacements
        to be substituted in place of the version of the same file found
        in `tar`.

    Returns
    -------
    image_data : bytes
        The JPEG bytes representing either the image from the TAR archive
        or its replacement from the patch dictionary.
    patched : bool
        True if the image was retrieved from the patch dictionary. False
        if it was retrieved from the TAR file.

    """
    patched = True
    image_bytes = patch_images.get(os.path.basename(image_filename), None)
    if image_bytes is None:
        patched = False
        try:
            image_bytes = tar.extractfile(image_filename).read()
            numpy.array(Image.open(io.BytesIO(image_bytes)))
        except (IOError, OSError):
            with gzip.GzipFile(fileobj=tar.extractfile(image_filename)) as gz:
                image_bytes = gz.read()
                numpy.array(Image.open(io.BytesIO(image_bytes)))
    return image_bytes, patched

def read_devkit(f):
    """Read relevant information from the development kit archive.

    Parameters
    ----------
    f : str or file-like object
        The filename or file-handle for the gzipped TAR archive
        containing the ILSVRC2010 development kit.

    Returns
    -------
    synsets : ndarray, 1-dimensional, compound dtype
        See :func:`read_metadata_mat_file` for details.
    cost_matrix : ndarray, 2-dimensional, uint8
        See :func:`read_metadata_mat_file` for details.
    raw_valid_groundtruth : ndarray, 1-dimensional, int16
        The labels for the ILSVRC2010 validation set,
        distributed with the development kit code.

    """
    with tar_open(f) as tar:
        # Metadata table containing class hierarchy, textual descriptions, etc.
        meta_mat = tar.extractfile(DEVKIT_META_PATH)
        synsets, cost_matrix = read_metadata_mat_file(meta_mat)

        # Raw validation data groundtruth, ILSVRC2010 IDs. Confusingly
        # distributed inside the development kit archive.
        raw_valid_groundtruth = numpy.loadtxt(tar.extractfile(
            DEVKIT_VALID_GROUNDTRUTH_PATH), dtype=numpy.int16)
    return synsets, cost_matrix, raw_valid_groundtruth

def read_metadata_mat_file(meta_mat):
    """Read ILSVRC2010 metadata from the distributed MAT file.

    Parameters
    ----------
    meta_mat : str or file-like object
        The filename or file-handle for `meta.mat` from the
        ILSVRC2010 development kit.

    Returns
    -------
    synsets : ndarray, 1-dimensional, compound dtype
        A table containing ILSVRC2010 metadata for the "synonym sets"
        or "synsets" that comprise the classes and superclasses,
        including the following fields:
         * `ILSVRC2010_ID`: the integer ID used in the original
           competition data.
         * `WNID`: A string identifier that uniquely identifies
           a synset in ImageNet and WordNet.
         * `wordnet_height`: The length of the longest path to
           a leaf node in the FULL ImageNet/WordNet hierarchy
           (leaf nodes in the FULL ImageNet/WordNet hierarchy
           have `wordnet_height` 0).
         * `gloss`: A string representation of an English
           textual description of the concept represented by
           this synset.
         * `num_children`: The number of children in the hierarchy
           for this synset.
         * `words`: A string representation, comma separated,
           of different synonym words or phrases for the concept
           represented by this synset.
         * `children`: A vector of `ILSVRC2010_ID`s of children
           of this synset, padded with -1. Note that these refer
           to `ILSVRC2010_ID`s from the original data and *not*
           the zero-based index in the table.
         * `num_train_images`: The number of training images for
           this synset.
    cost_matrix : ndarray, 2-dimensional, uint8
        A 1000x1000 matrix containing the precomputed pairwise
        cost (based on distance in the hierarchy) for all
        low-level synsets (i.e. the thousand possible output
        classes with training data associated).

    """
    mat = loadmat(meta_mat, squeeze_me=True)
    synsets = mat['synsets']
    cost_matrix = mat['cost_matrix']
    new_dtype = numpy.dtype([
        ('ILSVRC2010_ID', numpy.int16),
        ('WNID', ('S', max(map(len, synsets['WNID'])))),
        ('wordnet_height', numpy.int8),
        ('gloss', ('S', max(map(len, synsets['gloss'])))),
        ('num_children', numpy.int8),
        ('words', ('S', max(map(len, synsets['words'])))),
        ('children', (numpy.int8, max(synsets['num_children']))),
        ('num_train_images', numpy.uint16)
    ])
    new_synsets = numpy.empty(synsets.shape, dtype=new_dtype)
    for attr in ['ILSVRC2010_ID', 'WNID', 'wordnet_height', 'gloss',
                 'num_children', 'words', 'num_train_images']:
        new_synsets[attr] = synsets[attr]
    children = [numpy.atleast_1d(ch) for ch in synsets['children']]
    padded_children = [
        numpy.concatenate((c,
                           -numpy.ones(new_dtype['children'].shape[0] - len(c),
                                       dtype=numpy.int16)))
        for c in children
    ]
    new_synsets['children'] = padded_children
    return new_synsets, cost_matrix

def extract_patch_images(f, which_set):
    """Extracts a dict of the "patch images" for ILSVRC2010.

    Parameters
    ----------
    f : str or file-like object
        The filename or file-handle to the patch images TAR file.
    which_set : str
        Which set of images to extract. One of 'train', 'valid', 'test'.

    Returns
    -------
    dict
        A dictionary containing a mapping of filenames (without path) to a
        bytes object containing the replacement image.

    Notes
    -----
    Certain images in the distributed archives are blank, or display
    an "image not available" banner. A separate TAR file of
    "patch images" is distributed with the corrected versions of
    these. It is this archive that this function is intended to read.

    """
    if which_set not in ('train', 'valid', 'test'):
        raise ValueError('which_set must be one of train, valid, or test')
    which_set = 'val' if which_set == 'valid' else which_set
    patch_images = {}
    with tar_open(f) as tar:
        for info_obj in tar:
            if not info_obj.name.endswith('.JPEG'):
                continue
            # Pretty sure that '/' is used for tarfile regardless of
            # os.path.sep, but I officially don't care about Windows.
            tokens = info_obj.name.split('/')
            file_which_set = tokens[-2]
            if file_which_set != which_set:
                continue
            filename = tokens[-1]
            patch_images[filename] = tar.extractfile(info_obj.name).read()
    return patch_images
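
For reference, the converter documented above can be driven directly from Python once the raw archives are in place. A minimal usage sketch, assuming the module is importable as fuel.converters.ilsvrc2010 and using placeholder paths:

    from fuel.converters.ilsvrc2010 import convert_ilsvrc2010

    # Reads the devkit, ground-truth and image TAR files listed in ALL_FILES
    # from /path/to/raw and writes ilsvrc2010.hdf5 into /path/to/output.
    output_paths = convert_ilsvrc2010('/path/to/raw', '/path/to/output')
    print(output_paths)  # ('/path/to/output/ilsvrc2010.hdf5',)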