Completed
Push — master ( afd9d8...0f19a6 )
by Kent
16s
created

parse_header_section()   F

Complexity

Conditions 11

Size

Total Lines 52

Duplication

Lines 0
Ratio 0 %

Importance

Changes 9
Bugs 1 Features 0
Metric Value
cc 11
dl 0
loc 52
rs 3.8571
c 9
b 1
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex code units like parse_header_section() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
import codecs
2
import logging
3
import os
4
import re
5
import textwrap
6
import traceback
7
8
import numpy as np
9
10
from . import defaults
11
12
# Convoluted import for StringIO in order to support:
13
#
14
# - Python 3 - io.StringIO
15
# - Python 2 (optimized) - cStringIO.StringIO
16
# - Python 2 (all) - StringIO.StringIO
17
18
try:
19
    import cStringIO as StringIO
20
except ImportError:
21
    try:  # cStringIO not available on this system
22
        import StringIO
23
    except ImportError:  # Python 3
24
        from io import StringIO
25
    else:
26
        from StringIO import StringIO
27
else:
28
    from StringIO import StringIO
29
30
from . import defaults
31
from . import exceptions
32
from .las_items import HeaderItem, CurveItem, SectionItems, OrderedDict
33
34
35
logger = logging.getLogger(__name__)


# Case-insensitive pattern used to decide whether a string is a URL rather
# than a filename or LAS data. The whole string must match (anchored with
# ^ and $).
URL_REGEXP = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}'
    r'\.?|[A-Z0-9-]{2,}\.?)|'  # dotted domain name, or...
    r'localhost|'  # ...localhost, or...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...dotted-quad IP address
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',  # optional path / query string
    flags=re.IGNORECASE)
45
46
47
def open_file(file_ref, **encoding_kwargs):
    '''Open a file if necessary.

    If ``autodetect_encoding=True`` then either ``cchardet`` or ``chardet``
    needs to be installed, or else an ``ImportError`` will be raised.

    Arguments:
        file_ref (file-like object, str): either a filename, a URL, an open
            file object, or a string containing the contents of a file.
            A string is interpreted as follows: if its first line matches
            ``URL_REGEXP`` it is fetched as a URL; otherwise if it has more
            than one line it is treated as LAS data; otherwise it is treated
            as a filename.

    See :func:`lasio.reader.open_with_codecs` for keyword arguments that can be
    used here (they are only used when **file_ref** is a filename).

    Returns:
        tuple of an open file-like object, and the encoding that
        was used to decode it (``None`` if the data did not need decoding).

    '''
    encoding = None
    if not isinstance(file_ref, str):
        # Not a string, so it is handed back untouched as a file-like object.
        return file_ref, encoding

    lines = file_ref.splitlines()
    # An empty string has no lines at all; fall through to the filename
    # branch so that the failure is a file-open error rather than an
    # IndexError.
    first_line = lines[0] if lines else ''

    if URL_REGEXP.match(first_line):  # it's a URL
        logger.info('Loading URL {}'.format(first_line))
        # Only the import is guarded, so that errors raised while actually
        # fetching the URL are never mistaken for a missing module.
        try:
            import urllib2
        except ImportError:  # Python 3
            import urllib.request
            response = urllib.request.urlopen(first_line)
            encoding = response.headers.get_content_charset()
            if encoding is None:
                # No charset in the response headers; bytes.decode(None)
                # would raise TypeError, so fall back to UTF-8.
                encoding = 'utf-8'
            file_ref = StringIO(response.read().decode(encoding))
            logger.debug('Retrieved data decoded via {}'.format(encoding))
        else:  # Python 2
            response = urllib2.urlopen(first_line)
            encoding = response.headers.getparam('charset')
            file_ref = StringIO(response.read())
            logger.debug('Retrieved data had encoding {}'.format(encoding))
    elif len(lines) > 1:  # it's LAS data as a string.
        file_ref = StringIO(file_ref)
    else:  # it must be a filename
        file_ref, encoding = open_with_codecs(first_line, **encoding_kwargs)
    return file_ref, encoding
88
89
90
def open_with_codecs(filename, encoding=None, encoding_errors='replace',
              autodetect_encoding=True, autodetect_encoding_chars=4000):
    '''
    Open a file for reading Unicode data, working out its encoding if needed.

    Arguments:
        filename (str): path to file

    Keyword Arguments:
        encoding (str): character encoding to open file_ref with, using
            :func:`codecs.open`.
        encoding_errors (str): 'strict', 'replace' (default), 'ignore' - how to
            handle errors with encodings (see
            `this section
            <https://docs.python.org/3/library/codecs.html#codec-base-classes>`__
            of the standard library's :mod:`codecs` module for more information)
        autodetect_encoding (str or bool): default True to use
            `chardet <https://github.com/chardet/chardet>`__/`cchardet
            <https://github.com/PyYoshi/cChardet>`__ to detect encoding.
            Note if set to False several common encodings will be tried but
            chardet won't be used.
        autodetect_encoding_chars (int/None): number of bytes to read from the
            LAS file for auto-detection of encoding; a false value (e.g.
            ``None``) means the whole file is read.

    Returns:
        tuple of the open file object (from :func:`codecs.open`) and the
        encoding it was opened with.

    The encoding is chosen in this order: a UTF-8 byte-order mark (which
    overrides everything else, including **encoding**), then the **encoding**
    argument, then :func:`lasio.reader.get_encoding`, and finally
    :func:`lasio.reader.adhoc_test_encoding`.

    This function is called by :func:`lasio.reader.open_file`.

    '''
    if autodetect_encoding_chars:
        nbytes = int(autodetect_encoding_chars)
    else:
        nbytes = None

    # Forget [c]chardet - if we can locate the BOM we just assume that's
    # correct. read() simply returns fewer bytes for a short file, so there
    # is no need to look up the file size first.
    with open(filename, mode='rb') as test:
        raw = test.read(32)
    if raw.startswith(codecs.BOM_UTF8):
        encoding = 'utf-8-sig'
        autodetect_encoding = False

    # If BOM wasn't found and no encoding was supplied, try auto-detection.
    if autodetect_encoding and not encoding:
        with open(filename, mode='rb') as test:
            raw = test.read() if nbytes is None else test.read(nbytes)
        encoding = get_encoding(autodetect_encoding, raw)

    # Still nothing (auto-detection disabled, unavailable or inconclusive):
    # fall back to trying a few common encodings.
    if not encoding:
        encoding = adhoc_test_encoding(filename)
        if encoding:
            logger.info('{} was found by ad hoc to work but note it might not'
                       ' be the correct encoding'.format(encoding))

    # Now open and return the file-like object
    logger.info('Opening {} as {} and treating errors with "{}"'.format(
        filename, encoding, encoding_errors))
    file_obj = codecs.open(filename, mode='r', encoding=encoding,
        errors=encoding_errors)
    return file_obj, encoding
156
157
158
def adhoc_test_encoding(filename):
    '''Pick the first of a few common encodings that can decode a file.

    Only a single ``readline()`` call is attempted with each candidate, so a
    successful result does not guarantee the whole file decodes cleanly.

    Arguments:
        filename (str): path to file

    Returns:
        The name of the first candidate encoding ('ascii', 'windows-1252' or
        'latin-1', tried in that order) which did not raise
        ``UnicodeDecodeError``, or None if all of them failed.

    '''
    for candidate in ('ascii', 'windows-1252', 'latin-1'):
        with codecs.open(filename, mode='r', encoding=candidate) as handle:
            try:
                handle.readline()
            except UnicodeDecodeError:
                logger.debug(
                    '{} tested, raised UnicodeDecodeError'.format(candidate))
            else:
                return candidate
    return None
171
172
173
def get_encoding(auto, raw):
    '''
    Automatically detect character encoding.

    Arguments:
        auto (str or bool): auto-detection of character encoding - can be
            either 'chardet', 'cchardet', False, or True (the latter will
            pick cchardet if it can be imported, otherwise chardet)
        raw (bytes): array of bytes to detect from

    Returns:
        A string specifying the character encoding, or None if detection was
        disabled (**auto** is false), neither library could be imported
        (**auto** is True), or the detector did not report an encoding.

    Raises:
        ValueError: if **auto** is a value other than those listed above.
        ImportError: if a library was requested by name but is not installed.

    '''
    if not auto:
        # Detection explicitly switched off (documented as allowed);
        # previously this crashed on ``auto.lower()``.
        return None

    if auto is True:
        try:
            import cchardet as chardet
        except ImportError:
            try:
                import chardet
            except ImportError:
                logger.debug('chardet or cchardet is recommended for automatic'
                    ' detection of character encodings. Instead trying some'
                    ' common encodings.')
                return None
            else:
                method = 'chardet'
        else:
            method = 'cchardet'
    else:
        method = str(auto).lower()
        if method == 'chardet':
            import chardet
        elif method == 'cchardet':
            import cchardet as chardet
        else:
            # Previously this fell through and failed later on an unbound
            # local name.
            raise ValueError(
                "auto must be 'chardet', 'cchardet', True or False; "
                "got {!r}".format(auto))

    logger.debug('get_encoding Using {}'.format(method))
    result = chardet.detect(raw)
    logger.debug('{} method detected encoding of {} at confidence {}'.format(
        method, result['encoding'], result['confidence']))
    return result['encoding']
216
217
218
def read_file_contents(file_obj, regexp_subs, value_null_subs,
                       ignore_data=False):
    '''Read file contents into memory.

    Arguments:
        file_obj (open file-like object): read line by line; it must also
            support ``seek(0)`` if a data section is read.
        regexp_subs (list): (pattern, substitution) pairs applied with
            ``re.sub()`` to each line of the data section.
        value_null_subs (list): passed on to
            :func:`lasio.reader.read_data_section_iterative`.

    Keyword Arguments:
        ignore_data (bool): if True, do not read in the numerical data in the
            ~ASCII section

    Returns:
        OrderedDict

    Raises:
        lasio.exceptions.LASDataError: if reading the data section fails.

    I think of the returned dictionary as a "raw section". The keys are
    the first line of the LAS section, including the tilde. Each value is
    a dict with either::

        {"section_type": "header",
         "title": str,               # title of section (including the ~)
         "lines": [str, ],           # a list of the lines from the lAS file
         "line_nos": [int, ]         # line nos from the original file
         }

    or::

        {"section_type": "data",
         "title": str,              # title of section (including the ~)
         "start_line": int,         # location of data section (the title line)
         "ncols": int,              # no. of columns on first line of data,
         "array": ndarray           # 1-D numpy.ndarray,
         }

    Blank lines are skipped everywhere and lines starting with ``#`` are
    skipped inside header sections. Reading stops at the first section whose
    title starts with ``~A``.

    '''
    def header_section(title, lines, line_nos):
        # Build the "raw section" dict for a header section.
        return {
            "section_type": "header",
            "title": title,
            "lines": lines,
            "line_nos": line_nos,
            }

    sections = OrderedDict()
    sect_lines = []
    sect_line_nos = []
    sect_title_line = None

    for i, line in enumerate(file_obj):
        line = line.strip()
        if not line:
            continue
        if line.upper().startswith('~A'):
            # HARD CODED FOR VERSION 1.2 and 2.0; needs review for 3.0
            # We have finished looking at the metadata and need
            # to start reading numerical data.
            if sect_title_line is not None:
                sections[sect_title_line] = header_section(
                    sect_title_line, sect_lines, sect_line_nos)
            if not ignore_data:
                try:
                    data = read_data_section_iterative(
                        file_obj, regexp_subs, value_null_subs)
                except Exception:
                    # Only ordinary errors are converted; KeyboardInterrupt
                    # and SystemExit must propagate unchanged.
                    raise exceptions.LASDataError(
                        traceback.format_exc()[:-1] +
                        ' in data section beginning line {}'.format(i + 1))
                sections[line] = {
                    "section_type": "data",
                    "start_line": i,
                    "title": line,
                    "array": data,
                    }
                logger.debug(
                    'Data section ["array"].shape = {}'.format(data.shape))
            break

        elif line.startswith('~'):
            if sect_lines:
                # We have ended a section and need to start the next
                sections[sect_title_line] = header_section(
                    sect_title_line, sect_lines, sect_line_nos)
                sect_lines = []
                sect_line_nos = []
            # Otherwise we are entering a section for the first time (or the
            # previous section had no content lines). Either way this line
            # is now the current title.
            sect_title_line = line

        elif not line.startswith("#"):
            # We are in the middle of a section; commented-out lines are
            # ignored for now.
            sect_lines.append(line)
            sect_line_nos.append(i + 1)

    # NOTE(review): a header section is only stored when a later section
    # title is encountered, so if the input ends without a ~A section the
    # final header section is not returned - confirm whether this is intended.

    # Find the number of columns in the data section(s). This is only
    # useful if WRAP = NO, but we do it for all since we don't yet know
    # what the wrap setting is.
    for section in sections.values():
        if section["section_type"] == "data":
            section["ncols"] = None
            file_obj.seek(0)
            for i, line in enumerate(file_obj):
                # The line immediately after the section title.
                if i == section["start_line"] + 1:
                    for pattern, sub_str in regexp_subs:
                        line = re.sub(pattern, sub_str, line)
                    section["ncols"] = len(line.split())
                    break
    return sections
326
327
328
def read_data_section_iterative(file_obj, regexp_subs, value_null_subs):
    '''Read data section into memory.

    Arguments:
        file_obj (open file-like object): should be positioned in line-by-line
            reading mode, with the last line read being the title of the
            ~ASCII data section. Every remaining line is consumed.
        regexp_subs (list): each item should be a tuple of the pattern and
            substitution string for a call to re.sub() on each line of the
            data section. See defaults.py READ_SUBS and NULL_SUBS for examples.
        value_null_subs (list): list of numerical values to be replaced by
            numpy.nan values.

    Returns:
        A 1-D numpy ndarray of float64 holding every whitespace-separated
        item of the data section, in reading order.

    '''
    def substitute(text):
        # Apply each regexp substitution in turn to one line.
        for regexp, replacement in regexp_subs:
            text = re.sub(regexp, replacement, text)
        return text

    tokens = (token
              for raw_line in file_obj
              for token in substitute(raw_line).split())
    values = np.fromiter(tokens, np.float64, -1)
    for null_value in value_null_subs:
        values[values == null_value] = np.nan
    return values
356
357
358
def get_substitutions(read_policy, null_policy):
    '''Parse read and null policy definitions into a list of regexp and value
    substitutions.

    Arguments:
        read_policy (str, list, or substitution): either (1) a string defined in
            defaults.READ_POLICIES; (2) a list of substitutions as defined by
            the keys of defaults.READ_SUBS; or (3) a list of actual substitutions
            similar to the values of defaults.READ_SUBS. You can mix (2) and (3)
            together if you want.
        null_policy (str, list, or sub): as for read_policy but for
            defaults.NULL_POLICIES and defaults.NULL_SUBS

    Returns:
        regexp_subs, value_null_subs, version_NULL - two lists and a bool.
        The first list holds the substitutions that are iterable (pairs of
        regexp patterns and substrs), and the second list holds the
        non-iterable ones (floats or integers) with any None removed. The
        bool is whether or not 'NULL' was located as a substitution.

    '''
    regexp_subs = []
    numerical_subs = []
    version_NULL = False
    null_located_msg = 'located substition for LAS.version.NULL as True'

    policy_table = (
        ('read', read_policy, defaults.READ_POLICIES, defaults.READ_SUBS),
        ('null', null_policy, defaults.NULL_POLICIES, defaults.NULL_SUBS),
    )
    for kind, policy, named_policies, named_subs in policy_table:
        # An unhashable policy (e.g. a list) cannot be the name of a policy.
        try:
            is_named_policy = policy in named_policies
        except TypeError:
            is_named_policy = False

        collected = []
        if is_named_policy:
            logger.debug('using {} policy of "{}"'.format(kind, policy))
            for sub_name in named_policies[policy]:
                logger.debug('adding substitution {}'.format(sub_name))
                if sub_name in named_subs:
                    collected.extend(named_subs[sub_name])
                if sub_name == 'NULL':
                    logger.debug(null_located_msg)
                    version_NULL = True
        else:
            for entry in policy:
                if entry not in named_subs:
                    # Not a known name, so keep it as a literal substitution.
                    collected.append(entry)
                    continue
                collected.extend(named_subs[entry])
                if entry == 'NULL':
                    logger.debug(null_located_msg)
                    version_NULL = True

        # Iterable entries are regexp substitutions; everything else is
        # treated as a numerical value.
        for sub in collected:
            try:
                iter(sub)
            except TypeError:
                logger.debug('added numerical substitution: {}'.format(sub))
                numerical_subs.append(sub)
            else:
                logger.debug('added regexp substitution: pattern={} substr="{}"'.format(sub[0], sub[1]))
                regexp_subs.append(sub)

    numerical_subs = [value for value in numerical_subs if value is not None]
    return regexp_subs, numerical_subs, version_NULL
421
422
423
def parse_header_section(sectdict, version, ignore_header_errors=False,
                         mnemonic_case='preserve'):
    '''Parse a header section dict into a SectionItems containing HeaderItems.

    Arguments:
        sectdict (dict): object returned from
            :func:`lasio.reader.read_file_contents`
        version (float): either 1.2 or 2.0

    Keyword Arguments:
        ignore_header_errors (bool): if True, issue HeaderItem parse errors
            as :func:`logging.warning` calls instead of a
            :exc:`lasio.exceptions.LASHeaderError` exception.
        mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics
                             'upper': convert all HeaderItem mnemonics to uppercase
                             'lower': convert all HeaderItem mnemonics to lowercase

    Returns:
        :class:`lasio.las_items.SectionItems`

    Raises:
        lasio.exceptions.LASHeaderError: if a line cannot be parsed and
            **ignore_header_errors** is False.

    '''
    title = sectdict["title"]
    assert len(sectdict["lines"]) == len(sectdict["line_nos"])
    parser = SectionParser(title, version=version)

    section = SectionItems()
    assert mnemonic_case in ('upper', 'lower', 'preserve')
    if mnemonic_case != 'preserve':
        section.mnemonic_transforms = True

    for line, line_no in zip(sectdict["lines"], sectdict["line_nos"]):
        if not line:
            continue
        try:
            values = read_line(line)
        except Exception:
            # Any ordinary parsing failure is reported against the offending
            # line; KeyboardInterrupt and SystemExit are deliberately not
            # intercepted.
            message = 'line {} (section {}): "{}"'.format(
                line_no, title, line)
            if ignore_header_errors:
                logger.warning(message)
            else:
                raise exceptions.LASHeaderError(message)
        else:
            if mnemonic_case == 'upper':
                values['name'] = values['name'].upper()
            elif mnemonic_case == 'lower':
                values['name'] = values['name'].lower()
            section.append(parser(**values))
    return section
475
476
477
478
class SectionParser(object):

    '''Parse lines from header sections.

    Arguments:
        title (str): title line of section. Used to understand different
            order formatting across the special sections ~C, ~P, ~W, and ~V,
            depending on version 1.2 or 2.0.

    Keyword Arguments:
        version (float): version to parse according to. Default is 1.2.

    Raises:
        ValueError: if **title** does not start with ~C, ~P, ~W or ~V
            (case-insensitive).

    '''

    def __init__(self, title, version=1.2):
        upper_title = title.upper()
        if upper_title.startswith('~C'):
            self.func = self.curves
            self.section_name2 = "Curves"
        elif upper_title.startswith('~P'):
            self.func = self.params
            self.section_name2 = "Parameter"
        elif upper_title.startswith('~W'):
            self.func = self.metadata
            self.section_name2 = "Well"
        elif upper_title.startswith('~V'):
            self.func = self.metadata
            self.section_name2 = "Version"
        else:
            # Without this the lookup below failed with an obscure
            # AttributeError about a missing ``section_name2``.
            raise ValueError(
                'Unsupported header section title: {!r} (expected a title '
                'starting with ~C, ~P, ~W or ~V)'.format(title))

        self.version = version
        self.section_name = title

        # The first entry for a section is its default field order; each
        # remaining entry is an (order, mnemonics) pair listing the
        # mnemonics which use a different order.
        defs = defaults.ORDER_DEFINITIONS
        section_orders = defs[self.version][self.section_name2]
        self.default_order = section_orders[0]
        self.orders = {}
        for order, mnemonics in section_orders[1:]:
            for mnemonic in mnemonics:
                self.orders[mnemonic] = order

    def __call__(self, **keys):
        '''Return the correct object for this type of section.

        Refer to :meth:`lasio.reader.SectionParser.metadata`,
        :meth:`lasio.reader.SectionParser.params`, and
        :meth:`lasio.reader.SectionParser.curves` for the methods actually
        used by this routine.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return self.func(**keys)

    def num(self, x, default=None):
        '''Attempt to parse a number.

        Arguments:
            x (str, int, float): potential number
            default (int, float, None): fall-back option; if None, **x**
                itself (as passed in) is the fall-back.

        Returns:
            int, float, or **default** - from most to least preferred types.
            Non-finite floats (nan, inf) are not returned; **default** is
            returned instead.

        '''
        if default is None:
            default = x

        # Best effort: normalise a comma decimal mark when x is a string.
        # Any failure here simply leaves x unchanged.
        try:
            pattern, sub = defaults.READ_SUBS['comma-decimal-mark'][0]
            x = re.sub(pattern, sub, x)
        except Exception:
            pass

        # The builtin int/float are used because the numpy aliases
        # ``np.int`` / ``np.float`` no longer exist in current NumPy.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            x = float(x)
        except (TypeError, ValueError, OverflowError):
            return default
        if np.isfinite(x):
            return x
        return default

    def metadata(self, **keys):
        '''Return HeaderItem correctly formatted according to the order
        prescribed for LAS v 1.2 or 2.0 for the ~W section.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        Returns None if the order found for the mnemonic is neither
        'value:descr' nor 'descr:value'.

        '''
        key_order = self.orders.get(keys['name'], self.default_order)
        if key_order == 'value:descr':
            return HeaderItem(
                keys['name'],                 # mnemonic
                keys['unit'],                 # unit
                self.num(keys['value']),      # value
                keys['descr'],                # descr
            )
        elif key_order == 'descr:value':
            # The last two fields are passed in the opposite positions to
            # the branch above. NOTE(review): presumably because in this
            # layout the text after the colon is the value - confirm against
            # HeaderItem's signature.
            return HeaderItem(
                keys['name'],                   # mnemonic
                keys['unit'],                   # unit
                keys['descr'],                  # descr
                self.num(keys['value']),        # value
            )

    def curves(self, **keys):
        '''Return CurveItem.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`. The value is passed through
        as-is, without number parsing.

        '''
        return CurveItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            keys['value'],              # value
            keys['descr'],              # descr
        )

    def params(self, **keys):
        '''Return HeaderItem for ~P section (the same between 1.2 and 2.0 specs)

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return HeaderItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            self.num(keys['value']),    # value
            keys['descr'],              # descr
        )
618
619
620
def read_line(*args, **kwargs):
    '''Backwards-compatible alias for :func:`lasio.reader.read_header_line`.

    All positional and keyword arguments are forwarded unchanged, and the
    result of that call is returned.

    '''
    parsed = read_header_line(*args, **kwargs)
    return parsed
627
628
629
def read_header_line(line, pattern=None):
    '''Read a line from a LAS header section.

    The line is parsed with a regular expression -- see LAS file specs for
    more details, but it should basically be in the format::

        name.unit       value : descr

    Arguments:
        line (str): line from a LAS header section
        pattern (str): regular expression with named groups to match
            against the start of **line**. If None, a built-in pattern is
            used; the ``descr`` group is only included in it when **line**
            contains a colon.

    Returns:
        A dictionary with keys 'name', 'unit', 'value', and 'descr', each
        containing a string as value. Every string is stripped of
        surrounding whitespace, and groups absent from the pattern are left
        as empty strings.

    '''
    fields = {'name': '', 'unit': '', 'value': '', 'descr': ''}

    if pattern is None:
        pattern = (r'\.?(?P<name>[^.]*)\.'
                   r'(?P<unit>[^\s:]*)'
                   r'(?P<value>[^:]*)')
        if ':' in line:
            pattern += r':' + r'(?P<descr>.*)'

    match = re.match(pattern, line)
    for group, text in match.groupdict().items():
        text = text.strip()
        if group == 'unit' and text.endswith('.'):
            text = text.strip('.')  # see issue #36
        fields[group] = text
    return fields
664