parse_header_section()   C
last analyzed

Complexity

Conditions 11

Size

Total Lines 52

Duplication

Lines 0
Ratio 0 %

Importance

Changes 4
Bugs 1 Features 0
Metric Value
cc 11
dl 0
loc 52
rs 5.3509
c 4
b 1
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex functions like parse_header_section() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common approach to find such a component is to look for statements and variables that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
import codecs
2
import logging
3
import os
4
import re
5
import textwrap
6
import traceback
7
8
import numpy as np
9
10
from . import defaults
11
12
# Convoluted import for StringIO in order to support:
13
#
14
# - Python 3 - io.StringIO
15
# - Python 2 (optimized) - cStringIO.StringIO
16
# - Python 2 (all) - StringIO.StringIO
17
18
try:
19
    import cStringIO as StringIO
20
except ImportError:
21
    try:  # cStringIO not available on this system
22
        import StringIO
23
    except ImportError:  # Python 3
24
        from io import StringIO
25
    else:
26
        from StringIO import StringIO
27
else:
28
    from StringIO import StringIO
29
30
from . import defaults
31
from . import exceptions
32
from .las_items import HeaderItem, CurveItem, SectionItems, OrderedDict
33
34
35
logger = logging.getLogger(__name__)
36
37
URL_REGEXP = re.compile(
38
    r'^(?:http|ftp)s?://'  # http:// or https://
39
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}'
40
    r'\.?|[A-Z0-9-]{2,}\.?)|'  # (cont.) domain...
41
    r'localhost|'  # localhost...
42
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
43
    r'(?::\d+)?'  # optional port
44
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
45
46
47
def open_file(file_ref, **encoding_kwargs):
    '''Open a file if necessary.

    If ``autodetect_encoding=True`` then either ``cchardet`` or ``chardet``
    needs to be installed, or else an ``ImportError`` will be raised.

    Arguments:
        file_ref (file-like object, str): either a filename, an open file
            object, a URL, or a string containing the contents of a file.

    See :func:`lasio.reader.open_with_codecs` for keyword arguments that can be
    used here.

    Returns:
        tuple of an open file-like object, and the encoding that
        was used to decode it (if it were read from disk).

    '''
    encoding = None
    if isinstance(file_ref, str):  # file_ref != file-like object, so what is it?
        lines = file_ref.splitlines()
        # Guard against an empty string: splitlines() returns [] and
        # lines[0] would raise IndexError. Fall through with the original
        # value so the filename branch reports a sensible open error.
        first_line = lines[0] if lines else file_ref
        if URL_REGEXP.match(first_line):  # it's a URL
            logger.info('Loading URL {}'.format(first_line))
            try:
                # Python 2
                import urllib2
                response = urllib2.urlopen(first_line)
                encoding = response.headers.getparam('charset')
                file_ref = StringIO(response.read())
                logger.debug('Retrieved data had encoding {}'.format(encoding))
            except ImportError:
                # Python 3. Open first_line (the URL itself) for consistency
                # with the Python 2 branch above.
                import urllib.request
                response = urllib.request.urlopen(first_line)
                encoding = response.headers.get_content_charset()
                data = response.read()
                if encoding is None:
                    # The server did not declare a charset;
                    # bytes.decode(None) would raise TypeError, so
                    # fall back to UTF-8.
                    file_ref = StringIO(data.decode('utf-8'))
                else:
                    file_ref = StringIO(data.decode(encoding))
                logger.debug('Retrieved data decoded via {}'.format(encoding))
        elif len(lines) > 1:  # it's LAS data as a string.
            file_ref = StringIO(file_ref)
        else:  # it must be a filename
            file_ref, encoding = open_with_codecs(first_line, **encoding_kwargs)
    return file_ref, encoding
def open_with_codecs(filename, encoding=None, encoding_errors='replace',
              autodetect_encoding=True, autodetect_encoding_chars=4000):
    '''
    Read Unicode data from file.

    Arguments:
        filename (str): path to file

    Keyword Arguments:
        encoding (str): character encoding to open file_ref with, using
            :func:`codecs.open`.
        encoding_errors (str): 'strict', 'replace' (default), 'ignore' - how to
            handle errors with encodings (see
            `this section
            <https://docs.python.org/3/library/codecs.html#codec-base-classes>`__
            of the standard library's :mod:`codecs` module for more information)
        autodetect_encoding (str or bool): default True to use
            `chardet <https://github.com/chardet/chardet>`__/`cchardet
            <https://github.com/PyYoshi/cChardet>`__ to detect encoding.
            Note if set to False several common encodings will be tried but
            chardet won't be used.
        autodetect_encoding_chars (int/None): number of chars to read from LAS
            file for auto-detection of encoding.

    Returns:
        tuple of an open file-like object (from :func:`codecs.open`) and the
        encoding (str or None) that was settled on.

    This function is called by :func:`lasio.reader.open_file`.

    '''
    # Normalise the char-count argument: any falsy value means "read the
    # whole file" during auto-detection.
    if autodetect_encoding_chars:
        nbytes = int(autodetect_encoding_chars)
    else:
        nbytes = None

    # Forget [c]chardet - if we can locate the BOM we just assume that's correct.
    # Only the first few bytes are needed to check for a UTF-8 BOM.
    nbytes_test = min(32, os.path.getsize(filename))
    with open(filename, mode='rb') as test:
        raw = test.read(nbytes_test)
    if raw.startswith(codecs.BOM_UTF8):
        encoding = 'utf-8-sig'
        # A BOM settles the question, so suppress the detection stages below.
        autodetect_encoding = False

    # If BOM wasn't found...
    # NOTE: the order of these two if-blocks matters. get_encoding() may
    # return None (e.g. chardet not installed), and clearing
    # autodetect_encoding here deliberately makes the *next* block run as
    # the fallback in that case.
    if (autodetect_encoding) and (not encoding):
        with open(filename, mode='rb') as test:
            if nbytes is None:
                raw = test.read()
            else:
                raw = test.read(nbytes)
        encoding = get_encoding(autodetect_encoding, raw)
        autodetect_encoding = False

    # Or if no BOM found & chardet not installed
    if (not autodetect_encoding) and (not encoding):
        encoding = adhoc_test_encoding(filename)
        if encoding:
            logger.info('{} was found by ad hoc to work but note it might not'
                       ' be the correct encoding'.format(encoding))

    # Now open and return the file-like object
    # (codecs.open with encoding=None falls back to plain binary-mode open).
    logger.info('Opening {} as {} and treating errors with "{}"'.format(
        filename, encoding, encoding_errors))
    file_obj = codecs.open(filename, mode='r', encoding=encoding,
        errors=encoding_errors)
    return file_obj, encoding
def adhoc_test_encoding(filename):
    '''Try some common encodings and return the first one that works.

    Arguments:
        filename (str): path to file

    Returns:
        str or None: the first of 'ascii', 'windows-1252' or 'latin-1' that
        can decode the first line of the file, or None if none of them can.

    '''
    found = None
    for candidate in ('ascii', 'windows-1252', 'latin-1'):
        with codecs.open(filename, mode='r', encoding=candidate) as f:
            try:
                f.readline()
            except UnicodeDecodeError:
                logger.debug(
                    '{} tested, raised UnicodeDecodeError'.format(candidate))
                continue
            found = candidate
            break
    return found
def get_encoding(auto, raw):
    '''
    Automatically detect character encoding.

    Arguments:
        auto (str or bool): auto-detection of character encoding - can be
            either 'chardet', 'cchardet', False, or True (the latter will
            pick the fastest available option)
        raw (bytes): array of bytes to detect from

    Returns:
        A string specifying the character encoding, or None if detection is
        disabled (``auto=False`` or an unrecognised value) or no detection
        library is installed.

    '''
    if auto is True:
        try:
            import cchardet as chardet
        except ImportError:
            try:
                import chardet
            except ImportError:
                logger.debug('chardet or cchardet is recommended for automatic'
                    ' detection of character encodings. Instead trying some'
                    ' common encodings.')
                return None
            else:
                logger.debug('get_encoding Using chardet')
                method = 'chardet'
        else:
            logger.debug('get_encoding Using cchardet')
            method = 'cchardet'
    elif isinstance(auto, str) and auto.lower() == 'chardet':
        import chardet
        logger.debug('get_encoding Using chardet')
        method = 'chardet'
    elif isinstance(auto, str) and auto.lower() == 'cchardet':
        import cchardet as chardet
        logger.debug('get_encoding Using cchardet')
        method = 'cchardet'
    else:
        # auto is False or an unrecognised value: nothing to detect with.
        # Previously this fell through to an AttributeError (False.lower())
        # or a NameError on the undefined `chardet`/`method` names.
        return None
    result = chardet.detect(raw)
    logger.debug('{} method detected encoding of {} at confidence {}'.format(
        method, result['encoding'], result['confidence']))
    return result['encoding']
def read_file_contents(file_obj, regexp_subs, value_null_subs,
                       ignore_data=False):
    '''Read file contents into memory.

    Arguments:
        file_obj (open file-like object)
        regexp_subs (list): (pattern, substr) pairs applied with re.sub() to
            each data line (see defaults.READ_SUBS for examples)
        value_null_subs (list): numerical values to be replaced by numpy.nan

    Keyword Arguments:
        ignore_data (bool): if True, do not read in the numerical data in the
            ~ASCII section

    Returns:
        OrderedDict

    I think of the returned dictionary as a "raw section". The keys are
    the first line of the LAS section, including the tilde. Each value is
    a dict with either::

        {"section_type": "header",
         "title": str,               # title of section (including the ~)
         "lines": [str, ],           # a list of the lines from the lAS file
         "line_nos": [int, ]         # line nos from the original file
         }

    or::

        {"section_type": "data",
         "title": str,              # title of section (including the ~)
         "start_line": int,         # location of data section (the title line)
         "ncols": int,              # no. of columns on first line of data,
         "array": ndarray           # 1-D numpy.ndarray,
         }

    '''
    # Accumulators for the header section currently being read.
    sections = OrderedDict()
    sect_lines = []
    sect_line_nos = []
    sect_title_line = None
    section_exists = False

    for i, line in enumerate(file_obj):
        line = line.strip()
        if not line:
            continue
        if line.upper().startswith('~A'):
            # HARD CODED FOR VERSION 1.2 and 2.0; needs review for 3.0
            # We have finished looking at the metadata and need
            # to start reading numerical data.
            # First flush the header section that was in progress.
            if not sect_title_line is None:
                sections[sect_title_line] = {
                    "section_type": "header",
                    "title": sect_title_line,
                    "lines": sect_lines,
                    "line_nos": sect_line_nos,
                    }
            if not ignore_data:
                try:
                    # read_data_section_iterative consumes the *same* file
                    # iterator from the current position (just after the ~A
                    # title line).
                    data = read_data_section_iterative(file_obj, regexp_subs, value_null_subs)
                except:
                    # NOTE(review): bare except also re-raises on
                    # KeyboardInterrupt/SystemExit, wrapped as LASDataError.
                    # [:-1] drops the trailing newline from format_exc().
                    raise exceptions.LASDataError(
                        traceback.format_exc()[:-1] +
                        ' in data section beginning line {}'.format(i + 1))
                sections[line] = {
                    "section_type": "data",
                    "start_line": i,
                    "title": line,
                    "array": data,
                    }
                logger.debug('Data section ["array"].shape = {}'.format(data.shape))
            # Only one data section is read; stop here either way.
            break

        elif line.startswith('~'):
            if section_exists:
                # We have ended a section and need to start the next
                sections[sect_title_line] = {
                    "section_type": "header",
                    "title": sect_title_line,
                    "lines": sect_lines,
                    "line_nos": sect_line_nos,
                    }
                sect_lines = []
                sect_line_nos = []
            else:
                # We are entering into a section for the first time
                section_exists = True
                pass
            sect_title_line = line # either way... this is the case.

        else:
            # We are in the middle of a section.
            if not line.startswith("#"): # ignore commented-out lines.. for now.
                sect_lines.append(line)
                sect_line_nos.append(i + 1)

    # Find the number of columns in the data section(s). This is only
    # useful if WRAP = NO, but we do it for all since we don't yet know
    # what the wrap setting is.
    # Requires a seekable file object: we rewind and re-scan to find the
    # first line after each data section's title line.

    for section in sections.values():
        if section["section_type"] == "data":
            section["ncols"] = None
            file_obj.seek(0)
            for i, line in enumerate(file_obj):
                if i == section["start_line"] + 1:
                    for pattern, sub_str in regexp_subs:
                        line = re.sub(pattern, sub_str, line)
                    section["ncols"] = len(line.split())
                    break
    return sections
def read_data_section_iterative(file_obj, regexp_subs, value_null_subs):
    '''Read data section into memory.

    Arguments:
        file_obj (open file-like object): should be positioned in line-by-line
            reading mode, with the last line read being the title of the
            ~ASCII data section.
        regexp_subs (list): each item should be a tuple of the pattern and
            substitution string for a call to re.sub() on each line of the
            data section. See defaults.py READ_SUBS and NULL_SUBS for examples.
        value_null_subs (list): list of numerical values to be replaced by
            numpy.nan values.

    Returns:
        A 1-D numpy ndarray.

    '''
    def _tokens(source):
        # Apply the regexp substitutions to each line, then emit every
        # whitespace-separated token, as float64 where possible and as the
        # raw string otherwise.
        for raw_line in source:
            for pattern, replacement in regexp_subs:
                raw_line = re.sub(pattern, replacement, raw_line)
            for token in raw_line.split():
                try:
                    yield np.float64(token)
                except ValueError:
                    yield token

    arr = np.array(list(_tokens(file_obj)))
    for null_value in value_null_subs:
        arr[arr == null_value] = np.nan
    return arr
def get_substitutions(read_policy, null_policy):
    '''Parse read and null policy definitions into a list of regexp and value
    substitutions.

    Arguments:
        read_policy (str, list, or substitution): either (1) a string defined in
            defaults.READ_POLICIES; (2) a list of substitutions as defined by
            the keys of defaults.READ_SUBS; or (3) a list of actual substitutions
            similar to the values of defaults.READ_SUBS. You can mix (2) and (3)
            together if you want.
        null_policy (str, list, or sub): as for read_policy but for
            defaults.NULL_POLICIES and defaults.NULL_SUBS

    Returns:
        regexp_subs, value_null_subs, version_NULL - two lists and a bool.
        The first list is pairs of regexp patterns and substrs, and the second
        list is just a list of floats or integers. The bool is whether or not
        'NULL' was located as a substitution.

    '''
    regexp_subs = []
    numerical_subs = []
    version_NULL = False

    policy_sets = (
        ('read', read_policy, defaults.READ_POLICIES, defaults.READ_SUBS),
        ('null', null_policy, defaults.NULL_POLICIES, defaults.NULL_SUBS),
    )
    for policy_typ, policy, policy_subs, subs in policy_sets:
        # A policy may be a named policy (a key of policy_subs) or a list
        # of substitution names / literal substitutions. An unhashable
        # policy (e.g. a list) cannot be a dict key, hence the TypeError
        # guard.
        try:
            named_policy = policy in policy_subs
        except TypeError:
            named_policy = False

        resolved = []
        if named_policy:
            logger.debug('using {} policy of "{}"'.format(policy_typ, policy))
            for sub in policy_subs[policy]:
                logger.debug('adding substitution {}'.format(sub))
                if sub in subs:
                    resolved += subs[sub]
                if sub == 'NULL':
                    logger.debug('located substition for LAS.version.NULL as True')
                    version_NULL = True
        else:
            for item in policy:
                if item in subs:
                    resolved += subs[item]
                    if item == 'NULL':
                        logger.debug('located substition for LAS.version.NULL as True')
                        version_NULL = True
                else:
                    resolved.append(item)

        # Classify each resolved substitution: iterables are (pattern,
        # substr) regexp pairs; plain numbers are value substitutions.
        for item in resolved:
            try:
                iter(item)
            except TypeError:
                logger.debug('added numerical substitution: {}'.format(item))
                numerical_subs.append(item)
            else:
                logger.debug('added regexp substitution: pattern={} substr="{}"'.format(item[0], item[1]))
                regexp_subs.append(item)

    numerical_subs = [n for n in numerical_subs if n is not None]

    return regexp_subs, numerical_subs, version_NULL
def parse_header_section(sectdict, version, ignore_header_errors=False,
                         mnemonic_case='preserve'):
    '''Parse a header section dict into a SectionItems containing HeaderItems.

    Arguments:
        sectdict (dict): object returned from
            :func:`lasio.reader.read_file_contents`
        version (float): either 1.2 or 2.0

    Keyword Arguments:
        ignore_header_errors (bool): if True, issue HeaderItem parse errors
            as :func:`logging.warning` calls instead of a
            :exc:`lasio.exceptions.LASHeaderError` exception.
        mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics
                             'upper': convert all HeaderItem mnemonics to uppercase
                             'lower': convert all HeaderItem mnemonics to lowercase

    Returns:
        :class:`lasio.las_items.SectionItems`

    '''
    title = sectdict["title"]
    assert len(sectdict["lines"]) == len(sectdict["line_nos"])
    parser = SectionParser(title, version=version)

    section = SectionItems()
    assert mnemonic_case in ('upper', 'lower', 'preserve')
    if mnemonic_case != 'preserve':
        section.mnemonic_transforms = True

    # Walk the section's lines together with their original file line
    # numbers so error messages can point at the right place in the file.
    for line, line_no in zip(sectdict["lines"], sectdict["line_nos"]):
        if not line:
            continue
        try:
            values = read_line(line)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            message = 'line {} (section {}): "{}"'.format(line_no, title, line)
            if ignore_header_errors:
                logger.warning(message)
            else:
                raise exceptions.LASHeaderError(message)
        else:
            if mnemonic_case == 'upper':
                values['name'] = values['name'].upper()
            elif mnemonic_case == 'lower':
                values['name'] = values['name'].lower()
            section.append(parser(**values))
    return section
class SectionParser(object):

    '''Parse lines from header sections.

    Arguments:
        title (str): title line of section. Used to understand different
            order formatting across the special sections ~C, ~P, ~W, and ~V,
            depending on version 1.2 or 2.0.

    Keyword Arguments:
        version (float): version to parse according to. Default is 1.2.

    '''

    def __init__(self, title, version=1.2):
        # NOTE(review): if title starts with none of ~C/~P/~W/~V, neither
        # self.func nor self.section_name2 is assigned and the lookup below
        # raises AttributeError -- confirm callers only pass these titles.
        if title.upper().startswith('~C'):
            self.func = self.curves
            self.section_name2 = "Curves"
        elif title.upper().startswith('~P'):
            self.func = self.params
            self.section_name2 = "Parameter"
        elif title.upper().startswith('~W'):
            self.func = self.metadata
            self.section_name2 = "Well"
        elif title.upper().startswith('~V'):
            self.func = self.metadata
            self.section_name2 = "Version"

        self.version = version
        self.section_name = title

        # Build mnemonic -> field-order ('value:descr' or 'descr:value')
        # lookup from the defaults for this LAS version and section.
        defs = defaults.ORDER_DEFINITIONS
        section_orders = defs[self.version][self.section_name2]
        self.default_order = section_orders[0]
        self.orders = {}
        for order, mnemonics in section_orders[1:]:
            for mnemonic in mnemonics:
                self.orders[mnemonic] = order

    def __call__(self, **keys):
        '''Return the correct object for this type of section.

        Refer to :meth:`lasio.reader.SectionParser.metadata`,
        :meth:`lasio.reader.SectionParser.params`, and
        :meth:`lasio.reader.SectionParser.curves` for the methods actually
        used by this routine.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        item = self.func(**keys)
        return item

    def num(self, x, default=None):
        '''Attempt to parse a number.

        Arguments:
            x (str, int, float): potential number
            default (int, float, None): fall-back option

        Returns:
            int, float, or **default** - from most to least preferred types.

        '''
        if default is None:
            default = x

        # in case it is a string using a comma as the decimal mark.
        try:
            pattern, sub = defaults.READ_SUBS['comma-decimal-mark'][0]
            x = re.sub(pattern, sub, x)
        except Exception:
            pass

        # np.int and np.float were aliases for the builtins and were
        # removed in NumPy 1.24; with them the old code always fell into
        # the except branches and returned the default unparsed.
        try:
            return int(x)
        except Exception:
            try:
                x = float(x)
            except Exception:
                return default
        # Reject inf/nan in favour of the fall-back value.
        if np.isfinite(x):
            return x
        else:
            return default

    def metadata(self, **keys):
        '''Return HeaderItem correctly formatted according to the order
        prescribed for LAS v 1.2 or 2.0 for the ~W section.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        key_order = self.orders.get(keys['name'], self.default_order)
        if key_order == 'value:descr':
            return HeaderItem(
                keys['name'],                 # mnemonic
                keys['unit'],                 # unit
                self.num(keys['value']),      # value
                keys['descr'],                # descr
            )
        elif key_order == 'descr:value':
            return HeaderItem(
                keys['name'],                   # mnemonic
                keys['unit'],                   # unit
                keys['descr'],                  # descr
                self.num(keys['value']),        # value
            )

    def curves(self, **keys):
        '''Return CurveItem.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        item = CurveItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            keys['value'],              # value
            keys['descr'],              # descr
        )
        return item

    def params(self, **keys):
        '''Return HeaderItem for ~P section (the same between 1.2 and 2.0 specs)

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return HeaderItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            self.num(keys['value']),    # value
            keys['descr'],              # descr
        )
def read_line(*args, **kwargs):
    '''Retained for backwards-compatibility.

    Forwards all positional and keyword arguments unchanged to
    :func:`lasio.reader.read_header_line` and returns its result.

    See :func:`lasio.reader.read_header_line`.

    '''
    return read_header_line(*args, **kwargs)
def read_header_line(line, pattern=None):
    '''Parse one LAS header line into its component fields.

    The expected layout (see the LAS file specs for details) is::

        name.unit       value : descr

    Arguments:
        line (str): line from a LAS header section
        pattern (str): optional regular expression with named groups
            'name', 'unit', 'value' and (optionally) 'descr'; if None, a
            pattern is chosen based on whether the line contains a colon.

    Returns:
        dict with keys 'name', 'unit', 'value', and 'descr'; each value is
        a stripped string ('' for any group the pattern does not capture).

    '''
    fields = {'name': '', 'unit': '', 'value': '', 'descr': ''}
    if pattern is None:
        name_unit_value = (r'\.?(?P<name>[^.]*)\.'
                           r'(?P<unit>[^\s:]*)'
                           r'(?P<value>[^:]*)')
        if ':' in line:
            pattern = name_unit_value + r':(?P<descr>.*)'
        else:
            # Without a colon there is no description field to capture.
            pattern = name_unit_value
    match = re.match(pattern, line)
    for key, value in match.groupdict().items():
        stripped = value.strip()
        if key == 'unit' and stripped.endswith('.'):
            stripped = stripped.strip('.')  # see issue #36
        fields[key] = stripped
    return fields