parse_header_section() - Code Metrics - Inspection of "Merge pull request #212 from kinverarity1/issue210..." - kinverarity1/lasio - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( afd9d8...0f19a6 )

by Kent

created 2017-12-21 02:22 UTC

parse_header_section() F

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	9
Bugs	1	Features	0

Metric	Value
cc	11
dl	0
loc	52
rs	3.8571
c	9
b	1
f	0

How to fix Long Method Complexity

import codecs
import logging
import os
import re
import textwrap
import traceback

import numpy as np

from . import defaults

# Convoluted import for StringIO in order to support:
#
# - Python 3 - io.StringIO
# - Python 2 (optimized) - cStringIO.StringIO
# - Python 2 (all) - StringIO.StringIO

try:
    import cStringIO as StringIO
except ImportError:
    try:  # cStringIO not available on this system
        import StringIO
    except ImportError:  # Python 3
        from io import StringIO
    else:
        from StringIO import StringIO
else:
    from StringIO import StringIO

from . import defaults
from . import exceptions
from .las_items import HeaderItem, CurveItem, SectionItems, OrderedDict


logger = logging.getLogger(__name__)

# Case-insensitive pattern for an absolute URL: an http(s)/ftp(s) scheme,
# then a domain name, "localhost" or a dotted-quad address, an optional port
# and an optional path/query. open_file() matches it against the first line
# of a string argument to decide whether the string is a URL.
URL_REGEXP = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}'
    r'\.?|[A-Z0-9-]{2,}\.?)|'  # (cont.) domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def open_file(file_ref, **encoding_kwargs):
    '''Open a file if necessary.

    Anything that is not a ``str`` is assumed to already be a file-like
    object and is returned unchanged. A ``str`` is interpreted, in this
    order, as:

    1. a URL, if its first line matches ``URL_REGEXP`` -- the resource is
       downloaded and wrapped in a ``StringIO``;
    2. the contents of a LAS file, if it has more than one line -- it is
       wrapped in a ``StringIO``;
    3. otherwise a filename, which is opened with
       :func:`lasio.reader.open_with_codecs`.

    Arguments:
        file_ref (file-like object, str): either a filename, an open file
            object, a URL, or a string containing the contents of a file.

    See :func:`lasio.reader.open_with_codecs` for keyword arguments that can be
    used here. They are only used when ``file_ref`` is a filename, except
    that ``encoding`` is also used as the fall-back for a URL whose response
    does not declare a charset (Python 3 only).

    Returns:
        tuple of an open file-like object, and the encoding that
        was used to decode it (``None`` if no decoding took place here).

    '''
    encoding = None
    if not isinstance(file_ref, str):
        # Already a file-like object: nothing to open.
        return file_ref, encoding

    lines = file_ref.splitlines()
    first_line = lines[0]
    if URL_REGEXP.match(first_line):  # it's a URL
        logger.info('Loading URL {}'.format(first_line))
        try:
            import urllib2
        except ImportError:  # Python 3
            import urllib.request
            # Request exactly the text that matched the URL pattern (as the
            # Python 2 branch does); file_ref may carry a trailing newline
            # or further lines that do not belong in the request.
            response = urllib.request.urlopen(first_line)
            encoding = response.headers.get_content_charset()
            if encoding is None:
                # No charset in the Content-Type header: decode(None) would
                # raise TypeError, so fall back to the caller's encoding or
                # UTF-8.
                encoding = encoding_kwargs.get('encoding') or 'utf-8'
            file_ref = StringIO(response.read().decode(encoding))
            logger.debug('Retrieved data decoded via {}'.format(encoding))
        else:  # Python 2
            response = urllib2.urlopen(first_line)
            encoding = response.headers.getparam('charset')
            file_ref = StringIO(response.read())
            logger.debug('Retrieved data had encoding {}'.format(encoding))
    elif len(lines) > 1:  # it's LAS data as a string.
        file_ref = StringIO(file_ref)
    else:  # it must be a filename
        file_ref, encoding = open_with_codecs(first_line, **encoding_kwargs)
    return file_ref, encoding


def open_with_codecs(filename, encoding=None, encoding_errors='replace',
              autodetect_encoding=True, autodetect_encoding_chars=4000):
    '''
    Open a file on disk for reading as Unicode text.

    The encoding is chosen in this order:

    1. if the file starts with the UTF-8 byte-order mark, ``'utf-8-sig'`` is
       used, overriding both **encoding** and auto-detection;
    2. otherwise **encoding** is used if it was given;
    3. otherwise, if **autodetect_encoding** is truthy,
       :func:`lasio.reader.get_encoding` is asked to detect it from the
       start of the file;
    4. otherwise, or if detection produced nothing,
       :func:`lasio.reader.adhoc_test_encoding` tries a few common encodings.

    Arguments:
        filename (str): path to file

    Keyword Arguments:
        encoding (str): character encoding to open the file with, using
            :func:`codecs.open`.
        encoding_errors (str): 'strict', 'replace' (default), 'ignore' - how to
            handle errors with encodings (see
            `this section
            <https://docs.python.org/3/library/codecs.html#codec-base-classes>`__
            of the standard library's :mod:`codecs` module for more information)
        autodetect_encoding (str or bool): passed on to
            :func:`lasio.reader.get_encoding`; default True. If False,
            several common encodings will be tried but chardet/cchardet
            won't be used.
        autodetect_encoding_chars (int/None): number of bytes to read from
            the start of the file for auto-detection of encoding; a falsy
            value means the whole file is read.

    Returns:
        tuple of the file-like object returned by :func:`codecs.open`, and
        the encoding it was opened with.

    This function is called by :func:`lasio.reader.open_file`.

    '''
    sample_size = (int(autodetect_encoding_chars)
                   if autodetect_encoding_chars else None)

    # A UTF-8 BOM is trusted outright; no need to consult [c]chardet.
    probe_size = min(32, os.path.getsize(filename))
    with open(filename, mode='rb') as stream:
        head = stream.read(probe_size)
    if head.startswith(codecs.BOM_UTF8):
        encoding = 'utf-8-sig'
        autodetect_encoding = False

    # No BOM and no explicit encoding: ask the detector.
    if autodetect_encoding and not encoding:
        with open(filename, mode='rb') as stream:
            if sample_size is None:
                sample = stream.read()
            else:
                sample = stream.read(sample_size)
        encoding = get_encoding(autodetect_encoding, sample)
        autodetect_encoding = False

    # Still nothing (detector disabled, unavailable, or undecided): fall
    # back to trying a handful of common encodings.
    if not (autodetect_encoding or encoding):
        encoding = adhoc_test_encoding(filename)
        if encoding:
            logger.info('{} was found by ad hoc to work but note it might not'
                       ' be the correct encoding'.format(encoding))

    # Now open and return the file-like object
    logger.info('Opening {} as {} and treating errors with "{}"'.format(
        filename, encoding, encoding_errors))
    opened = codecs.open(filename, mode='r', encoding=encoding,
                         errors=encoding_errors)
    return opened, encoding


def adhoc_test_encoding(filename):
    '''Guess a file's encoding by trial and error.

    'ascii', 'windows-1252' and 'latin-1' are tried in that order; an
    encoding is accepted as soon as a single ``readline()`` call on the file
    succeeds with it, so only the start of the file is checked.

    Arguments:
        filename (str): path to file

    Returns:
        The first encoding that worked, or None if every candidate raised
        ``UnicodeDecodeError``.

    '''
    for candidate in ('ascii', 'windows-1252', 'latin-1'):
        with codecs.open(filename, mode='r', encoding=candidate) as handle:
            try:
                handle.readline()
            except UnicodeDecodeError:
                logger.debug(
                    '{} tested, raised UnicodeDecodeError'.format(candidate))
            else:
                return candidate
    return None


def get_encoding(auto, raw):
    '''
    Automatically detect character encoding.

    Arguments:
        auto (str or bool): auto-detection of character encoding - can be
            either 'chardet', 'cchardet' (case-insensitive), False, or True
            (the latter will pick cchardet if it can be imported, otherwise
            chardet)
        raw (bytes): array of bytes to detect from

    Returns:
        A string specifying the character encoding as reported by the
        detector's ``detect()`` call. None is returned instead when **auto**
        is falsy, or when **auto** is True and neither cchardet nor chardet
        can be imported.

    Raises:
        ImportError: if a detector was requested by name but is not
            installed.
        ValueError: if **auto** is not one of the values listed above.

    '''
    if not auto:
        # Detection explicitly disabled; let the caller use its fall-back.
        return None

    if auto is True:
        try:
            import cchardet as chardet
        except ImportError:
            try:
                import chardet
            except ImportError:
                logger.debug('chardet or cchardet is recommended for automatic'
                    ' detection of character encodings. Instead trying some'
                    ' common encodings.')
                return None
            else:
                method = 'chardet'
        else:
            method = 'cchardet'
    else:
        method = str(auto).lower()
        if method == 'chardet':
            import chardet
        elif method == 'cchardet':
            import cchardet as chardet
        else:
            # Previously this fell through and failed on an unbound name.
            raise ValueError(
                "auto must be True, False, 'chardet' or 'cchardet'; "
                "got {!r}".format(auto))

    logger.debug('get_encoding Using {}'.format(method))
    result = chardet.detect(raw)
    logger.debug('{} method detected encoding of {} at confidence {}'.format(
        method, result['encoding'], result['confidence']))
    return result['encoding']


def _raw_header_section(title, lines, line_nos):
    '''Build the "raw section" dict describing one header section.'''
    return {
        "section_type": "header",
        "title": title,
        "lines": lines,
        "line_nos": line_nos,
        }


def read_file_contents(file_obj, regexp_subs, value_null_subs,
                       ignore_data=False):
    '''Read file contents into memory.

    Arguments:
        file_obj (open file-like object): must support iteration by line
            and, when a data section is read, ``seek(0)``.
        regexp_subs (list): (pattern, substitution) pairs applied with
            ``re.sub()`` to the lines of the data section; passed on to
            :func:`lasio.reader.read_data_section_iterative`.
        value_null_subs (list): numerical values to be replaced by
            ``numpy.nan`` in the data section; passed on to
            :func:`lasio.reader.read_data_section_iterative`.

    Keyword Arguments:
        ignore_data (bool): if True, do not read in the numerical data in the
            ~ASCII section

    Returns:
        OrderedDict

    Raises:
        lasio.exceptions.LASDataError: if reading the data section fails.
            The message contains the formatted traceback of the underlying
            error.

    I think of the returned dictionary as a "raw section". The keys are
    the first line of the LAS section, including the tilde. Each value is
    a dict with either::

        {"section_type": "header",
         "title": str,               # title of section (including the ~)
         "lines": [str, ],           # a list of the lines from the LAS file
         "line_nos": [int, ]         # line nos from the original file
         }

    or::

        {"section_type": "data",
         "title": str,              # title of section (including the ~)
         "start_line": int,         # location of data section (the title line)
         "ncols": int,              # no. of columns on first line of data,
         "array": ndarray           # 1-D numpy.ndarray,
         }

    Blank lines and lines starting with ``#`` are not stored. Reading stops
    at the first section whose title starts with ``~A``. A header section
    that contains no lines is only stored if it is the one immediately
    before the ``~A`` section.

    '''
    sections = OrderedDict()
    sect_lines = []
    sect_line_nos = []
    sect_title_line = None

    for i, line in enumerate(file_obj):
        line = line.strip()
        if not line:
            continue
        if line.upper().startswith('~A'):
            # HARD CODED FOR VERSION 1.2 and 2.0; needs review for 3.0
            # We have finished looking at the metadata and need
            # to start reading numerical data.
            if sect_title_line is not None:
                sections[sect_title_line] = _raw_header_section(
                    sect_title_line, sect_lines, sect_line_nos)
            if not ignore_data:
                try:
                    data = read_data_section_iterative(
                        file_obj, regexp_subs, value_null_subs)
                except Exception:
                    # Not a bare except: KeyboardInterrupt and SystemExit
                    # must propagate instead of becoming a LASDataError.
                    raise exceptions.LASDataError(
                        traceback.format_exc()[:-1] +
                        ' in data section beginning line {}'.format(i + 1))
                sections[line] = {
                    "section_type": "data",
                    "start_line": i,
                    "title": line,
                    "array": data,
                    }
                logger.debug(
                    'Data section ["array"].shape = {}'.format(data.shape))
            break

        elif line.startswith('~'):
            if sect_lines:
                # We have ended a section and need to start the next
                sections[sect_title_line] = _raw_header_section(
                    sect_title_line, sect_lines, sect_line_nos)
                sect_lines = []
                sect_line_nos = []
            # Otherwise we are entering a section for the first time (or the
            # previous one was empty); either way this is the current title.
            sect_title_line = line

        elif not line.startswith("#"):
            # We are in the middle of a section; commented-out lines are
            # ignored for now.
            sect_lines.append(line)
            sect_line_nos.append(i + 1)

    # Find the number of columns in the data section(s). This is only
    # useful if WRAP = NO, but we do it for all since we don't yet know
    # what the wrap setting is.

    for section in sections.values():
        if section["section_type"] == "data":
            section["ncols"] = None
            file_obj.seek(0)
            for i, line in enumerate(file_obj):
                # The line directly after the data section's title line.
                if i == section["start_line"] + 1:
                    for pattern, sub_str in regexp_subs:
                        line = re.sub(pattern, sub_str, line)
                    section["ncols"] = len(line.split())
                    break
    return sections


def read_data_section_iterative(file_obj, regexp_subs, value_null_subs):
    '''Read data section into memory.

    Every remaining line of **file_obj** is passed through the regular
    expression substitutions, split on whitespace, and each resulting token
    is converted to a float.

    Arguments:
        file_obj (open file-like object): should be positioned in line-by-line
            reading mode, with the last line read being the title of the
            ~ASCII data section.
        regexp_subs (list): each item should be a tuple of the pattern and
            substitution string for a call to re.sub() on each line of the
            data section. See defaults.py READ_SUBS and NULL_SUBS for examples.
        value_null_subs (list): list of numerical values to be replaced by
            numpy.nan values.

    Returns:
        A 1-D numpy ndarray.

    '''
    def tokens(stream):
        # Lazily yield every whitespace-separated token, line by line.
        for raw_line in stream:
            cleaned = raw_line
            for regexp, replacement in regexp_subs:
                cleaned = re.sub(regexp, replacement, cleaned)
            for token in cleaned.split():
                yield token

    values = np.fromiter(tokens(file_obj), np.float64, -1)
    for null_value in value_null_subs:
        null_mask = values == null_value
        values[null_mask] = np.nan
    return values


def get_substitutions(read_policy, null_policy):
    '''Translate read and null policy definitions into substitution lists.

    Arguments:
        read_policy (str, list, or substitution): either (1) a string defined in
            defaults.READ_POLICIES; (2) a list of substitutions as defined by
            the keys of defaults.READ_SUBS; or (3) a list of actual substitutions
            similar to the values of defaults.READ_SUBS. You can mix (2) and (3)
            together if you want.
        null_policy (str, list, or sub): as for read_policy but for
            defaults.NULL_POLICIES and defaults.NULL_SUBS

    Returns:
        regexp_subs, value_null_subs, version_NULL - two lists and a bool.
        The first list holds the substitutions that are iterable (pairs of
        regexp pattern and substitution string), the second holds the
        non-iterable ones (numbers) with any None removed. The bool is
        whether or not 'NULL' was located as a substitution.

    '''
    regexp_subs = []
    numerical_subs = []
    version_NULL = False

    policy_sources = (
        ('read', read_policy, defaults.READ_POLICIES, defaults.READ_SUBS),
        ('null', null_policy, defaults.NULL_POLICIES, defaults.NULL_SUBS),
    )
    for kind, policy, named_policies, named_subs in policy_sources:
        # An unhashable policy (e.g. a list) cannot be a policy name.
        try:
            is_named_policy = policy in named_policies
        except TypeError:
            is_named_policy = False

        collected = []
        if is_named_policy:
            logger.debug('using {} policy of "{}"'.format(kind, policy))
            for name in named_policies[policy]:
                logger.debug('adding substitution {}'.format(name))
                if name in named_subs:
                    collected.extend(named_subs[name])
                if name == 'NULL':
                    logger.debug('located substition for LAS.version.NULL as True')
                    version_NULL = True
        else:
            for entry in policy:
                if entry not in named_subs:
                    # Not a known name, so it is a literal substitution.
                    collected.append(entry)
                    continue
                collected.extend(named_subs[entry])
                if entry == 'NULL':
                    logger.debug('located substition for LAS.version.NULL as True')
                    version_NULL = True

        # Iterable items are regexp substitutions; the rest are numbers.
        for sub in collected:
            try:
                iter(sub)
            except TypeError:
                logger.debug('added numerical substitution: {}'.format(sub))
                numerical_subs.append(sub)
            else:
                logger.debug('added regexp substitution: pattern={} substr="{}"'.format(sub[0], sub[1]))
                regexp_subs.append(sub)

    numerical_subs = [value for value in numerical_subs if value is not None]
    return regexp_subs, numerical_subs, version_NULL


def parse_header_section(sectdict, version, ignore_header_errors=False,
                         mnemonic_case='preserve'):
    '''Parse a header section dict into a SectionItems containing HeaderItems.

    Each non-empty line of the section is split into its fields with
    :func:`lasio.reader.read_line` and turned into an item by a
    :class:`lasio.reader.SectionParser` built for this section's title.

    Arguments:
        sectdict (dict): object returned from
            :func:`lasio.reader.read_file_contents`
        version (float): either 1.2 or 2.0

    Keyword Arguments:
        ignore_header_errors (bool): if True, issue HeaderItem parse errors
            as :func:`logging.warning` calls instead of a
            :exc:`lasio.exceptions.LASHeaderError` exception. The offending
            line is then skipped.
        mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics
                             'upper': convert all HeaderItem mnemonics to uppercase
                             'lower': convert all HeaderItem mnemonics to lowercase

    Returns:
        :class:`lasio.las_items.SectionItems`

    Raises:
        lasio.exceptions.LASHeaderError: if a line cannot be split into
            fields and **ignore_header_errors** is False.

    '''
    title = sectdict["title"]
    lines = sectdict["lines"]
    line_nos = sectdict["line_nos"]
    assert len(lines) == len(line_nos)
    parser = SectionParser(title, version=version)

    section = SectionItems()
    assert mnemonic_case in ('upper', 'lower', 'preserve')
    if mnemonic_case != 'preserve':
        section.mnemonic_transforms = True

    for line, line_no in zip(lines, line_nos):
        if not line:
            continue
        try:
            values = read_line(line)
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit must
            # propagate rather than be reported as a malformed header line.
            message = 'line {} (section {}): "{}"'.format(
                line_no, title, line)
            if not ignore_header_errors:
                raise exceptions.LASHeaderError(message)
            logger.warning(message)
            continue

        if mnemonic_case == 'upper':
            values['name'] = values['name'].upper()
        elif mnemonic_case == 'lower':
            values['name'] = values['name'].lower()
        section.append(parser(**values))
    return section



class SectionParser(object):

    '''Parse lines from header sections.

    An instance is a callable that turns the fields of one header line (as
    returned by :func:`lasio.reader.read_header_line`) into a HeaderItem or
    CurveItem, depending on the section it was created for.

    Arguments:
        title (str): title line of section. Used to understand different
            order formatting across the special sections ~C, ~P, ~W, and ~V,
            depending on version 1.2 or 2.0.

    Keyword Arguments:
        version (float): version to parse according to. Default is 1.2.

    Raises:
        ValueError: if **title** does not start with ~C, ~P, ~W or ~V
            (case-insensitive).

    '''

    def __init__(self, title, version=1.2):
        upper_title = title.upper()
        if upper_title.startswith('~C'):
            self.func = self.curves
            self.section_name2 = "Curves"
        elif upper_title.startswith('~P'):
            self.func = self.params
            self.section_name2 = "Parameter"
        elif upper_title.startswith('~W'):
            self.func = self.metadata
            self.section_name2 = "Well"
        elif upper_title.startswith('~V'):
            self.func = self.metadata
            self.section_name2 = "Version"
        else:
            # Previously this surfaced later as an AttributeError on
            # section_name2, which hid the real cause.
            raise ValueError(
                'Cannot parse header section "{}": title must start with '
                '~C, ~P, ~W or ~V'.format(title))

        self.version = version
        self.section_name = title

        # The first entry of the section's order definition is the default
        # field order; each remaining entry is (order, mnemonics) and
        # overrides the order for those mnemonics.
        defs = defaults.ORDER_DEFINITIONS
        section_orders = defs[self.version][self.section_name2]
        self.default_order = section_orders[0]
        self.orders = {}
        for order, mnemonics in section_orders[1:]:
            for mnemonic in mnemonics:
                self.orders[mnemonic] = order

    def __call__(self, **keys):
        '''Return the correct object for this type of section.

        Refer to :meth:`lasio.reader.SectionParser.metadata`,
        :meth:`lasio.reader.SectionParser.params`, and
        :meth:`lasio.reader.SectionParser.curves` for the methods actually
        used by this routine.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return self.func(**keys)

    def num(self, x, default=None):
        '''Attempt to parse a number.

        Arguments:
            x (str, int, float): potential number
            default (int, float, None): fall-back option; None means fall
                back to **x** itself.

        Returns:
            int, float, or **default** - from most to least preferred types.
            Non-finite floats (nan, inf) also give **default**.

        '''
        if default is None:
            default = x

        # In case it is a string: apply the comma-decimal-mark substitution.
        # Best effort only - x may not be a string at all.
        try:
            pattern, sub = defaults.READ_SUBS['comma-decimal-mark'][0]
            x = re.sub(pattern, sub, x)
        except Exception:
            pass

        # Use the builtins: the np.int / np.float aliases no longer exist
        # in current NumPy, and the old bare excepts hid that failure so
        # that nothing was ever converted.
        try:
            return int(x)
        except (ValueError, TypeError, OverflowError):
            pass
        try:
            x = float(x)
        except (ValueError, TypeError):
            return default
        if np.isfinite(x):
            return x
        return default

    def metadata(self, **keys):
        '''Return HeaderItem correctly formatted according to the order
        prescribed for LAS v 1.2 or 2.0 for the ~W section.

        The order is looked up by mnemonic, falling back to the section's
        default order. Nothing is returned (i.e. None) if the order is
        neither 'value:descr' nor 'descr:value'.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        key_order = self.orders.get(keys['name'], self.default_order)
        if key_order == 'value:descr':
            return HeaderItem(
                keys['name'],                 # mnemonic
                keys['unit'],                 # unit
                self.num(keys['value']),      # value
                keys['descr'],                # descr
            )
        elif key_order == 'descr:value':
            # The two fields are swapped on the line, so the parsed 'descr'
            # field is passed in the value position and the parsed 'value'
            # field in the descr position.
            # NOTE(review): num() is applied to the field that ends up as
            # the description, not the value - confirm this is intended.
            return HeaderItem(
                keys['name'],                   # mnemonic
                keys['unit'],                   # unit
                keys['descr'],                  # value (parsed 'descr')
                self.num(keys['value']),        # descr (parsed 'value')
            )

    def curves(self, **keys):
        '''Return CurveItem.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return CurveItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            keys['value'],              # value
            keys['descr'],              # descr
        )

    def params(self, **keys):
        '''Return HeaderItem for ~P section (the same between 1.2 and 2.0 specs)

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return HeaderItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            self.num(keys['value']),    # value
            keys['descr'],              # descr
        )


def read_line(*args, **kwargs):
    '''Backwards-compatible alias of :func:`lasio.reader.read_header_line`.

    All positional and keyword arguments are forwarded unchanged and the
    result is returned as is.

    '''
    parsed = read_header_line(*args, **kwargs)
    return parsed


def read_header_line(line, pattern=None):
    '''Read a line from a LAS header section.

    The line is parsed with a regular expression -- see LAS file specs for
    more details, but it should basically be in the format::

        name.unit       value : descr

    The name runs up to the first period, the unit up to the next
    whitespace or colon, and the value up to the first colon. If the line
    contains no colon, 'descr' is left empty.

    Arguments:
        line (str): line from a LAS header section
        pattern (str): optional regular expression to use instead of the
            built-in one; its named groups are copied into the result.

    Returns:
        A dictionary with keys 'name', 'unit', 'value', and 'descr', each
        containing a string as value (stripped of surrounding whitespace).

    Raises:
        ValueError: if the line does not match the pattern (for the
            built-in pattern, when the line contains no period).

    '''
    d = {'name': '', 'unit': '', 'value': '', 'descr': ''}
    if pattern is None:
        pattern = (r'\.?(?P<name>[^.]*)\.' +
                   r'(?P<unit>[^\s:]*)' +
                   r'(?P<value>[^:]*)')
        if ':' in line:
            pattern += r':' + r'(?P<descr>.*)'
    m = re.match(pattern, line)
    if m is None:
        # Fail with a readable message rather than an AttributeError on None.
        raise ValueError(
            'Header line does not match the expected format: "{}"'.format(
                line))
    for key, value in m.groupdict().items():
        if value is None:
            # Optional group of a custom pattern that did not take part in
            # the match: keep the empty-string default.
            continue
        d[key] = value.strip()
    if d['unit'].endswith('.'):
        d['unit'] = d['unit'].strip('.')  # see issue #36
    return d


1			import codecs
2			import logging
3			import os
4			import re
5			import textwrap
6			import traceback
7
8			import numpy as np
9
10			from . import defaults
11
12			# Convoluted import for StringIO in order to support:
13			#
14			# - Python 3 - io.StringIO
15			# - Python 2 (optimized) - cStringIO.StringIO
16			# - Python 2 (all) - StringIO.StringIO
17
18			try:
19			import cStringIO as StringIO
20			except ImportError:
21			try: # cStringIO not available on this system
22			import StringIO
23			except ImportError: # Python 3
24			from io import StringIO
25			else:
26			from StringIO import StringIO
27			else:
28			from StringIO import StringIO
29
30			from . import defaults
31			from . import exceptions
32			from .las_items import HeaderItem, CurveItem, SectionItems, OrderedDict
33
34
35			logger = logging.getLogger(__name__)
36
37			URL_REGEXP = re.compile(
38			r'^(?:http\|ftp)s?://' # http:// or https://
39			r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}'
40			r'\.?\|[A-Z0-9-]{2,}\.?)\|' # (cont.) domain...
41			r'localhost\|' # localhost...
42			r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
43			r'(?::\d+)?' # optional port
44			r'(?:/?\|[/?]\S+)$', re.IGNORECASE)
45
46
47			def open_file(file_ref, **encoding_kwargs):
48			'''Open a file if necessary.
49
50			If ``autodetect_encoding=True`` then either ``cchardet`` or ``chardet``
51			needs to be installed, or else an ``ImportError`` will be raised.
52
53			Arguments:
54			file_ref (file-like object, str): either a filename, an open file
55			object, or a string containing the contents of a file.
56
57			See :func:`lasio.reader.open_with_codecs` for keyword arguments that can be
58			used here.
59
60			Returns:
61			tuple of an open file-like object, and the encoding that
62			was used to decode it (if it were read from disk).
63
64			'''
65			encoding = None
66			if isinstance(file_ref, str): # file_ref != file-like object, so what is it?
67			lines = file_ref.splitlines()
68			first_line = lines[0]
69			if URL_REGEXP.match(first_line): # it's a URL
70			logger.info('Loading URL {}'.format(first_line))
71			try:
72			import urllib2
73			response = urllib2.urlopen(first_line)
74			encoding = response.headers.getparam('charset')
75			file_ref = StringIO(response.read())
76			logger.debug('Retrieved data had encoding {}'.format(encoding))
77			except ImportError:
78			import urllib.request
79			response = urllib.request.urlopen(file_ref)
80			encoding = response.headers.get_content_charset()
81			file_ref = StringIO(response.read().decode(encoding))
82			logger.debug('Retrieved data decoded via {}'.format(encoding))
83			elif len(lines) > 1: # it's LAS data as a string.
84			file_ref = StringIO(file_ref)
85			else: # it must be a filename
86			file_ref, encoding = open_with_codecs(first_line, **encoding_kwargs)
87			return file_ref, encoding
88
89
90			def open_with_codecs(filename, encoding=None, encoding_errors='replace',
91			autodetect_encoding=True, autodetect_encoding_chars=4000):
92			'''
93			Read Unicode data from file.
94
95			Arguments:
96			filename (str): path to file
97
98			Keyword Arguments:
99			encoding (str): character encoding to open file_ref with, using
100			:func:`codecs.open`.
101			encoding_errors (str): 'strict', 'replace' (default), 'ignore' - how to
102			handle errors with encodings (see
103			`this section
104			<https://docs.python.org/3/library/codecs.html#codec-base-classes>`__
105			of the standard library's :mod:`codecs` module for more information)
106			autodetect_encoding (str or bool): default True to use
107			`chardet <https://github.com/chardet/chardet>`__/`cchardet
108			<https://github.com/PyYoshi/cChardet>`__ to detect encoding.
109			Note if set to False several common encodings will be tried but
110			chardet won't be used.
111			autodetect_encoding_chars (int/None): number of chars to read from LAS
112			file for auto-detection of encoding.
113
114			Returns:
115			a unicode or string object
116
117			This function is called by :func:`lasio.reader.open_file`.
118
119			'''
120			if autodetect_encoding_chars:
121			nbytes = int(autodetect_encoding_chars)
122			else:
123			nbytes = None
124
125			# Forget [c]chardet - if we can locate the BOM we just assume that's correct.
126			nbytes_test = min(32, os.path.getsize(filename))
127			with open(filename, mode='rb') as test:
128			raw = test.read(nbytes_test)
129			if raw.startswith(codecs.BOM_UTF8):
130			encoding = 'utf-8-sig'
131			autodetect_encoding = False
132
133			# If BOM wasn't found...
134			if (autodetect_encoding) and (not encoding):
135			with open(filename, mode='rb') as test:
136			if nbytes is None:
137			raw = test.read()
138			else:
139			raw = test.read(nbytes)
140			encoding = get_encoding(autodetect_encoding, raw)
141			autodetect_encoding = False
142
143			# Or if no BOM found & chardet not installed
144			if (not autodetect_encoding) and (not encoding):
145			encoding = adhoc_test_encoding(filename)
146			if encoding:
147			logger.info('{} was found by ad hoc to work but note it might not'
148			' be the correct encoding'.format(encoding))
149
150			# Now open and return the file-like object
151			logger.info('Opening {} as {} and treating errors with "{}"'.format(
152			filename, encoding, encoding_errors))
153			file_obj = codecs.open(filename, mode='r', encoding=encoding,
154			errors=encoding_errors)
155			return file_obj, encoding
156
157
158			def adhoc_test_encoding(filename):
159			test_encodings = ['ascii', 'windows-1252', 'latin-1']
160			for i in test_encodings:
161			encoding = i
162			with codecs.open(filename, mode='r', encoding=encoding) as f:
163			try:
164			f.readline()
165			break
166			except UnicodeDecodeError:
167			logger.debug('{} tested, raised UnicodeDecodeError'.format(i))
168			pass
169			encoding = None
170			return encoding
171
172
173			def get_encoding(auto, raw):
174			'''
175			Automatically detect character encoding.
176
177			Arguments:
178			auto (str): auto-detection of character encoding - can be either
179			'chardet', 'cchardet', False, or True (the latter will pick the
180			fastest available option)
181			raw (bytes): array of bytes to detect from
182
183			Returns:
184			A string specifying the character encoding.
185
186			'''
187			if auto is True:
188			try:
189			import cchardet as chardet
190			except ImportError:
191			try:
192			import chardet
193			except ImportError:
194			logger.debug('chardet or cchardet is recommended for automatic'
195			' detection of character encodings. Instead trying some'
196			' common encodings.')
197			return None
198			else:
199			logger.debug('get_encoding Using chardet')
200			method = 'chardet'
201			else:
202			logger.debug('get_encoding Using cchardet')
203			method = 'cchardet'
204			elif auto.lower() == 'chardet':
205			import chardet
206			logger.debug('get_encoding Using chardet')
207			method = 'chardet'
208			elif auto.lower() == 'cchardet':
209			import cchardet as chardet
210			logger.debug('get_encoding Using cchardet')
211			method = 'cchardet'
212			result = chardet.detect(raw)
213			logger.debug('{} method detected encoding of {} at confidence {}'.format(
214			method, result['encoding'], result['confidence']))
215			return result['encoding']
216
217
def _raw_header_section(title, lines, line_nos):
    '''Build the "raw section" dict describing one header section.'''
    return {
        "section_type": "header",
        "title": title,
        "lines": lines,
        "line_nos": line_nos,
    }


def read_file_contents(file_obj, regexp_subs, value_null_subs,
                       ignore_data=False):
    '''Read file contents into memory.

    Arguments:
        file_obj (open file-like object): iterated line by line. It must also
            support ``seek(0)`` when a data section is read, because the
            file is re-scanned to count the data columns.
        regexp_subs (list): (pattern, substitution) pairs applied with
            ``re.sub()`` to the data lines; passed on to
            :func:`read_data_section_iterative`.
        value_null_subs (list): numerical values passed on to
            :func:`read_data_section_iterative`.

    Keyword Arguments:
        ignore_data (bool): if True, do not read in the numerical data in the
            ~ASCII section

    Returns:
        OrderedDict

    Raises:
        lasio.exceptions.LASDataError: if reading the data section fails.

    I think of the returned dictionary as a "raw section". The keys are
    the first line of the LAS section, including the tilde. Each value is
    a dict with either::

        {"section_type": "header",
         "title": str,          # title of section (including the ~)
         "lines": [str, ],      # a list of the lines from the LAS file
         "line_nos": [int, ]    # 1-based line nos from the original file
         }

    or::

        {"section_type": "data",
         "title": str,          # title of section (including the ~)
         "start_line": int,     # 0-based index of the title line
         "ncols": int,          # no. of columns on first line of data,
         "array": ndarray       # 1-D numpy.ndarray,
         }

    Blank lines are skipped everywhere and lines starting with ``#`` are
    left out of header sections. Reading stops at the first ``~A`` section.
    If the file has no ``~A`` section at all, the header section that was
    still being collected at end-of-file is stored as well.

    '''
    sections = OrderedDict()
    sect_lines = []
    sect_line_nos = []
    sect_title_line = None

    for i, line in enumerate(file_obj):
        line = line.strip()
        if not line:
            continue
        if line.upper().startswith('~A'):
            # HARD CODED FOR VERSION 1.2 and 2.0; needs review for 3.0
            # We have finished looking at the metadata and need
            # to start reading numerical data.
            if sect_title_line is not None:
                sections[sect_title_line] = _raw_header_section(
                    sect_title_line, sect_lines, sect_line_nos)
            if not ignore_data:
                try:
                    data = read_data_section_iterative(
                        file_obj, regexp_subs, value_null_subs)
                except Exception:
                    # Only genuine errors are re-labelled; KeyboardInterrupt
                    # and SystemExit propagate untouched.
                    raise exceptions.LASDataError(
                        traceback.format_exc()[:-1] +
                        ' in data section beginning line {}'.format(i + 1))
                sections[line] = {
                    "section_type": "data",
                    "start_line": i,
                    "title": line,
                    "array": data,
                }
                logger.debug('Data section ["array"].shape = {}'.format(data.shape))
            break

        elif line.startswith('~'):
            if sect_lines:
                # We have ended a section and need to start the next
                sections[sect_title_line] = _raw_header_section(
                    sect_title_line, sect_lines, sect_line_nos)
                sect_lines = []
                sect_line_nos = []
            # Whether a section just ended or this is the very first one,
            # the current line is the title of the section being collected.
            sect_title_line = line

        elif not line.startswith("#"):
            # We are in the middle of a section; commented-out lines are
            # ignored... for now.
            sect_lines.append(line)
            sect_line_nos.append(i + 1)
    else:
        # The loop ended without meeting a ~A section, so the header section
        # still being collected has not been stored yet.
        if sect_title_line is not None:
            sections[sect_title_line] = _raw_header_section(
                sect_title_line, sect_lines, sect_line_nos)

    # Find the number of columns in the data section(s). This is only
    # useful if WRAP = NO, but we do it for all since we don't yet know
    # what the wrap setting is.
    for section in sections.values():
        if section["section_type"] == "data":
            section["ncols"] = None
            file_obj.seek(0)
            for i, line in enumerate(file_obj):
                # The line straight after the title line is taken as the
                # first line of data.
                if i == section["start_line"] + 1:
                    for pattern, sub_str in regexp_subs:
                        line = re.sub(pattern, sub_str, line)
                    section["ncols"] = len(line.split())
                    break
    return sections
326
327
def read_data_section_iterative(file_obj, regexp_subs, value_null_subs):
    '''Load the numerical data section as one flat array.

    Arguments:
        file_obj (open file-like object): should be positioned in line-by-line
            reading mode, with the last line read being the title of the
            ~ASCII data section. Every remaining line is consumed.
        regexp_subs (list): (pattern, substitution) tuples; each pair is
            applied in order with re.sub() to every line before it is split
            on whitespace. See defaults.py READ_SUBS and NULL_SUBS for
            examples.
        value_null_subs (list): numerical values which are replaced by
            numpy.nan in the resulting array.

    Returns:
        A 1-D numpy ndarray of float64.

    '''
    def tokens(lines):
        # Lazily yield each whitespace-separated token of the cleaned lines.
        for raw_line in lines:
            cleaned = raw_line
            for regexp, replacement in regexp_subs:
                cleaned = re.sub(regexp, replacement, cleaned)
            for token in cleaned.split():
                yield token

    values = np.fromiter(tokens(file_obj), dtype=np.float64, count=-1)
    for null_value in value_null_subs:
        values[values == null_value] = np.nan
    return values
356
357
def get_substitutions(read_policy, null_policy):
    '''Translate read and null policy definitions into concrete substitutions.

    Arguments:
        read_policy (str, list, or substitution): either (1) a string defined
            in defaults.READ_POLICIES; (2) a list of substitutions as defined
            by the keys of defaults.READ_SUBS; or (3) a list of actual
            substitutions similar to the values of defaults.READ_SUBS. (2)
            and (3) can be mixed together.
        null_policy (str, list, or sub): as for read_policy but for
            defaults.NULL_POLICIES and defaults.NULL_SUBS

    Returns:
        regexp_subs, value_null_subs, version_NULL - two lists and a bool.
        The first list holds the substitutions that are iterable (pairs of
        regexp pattern and substr), the second holds the non-iterable ones
        (numbers) with any ``None`` entries removed. The bool is whether or
        not 'NULL' was located as a substitution.

    '''
    regexp_subs = []
    value_subs = []
    version_NULL = False
    null_located_msg = 'located substition for LAS.version.NULL as True'

    sources = (
        ('read', read_policy, defaults.READ_POLICIES, defaults.READ_SUBS),
        ('null', null_policy, defaults.NULL_POLICIES, defaults.NULL_SUBS),
    )
    for kind, policy, named_policies, named_subs in sources:
        try:
            known_policy = policy in named_policies
        except TypeError:
            # The membership test itself failed, so this is not a policy name.
            known_policy = False

        expanded = []
        if known_policy:
            logger.debug('using {} policy of "{}"'.format(kind, policy))
            for sub_name in named_policies[policy]:
                logger.debug('adding substitution {}'.format(sub_name))
                if sub_name in named_subs:
                    expanded.extend(named_subs[sub_name])
                if sub_name == 'NULL':
                    logger.debug(null_located_msg)
                    version_NULL = True
        else:
            # Not a named policy: treat it as a list mixing substitution
            # names and literal substitutions.
            for entry in policy:
                if entry not in named_subs:
                    expanded.append(entry)
                    continue
                expanded.extend(named_subs[entry])
                if entry == 'NULL':
                    logger.debug(null_located_msg)
                    version_NULL = True

        # Iterable entries are regexp pairs; everything else is a number.
        for sub in expanded:
            try:
                iter(sub)
            except TypeError:
                logger.debug('added numerical substitution: {}'.format(sub))
                value_subs.append(sub)
            else:
                logger.debug('added regexp substitution: pattern={} substr="{}"'.format(sub[0], sub[1]))
                regexp_subs.append(sub)

    return regexp_subs, [v for v in value_subs if v is not None], version_NULL
421
422
def parse_header_section(sectdict, version, ignore_header_errors=False,
                         mnemonic_case='preserve'):
    '''Parse a header section dict into a SectionItems containing HeaderItems.

    Arguments:
        sectdict (dict): object returned from
            :func:`lasio.reader.read_file_contents`; the keys "title",
            "lines" and "line_nos" are used.
        version (float): either 1.2 or 2.0

    Keyword Arguments:
        ignore_header_errors (bool): if True, issue HeaderItem parse errors
            as :func:`logging.warning` calls instead of a
            :exc:`lasio.exceptions.LASHeaderError` exception. The offending
            line is then left out of the result.
        mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics
                             'upper': convert all HeaderItem mnemonics to uppercase
                             'lower': convert all HeaderItem mnemonics to lowercase

    Returns:
        :class:`lasio.las_items.SectionItems`

    Raises:
        lasio.exceptions.LASHeaderError: if a line cannot be parsed and
            ``ignore_header_errors`` is False.

    '''
    title = sectdict["title"]
    assert len(sectdict["lines"]) == len(sectdict["line_nos"])
    parser = SectionParser(title, version=version)

    section = SectionItems()
    assert mnemonic_case in ('upper', 'lower', 'preserve')
    if mnemonic_case != 'preserve':
        section.mnemonic_transforms = True

    for line, line_no in zip(sectdict["lines"], sectdict["line_nos"]):
        if not line:
            continue
        try:
            values = read_line(line)
        except Exception:
            # Only genuine parse errors are reported; KeyboardInterrupt and
            # SystemExit are left to propagate.
            message = 'line {} (section {}): "{}"'.format(
                line_no, title, line)
            if not ignore_header_errors:
                raise exceptions.LASHeaderError(message)
            logger.warning(message)
            continue

        if mnemonic_case == 'upper':
            values['name'] = values['name'].upper()
        elif mnemonic_case == 'lower':
            values['name'] = values['name'].lower()
        section.append(parser(**values))
    return section
475
476
477
class SectionParser(object):

    '''Parse lines from header sections.

    Arguments:
        title (str): title line of section. Used to understand different
            order formatting across the special sections ~C, ~P, ~W, and ~V,
            depending on version 1.2 or 2.0.

    Keyword Arguments:
        version (float): version to parse according to. Default is 1.2.

    '''

    def __init__(self, title, version=1.2):
        # The first two characters of the title (case-insensitive) select
        # both the item factory and the key into defaults.ORDER_DEFINITIONS.
        upper_title = title.upper()
        if upper_title.startswith('~C'):
            self.func = self.curves
            self.section_name2 = "Curves"
        elif upper_title.startswith('~P'):
            self.func = self.params
            self.section_name2 = "Parameter"
        elif upper_title.startswith('~W'):
            self.func = self.metadata
            self.section_name2 = "Well"
        elif upper_title.startswith('~V'):
            self.func = self.metadata
            self.section_name2 = "Version"
        # NOTE(review): any other title leaves ``func`` and ``section_name2``
        # unset, so the lookup below raises AttributeError.

        self.version = version
        self.section_name = title

        defs = defaults.ORDER_DEFINITIONS
        section_orders = defs[self.version][self.section_name2]
        # First entry is the default order; the remaining entries are
        # (order, mnemonics) pairs listing the exceptions to it.
        self.default_order = section_orders[0]
        self.orders = {}
        for order, mnemonics in section_orders[1:]:
            for mnemonic in mnemonics:
                self.orders[mnemonic] = order

    def __call__(self, **keys):
        '''Return the correct object for this type of section.

        Refer to :meth:`lasio.reader.SectionParser.metadata`,
        :meth:`lasio.reader.SectionParser.params`, and
        :meth:`lasio.reader.SectionParser.curves` for the methods actually
        used by this routine.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return self.func(**keys)

    def num(self, x, default=None):
        '''Attempt to parse a number.

        Arguments:
            x (str, int, float): potential number
            default (int, float, None): fall-back option; when None, the
                original ``x`` is used as the fall-back.

        Returns:
            int, float, or default - from most to least preferred types.
            Non-finite floats (nan, inf) are not returned; the fall-back
            is returned instead.

        '''
        if default is None:
            default = x

        # Best effort, in case it is a string: apply the
        # 'comma-decimal-mark' substitution from defaults.READ_SUBS. Any
        # failure (e.g. x is not a string) simply leaves x untouched.
        try:
            pattern, sub = defaults.READ_SUBS['comma-decimal-mark'][0]
            x = re.sub(pattern, sub, x)
        except Exception:
            pass

        # The builtins are used on purpose: the ``np.int`` / ``np.float``
        # aliases no longer exist in current NumPy releases.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            pass

        try:
            x = float(x)
        except (TypeError, ValueError, OverflowError):
            return default

        if np.isfinite(x):
            return x
        return default

    def metadata(self, **keys):
        '''Return HeaderItem correctly formatted according to the order
        prescribed for LAS v 1.2 or 2.0 for the ~W and ~V sections.

        The order is looked up by mnemonic in ``self.orders``, falling back
        to ``self.default_order``. If the order is neither 'value:descr' nor
        'descr:value', None is returned.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        key_order = self.orders.get(keys['name'], self.default_order)
        if key_order == 'value:descr':
            return HeaderItem(
                keys['name'],               # mnemonic
                keys['unit'],               # unit
                self.num(keys['value']),    # value
                keys['descr'],              # descr
            )
        elif key_order == 'descr:value':
            # Same positional slots as above, but the two parsed fields
            # swap places for this order.
            return HeaderItem(
                keys['name'],               # mnemonic
                keys['unit'],               # unit
                keys['descr'],              # descr
                self.num(keys['value']),    # value
            )

    def curves(self, **keys):
        '''Return CurveItem.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`. The value is passed on as
        the parsed string, without numeric conversion.

        '''
        return CurveItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            keys['value'],              # value
            keys['descr'],              # descr
        )

    def params(self, **keys):
        '''Return HeaderItem for ~P section (the same between 1.2 and 2.0 specs)

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return HeaderItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            self.num(keys['value']),    # value
            keys['descr'],              # descr
        )
618
619
def read_line(args, *kwargs, **keywords):
    '''Retained for backwards-compatibility.

    See :func:`lasio.reader.read_header_line`. Every positional and keyword
    argument (e.g. ``pattern=...``) is forwarded to it unchanged, and its
    return value is returned as-is.

    The parameter names ``args`` and ``kwargs`` are historical: ``args`` is
    the first positional argument (the line) and ``kwargs`` collects any
    further positional arguments.

    '''
    return read_header_line(args, *kwargs, **keywords)
627
628
def read_header_line(line, pattern=None):
    '''Split one LAS header line into its four fields.

    The line is parsed with a regular expression -- see LAS file specs for
    more details, but it should basically be in the format::

        name.unit       value : descr

    Arguments:
        line (str): line from a LAS header section
        pattern (str): optional regular expression with named groups to use
            instead of the built-in one. By default the expression is
            chosen depending on whether the line contains a colon.

    Returns:
        A dictionary with keys 'name', 'unit', 'value', and 'descr', each
        containing a string as value. Fields which the pattern does not
        capture are empty strings.

    '''
    if pattern is None:
        fragments = [r'\.?(?P<name>[^.]*)\.', r'(?P<unit>[^\s:]*)']
        if ':' in line:
            fragments += [r'(?P<value>[^:]*):', r'(?P<descr>.*)']
        else:
            # No colon, so there is no description part to capture.
            fragments.append(r'(?P<value>[^:]*)')
        pattern = ''.join(fragments)

    parsed = dict.fromkeys(('name', 'unit', 'value', 'descr'), '')
    for field, text in re.match(pattern, line).groupdict().items():
        text = text.strip()
        if field == 'unit' and text.endswith('.'):
            text = text.strip('.')  # see issue #36
        parsed[field] = text
    return parsed
664

kinverarity1 / lasio

Push — master ( afd9d8...0f19a6 )

parse_header_section() F

Complexity

Size

Duplication

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like