parse_header_section()   C
last analyzed

Complexity

Conditions 11

Size

Total Lines 52

Duplication

Lines 0
Ratio 0 %

Importance

Changes 4
Bugs 1 Features 0
Metric Value
cc 11
dl 0
loc 52
rs 5.3509
c 4
b 1
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex functions like parse_header_section() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common approach to find such a component is to look for statements and variables that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
import codecs
2
import logging
3
import os
4
import re
5
import textwrap
6
import traceback
7
8
import numpy as np
9
10
from . import defaults
11
12
# Convoluted import for StringIO in order to support:
13
#
14
# - Python 3 - io.StringIO
15
# - Python 2 (optimized) - cStringIO.StringIO
16
# - Python 2 (all) - StringIO.StringIO
17
18
try:
19
    import cStringIO as StringIO
20
except ImportError:
21
    try:  # cStringIO not available on this system
22
        import StringIO
23
    except ImportError:  # Python 3
24
        from io import StringIO
25
    else:
26
        from StringIO import StringIO
27
else:
28
    from StringIO import StringIO
29
30
from . import defaults
31
from . import exceptions
32
from .las_items import HeaderItem, CurveItem, SectionItems, OrderedDict
33
34
35
logger = logging.getLogger(__name__)
36
37
URL_REGEXP = re.compile(
38
    r'^(?:http|ftp)s?://'  # http:// or https://
39
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}'
40
    r'\.?|[A-Z0-9-]{2,}\.?)|'  # (cont.) domain...
41
    r'localhost|'  # localhost...
42
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
43
    r'(?::\d+)?'  # optional port
44
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
45
46
47
def open_file(file_ref, **encoding_kwargs):
    '''Open a file if necessary.

    If ``autodetect_encoding=True`` then either ``cchardet`` or ``chardet``
    needs to be installed, or else an ``ImportError`` will be raised.

    Arguments:
        file_ref (file-like object, str): either a filename, an open file
            object, a URL, or a string containing the contents of a file.

    See :func:`lasio.reader.open_with_codecs` for keyword arguments that can be
    used here.

    Returns:
        tuple of an open file-like object, and the encoding that
        was used to decode it (if it were read from disk).

    '''
    encoding = None
    if isinstance(file_ref, str):  # file_ref != file-like object, so what is it?
        lines = file_ref.splitlines()
        # Guard against an empty string: splitlines() returns [] and
        # lines[0] would raise IndexError. Fall through with the original
        # value so the filename branch reports a sensible open error.
        first_line = lines[0] if lines else file_ref
        if URL_REGEXP.match(first_line):  # it's a URL
            logger.info('Loading URL {}'.format(first_line))
            try:
                # Python 2
                import urllib2
                response = urllib2.urlopen(first_line)
                encoding = response.headers.getparam('charset')
                file_ref = StringIO(response.read())
                logger.debug('Retrieved data had encoding {}'.format(encoding))
            except ImportError:
                # Python 3. Open first_line (the URL itself) for consistency
                # with the Python 2 branch above.
                import urllib.request
                response = urllib.request.urlopen(first_line)
                encoding = response.headers.get_content_charset()
                data = response.read()
                if encoding is None:
                    # The server did not declare a charset;
                    # bytes.decode(None) would raise TypeError, so
                    # fall back to UTF-8.
                    file_ref = StringIO(data.decode('utf-8'))
                else:
                    file_ref = StringIO(data.decode(encoding))
                logger.debug('Retrieved data decoded via {}'.format(encoding))
        elif len(lines) > 1:  # it's LAS data as a string.
            file_ref = StringIO(file_ref)
        else:  # it must be a filename
            file_ref, encoding = open_with_codecs(first_line, **encoding_kwargs)
    return file_ref, encoding
def open_with_codecs(filename, encoding=None, encoding_errors='replace',
              autodetect_encoding=True, autodetect_encoding_chars=4000):
    '''
    Read Unicode data from file.

    Arguments:
        filename (str): path to file

    Keyword Arguments:
        encoding (str): character encoding to open file_ref with, using
            :func:`codecs.open`.
        encoding_errors (str): 'strict', 'replace' (default), 'ignore' - how to
            handle errors with encodings (see
            `this section
            <https://docs.python.org/3/library/codecs.html#codec-base-classes>`__
            of the standard library's :mod:`codecs` module for more information)
        autodetect_encoding (str or bool): default True to use
            `chardet <https://github.com/chardet/chardet>`__/`cchardet
            <https://github.com/PyYoshi/cChardet>`__ to detect encoding.
            Note if set to False several common encodings will be tried but
            chardet won't be used.
        autodetect_encoding_chars (int/None): number of chars to read from LAS
            file for auto-detection of encoding.

    Returns:
        tuple of an open file-like object (from :func:`codecs.open`) and the
        encoding (str or None) that was settled on.

    This function is called by :func:`lasio.reader.open_file`.

    '''
    # Normalise the char-count argument: any falsy value means "read the
    # whole file" during auto-detection.
    if autodetect_encoding_chars:
        nbytes = int(autodetect_encoding_chars)
    else:
        nbytes = None

    # Forget [c]chardet - if we can locate the BOM we just assume that's correct.
    # Only the first few bytes are needed to check for a UTF-8 BOM.
    nbytes_test = min(32, os.path.getsize(filename))
    with open(filename, mode='rb') as test:
        raw = test.read(nbytes_test)
    if raw.startswith(codecs.BOM_UTF8):
        encoding = 'utf-8-sig'
        # A BOM settles the question, so suppress the detection stages below.
        autodetect_encoding = False

    # If BOM wasn't found...
    # NOTE: the order of these two if-blocks matters. get_encoding() may
    # return None (e.g. chardet not installed), and clearing
    # autodetect_encoding here deliberately makes the *next* block run as
    # the fallback in that case.
    if (autodetect_encoding) and (not encoding):
        with open(filename, mode='rb') as test:
            if nbytes is None:
                raw = test.read()
            else:
                raw = test.read(nbytes)
        encoding = get_encoding(autodetect_encoding, raw)
        autodetect_encoding = False

    # Or if no BOM found & chardet not installed
    if (not autodetect_encoding) and (not encoding):
        encoding = adhoc_test_encoding(filename)
        if encoding:
            logger.info('{} was found by ad hoc to work but note it might not'
                       ' be the correct encoding'.format(encoding))

    # Now open and return the file-like object
    # (codecs.open with encoding=None falls back to plain binary-mode open).
    logger.info('Opening {} as {} and treating errors with "{}"'.format(
        filename, encoding, encoding_errors))
    file_obj = codecs.open(filename, mode='r', encoding=encoding,
        errors=encoding_errors)
    return file_obj, encoding
def adhoc_test_encoding(filename):
    '''Try some common encodings and return the first one that works.

    Arguments:
        filename (str): path to file

    Returns:
        str or None: the first of 'ascii', 'windows-1252' or 'latin-1' that
        can decode the first line of the file, or None if none of them can.

    '''
    found = None
    for candidate in ('ascii', 'windows-1252', 'latin-1'):
        with codecs.open(filename, mode='r', encoding=candidate) as f:
            try:
                f.readline()
            except UnicodeDecodeError:
                logger.debug(
                    '{} tested, raised UnicodeDecodeError'.format(candidate))
                continue
            found = candidate
            break
    return found
def get_encoding(auto, raw):
    '''
    Automatically detect character encoding.

    Arguments:
        auto (str or bool): auto-detection of character encoding - can be
            either 'chardet', 'cchardet', False, or True (the latter will
            pick the fastest available option)
        raw (bytes): array of bytes to detect from

    Returns:
        A string specifying the character encoding, or None if detection is
        disabled (``auto=False`` or an unrecognised value) or no detection
        library is installed.

    '''
    if auto is True:
        try:
            import cchardet as chardet
        except ImportError:
            try:
                import chardet
            except ImportError:
                logger.debug('chardet or cchardet is recommended for automatic'
                    ' detection of character encodings. Instead trying some'
                    ' common encodings.')
                return None
            else:
                logger.debug('get_encoding Using chardet')
                method = 'chardet'
        else:
            logger.debug('get_encoding Using cchardet')
            method = 'cchardet'
    elif isinstance(auto, str) and auto.lower() == 'chardet':
        import chardet
        logger.debug('get_encoding Using chardet')
        method = 'chardet'
    elif isinstance(auto, str) and auto.lower() == 'cchardet':
        import cchardet as chardet
        logger.debug('get_encoding Using cchardet')
        method = 'cchardet'
    else:
        # auto is False or an unrecognised value: nothing to detect with.
        # Previously this fell through to an AttributeError (False.lower())
        # or a NameError on the undefined `chardet`/`method` names.
        return None
    result = chardet.detect(raw)
    logger.debug('{} method detected encoding of {} at confidence {}'.format(
        method, result['encoding'], result['confidence']))
    return result['encoding']
def read_file_contents(file_obj, regexp_subs, value_null_subs,
                       ignore_data=False):
    '''Read file contents into memory.

    Arguments:
        file_obj (open file-like object)
        regexp_subs (list): (pattern, substr) pairs applied with re.sub() to
            each data line (see defaults.READ_SUBS for examples)
        value_null_subs (list): numerical values to be replaced by numpy.nan

    Keyword Arguments:
        ignore_data (bool): if True, do not read in the numerical data in the
            ~ASCII section

    Returns:
        OrderedDict

    I think of the returned dictionary as a "raw section". The keys are
    the first line of the LAS section, including the tilde. Each value is
    a dict with either::

        {"section_type": "header",
         "title": str,               # title of section (including the ~)
         "lines": [str, ],           # a list of the lines from the lAS file
         "line_nos": [int, ]         # line nos from the original file
         }

    or::

        {"section_type": "data",
         "title": str,              # title of section (including the ~)
         "start_line": int,         # location of data section (the title line)
         "ncols": int,              # no. of columns on first line of data,
         "array": ndarray           # 1-D numpy.ndarray,
         }

    '''
    # Accumulators for the header section currently being read.
    sections = OrderedDict()
    sect_lines = []
    sect_line_nos = []
    sect_title_line = None
    section_exists = False

    for i, line in enumerate(file_obj):
        line = line.strip()
        if not line:
            continue
        if line.upper().startswith('~A'):
            # HARD CODED FOR VERSION 1.2 and 2.0; needs review for 3.0
            # We have finished looking at the metadata and need
            # to start reading numerical data.
            # First flush the header section that was in progress.
            if not sect_title_line is None:
                sections[sect_title_line] = {
                    "section_type": "header",
                    "title": sect_title_line,
                    "lines": sect_lines,
                    "line_nos": sect_line_nos,
                    }
            if not ignore_data:
                try:
                    # read_data_section_iterative consumes the *same* file
                    # iterator from the current position (just after the ~A
                    # title line).
                    data = read_data_section_iterative(file_obj, regexp_subs, value_null_subs)
                except:
                    # NOTE(review): bare except also re-raises on
                    # KeyboardInterrupt/SystemExit, wrapped as LASDataError.
                    # [:-1] drops the trailing newline from format_exc().
                    raise exceptions.LASDataError(
                        traceback.format_exc()[:-1] +
                        ' in data section beginning line {}'.format(i + 1))
                sections[line] = {
                    "section_type": "data",
                    "start_line": i,
                    "title": line,
                    "array": data,
                    }
                logger.debug('Data section ["array"].shape = {}'.format(data.shape))
            # Only one data section is read; stop here either way.
            break

        elif line.startswith('~'):
            if section_exists:
                # We have ended a section and need to start the next
                sections[sect_title_line] = {
                    "section_type": "header",
                    "title": sect_title_line,
                    "lines": sect_lines,
                    "line_nos": sect_line_nos,
                    }
                sect_lines = []
                sect_line_nos = []
            else:
                # We are entering into a section for the first time
                section_exists = True
                pass
            sect_title_line = line # either way... this is the case.

        else:
            # We are in the middle of a section.
            if not line.startswith("#"): # ignore commented-out lines.. for now.
                sect_lines.append(line)
                sect_line_nos.append(i + 1)

    # Find the number of columns in the data section(s). This is only
    # useful if WRAP = NO, but we do it for all since we don't yet know
    # what the wrap setting is.
    # Requires a seekable file object: we rewind and re-scan to find the
    # first line after each data section's title line.

    for section in sections.values():
        if section["section_type"] == "data":
            section["ncols"] = None
            file_obj.seek(0)
            for i, line in enumerate(file_obj):
                if i == section["start_line"] + 1:
                    for pattern, sub_str in regexp_subs:
                        line = re.sub(pattern, sub_str, line)
                    section["ncols"] = len(line.split())
                    break
    return sections
def read_data_section_iterative(file_obj, regexp_subs, value_null_subs):
    '''Read data section into memory.

    Arguments:
        file_obj (open file-like object): should be positioned in line-by-line
            reading mode, with the last line read being the title of the
            ~ASCII data section.
        regexp_subs (list): each item should be a tuple of the pattern and
            substitution string for a call to re.sub() on each line of the
            data section. See defaults.py READ_SUBS and NULL_SUBS for examples.
        value_null_subs (list): list of numerical values to be replaced by
            numpy.nan values.

    Returns:
        A 1-D numpy ndarray.

    '''
    def _tokens(source):
        # Apply the regexp substitutions to each line, then emit every
        # whitespace-separated token, as float64 where possible and as the
        # raw string otherwise.
        for raw_line in source:
            for pattern, replacement in regexp_subs:
                raw_line = re.sub(pattern, replacement, raw_line)
            for token in raw_line.split():
                try:
                    yield np.float64(token)
                except ValueError:
                    yield token

    arr = np.array(list(_tokens(file_obj)))
    for null_value in value_null_subs:
        arr[arr == null_value] = np.nan
    return arr
def get_substitutions(read_policy, null_policy):
    '''Parse read and null policy definitions into a list of regexp and value
    substitutions.

    Arguments:
        read_policy (str, list, or substitution): either (1) a string defined in
            defaults.READ_POLICIES; (2) a list of substitutions as defined by
            the keys of defaults.READ_SUBS; or (3) a list of actual substitutions
            similar to the values of defaults.READ_SUBS. You can mix (2) and (3)
            together if you want.
        null_policy (str, list, or sub): as for read_policy but for
            defaults.NULL_POLICIES and defaults.NULL_SUBS

    Returns:
        regexp_subs, value_null_subs, version_NULL - two lists and a bool.
        The first list is pairs of regexp patterns and substrs, and the second
        list is just a list of floats or integers. The bool is whether or not
        'NULL' was located as a substitution.

    '''
    regexp_subs = []
    numerical_subs = []
    version_NULL = False

    policy_sets = (
        ('read', read_policy, defaults.READ_POLICIES, defaults.READ_SUBS),
        ('null', null_policy, defaults.NULL_POLICIES, defaults.NULL_SUBS),
    )
    for policy_typ, policy, policy_subs, subs in policy_sets:
        # A policy may be a named policy (a key of policy_subs) or a list
        # of substitution names / literal substitutions. An unhashable
        # policy (e.g. a list) cannot be a dict key, hence the TypeError
        # guard.
        try:
            named_policy = policy in policy_subs
        except TypeError:
            named_policy = False

        resolved = []
        if named_policy:
            logger.debug('using {} policy of "{}"'.format(policy_typ, policy))
            for sub in policy_subs[policy]:
                logger.debug('adding substitution {}'.format(sub))
                if sub in subs:
                    resolved += subs[sub]
                if sub == 'NULL':
                    logger.debug('located substition for LAS.version.NULL as True')
                    version_NULL = True
        else:
            for item in policy:
                if item in subs:
                    resolved += subs[item]
                    if item == 'NULL':
                        logger.debug('located substition for LAS.version.NULL as True')
                        version_NULL = True
                else:
                    resolved.append(item)

        # Classify each resolved substitution: iterables are (pattern,
        # substr) regexp pairs; plain numbers are value substitutions.
        for item in resolved:
            try:
                iter(item)
            except TypeError:
                logger.debug('added numerical substitution: {}'.format(item))
                numerical_subs.append(item)
            else:
                logger.debug('added regexp substitution: pattern={} substr="{}"'.format(item[0], item[1]))
                regexp_subs.append(item)

    numerical_subs = [n for n in numerical_subs if n is not None]

    return regexp_subs, numerical_subs, version_NULL
def parse_header_section(sectdict, version, ignore_header_errors=False,
                         mnemonic_case='preserve'):
    '''Parse a header section dict into a SectionItems containing HeaderItems.

    Arguments:
        sectdict (dict): object returned from
            :func:`lasio.reader.read_file_contents`
        version (float): either 1.2 or 2.0

    Keyword Arguments:
        ignore_header_errors (bool): if True, issue HeaderItem parse errors
            as :func:`logging.warning` calls instead of a
            :exc:`lasio.exceptions.LASHeaderError` exception.
        mnemonic_case (str): 'preserve': keep the case of HeaderItem mnemonics
                             'upper': convert all HeaderItem mnemonics to uppercase
                             'lower': convert all HeaderItem mnemonics to lowercase

    Returns:
        :class:`lasio.las_items.SectionItems`

    '''
    title = sectdict["title"]
    assert len(sectdict["lines"]) == len(sectdict["line_nos"])
    parser = SectionParser(title, version=version)

    section = SectionItems()
    assert mnemonic_case in ('upper', 'lower', 'preserve')
    if mnemonic_case != 'preserve':
        section.mnemonic_transforms = True

    # Walk the section's lines together with their original file line
    # numbers so error messages can point at the right place in the file.
    for line, line_no in zip(sectdict["lines"], sectdict["line_nos"]):
        if not line:
            continue
        try:
            values = read_line(line)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate.
            message = 'line {} (section {}): "{}"'.format(line_no, title, line)
            if ignore_header_errors:
                logger.warning(message)
            else:
                raise exceptions.LASHeaderError(message)
        else:
            if mnemonic_case == 'upper':
                values['name'] = values['name'].upper()
            elif mnemonic_case == 'lower':
                values['name'] = values['name'].lower()
            section.append(parser(**values))
    return section
class SectionParser(object):

    '''Parse lines from header sections.

    Arguments:
        title (str): title line of section. Used to understand different
            order formatting across the special sections ~C, ~P, ~W, and ~V,
            depending on version 1.2 or 2.0.

    Keyword Arguments:
        version (float): version to parse according to. Default is 1.2.

    '''

    def __init__(self, title, version=1.2):
        # NOTE(review): if title starts with none of ~C/~P/~W/~V, neither
        # self.func nor self.section_name2 is assigned and the lookup below
        # raises AttributeError -- confirm callers only pass these titles.
        if title.upper().startswith('~C'):
            self.func = self.curves
            self.section_name2 = "Curves"
        elif title.upper().startswith('~P'):
            self.func = self.params
            self.section_name2 = "Parameter"
        elif title.upper().startswith('~W'):
            self.func = self.metadata
            self.section_name2 = "Well"
        elif title.upper().startswith('~V'):
            self.func = self.metadata
            self.section_name2 = "Version"

        self.version = version
        self.section_name = title

        # Build mnemonic -> field-order ('value:descr' or 'descr:value')
        # lookup from the defaults for this LAS version and section.
        defs = defaults.ORDER_DEFINITIONS
        section_orders = defs[self.version][self.section_name2]
        self.default_order = section_orders[0]
        self.orders = {}
        for order, mnemonics in section_orders[1:]:
            for mnemonic in mnemonics:
                self.orders[mnemonic] = order

    def __call__(self, **keys):
        '''Return the correct object for this type of section.

        Refer to :meth:`lasio.reader.SectionParser.metadata`,
        :meth:`lasio.reader.SectionParser.params`, and
        :meth:`lasio.reader.SectionParser.curves` for the methods actually
        used by this routine.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        item = self.func(**keys)
        return item

    def num(self, x, default=None):
        '''Attempt to parse a number.

        Arguments:
            x (str, int, float): potential number
            default (int, float, None): fall-back option

        Returns:
            int, float, or **default** - from most to least preferred types.

        '''
        if default is None:
            default = x

        # in case it is a string using a comma as the decimal mark.
        try:
            pattern, sub = defaults.READ_SUBS['comma-decimal-mark'][0]
            x = re.sub(pattern, sub, x)
        except Exception:
            pass

        # np.int and np.float were aliases for the builtins and were
        # removed in NumPy 1.24; with them the old code always fell into
        # the except branches and returned the default unparsed.
        try:
            return int(x)
        except Exception:
            try:
                x = float(x)
            except Exception:
                return default
        # Reject inf/nan in favour of the fall-back value.
        if np.isfinite(x):
            return x
        else:
            return default

    def metadata(self, **keys):
        '''Return HeaderItem correctly formatted according to the order
        prescribed for LAS v 1.2 or 2.0 for the ~W section.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        key_order = self.orders.get(keys['name'], self.default_order)
        if key_order == 'value:descr':
            return HeaderItem(
                keys['name'],                 # mnemonic
                keys['unit'],                 # unit
                self.num(keys['value']),      # value
                keys['descr'],                # descr
            )
        elif key_order == 'descr:value':
            return HeaderItem(
                keys['name'],                   # mnemonic
                keys['unit'],                   # unit
                keys['descr'],                  # descr
                self.num(keys['value']),        # value
            )

    def curves(self, **keys):
        '''Return CurveItem.

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        item = CurveItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            keys['value'],              # value
            keys['descr'],              # descr
        )
        return item

    def params(self, **keys):
        '''Return HeaderItem for ~P section (the same between 1.2 and 2.0 specs)

        Keyword arguments should be the key:value pairs returned by
        :func:`lasio.reader.read_header_line`.

        '''
        return HeaderItem(
            keys['name'],               # mnemonic
            keys['unit'],               # unit
            self.num(keys['value']),    # value
            keys['descr'],              # descr
        )
def read_line(*args, **kwargs):
    '''Retained for backwards-compatibility.

    Forwards all positional and keyword arguments unchanged to
    :func:`lasio.reader.read_header_line` and returns its result.

    See :func:`lasio.reader.read_header_line`.

    '''
    return read_header_line(*args, **kwargs)
def read_header_line(line, pattern=None):
    '''Parse one LAS header line into its component fields.

    The expected layout (see the LAS file specs for details) is::

        name.unit       value : descr

    Arguments:
        line (str): line from a LAS header section
        pattern (str): optional regular expression with named groups
            'name', 'unit', 'value' and (optionally) 'descr'; if None, a
            pattern is chosen based on whether the line contains a colon.

    Returns:
        dict with keys 'name', 'unit', 'value', and 'descr'; each value is
        a stripped string ('' for any group the pattern does not capture).

    '''
    fields = {'name': '', 'unit': '', 'value': '', 'descr': ''}
    if pattern is None:
        name_unit_value = (r'\.?(?P<name>[^.]*)\.'
                           r'(?P<unit>[^\s:]*)'
                           r'(?P<value>[^:]*)')
        if ':' in line:
            pattern = name_unit_value + r':(?P<descr>.*)'
        else:
            # Without a colon there is no description field to capture.
            pattern = name_unit_value
    match = re.match(pattern, line)
    for key, value in match.groupdict().items():
        stripped = value.strip()
        if key == 'unit' and stripped.endswith('.'):
            stripped = stripped.strip('.')  # see issue #36
        fields[key] = stripped
    return fields