Passed
Push — master ( 8dafba...5f5381 )
by Konstantin
02:59 queued 01:15
created

ocrd_models.utils.handle_oai_response()   A

Complexity

Conditions 5

Size

Total Lines 13
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 10
dl 0
loc 13
rs 9.3333
c 0
b 0
f 0
cc 5
nop 1
1
"""
2
Utilities for ocrd_models
3
"""
4
from lxml import etree as ET
5
6
from ocrd_utils import getLogger
7
from .constants import NAMESPACES as NS
8
9
# Names exported by ``from ocrd_models.utils import *``.
__all__ = [
    'xmllint_format',
    'handle_oai_response',
    'is_oai_content',
    'extract_mets_from_oai_content',
]

# Module-level logger shared by the helpers below.
log = getLogger('ocrd_models.utils')
17
18
def xmllint_format(xml):
    """
    Pretty-print XML like ``xmllint`` does.

    The input is parsed with ``resolve_entities=False``, ``strip_cdata=False``
    and ``remove_blank_text=True``, then serialized again with
    ``pretty_print=True``.

    Arguments:
        xml (string): Serialized XML

    Returns:
        bytes: the UTF-8 encoded result, i.e. a fixed XML declaration line
        followed by the pretty-printed document.
    """
    xml_parser = ET.XMLParser(resolve_entities=False, strip_cdata=False, remove_blank_text=True)
    root = ET.fromstring(xml, xml_parser)
    pretty_body = ET.tostring(root, pretty_print=True, encoding='UTF-8').decode('utf-8')
    # The declaration is written by hand so it uses double quotes.
    declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    return '\n'.join((declaration, pretty_body)).encode('utf-8')
29
30
def handle_oai_response(response):
    """
    In case of a valid OAI-Response, extract first METS-Entry-Data

    Arguments:
        response: object exposing a ``headers`` mapping and a ``content``
            attribute (presumably a ``requests.Response`` -- confirm
            against callers).

    Returns:
        The result of :func:`extract_mets_from_oai_content` when the body is
        textual/XML and :func:`is_oai_content` accepts it; otherwise
        ``response.content`` unchanged.

    Raises:
        Exception: anything other than ``ET.LxmlError`` raised by the helper
            functions (e.g. an OAI response without a METS section) is
            propagated to the caller.
    """
    # A response without a Content-Type header is treated as non-textual
    # instead of failing with a KeyError.
    content_type = response.headers.get('Content-Type') or ''
    if 'xml' in content_type or 'text' in content_type:
        content = response.content
        try:
            if is_oai_content(content):
                return extract_mets_from_oai_content(content)
        except ET.LxmlError as exc:
            # Textual payload that lxml cannot handle: log and fall through
            # to returning the raw content.
            log.warning("textual response but no xml: %s (%s)", content, exc)
    return response.content
43
44
45
def is_oai_content(data):
    """
    Return True if data is an OAI-PMH request/response

    Arguments:
        data: serialized XML, parsed with ``ET.fromstring``.

    Returns:
        bool: whether the root element's tag ends with ``OAI-PMH``.

    Raises:
        ET.LxmlError: presumably raised by the parser when ``data`` is not
            well-formed XML (the caller in this module catches it).
    """
    xml_root = ET.fromstring(data)
    root_tag = xml_root.tag
    # Lazy logging arguments, consistent with the other log calls in this
    # module; the rendered message is unchanged.
    log.info("response data root.tag: '%s'", root_tag)
    return str(root_tag).endswith('OAI-PMH')
53
54
55
def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="UTF-8"?>'):
    """
    Extract METS from an OAI-PMH GetRecord response

    Arguments:
        data: serialized XML, parsed with ``ET.fromstring``.
        preamble (string): line placed before the serialized METS element.

    Returns:
        ``data`` unchanged when its root element already is ``mets``;
        otherwise bytes holding ``preamble``, a newline and the
        pretty-printed first ``mets`` descendant, UTF-8 encoded and with
        every ``\\n`` replaced by ``\\r\\n``.

    Raises:
        Exception: if the document contains no ``mets`` element in the
            METS namespace.
    """
    xml_root = ET.fromstring(data)
    # Compare the local name exactly. A substring test on the full tag would
    # also match namespace URIs or element names that merely contain "mets".
    if ET.QName(xml_root).localname == 'mets':
        return data
    mets_root_el = xml_root.find('.//{%s}mets' % NS['mets'])
    if mets_root_el is None:
        raise Exception("Missing mets-section in %s" % data)
    # Serialize the METS element as a document of its own.
    new_tree = ET.ElementTree(mets_root_el)
    xml_formatted = ET.tostring(new_tree,
                                pretty_print=True,
                                encoding='UTF-8').decode('UTF-8')
    formatted_content = '{}\n{}'.format(preamble, xml_formatted)
    return formatted_content.encode('UTF-8').replace(b'\n', b'\r\n')
72