|
1
|
|
|
""" |
|
2
|
|
|
Utilities for ocrd_models |
|
3
|
|
|
""" |
|
4
|
|
|
from lxml import etree as ET |
|
5
|
|
|
|
|
6
|
|
|
from ocrd_utils import getLogger |
|
7
|
|
|
from .constants import NAMESPACES as NS |
|
8
|
|
|
|
|
9
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = [
    'xmllint_format',
    'handle_oai_response',
    'is_oai_content',
    'extract_mets_from_oai_content',
]

# Module-level logger used by the OAI helpers in this file.
log = getLogger('ocrd_models.utils')
|
17
|
|
|
|
|
18
|
|
|
def xmllint_format(xml):
    """
    Pretty-print XML like ``xmllint`` does.

    Arguments:
        xml (string): Serialized XML

    Returns:
        bytes: the UTF-8 encoded, pretty-printed document, preceded by an
        XML declaration line using double quotes.
    """
    # Blank text is dropped so that pretty_print can re-indent the tree;
    # CDATA sections are kept and entities are left unresolved.
    blank_stripping_parser = ET.XMLParser(
        remove_blank_text=True,
        strip_cdata=False,
        resolve_entities=False,
    )
    root = ET.fromstring(xml, blank_stripping_parser)
    # With an explicit encoding, tostring() yields UTF-8 bytes, so the
    # declaration can be prepended directly as bytes.
    body = ET.tostring(root, pretty_print=True, encoding='UTF-8')
    return b'\n'.join((b'<?xml version="1.0" encoding="UTF-8"?>', body))
|
29
|
|
|
|
|
30
|
|
|
def handle_oai_response(response):
    """
    In case of a valid OAI-Response, extract first METS-Entry-Data

    Arguments:
        response: HTTP response object exposing a ``headers`` mapping and a
            ``content`` attribute with the body (presumably a
            ``requests.Response`` -- confirm against callers).

    Returns:
        The result of ``extract_mets_from_oai_content`` if the body is
        textual/XML and its root element is an OAI-PMH envelope, otherwise
        the unmodified ``response.content``.

    Raises:
        Exception: propagated from ``extract_mets_from_oai_content`` when the
            OAI-PMH envelope contains no METS element.
    """
    # A missing Content-Type header must not abort processing, so fall back
    # to an empty string; media types are case-insensitive, hence lower().
    content_type = (response.headers.get('Content-Type') or '').lower()
    if 'xml' in content_type or 'text' in content_type:
        content = response.content
        try:
            if is_oai_content(content):
                return extract_mets_from_oai_content(content)
        except ET.LxmlError as exc:
            # Textual payload that is not parseable XML: log it and hand
            # the raw body back to the caller below.
            log.warning("textual response but no xml: %s (%s)", content, exc)
    return response.content
|
43
|
|
|
|
|
44
|
|
|
|
|
45
|
|
|
def is_oai_content(data):
    """
    Return True if data is an OAI-PMH request/response

    Arguments:
        data: Serialized XML, in any form accepted by
            ``lxml.etree.fromstring``

    Returns:
        bool: whether the tag of the document's root element ends with
        ``OAI-PMH``.

    Raises:
        lxml.etree.LxmlError: if ``data`` cannot be parsed as XML
    """
    root_tag = ET.fromstring(data).tag
    # Pass the tag as a logging argument so the message is only interpolated
    # when the record is actually emitted.
    log.info("response data root.tag: '%s'", root_tag)
    # lxml reports namespaced tags as ``{uri}local``, so only the suffix
    # is compared.
    return str(root_tag).endswith('OAI-PMH')
|
53
|
|
|
|
|
54
|
|
|
|
|
55
|
|
|
def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="UTF-8"?>'):
    """
    Extract METS from an OAI-PMH GetRecord response

    Arguments:
        data: Serialized XML, either already a METS document or an envelope
            that contains one
        preamble (string): Line placed in front of the extracted METS

    Returns:
        ``data`` unchanged if the root tag contains ``mets``; otherwise the
        first ``mets`` descendant in the METS namespace, pretty-printed,
        prefixed with ``preamble``, UTF-8 encoded and with every ``\\n``
        replaced by ``\\r\\n``.

    Raises:
        Exception: if no METS element can be found in ``data``
    """
    root = ET.fromstring(data)
    # Already a METS document: hand the input back untouched.
    if 'mets' in root.tag:
        return data

    mets_el = root.find('.//{%s}mets' % NS['mets'])
    if mets_el is None:
        raise Exception("Missing mets-section in %s" % data)

    # Serialize the METS element through its own ElementTree wrapper.
    serialized = ET.tostring(ET.ElementTree(mets_el),
                             pretty_print=True,
                             encoding='UTF-8')
    document = '{}\n{}'.format(preamble, serialized.decode('UTF-8'))
    return document.encode('UTF-8').replace(b'\n', b'\r\n')
|
72
|
|
|
|