| 1 |  |  | """ | 
            
                                                        
            
                                    
            
            
                | 2 |  |  | Utilities for ocrd_models | 
            
                                                        
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                        
            
                                    
            
            
                | 4 |  |  | from lxml import etree as ET | 
            
                                                        
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 6 |  |  | from ocrd_utils import getLogger | 
            
                                                        
            
                                    
            
            
                | 7 |  |  | from .constants import NAMESPACES as NS | 
            
                                                        
            
                                    
            
            
                | 8 |  |  |  | 
            
                                                        
            
                                    
            
            
                # Names exported by ``from ocrd_models.utils import *``
                __all__ = [
                    'xmllint_format',
                    'handle_oai_response',
                    'is_oai_content',
                    'extract_mets_from_oai_content',
                ]

                # Module-level logger used by the OAI helpers in this module
                log = getLogger('ocrd_models.utils')
            
                                                        
            
                                    
            
            
                | 17 |  |  |  | 
            
                                                        
            
                                    
            
            
                def xmllint_format(xml):
                    """
                    Pretty-print XML like ``xmllint`` does.

                    The input is re-parsed with insignificant whitespace dropped, CDATA
                    sections kept and entities left unresolved, then serialized with
                    indentation and prefixed with a fixed XML declaration.

                    Arguments:
                        xml (string): Serialized XML

                    Returns:
                        bytes: The UTF-8 encoded, pretty-printed document
                    """
                    declaration = '<?xml version="1.0" encoding="UTF-8"?>'
                    lenient_parser = ET.XMLParser(
                        resolve_entities=False,
                        strip_cdata=False,
                        remove_blank_text=True,
                    )
                    root = ET.fromstring(xml, lenient_parser)
                    serialized = ET.tostring(root, pretty_print=True, encoding='UTF-8')
                    pretty = '%s\n%s' % (declaration, serialized.decode('utf-8'))
                    return pretty.encode('utf-8')
            
                                                        
            
                                    
            
            
                | 29 |  |  |  | 
            
                                                        
            
                                    
            
            
                def handle_oai_response(response):
                    """
                    In case of a valid OAI-Response, extract first METS-Entry-Data

                    Arguments:
                        response: HTTP response object providing ``headers`` (a mapping
                            with a ``get`` method) and ``content`` (the raw body);
                            presumably a ``requests.Response`` -- confirm against callers

                    Returns:
                        The result of :py:func:`extract_mets_from_oai_content` when the body
                        is textual/XML and recognized as OAI-PMH, otherwise
                        ``response.content`` unchanged.
                    """
                    # A response without a Content-Type header is treated as non-textual
                    # instead of failing with a KeyError; media types are case-insensitive,
                    # hence the lower-casing before the substring checks.
                    content_type = (response.headers.get('Content-Type') or '').lower()
                    if 'xml' in content_type or 'text' in content_type:
                        content = response.content
                        try:
                            if is_oai_content(content):
                                return extract_mets_from_oai_content(content)
                        except ET.LxmlError as exc:
                            # Textual body that is not parseable as XML: log it and fall
                            # through to returning the body untouched.
                            log.warning("textual response but no xml: %s (%s)", content, exc)
                    return response.content
            
                                                        
            
                                    
            
            
                | 43 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 44 |  |  |  | 
            
                                                        
            
                                    
            
            
                def is_oai_content(data):
                    """
                    Return True if data is an OAI-PMH request/response

                    Only the tag of the document's root element is inspected: the result is
                    True when that tag ends with ``OAI-PMH``.

                    Arguments:
                        data: Serialized XML to inspect

                    Returns:
                        bool: Whether the root element's tag ends with ``OAI-PMH``

                    Errors raised by lxml while parsing ``data`` propagate to the caller.
                    """
                    # Only the root tag is needed, so entities never have to be expanded;
                    # leaving them unresolved avoids entity expansion on data that may come
                    # straight from the network.
                    parser = ET.XMLParser(resolve_entities=False)
                    xml_root = ET.fromstring(data, parser)
                    root_tag = xml_root.tag
                    # Pass the argument to the logger instead of pre-formatting the message,
                    # so the string is only built when the record is actually emitted.
                    log.info("response data root.tag: '%s'", root_tag)
                    return str(root_tag).endswith('OAI-PMH')
            
                                                        
            
                                    
            
            
                | 53 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 54 |  |  |  | 
            
                                                        
            
                                    
            
            
                def extract_mets_from_oai_content(data, preamble='<?xml version="1.0" encoding="UTF-8"?>'):
                    """
                    Extract METS from an OAI-PMH GetRecord response

                    If the root element's tag already contains ``mets``, ``data`` is returned
                    untouched. Otherwise the first ``mets`` element in the METS namespace is
                    serialized on its own, prefixed with ``preamble`` and returned with
                    CRLF line endings.

                    Arguments:
                        data: Serialized XML of the response
                        preamble (string): Line placed in front of the serialized METS

                    Returns:
                        ``data`` itself, or the extracted METS document as UTF-8 bytes

                    Raises:
                        Exception: If no METS element can be found in ``data``
                    """
                    root = ET.fromstring(data)
                    if 'mets' in root.tag:
                        # Nothing to unwrap
                        return data

                    mets_element = root.find('.//{%s}mets' % NS['mets'])
                    if mets_element is None:
                        raise Exception("Missing mets-section in %s" % data)

                    # Serialize the METS subtree as a document of its own
                    mets_bytes = ET.tostring(ET.ElementTree(mets_element),
                                             pretty_print=True,
                                             encoding='UTF-8')
                    document = '{}\n{}'.format(preamble, mets_bytes.decode('UTF-8'))
                    return document.encode('UTF-8').replace(b'\n', b'\r\n')
            
                                                        
            
                                    
            
            
                | 72 |  |  |  |