| 1 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | Processor base class and helper functions. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | __all__ = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |     'Processor', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |     'generate_processor_help', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  |     'run_cli', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |     'run_processor' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | ] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | from pkg_resources import resource_filename | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | from os.path import exists | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | from shutil import copyfileobj | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | import json | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | import os | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | from os import getcwd | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | from pathlib import Path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | import sys | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | import tarfile | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  | import io | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | from ocrd_utils import ( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |     VERSION as OCRD_VERSION, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |     MIMETYPE_PAGE, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |     MIME_TO_EXT, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |     getLogger, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |     initLogging, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |     list_resource_candidates, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |     pushd_popd, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |     list_all_resources, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |     get_processor_resource_types | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  | ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  | from ocrd_validators import ParameterValidator | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  | from ocrd_models.ocrd_page import MetadataItemType, LabelType, LabelsType | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  | # XXX imports must remain for backwards-compatibility | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  | from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  | class Processor(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |     """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |     A processor is a tool that implements the uniform OCR-D command-line interface | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |     for run-time data processing. That is, it executes a single workflow step, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |     or a combination of workflow steps, on the workspace (represented by local METS). | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |     It reads input files for all or requested physical pages of the input fileGrp(s), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |     and writes output files for them into the output fileGrp(s). It may take  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |     a number of optional or mandatory parameters. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |     """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |     def __init__( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |             self, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |             workspace, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |             ocrd_tool=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |             parameter=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |             # TODO OCR-D/core#274 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |             # input_file_grp=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |             # output_file_grp=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |             input_file_grp="INPUT", | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |             output_file_grp="OUTPUT", | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |             page_id=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |             show_resource=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |             list_resources=False, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |             show_help=False, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |             show_version=False, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 65 |  |  |             dump_json=False, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 66 |  |  |             version=None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 67 |  |  |     ): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 |  |  |         Instantiate, but do not process. Unless ``list_resources`` or | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |         ``show_resource`` or ``show_help`` or ``show_version`` or | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |         ``dump_json`` is true, setup for processing (parsing and | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |         validating parameters, entering the workspace directory). | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |         Args: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |              workspace (:py:class:`~ocrd.Workspace`): The workspace to process. \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |                  Can be ``None`` even for processing (esp. on multiple workspaces), \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |                  but then needs to be set before running. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |         Keyword Args: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |              ocrd_tool (string): JSON of the ocrd-tool description for that processor. \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |                  Can be ``None`` for processing, but needs to be set before running. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |              parameter (string): JSON of the runtime choices for ocrd-tool ``parameters``. \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |                  Can be ``None`` even for processing, but then needs to be set before running. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |              input_file_grp (string): comma-separated list of METS ``fileGrp``s used for input. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |              output_file_grp (string): comma-separated list of METS ``fileGrp``s used for output. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |              page_id (string): comma-separated list of METS physical ``page`` IDs to process \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |                  (or empty for all pages). | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |              show_resource (string): If not ``None``, then instead of processing, resolve \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |                  given resource by name and print its contents to stdout. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |              list_resources (boolean): If true, then instead of processing, find all installed \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |                  resource files in the search paths and print their path names. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |              show_help (boolean): If true, then instead of processing, print a usage description \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |                  including the standard CLI and all of this processor's ocrd-tool parameters and \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |                  docstrings. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |              show_version (boolean): If true, then instead of processing, print information on \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |                  this processor's version and OCR-D version. Exit afterwards. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |              dump_json (boolean): If true, then instead of processing, print :py:attr:`ocrd_tool` \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |                  on stdout. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |         self.ocrd_tool = ocrd_tool | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |         if parameter is None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |             parameter = {} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |         if dump_json: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |             print(json.dumps(ocrd_tool, indent=True)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |             return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |         if list_resources: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |             for res in self.list_all_resources(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |                 print(res) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |             return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |         if show_resource: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |             initLogging() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |             res_fname = self.resolve_resource(show_resource) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |             fpath = Path(res_fname) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |             if fpath.is_dir(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |                 with pushd_popd(fpath): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |                     fileobj = io.BytesIO() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |                     with tarfile.open(fileobj=fileobj, mode='w:gz') as tarball: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |                         tarball.add('.') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |                     fileobj.seek(0) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |                     copyfileobj(fileobj, sys.stdout.buffer) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |             else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |                 sys.stdout.buffer.write(fpath.read_bytes()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |             return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |         if show_help: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |             self.show_help() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |             return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |         self.version = version | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |         if show_version: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |             self.show_version() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |             return | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |         self.workspace = workspace | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |         # FIXME HACK would be better to use pushd_popd(self.workspace.directory) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |         # but there is no way to do that in process here since it's an | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |         # overridden method. chdir is almost always an anti-pattern. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |         if self.workspace: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |             self.old_pwd = getcwd() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |             os.chdir(self.workspace.directory) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |         self.input_file_grp = input_file_grp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |         self.output_file_grp = output_file_grp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |         self.page_id = None if page_id == [] or page_id is None else page_id | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |         parameterValidator = ParameterValidator(ocrd_tool) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |         report = parameterValidator.validate(parameter) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |         if not report.is_valid: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |             raise Exception("Invalid parameters %s" % report.errors) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |         self.parameter = parameter | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def show_help(self):
                    """
                    Print the help text generated for this processor to standard output.
                    """
                    help_text = generate_processor_help(self.ocrd_tool, processor_instance=self)
                    print(help_text)
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def show_version(self):
                    """
                    Print this processor's version together with the ocrd/core version.
                    """
                    versions = (self.version, OCRD_VERSION)
                    print("Version %s, ocrd/core %s" % versions)
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def verify(self):
                    """
                    Check whether :py:attr:`input_file_grp` meets the processor's requirements.

                    This implementation performs no checks and always reports success.
                    """
                    verified = True
                    return verified
            
                                                                                                            
                            
            
                                    
            
            
                | 157 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def process(self):
                    """
                    Process the :py:attr:`workspace`
                    from the given :py:attr:`input_file_grp`
                    to the given :py:attr:`output_file_grp`
                    for the given :py:attr:`page_id`
                    under the given :py:attr:`parameter`.

                    (This contains the main functionality and needs to be overridden by subclasses.)

                    Raises:
                        NotImplementedError: always, in this implementation.
                    """
                    # NotImplementedError is a subclass of Exception, so callers that
                    # caught the generic Exception raised here before keep working.
                    raise NotImplementedError("Must be implemented")
            
                                                                                                            
                            
            
                                    
            
            
                | 169 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 170 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def add_metadata(self, pcgts):
                    """
                    Append a PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem``
                    to the metadata of :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.

                    The item has type ``processingStep`` and carries two label groups:
                    one with the runtime parameters and one with the versions of the
                    executable and of ocrd/core.
                    """
                    add_item = pcgts.get_Metadata().add_MetadataItem
                    step_name = self.ocrd_tool['steps'][0]
                    executable = self.ocrd_tool['executable']
                    # One label per runtime parameter, keyed by the parameter name.
                    parameter_labels = LabelsType(
                        externalModel="ocrd-tool",
                        externalId="parameters",
                        Label=[LabelType(type_=key, value=setting)
                               for key, setting in self.parameter.items()])
                    # Versions of the processor executable and of ocrd/core.
                    version_labels = LabelsType(
                        externalModel="ocrd-tool",
                        externalId="version",
                        Label=[
                            LabelType(type_=executable, value=self.version),
                            LabelType(type_='ocrd/core', value=OCRD_VERSION),
                        ])
                    add_item(MetadataItemType(
                        type_="processingStep",
                        name=step_name,
                        value=executable,
                        Labels=[parameter_labels, version_labels]))
            
                                                                                                            
                            
            
                                    
            
            
                | 194 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def resolve_resource(self, val):
                    """
                    Resolve a resource name to a file path with the algorithm in
                    https://ocr-d.de/en/spec/ocrd_tool#file-parameters

                    Args:
                        val (string): resource value to resolve

                    Returns:
                        ``val`` unchanged if it exists as a path, otherwise the first
                        existing candidate from ``list_resource_candidates``.

                    If no existing path is found, an error is logged and the process
                    exits with status 1.
                    """
                    executable = self.ocrd_tool['executable']
                    log = getLogger('ocrd.%s.resolve_resource' % executable)
                    if exists(val):
                        log.debug("Resolved to absolute path %s", val)
                        return val
                    # Prefer the working directory saved in ``old_pwd`` when it is set,
                    # otherwise use the current one.
                    cwd = self.old_pwd if hasattr(self, 'old_pwd') else getcwd()
                    candidates = list_resource_candidates(executable, val,
                                                          cwd=cwd, moduled=self.moduledir)
                    # Only the first existing candidate is needed, so stop probing there.
                    found = next((cand for cand in candidates if exists(cand)), None)
                    if found is not None:
                        log.debug("Resolved %s to absolute path %s", val, found)
                        return found
                    log.error("Could not find resource '%s' for executable '%s'. "
                              "Try 'ocrd resmgr download %s %s' to download this resource.",
                              val, executable, executable, val)
                    sys.exit(1)
            
                                                                                                            
                            
            
                                    
            
            
                | 222 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def list_all_resources(self):
                    """
                    Iterate over the resources of this processor found in the filesystem,
                    filtered by content-type (judged from the filename suffix).

                    Unless the processor accepts ``*/*``:

                    - directories are only kept if ``text/directory`` is accepted
                    - regular files are only kept if their suffix equals the extension
                      registered for at least one accepted MIME type (a MIME type with
                      no registered extension matches any suffix)

                    Anything that is neither a directory nor a regular file is kept.

                    Yields:
                        :py:class:`pathlib.Path` objects of the matching resources.
                    """
                    mimetypes = get_processor_resource_types(None, self.ocrd_tool)
                    candidates = list_all_resources(self.ocrd_tool['executable'], moduled=self.moduledir)
                    for path in map(Path, candidates):
                        if '*/*' not in mimetypes:
                            if path.is_dir() and 'text/directory' not in mimetypes:
                                continue
                            # unknown MIME types fall back to the file's own suffix,
                            # i.e. they accept every file; known ones require a suffix match
                            if path.is_file() and all(path.suffix != MIME_TO_EXT.get(mime, path.suffix)
                                                      for mime in mimetypes):
                                continue
                        yield path
            
                                                                                                            
                            
            
                                    
            
            
                | 238 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                @property
                def module(self):
                    """
                    Name of the top-level module (package) this processor is defined in,
                    i.e. the part of ``__module__`` before the first dot.
                    """
                    top_level, _, _ = self.__module__.partition('.')
                    return top_level
            
                                                                                                            
                                                                
            
                                    
            
            
                | 245 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                @property
                def moduledir(self):
                    """
                    Filesystem path of the directory of the top-level module
                    (see :py:attr:`module`), as resolved by ``resource_filename``
                    for an empty resource name.
                    """
                    package = self.module
                    return resource_filename(package, '')
            
                                                                                                            
                            
            
                                    
            
            
                | 252 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                @property
                def input_files(self):
                    """
                    List the input files (for single-valued :py:attr:`input_file_grp`).

                    This delegates to :py:meth:`zip_input_files` (without MIME type
                    filter, aborting on ambiguity) and unpacks the 1-tuples it returns.

                    For each physical page:

                    - If there is a single PAGE-XML for the page, take it (and forget
                      about all other files for that page)
                    - Else if there is a single image file, take it (and forget about
                      all other files for that page)
                    - Otherwise raise an error (complaining that only PAGE-XML warrants
                      having multiple images for a single page)

                    See `Algorithm <https://github.com/cisocrgroup/ocrd_cis/pull/57#issuecomment-656336593>`_

                    Returns:
                        A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` objects
                        (empty if no input files were found).

                    Raises:
                        ValueError: if :py:attr:`input_file_grp` is not set.
                        AssertionError: if more than one input fileGrp was zipped.
                    """
                    if not self.input_file_grp:
                        raise ValueError("Processor is missing input fileGrp")
                    file_tuples = self.zip_input_files(mimetype=None, on_error='abort')
                    if file_tuples:
                        # only the first tuple is inspected: all tuples have the same arity
                        assert len(file_tuples[0]) == 1, 'Use zip_input_files() instead of input_files when processing multiple input fileGrps'
                        return [file_tuple[0] for file_tuple in file_tuples]
                    return []
            
                                                                                                            
                            
            
                                    
            
            
                | 278 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 279 |  |  |     def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 280 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 281 |  |  |         List tuples of input files (for multi-valued :py:attr:`input_file_grp`). | 
            
                                                                                                            
                            
            
                                    
            
            
                | 282 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 283 |  |  |         Processors that expect/need multiple input file groups, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 284 |  |  |         cannot use :py:data:`input_files`. They must align (zip) input files | 
            
                                                                                                            
                            
            
                                    
            
            
                | 285 |  |  |         across pages. This includes the case where not all pages | 
            
                                                                                                            
                            
            
                                    
            
            
                | 286 |  |  |         are equally present in all file groups. It also requires | 
            
                                                                                                            
                            
            
                                    
            
            
                | 287 |  |  |         making a consistent selection if there are multiple files | 
            
                                                                                                            
                            
            
                                    
            
            
                | 288 |  |  |         per page. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 289 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 290 |  |  |         Following the OCR-D functional model, this function tries to | 
            
                                                                                                            
                            
            
                                    
            
            
                | 291 |  |  |         find a single PAGE file per page, or fall back to a single | 
            
                                                                                                            
                            
            
                                    
            
            
                | 292 |  |  |         image file per page. In either case, multiple matches per page | 
            
                                                                                                            
                            
            
                                    
            
            
                | 293 |  |  |         are an error (see error handling below). | 
            
                                                                                                            
                            
            
                                    
            
            
                | 294 |  |  |         This default behaviour can be changed by using a fixed MIME | 
            
                                                                                                            
                            
            
                                    
            
            
                | 295 |  |  |         type filter via :py:attr:`mimetype`. But still, multiple matching | 
            
                                                                                                            
                            
            
                                    
            
            
                | 296 |  |  |         files per page are an error. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 297 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 298 |  |  |         Single-page multiple-file errors are handled according to | 
            
                                                                                                            
                            
            
                                    
            
            
                | 299 |  |  |         :py:attr:`on_error`: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 300 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 301 |  |  |         - if ``skip``, then the page for the respective fileGrp will be | 
            
                                                                                                            
                            
            
                                    
            
            
                | 302 |  |  |           silently skipped (as if there was no match at all) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 303 |  |  |         - if ``first``, then the first matching file for the page will be | 
            
                                                                                                            
                            
            
                                    
            
            
                | 304 |  |  |           silently selected (as if the first was the only match) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 305 |  |  |         - if ``last``, then the last matching file for the page will be | 
            
                                                                                                            
                            
            
                                    
            
            
                | 306 |  |  |           silently selected (as if the last was the only match) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 307 |  |  |         - if ``abort``, then an exception will be raised. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 308 |  |  |         Multiple matches for PAGE-XML will always raise an exception. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 309 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 310 |  |  |         Keyword Args: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 311 |  |  |              require_first (boolean): If true, then skip a page entirely | 
            
                                                                                                            
                            
            
                                    
            
            
                | 312 |  |  |                  whenever it is not available in the first input `fileGrp`. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 313 |  |  |              mimetype (string): If not `None`, filter by the specified MIME | 
            
                                                                                                            
                            
            
                                    
            
            
                | 314 |  |  |                  type (literal or regex prefixed by `//`). Otherwise prefer | 
            
                                                                                                            
                            
            
                                    
            
            
                | 315 |  |  |                  PAGE or image. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 316 |  |  |         Returns: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 317 |  |  |             A list of :py:class:`ocrd_models.ocrd_file.OcrdFile` tuples. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 318 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 319 |  |  |         if not self.input_file_grp: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 320 |  |  |             raise ValueError("Processor is missing input fileGrp") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 321 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 322 |  |  |         LOG = getLogger('ocrd.processor.base') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 323 |  |  |         ifgs = self.input_file_grp.split(",") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 324 |  |  |         # Iterating over all files repeatedly may seem inefficient at first sight, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 325 |  |  |         # but the unnecessary OcrdFile instantiations for posterior fileGrp filtering | 
            
                                                                                                            
                            
            
                                    
            
            
                | 326 |  |  |         # can actually be much more costly than traversing the ltree. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 327 |  |  |         # This might depend on the number of pages vs number of fileGrps. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 328 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 329 |  |  |         pages = dict() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 330 |  |  |         for i, ifg in enumerate(ifgs): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 331 |  |  |             for file_ in sorted(self.workspace.mets.find_all_files( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 332 |  |  |                     pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 333 |  |  |                                 # sort by MIME type so PAGE comes before images | 
            
                                                                                                            
                            
            
                                    
            
            
                | 334 |  |  |                                 key=lambda file_: file_.mimetype): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 335 |  |  |                 if not file_.pageId: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 336 |  |  |                     continue | 
            
                                                                                                            
                            
            
                                    
            
            
                | 337 |  |  |                 ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 338 |  |  |                 if ift[i]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 339 |  |  |                     LOG.debug("another file %s for page %s in input file group %s", file_.ID, file_.pageId, ifg) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 340 |  |  |                     # fileGrp has multiple files for this page ID | 
            
                                                                                                            
                            
            
                                    
            
            
                | 341 |  |  |                     if mimetype: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 342 |  |  |                         # filter was active, this must not happen | 
            
                                                                                                            
                            
            
                                                                    
                                                                                                        
            
            
                | 343 |  | View Code Duplication |                         if on_error == 'skip': | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 344 |  |  |                             ift[i] = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 345 |  |  |                         elif on_error == 'first': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 346 |  |  |                             pass # keep first match | 
            
                                                                                                            
                            
            
                                    
            
            
                | 347 |  |  |                         elif on_error == 'last': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 348 |  |  |                             ift[i] = file_ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 349 |  |  |                         elif on_error == 'abort': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 350 |  |  |                             raise ValueError( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 351 |  |  |                                 "Multiple '%s' matches for page '%s' in fileGrp '%s'." % ( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 352 |  |  |                                     mimetype, file_.pageId, ifg)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 353 |  |  |                         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 354 |  |  |                             raise Exception("Unknown 'on_error' strategy '%s'" % on_error) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 355 |  |  |                     elif (ift[i].mimetype == MIMETYPE_PAGE and | 
            
                                                                                                            
                            
            
                                    
            
            
                | 356 |  |  |                           file_.mimetype != MIMETYPE_PAGE): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 357 |  |  |                         pass # keep PAGE match | 
            
                                                                                                            
                            
            
                                    
            
            
                | 358 |  |  |                     elif (ift[i].mimetype == MIMETYPE_PAGE and | 
            
                                                                                                            
                            
            
                                    
            
            
                | 359 |  |  |                           file_.mimetype == MIMETYPE_PAGE): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 360 |  |  |                         raise ValueError( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 361 |  |  |                             "Multiple PAGE-XML matches for page '%s' in fileGrp '%s'." % ( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 362 |  |  |                                 file_.pageId, ifg)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 363 |  |  |                     else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 364 |  |  |                         # filter was inactive but no PAGE is in control, this must not happen | 
            
                                                                                                            
                            
            
                                                                    
                                                                                                        
            
            
                | 365 |  | View Code Duplication |                         if on_error == 'skip': | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 366 |  |  |                             ift[i] = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 367 |  |  |                         elif on_error == 'first': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 368 |  |  |                             pass # keep first match | 
            
                                                                                                            
                            
            
                                    
            
            
                | 369 |  |  |                         elif on_error == 'last': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 370 |  |  |                             ift[i] = file_ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 371 |  |  |                         elif on_error == 'abort': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 372 |  |  |                             raise ValueError( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 373 |  |  |                                 "No PAGE-XML for page '%s' in fileGrp '%s' but multiple matches." % ( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 374 |  |  |                                     file_.pageId, ifg)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 375 |  |  |                         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 376 |  |  |                             raise Exception("Unknown 'on_error' strategy '%s'" % on_error) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 377 |  |  |                 else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 378 |  |  |                     LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 379 |  |  |                     ift[i] = file_ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 380 |  |  |         ifts = list() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 381 |  |  |         for page, ifiles in pages.items(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 382 |  |  |             for i, ifg in enumerate(ifgs): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 383 |  |  |                 if not ifiles[i]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 384 |  |  |                     # other fallback options? | 
            
                                                                                                            
                            
            
                                    
            
            
                | 385 |  |  |                     LOG.error('found no page %s in file group %s', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 386 |  |  |                               page, ifg) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 387 |  |  |             if ifiles[0] or not require_first: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 388 |  |  |                 ifts.append(tuple(ifiles)) | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 389 |  |  |         return ifts | 
            
                                                        
            
                                    
            
            
                | 390 |  |  |  |