|
1
|
|
|
# pylint: disable=missing-module-docstring,invalid-name |
|
2
|
|
|
from typing import Optional, get_args |
|
3
|
|
|
import os |
|
4
|
|
|
import subprocess |
|
5
|
|
|
from tempfile import TemporaryDirectory |
|
6
|
|
|
|
|
7
|
|
|
import click |
|
8
|
|
|
|
|
9
|
|
|
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor |
|
10
|
|
|
from ocrd import Processor |
|
11
|
|
|
from ocrd_models import OcrdPage, OcrdFileType |
|
12
|
|
|
from ocrd_models.ocrd_page import to_xml |
|
13
|
|
|
from ocrd_modelfactory import page_from_file |
|
14
|
|
|
from ocrd_utils import config, make_file_id, MIMETYPE_PAGE |
|
15
|
|
|
|
|
16
|
|
|
|
|
17
|
|
|
# Processor that hands every selected page over to a user-supplied shell
# command line (the ``command`` parameter), substituting the placeholders
# ``@INFILE`` / ``@OUTFILE`` with concrete file paths.
# NOTE(review): no class docstring is added on purpose -- class and method
# docstrings may be surfaced in the processor's ``--help`` text; confirm
# before adding or rewording any of them.
class ShellProcessor(Processor):

    def setup(self):
        """
        Validate the ``command`` parameter before any page is processed.

        Raises:
            Exception: if ``command`` lacks the ``@INFILE`` or the
                ``@OUTFILE`` placeholder.
        """
        command = self.parameter['command']
        if '@INFILE' not in command:
            raise Exception("command parameter requires @INFILE pattern")
        if '@OUTFILE' not in command:
            raise Exception("command parameter requires @OUTFILE pattern")

    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        """
        Process PAGE files via arbitrary command line on the shell.

        \b
        For each selected physical page of the workspace, pass ``command``
        to the shell, replacing:
        - the string ``@INFILE`` with the PAGE input file path,
        - the string ``@OUTFILE`` with the PAGE output file path.

        Modify the resulting PAGE output file with our new `@pcGtsId` and
        metadata.
        """
        # Contract summary (kept out of the docstring, see note on the class):
        # - ``input_files`` holds one entry per input fileGrp; ``None`` marks a
        #   fileGrp without a file for this page.
        # - Raises ``MissingInputFile`` when a file has no local filename and
        #   ``config.OCRD_MISSING_INPUT`` is ``'ABORT'``.
        # - Raises ``FileExistsError`` when the output file ID already exists
        #   and ``config.OCRD_EXISTING_OUTPUT`` is not ``'OVERWRITE'``.
        # - Returns early (after logging) when no input path is usable, when
        #   the command exits non-zero, or when its output is not valid PAGE.

        # Function-scope import: only needed for quoting paths below.
        import shlex

        # One slot per input fileGrp; slots stay "" when no usable local file
        # exists, so the positional @INFILE substitution below stays aligned.
        input_paths: list[str] = [""] * len(input_files)
        input_pos = next(i for i, input_file in enumerate(input_files)
                         if input_file is not None)
        page_id = input_files[input_pos].pageId
        self._base_logger.info("processing page %s", page_id)
        # Loop-invariant: split the comma-separated fileGrp list only once.
        input_file_grps = self.input_file_grp.split(',')
        for i, input_file in enumerate(input_files):
            grp = input_file_grps[i]
            if input_file is None:
                self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            if not input_file.local_filename:
                self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
                if config.OCRD_MISSING_INPUT == 'ABORT':
                    # Imported here because the module never imported this
                    # name, which made this branch fail with NameError.
                    from ocrd.processor.base import MissingInputFile
                    raise MissingInputFile(grp, page_id, input_file.mimetype)
                continue
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            if os.path.exists(input_file.local_filename):
                input_paths[i] = input_file.local_filename
            else:
                self._base_logger.error(f"non-existing local file for input fileGrp {grp} for page {page_id}")
        if not any(input_paths):
            self._base_logger.warning(f'skipping page {page_id}')
            return

        output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
        if input_files[input_pos].fileGrp == self.output_file_grp:
            # input=output fileGrp: re-use ID exactly
            output_file_id = input_files[input_pos].ID
        output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
        if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
            # short-cut avoiding useless computation:
            raise FileExistsError(
                f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
            )

        command = self.parameter['command']
        with TemporaryDirectory(suffix=page_id) as tmpdir:
            out_path = os.path.join(tmpdir, output_file_id + ".xml")
            # remove quotation around filename patterns, if any
            command = command.replace('"@INFILE"', '@INFILE').replace('"@OUTFILE"', '@OUTFILE')
            command = command.replace("'@INFILE'", '@INFILE').replace("'@OUTFILE'", '@OUTFILE')
            # Replace filename patterns with the actual paths. Each input path
            # consumes the next @INFILE occurrence (count=1). shlex.quote makes
            # the paths shell-safe even if they contain quotes, `$`, backticks
            # or whitespace (plain "..." wrapping did not).
            for in_path in input_paths:
                command = command.replace('@INFILE', shlex.quote(in_path), 1)
            command = command.replace('@OUTFILE', shlex.quote(out_path))
            # execute command pattern
            self.logger.debug("Running command: '%s'", command)
            # The command line itself is user configuration and is meant to be
            # interpreted by the shell, hence shell=True.
            # pylint: disable=subprocess-run-check
            completed = subprocess.run(command, shell=True,
                                       universal_newlines=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            self.logger.debug("Command for %s returned: %d", page_id, completed.returncode)
            if completed.stdout:
                self.logger.info("Command for %s stdout: %s", page_id, completed.stdout)
            if completed.stderr:
                self.logger.warning("Command for %s stderr: %s", page_id, completed.stderr)
            if completed.returncode != 0:
                self.logger.error("Command for %s failed", page_id)
                return
            # Parse the command's output while the temporary directory
            # (and thus out_path) still exists.
            try:
                pcgts = page_from_file(out_path)
                assert isinstance(pcgts, OcrdPage)
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE output for page {page_id}: {err}")
                return
            pcgts.set_pcGtsId(output_file_id)
            self.add_metadata(pcgts)
            self.workspace.add_file(
                file_id=output_file_id,
                file_grp=self.output_file_grp,
                page_id=page_id,
                local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )

    @property
    def metadata_filename(self):
        """Relative path of the ocrd-tool.json describing this processor."""
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self):
        """Name of the executable this processor is registered under."""
        return 'ocrd-command'
|
123
|
|
|
|
|
124
|
|
|
|
|
125
|
|
|
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Command-line entry point: forwards every argument received from the
    # decorators unchanged to ocrd_cli_wrap_processor, together with the
    # processor class to run.
    # (Documented with comments rather than a docstring so that the click
    # command's help text stays exactly as before.)
    processor_class = ShellProcessor
    return ocrd_cli_wrap_processor(processor_class, *args, **kwargs)
|
129
|
|
|
|