Passed
Pull Request — master (#1343)
by
unknown
02:09
created

ocrd.processor.builtin.shell_processor.cli()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
nop 2
1
# pylint: disable=missing-module-docstring,invalid-name
2
from typing import Optional, get_args
3
import os
4
import subprocess
5
from tempfile import TemporaryDirectory
6
7
import click
8
9
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
10
from ocrd import Processor
11
from ocrd_models import OcrdPage, OcrdFileType
12
from ocrd_models.ocrd_page import to_xml
13
from ocrd_modelfactory import page_from_file
14
from ocrd_utils import config, make_file_id, MIMETYPE_PAGE
15
16
17
class ShellProcessor(Processor):
    # Processor that delegates the actual page processing to an arbitrary
    # shell command supplied via the ``command`` parameter. The command must
    # contain the placeholders ``@INFILE`` and ``@OUTFILE``, which get
    # substituted with (quoted) file paths for every page.
    #
    # NOTE(review): documented with comments instead of a class docstring,
    # because docstrings here presumably end up in the CLI help text (see the
    # click ``\b`` marker in ``process_page_file``) -- confirm before
    # converting these comments into docstrings.

    def setup(self):
        # Validate the ``command`` parameter up front: both filename
        # placeholders must be present, otherwise the substitution in
        # ``process_page_file`` cannot work.
        command = self.parameter['command']
        for pattern in ('@INFILE', '@OUTFILE'):
            if pattern not in command:
                raise Exception(f"command parameter requires {pattern} pattern")

    def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
        """
        Process PAGE files via arbitrary command line on the shell.

        \b
        For each selected physical page of the workspace, pass ``command``
        to the shell, replacing:
        - the string ``@INFILE`` with the PAGE input file path,
        - the string ``@OUTFILE`` with the PAGE output file path.

        Modify the resulting PAGE output file with our new `@pcGtsId` and
        metadata.
        """
        # input_files: one entry per input fileGrp (in the order of
        #   ``self.input_file_grp``); ``None`` marks a missing file.
        # Raises:
        #   StopIteration   - if every entry of input_files is None
        #   MissingInputFile - if a file has no local filename and
        #                      OCRD_MISSING_INPUT is 'ABORT'
        #   FileExistsError - if the output file ID already exists in the METS
        #                      and OCRD_EXISTING_OUTPUT is not 'OVERWRITE'
        # A failing command or non-PAGE output is logged and the page skipped.

        # one path slot per input fileGrp; "" stands for "no usable file"
        input_paths: list[str] = [""] * len(input_files)
        # the first available file determines page ID and output file ID
        input_pos = next(i for i, input_file in enumerate(input_files)
                         if input_file is not None)
        page_id = input_files[input_pos].pageId
        self._base_logger.info("processing page %s", page_id)
        # loop-invariant: split the comma-separated fileGrp list only once
        input_grps = self.input_file_grp.split(',')
        for i, input_file in enumerate(input_files):
            grp = input_grps[i]
            if input_file is None:
                self._base_logger.debug(f"ignoring missing file for input fileGrp {grp} for page {page_id}")
                continue
            assert isinstance(input_file, get_args(OcrdFileType))
            if not input_file.local_filename:
                self._base_logger.error(f'No local file exists for page {page_id} in file group {grp}')
                if config.OCRD_MISSING_INPUT == 'ABORT':
                    # Imported here because the module never imported this
                    # name at top level (previously a NameError on this path).
                    # pylint: disable=import-outside-toplevel
                    from ocrd.processor.base import MissingInputFile
                    raise MissingInputFile(grp, page_id, input_file.mimetype)
                continue
            self._base_logger.debug(f"parsing file {input_file.ID} for page {page_id}")
            if os.path.exists(input_file.local_filename):
                input_paths[i] = input_file.local_filename
            else:
                self._base_logger.error(f"non-existing local file for input fileGrp {grp} for page {page_id}")
        if not any(input_paths):
            self._base_logger.warning(f'skipping page {page_id}')
            return
        output_file_id = make_file_id(input_files[input_pos], self.output_file_grp)
        if input_files[input_pos].fileGrp == self.output_file_grp:
            # input=output fileGrp: re-use ID exactly
            output_file_id = input_files[input_pos].ID
        output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
        if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
            # short-cut avoiding useless computation:
            raise FileExistsError(
                f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set"
            )
        command = self.parameter['command']
        with TemporaryDirectory(suffix=page_id) as tmpdir:
            # the command writes into a scratch directory; the parsed result
            # is only added to the workspace further below
            out_path = os.path.join(tmpdir, output_file_id + ".xml")
            # remove quotation around filename patterns, if any
            command = command.replace('"@INFILE"', '@INFILE').replace('"@OUTFILE"', '@OUTFILE')
            command = command.replace("'@INFILE'", '@INFILE').replace("'@OUTFILE'", '@OUTFILE')
            # replace filename patterns with actual paths, quoted
            # (the n-th @INFILE occurrence receives the n-th input fileGrp's
            # path; missing inputs become an empty quoted string)
            for in_path in input_paths:
                command = command.replace('@INFILE', '"' + in_path + '"', 1)
            command = command.replace('@OUTFILE', '"' + out_path + '"')
            # execute command pattern
            self.logger.debug("Running command: '%s'", command)
            # pylint: disable=subprocess-run-check
            proc = subprocess.run(command, shell=True,
                                  universal_newlines=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            self.logger.debug("Command for %s returned: %d", page_id, proc.returncode)
            if proc.stdout:
                self.logger.info("Command for %s stdout: %s", page_id, proc.stdout)
            if proc.stderr:
                self.logger.warning("Command for %s stderr: %s", page_id, proc.stderr)
            if proc.returncode != 0:
                self.logger.error("Command for %s failed", page_id)
                return
            # parse while the temporary directory still exists
            try:
                pcgts = page_from_file(out_path)
                assert isinstance(pcgts, OcrdPage)
            except ValueError as err:
                # not PAGE and not an image to generate PAGE for
                self._base_logger.error(f"non-PAGE output for page {page_id}: {err}")
                return
        pcgts.set_pcGtsId(output_file_id)
        self.add_metadata(pcgts)
        self.workspace.add_file(
            file_id=output_file_id,
            file_grp=self.output_file_grp,
            page_id=page_id,
            local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )

    @property
    def metadata_filename(self):
        # path of the ocrd-tool.json describing this processor
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self):
        # name of the executable this processor is registered under
        return 'ocrd-command'
123
124
125
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    # Command-line entry point: hand all parsed CLI options over to the
    # generic processor wrapper, bound to ShellProcessor.
    # (Kept as a comment, not a docstring: click would use a docstring as
    # the command's help text.)
    outcome = ocrd_cli_wrap_processor(ShellProcessor, *args, **kwargs)
    return outcome
129