Passed
Push — master ( c99229...840662 )
by Konstantin
02:41
created

ocrd_models.ocrd_mets.OcrdMets.merge()   B

Complexity

Conditions 6

Size

Total Lines 29
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 17
dl 0
loc 29
rs 8.6166
c 0
b 0
f 0
cc 6
nop 8

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
"""
2
API to METS
3
"""
4
from datetime import datetime
5
import re
6
import typing
7
from os import environ
8
from lxml import etree as ET
9
from copy import deepcopy
10
11
from ocrd_utils import (
12
    is_local_filename,
13
    getLogger,
14
    generate_range,
15
    VERSION,
16
    REGEX_PREFIX,
17
    REGEX_FILE_ID
18
)
19
20
from .constants import (
21
    NAMESPACES as NS,
22
    TAG_METS_AGENT,
23
    TAG_METS_DIV,
24
    TAG_METS_FILE,
25
    TAG_METS_FILEGRP,
26
    TAG_METS_FILESEC,
27
    TAG_METS_FPTR,
28
    TAG_METS_METSHDR,
29
    TAG_METS_STRUCTMAP,
30
    IDENTIFIER_PRIORITY,
31
    TAG_MODS_IDENTIFIER,
32
    METS_XML_EMPTY,
33
)
34
35
from .ocrd_xml_base import OcrdXmlDocument, ET
36
from .ocrd_file import OcrdFile
37
from .ocrd_agent import OcrdAgent
38
39
REGEX_PREFIX_LEN = len(REGEX_PREFIX)
40
41
class OcrdMets(OcrdXmlDocument):
42
    """
43
    API to a single METS file
44
    """
45
46
    @staticmethod
    def empty_mets(now=None, cache_flag=False):
        """
        Build a new :py:class:`OcrdMets` from the bundled METS template.

        The ``{{ VERSION }}`` and ``{{ NOW }}`` placeholders of the template
        are substituted before the result is handed to the constructor.

        Keyword Args:
            now (string): Value to insert for ``{{ NOW }}``. Any falsy value is
                replaced by ``datetime.now().isoformat()``.
            cache_flag (boolean): Passed to the constructor as ``cache_flag``.
        Returns:
            The newly created :py:class:`OcrdMets`.
        """
        timestamp = now or datetime.now().isoformat()
        content = (
            METS_XML_EMPTY.decode('utf-8')
            .replace('{{ VERSION }}', VERSION)
            .replace('{{ NOW }}', '%s' % timestamp)
        )
        return OcrdMets(content=content.encode('utf-8'), cache_flag=cache_flag)
57
58
    def __init__(self, **kwargs):
        """
        Set up the document and, if caching is active, populate the caches.

        All keyword arguments are handed to the parent constructor unchanged.
        If the environment variable ``OCRD_METS_CACHING`` is set, it overrides
        ``self._cache_flag``: the values ``true`` and ``1`` enable caching,
        every other value disables it.
        """
        super().__init__(**kwargs)

        # NOTE(review): self._cache_flag is presumably initialised by the
        # parent constructor from the ``cache_flag`` kwarg - confirm.
        env_caching = environ.get('OCRD_METS_CACHING')
        if env_caching is not None:
            cache_override = env_caching in ('true', '1')
            getLogger('ocrd_models.ocrd_mets').debug(
                'METS Caching %s because OCRD_METS_CACHING is %s',
                'enabled' if cache_override else 'disabled', env_caching)
            self._cache_flag = cache_override

        if self._cache_flag:
            self.refresh_caches()
75
76
    def __exit__(self):
77
        """
78
79
        """
80
        if self._cache_flag:
81
            self._clear_caches()
82
83
    def __str__(self):
84
        """
85
        String representation
86
        """
87
        return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files()))
88
89
    def _fill_caches(self):
        """
        Populate the file, page and fptr caches from the current tree.

        Expects ``self._file_cache``, ``self._page_cache`` and
        ``self._fptr_cache`` to be dictionaries already. Nothing at all is
        cached when the root has no ``mets:fileSec`` child; the page and fptr
        caches stay untouched when there is no ``mets:div[@TYPE='page']``.
        """
        root = self._tree.getroot()

        file_sec = root.find("mets:fileSec", NS)
        if file_sec is None:
            return

        log = getLogger('ocrd_models.ocrd_mets._fill_caches-files')

        # One inner dictionary (child @ID -> child element) per fileGrp,
        # stored under the @USE of that fileGrp
        for file_grp in file_sec.findall('mets:fileGrp', NS):
            self._file_cache[file_grp.get('USE')] = {
                child.get('ID'): child for child in file_grp}

        page_divs = root.findall(".//mets:div[@TYPE='page']", NS)
        if not page_divs:
            return
        log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages')

        for page_div in page_divs:
            page_id = page_div.get('ID')
            log.debug("DIV_ID: %s" % page_div.get('ID'))

            self._page_cache[page_id] = page_div
            # Inner dictionary (child @FILEID -> child element) for this page
            self._fptr_cache[page_id] = {
                child.get('FILEID'): child for child in page_div}
137
138
    def _clear_caches(self):
139
        """
140
        Deallocates the caches
141
        """
142
143
        self._file_cache = None
144
        self._page_cache = None
145
        self._fptr_cache = None
146
        
147
    def refresh_caches(self):
148
        if self._cache_flag:
149
            # Cache for the files (mets:file) - two nested dictionaries
150
            # The outer dictionary's Key: 'fileGrp.USE'
151
            # The outer dictionary's Value: Inner dictionary
152
            # The inner dictionary's Key: 'file.ID'
153
            # The inner dictionary's Value: a 'file' object at some memory location
154
            self._file_cache = {}
155
156
            # Cache for the pages (mets:div)
157
            # The dictionary's Key: 'div.ID'
158
            # The dictionary's Value: a 'div' object at some memory location
159
            self._page_cache = {}
160
161
            # Cache for the file pointers (mets:fptr) - two nested dictionaries
162
            # The outer dictionary's Key: 'div.ID'
163
            # The outer dictionary's Value: Inner dictionary
164
            # The inner dictionary's Key: 'fptr.FILEID'
165
            # The inner dictionary's Value: a 'fptr' object at some memory location
166
            self._fptr_cache = {}
167
            
168
            # Note, if the empty_mets() function is used to instantiate OcrdMets
169
            # Then the cache is empty even after this operation
170
            self._fill_caches()
171
        
172
    @property
    def unique_identifier(self):
        """
        Text of the first ``mods:identifier`` found when trying the ``@type``
        values of ``IDENTIFIER_PRIORITY`` in order, or ``None`` if none exists.

        See `specs <https://ocr-d.de/en/spec/mets#unique-id-for-the-document-processed>`_ for details.
        """
        root = self._tree.getroot()
        for id_type in IDENTIFIER_PRIORITY:
            el_identifier = root.find('.//mods:identifier[@type="%s"]' % id_type, NS)
            if el_identifier is None:
                continue
            return el_identifier.text
        return None

    @unique_identifier.setter
    def unique_identifier(self, purl):
        """
        Store :py:attr:`purl` as text of the first ``mods:identifier`` found
        when trying the ``@type`` values of ``IDENTIFIER_PRIORITY`` in order.
        If there is none, a new ``mods:identifier[@type="purl"]`` is appended
        to the first ``mods:mods`` element.

        See `specs <https://ocr-d.de/en/spec/mets#unique-id-for-the-document-processed>`_ for details.
        """
        root = self._tree.getroot()
        lookups = (
            root.find('.//mods:identifier[@type="%s"]' % id_type, NS)
            for id_type in IDENTIFIER_PRIORITY)
        id_el = next((el for el in lookups if el is not None), None)
        if id_el is None:
            el_mods = root.find('.//mods:mods', NS)
            id_el = ET.SubElement(el_mods, TAG_MODS_IDENTIFIER)
            id_el.set('type', 'purl')
        id_el.text = purl
199
200
    @property
    def agents(self):
        """
        Every ``mets:agent`` directly below ``mets:metsHdr``, each wrapped in
        an :py:class:`ocrd_models.ocrd_agent.OcrdAgent`.
        """
        agent_elements = self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)
        return list(map(OcrdAgent, agent_elements))
206
207
    def add_agent(self, *args, **kwargs):
        """
        Append a new ``mets:agent`` to the ``mets:metsHdr`` and return it as
        :py:class:`ocrd_models.ocrd_agent.OcrdAgent`.

        A missing ``mets:metsHdr`` is created as first child of the root.
        All arguments are forwarded to the ``OcrdAgent`` constructor.
        """
        root = self._tree.getroot()
        mets_hdr = root.find('.//mets:metsHdr', NS)
        if mets_hdr is None:
            mets_hdr = ET.Element(TAG_METS_METSHDR)
            root.insert(0, mets_hdr)
        return OcrdAgent(ET.SubElement(mets_hdr, TAG_METS_AGENT), *args, **kwargs)
219
220
    @property
221
    def file_groups(self):
222
        """
223
        List the `@USE` of all `mets:fileGrp` entries.
224
        """
225
226
        # WARNING: Actually we cannot return strings in place of elements!
227
        if self._cache_flag:
228
           return list(self._file_cache.keys())
229
230
        return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)]
231
232
    def find_all_files(self, *args, **kwargs):
233
        """
234
        Like :py:meth:`find_files` but return a list of all results.
235
        Equivalent to ``list(self.find_files(...))``
236
        """
237
        return list(self.find_files(*args, **kwargs))
238
239
    # pylint: disable=multiple-statements
240
    def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False):
        """
        Search ``mets:file`` entries in this METS document and yield results.

        The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`,
        :py:attr:`url` and :py:attr:`mimetype` parameters can each be either a
        literal string, or a regular expression if the string starts with
        ``//`` (double slash).

        If it is a regex, the leading ``//`` is removed and candidates are matched
        against the regex with `re.fullmatch`, with and without caching alike.
        If it is a literal string, comparison is done with string equality.

        The :py:attr:`pageId` parameter is a comma-separated list and supports
        the numeric range operator ``..``. For example, to find all files in
        pages ``PHYS_0001`` to ``PHYS_0003``, ``PHYS_0001..PHYS_0003`` will be
        expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``.

        Keyword Args:
            ID (string) : ``@ID`` of the ``mets:file``
            fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
            pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page)
            url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file``
            mimetype (string) : ``@MIMETYPE`` of ``mets:file``
            local_only (boolean) : Whether to restrict results to local files in the filesystem
        Yields:
            :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations
        """
        def compile_needle(needle):
            # A ``//``-prefixed needle becomes a compiled regex, anything else is kept as is
            if needle and needle.startswith(REGEX_PREFIX):
                return re.compile(needle[REGEX_PREFIX_LEN:])
            return needle

        def matches(needle, value):
            # Literal needles compare by equality, regex needles must match the whole
            # value; a missing attribute never matches a regex
            if isinstance(needle, str):
                return needle == value
            return value is not None and needle.fullmatch(value) is not None

        # Resolve the page selector to the IDs of all mets:file referenced from
        # the selected pages (a set, so the per-candidate lookup below is cheap)
        page_file_ids = set()
        if pageId:
            page_literals = set()
            page_regexes = []
            for token in pageId.split(','):
                if token.startswith(REGEX_PREFIX):
                    page_regexes.append(re.compile(token[REGEX_PREFIX_LEN:]))
                elif '..' in token:
                    page_literals.update(generate_range(*token.split('..', 1)))
                else:
                    page_literals.add(token)

            def page_selected(page_id):
                if page_id in page_literals:
                    return True
                return page_id is not None and any(p.fullmatch(page_id) for p in page_regexes)

            if self._cache_flag:
                for page_id in self._page_cache:
                    if page_selected(page_id):
                        # iterating the inner dictionary yields the FILEIDs
                        page_file_ids.update(self._fptr_cache[page_id])
            else:
                for page in self._tree.getroot().xpath(
                        '//mets:div[@TYPE="page"]', namespaces=NS):
                    if page_selected(page.get('ID')):
                        page_file_ids.update(
                            fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS))

        ID = compile_needle(ID)
        fileGrp = compile_needle(fileGrp)
        mimetype = compile_needle(mimetype)
        url = compile_needle(url)

        # With caching, the fileGrp filter is applied while collecting the
        # candidates; without caching, it is applied per candidate below
        if not self._cache_flag:
            candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS)
        elif not fileGrp:
            candidates = [el_file
                          for id_to_file in self._file_cache.values()
                          for el_file in id_to_file.values()]
        elif isinstance(fileGrp, str):
            candidates = list(self._file_cache.get(fileGrp, {}).values())
        else:
            candidates = [el_file
                          for use, id_to_file in self._file_cache.items()
                          if use is not None and fileGrp.fullmatch(use)
                          for el_file in id_to_file.values()]

        for cand in candidates:
            cand_id = cand.get('ID')
            if ID and not matches(ID, cand_id):
                continue

            if pageId is not None and cand_id not in page_file_ids:
                continue

            if fileGrp and not self._cache_flag and \
                    not matches(fileGrp, cand.getparent().get('USE')):
                continue

            if mimetype:
                cand_mimetype = cand.get('MIMETYPE')
                if not isinstance(mimetype, str):
                    # a regex is matched against '' when @MIMETYPE is missing
                    cand_mimetype = cand_mimetype or ''
                if not matches(mimetype, cand_mimetype):
                    continue

            if url:
                cand_locat = cand.find('mets:FLocat', namespaces=NS)
                if cand_locat is None:
                    continue
                if not matches(url, cand_locat.get('{%s}href' % NS['xlink'])):
                    continue

            # Note: the OcrdFile is instantiated before local_only is checked,
            # because the check relies on the url attribute of the OcrdFile
            f = OcrdFile(cand, mets=self)

            # If only local resources should be returned and f is not a file path: skip the file
            if local_only and not is_local_filename(f.url):
                continue
            yield f
346
347
    def add_file_group(self, fileGrp):
        """
        Return the ``mets:fileGrp`` with ``@USE`` :py:attr:`fileGrp`, creating
        it (and the ``mets:fileSec``, if missing) when it does not exist yet.

        Arguments:
            fileGrp (string): ``@USE`` of the new ``mets:fileGrp``.
        Raises:
            Exception: if :py:attr:`fileGrp` contains a comma.
        """
        if ',' in fileGrp:
            raise Exception('fileGrp must not contain commas')

        root = self._tree.getroot()
        file_sec = root.find('mets:fileSec', NS)
        if file_sec is None:
            file_sec = ET.SubElement(root, TAG_METS_FILESEC)

        existing = file_sec.find('mets:fileGrp[@USE="%s"]' % fileGrp, NS)
        if existing is not None:
            return existing

        created = ET.SubElement(file_sec, TAG_METS_FILEGRP)
        created.set('USE', fileGrp)
        if self._cache_flag:
            # Start with an empty dictionary for the files of the new fileGrp
            self._file_cache[fileGrp] = {}
        return created
368
369
    def rename_file_group(self, old, new):
        """
        Change the ``@USE`` of a ``mets:fileGrp`` from :py:attr:`old` to
        :py:attr:`new`; with caching enabled, the file cache entry is moved
        to the new key as well.

        Raises:
            FileNotFoundError: if ``mets:fileSec`` has no ``mets:fileGrp``
                with ``@USE`` :py:attr:`old`.
        """
        xpath = 'mets:fileSec/mets:fileGrp[@USE="%s"]' % old
        file_grp = self._tree.getroot().find(xpath, NS)
        if file_grp is None:
            raise FileNotFoundError("No such fileGrp '%s'" % old)
        file_grp.set('USE', new)

        if not self._cache_flag:
            return
        self._file_cache[new] = self._file_cache.pop(old)
380
381
    def remove_file_group(self, USE, recursive=False, force=False):
        """
        Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``)

        Arguments:
            USE (string): ``@USE`` of the ``mets:fileGrp`` to delete. Can be a regex if prefixed with ``//``.
                A non-string value is treated as the ``mets:fileGrp`` element itself.
            recursive (boolean): Whether to recursively delete each ``mets:file`` in the group
            force (boolean): Do not raise an exception if ``mets:fileGrp`` does not exist
        Raises:
            Exception: if there is no ``mets:fileSec``, if the group does not
                exist and :py:attr:`force` is not set, or if the group still
                has files and :py:attr:`recursive` is not set.
        """
        log = getLogger('ocrd_models.ocrd_mets.remove_file_group')
        el_fileSec = self._tree.getroot().find('mets:fileSec', NS)
        if el_fileSec is None:
            raise Exception("No fileSec!")

        if not isinstance(USE, str):
            el_fileGrp = USE
        elif USE.startswith(REGEX_PREFIX):
            # Regex: remove every matching group by element, then stop
            use_regex = re.compile(USE[REGEX_PREFIX_LEN:])
            for grp in el_fileSec.findall('mets:fileGrp', NS):
                if use_regex.fullmatch(grp.get('USE')):
                    self.remove_file_group(grp, recursive=recursive)
            return
        else:
            el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % USE, NS)

        if el_fileGrp is None:
            msg = "No such fileGrp: %s" % USE
            if not force:
                raise Exception(msg)
            log.warning(msg)
            return

        group_use = el_fileGrp.get('USE')
        if self._cache_flag:
            files = self._file_cache.get(group_use, {}).values()
        else:
            files = el_fileGrp.findall('mets:file', NS)

        if files:
            if not recursive:
                raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE)
            # Iterate over a snapshot, since removal changes the collection
            for el_file in list(files):
                self.remove_one_file(ID=el_file.get('ID'), fileGrp=el_file.getparent().get('USE'))

        if self._cache_flag:
            # The files were already dropped from the cache by remove_one_file,
            # only the (now empty) entry of the group itself is left
            del self._file_cache[group_use]

        el_fileGrp.getparent().remove(el_fileGrp)
431
432
    def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs):
        """
        Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`.

        Arguments:
            fileGrp (string): ``@USE`` of ``mets:fileGrp`` to add to
        Keyword Args:
            mimetype (string): ``@MIMETYPE`` of the ``mets:file`` to use
            url (string): ``@xlink:href`` (URL or path) of the ``mets:file`` to use
            ID (string): ``@ID`` of the ``mets:file`` to use
            pageId (string): ``@ID`` in the physical ``mets:structMap`` to link to
            force (boolean): Whether to add the file even if a ``mets:file`` with the same ``@ID`` already exists.
            ignore (boolean): Do not look for existing files at all. Shift responsibility for preventing errors from duplicate ID to the user.
            local_filename (string): passed on to the ``OcrdFile`` constructor if not ``None``
        Returns:
            The new :py:class:`ocrd_models.ocrd_file.OcrdFile`.
        Raises:
            ValueError: if :py:attr:`ID` or :py:attr:`fileGrp` is empty or
                does not match ``REGEX_FILE_ID``.
            FileExistsError: if a file with this ``ID`` exists in the group
                and cannot be replaced.

        Any additional ``**kwargs`` are accepted but not used.
        """
        if not ID:
            raise ValueError("Must set ID of the mets:file")
        if not fileGrp:
            raise ValueError("Must set fileGrp of the mets:file")
        if not REGEX_FILE_ID.fullmatch(ID):
            raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID)
        if not REGEX_FILE_ID.fullmatch(fileGrp):
            raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp)
        log = getLogger('ocrd_models.ocrd_mets.add_file')

        el_fileGrp = self.add_file_group(fileGrp)

        mets_file = None if ignore else next(self.find_files(ID=ID, fileGrp=fileGrp), None)
        if mets_file:
            is_same_file = mets_file.fileGrp == fileGrp and \
                mets_file.pageId == pageId and \
                mets_file.mimetype == mimetype
            if not is_same_file:
                raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate")
            if not force:
                raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set")
            self.remove_file(ID=ID, fileGrp=fileGrp)

        # Only hand over the attributes that were actually given (not None)
        given = (
            ('mimetype', mimetype),
            ('url', url),
            ('ID', ID),
            ('pageId', pageId),
            ('local_filename', local_filename),
        )
        file_kwargs = {key: value for key, value in given if value is not None}

        # The element is created separately so that the very same object can
        # be stored in the file cache below
        el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE)
        # NOTE(review): the physical page is presumably linked (and cached) by
        # the OcrdFile constructor via ``pageId`` - confirm.
        mets_file = OcrdFile(el_mets_file, mets=self, **file_kwargs)

        if self._cache_flag:
            self._file_cache[fileGrp][ID] = el_mets_file

        return mets_file
481
482
    def remove_file(self, *args, **kwargs):
        """
        Delete each ``ocrd:file`` matching the query. Same arguments as :py:meth:`find_files`

        Returns:
            The single removed :py:class:`ocrd_models.ocrd_file.OcrdFile` if
            exactly one file matched (for backwards-compatibility), a list of
            them if several matched, or an empty list if nothing matched but
            at least one search criterion is a regex (``//`` prefix).
        Raises:
            FileNotFoundError: if nothing matched and no criterion is a regex.
        """
        files = list(self.find_files(*args, **kwargs))
        for f in files:
            self.remove_one_file(f)
        if len(files) > 1:
            return files
        if files:
            return files[0]  # for backwards-compatibility

        # Look at the criteria themselves (positional and keyword values),
        # not at the keyword names, to detect regex queries
        criteria = list(args) + list(kwargs.values())
        if any(isinstance(criterion, str) and criterion.startswith(REGEX_PREFIX)
               for criterion in criteria):
            # allow empty results if filter criteria involve a regex
            return []
        raise FileNotFoundError("File not found: %s %s" % (args, kwargs))
499
500
    def remove_one_file(self, ID, fileGrp=None):
        """
        Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`.

        Besides the ``mets:file`` itself, every ``mets:fptr`` pointing to it is
        removed, as is every page ``mets:div`` that has no children left
        afterwards. With caching enabled, the caches are updated accordingly.

        Arguments:
            ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete  Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``.
            fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``. Used only for optimization.
        Returns:
            The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference.
        Raises:
            FileNotFoundError: if no such file can be found.
        """
        log = getLogger('ocrd_models.ocrd_mets.remove_one_file')
        log.debug("remove_one_file(%s %s)" % (ID, fileGrp))
        if isinstance(ID, OcrdFile):
            ocrd_file, ID = ID, ID.ID
        else:
            ocrd_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None)

        if not ocrd_file:
            raise FileNotFoundError("File not found: %s (fileGr=%s)" % (ID, fileGrp))

        # Collect the physical page refs of the file
        if self._cache_flag:
            fptrs = [id_to_fptr[ID]
                     for id_to_fptr in self._fptr_cache.values()
                     if ID in id_to_fptr]
        else:
            fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS)

        # Delete the physical page refs, in the tree and in the cache
        for fptr in fptrs:
            log.debug("Delete fptr element %s for page '%s'", fptr, ID)
            page_div = fptr.getparent()
            page_id = page_div.get('ID')
            page_div.remove(fptr)
            if self._cache_flag:
                del self._fptr_cache[page_id][ID]
            # delete empty pages (len() of an element is its number of children)
            if len(page_div) == 0:
                log.debug("Delete empty page %s", page_div)
                page_div.getparent().remove(page_div)
                if self._cache_flag:
                    del self._page_cache[page_id]
                    del self._fptr_cache[page_id]

        # Delete the file reference, first from the cache, then from the tree
        # pylint: disable=protected-access
        el_file = ocrd_file._el
        el_file_parent = el_file.getparent()
        if self._cache_flag:
            del self._file_cache[el_file_parent.get('USE')][ocrd_file.ID]
        el_file_parent.remove(el_file)

        return ocrd_file
556
557
    @property
558
    def physical_pages(self):
559
        """
560
        List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``)
561
        """
562
        if self._cache_flag:
563
            return list(self._page_cache.keys())
564
            
565
        return self._tree.getroot().xpath(
566
            'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
567
            namespaces=NS)
568
569
    def get_physical_pages(self, for_fileIds=None):
570
        """
571
        List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``),
572
        optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`.
573
        """
574
        if for_fileIds is None:
575
            return self.physical_pages
576
        ret = [None] * len(for_fileIds)
577
        
578
        if self._cache_flag:
579
            for pageId in self._fptr_cache.keys():
580
                for fptr in self._fptr_cache[pageId].keys():
581
                    if fptr in for_fileIds:
582
                        ret[for_fileIds.index(fptr)] = pageId
583
        else:
584
          for page in self._tree.getroot().xpath(
585
              'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
586
                  namespaces=NS):
587
              for fptr in page.findall('mets:fptr', NS):
588
                  if fptr.get('FILEID') in for_fileIds:
589
                      ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID')
590
        return ret
591
592
    def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None):
        """
        Set the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry)
        corresponding to the ``mets:file`` :py:attr:`ocrd_file`, creating all structures if necessary.

        Any existing ``mets:fptr`` for the file is removed first, so the file
        ends up linked from :py:attr:`pageId` only.

        Arguments:
            pageId (string): ``@ID`` of the physical ``mets:structMap`` entry to use
            ocrd_file (object): existing :py:class:`ocrd_models.ocrd_file.OcrdFile` object
        Keyword Args:
            order (string): ``@ORDER`` to use (only when the page ``mets:div`` is newly created)
            orderlabel (string): ``@ORDERLABEL`` to use (only when the page ``mets:div`` is newly created)
        """
        file_id = ocrd_file.ID
        root = self._tree.getroot()

        # delete any page mapping for this file.ID
        if self._cache_flag:
            stale_fptrs = [id_to_fptr[file_id]
                           for id_to_fptr in self._fptr_cache.values()
                           if id_to_fptr.get(file_id) is not None]
        else:
            stale_fptrs = root.findall(
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' %
                file_id, namespaces=NS)

        for stale_fptr in stale_fptrs:
            stale_parent = stale_fptr.getparent()
            if self._cache_flag:
                del self._fptr_cache[stale_parent.get('ID')][file_id]
            stale_parent.remove(stale_fptr)

        # find/construct structMap and physSequence as necessary
        el_structmap = root.find('mets:structMap[@TYPE="PHYSICAL"]', NS)
        if el_structmap is None:
            el_structmap = ET.SubElement(root, TAG_METS_STRUCTMAP)
            el_structmap.set('TYPE', 'PHYSICAL')
        el_seqdiv = el_structmap.find('mets:div[@TYPE="physSequence"]', NS)
        if el_seqdiv is None:
            el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV)
            el_seqdiv.set('TYPE', 'physSequence')

        # find/construct the page itself
        if self._cache_flag:
            el_pagediv = self._page_cache.get(pageId)
        else:
            el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS)

        if el_pagediv is None:
            el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV)
            el_pagediv.set('TYPE', 'page')
            el_pagediv.set('ID', pageId)
            if order:
                el_pagediv.set('ORDER', order)
            if orderlabel:
                el_pagediv.set('ORDERLABEL', orderlabel)
            if self._cache_flag:
                # New page: register it, with no file pointers so far
                self._page_cache[pageId] = el_pagediv
                self._fptr_cache[pageId] = {}

        el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR)
        el_fptr.set('FILEID', file_id)

        if self._cache_flag:
            # Register the new file pointer under its page
            self._fptr_cache[el_pagediv.get('ID')][file_id] = el_fptr
659
660
    def get_physical_page_for_file(self, ocrd_file):
661
        """
662
        Get the physical page ID (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry)
663
        corresponding to the ``mets:file`` :py:attr:`ocrd_file`.
664
        """
665
        ret = []
666
        if self._cache_flag:
667
            for pageId in self._fptr_cache.keys():
668
                if ocrd_file.ID in self._fptr_cache[pageId].keys():
669
                    ret.append(self._page_cache[pageId].get('ID'))
670
        else:
671
            ret = self._tree.getroot().xpath(
672
                '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' %
673
                ocrd_file.ID, namespaces=NS)
674
675
        # To get rid of the python's FutureWarning
676
        if len(ret):
677
            return ret[0]
678
679
    def remove_physical_page(self, ID):
680
        """
681
        Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`.
682
        """
683
        mets_div = None
684
        if self._cache_flag:
685
            if ID in self._page_cache.keys():
686
                mets_div = [self._page_cache[ID]]
687
        else:
688
            mets_div = self._tree.getroot().xpath(
689
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID,
690
                namespaces=NS)
691
        if mets_div:
692
            mets_div[0].getparent().remove(mets_div[0])
693
            if self._cache_flag:
694
                del self._page_cache[ID]
695
                del self._fptr_cache[ID]
696
697
    def remove_physical_page_fptr(self, fileId):
698
        """
699
        Delete all ``mets:fptr[@FILEID = fileId]`` to ``mets:file[@ID == fileId]`` for :py:attr:`fileId` from all ``mets:div`` entries in the physical ``mets:structMap``.
700
        Returns:
701
            List of pageIds that mets:fptrs were deleted from
702
        """
703
704
        # Question: What is the reason to keep a list of mets_fptrs?
705
        # Do we have a situation in which the fileId is same for different pageIds ?
706
        # From the examples I have seen inside 'assets' that is not the case
707
        # and the mets_fptrs list will always contain a single element.
708
        # If that's the case then we do not need to iterate 2 loops, just one.
709
        mets_fptrs = []
710
        if self._cache_flag:
711
            for page_id in self._fptr_cache.keys():
712
                if fileId in self._fptr_cache[page_id].keys():
713
                    mets_fptrs.append(self._fptr_cache[page_id][fileId]) 
714
        else:
715
            mets_fptrs = self._tree.getroot().xpath(
716
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS)
717
        ret = []
718
        for mets_fptr in mets_fptrs:
719
            mets_div = mets_fptr.getparent()
720
            ret.append(mets_div.get('ID'))
721
            if self._cache_flag:
722
                del self._fptr_cache[mets_div.get('ID')][mets_fptr.get('FILEID')]
723
            mets_div.remove(mets_fptr)
724
        return ret
725
726
    def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs):
727
        """
728
        Add all files from other_mets.
729
        Accepts the same kwargs as :py:func:`find_files`
730
        Keyword Args:
731
            force (boolean): Whether to :py:meth:`add_file`s with force (overwriting existing ``mets:file``s)
732
            fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS
733
            fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS
734
            pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS
735
            after_add_cb (function): Callback received after file is added to the METS
736
        """
737
        if not fileGrp_mapping:
738
            fileGrp_mapping = {}
739
        if not fileId_mapping:
740
            fileId_mapping = {}
741
        if not pageId_mapping:
742
            pageId_mapping = {}
743
        for f_src in other_mets.find_files(**kwargs):
744
            f_dest = self.add_file(
745
                    fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp),
746
                    mimetype=f_src.mimetype,
747
                    url=f_src.url,
748
                    ID=fileId_mapping.get(f_src.ID, f_src.ID),
749
                    pageId=pageId_mapping.get(f_src.pageId, f_src.pageId),
750
                    force=force)
751
            # FIXME: merge metsHdr, amdSec, dmdSec as well
752
            # FIXME: merge structMap logical and structLink as well
753
            if after_add_cb:
754
                after_add_cb(f_dest)
755