Passed
Pull Request — master (#875)
by Konstantin
02:51
created

ocrd_models.ocrd_mets.OcrdMets.add_file()   D

Complexity

Conditions 12

Size

Total Lines 49
Code Lines 27

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 27
dl 0
loc 49
rs 4.8
c 0
b 0
f 0
cc 12
nop 10

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd_models.ocrd_mets.OcrdMets.add_file() often do a lot of different things. To break such a method (or the class that contains it) down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
"""
2
API to METS
3
"""
4
from datetime import datetime
5
import re
6
import typing
7
from os import environ
8
from lxml import etree as ET
9
from copy import deepcopy
10
11
from ocrd_utils import (
12
    is_local_filename,
13
    getLogger,
14
    generate_range,
15
    VERSION,
16
    REGEX_PREFIX,
17
    REGEX_FILE_ID
18
)
19
20
from .constants import (
21
    NAMESPACES as NS,
22
    TAG_METS_AGENT,
23
    TAG_METS_DIV,
24
    TAG_METS_FILE,
25
    TAG_METS_FILEGRP,
26
    TAG_METS_FILESEC,
27
    TAG_METS_FPTR,
28
    TAG_METS_METSHDR,
29
    TAG_METS_STRUCTMAP,
30
    IDENTIFIER_PRIORITY,
31
    TAG_MODS_IDENTIFIER,
32
    METS_XML_EMPTY,
33
)
34
35
from .ocrd_xml_base import OcrdXmlDocument, ET
36
from .ocrd_file import OcrdFile
37
from .ocrd_agent import OcrdAgent
38
39
REGEX_PREFIX_LEN = len(REGEX_PREFIX)
40
41
class OcrdMets(OcrdXmlDocument):
42
    """
43
    API to a single METS file
44
    """
45
46
    @staticmethod
    def empty_mets(now=None, cache_flag=False):
        """
        Build a new :py:class:`OcrdMets` from the bundled empty METS template.

        Keyword Args:
            now: value substituted for the ``{{ NOW }}`` placeholder of the
                template. Any falsy value is replaced by the current local
                time in ISO format.
            cache_flag (boolean): passed on unchanged to the constructor.

        Returns:
            The new :py:class:`OcrdMets` instance.
        """
        timestamp = now or datetime.now().isoformat()
        rendered = (
            METS_XML_EMPTY.decode('utf-8')
            .replace('{{ VERSION }}', VERSION)
            .replace('{{ NOW }}', '%s' % timestamp)
        )
        return OcrdMets(content=rendered.encode('utf-8'), cache_flag=cache_flag)
57
58
    def __init__(self, **kwargs):
        """
        Initialize via the parent constructor and, when caching is enabled,
        create and populate the three lookup caches.

        If the environment variable ``OCRD_METS_CACHING`` is set, it overrides
        ``self._cache_flag`` (which this method otherwise expects to be set
        already, presumably by the parent constructor): the values ``true``
        and ``1`` enable caching, any other value disables it.
        """
        super(OcrdMets, self).__init__(**kwargs)

        env_setting = environ.get('OCRD_METS_CACHING')
        if env_setting is not None:
            cache_override = env_setting in ('true', '1')
            getLogger('ocrd_models.ocrd_mets').debug(
                'METS Caching %s because OCRD_METS_CACHING is %s',
                'enabled' if cache_override else 'disabled', env_setting)
            self._cache_flag = cache_override

        if not self._cache_flag:
            return

        # mets:file cache, two levels:
        #   fileGrp @USE -> {file @ID -> mets:file element}
        self._file_cache = {}

        # mets:div (page) cache:
        #   div @ID -> mets:div element
        self._page_cache = {}

        # mets:fptr cache, two levels:
        #   div @ID -> {fptr @FILEID -> mets:fptr element}
        self._fptr_cache = {}

        # Original author's note: for an instance created via empty_mets()
        # the caches are still empty after this call.
        self._fill_caches()
97
98
    def __exit__(self):
99
        """
100
101
        """
102
        if self._cache_flag:
103
            self._clear_caches()
104
105
    def __str__(self):
106
        """
107
        String representation
108
        """
109
        return 'OcrdMets[cached=%s,fileGrps=%s,files=%s]' % (self._cache_flag, self.file_groups, list(self.find_files()))
110
111
    def _fill_caches(self):
        """
        Populate the file, page and fptr caches from the current tree.

        * ``self._file_cache`` gets one entry per ``mets:fileGrp`` directly
          below ``mets:fileSec``, keyed by ``@USE`` and mapping each
          ``mets:file`` ``@ID`` to its element.
        * ``self._page_cache`` maps the ``@ID`` of every ``mets:div`` with
          ``@TYPE='page'`` to its element.
        * ``self._fptr_cache`` maps the same page ``@ID`` to a dictionary of
          ``mets:fptr`` ``@FILEID`` to the ``mets:fptr`` element.

        Compared to the previous implementation, a missing ``mets:fileSec``
        no longer prevents pages and file pointers from being cached, and only
        genuine ``mets:file`` / ``mets:fptr`` children are cached (comments or
        other child nodes are skipped).
        """
        root = self._tree.getroot()

        # Files, grouped by the @USE of their fileGrp
        el_fileSec = root.find('mets:fileSec', NS)
        if el_fileSec is not None:
            for el_fileGrp in el_fileSec.findall('mets:fileGrp', NS):
                self._file_cache[el_fileGrp.get('USE')] = {
                    el_file.get('ID'): el_file
                    for el_file in el_fileGrp.findall('mets:file', NS)
                }

        # Pages and, per page, the file pointers they contain
        log = getLogger('ocrd_models.ocrd_mets._fill_caches-pages')
        for el_div in root.findall(".//mets:div[@TYPE='page']", NS):
            div_id = el_div.get('ID')
            log.debug("DIV_ID: %s", div_id)
            self._page_cache[div_id] = el_div
            self._fptr_cache[div_id] = {
                el_fptr.get('FILEID'): el_fptr
                for el_fptr in el_div.findall('mets:fptr', NS)
            }
159
160
    def _clear_caches(self):
161
        """
162
        Deallocates the caches
163
        """
164
165
        self._file_cache = None
166
        self._page_cache = None
167
        self._fptr_cache = None
168
169
    @property
    def unique_identifier(self):
        """
        Text of the first ``mods:identifier`` whose ``@type`` matches an entry
        of ``IDENTIFIER_PRIORITY``, tried in that order; ``None`` if no such
        element exists.

        See `specs <https://ocr-d.de/en/spec/mets#unique-id-for-the-document-processed>`_ for details.
        """
        root = self._tree.getroot()
        for id_type in IDENTIFIER_PRIORITY:
            el_identifier = root.find('.//mods:identifier[@type="%s"]' % id_type, NS)
            if el_identifier is not None:
                return el_identifier.text
        return None
179
        
180
    @unique_identifier.setter
    def unique_identifier(self, purl):
        """
        Store :py:attr:`purl` as the text of the first ``mods:identifier``
        whose ``@type`` matches an entry of ``IDENTIFIER_PRIORITY``. If there
        is none, a new ``mods:identifier`` with ``@type="purl"`` is appended
        to the first ``mods:mods`` element.

        See `specs <https://ocr-d.de/en/spec/mets#unique-id-for-the-document-processed>`_ for details.
        """
        root = self._tree.getroot()
        for id_type in IDENTIFIER_PRIORITY:
            id_el = root.find('.//mods:identifier[@type="%s"]' % id_type, NS)
            if id_el is not None:
                break
        else:
            # No identifier of any known type exists yet: create one
            mods = root.find('.//mods:mods', NS)
            id_el = ET.SubElement(mods, TAG_MODS_IDENTIFIER)
            id_el.set('type', 'purl')
        id_el.text = purl
196
197
    @property
    def agents(self):
        """
        One :py:class:`ocrd_models.ocrd_agent.OcrdAgent` for every
        ``mets:agent`` found below ``mets:metsHdr``, in document order.
        """
        agent_elements = self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)
        return list(map(OcrdAgent, agent_elements))
203
204
    def add_agent(self, *args, **kwargs):
        """
        Append a new ``mets:agent`` to the ``metsHdr`` and return it wrapped in
        an :py:class:`ocrd_models.ocrd_agent.OcrdAgent`.

        A missing ``mets:metsHdr`` is created as the first child of the root.
        All positional and keyword arguments are handed on to the
        :py:class:`ocrd_models.ocrd_agent.OcrdAgent` constructor.
        """
        root = self._tree.getroot()
        el_metsHdr = root.find('.//mets:metsHdr', NS)
        if el_metsHdr is None:
            el_metsHdr = ET.Element(TAG_METS_METSHDR)
            root.insert(0, el_metsHdr)
        return OcrdAgent(ET.SubElement(el_metsHdr, TAG_METS_AGENT), *args, **kwargs)
216
217
    @property
218
    def file_groups(self):
219
        """
220
        List the `@USE` of all `mets:fileGrp` entries.
221
        """
222
223
        # WARNING: Actually we cannot return strings in place of elements!
224
        if self._cache_flag:
225
           return list(self._file_cache.keys())
226
227
        return [el.get('USE') for el in self._tree.getroot().findall('.//mets:fileGrp', NS)]
228
229
    def find_all_files(self, *args, **kwargs):
230
        """
231
        Like :py:meth:`find_files` but return a list of all results.
232
        Equivalent to ``list(self.find_files(...))``
233
        """
234
        return list(self.find_files(*args, **kwargs))
235
236
    # pylint: disable=multiple-statements
237
    def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_only=False):
        """
        Search ``mets:file`` entries in this METS document and yield results.

        The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`,
        :py:attr:`url` and :py:attr:`mimetype` parameters can each be either a
        literal string, or a regular expression if the string starts with
        ``//`` (double slash).

        If it is a regex, the leading ``//`` is removed and candidates are matched
        against the regex with `re.fullmatch` (a missing attribute is matched as
        the empty string). If it is a literal string, comparison is done with
        string equality.

        The :py:attr:`pageId` parameter is a comma-separated list; each item may
        be a literal, a regex, or use the numeric range operator ``..``. For
        example, to find all files in pages ``PHYS_0001`` to ``PHYS_0003``,
        ``PHYS_0001..PHYS_0003`` will be expanded to ``PHYS_0001,PHYS_0002,PHYS_0003``.

        Keyword Args:
            ID (string) : ``@ID`` of the ``mets:file``
            fileGrp (string) : ``@USE`` of the ``mets:fileGrp`` to list files of
            pageId (string) : ``@ID`` of the corresponding physical ``mets:structMap`` entry (physical page)
            url (string) : ``@xlink:href`` (URL or path) of ``mets:Flocat`` of ``mets:file``
            mimetype (string) : ``@MIMETYPE`` of ``mets:file``
            local_only (boolean) : Whether to restrict results to local files in the filesystem
        Yields:
            :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations
        """

        def as_matcher(criterion):
            # Strings carrying the regex prefix become compiled patterns,
            # everything else (including None) is passed through untouched.
            if criterion and criterion.startswith(REGEX_PREFIX):
                return re.compile(criterion[REGEX_PREFIX_LEN:])
            return criterion

        def matches(criterion, value):
            # Literal criteria compare by equality, compiled ones by fullmatch.
            if isinstance(criterion, str):
                return criterion == value
            return criterion.fullmatch(value or '') is not None

        # IDs of all files referenced from the pages selected by pageId.
        # A set keeps the per-candidate membership test below cheap.
        file_ids_on_pages = set()
        if pageId:
            page_literals = set()
            page_regexes = []
            for pageId_token in pageId.split(','):
                if pageId_token.startswith(REGEX_PREFIX):
                    page_regexes.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
                elif '..' in pageId_token:
                    page_literals.update(generate_range(*pageId_token.split('..', 1)))
                else:
                    page_literals.add(pageId_token)

            def page_selected(page_id):
                return page_id in page_literals or \
                    any(regex.fullmatch(page_id) for regex in page_regexes)

            if self._cache_flag:
                for page_id in self._page_cache:
                    if page_selected(page_id):
                        file_ids_on_pages.update(self._fptr_cache[page_id])
            else:
                for page in self._tree.getroot().xpath(
                        '//mets:div[@TYPE="page"]', namespaces=NS):
                    if page_selected(page.get('ID')):
                        file_ids_on_pages.update(
                            fptr.get('FILEID') for fptr in page.findall('mets:fptr', NS))

        ID = as_matcher(ID)
        fileGrp = as_matcher(fileGrp)
        mimetype = as_matcher(mimetype)
        url = as_matcher(url)

        # In cached mode the fileGrp criterion is applied while collecting the
        # candidates; lists are built so that callers may remove files while
        # consuming the results.
        if self._cache_flag:
            if not fileGrp:
                candidates = [el_file
                              for id_to_file in self._file_cache.values()
                              for el_file in id_to_file.values()]
            elif isinstance(fileGrp, str):
                candidates = list(self._file_cache.get(fileGrp, {}).values())
            else:
                # fullmatch, as documented and as done in the uncached branch
                candidates = [el_file
                              for use, id_to_file in self._file_cache.items()
                              if fileGrp.fullmatch(use)
                              for el_file in id_to_file.values()]
        else:
            candidates = self._tree.getroot().xpath('//mets:file', namespaces=NS)

        for cand in candidates:
            cand_id = cand.get('ID')
            if ID and not matches(ID, cand_id):
                continue

            if pageId is not None and cand_id not in file_ids_on_pages:
                continue

            if fileGrp and not self._cache_flag and \
                    not matches(fileGrp, cand.getparent().get('USE')):
                continue

            if mimetype and not matches(mimetype, cand.get('MIMETYPE')):
                continue

            if url:
                cand_locat = cand.find('mets:FLocat', namespaces=NS)
                if cand_locat is None:
                    continue
                if not matches(url, cand_locat.get('{%s}href' % NS['xlink'])):
                    continue

            # The local_only test needs the url of the wrapped file, so the
            # wrapper is created before that last filter is applied.
            f = OcrdFile(cand, mets=self)

            # If only local resources should be returned and f is not a file path: skip the file
            if local_only and not is_local_filename(f.url):
                continue
            yield f
343
344
    def add_file_group(self, fileGrp):
        """
        Return the ``mets:fileGrp`` with ``@USE`` :py:attr:`fileGrp`, creating
        it (and a missing ``mets:fileSec``) if it does not exist yet.

        Arguments:
            fileGrp (string): ``@USE`` of the new ``mets:fileGrp``.
        Raises:
            Exception: if :py:attr:`fileGrp` contains a comma.
        """
        if ',' in fileGrp:
            raise Exception('fileGrp must not contain commas')

        root = self._tree.getroot()
        el_fileSec = root.find('mets:fileSec', NS)
        if el_fileSec is None:
            el_fileSec = ET.SubElement(root, TAG_METS_FILESEC)

        el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % fileGrp, NS)
        if el_fileGrp is not None:
            return el_fileGrp

        el_fileGrp = ET.SubElement(el_fileSec, TAG_METS_FILEGRP)
        el_fileGrp.set('USE', fileGrp)
        if self._cache_flag:
            # Start with an empty file mapping for the new group
            self._file_cache[fileGrp] = {}
        return el_fileGrp
365
366
    def rename_file_group(self, old, new):
        """
        Change the ``@USE`` of a ``mets:fileGrp`` from :py:attr:`old` to
        :py:attr:`new`, moving the cache entry along when caching is enabled.

        Raises:
            FileNotFoundError: if there is no ``mets:fileGrp`` with ``@USE`` :py:attr:`old`.
        """
        query = 'mets:fileSec/mets:fileGrp[@USE="%s"]' % old
        el_fileGrp = self._tree.getroot().find(query, NS)
        if el_fileGrp is None:
            raise FileNotFoundError("No such fileGrp '%s'" % old)
        el_fileGrp.set('USE', new)

        if not self._cache_flag:
            return
        self._file_cache[new] = self._file_cache.pop(old)
377
378
    def remove_file_group(self, USE, recursive=False, force=False):
        """
        Remove a ``mets:fileGrp`` (single fixed ``@USE`` or multiple regex ``@USE``)

        Arguments:
            USE (string): ``@USE`` of the ``mets:fileGrp`` to delete. Can be a regex if prefixed with ``//``.
                A ``mets:fileGrp`` element is accepted as well (used for the
                internal recursion of the regex case).
            recursive (boolean): Whether to recursively delete each ``mets:file`` in the group
            force (boolean): Do not raise an exception if ``mets:fileGrp`` does not exist
        Raises:
            Exception: if there is no ``mets:fileSec``, if the group does not
                exist and :py:attr:`force` is not set, or if the group still
                contains files and :py:attr:`recursive` is not set.
        """
        log = getLogger('ocrd_models.ocrd_mets.remove_file_group')
        el_fileSec = self._tree.getroot().find('mets:fileSec', NS)
        if el_fileSec is None:
            raise Exception("No fileSec!")
        if isinstance(USE, str):
            if USE.startswith(REGEX_PREFIX):
                use = re.compile(USE[REGEX_PREFIX_LEN:])
                for cand in el_fileSec.findall('mets:fileGrp', NS):
                    if use.fullmatch(cand.get('USE')):
                        self.remove_file_group(cand, recursive=recursive)
                return
            el_fileGrp = el_fileSec.find('mets:fileGrp[@USE="%s"]' % USE, NS)
        else:
            el_fileGrp = USE
        if el_fileGrp is None:   # pylint: disable=len-as-condition
            msg = "No such fileGrp: %s" % USE
            if force:
                log.warning(msg)
                return
            raise Exception(msg)

        if self._cache_flag:
            # Take a snapshot: remove_one_file() deletes entries from this very
            # dictionary, and iterating the live view while that happens raises
            # "RuntimeError: dictionary changed size during iteration".
            files = list(self._file_cache.get(el_fileGrp.get('USE'), {}).values())
        else:
            files = el_fileGrp.findall('mets:file', NS)

        if files:
            if not recursive:
                raise Exception("fileGrp %s is not empty and recursive wasn't set" % USE)
            for f in files:
                self.remove_one_file(ID=f.get('ID'), fileGrp=f.getparent().get('USE'))

        if self._cache_flag:
            # The files of the group were already dropped from the cache by
            # remove_one_file() above, only the group entry itself is left.
            del self._file_cache[el_fileGrp.get('USE')]

        el_fileGrp.getparent().remove(el_fileGrp)
428
429
    def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force=False, local_filename=None, ignore=False, **kwargs):
        """
        Instantiate and add a new :py:class:`ocrd_models.ocrd_file.OcrdFile`.

        Arguments:
            fileGrp (string): ``@USE`` of ``mets:fileGrp`` to add to
        Keyword Args:
            mimetype (string): ``@MIMETYPE`` of the ``mets:file`` to use
            url (string): ``@xlink:href`` (URL or path) of the ``mets:file`` to use
            ID (string): ``@ID`` of the ``mets:file`` to use
            pageId (string): ``@ID`` in the physical ``mets:structMap`` to link to
            force (boolean): Whether to add the file even if a ``mets:file`` with the same ``@ID`` already exists.
            ignore (boolean): Do not look for existing files at all. Shift responsibility for preventing errors from duplicate ID to the user.
            local_filename (string): passed on to the
                :py:class:`ocrd_models.ocrd_file.OcrdFile` constructor when not ``None``
            **kwargs: accepted for backwards compatibility and ignored
        Returns:
            The new :py:class:`ocrd_models.ocrd_file.OcrdFile`.
        Raises:
            ValueError: if :py:attr:`ID` or :py:attr:`fileGrp` is empty or does
                not fully match ``REGEX_FILE_ID``.
            FileExistsError: if a file with the same ``@ID`` exists in the
                group and either differs in fileGrp, pageId or mimetype, or
                :py:attr:`force` is not set.
        """
        if not ID:
            raise ValueError("Must set ID of the mets:file")
        if not fileGrp:
            raise ValueError("Must set fileGrp of the mets:file")
        if not REGEX_FILE_ID.fullmatch(ID):
            raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID)
        if not REGEX_FILE_ID.fullmatch(fileGrp):
            raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp)

        el_fileGrp = self.add_file_group(fileGrp)
        if not ignore:
            mets_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None)
            if mets_file:
                if mets_file.fileGrp == fileGrp and \
                   mets_file.pageId == pageId and \
                   mets_file.mimetype == mimetype:
                    if not force:
                        raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} and neither force nor ignore are set")
                    self.remove_file(ID=ID, fileGrp=fileGrp)
                else:
                    raise FileExistsError(f"A file with ID=={ID} already exists {mets_file} but unrelated - cannot mitigate")

        # Only hand on the attributes that were actually given. The mapping is
        # spelled out explicitly instead of being filtered out of locals(),
        # which silently depended on the names of the local variables.
        file_attributes = {
            'mimetype': mimetype,
            'url': url,
            'ID': ID,
            'pageId': pageId,
            'local_filename': local_filename,
        }
        file_attributes = {k: v for k, v in file_attributes.items() if v is not None}

        # The element is created separately so that it can be stored in the cache below
        el_mets_file = ET.SubElement(el_fileGrp, TAG_METS_FILE)
        # Original author's note: the physical page is cached by the OcrdFile constructor
        mets_file = OcrdFile(el_mets_file, mets=self, **file_attributes)

        if self._cache_flag:
            # Add the file to the file cache
            self._file_cache[fileGrp][ID] = el_mets_file

        return mets_file
478
479
    def remove_file(self, *args, **kwargs):
        """
        Delete each ``ocrd:file`` matching the query. Same arguments as :py:meth:`find_files`

        Returns:
            The removed :py:class:`ocrd_models.ocrd_file.OcrdFile` if exactly
            one file matched, a list of them if several matched, and an empty
            list if nothing matched but one of the search criteria is a regex.
        Raises:
            FileNotFoundError: if nothing matched and no criterion is a regex.
        """
        files = list(self.find_files(*args, **kwargs))
        if files:
            for f in files:
                self.remove_one_file(f)
            # a single result is returned bare for backwards-compatibility
            return files if len(files) > 1 else files[0]

        # Allow empty results if filter criteria involve a regex. The criteria
        # are the argument *values*; the keyword names (which the previous
        # code inspected) never carry the regex prefix.
        criteria = list(args) + list(kwargs.values())
        if any(isinstance(criterion, str) and criterion.startswith(REGEX_PREFIX)
               for criterion in criteria):
            return []
        raise FileNotFoundError("File not found: %s %s" % (args, kwargs))
496
497
    def remove_one_file(self, ID, fileGrp=None):
        """
        Delete an existing :py:class:`ocrd_models.ocrd_file.OcrdFile`.

        Besides the ``mets:file`` itself, every ``mets:fptr`` referencing it is
        removed, as is any ``mets:div`` left without children by that. The
        caches are kept in sync when caching is enabled.

        Arguments:
            ID (string|OcrdFile): ``@ID`` of the ``mets:file`` to delete  Can also be an :py:class:`ocrd_models.ocrd_file.OcrdFile` to avoid search via ``ID``.
            fileGrp (string): ``@USE`` of the ``mets:fileGrp`` containing the ``mets:file``. Used only for optimization.
        Returns:
            The old :py:class:`ocrd_models.ocrd_file.OcrdFile` reference.
        Raises:
            FileNotFoundError: if no matching file is found.
        """
        log = getLogger('ocrd_models.ocrd_mets.remove_one_file')
        log.debug("remove_one_file(%s %s)", ID, fileGrp)
        if isinstance(ID, OcrdFile):
            ocrd_file = ID
            ID = ocrd_file.ID
        else:
            ocrd_file = next(self.find_files(ID=ID, fileGrp=fileGrp), None)

        if not ocrd_file:
            raise FileNotFoundError("File not found: %s (fileGr=%s)" % (ID, fileGrp))

        # Collect the file pointers first (as a list), because the cache is
        # modified while they are being removed.
        if self._cache_flag:
            fptrs = [fptr_by_file[ID]
                     for fptr_by_file in self._fptr_cache.values()
                     if ID in fptr_by_file]
        else:
            fptrs = self._tree.getroot().findall('.//mets:fptr[@FILEID="%s"]' % ID, namespaces=NS)

        # Delete the physical page ref
        for fptr in fptrs:
            log.debug("Delete fptr element %s for page '%s'", fptr, ID)
            page_div = fptr.getparent()
            page_div.remove(fptr)
            # Remove the fptr from the cache as well
            if self._cache_flag:
                del self._fptr_cache[page_div.get('ID')][ID]
            # delete empty pages (len() of an element is its number of
            # children; it replaces the deprecated getchildren())
            if len(page_div) == 0:
                log.debug("Delete empty page %s", page_div)
                page_div.getparent().remove(page_div)
                # Delete the empty pages from caches as well
                if self._cache_flag:
                    del self._page_cache[page_div.get('ID')]
                    del self._fptr_cache[page_div.get('ID')]

        # pylint: disable=protected-access
        el_file = ocrd_file._el

        # Delete the file reference from the cache
        if self._cache_flag:
            del self._file_cache[el_file.getparent().get('USE')][ocrd_file.ID]

        # Delete the file reference
        el_file.getparent().remove(el_file)

        return ocrd_file
553
554
    @property
555
    def physical_pages(self):
556
        """
557
        List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``)
558
        """
559
        if self._cache_flag:
560
            return self._page_cache.values()
561
            
562
        return self._tree.getroot().xpath(
563
            'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
564
            namespaces=NS)
565
566
    def get_physical_pages(self, for_fileIds=None):
567
        """
568
        List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``),
569
        optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`.
570
        """
571
        if for_fileIds is None:
572
            return self.physical_pages
573
        ret = [None] * len(for_fileIds)
574
        
575
        if self._cache_flag:
576
            for pageId in self._fptr_cache.keys():
577
                for fptr in self._fptr_cache[pageId].keys():
578
                    if fptr in for_fileIds:
579
                        ret[for_fileIds.index(fptr)] = pageId
580
        else:
581
          for page in self._tree.getroot().xpath(
582
              'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
583
                  namespaces=NS):
584
              for fptr in page.findall('mets:fptr', NS):
585
                  if fptr.get('FILEID') in for_fileIds:
586
                      ret[for_fileIds.index(fptr.get('FILEID'))] = page.get('ID')
587
        return ret
588
589
    def set_physical_page_for_file(self, pageId, ocrd_file, order=None, orderlabel=None):
        """
        Link the ``mets:file`` :py:attr:`ocrd_file` to the physical page
        :py:attr:`pageId` (``@ID`` of the physical ``mets:structMap`` ``mets:div`` entry),
        replacing any previous page link of that file and creating all structures if necessary.

        Arguments:
            pageId (string): ``@ID`` of the physical ``mets:structMap`` entry to use
            ocrd_file (object): existing :py:class:`ocrd_models.ocrd_file.OcrdFile` object
        Keyword Args:
            order (string): ``@ORDER`` to use (only applied when the page is newly created)
            orderlabel (string): ``@ORDERLABEL`` to use (only applied when the page is newly created)
        """
        file_id = ocrd_file.ID

        # Step 1: drop every existing page mapping for this file ID
        if self._cache_flag:
            stale_fptrs = [
                fptr_by_file[file_id]
                for fptr_by_file in self._fptr_cache.values()
                if fptr_by_file.get(file_id) is not None
            ]
        else:
            stale_fptrs = self._tree.getroot().findall(
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' %
                file_id, namespaces=NS)

        for el_stale in stale_fptrs:
            el_owner = el_stale.getparent()
            if self._cache_flag:
                del self._fptr_cache[el_owner.get('ID')][file_id]
            el_owner.remove(el_stale)

        # Step 2: find or build the physical structMap and its sequence div
        root = self._tree.getroot()
        el_structmap = root.find('mets:structMap[@TYPE="PHYSICAL"]', NS)
        if el_structmap is None:
            el_structmap = ET.SubElement(root, TAG_METS_STRUCTMAP)
            el_structmap.set('TYPE', 'PHYSICAL')
        el_seqdiv = el_structmap.find('mets:div[@TYPE="physSequence"]', NS)
        if el_seqdiv is None:
            el_seqdiv = ET.SubElement(el_structmap, TAG_METS_DIV)
            el_seqdiv.set('TYPE', 'physSequence')

        # Step 3: find or build the page div
        if self._cache_flag:
            el_pagediv = self._page_cache.get(pageId)
        else:
            el_pagediv = el_seqdiv.find('mets:div[@ID="%s"]' % pageId, NS)

        if el_pagediv is None:
            el_pagediv = ET.SubElement(el_seqdiv, TAG_METS_DIV)
            el_pagediv.set('TYPE', 'page')
            el_pagediv.set('ID', pageId)
            if order:
                el_pagediv.set('ORDER', order)
            if orderlabel:
                el_pagediv.set('ORDERLABEL', orderlabel)
            if self._cache_flag:
                # Register the new page and give it an empty fptr mapping
                self._page_cache[pageId] = el_pagediv
                self._fptr_cache[pageId] = {}

        # Step 4: add the file pointer and mirror it in the cache
        el_fptr = ET.SubElement(el_pagediv, TAG_METS_FPTR)
        el_fptr.set('FILEID', file_id)

        if self._cache_flag:
            self._fptr_cache[el_pagediv.get('ID')][file_id] = el_fptr
656
657
    def get_physical_page_for_file(self, ocrd_file):
        """
        Look up the physical page that references the ``mets:file`` :py:attr:`ocrd_file`.

        Returns:
            The ``@ID`` of the first page ``mets:div`` in the physical ``mets:structMap``
            holding a ``mets:fptr`` to the file, or ``None`` if there is no such page.
        """
        if self._cache_flag:
            # Cached lookup: every page whose fptr dictionary knows the file ID
            page_ids = [
                self._page_cache[page_id].get('ID')
                for page_id, fptrs in self._fptr_cache.items()
                if ocrd_file.ID in fptrs
            ]
        else:
            # Uncached lookup: ask the XML tree for the @ID of all matching page divs
            page_ids = self._tree.getroot().xpath(
                '/mets:mets/mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][./mets:fptr[@FILEID="%s"]]/@ID' %
                ocrd_file.ID, namespaces=NS)

        # Both branches produce a list; only the first hit is reported
        if not page_ids:
            return None
        return page_ids[0]
675
676
    def remove_physical_page(self, ID):
        """
        Delete page (physical ``mets:structMap`` ``mets:div`` entry ``@ID``) :py:attr:`ID`.

        If no page with that ``@ID`` exists, nothing is changed.

        Args:
            ID (string): ``@ID`` of the page ``mets:div`` to delete
        """
        if self._cache_flag:
            mets_div = self._page_cache.get(ID)
        else:
            matches = self._tree.getroot().xpath(
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"][@ID="%s"]' % ID,
                namespaces=NS)
            # xpath() returns an empty list (not None) when nothing matches,
            # so the result must be checked for emptiness before indexing
            mets_div = matches[0] if matches else None

        # Identity check on purpose: element truthiness depends on its children
        if mets_div is None:
            return

        mets_div.getparent().remove(mets_div)
        if self._cache_flag:
            # Drop both cache entries of the page; tolerate a missing entry
            self._page_cache.pop(ID, None)
            self._fptr_cache.pop(ID, None)
693
694
    def remove_physical_page_fptr(self, fileId):
        """
        Remove every ``mets:fptr`` with ``@FILEID`` equal to :py:attr:`fileId` from the page ``mets:div`` entries of the physical ``mets:structMap``.

        Returns:
            List of the ``@ID``s of the pages a ``mets:fptr`` was removed from
        """
        # The lookup is not restricted to a single page, so all matching
        # pointers are gathered first and only then removed one by one.
        if self._cache_flag:
            fptrs_to_remove = [
                fptrs[fileId]
                for fptrs in self._fptr_cache.values()
                if fileId in fptrs
            ]
        else:
            fptrs_to_remove = self._tree.getroot().xpath(
                'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/mets:fptr[@FILEID="%s"]' % fileId, namespaces=NS)

        affected_page_ids = []
        for el_fptr in fptrs_to_remove:
            el_page = el_fptr.getparent()
            page_id = el_page.get('ID')
            affected_page_ids.append(page_id)
            if self._cache_flag:
                # Keep the fptr cache in sync with the tree
                del self._fptr_cache[page_id][el_fptr.get('FILEID')]
            el_page.remove(el_fptr)
        return affected_page_ids
722
723
    def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs):
        """
        Copy files from another METS into this one.

        The files are selected from :py:attr:`other_mets` via :py:func:`find_files`,
        which receives all additional kwargs unchanged.

        Keyword Args:
            force (boolean): Passed on as ``force`` to :py:meth:`add_file` (overwriting existing ``mets:file``s)
            fileGrp_mapping (dict): Translate fileGrp of :py:attr:`other_mets` to fileGrp in this METS
            fileId_mapping (dict): Translate file ID of :py:attr:`other_mets` to file ID in this METS
            pageId_mapping (dict): Translate page ID of :py:attr:`other_mets` to page ID in this METS
            after_add_cb (function): Called with each newly added file right after it was added
        """
        # Absent (or empty) mappings mean "keep the value of the source file"
        fileGrp_mapping = fileGrp_mapping or {}
        fileId_mapping = fileId_mapping or {}
        pageId_mapping = pageId_mapping or {}

        for f_src in other_mets.find_files(**kwargs):
            dest_file_grp = fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp)
            f_dest = self.add_file(
                dest_file_grp,
                mimetype=f_src.mimetype,
                url=f_src.url,
                ID=fileId_mapping.get(f_src.ID, f_src.ID),
                pageId=pageId_mapping.get(f_src.pageId, f_src.pageId),
                force=force,
            )
            # FIXME: merge metsHdr, amdSec, dmdSec as well
            # FIXME: merge structMap logical and structLink as well
            if after_add_cb:
                after_add_cb(f_dest)
752