Passed
Push — master ( 8b29fb...59f8ea )
by Daniel
02:56
created

amd.io.CifReader.__init__()   D

Complexity

Conditions 11

Size

Total Lines 95
Code Lines 64

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 64
dl 0
loc 95
rs 4.9581
c 0
b 0
f 0
cc 11
nop 8

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex classes like amd.io.CifReader.__init__() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
"""Tools for reading crystals from files, or from the CSD with
0 ignored issues
show
coding-style introduced by
Too many lines in module (1469/1000)
Loading history...
2
``csd-python-api``. The readers return
3
:class:`amd.PeriodicSet <.periodicset.PeriodicSet>` objects representing the
4
crystal which can be passed to :func:`amd.AMD() <.calculate.AMD>` and
5
:func:`amd.PDD() <.calculate.PDD>` to get their invariants.
6
"""
7
 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
8
import os
9
import re
10
import functools
11
import warnings
12
from typing import Tuple
13
14
import numpy as np
15
import numba
16
import ase.io.cif   # convert_value, parse_cif
17
import ase.data
18
import ase.spacegroup
19
from ase.spacegroup.spacegroup import parse_sitesym as ase_parse_sitesym
20
21
from .utils import cellpar_to_cell
22
from .periodicset import PeriodicSet
23
24
25
def _custom_warning(message, category, filename, lineno, *args, **kwargs):
0 ignored issues
show
Unused Code introduced by
The argument filename seems to be unused.
Loading history...
Unused Code introduced by
The argument lineno seems to be unused.
Loading history...
Unused Code introduced by
The argument args seems to be unused.
Loading history...
Unused Code introduced by
The argument kwargs seems to be unused.
Loading history...
26
    return f'{category.__name__}: {message}\n'
27
28
warnings.formatwarning = _custom_warning
29
30
_EQUIV_SITE_TOL = 1e-3
31
_DISORDER_OPTIONS = {'skip', 'ordered_sites', 'all_sites'}
32
33
CIF_TAGS = {
34
    'cellpar': [
35
        '_cell_length_a',
36
        '_cell_length_b',
37
        '_cell_length_c',
38
        '_cell_angle_alpha',
39
        '_cell_angle_beta',
40
        '_cell_angle_gamma',],
41
42
    'atom_site_fract': [
43
        '_atom_site_fract_x',
44
        '_atom_site_fract_y',
45
        '_atom_site_fract_z',],
46
47
    'atom_site_cartn': [
48
        '_atom_site_cartn_x',
49
        '_atom_site_cartn_y',
50
        '_atom_site_cartn_z',],
51
52
    'atom_symbol': [
53
        '_atom_site_type_symbol',
54
        '_atom_site_label',],
55
56
    'symop': [
57
        '_symmetry_equiv_pos_as_xyz',
58
        '_space_group_symop_operation_xyz',
59
        '_space_group_symop.operation_xyz',
60
        '_symmetry_equiv_pos_as_xyz_',
61
        '_space_group_symop_operation_xyz_',],
62
63
    'spacegroup_name': [
64
        '_space_group_name_Hall',
65
        '_symmetry_space_group_name_hall',
66
        '_space_group_name_H-M_alt',
67
        '_symmetry_space_group_name_H-M',
68
        '_symmetry_space_group_name_H_M',
69
        '_symmetry_space_group_name_h-m',],
70
71
    'spacegroup_number': [
72
        '_space_group_IT_number',
73
        '_symmetry_Int_Tables_number',
74
        '_space_group_IT_number_',
75
        '_symmetry_Int_Tables_number_',],
76
}
77
78
79
class _Reader:
80
81
    def __init__(
0 ignored issues
show
best-practice introduced by
Too many arguments (6/5)
Loading history...
82
            self,
83
            remove_hydrogens=False,
84
            disorder='skip',
85
            heaviest_component=False,
86
            molecular_centres=False,
87
            show_warnings=True
88
    ):
89
90
        if disorder not in _DISORDER_OPTIONS:
91
            msg = 'disorder parameter must be one of ' \
92
                  f'{_DISORDER_OPTIONS} (passed {disorder})'
93
            raise ValueError(msg)
94
95
        self.remove_hydrogens = remove_hydrogens
96
        self.disorder = disorder
97
        self.heaviest_component = heaviest_component
98
        self.molecular_centres = molecular_centres
99
        self.show_warnings = show_warnings
100
        self._backend_generator = None
101
        self._converter = None
102
103
    def __iter__(self):
104
        if self._backend_generator is None or self._converter is None:
105
            raise RuntimeError(f'{self.__class__.__name__} not initialized.')
106
        return self
107
108
    def __next__(self):
109
        """Iterates over self._backend_generator, passing items through self._converter.
110
        Catches ParseError + warnings raised in self._converter, optionally printing them.
111
        """
112
113
        if not self.show_warnings:
114
            warnings.simplefilter('ignore')
115
116
        while True:
117
118
            item = next(self._backend_generator) # will raise StopIteration when done
119
120
            with warnings.catch_warnings(record=True) as warning_msgs:
121
                msg = None
122
                try:
123
                    periodic_set = self._converter(item)
0 ignored issues
show
Bug introduced by
self._converter does not seem to be callable.
Loading history...
124
                except ParseError as err:
125
                    msg = str(err)
126
127
            if msg:
128
                warnings.warn(msg)
129
                continue
130
131
            for warning in warning_msgs:
132
                msg = f'{periodic_set.name} {warning.message}'
133
                warnings.warn(msg, category=warning.category)
134
135
            return periodic_set
136
137
    def read(self):
138
        """Reads the crystal(s), returns one
139
        :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` if there is only
140
        one, otherwise returns a list. (Note the return type is not
141
        consistent!)"""
142
        l = list(self)
143
        if len(l) == 1:
144
            return l[0]
145
        return l
146
147
148
class CifReader(_Reader):
149
    """Read all structures in a .cif file or all files in a folder
150
    with ase or csd-python-api (if installed), yielding
151
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
152
153
    Parameters
154
    ----------
155
    path : str
156
        Path to a .cif file or directory. (Other files are accepted when using
157
        ``reader='ccdc'``, if csd-python-api is installed.)
158
    reader : str, optional
159
        The backend package used for parsing. Default is :code:`ase`,
160
        to use csd-python-api change to :code:`ccdc`. The ccdc reader should
161
        be able to read any format accepted by :class:`ccdc.io.EntryReader`,
162
        though only cifs have been tested.
163
    remove_hydrogens : bool, optional
164
        Remove Hydrogens from the crystal.
165
    disorder : str, optional
166
        Controls how disordered structures are handled. Default is ``skip``
167
        which skips any crystal with disorder, since disorder conflicts with
168
        the periodic set model. To read disordered structures anyway, choose
169
        either :code:`ordered_sites` to remove atoms with disorder or
170
        :code:`all_sites` include all atoms regardless of disorder.
171
    heaviest_component : bool, optional
172
        csd-python-api only. Removes all but the heaviest molecule in the
173
        asymmeric unit, intended for removing solvents.
174
    molecular_centres : bool, default False
175
        csd-python-api only. Extract the centres of molecules in the unit cell
176
        and store in the attribute molecular_centres.
177
    show_warnings : bool, optional
178
        Controls whether warnings that arise during reading are printed.
179
180
    Yields
181
    ------
182
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
183
        Represents the crystal as a periodic set, consisting of a finite set
184
        of points (motif) and lattice (unit cell). Contains other data, e.g.
185
        the crystal's name and information about the asymmetric unit.
186
187
    Examples
188
    --------
189
190
        ::
191
192
            # Put all crystals in a .CIF in a list
193
            structures = list(amd.CifReader('mycif.cif'))
194
195
            # Can also accept path to a directory, reading all files inside
196
            structures = list(amd.CifReader('path/to/folder'))
197
198
            # Reads just one if the .CIF has just one crystal
199
            periodic_set = amd.CifReader('mycif.cif').read()
200
201
            # List of AMDs (k=100) of crystals in a .CIF
202
            amds = [amd.AMD(periodic_set, 100) for periodic_set in amd.CifReader('mycif.cif')]
203
    """
204
205
    def __init__(
0 ignored issues
show
best-practice introduced by
Too many arguments (8/5)
Loading history...
206
            self,
207
            path,
208
            reader='ase',
209
            remove_hydrogens=False,
210
            disorder='skip',
211
            heaviest_component=False,
212
            molecular_centres=False,
213
            show_warnings=True,
214
    ):
215
216
        super().__init__(
217
            remove_hydrogens=remove_hydrogens,
218
            disorder=disorder,
219
            heaviest_component=heaviest_component,
220
            molecular_centres=molecular_centres,
221
            show_warnings=show_warnings
222
        )
223
224
        # file_parser: Callable(path) -> Iterable[External object]
225
        # self._converter: Callable(External object) -> PeriodicSet
226
        # set options in self._converter with functools.partial
227
228
        if reader != 'ccdc':
229
230
            if heaviest_component:
231
                msg = 'Parameter heaviest_component ' \
232
                      'only implemented for reader="ccdc".'
233
                raise NotImplementedError(msg)
234
235
            if molecular_centres:
236
                msg = 'Parameter molecular_centres ' \
237
                      'only implemented for reader="ccdc".'
238
                raise NotImplementedError(msg)
239
240
        if reader in ('ase', 'pycodcif'):
241
242
            extensions = {'cif'}
243
            file_parser = functools.partial(ase.io.cif.parse_cif, reader=reader)
244
            self._converter = functools.partial(
245
                periodicset_from_ase_cifblock,
246
                remove_hydrogens=self.remove_hydrogens,
247
                disorder=self.disorder
248
            )
249
250
        elif reader == 'pymatgen':
251
252
            extensions = {'cif'}
253
            file_parser = CifReader._pymatgen_cifblock_generator
254
            self._converter = functools.partial(
255
                periodicset_from_pymatgen_cifblock,
256
                remove_hydrogens=self.remove_hydrogens,
257
                disorder=self.disorder
258
            )
259
260
        elif reader == 'gemmi':
261
262
            import gemmi.cif
0 ignored issues
show
introduced by
Import outside toplevel (gemmi.cif)
Loading history...
introduced by
Unable to import 'gemmi.cif'
Loading history...
263
264
            extensions = {'cif'}
265
            file_parser = gemmi.cif.read_file
266
            self._converter = functools.partial(
267
                periodicset_from_gemmi_block,
268
                remove_hydrogens=self.remove_hydrogens,
269
                disorder=self.disorder
270
            )
271
272
        elif reader == 'ccdc':
273
274
            try:
275
                import ccdc.io
0 ignored issues
show
introduced by
Import outside toplevel (ccdc.io)
Loading history...
276
            except (ImportError, RuntimeError) as e:
277
                msg = 'Failed to import csd-python-api, please' \
278
                      'check it is installed and licensed.'
279
                raise ImportError(msg) from e
280
281
            extensions = ccdc.io.EntryReader.known_suffixes
282
            file_parser = ccdc.io.EntryReader
283
            self._converter = functools.partial(
284
                periodicset_from_ccdc_entry,
285
                remove_hydrogens=self.remove_hydrogens,
286
                disorder=self.disorder,
287
                molecular_centres=self.molecular_centres,
288
                heaviest_component=self.heaviest_component
289
            )
290
291
        else:
292
            raise ValueError(f'Unknown reader {reader}.')
293
294
        if os.path.isfile(path):
295
            self._backend_generator = iter(file_parser(path))
296
        elif os.path.isdir(path):
297
            self._backend_generator = iter(CifReader._generate_from_dir(path, file_parser, extensions))
298
        else:
299
            raise FileNotFoundError(f'No such file or directory: {path}')
300
301
    @staticmethod
302
    def _generate_from_dir(path, file_parser, extensions):
303
        for file in os.listdir(path):
304
            suff = os.path.splitext(file)[1][1:]
305
            if suff.lower() in extensions:
306
                yield from file_parser(os.path.join(path, file))
307
308
    @staticmethod
309
    def _pymatgen_cifblock_generator(path):
310
        """Path to .cif --> generator of pymatgen CifBlocks."""
311
        from pymatgen.io.cif import CifFile
0 ignored issues
show
introduced by
Unable to import 'pymatgen.io.cif'
Loading history...
introduced by
Import outside toplevel (pymatgen.io.cif.CifFile)
Loading history...
312
        yield from CifFile.from_file(path).data.values()
313
314
315
class CSDReader(_Reader):
316
    """Read structures from the CSD with csd-python-api, yielding
317
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
318
319
    Parameters
320
    ----------
321
    refcodes : str or List[str], optional
322
        Single or list of CSD refcodes to read. If None or 'CSD', iterates
323
        over the whole CSD.
324
    families : bool, optional
325
        Read all entries whose refcode starts with the given strings, or
326
        'families' (e.g. giving 'DEBXIT' reads all entries starting with
327
        DEBXIT).
328
    remove_hydrogens : bool, optional
329
        Remove hydrogens from the crystal.
330
    disorder : str, optional
331
        Controls how disordered structures are handled. Default is ``skip``
332
        which skips any crystal with disorder, since disorder conflicts with
333
        the periodic set model. To read disordered structures anyway, choose
334
        either :code:`ordered_sites` to remove atoms with disorder or
335
        :code:`all_sites` include all atoms regardless of disorder.
336
    heaviest_component : bool, optional
337
        Removes all but the heaviest molecule in the asymmeric unit, intended
338
        for removing solvents.
339
    molecular_centres : bool, default False
340
        Extract the centres of molecules in the unit cell and store in
341
        attribute molecular_centres.
342
    show_warnings : bool, optional
343
        Controls whether warnings that arise during reading are printed.
344
345
    Yields
346
    ------
347
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
348
        Represents the crystal as a periodic set, consisting of a finite set
349
        of points (motif) and lattice (unit cell). Contains other useful data,
350
        e.g. the crystal's name and information about the asymmetric unit for
351
        calculation.
352
353
    Examples
354
    --------
355
356
        ::
357
358
            # Put these entries in a list
359
            refcodes = ['DEBXIT01', 'DEBXIT05', 'HXACAN01']
360
            structures = list(amd.CSDReader(refcodes))
361
362
            # Read refcode families (any whose refcode starts with strings in the list)
363
            refcode_families = ['ACSALA', 'HXACAN']
364
            structures = list(amd.CSDReader(refcode_families, families=True))
365
366
            # Get AMDs (k=100) for crystals in these families
367
            refcodes = ['ACSALA', 'HXACAN']
368
            amds = []
369
            for periodic_set in amd.CSDReader(refcodes, families=True):
370
                amds.append(amd.AMD(periodic_set, 100))
371
372
            # Giving the reader nothing reads from the whole CSD.
373
            for periodic_set in amd.CSDReader():
374
                ...
375
    """
376
377
    def __init__(
0 ignored issues
show
best-practice introduced by
Too many arguments (8/5)
Loading history...
378
            self,
379
            refcodes=None,
380
            families=False,
381
            remove_hydrogens=False,
382
            disorder='skip',
383
            heaviest_component=False,
384
            molecular_centres=False,
385
            show_warnings=True,
386
    ):
387
388
        super().__init__(
389
            remove_hydrogens=remove_hydrogens,
390
            disorder=disorder,
391
            heaviest_component=heaviest_component,
392
            molecular_centres=molecular_centres,
393
            show_warnings=show_warnings,
394
        )
395
396
        try:
397
            import ccdc.io
0 ignored issues
show
introduced by
Import outside toplevel (ccdc.io)
Loading history...
398
        except (ImportError, RuntimeError) as _:
399
            msg = 'Failed to import csd-python-api, please' \
400
                  'check it is installed and licensed.'
401
            raise ImportError(msg)
402
403
        if isinstance(refcodes, str) and refcodes.lower() == 'csd':
404
            refcodes = None
405
406
        if refcodes is None:
407
            families = False
408
        else:
409
            if isinstance(refcodes, str):
410
                refcodes = [refcodes]
411
            else:
412
                refcodes = list(refcodes)
413
414
        if families:
415
            refcodes = _refcodes_from_families(refcodes)
416
417
        # self._entry_reader = ccdc.io.EntryReader('CSD')
418
        entry_reader = ccdc.io.EntryReader('CSD')
419
        self._converter = functools.partial(
420
            periodicset_from_ccdc_entry,
421
            remove_hydrogens=self.remove_hydrogens,
422
            disorder=self.disorder,
423
            molecular_centres=self.molecular_centres,
424
            heaviest_component=self.heaviest_component
425
        )
426
        self._backend_generator = iter(self._ccdc_generator(refcodes, entry_reader))
427
428
    @staticmethod
429
    def _ccdc_generator(refcodes, entry_reader):
430
        """Generates ccdc Entries from CSD refcodes."""
431
432
        if refcodes is None:
433
            for entry in entry_reader:
434
                yield entry
435
        else:
436
            for refcode in refcodes:
437
                try:
438
                    entry = entry_reader.entry(refcode)
439
                    yield entry
440
                except RuntimeError:
441
                    warnings.warn(f'{refcode} not found in database')
442
443
444
class ParseError(ValueError):
445
    """Raised when an item cannot be parsed into a periodic set."""
446
    pass
0 ignored issues
show
Unused Code introduced by
Unnecessary pass statement
Loading history...
447
448
449
def periodicset_from_ase_cifblock(
450
        block,
451
        remove_hydrogens=False,
452
        disorder='skip'
453
) -> PeriodicSet:
454
    """:class:`ase.io.cif.CIFBlock` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
455
    CIFBlock is the type returned by :class:`ase.io.cif.parse_cif`.
456
457
    Parameters
458
    ----------
459
    block : :class:`ase.io.cif.CIFBlock`
460
        An ase CIFBlock object representing a crystal.
461
    remove_hydrogens : bool, optional
462
        Remove Hydrogens from the crystal.
463
    disorder : str, optional
464
        Controls how disordered structures are handled. Default is ``skip``
465
        which raises a ParseError if the input has any disorder, since
466
        disorder conflicts with the periodic set model. To read disordered
467
        structures anyway, choose either :code:`ordered_sites` to remove sites
468
        with disorder or :code:`all_sites` include all sites regardless.
469
470
    Returns
471
    -------
472
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
473
        Represents the crystal as a periodic set, consisting of a finite set
474
        of points (motif) and lattice (unit cell). Contains other useful data,
475
        e.g. the crystal's name and information about the asymmetric unit for
476
        calculation.
477
478
    Raises
479
    ------
480
    ParseError
481
        Raised if the structure fails to be parsed for any of the following:
482
        1. Required data is missing (e.g. cell parameters),
483
        2. disorder == 'skip' and any of:
484
            (a) any atom has occupancy < 1,
485
            (b) any atom's label ends with '?',
486
        3. The motif is empty after removing H or disordered sites.
487
    """
488
489
    # unit cell
490
    cellpar = [block.get(tag) for tag in CIF_TAGS['cellpar']]
491
    if None in cellpar:
492
        raise ParseError(f'{block.name} has missing cell data')
493
    cell = cellpar_to_cell(*cellpar)
494
495
    # asymmetric unit coords. ase removes uncertainty brackets.
496
    cartesian = False # flag needed for later
497
    asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_fract']]
498
499
    if None in asym_unit: # missing scaled coords, try cartesian
500
        asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_cartn']]
501
        if None in asym_unit:
502
            raise ParseError(f'{block.name} has no coordinates')
503
        cartesian = True
504
505
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
506
507
    # atomic types
508
    asym_symbols = block._get_any(CIF_TAGS['atom_symbol'])
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like _get_any was declared protected and should not be accessed from this context.

Prefixing a member variable _ is usually regarded as the equivalent of declaring it with protected visibility that exists in other languages. Consequentially, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1;
        self.y = 2;

class MyChild(MyParent):
    def some_method(self):
        return self._x    # Ok, since accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged as AnotherClass is not
                                         # a child class of MyParent
Loading history...
509 View Code Duplication
    if asym_symbols is None:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
510
        warnings.warn('missing atomic types will be labelled 0')
511
        asym_types = [0] * len(asym_unit)
512
    else:
513
        asym_types = []
514
        for label in asym_symbols:
515
            if label in ('.', '?'):
516
                warnings.warn('missing atomic types will be labelled 0')
517
                num = 0
518
            else:
519
                sym = re.search(r'([A-Z][a-z]?)', label).group(0)
520
                if sym == 'D':
521
                    sym = 'H'
522
                num = ase.data.atomic_numbers[sym]
523
            asym_types.append(num)
524
525
    # find if sites have disorder or not (if necassary)
526
    has_disorder = []
527
    if disorder != 'all_sites':
528
        occupancies = block.get('_atom_site_occupancy')
529
        if occupancies is None:
530
            occupancies = np.ones((len(asym_unit), ))
531
        labels = block.get('_atom_site_label')
532
        if labels is None:
533
            labels = [''] * len(asym_unit)
534
        for lab, occ in zip(labels, occupancies):
535
            has_disorder.append(_atom_has_disorder(lab, occ))
536
537
    # remove sites with ?, . or other string
538
    invalid_coords = []
539
    for i, xyz in enumerate(asym_unit):
540
        if not all(isinstance(coord, (int, float)) for coord in xyz):
541
            invalid_coords.append(i)
542
543
    if invalid_coords:
544
        warnings.warn('atoms without sites or missing data will be removed')
545
        asym_unit = [xyz for i, xyz in enumerate(asym_unit) if i not in invalid_coords]
546
        asym_types = [xyz for i, xyz in enumerate(asym_types) if i not in invalid_coords]
547
        if disorder != 'all_sites':
548
            has_disorder = [d for i, d in enumerate(has_disorder) if i not in invalid_coords]
549
550
    remove_sites = []
551
    if remove_hydrogens:
552
        remove_sites.extend(i for i, num in enumerate(asym_types) if num == 1)
553
554
    # remove atoms with fractional occupancy or raise ParseError
555 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
556
        for i, dis in enumerate(has_disorder):
557
            if i in remove_sites:
558
                continue
559
            if dis:
560
                if disorder == 'skip':
0 ignored issues
show
Unused Code introduced by
Unnecessary "elif" after "raise"
Loading history...
561
                    msg = f"{block.name} has disorder, pass disorder='ordered_sites' " \
562
                            "or 'all_sites' to remove/ignore disorder"
563
                    raise ParseError(msg)
564
                elif disorder == 'ordered_sites':
565
                    remove_sites.append(i)
566
567
    asym_unit = [xyz for i, xyz in enumerate(asym_unit) if i not in remove_sites]
568
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
569
    asym_unit = np.array(asym_unit)
570
571
    if asym_unit.shape[0] == 0:
572
        raise ParseError(f'{block.name} has no valid sites')
573
574
    if cartesian:
575
        asym_unit = asym_unit @ np.linalg.inv(cell)
576
    asym_unit = np.mod(asym_unit, 1)
577
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
578
579
    # remove overlapping sites unless disorder == 'all_sites'
580
    if disorder != 'all_sites':
581
        keep_sites = _unique_sites(asym_unit)
582
        if not np.all(keep_sites):
583
            warnings.warn('may have overlapping sites, duplicates will be removed')
584
            asym_unit = asym_unit[keep_sites]
585
            asym_types = [t for t, keep in zip(asym_types, keep_sites) if keep]
586
587
    # symmetry operations
588
    sitesym = block._get_any(CIF_TAGS['symop'])
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like _get_any was declared protected and should not be accessed from this context.

Prefixing a member variable _ is usually regarded as the equivalent of declaring it with protected visibility that exists in other languages. Consequentially, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1;
        self.y = 2;

class MyChild(MyParent):
    def some_method(self):
        return self._x    # Ok, since accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged as AnotherClass is not
                                         # a child class of MyParent
Loading history...
589
    if sitesym is None: # no symops, use spacegroup
590
        try:
591
            spg = block.get_spacegroup(True)
592
            rot, trans = spg.rotations, spg.translations
593
        except: # if none found, assume no symmetries
0 ignored issues
show
Coding Style Best Practice introduced by
General except handlers without types should be used sparingly.

Typically, you would use general except handlers when you intend to specifically handle all types of errors, f.e. when logging. Otherwise, such general error handlers can mask errors in your application that you want to know of.

Loading history...
594
            rot, trans = ase_parse_sitesym(['x,y,z'])
595
    else:
596
        if isinstance(sitesym, str):
597
            sitesym = [sitesym]
598
        rot, trans = ase_parse_sitesym(sitesym)
599
600
    # apply symmetries to asymmetric unit for full motif
601
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
602
    types = np.array([asym_types[i] for i in inverses])
603
    motif = frac_motif @ cell
604
    
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
605
    return PeriodicSet(
606
        motif=motif,
607
        cell=cell,
608
        name=block.name,
609
        asymmetric_unit=asym_inds,
610
        wyckoff_multiplicities=wyc_muls,
611
        types=types
612
    )
613
614
615
def periodicset_from_ase_atoms(
616
        atoms,
617
        remove_hydrogens=False
618
) -> PeriodicSet:
619
    """:class:`ase.atoms.Atoms` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
620
    Does not have the option to remove disorder.
621
622
    Parameters
623
    ----------
624
    atoms : :class:`ase.atoms.Atoms`
625
        An ase Atoms object representing a crystal.
626
    remove_hydrogens : bool, optional
627
        Remove Hydrogens from the crystal.
628
629
    Returns
630
    -------
631
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
632
        Represents the crystal as a periodic set, consisting of a finite set
633
        of points (motif) and lattice (unit cell). Contains other useful data,
634
        e.g. the crystal's name and information about the asymmetric unit for
635
        calculation.
636
637
    Raises
638
    ------
639
    ParseError
640
        Raised if there are no valid sites in atoms.
641
    """
642
643
    cell = atoms.get_cell().array
644
645
    remove_inds = []
646
    if remove_hydrogens:
647
        for i in np.where(atoms.get_atomic_numbers() == 1)[0]:
648
            remove_inds.append(i)
649
650
    for i in sorted(remove_inds, reverse=True):
651
        atoms.pop(i)
652
653
    if len(atoms) == 0:
654
        raise ParseError(f'ase Atoms object has no valid sites')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
655
656
    if 'spacegroup' in atoms.info:
657
        spg = atoms.info['spacegroup']
658
        rot, trans = spg.rotations, spg.translations
659
    # else assume no symmetries?
660
661
    asym_unit = ase.spacegroup.get_basis(atoms, tol=_EQUIV_SITE_TOL) # default tol=1e-5
662
    frac_motif, asym_inds, wyc_muls, _ = _expand_asym_unit(asym_unit, rot, trans)
0 ignored issues
show
introduced by
The variable rot does not seem to be defined in case 'spacegroup' in atoms.info on line 656 is False. Are you sure this can never be the case?
Loading history...
introduced by
The variable trans does not seem to be defined in case 'spacegroup' in atoms.info on line 656 is False. Are you sure this can never be the case?
Loading history...
663
    motif = frac_motif @ cell
664
665
    return PeriodicSet(
666
        motif=motif,
667
        cell=cell,
668
        asymmetric_unit=asym_inds,
669
        wyckoff_multiplicities=wyc_muls,
670
        types=atoms.get_atomic_numbers()
671
    )
672
673
674
def periodicset_from_pymatgen_cifblock(
675
        block,
676
        remove_hydrogens=False,
677
        disorder='skip'
678
) -> PeriodicSet:
679
    """:class:`pymatgen.io.cif.CifBlock` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
680
681
    Parameters
682
    ----------
683
    block : :class:`pymatgen.io.cif.CifBlock`
684
        A pymatgen CifBlock object representing a crystal.
685
    remove_hydrogens : bool, optional
686
        Remove Hydrogens from the crystal.
687
    disorder : str, optional
688
        Controls how disordered structures are handled. Default is ``skip``
689
        which raises a ParseError if the input has any disorder, since
690
        disorder conflicts with the periodic set model. To read disordered
691
        structures anyway, choose either :code:`ordered_sites` to remove sites
692
        with disorder or :code:`all_sites` include all sites regardless.
693
694
    Returns
695
    -------
696
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
697
        Represents the crystal as a periodic set, consisting of a finite set
698
        of points (motif) and lattice (unit cell). Contains other useful data,
699
        e.g. the crystal's name and information about the asymmetric unit for
700
        calculation.
701
702
    Raises
703
    ------
704
    ParseError
705
        Raised if the structure can/should not be parsed for the following reasons:
706
        1. no sites found or motif is empty after removing H or disordered sites,
707
        2. a site has missing coordinates,
708
        3. disorder == 'skip' and any of:
709
            (a) any atom has occupancy < 1,
710
            (b) any atom's label ends with '?'.
711
    """
712
713
    odict = block.data
714
715
    # unit cell
716
    cellpar = [odict.get(tag) for tag in CIF_TAGS['cellpar']]
717
    if None in cellpar:
718
        raise ParseError(f'{block.header} has missing cell data')
719
    cellpar = [ase.io.cif.convert_value(v) for v in cellpar]
720
    cell = cellpar_to_cell(*cellpar)
721
722
    # asymmetric unit coords
723
    cartesian = False # flag needed for later
724
    asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_fract']]
725
726
    if None in asym_unit: # missing scaled coords, try cartesian
727
        asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_cartn']]
728
        if None in asym_unit:
729
            raise ParseError(f'{block.header} has no coordinates')
730
        cartesian = True
731
732
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
733
    # pymatgen does not remove uncertainty brackets
734
    asym_unit = [[ase.io.cif.convert_value(coord) for coord in xyz] for xyz in asym_unit]
735
736
    # atomic types
737
    for tag in CIF_TAGS['atom_symbol']:
738
        asym_symbols = odict.get(tag)
739 View Code Duplication
        if asym_symbols is not None:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
740
            asym_types = []
741
            for label in asym_symbols:
742
                if label in ('.', '?'):
743
                    warnings.warn('missing atomic types will be labelled 0')
744
                    num = 0
745
                else:
746
                    sym = re.search(r'([A-Z][a-z]?)', label).group(0)
747
                    if sym == 'D':
748
                        sym = 'H'
749
                    # could use pymatgen here
750
                    num = ase.data.atomic_numbers[sym]
751
                asym_types.append(num)
752
            break
753
    else:
754
        warnings.warn('missing atomic types will be labelled 0')
755
        asym_types = [0] * len(asym_unit)
756
757
    # find if sites have disorder or not (if necassary)
758
    has_disorder = []
759
    if disorder != 'all_sites':
760
        occupancies = odict.get('_atom_site_occupancy')
761
        if occupancies is None:
762
            occupancies = np.ones((len(asym_unit), ))
763
        labels = odict.get('_atom_site_label')
764
        if labels is None:
765
            labels = [''] * len(asym_unit)
766
        for lab, occ in zip(labels, occupancies):
767
            has_disorder.append(_atom_has_disorder(lab, occ))
768
769
    # remove sites with ?, . or other string
770
    invalid_coords = []
771
    for i, xyz in enumerate(asym_unit):
772
        if not all(isinstance(coord, (int, float)) for coord in xyz):
773
            invalid_coords.append(i)
774
775
    if invalid_coords:
776
        warnings.warn('atoms without sites or missing data will be removed')
777
        asym_unit = [xyz for i, xyz in enumerate(asym_unit) if i not in invalid_coords]
778
        asym_types = [xyz for i, xyz in enumerate(asym_types) if i not in invalid_coords]
779
        if disorder != 'all_sites':
780
            has_disorder = [d for i, d in enumerate(has_disorder) if i not in invalid_coords]
781
782
    remove_sites = []
783
    if remove_hydrogens:
784
        remove_sites.extend((i for i, num in enumerate(asym_types) if num == 1))
0 ignored issues
show
introduced by
The variable i does not seem to be defined in case the for loop on line 771 is not entered. Are you sure this can never be the case?
Loading history...
785
786 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
787
        for i, dis in enumerate(has_disorder):
788
            if i in remove_sites:
789
                continue
790
            if dis:
791
                if disorder == 'skip':
0 ignored issues
show
Unused Code introduced by
Unnecessary "elif" after "raise"
Loading history...
792
                    msg = f"{block.header} has disorder, pass disorder='ordered_sites' " \
793
                            "or 'all_sites' to remove/ignore disorder"
794
                    raise ParseError(msg)
795
                elif disorder == 'ordered_sites':
796
                    remove_sites.append(i)
797
798
    asym_unit = [xyz for i, xyz in enumerate(asym_unit) if i not in remove_sites]
799
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
800
    asym_unit = np.array(asym_unit)
801
802
    if asym_unit.shape[0] == 0:
803
        raise ParseError(f'{block.header} has no valid sites')
804
805
    if cartesian:
806
        asym_unit = asym_unit @ np.linalg.inv(cell)
807
    asym_unit = np.mod(asym_unit, 1)
808
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
809
810
    # remove overlapping sites unless disorder == 'all_sites'
811
    if disorder != 'all_sites':      
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
812
        keep_sites = _unique_sites(asym_unit)
813
        if not np.all(keep_sites):
814
            warnings.warn('may have overlapping sites; duplicates will be removed')
815
        asym_unit = asym_unit[keep_sites]
816
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
817
818
    rot, trans = _parse_sitesym_pymatgen(odict)
819
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
820
    types = np.array([asym_types[i] for i in inverses])
821
    motif = frac_motif @ cell
822
823
    return PeriodicSet(
824
        motif=motif,
825
        cell=cell,
826
        name=block.header,
827
        asymmetric_unit=asym_inds,
828
        wyckoff_multiplicities=wyc_muls,
829
        types=types
830
    )
831
832
833
def periodicset_from_pymatgen_structure(
834
        structure,
835
        remove_hydrogens=False,
836
        disorder='skip'
837
) -> PeriodicSet:
838
    """:class:`pymatgen.core.structure.Structure` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
839
    Does not set the name of the periodic set, as there seems to be no such attribute in
840
    the pymatgen Structure object.
841
842
    Parameters
843
    ----------
844
    structure : :class:`pymatgen.core.structure.Structure`
845
        A pymatgen Structure object representing a crystal.
846
    remove_hydrogens : bool, optional
847
        Remove Hydrogens from the crystal.
848
    disorder : str, optional
849
        Controls how disordered structures are handled. Default is ``skip``
850
        which raises a ParseError if the input has any disorder, since
851
        disorder conflicts with the periodic set model. To read disordered
852
        structures anyway, choose either :code:`ordered_sites` to remove sites
853
        with disorder or :code:`all_sites` include all sites regardless.
854
855
    Returns
856
    -------
857
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
858
        Represents the crystal as a periodic set, consisting of a finite set
859
        of points (motif) and lattice (unit cell). Contains other useful data,
860
        e.g. the crystal's name and information about the asymmetric unit for
861
        calculation.
862
    
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
863
    Raises
864
    ------
865
    ParseError :
866
        Raised if the disorder == 'skip' and not structure.is_ordered
867
    """
868
869
    from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
0 ignored issues
show
introduced by
Unable to import 'pymatgen.symmetry.analyzer'
Loading history...
introduced by
Import outside toplevel (pymatgen.symmetry.analyzer.SpacegroupAnalyzer)
Loading history...
870
871
    if remove_hydrogens:
872
        structure.remove_species(['H', 'D'])
873
874
    if disorder == 'skip':
875
        if not structure.is_ordered:
876
            msg = f"pymatgen Structure has disorder, pass disorder='ordered_sites' " \
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
877
                   "or 'all_sites' to remove/ignore disorder"
878
            raise ParseError(msg)
879
    elif disorder == 'ordered_sites':
880
        remove_inds = []
881
        for i, comp in enumerate(structure.species_and_occu):
882
            if comp.num_atoms < 1:
883
                remove_inds.append(i)
884
        structure.remove_sites(remove_inds)
885
886
    motif = structure.cart_coords
887
    cell = structure.lattice.matrix
888
    sym_structure = SpacegroupAnalyzer(structure).get_symmetrized_structure()
889
    equiv_inds = sym_structure.equivalent_indices
890
    asym_unit = np.array([l[0] for l in equiv_inds])
891
    wyc_muls = np.array([len(l) for l in equiv_inds])
892
    types = np.array(sym_structure.atomic_numbers)
893
894
    return PeriodicSet(
895
        motif=motif,
896
        cell=cell,
897
        asymmetric_unit=asym_unit,
898
        wyckoff_multiplicities=wyc_muls,
899
        types=types
900
    )
901
902
903
def periodicset_from_ccdc_entry(
904
        entry,
905
        remove_hydrogens=False,
906
        disorder='skip',
907
        heaviest_component=False,
908
        molecular_centres=False
909
) -> PeriodicSet:
910
    """:class:`ccdc.entry.Entry` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
911
    Entry is the type returned by :class:`ccdc.io.EntryReader`.
912
913
    Parameters
914
    ----------
915
    entry : :class:`ccdc.entry.Entry`
916
        A ccdc Entry object representing a database entry.
917
    remove_hydrogens : bool, optional
918
        Remove Hydrogens from the crystal.
919
    disorder : str, optional
920
        Controls how disordered structures are handled. Default is ``skip``
921
        which raises a ParseError if the input has any disorder, since
922
        disorder conflicts with the periodic set model. To read disordered
923
        structures anyway, choose either :code:`ordered_sites` to remove sites
924
        with disorder or :code:`all_sites` include all sites regardless.
925
    heaviest_component : bool, optional
926
        Removes all but the heaviest molecule in the asymmeric unit, intended
927
        for removing solvents.
928
    molecular_centres : bool, default False
929
        Extract the centres of molecules in the unit cell and store in
930
        the attribute molecular_centres of the returned PeriodicSet.
931
932
    Returns
933
    -------
934
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
935
        Represents the crystal as a periodic set, consisting of a finite set
936
        of points (motif) and lattice (unit cell). Contains other useful data,
937
        e.g. the crystal's name and information about the asymmetric unit for
938
        calculation.
939
940
    Raises
941
    ------
942
    ParseError
943
        Raised if the structure fails parsing for any of the following:
944
        1. entry.has_3d_structure is False,
945
        2. disorder == 'skip' and any of:
946
            (a) entry.has_disorder or crystal.has_disorder is True,
947
            (b) any atom has occupancy < 1,
948
            (c) any atom's label ends with '?',
949
        3. entry.crystal.molecule.all_atoms_have_sites is False,
950
        4. a.fractional_coordinates is None for any a in entry.crystal.disordered_molecule,
951
        5. The motif is empty after removing H, disordered sites or solvents.
952
    """
953
954
    # Entry specific flags
955
    if not entry.has_3d_structure:
956
        raise ParseError(f'{entry.identifier} has no 3D structure')
957
958
    if disorder == 'skip' and entry.has_disorder:
959
        msg = f"{entry.identifier} has disorder, pass disorder='ordered_sites' " \
960
                "or 'all_sites' to remove/ignore disorder"
961
        raise ParseError(msg)
962
963
    return periodicset_from_ccdc_crystal(
964
        entry.crystal,
965
        remove_hydrogens=remove_hydrogens,
966
        disorder=disorder,
967
        heaviest_component=heaviest_component,
968
        molecular_centres=molecular_centres
969
    )
970
971
972
def periodicset_from_ccdc_crystal(
973
        crystal,
974
        remove_hydrogens=False,
975
        disorder='skip',
976
        heaviest_component=False,
977
        molecular_centres=False
978
) -> PeriodicSet:
979
    """:class:`ccdc.crystal.Crystal` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
980
    Crystal is the type returned by :class:`ccdc.io.CrystalReader`.
981
982
    Parameters
983
    ----------
984
    crystal : :class:`ccdc.crystal.Crystal`
985
        A ccdc Crystal object representing a crystal structure.
986
    remove_hydrogens : bool, optional
987
        Remove Hydrogens from the crystal.
988
    disorder : str, optional
989
        Controls how disordered structures are handled. Default is ``skip``
990
        which raises a ParseError if the input has any disorder, since
991
        disorder conflicts with the periodic set model. To read disordered
992
        structures anyway, choose either :code:`ordered_sites` to remove sites
993
        with disorder or :code:`all_sites` include all sites regardless.
994
    heaviest_component : bool, optional
995
        Removes all but the heaviest molecule in the asymmeric unit,
996
        intended for removing solvents.
997
    molecular_centres : bool, default False
998
        Extract the centres of molecules in the unit cell and store in
999
        the attribute molecular_centres of the returned PeriodicSet.
1000
1001
    Returns
1002
    -------
1003
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1004
        Represents the crystal as a periodic set, consisting of a finite set
1005
        of points (motif) and lattice (unit cell). Contains other useful data,
1006
        e.g. the crystal's name and information about the asymmetric unit for
1007
        calculation.
1008
1009
    Raises
1010
    ------
1011
    ParseError
1012
        Raised if the structure fails parsing for any of the following:
1013
        1. disorder == 'skip' and any of:
1014
            (a) crystal.has_disorder is True,
1015
            (b) any atom has occupancy < 1,
1016
            (c) any atom's label ends with '?',
1017
        2. crystal.molecule.all_atoms_have_sites is False,
1018
        3. a.fractional_coordinates is None for any a in crystal.disordered_molecule,
1019
        4. The motif is empty after removing H, disordered sites or solvents.
1020
    """
1021
1022
    molecule = crystal.disordered_molecule
1023
1024
    # if skipping disorder, check disorder flags and then all atoms
1025
    if disorder == 'skip':
1026
        if crystal.has_disorder or \
1027
         any(_atom_has_disorder(a.label, a.occupancy) for a in molecule.atoms):
1028
            msg = f"{crystal.identifier} has disorder, pass disorder='ordered_sites' " \
1029
                   "or 'all_sites' to remove/ignore disorder"
1030
            raise ParseError(msg)
1031
1032
    elif disorder == 'ordered_sites':
1033
        molecule.remove_atoms(a for a in molecule.atoms
1034
                              if _atom_has_disorder(a.label, a.occupancy))
1035
1036
    if remove_hydrogens:
1037
        molecule.remove_atoms(a for a in molecule.atoms if a.atomic_symbol in 'HD')
1038
1039
    if heaviest_component and len(molecule.components) > 1:
1040
        molecule = _heaviest_component(molecule)
1041
1042
    # remove atoms with missing coordinate data and warn
1043
    is_missing = (a.fractional_coordinates is None for a in molecule.atoms)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable a does not seem to be defined.
Loading history...
1044
    if any(is_missing):
1045
        warnings.warn('atoms without sites or missing data will be removed')
1046
        molecule.remove_atoms(a for a, missing in zip(molecule.atoms, is_missing) if missing)
1047
1048
    if not molecule.all_atoms_have_sites:
1049
        raise ParseError(f'{crystal.identifier} has atoms without sites')
1050
1051
    crystal.molecule = molecule
1052
    cell = cellpar_to_cell(*crystal.cell_lengths, *crystal.cell_angles)
1053
1054
    if molecular_centres:
1055
        frac_centres = _frac_molecular_centres_ccdc(crystal)
1056
        mol_centres = frac_centres @ cell
1057
        return PeriodicSet(mol_centres, cell, name=crystal.identifier)
1058
1059
    asym_atoms = crystal.asymmetric_unit_molecule.atoms
1060
    # check for None?
1061
    asym_unit = np.array([tuple(a.fractional_coordinates) for a in asym_atoms])
1062
1063
    if asym_unit.shape[0] == 0:
1064
        raise ParseError(f'{crystal.identifier} has no valid sites')
1065
1066
    asym_unit = np.mod(asym_unit, 1)
1067
    # asym_unit = _snap_small_prec_coords(asym_unit)
1068
    asym_types = [a.atomic_number for a in asym_atoms]
1069
1070
    if disorder != 'all_sites':
1071
        keep_sites = _unique_sites(asym_unit)
1072
        if not np.all(keep_sites):
1073
            warnings.warn('may have overlapping sites; duplicates will be removed')
1074
        asym_unit = asym_unit[keep_sites]
1075
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1076
1077
    sitesym = crystal.symmetry_operators
1078
    # try spacegroup numbers?
1079
    if not sitesym:
1080
        sitesym = ['x,y,z']
1081
1082
    rot = np.array([np.array(crystal.symmetry_rotation(op)).reshape((3, 3)) 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
1083
                    for op in sitesym])
1084
    trans = np.array([crystal.symmetry_translation(op) for op in sitesym])
1085
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1086
    motif = frac_motif @ cell
1087
    types = np.array([asym_types[i] for i in inverses])
1088
1089
    return PeriodicSet(
1090
        motif=motif,
1091
        cell=cell,
1092
        name=crystal.identifier,
1093
        asymmetric_unit=asym_inds,
1094
        wyckoff_multiplicities=wyc_muls,
1095
        types=types
1096
    )
1097
1098
1099
# function is not quite finished.
1100
def periodicset_from_gemmi_block(
1101
        block,
1102
        remove_hydrogens=False,
1103
        disorder='skip'
1104
) -> PeriodicSet:
1105
    """:class:`gemmi.cif.Block` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
1106
    Block is the type returned by :class:`gemmi.cif.read_file`.
1107
1108
    Parameters
1109
    ----------
1110
    block : :class:`ase.io.cif.CIFBlock`
1111
        An ase CIFBlock object representing a crystal.
1112
    remove_hydrogens : bool, optional
1113
        Remove Hydrogens from the crystal.
1114
    disorder : str, optional
1115
        Controls how disordered structures are handled. Default is ``skip``
1116
        which raises a ParseError if the input has any disorder, since
1117
        disorder conflicts with the periodic set model. To read disordered
1118
        structures anyway, choose either :code:`ordered_sites` to remove sites
1119
        with disorder or :code:`all_sites` include all sites regardless.
1120
1121
    Returns
1122
    -------
1123
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1124
        Represents the crystal as a periodic set, consisting of a finite set
1125
        of points (motif) and lattice (unit cell). Contains other useful data,
1126
        e.g. the crystal's name and information about the asymmetric unit for
1127
        calculation.
1128
1129
    Raises
1130
    ------
1131
    ParseError
1132
        Raised if the structure fails to be parsed for any of the following:
1133
        1. Required data is missing (e.g. cell parameters),
1134
        2. disorder == 'skip' and any of:
1135
            (a) any atom has occupancy < 1,
1136
            (b) any atom's label ends with '?',
1137
        3. The motif is empty after removing H or disordered sites.
1138
    """
1139
1140
    cellpar = [ase.io.cif.convert_value(block.find_value(tag))
1141
               for tag in CIF_TAGS['cellpar']]
1142
    if None in cellpar:
1143
        raise ParseError(f'{block.name} has missing cell information')
1144
1145
    cell = cellpar_to_cell(*cellpar)
1146
1147
    xyz_loop = block.find(CIF_TAGS['atom_site_fract']).loop
1148
    if xyz_loop is None:
1149
        # check for crtsn!
1150
        raise ParseError(f'{block.name} has missing coordinate data')
1151
1152
    # loop will contain all xyz cols, maybe labels, maybe types, maybe occs
1153
1154
    loop_dict = _gemmi_loop_to_dict(xyz_loop)
1155
    xyz_str = [loop_dict[t] for t in CIF_TAGS['atom_site_fract']]
1156
    asym_unit = [[ase.io.cif.convert_value(c) for c in coords] for coords in xyz_str]
1157
1158
    asym_unit = np.mod(np.array(asym_unit).T, 1)
1159
    # asym_unit = _snap_small_prec_coords(asym_unit)
1160
1161
    if '_atom_site_type_symbol' in loop_dict:
1162
        asym_syms = loop_dict['_atom_site_type_symbol']
1163
        asym_types = [ase.data.atomic_numbers[s] for s in asym_syms]
1164
    else:
1165
        warnings.warn('missing atomic types')
1166
        asym_types = [0 for _ in range(len(asym_unit))]
1167
1168
    remove_sites = []
1169
1170
    # if labels exist, check them for disorder
1171
    if '_atom_site_label' in loop_dict:
1172
        labels = loop_dict['_atom_site_label']
1173
    else:
1174
        labels = [''] * xyz_loop.length()
1175
1176
    # if occupancies exist, check them for disorder
1177
    if '_atom_site_occupancy' in loop_dict:
1178
        occupancies = [ase.io.cif.convert_value(occ)
1179
                       for occ in loop_dict['_atom_site_occupancy']]
1180
    else:
1181
        occupancies = [None for _ in range(xyz_loop.length())]
1182
1183
    if disorder == 'skip':
1184
        if any(_atom_has_disorder(lab, occ) for lab, occ in zip(labels, occupancies)):
1185
            msg = f"{block.name} has disorder, pass disorder='ordered_sites' " \
1186
                "or 'all_sites' to remove/ignore disorder"
1187
            raise ParseError(msg)
1188
    elif disorder == 'ordered_sites':
1189
        for i, (lab, occ) in enumerate(zip(labels, occupancies)):
1190
            if _atom_has_disorder(lab, occ):
1191
                remove_sites.append(i)
1192
1193
    if remove_hydrogens:
1194
        remove_sites.extend((i for i, num in enumerate(asym_types) if num == 1))
0 ignored issues
show
introduced by
The variable i does not seem to be defined for all execution paths.
Loading history...
1195
1196
    asym_unit = np.delete(asym_unit, remove_sites, axis=0)
1197
    asym_types = [s for i, s in enumerate(asym_types) if i not in remove_sites]
1198
1199
    if disorder != 'all_sites':
1200
        keep_sites = _unique_sites(asym_unit)
1201
        if not np.all(keep_sites):
1202
            warnings.warn('may have overlapping sites; duplicates will be removed')
1203
        asym_unit = asym_unit[keep_sites]
1204
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1205
1206
    if asym_unit.shape[0] == 0:
1207
        raise ParseError(f'{block.name} has no valid sites')
1208
1209
    # get symops
1210
    sitesym = []
1211
    for tag in CIF_TAGS['symop']:
1212
        symop_loop = block.find([tag]).loop
1213
        if symop_loop is not None:
1214
            symop_loop_dict = _gemmi_loop_to_dict(symop_loop)
1215
            sitesym = symop_loop_dict[tag]
1216
            break
1217
    # try spacegroup names/nums?
1218
    if not sitesym:
1219
        sitesym = ['x,y,z',]
1220
1221
    rot, trans = ase_parse_sitesym(sitesym)
1222
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1223
    types = np.array([asym_types[i] for i in inverses])
1224
    motif = frac_motif @ cell
1225
1226
    return PeriodicSet(
1227
        motif=motif,
1228
        cell=cell,
1229
        name=block.name,
1230
        asymmetric_unit=asym_inds,
1231
        wyckoff_multiplicities=wyc_muls,
1232
        types=types
1233
    )
1234
1235
1236
def _expand_asym_unit(
1237
        asym_unit: np.ndarray,
1238
        rotations: np.ndarray,
1239
        translations: np.ndarray
1240
) -> Tuple[np.ndarray, ...]:
1241
    """
1242
    Asymmetric unit frac coords, list of rotations, list of translations
1243
    -->
1244
    full fractional motif, asymmetric unit indices, multiplicities, inverses.
1245
    """
1246
1247
    all_sites = []
1248
    asym_inds = [0]
1249
    multiplicities = []
1250
    inverses = []
1251
    m, dims = asym_unit.shape
1252
1253
    expanded_sites = np.zeros((m, len(rotations), dims))
1254
    for i in range(m):
1255
        expanded_sites[i] = np.dot(rotations, asym_unit[i]) + translations
1256
    expanded_sites = np.mod(expanded_sites, 1)
1257
1258
    for inv, sites in enumerate(expanded_sites):
1259
1260
        multiplicity = 0
1261
1262
        for site_ in sites:
1263
1264
            if not all_sites:
1265
                all_sites.append(site_)
1266
                inverses.append(inv)
1267
                multiplicity += 1
1268
                continue
1269
1270
            # check if site_ overlaps with existing sites
1271
            diffs1 = np.abs(site_ - all_sites)
1272
            diffs2 = np.abs(diffs1 - 1)
1273
            mask = np.all((diffs1 <= _EQUIV_SITE_TOL) | (diffs2 <= _EQUIV_SITE_TOL), axis=-1)
1274
1275
            if np.any(mask):
1276
                where_equal = np.argwhere(mask).flatten()
1277
                for ind in where_equal:
1278
                    if inverses[ind] == inv: # invariant
1279
                        pass
1280
                    else: # equivalent to a different site
1281
                        warnings.warn(f'has equivalent sites at positions {inverses[ind]}, {inv}')
1282
            else:
1283
                all_sites.append(site_)
1284
                inverses.append(inv)
1285
                multiplicity += 1
1286
1287
        if multiplicity > 0:
1288
            multiplicities.append(multiplicity)
1289
            asym_inds.append(len(all_sites))
1290
1291
    frac_motif = np.array(all_sites)
1292
    asym_inds = np.array(asym_inds[:-1])
1293
    multiplicities = np.array(multiplicities)
1294
1295
    return frac_motif, asym_inds, multiplicities, inverses
1296
1297
1298
@numba.njit()
1299
def _unique_sites(asym_unit):
1300
    """Uniquify (within _EQUIV_SITE_TOL) a list of fractional coordinates,
1301
    considering all points modulo 1. Returns an array of bools such that
1302
    asym_unit[_unique_sites(asym_unit)] is a uniquified list."""
1303
    site_diffs1 = np.abs(np.expand_dims(asym_unit, 1) - asym_unit)
1304
    site_diffs2 = np.abs(site_diffs1 - 1)
1305
    sites_neq_mask = np.logical_and((site_diffs1 > _EQUIV_SITE_TOL),
1306
                                    (site_diffs2 > _EQUIV_SITE_TOL))
1307
    overlapping = np.triu(sites_neq_mask.sum(axis=-1) == 0, 1)
1308
    return overlapping.sum(axis=0) == 0
1309
1310
1311
def _parse_sitesym_pymatgen(data):
1312
    """In order to generate symmetry equivalent positions, the symmetry
1313
    operations are parsed. If the symops are not present, the space
1314
    group symbol is parsed, and symops are generated."""
1315
1316
    from pymatgen.symmetry.groups import SpaceGroup
0 ignored issues
show
introduced by
Unable to import 'pymatgen.symmetry.groups'
Loading history...
introduced by
Import outside toplevel (pymatgen.symmetry.groups.SpaceGroup)
Loading history...
1317
    from pymatgen.core.operations import SymmOp
0 ignored issues
show
introduced by
Unable to import 'pymatgen.core.operations'
Loading history...
introduced by
Import outside toplevel (pymatgen.core.operations.SymmOp)
Loading history...
1318
    import pymatgen.io.cif
0 ignored issues
show
introduced by
Import outside toplevel (pymatgen.io.cif)
Loading history...
introduced by
Unable to import 'pymatgen.io.cif'
Loading history...
1319
1320
    symops = []
1321
1322
    # try to parse xyz symops
1323
    for symmetry_label in CIF_TAGS['symop']:
1324
1325
        xyz = data.get(symmetry_label)
1326
        if not xyz:
1327
            continue
1328
        if isinstance(xyz, str):
1329
            xyz = [xyz]
1330
        try:
1331
            symops = [SymmOp.from_xyz_string(s) for s in xyz]
1332
            break
1333
        except ValueError:
1334
            continue
1335
1336
    # try to parse symbol
1337
    if not symops:
1338
1339
        for symmetry_label in CIF_TAGS['spacegroup_name']:
1340
1341
            sg = data.get(symmetry_label)
1342
            if not sg:
1343
                continue
1344
            sg = re.sub(r'[\s_]', '', sg)
1345
1346
            try:
1347
                spg = pymatgen.io.cif.space_groups.get(sg)
1348
                if not spg:
1349
                    continue
1350
                symops = SpaceGroup(spg).symmetry_ops
1351
                break
1352
            except ValueError:
1353
                pass
1354
1355
            try:
1356
                for d in pymatgen.io.cif._get_cod_data():
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like _get_cod_data was declared protected and should not be accessed from this context.

Prefixing a member variable _ is usually regarded as the equivalent of declaring it with protected visibility that exists in other languages. Consequentially, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1;
        self.y = 2;

class MyChild(MyParent):
    def some_method(self):
        return self._x    # Ok, since accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged as AnotherClass is not
                                         # a child class of MyParent
Loading history...
1357
                    if sg == re.sub(r'\s+', '', d['hermann_mauguin']):
1358
                        xyz = d['symops']
1359
                        symops = [SymmOp.from_xyz_string(s) for s in xyz]
1360
                        break
1361
            except Exception:
0 ignored issues
show
Best Practice introduced by
Catching very general exceptions such as Exception is usually not recommended.

Generally, you would want to handle very specific errors in the exception handler. This ensure that you do not hide other types of errors which should be fixed.

So, unless you specifically plan to handle any error, consider adding a more specific exception.

Loading history...
1362
                continue
1363
1364
            if symops:
1365
                break
1366
1367
    # try to parse international number
1368
    if not symops:
1369
        for symmetry_label in CIF_TAGS['spacegroup_number']:
1370
            num = data.get(symmetry_label)
1371
            if not num:
1372
                continue
1373
1374
            try:
1375
                i = int(ase.io.cif.convert_value(num))
1376
                symops = SpaceGroup.from_int_number(i).symmetry_ops
1377
                break
1378
            except ValueError:
1379
                continue
1380
1381
    if not symops:
1382
        symops = [SymmOp.from_xyz_string(s) for s in ['x', 'y', 'z']]
1383
1384
    rotations = [op.rotation_matrix for op in symops]
1385
    translations = [op.translation_vector for op in symops]
1386
1387
    return rotations, translations
1388
1389
1390
def _frac_molecular_centres_ccdc(crystal):
1391
    """Returns any geometric centres of molecules in the unit cell.
1392
    Expects a ccdc Crystal object and returns fractional coordiantes."""
1393
1394
    frac_centres = []
1395
    for comp in crystal.packing(inclusion='CentroidIncluded').components:
1396
        coords = [a.fractional_coordinates for a in comp.atoms]
1397
        x, y, z = zip(*coords)
1398
        m = len(coords)
1399
        frac_centres.append((sum(x) / m, sum(y) / m, sum(z) / m))
1400
    frac_centres = np.mod(np.array(frac_centres), 1)
1401
    return frac_centres[_unique_sites(frac_centres)]
1402
1403
1404
def _atom_has_disorder(label, occupancy):
1405
    """Return True if label ends with ? or occupancy is a number < 1."""
1406
    return label.endswith('?') or (np.isscalar(occupancy) and occupancy < 1)
1407
1408
1409
# def _snap_small_prec_coords(frac_coords):
1410
#     """Find where frac_coords is within 1e-4 of 1/3 or 2/3, change to 1/3 and 2/3.
1411
#     Recommended by pymatgen's CIF parser. May use later."""
1412
#     frac_coords[np.abs(1 - 3 * frac_coords) < 1e-4] = 1 / 3.
1413
#     frac_coords[np.abs(1 - 3 * frac_coords / 2) < 1e-4] = 2 / 3.
1414
#     return frac_coords
1415
1416
1417
def _heaviest_component(molecule):
1418
    """Heaviest component (removes all but the heaviest component of the asym unit).
1419
    Intended for removing solvents. Probably doesn't play well with disorder"""
1420
    component_weights = []
1421
    for component in molecule.components:
1422
        weight = 0
1423
        for a in component.atoms:
1424
            if isinstance(a.atomic_weight, (float, int)):
1425
                if isinstance(a.occupancy, (float, int)):
1426
                    weight += a.occupancy * a.atomic_weight
1427
                else:
1428
                    weight += a.atomic_weight
1429
        component_weights.append(weight)
1430
    largest_component_ind = np.argmax(np.array(component_weights))
1431
    molecule = molecule.components[largest_component_ind]
1432
    return molecule
1433
1434
1435
def _refcodes_from_families(refcode_families):
1436
    """List of strings --> all CSD refcodes starting with any of the strings.
1437
    Intended to be passed a list of families and return all refcodes in them."""
1438
1439
    try:
1440
        import ccdc.search
0 ignored issues
show
introduced by
Import outside toplevel (ccdc.search)
Loading history...
1441
    except (ImportError, RuntimeError) as _:
1442
        msg = 'Failed to import csd-python-api, please'\
1443
              'check it is installed and licensed.'
1444
        raise ImportError(msg)
1445
1446
    all_refcodes = []
1447
    for refcode in refcode_families:
1448
        query = ccdc.search.TextNumericSearch()
1449
        query.add_identifier(refcode)
1450
        hits = [hit.identifier for hit in query.search()]
1451
        all_refcodes.extend(hits)
1452
1453
    # filter to unique refcodes
1454
    seen = set()
1455
    seen_add = seen.add
1456
    refcodes = [
1457
        refcode for refcode in all_refcodes
1458
        if not (refcode in seen or seen_add(refcode))]
1459
1460
    return refcodes
1461
1462
1463
def _gemmi_loop_to_dict(gemmi_loop):
1464
    """gemmi Loop object --> dict, tags: values"""
1465
    tablified_loop = [[] for _ in range(len(gemmi_loop.tags))]
1466
    n_cols = gemmi_loop.width()
1467
    for i, item in enumerate(gemmi_loop.values):
1468
        tablified_loop[i % n_cols].append(item)
1469
    return {tag: l for tag, l in zip(gemmi_loop.tags, tablified_loop)}
0 ignored issues
show
Unused Code introduced by
Unnecessary use of a comprehension
Loading history...
1470