Passed
Push — master ( bc96b3...eb28f3 )
by Daniel
06:34
created

amd.io._parse_sitesyms()   F

Complexity

Conditions 16

Size

Total Lines 47
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 34
dl 0
loc 47
rs 2.4
c 0
b 0
f 0
cc 16
nop 1

How to fix   Complexity   

Complexity

Complex classes like amd.io._parse_sitesyms() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Tools for reading crystals from files, or from the CSD with
2
``csd-python-api``. The readers return
3
:class:`amd.PeriodicSet <.periodicset.PeriodicSet>` objects representing
4
the crystal which can be passed to :func:`amd.AMD() <.calculate.AMD>`
5
and :func:`amd.PDD() <.calculate.PDD>`.
6
"""
7
8
import os
9
import re
10
import functools
11
import warnings
12
import errno
13
from typing import Tuple
14
15
import numpy as np
16
import numba
17
18
from .utils import cellpar_to_cell
19
from .periodicset import PeriodicSet
20
21
22
def _custom_warning(message, category, filename, lineno, *args, **kwargs):
23
    return f'{category.__name__}: {message}\n'
24
warnings.formatwarning = _custom_warning
25
26
_EQUIV_SITE_TOL = 1e-3
27
28
CIF_TAGS = {
29
    'cellpar': [
30
        '_cell_length_a',
31
        '_cell_length_b',
32
        '_cell_length_c',
33
        '_cell_angle_alpha',
34
        '_cell_angle_beta',
35
        '_cell_angle_gamma',],
36
37
    'atom_site_fract': [
38
        '_atom_site_fract_x',
39
        '_atom_site_fract_y',
40
        '_atom_site_fract_z',],
41
42
    'atom_site_cartn': [
43
        '_atom_site_cartn_x',
44
        '_atom_site_cartn_y',
45
        '_atom_site_cartn_z',],
46
47
    'atom_symbol': [
48
        '_atom_site_type_symbol',
49
        '_atom_site_label',],
50
51
    'symop': [
52
        '_symmetry_equiv_pos_as_xyz',
53
        '_space_group_symop_operation_xyz',
54
        '_space_group_symop.operation_xyz',
55
        '_symmetry_equiv_pos_as_xyz_',
56
        '_space_group_symop_operation_xyz_',],
57
58
    'spacegroup_name': [
59
        '_space_group_name_Hall',
60
        '_symmetry_space_group_name_hall',
61
        '_space_group_name_H-M_alt',
62
        '_symmetry_space_group_name_H-M',
63
        '_symmetry_space_group_name_H_M',
64
        '_symmetry_space_group_name_h-m',],
65
66
    'spacegroup_number': [
67
        '_space_group_IT_number',
68
        '_symmetry_Int_Tables_number',
69
        '_space_group_IT_number_',
70
        '_symmetry_Int_Tables_number_',],
71
}
72
73
74
class _Reader:
75
76
    _DISORDER_OPTIONS = {'skip', 'ordered_sites', 'all_sites'}
77
    _CCDC_IMPORT_ERR_MSG = 'Failed to import csd-python-api, please check ' \
78
                           'it is installed and licensed.'
79
80
    def __init__(self, iterable, converter, show_warnings=True):
81
        self._iterator = iter(iterable)
82
        self._converter = converter
83
        self.show_warnings = show_warnings
84
85
    def __iter__(self):
86
        if self._iterator is None or self._converter is None:
87
            raise RuntimeError(f'{self.__class__.__name__} not initialized.')
88
        return self
89
90
    def __next__(self):
91
        """Iterates over self._iterator, passing items through
92
        self._converter. Catches :class:`ParseError <.io.ParseError>`
93
        and warnings raised in self._converter, optionally printing
94
        them.
95
        """
96
97
        if not self.show_warnings:
98
            warnings.simplefilter('ignore')
99
100
        while True:
101
102
            item = next(self._iterator)
103
104
            with warnings.catch_warnings(record=True) as warning_msgs:
105
                msg = None
106
                try:
107
                    periodic_set = self._converter(item)
108
                except ParseError as err:
109
                    msg = str(err)
110
111
            if msg:
112
                warnings.warn(msg)
113
                continue
114
115
            for warning in warning_msgs:
116
                msg = f'(name={periodic_set.name}) {warning.message}'
117
                warnings.warn(msg, category=warning.category)
118
119
            return periodic_set
120
121
    def read(self):
122
        """Reads the crystal(s), returns one
123
        :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` if there is
124
        only one, otherwise returns a list.
125
        """
126
127
        l = list(self)
128
        if len(l) == 1:
129
            return l[0]
130
        return l
131
132
133
class CifReader(_Reader):
134
    """Read all structures in a .cif file or all files in a folder
135
    with ase or csd-python-api (if installed), yielding
136
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
137
138
    Parameters
139
    ----------
140
    path : str
141
        Path to a .CIF file or directory. (Other files are accepted when
142
        using ``reader='ccdc'``, if csd-python-api is installed.)
143
    reader : str, optional
144
        The backend package used to parse the CIF. The default is 
145
        :code:`ase`, :code:`pymatgen` and :code:`gemmi` are also
146
        accepted, as well as :code:`ccdc` if csd-python-api is 
147
        installed. The ccdc reader should be able to read any format
148
        accepted by :class:`ccdc.io.EntryReader`, though only CIFs have
149
        been tested.
150
    remove_hydrogens : bool, optional
151
        Remove Hydrogens from the crystals.
152
    disorder : str, optional
153
        Controls how disordered structures are handled. Default is
154
        ``skip`` which skips any crystal with disorder, since disorder
155
        conflicts with the periodic set model. To read disordered
156
        structures anyway, choose either :code:`ordered_sites` to remove
157
        atoms with disorder or :code:`all_sites` include all atoms
158
        regardless of disorder.
159
    heaviest_component : bool, optional
160
        csd-python-api only. Removes all but the heaviest molecule in
161
        the asymmeric unit, intended for removing solvents.
162
    molecular_centres : bool, default False
163
        csd-python-api only. Extract the centres of molecules in the
164
        unit cell and store in the attribute molecular_centres.
165
    show_warnings : bool, optional
166
        Controls whether warnings that arise during reading are printed.
167
168
    Yields
169
    ------
170
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
171
        Represents the crystal as a periodic set, consisting of a finite
172
        set of points (motif) and lattice (unit cell). Contains other
173
        data, e.g. the crystal's name and information about the
174
        asymmetric unit.
175
176
    Examples
177
    --------
178
179
        ::
180
181
            # Put all crystals in a .CIF in a list
182
            structures = list(amd.CifReader('mycif.cif'))
183
184
            # Can also accept path to a directory, reading all files inside
185
            structures = list(amd.CifReader('path/to/folder'))
186
187
            # Reads just one if the .CIF has just one crystal
188
            periodic_set = amd.CifReader('mycif.cif').read()
189
190
            # List of AMDs (k=100) of crystals in a .CIF
191
            amds = [amd.AMD(periodic_set, 100) for periodic_set in amd.CifReader('mycif.cif')]
192
    """
193
194
    def __init__(
195
            self,
196
            path,
197
            reader='ase',
198
            remove_hydrogens=False,
199
            disorder='skip',
200
            heaviest_component=False,
201
            molecular_centres=False,
202
            show_warnings=True,
203
    ):
204
205
        if disorder not in CifReader._DISORDER_OPTIONS:
206
            msg = 'disorder parameter must be one of ' \
207
                  f'{CifReader._DISORDER_OPTIONS} (passed "{disorder}")'
208
            raise ValueError(msg)
209
210
        if reader != 'ccdc':
211
            if heaviest_component:
212
                msg = 'Parameter heaviest_component only implemented for ' \
213
                      'reader="ccdc".'
214
                raise NotImplementedError(msg)
215
216
            if molecular_centres:
217
                msg = 'Parameter molecular_centres only implemented for ' \
218
                      'reader="ccdc".'
219
                raise NotImplementedError(msg)
220
221
        if reader in ('ase', 'pycodcif'):
222
            from ase.io.cif import parse_cif
223
            extensions = {'cif'}
224
            file_parser = functools.partial(parse_cif, reader=reader)
225
            converter = functools.partial(
226
                periodicset_from_ase_cifblock,
227
                remove_hydrogens=remove_hydrogens,
228
                disorder=disorder
229
            )
230
231
        elif reader == 'pymatgen':
232
            extensions = {'cif'}
233
            file_parser = CifReader._pymatgen_cifblock_generator
234
            converter = functools.partial(
235
                periodicset_from_pymatgen_cifblock,
236
                remove_hydrogens=remove_hydrogens,
237
                disorder=disorder
238
            )
239
240
        elif reader == 'gemmi':
241
            import gemmi
242
            extensions = {'cif'}
243
            file_parser = gemmi.cif.read_file
244
            converter = functools.partial(
245
                periodicset_from_gemmi_block,
246
                remove_hydrogens=remove_hydrogens,
247
                disorder=disorder
248
            )
249
250
        elif reader == 'ccdc':
251
            try:
252
                import ccdc.io
253
            except (ImportError, RuntimeError) as e:
254
                raise ImportError(_Reader._CCDC_IMPORT_ERR_MSG) from e
255
256
            extensions = ccdc.io.EntryReader.known_suffixes
257
            file_parser = ccdc.io.EntryReader
258
            converter = functools.partial(
259
                periodicset_from_ccdc_entry,
260
                remove_hydrogens=remove_hydrogens,
261
                disorder=disorder,
262
                molecular_centres=molecular_centres,
263
                heaviest_component=heaviest_component
264
            )
265
266
        else:
267
            raise ValueError(f'Unknown reader "{reader}".')
268
269
        if os.path.isfile(path):
270
            iterable = file_parser(path)
271
        elif os.path.isdir(path):
272
            iterable = CifReader._dir_generator(path, file_parser, extensions)
273
        else:
274
            raise FileNotFoundError(
275
                errno.ENOENT, os.strerror(errno.ENOENT), path)
276
277
        super().__init__(iterable, converter, show_warnings=show_warnings)
278
279
    @staticmethod
280
    def _dir_generator(path, callable, extensions):
281
        for file in os.listdir(path):
282
            suff = os.path.splitext(file)[1][1:]
283
            if suff.lower() in extensions:
284
                yield from callable(os.path.join(path, file))
285
286
    @staticmethod
287
    def _pymatgen_cifblock_generator(path):
288
        """Path to .cif --> generator of pymatgen CifBlock objects."""
289
        from pymatgen.io.cif import CifFile
290
        yield from CifFile.from_file(path).data.values()
291
292
293
class CSDReader(_Reader):
294
    """Read structures from the CSD with csd-python-api, yielding
295
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
296
297
    Parameters
298
    ----------
299
    refcodes : str or List[str], optional
300
        Single or list of CSD refcodes to read. If None or 'CSD',
301
        iterates over the whole CSD.
302
    families : bool, optional
303
        Read all entries whose refcode starts with the given strings, or
304
        'families' (e.g. giving 'DEBXIT' reads all entries starting with
305
        DEBXIT).
306
    remove_hydrogens : bool, optional
307
        Remove hydrogens from the crystals.
308
    disorder : str, optional
309
        Controls how disordered structures are handled. Default is
310
        ``skip`` which skips any crystal with disorder, since disorder
311
        conflicts with the periodic set model. To read disordered
312
        structures anyway, choose either :code:`ordered_sites` to remove
313
        atoms with disorder or :code:`all_sites` include all atoms
314
        regardless of disorder.
315
    heaviest_component : bool, optional
316
        Removes all but the heaviest molecule in the asymmeric unit,
317
        intended for removing solvents.
318
    molecular_centres : bool, default False
319
        Extract the centres of molecules in the unit cell and store in
320
        attribute molecular_centres.
321
    show_warnings : bool, optional
322
        Controls whether warnings that arise during reading are printed.
323
324
    Yields
325
    ------
326
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
327
        Represents the crystal as a periodic set, consisting of a finite
328
        set of points (motif) and lattice (unit cell). Contains other
329
        useful data, e.g. the crystal's name and information about the
330
        asymmetric unit for calculation.
331
332
    Examples
333
    --------
334
335
        ::
336
337
            # Put these entries in a list
338
            refcodes = ['DEBXIT01', 'DEBXIT05', 'HXACAN01']
339
            structures = list(amd.CSDReader(refcodes))
340
341
            # Read refcode families (any whose refcode starts with strings in the list)
342
            refcode_families = ['ACSALA', 'HXACAN']
343
            structures = list(amd.CSDReader(refcode_families, families=True))
344
345
            # Get AMDs (k=100) for crystals in these families
346
            refcodes = ['ACSALA', 'HXACAN']
347
            amds = []
348
            for periodic_set in amd.CSDReader(refcodes, families=True):
349
                amds.append(amd.AMD(periodic_set, 100))
350
351
            # Giving the reader nothing reads from the whole CSD.
352
            for periodic_set in amd.CSDReader():
353
                ...
354
    """
355
356
    def __init__(
357
            self,
358
            refcodes=None,
359
            families=False,
360
            remove_hydrogens=False,
361
            disorder='skip',
362
            heaviest_component=False,
363
            molecular_centres=False,
364
            show_warnings=True,
365
    ):
366
367
        try:
368
            import ccdc.io
369
        except (ImportError, RuntimeError) as _:
370
            raise ImportError(_Reader._CCDC_IMPORT_ERR_MSG)
371
372
        if disorder not in _Reader._DISORDER_OPTIONS:
373
            msg = 'disorder parameter must be one of ' \
374
                  f'{_Reader._DISORDER_OPTIONS} (passed {disorder})'
375
            raise ValueError(msg)
376
377
        if isinstance(refcodes, str) and refcodes.lower() == 'csd':
378
            refcodes = None
379
380
        if refcodes is None:
381
            families = False
382
        else:
383
            if isinstance(refcodes, str):
384
                refcodes = [refcodes]
385
            else:
386
                refcodes = list(refcodes)
387
388
        if families:
389
            refcodes = self._refcodes_from_families_ccdc(refcodes)
390
391
        entry_reader = ccdc.io.EntryReader('CSD')
392
        converter = functools.partial(
393
            periodicset_from_ccdc_entry,
394
            remove_hydrogens=remove_hydrogens,
395
            disorder=disorder,
396
            molecular_centres=molecular_centres,
397
            heaviest_component=heaviest_component
398
        )
399
        iterable = self._ccdc_generator(refcodes, entry_reader)
400
        super().__init__(iterable, converter, show_warnings=show_warnings)
401
402
    @staticmethod
403
    def _ccdc_generator(refcodes, entry_reader):
404
        """Generates ccdc Entries from CSD refcodes.
405
        """
406
407
        if refcodes is None:
408
            for entry in entry_reader:
409
                yield entry
410
        else:
411
            for refcode in refcodes:
412
                try:
413
                    entry = entry_reader.entry(refcode)
414
                    yield entry
415
                except RuntimeError:
416
                    warnings.warn(f'{refcode} not found in database')
417
418
    @staticmethod
419
    def _refcodes_from_families_ccdc(refcode_families):
420
        """List of strings --> all CSD refcodes starting with any of the
421
        strings. Intended to be passed a list of families and return all
422
        refcodes in them.
423
        """
424
425
        try:
426
            import ccdc.search
427
        except (ImportError, RuntimeError) as _:
428
            raise ImportError(_Reader._CCDC_IMPORT_ERR_MSG)
429
430
        all_refcodes = []
431
        for refcode in refcode_families:
432
            query = ccdc.search.TextNumericSearch()
433
            query.add_identifier(refcode)
434
            hits = [hit.identifier for hit in query.search()]
435
            all_refcodes.extend(hits)
436
437
        # filter to unique refcodes
438
        seen = set()
439
        seen_add = seen.add
440
        refcodes = [
441
            refcode for refcode in all_refcodes
442
            if not (refcode in seen or seen_add(refcode))]
443
444
        return refcodes
445
446
447
class ParseError(ValueError):
448
    """Raised when an item cannot be parsed into a periodic set.
449
    """
450
    pass
451
452
453
def periodicset_from_ase_cifblock(
454
        block,
455
        remove_hydrogens=False,
456
        disorder='skip'
457
) -> PeriodicSet:
458
    """:class:`ase.io.cif.CIFBlock` --> 
459
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
460
    :class:`ase.io.cif.CIFBlock` is the type returned by
461
    :class:`ase.io.cif.parse_cif`.
462
463
    Parameters
464
    ----------
465
    block : :class:`ase.io.cif.CIFBlock`
466
        An ase :class:`ase.io.cif.CIFBlock` object representing a
467
        crystal.
468
    remove_hydrogens : bool, optional
469
        Remove Hydrogens from the crystal.
470
    disorder : str, optional
471
        Controls how disordered structures are handled. Default is
472
        ``skip`` which skips any crystal with disorder, since disorder
473
        conflicts with the periodic set model. To read disordered
474
        structures anyway, choose either :code:`ordered_sites` to remove
475
        atoms with disorder or :code:`all_sites` include all atoms
476
        regardless of disorder.
477
478
    Returns
479
    -------
480
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
481
        Represents the crystal as a periodic set, consisting of a finite
482
        set of points (motif) and lattice (unit cell). Contains other
483
        useful data, e.g. the crystal's name and information about the
484
        asymmetric unit for calculation.
485
486
    Raises
487
    ------
488
    ParseError
489
        Raised if the structure fails to be parsed for any of the
490
        following: 1. Required data is missing (e.g. cell parameters),
491
        2. The motif is empty after removing H or disordered sites,
492
        3. :code:``disorder == 'skip'`` and disorder is found on any
493
        atom.
494
    """
495
496
    import ase
497
    import ase.spacegroup
498
499
    # Unit cell
500
    cellpar = [block.get(tag) for tag in CIF_TAGS['cellpar']]
501
    if None in cellpar:
502
        raise ParseError(f'{block.name} has missing cell data')
503
    cell = cellpar_to_cell(*cellpar)
504
505
    # Asymmetric unit coordinates. ase removes uncertainty brackets
506
    cartesian = False # flag needed for later
507
    asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_fract']]
508
    if None in asym_unit: # missing scaled coords, try Cartesian
509
        asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_cartn']]
510
        if None in asym_unit:
511
            raise ParseError(f'{block.name} has missing coordinates')
512
        cartesian = True
513
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
514
515
    # Atomic types
516
    asym_symbols = block._get_any(CIF_TAGS['atom_symbol'])
517
    if asym_symbols is None:
518
        warnings.warn('missing atomic types will be labelled 0')
519
        asym_types = [0] * len(asym_unit)
520
    else:
521
        asym_types = []
522
        for label in asym_symbols:
523
            if label in ('.', '?'):
524
                warnings.warn('missing atomic types will be labelled 0')
525
                num = 0
526
            else:
527
                sym = re.search(r'([A-Z][a-z]?)', label).group(0)
528
                if sym == 'D':
529
                    sym = 'H'
530
                num = ase.data.atomic_numbers[sym]
531
            asym_types.append(num)
532
533
    # Find where sites have disorder if necassary
534
    has_disorder = []
535
    if disorder != 'all_sites':
536
        occupancies = block.get('_atom_site_occupancy')
537
        if occupancies is None:
538
            occupancies = np.ones((len(asym_unit), ))
539
        labels = block.get('_atom_site_label')
540
        if labels is None:
541
            labels = [''] * len(asym_unit)
542
        for lab, occ in zip(labels, occupancies):
543
            has_disorder.append(_has_disorder(lab, occ))
544
545
    # Remove sites with ?, . or other invalid string for coordinates
546
    invalid = []
547
    for i, xyz in enumerate(asym_unit):
548
        if not all(isinstance(coord, (int, float)) for coord in xyz):
549
            invalid.append(i)
550
    if invalid:
551
        warnings.warn('atoms without sites or missing data will be removed')
552
        asym_unit = [c for i, c in enumerate(asym_unit) if i not in invalid]
553
        asym_types = [t for i, t in enumerate(asym_types) if i not in invalid]
554
        if disorder != 'all_sites':
555
            has_disorder = [d for i, d in enumerate(has_disorder)
556
                            if i not in invalid]
557
558
    remove_sites = []
559
560
    if remove_hydrogens:
561
        remove_sites.extend(i for i, num in enumerate(asym_types) if num == 1)
562
563
    # Remove atoms with fractional occupancy or raise ParseError
564 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
565
        for i, dis in enumerate(has_disorder):
566
            if i in remove_sites:
567
                continue
568
            if dis:
569
                if disorder == 'skip':
570
                    msg = f"{block.name} has disorder, pass " \
571
                           "disorder='ordered_sites'or 'all_sites' to " \
572
                           "remove/ignore disorder"
573
                    raise ParseError(msg)
574
                elif disorder == 'ordered_sites':
575
                    remove_sites.append(i)
576
577
    # Asymmetric unit
578
    asym_unit = [c for i, c in enumerate(asym_unit) if i not in remove_sites]
579
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
580
    if len(asym_unit) == 0:
581
        raise ParseError(f'{block.name} has no valid sites')
582
    asym_unit = np.array(asym_unit)
583
584
    # If Cartesian coords were given, convert to scaled
585
    if cartesian:
586
        asym_unit = asym_unit @ np.linalg.inv(cell)
587
    asym_unit = np.mod(asym_unit, 1)
588
    # asym_unit = _snap_small_prec_coords(asym_unit, 1e-4) # recommended by pymatgen
589
590
    # Remove overlapping sites unless disorder == 'all_sites'
591
    if disorder != 'all_sites':
592
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
593
        if not np.all(keep_sites):
594
            msg = 'may have overlapping sites, duplicates will be removed'
595
            warnings.warn(msg)
596
            asym_unit = asym_unit[keep_sites]
597
            asym_types = [t for t, keep in zip(asym_types, keep_sites) if keep]
598
599
    # Get symmetry operations
600
    sitesym = block._get_any(CIF_TAGS['symop'])
601
    if sitesym is None:
602
        label_or_num = block._get_any(CIF_TAGS['spacegroup_name'])
603
        if label_or_num is None:
604
            label_or_num = block._get_any(CIF_TAGS['spacegroup_number'])
605
        if label_or_num is None:
606
            warnings.warn('no symmetry data found, defaulting to P1')
607
            label_or_num = 1
608
        spg = ase.spacegroup.Spacegroup(label_or_num)
609
        rot, trans = spg.get_op()
610
    else:
611
        if isinstance(sitesym, str):
612
            sitesym = [sitesym]
613
        rot, trans =  _parse_sitesyms(sitesym)
614
615
    fr_motif, asym_inds, muls, invs = _expand_asym_unit(asym_unit, rot, trans)
616
    types = np.array([asym_types[i] for i in invs])
617
    motif = fr_motif @ cell
618
619
    return PeriodicSet(
620
        motif=motif,
621
        cell=cell,
622
        name=block.name,
623
        asymmetric_unit=asym_inds,
624
        wyckoff_multiplicities=muls,
625
        types=types
626
    )
627
628
629
def periodicset_from_ase_atoms(
630
        atoms,
631
        remove_hydrogens=False
632
) -> PeriodicSet:
633
    """:class:`ase.atoms.Atoms` -->
634
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`. Does not have
635
    the option to remove disorder.
636
637
    Parameters
638
    ----------
639
    atoms : :class:`ase.atoms.Atoms`
640
        An ase :class:`ase.atoms.Atoms` object representing a crystal.
641
    remove_hydrogens : bool, optional
642
        Remove Hydrogens from the crystal.
643
644
    Returns
645
    -------
646
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
647
        Represents the crystal as a periodic set, consisting of a finite
648
        set of points (motif) and lattice (unit cell). Contains other
649
        useful data, e.g. the crystal's name and information about the
650
        asymmetric unit for calculation.
651
652
    Raises
653
    ------
654
    ParseError
655
        Raised if there are no valid sites in atoms.
656
    """
657
658
    from ase.spacegroup import get_basis
659
660
    cell = atoms.get_cell().array
661
662
    remove_inds = []
663
    if remove_hydrogens:
664
        for i in np.where(atoms.get_atomic_numbers() == 1)[0]:
665
            remove_inds.append(i)
666
    for i in sorted(remove_inds, reverse=True):
667
        atoms.pop(i)
668
669
    if len(atoms) == 0:
670
        raise ParseError(f'ase Atoms object has no valid sites')
671
672
    # Symmetry operations from spacegroup 
673
    spg = None
674
    if 'spacegroup' in atoms.info:
675
        spg = atoms.info['spacegroup']
676
        rot, trans = spg.rotations, spg.translations
677
    else:
678
        warnings.warn('no symmetry data found, defaulting to P1')
679
        rot = np.identity(3)[None, :]
680
        trans = np.zeros((1, 3))
681
682
    # Asymmetric unit. ase default tol is 1e-5
683
    # do differently! get_basis determines a reduced asym unit from the atoms;
684
    # surely this is not needed!
685
    asym_unit = get_basis(atoms, spacegroup=spg, tol=_EQUIV_SITE_TOL)
686
    fr_motif, asym_inds, muls, _ = _expand_asym_unit(asym_unit, rot, trans)
687
    motif = fr_motif @ cell
688
689
    return PeriodicSet(
690
        motif=motif,
691
        cell=cell,
692
        asymmetric_unit=asym_inds,
693
        wyckoff_multiplicities=muls,
694
        types=atoms.get_atomic_numbers()
695
    )
696
697
698
def periodicset_from_pymatgen_cifblock(
699
        block,
700
        remove_hydrogens=False,
701
        disorder='skip'
702
) -> PeriodicSet:
703
    """:class:`pymatgen.io.cif.CifBlock` -->
704
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
705
706
    Parameters
707
    ----------
708
    block : :class:`pymatgen.io.cif.CifBlock`
709
        A pymatgen CifBlock object representing a crystal.
710
    remove_hydrogens : bool, optional
711
        Remove Hydrogens from the crystal.
712
    disorder : str, optional
713
        Controls how disordered structures are handled. Default is
714
        ``skip`` which skips any crystal with disorder, since disorder
715
        conflicts with the periodic set model. To read disordered
716
        structures anyway, choose either :code:`ordered_sites` to remove
717
        atoms with disorder or :code:`all_sites` include all atoms
718
        regardless of disorder.
719
720
    Returns
721
    -------
722
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
723
        Represents the crystal as a periodic set, consisting of a finite
724
        set of points (motif) and lattice (unit cell). Contains other
725
        useful data, e.g. the crystal's name and information about the
726
        asymmetric unit for calculation.
727
728
    Raises
729
    ------
730
    ParseError
731
        Raised if the structure can/should not be parsed for the
732
        following reasons: 1. No sites found or motif is empty after
733
        removing Hydrogens & disorder, 2. A site has missing
734
        coordinates, 3. :code:``disorder == 'skip'`` and disorder is
735
        found on any atom.
736
    """
737
738
    from pymatgen.io.cif import str2float
739
    import pymatgen.core.periodic_table as periodic_table
740
741
    odict = block.data
742
743
    # Unit cell
744
    cellpar = [odict.get(tag) for tag in CIF_TAGS['cellpar']]
745
    if None in cellpar:
746
        raise ParseError(f'{block.header} has missing cell data')
747
    cellpar = [str2float(v) for v in cellpar]
748
    cell = cellpar_to_cell(*cellpar)
749
750
    # Asymmetric unit coordinates
751
    cartesian = False
752
    asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_fract']]
753
754
    if None in asym_unit: # missing scaled coordinates, try Cartesian
755
        asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_cartn']]
756
        if None in asym_unit:
757
            raise ParseError(f'{block.header} has no coordinates')
758
        cartesian = True
759
760
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
761
    asym_unit = [[str2float(coord) for coord in xyz] for xyz in asym_unit]
762
763
    # Atomic types
764
    for tag in CIF_TAGS['atom_symbol']:
765
        asym_symbols = odict.get(tag)
766
        if asym_symbols is not None:
767
            asym_types = []
768
            for label in asym_symbols:
769
                if label in ('.', '?'):
770
                    warnings.warn('missing atomic types will be labelled 0')
771
                    num = 0
772
                else:
773
                    sym = re.search(r'([A-Z][a-z]?)', label).group(0)
774
                    if sym == 'D':
775
                        sym = 'H'
776
                    num = periodic_table.Element[sym].number
777
                asym_types.append(num)
778
            break
779
    else:
780
        warnings.warn('missing atomic types will be labelled 0')
781
        asym_types = [0] * len(asym_unit)
782
783
    # Find where sites have disorder if necassary
784
    has_disorder = []
785
    if disorder != 'all_sites':
786
        occupancies = odict.get('_atom_site_occupancy')
787
        if occupancies is None:
788
            occupancies = np.ones((len(asym_unit), ))
789
        labels = odict.get('_atom_site_label')
790
        if labels is None:
791
            labels = [''] * len(asym_unit)
792
        for lab, occ in zip(labels, occupancies):
793
            has_disorder.append(_has_disorder(lab, occ))
794
795
    # Remove sites with ?, . or other invalid string for coordinates
796
    invalid = []
797
    for i, xyz in enumerate(asym_unit):
798
        if not all(isinstance(coord, (int, float)) for coord in xyz):
799
            invalid.append(i)
800
801
    if invalid:
802
        warnings.warn('atoms without sites or missing data will be removed')
803
        asym_unit = [c for i, c in enumerate(asym_unit) if i not in invalid]
804
        asym_types = [c for i, c in enumerate(asym_types) if i not in invalid]
805
        if disorder != 'all_sites':
806
            has_disorder = [d for i, d in enumerate(has_disorder) 
807
                            if i not in invalid]
808
809
    remove_sites = []
810
811
    if remove_hydrogens:
812
        remove_sites.extend((i for i, n in enumerate(asym_types) if n == 1))
0 ignored issues
show
introduced by
The variable i does not seem to be defined in case the for loop on line 797 is not entered. Are you sure this can never be the case?
Loading history...
813
814
    # Remove atoms with fractional occupancy or raise ParseError
815 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
816
        for i, dis in enumerate(has_disorder):
817
            if i in remove_sites:
818
                continue
819
            if dis:
820
                if disorder == 'skip':
821
                    msg = f"{block.header} has disorder, pass " \
822
                            "disorder='ordered_sites' or 'all_sites' to " \
823
                            "remove/ignore disorder"
824
                    raise ParseError(msg)
825
                elif disorder == 'ordered_sites':
826
                    remove_sites.append(i)
827
828
    # Asymmetric unit
829
    asym_unit = [c for i, c in enumerate(asym_unit) if i not in remove_sites]
830
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
831
    if len(asym_unit) == 0:
832
        raise ParseError(f'{block.header} has no valid sites')
833
    asym_unit = np.array(asym_unit)
834
835
    # If Cartesian coords were given, convert to scaled
836
    if cartesian:
837
        asym_unit = asym_unit @ np.linalg.inv(cell)
838
    asym_unit = np.mod(asym_unit, 1)
839
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
840
841
    # Remove overlapping sites unless disorder == 'all_sites'
842
    if disorder != 'all_sites':      
843
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
844
        if not np.all(keep_sites):
845
            msg = 'may have overlapping sites; duplicates will be removed'
846
            warnings.warn(msg)
847
        asym_unit = asym_unit[keep_sites]
848
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
849
850
    # Apply symmetries to asymmetric unit
851
    rot, trans = _get_syms_pymatgen(odict)
852
    fr_motif, asym_inds, muls, invs = _expand_asym_unit(asym_unit, rot, trans)
853
    types = np.array([asym_types[i] for i in invs])
854
    motif = fr_motif @ cell
855
856
    return PeriodicSet(
857
        motif=motif,
858
        cell=cell,
859
        name=block.header,
860
        asymmetric_unit=asym_inds,
861
        wyckoff_multiplicities=muls,
862
        types=types
863
    )
864
865
866
def periodicset_from_pymatgen_structure(
867
        structure,
868
        remove_hydrogens=False,
869
        disorder='skip'
870
) -> PeriodicSet:
871
    """:class:`pymatgen.core.structure.Structure` -->
872
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
873
    Does not set the name of the periodic set, as there seems to be no
874
    name attribute in the pymatgen Structure object.
875
876
    Parameters
877
    ----------
878
    structure : :class:`pymatgen.core.structure.Structure`
879
        A pymatgen Structure object representing a crystal.
880
    remove_hydrogens : bool, optional
881
        Remove Hydrogens from the crystal.
882
    disorder : str, optional
883
        Controls how disordered structures are handled. Default is
884
        ``skip`` which skips any crystal with disorder, since disorder
885
        conflicts with the periodic set model. To read disordered
886
        structures anyway, choose either :code:`ordered_sites` to remove
887
        atoms with disorder or :code:`all_sites` include all atoms
888
        regardless of disorder.
889
890
    Returns
891
    -------
892
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
893
        Represents the crystal as a periodic set, consisting of a finite
894
        set of points (motif) and lattice (unit cell). Contains other
895
        useful data, e.g. the crystal's name and information about the
896
        asymmetric unit for calculation.
897
    
898
    Raises
899
    ------
900
    ParseError
901
        Raised if the :code:`disorder == 'skip'` and 
902
        :code:`not structure.is_ordered`
903
    """
904
905
    from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
906
907
    if remove_hydrogens:
908
        structure.remove_species(['H', 'D'])
909
910
    # Disorder
911
    if disorder == 'skip':
912
        if not structure.is_ordered:
913
            msg = f"pymatgen Structure has disorder, pass " \
914
                   "disorder='ordered_sites' or 'all_sites' to " \
915
                   "remove/ignore disorder"
916
            raise ParseError(msg)
917
    elif disorder == 'ordered_sites':
918
        remove_inds = []
919
        for i, comp in enumerate(structure.species_and_occu):
920
            if comp.num_atoms < 1:
921
                remove_inds.append(i)
922
        structure.remove_sites(remove_inds)
923
924
    motif = structure.cart_coords
925
    cell = structure.lattice.matrix
926
    sym_structure = SpacegroupAnalyzer(structure).get_symmetrized_structure()
927
    asym_unit = np.array([l[0] for l in sym_structure.equivalent_indices])
928
    muls = np.array([len(l) for l in sym_structure.equivalent_indices])
929
    types = np.array(sym_structure.atomic_numbers)
930
931
    return PeriodicSet(
932
        motif=motif,
933
        cell=cell,
934
        asymmetric_unit=asym_unit,
935
        wyckoff_multiplicities=muls,
936
        types=types
937
    )
938
939
940
def periodicset_from_ccdc_entry(
941
        entry,
942
        remove_hydrogens=False,
943
        disorder='skip',
944
        heaviest_component=False,
945
        molecular_centres=False
946
) -> PeriodicSet:
947
    """:class:`ccdc.entry.Entry` -->
948
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
949
    Entry is the type returned by :class:`ccdc.io.EntryReader`.
950
951
    Parameters
952
    ----------
953
    entry : :class:`ccdc.entry.Entry`
954
        A ccdc Entry object representing a database entry.
955
    remove_hydrogens : bool, optional
956
        Remove Hydrogens from the crystal.
957
    disorder : str, optional
958
        Controls how disordered structures are handled. Default is
959
        ``skip`` which skips any crystal with disorder, since disorder
960
        conflicts with the periodic set model. To read disordered
961
        structures anyway, choose either :code:`ordered_sites` to remove
962
        atoms with disorder or :code:`all_sites` include all atoms
963
        regardless of disorder.
964
    heaviest_component : bool, optional
965
        Removes all but the heaviest molecule in the asymmeric unit,
966
        intended for removing solvents.
967
    molecular_centres : bool, default False
968
        Extract the centres of molecules in the unit cell and store in
969
        the attribute molecular_centres of the returned PeriodicSet.
970
971
    Returns
972
    -------
973
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
974
        Represents the crystal as a periodic set, consisting of a finite
975
        set of points (motif) and lattice (unit cell). Contains other
976
        useful data, e.g. the crystal's name and information about the
977
        asymmetric unit for calculation.
978
979
    Raises
980
    ------
981
    ParseError
982
        Raised if the structure fails parsing for any of the following:
983
        1. entry.has_3d_structure is False, 2.
984
        :code:``disorder == 'skip'`` and disorder is found on any atom,
985
        3. entry.crystal.molecule.all_atoms_have_sites is False,
986
        4. a.fractional_coordinates is None for any a in
987
        entry.crystal.disordered_molecule, 5. The motif is empty after
988
        removing Hydrogens and disordered sites.
989
    """
990
991
    # Entry specific flag
992
    if not entry.has_3d_structure:
993
        raise ParseError(f'{entry.identifier} has no 3D structure')
994
995
    # Disorder
996
    if disorder == 'skip' and entry.has_disorder:
997
        msg = f"{entry.identifier} has disorder, pass " \
998
                "disorder='ordered_sites' or 'all_sites' to remove/ignore" \
999
                "disorder"
1000
        raise ParseError(msg)
1001
1002
    return periodicset_from_ccdc_crystal(
1003
        entry.crystal,
1004
        remove_hydrogens=remove_hydrogens,
1005
        disorder=disorder,
1006
        heaviest_component=heaviest_component,
1007
        molecular_centres=molecular_centres
1008
    )
1009
1010
1011
def periodicset_from_ccdc_crystal(
1012
        crystal,
1013
        remove_hydrogens=False,
1014
        disorder='skip',
1015
        heaviest_component=False,
1016
        molecular_centres=False
1017
) -> PeriodicSet:
1018
    """:class:`ccdc.crystal.Crystal` -->
1019
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
1020
    Crystal is the type returned by :class:`ccdc.io.CrystalReader`.
1021
1022
    Parameters
1023
    ----------
1024
    crystal : :class:`ccdc.crystal.Crystal`
1025
        A ccdc Crystal object representing a crystal structure.
1026
    remove_hydrogens : bool, optional
1027
        Remove Hydrogens from the crystal.
1028
    disorder : str, optional
1029
        Controls how disordered structures are handled. Default is
1030
        ``skip`` which skips any crystal with disorder, since disorder
1031
        conflicts with the periodic set model. To read disordered
1032
        structures anyway, choose either :code:`ordered_sites` to remove
1033
        atoms with disorder or :code:`all_sites` include all atoms
1034
        regardless of disorder.
1035
    heaviest_component : bool, optional
1036
        Removes all but the heaviest molecule in the asymmeric unit,
1037
        intended for removing solvents.
1038
    molecular_centres : bool, default False
1039
        Extract the centres of molecules in the unit cell and store in
1040
        the attribute molecular_centres of the returned PeriodicSet.
1041
1042
    Returns
1043
    -------
1044
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1045
        Represents the crystal as a periodic set, consisting of a finite
1046
        set of points (motif) and lattice (unit cell). Contains other
1047
        useful data, e.g. the crystal's name and information about the
1048
        asymmetric unit for calculation.
1049
1050
    Raises
1051
    ------
1052
    ParseError
1053
        Raised if the structure fails parsing for any of the following:
1054
        1. :code:``disorder == 'skip'`` and disorder is found on any
1055
        atom, 2. crystal.molecule.all_atoms_have_sites is False,
1056
        3. a.fractional_coordinates is None for any a in
1057
        crystal.disordered_molecule, 4. The motif is empty after
1058
        removing H, disordered sites or solvents.
1059
    """
1060
1061
    molecule = crystal.disordered_molecule
1062
1063
    # Disorder
1064
    if disorder == 'skip':
1065
        if crystal.has_disorder or \
1066
         any(_has_disorder(a.label, a.occupancy) for a in molecule.atoms):
1067
            msg = f"{crystal.identifier} has disorder, pass " \
1068
                   "disorder='ordered_sites' or 'all_sites' to remove/ignore" \
1069
                   "disorder"
1070
            raise ParseError(msg)
1071
1072
    elif disorder == 'ordered_sites':
1073
        molecule.remove_atoms(
1074
            a for a in molecule.atoms if _has_disorder(a.label, a.occupancy)
1075
        )
1076
1077
    if remove_hydrogens:
1078
        molecule.remove_atoms(
1079
            a for a in molecule.atoms if a.atomic_symbol in 'HD'
1080
        )
1081
1082
    if heaviest_component and len(molecule.components) > 1:
1083
        molecule = _heaviest_component_ccdc(molecule)
1084
1085
    # Remove atoms with missing coordinates and warn
1086
    if any(a.fractional_coordinates is None for a in molecule.atoms):
1087
        warnings.warn('atoms without sites or missing data will be removed')
1088
        molecule.remove_atoms(
1089
            a for a in molecule.atoms if a.fractional_coordinates is None
1090
        )
1091
1092
    crystal.molecule = molecule
1093
    cell = cellpar_to_cell(*crystal.cell_lengths, *crystal.cell_angles)
1094
1095
    if molecular_centres:
1096
        frac_centres = _frac_molecular_centres_ccdc(crystal)
1097
        mol_centres = frac_centres @ cell
1098
        return PeriodicSet(mol_centres, cell, name=crystal.identifier)
1099
1100
    asym_atoms = crystal.asymmetric_unit_molecule.atoms
1101
    # check for None?
1102
    asym_unit = np.array([tuple(a.fractional_coordinates) for a in asym_atoms])
1103
1104
    if asym_unit.shape[0] == 0:
1105
        raise ParseError(f'{crystal.identifier} has no valid sites')
1106
1107
    asym_unit = np.mod(asym_unit, 1)
1108
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
1109
    asym_types = [a.atomic_number for a in asym_atoms]
1110
1111
    # Disorder
1112
    if disorder != 'all_sites':
1113
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
1114
        if not np.all(keep_sites):
1115
            msg = 'may have overlapping sites; duplicates will be removed'
1116
            warnings.warn(msg)
1117
        asym_unit = asym_unit[keep_sites]
1118
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1119
1120
    # Symmetry operations
1121
    sitesym = crystal.symmetry_operators
1122
    # try spacegroup numbers?
1123
    if not sitesym:
1124
        warnings.warn('no symmetry data found, defaulting to P1')
1125
        sitesym = ['x,y,z']
1126
1127
    # Apply symmetries to asymmetric unit
1128
    rot, trans = _parse_sitesyms(sitesym)
1129
    fr_motif, asym_inds, muls, invs = _expand_asym_unit(asym_unit, rot, trans)
1130
    motif = fr_motif @ cell
1131
    types = np.array([asym_types[i] for i in invs])
1132
1133
    return PeriodicSet(
1134
        motif=motif,
1135
        cell=cell,
1136
        name=crystal.identifier,
1137
        asymmetric_unit=asym_inds,
1138
        wyckoff_multiplicities=muls,
1139
        types=types
1140
    )
1141
1142
1143
def periodicset_from_gemmi_block(
1144
        block,
1145
        remove_hydrogens=False,
1146
        disorder='skip'
1147
) -> PeriodicSet:
1148
    """:class:`gemmi.cif.Block` -->
1149
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
1150
    Block is the type returned by :class:`gemmi.cif.read_file`.
1151
1152
    Parameters
1153
    ----------
1154
    block : :class:`ase.io.cif.CIFBlock`
1155
        An ase CIFBlock object representing a crystal.
1156
    remove_hydrogens : bool, optional
1157
        Remove Hydrogens from the crystal.
1158
    disorder : str, optional
1159
        Controls how disordered structures are handled. Default is
1160
        ``skip`` which skips any crystal with disorder, since disorder
1161
        conflicts with the periodic set model. To read disordered
1162
        structures anyway, choose either :code:`ordered_sites` to remove
1163
        atoms with disorder or :code:`all_sites` include all atoms
1164
        regardless of disorder.
1165
1166
    Returns
1167
    -------
1168
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1169
        Represents the crystal as a periodic set, consisting of a finite
1170
        set of points (motif) and lattice (unit cell). Contains other
1171
        useful data, e.g. the crystal's name and information about the
1172
        asymmetric unit for calculation.
1173
1174
    Raises
1175
    ------
1176
    ParseError
1177
        Raised if the structure fails to be parsed for any of the
1178
        following: 1. Required data is missing (e.g. cell parameters),
1179
        2. :code:``disorder == 'skip'`` and disorder is found on any
1180
        atom, 3. The motif is empty after removing H or disordered
1181
        sites.
1182
    """
1183
1184
    import gemmi
1185
    from gemmi.cif import as_number, as_string, as_int
1186
1187
    # Unit cell
1188
    cellpar = [as_number(block.find_value(tag))
1189
               for tag in CIF_TAGS['cellpar']]
1190
    if None in cellpar:
1191
        raise ParseError(f'{block.name} has missing cell data')
1192
    cell = cellpar_to_cell(*cellpar)
1193
1194
    xyz_loop = block.find(CIF_TAGS['atom_site_fract']).loop
1195
    if xyz_loop is None:
1196
        # check for Cartesian coordinates
1197
        raise ParseError(f'{block.name} has missing coordinate data')
1198
1199
    # Asymmetric unit coordinates
1200
    loop_dict = _loop_to_dict_gemmi(xyz_loop)
1201
    xyz_str = [loop_dict[t] for t in CIF_TAGS['atom_site_fract']]
1202
    asym_unit = [[as_number(c) for c in coords] for coords in xyz_str]
1203
    asym_unit = np.mod(np.array(asym_unit).T, 1)
1204
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
1205
1206
    # Asymmetric unit types
1207
    if '_atom_site_type_symbol' in loop_dict:
1208
        asym_syms = loop_dict['_atom_site_type_symbol']
1209
        asym_types = [gemmi.Element(s).atomic_number for s in asym_syms]
1210
    else:
1211
        warnings.warn('missing atomic types will be labelled 0')
1212
        asym_types = [0 for _ in range(len(asym_unit))]
1213
1214
    remove_sites = []
1215
1216
    # Disorder
1217
    if '_atom_site_label' in loop_dict:
1218
        labels = loop_dict['_atom_site_label']
1219
    else:
1220
        labels = [''] * xyz_loop.length()
1221
1222
    if '_atom_site_occupancy' in loop_dict:
1223
        occupancies = [as_number(occ)
1224
                       for occ in loop_dict['_atom_site_occupancy']]
1225
    else:
1226
        occupancies = [None for _ in range(xyz_loop.length())]
1227
1228
    if disorder == 'skip':
1229
        if any(_has_disorder(l, o) for l, o in zip(labels, occupancies)):
1230
            msg = f"{block.name} has disorder, pass " \
1231
                   "disorder='ordered_sites' or 'all_sites' to " \
1232
                   "remove/ignore disorder"
1233
            raise ParseError(msg)
1234
    elif disorder == 'ordered_sites':
1235
        for i, (lab, occ) in enumerate(zip(labels, occupancies)):
1236
            if _has_disorder(lab, occ):
1237
                remove_sites.append(i)
1238
1239
    if remove_hydrogens:
1240
        remove_sites.extend(
1241
            i for i, num in enumerate(asym_types) if num == 1
1242
        )
1243
1244
    # Asymmetric unit
1245
    asym_unit = np.delete(asym_unit, remove_sites, axis=0)
1246
    asym_types = [s for i, s in enumerate(asym_types) if i not in remove_sites]
1247
    if asym_unit.shape[0] == 0:
1248
        raise ParseError(f'{block.name} has no valid sites')
1249
1250
    # Remove overlapping sites unless disorder == 'all_sites'
1251
    if disorder != 'all_sites':
1252
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
1253
        if not np.all(keep_sites):
1254
            msg = 'may have overlapping sites; duplicates will be removed'
1255
            warnings.warn(msg)
1256
        asym_unit = asym_unit[keep_sites]
1257
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1258
1259
    # Get symmetry operations
1260
    sitesym = []
1261
    for tag in CIF_TAGS['symop']:
1262
        symop_loop = block.find([tag]).loop
1263
        if symop_loop is not None:
1264
            sitesym = _loop_to_dict_gemmi(symop_loop)[tag]
1265
            break
1266
1267
    if not sitesym:
1268
        for tag in CIF_TAGS['spacegroup_name']:
1269
            label_or_num = block.find_value(tag)
1270
            if label_or_num is not None:
1271
                label_or_num = as_string(label_or_num)
1272
                break
1273
        if label_or_num is None:
0 ignored issues
show
introduced by
The variable label_or_num does not seem to be defined in case the for loop on line 1268 is not entered. Are you sure this can never be the case?
Loading history...
1274
            for tag in CIF_TAGS['spacegroup_number']:
1275
                label_or_num = block.find_value(tag)
1276
                if label_or_num is not None:
1277
                    label_or_num = as_int(label_or_num)
1278
                    break
1279
        if label_or_num is None:
1280
            warnings.warn('no symmetry data found, defaulting to P1')
1281
            label_or_num = 1
1282
        ops = list(gemmi.SpaceGroup(label_or_num).operations())
1283
        rot = np.array([np.array(o.rot) / o.DEN for o in ops])
1284
        trans = np.array([np.array(o.tran) / o.DEN for o in ops])
1285
    else:
1286
        rot, trans =  _parse_sitesyms(sitesym)
1287
1288
    fr_motif, asym_inds, muls, invs = _expand_asym_unit(asym_unit, rot, trans)
1289
    types = np.array([asym_types[i] for i in invs])
1290
    motif = fr_motif @ cell
1291
1292
    return PeriodicSet(
1293
        motif=motif,
1294
        cell=cell,
1295
        name=block.name,
1296
        asymmetric_unit=asym_inds,
1297
        wyckoff_multiplicities=muls,
1298
        types=types
1299
    )
1300
1301
1302
def _parse_sitesyms(symmetries):
1303
    """Parses a sequence of symmetries in xyz form and returns rotation
1304
    and translation arrays. Similar to function found in ase.spacegroup.
1305
    """
1306
1307
    nsyms = len(symmetries)
1308
    rotations = np.zeros((nsyms, 3, 3), dtype=np.float64)
1309
    translations = np.zeros((nsyms, 3), dtype=np.float64)
1310
1311
    for i, sym in enumerate(symmetries):
1312
        for ind, element in enumerate(sym.split(',')):
1313
1314
            is_positive = True
1315
            is_fraction = False
1316
            sng_trans = None
1317
            fst_trans = []
1318
            snd_trans = []
1319
1320
            for char in element.lower():
1321
                if char == '+':
1322
                    is_positive = True
1323
                elif char == '-':
1324
                    is_positive = False
1325
                elif char == '/':
1326
                    is_fraction = True
1327
                elif char in 'xyz':
1328
                    rot_sgn = 1.0 if is_positive else -1.0
1329
                    rotations[i][ind][ord(char) - ord('x')] = rot_sgn
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable ord does not seem to be defined.
Loading history...
1330
                elif char.isdigit() or char == '.':
1331
                    if sng_trans is None:
1332
                        sng_trans = 1.0 if is_positive else -1.0
1333
                    if is_fraction:
1334
                        snd_trans.append(char)
1335
                    else:
1336
                        fst_trans.append(char)
1337
1338
            if not fst_trans:
1339
                e_trans = 0.0
1340
            else:
1341
                e_trans = sng_trans * float(''.join(fst_trans))
1342
1343
            if is_fraction:
1344
                e_trans /= float(''.join(snd_trans))
1345
1346
            translations[i][ind] = e_trans
1347
1348
    return rotations, translations
1349
1350
1351
def _expand_asym_unit(
1352
        asym_unit: np.ndarray,
1353
        rotations: np.ndarray,
1354
        translations: np.ndarray
1355
) -> Tuple[np.ndarray, ...]:
1356
    """Asymmetric unit frac coords, list of rotations & translations -->
1357
    full fractional motif, asymmetric unit indices, multiplicities and
1358
    inverses (which motif points come from where in the asym unit).
1359
    """
1360
1361
    frac_motif = []     # Full motif
1362
    asym_inds = [0]     # Indices of asymmetric unit 
1363
    multiplicities = [] # Wyckoff multiplicities
1364
    inverses = []       # Motif -> Asymmetric unit
1365
    m, dims = asym_unit.shape
1366
1367
    # Apply all symmetries first
1368
    expanded_sites = np.zeros((m, len(rotations), dims), dtype=np.float64)
1369
    for i in range(m):
1370
        expanded_sites[i] = np.dot(rotations, asym_unit[i]) + translations
1371
    expanded_sites = np.mod(expanded_sites, 1)
1372
1373
    for inv, sites in enumerate(expanded_sites):
1374
1375
        multiplicity = 0
1376
1377
        for site_ in sites:
1378
1379
            if not frac_motif:
1380
                frac_motif.append(site_)
1381
                inverses.append(inv)
1382
                multiplicity += 1
1383
                continue
1384
1385
            # check if site_ overlaps with existing sites
1386
            diffs1 = np.abs(site_ - frac_motif)
1387
            diffs2 = np.abs(diffs1 - 1)
1388
            mask = np.all((diffs1 <= _EQUIV_SITE_TOL) | 
1389
                          (diffs2 <= _EQUIV_SITE_TOL), axis=-1)
1390
1391
            if np.any(mask): # site is not new
1392
                where_equal = np.argwhere(mask).flatten()
1393
                for ind in where_equal:
1394
                    if inverses[ind] == inv: # site is invariant
1395
                        pass
1396
                    else: # equivalent to a different site
1397
                        msg = f'has equivalent sites at positions ' \
1398
                              f'{inverses[ind]}, {inv}'
1399
                        warnings.warn(msg)
1400
            else: # new site
1401
                frac_motif.append(site_)
1402
                inverses.append(inv)
1403
                multiplicity += 1
1404
1405
        if multiplicity > 0:
1406
            multiplicities.append(multiplicity)
1407
            asym_inds.append(len(frac_motif))
1408
1409
    frac_motif = np.array(frac_motif, dtype=np.float64)
1410
    asym_inds = np.array(asym_inds[:-1], dtype=np.int32)
1411
    multiplicities = np.array(multiplicities, dtype=np.int32)
1412
1413
    return frac_motif, asym_inds, multiplicities, inverses
1414
1415
1416
@numba.njit()
1417
def _unique_sites(asym_unit, tol):
1418
    """Uniquify (within tol) a list of fractional coordinates,
1419
    considering all points modulo 1. Returns an array of bools such that
1420
    asym_unit[_unique_sites(asym_unit, tol)] is the uniquified list.
1421
    """
1422
    
1423
    site_diffs1 = np.abs(np.expand_dims(asym_unit, 1) - asym_unit)
1424
    site_diffs2 = np.abs(site_diffs1 - 1)
1425
    sites_neq_mask = (site_diffs1 > tol) & (site_diffs2 > tol)
1426
    overlapping = np.triu(sites_neq_mask.sum(axis=-1) == 0, 1)
1427
    return overlapping.sum(axis=0) == 0
1428
1429
1430
def _has_disorder(label, occupancy):
1431
    """Return True if label ends with ? or occupancy is a number < 1.
1432
    """
1433
    return (np.isscalar(occupancy) and occupancy < 1) or label.endswith('?')
1434
1435
1436
def _get_syms_pymatgen(data):
1437
    """Parse symmetry operations given by data = block.data where block
1438
    is a pymatgen CifBlock object. If the symops are not present the
1439
    space group symbol/international number is parsed and symops are
1440
    generated.
1441
    """
1442
1443
    from pymatgen.symmetry.groups import SpaceGroup
1444
    from pymatgen.core.operations import SymmOp
1445
    import pymatgen.io.cif
1446
1447
    symops = []
1448
1449
    # Try xyz symmetry operations
1450
    for symmetry_label in CIF_TAGS['symop']:
1451
        xyz = data.get(symmetry_label)
1452
        if not xyz:
1453
            continue
1454
        if isinstance(xyz, str):
1455
            xyz = [xyz]
1456
        try:
1457
            symops = [SymmOp.from_xyz_string(s) for s in xyz]
1458
            break
1459
        except ValueError:
1460
            continue
1461
1462
    # Try spacegroup symbol
1463
    if not symops:
1464
        for symmetry_label in CIF_TAGS['spacegroup_name']:
1465
            sg = data.get(symmetry_label)
1466
            if not sg:
1467
                continue
1468
            sg = re.sub(r'[\s_]', '', sg)
1469
1470
            try:
1471
                spg = pymatgen.io.cif.space_groups.get(sg)
1472
                if not spg:
1473
                    continue
1474
                symops = SpaceGroup(spg).symmetry_ops
1475
                break
1476
            except ValueError:
1477
                pass
1478
1479
            try:
1480
                for d in pymatgen.io.cif._get_cod_data():
1481
                    if sg == re.sub(r'\s+', '', d['hermann_mauguin']):
1482
                        symops = [SymmOp.from_xyz_string(s)
1483
                                  for s in d['symops']]
1484
                        break
1485
            except Exception as e:
1486
                continue
1487
1488
            if symops:
1489
                break
1490
1491
    # Try international number
1492
    if not symops:
1493
        for symmetry_label in CIF_TAGS['spacegroup_number']:
1494
            num = data.get(symmetry_label)
1495
            if not num:
1496
                continue
1497
1498
            try:
1499
                i = int(pymatgen.io.cif.str2float(num))
1500
                symops = SpaceGroup.from_int_number(i).symmetry_ops
1501
                break
1502
            except ValueError:
1503
                continue
1504
1505
    if not symops:
1506
        warnings.warn('no symmetry data found, defaulting to P1')
1507
        symops = [SymmOp.from_xyz_string('x,y,z')]
1508
1509
    rotations = [op.rotation_matrix for op in symops]
1510
    translations = [op.translation_vector for op in symops]
1511
1512
    return rotations, translations
1513
1514
1515
def _frac_molecular_centres_ccdc(crystal):
1516
    """Returns the geometric centres of molecules in the unit cell.
1517
    Expects a ccdc Crystal object and returns fractional coordiantes.
1518
    """
1519
1520
    frac_centres = []
1521
    for comp in crystal.packing(inclusion='CentroidIncluded').components:
1522
        coords = [a.fractional_coordinates for a in comp.atoms]
1523
        frac_centres.append((sum(ax) / len(coords) for ax in zip(*coords)))
0 ignored issues
show
introduced by
The variable ax does not seem to be defined in case the for loop on line 1521 is not entered. Are you sure this can never be the case?
Loading history...
1524
    frac_centres = np.mod(np.array(frac_centres), 1)
1525
    return frac_centres[_unique_sites(frac_centres, _EQUIV_SITE_TOL)]
1526
1527
1528
def _heaviest_component_ccdc(molecule):
1529
    """Removes all but the heaviest component of the asymmetric unit.
1530
    Intended for removing solvents. Expects and returns a ccdc Molecule
1531
    object.
1532
    """
1533
1534
    component_weights = []
1535
    for component in molecule.components:
1536
        weight = 0
1537
        for a in component.atoms:
1538
            if isinstance(a.atomic_weight, (float, int)):
1539
                if isinstance(a.occupancy, (float, int)):
1540
                    weight += a.occupancy * a.atomic_weight
1541
                else:
1542
                    weight += a.atomic_weight
1543
        component_weights.append(weight)
1544
    largest_component_ind = np.argmax(np.array(component_weights))
1545
    molecule = molecule.components[largest_component_ind]
1546
    return molecule
1547
1548
1549
def _loop_to_dict_gemmi(gemmi_loop):
1550
    """gemmi Loop object --> dict, tags: values
1551
    """
1552
1553
    tablified_loop = [[] for _ in range(len(gemmi_loop.tags))]
1554
    n_cols = gemmi_loop.width()
1555
    for i, item in enumerate(gemmi_loop.values):
1556
        tablified_loop[i % n_cols].append(item)
1557
    return {tag: l for tag, l in zip(gemmi_loop.tags, tablified_loop)}
1558
1559
1560
def _snap_small_prec_coords(frac_coords, tol):
1561
    """Find where frac_coords is within 1e-4 of 1/3 or 2/3, change to
1562
    1/3 and 2/3. Recommended by pymatgen's CIF parser.
1563
    """
1564
    frac_coords[np.abs(1 - 3 * frac_coords) < tol] = 1 / 3.
1565
    frac_coords[np.abs(1 - 3 * frac_coords / 2) < tol] = 2 / 3.
1566
    return frac_coords
1567