Passed
Push — master ( e2877c...4c3efa )
by Daniel
04:03
created

amd.io.CSDReader._refcodes_from_families_ccdc()   A

Complexity

Conditions 3

Size

Total Lines 27
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 18
dl 0
loc 27
rs 9.5
c 0
b 0
f 0
cc 3
nop 1
1
"""Tools for reading crystals from files, or from the CSD with
2
``csd-python-api``. The readers return
3
:class:`amd.PeriodicSet <.periodicset.PeriodicSet>` objects representing
4
the crystal which can be passed to :func:`amd.AMD() <.calculate.AMD>`
5
and :func:`amd.PDD() <.calculate.PDD>` to get their invariants.
6
"""
7
8
import os
9
import re
10
import functools
11
import warnings
12
import errno
13
from typing import Tuple
14
15
import numpy as np
16
import numba
17
18
from .utils import cellpar_to_cell
19
from .periodicset import PeriodicSet
20
21
22
def _custom_warning(message, category, filename, lineno, *args, **kwargs):
23
    return f'{category.__name__}: {message}\n'
24
warnings.formatwarning = _custom_warning
25
26
_EQUIV_SITE_TOL = 1e-3
27
28
CIF_TAGS = {
29
    'cellpar': [
30
        '_cell_length_a',
31
        '_cell_length_b',
32
        '_cell_length_c',
33
        '_cell_angle_alpha',
34
        '_cell_angle_beta',
35
        '_cell_angle_gamma',],
36
37
    'atom_site_fract': [
38
        '_atom_site_fract_x',
39
        '_atom_site_fract_y',
40
        '_atom_site_fract_z',],
41
42
    'atom_site_cartn': [
43
        '_atom_site_cartn_x',
44
        '_atom_site_cartn_y',
45
        '_atom_site_cartn_z',],
46
47
    'atom_symbol': [
48
        '_atom_site_type_symbol',
49
        '_atom_site_label',],
50
51
    'symop': [
52
        '_symmetry_equiv_pos_as_xyz',
53
        '_space_group_symop_operation_xyz',
54
        '_space_group_symop.operation_xyz',
55
        '_symmetry_equiv_pos_as_xyz_',
56
        '_space_group_symop_operation_xyz_',],
57
58
    'spacegroup_name': [
59
        '_space_group_name_Hall',
60
        '_symmetry_space_group_name_hall',
61
        '_space_group_name_H-M_alt',
62
        '_symmetry_space_group_name_H-M',
63
        '_symmetry_space_group_name_H_M',
64
        '_symmetry_space_group_name_h-m',],
65
66
    'spacegroup_number': [
67
        '_space_group_IT_number',
68
        '_symmetry_Int_Tables_number',
69
        '_space_group_IT_number_',
70
        '_symmetry_Int_Tables_number_',],
71
}
72
73
74
class _Reader:
75
76
    _DISORDER_OPTIONS = {'skip', 'ordered_sites', 'all_sites'}
77
    _CCDC_IMPORT_ERR_MSG = 'Failed to import csd-python-api, please check ' \
78
                           'it is installed and licensed.'
79
80
    def __init__(self, iterable, converter, show_warnings=True):
81
        self._iterator = iter(iterable)
82
        self._converter = converter
83
        self.show_warnings = show_warnings
84
85
    def __iter__(self):
86
        if self._iterator is None or self._converter is None:
87
            raise RuntimeError(f'{self.__class__.__name__} not initialized.')
88
        return self
89
90
    def __next__(self):
91
        """Iterates over self._iterator, passing items through
92
        self._converter. Catches :class:`ParseError <.io.ParseError>`
93
        and warnings raised in self._converter, optionally printing
94
        them.
95
        """
96
97
        if not self.show_warnings:
98
            warnings.simplefilter('ignore')
99
100
        while True:
101
102
            item = next(self._iterator)
103
104
            with warnings.catch_warnings(record=True) as warning_msgs:
105
                msg = None
106
                try:
107
                    periodic_set = self._converter(item)
108
                except ParseError as err:
109
                    msg = str(err)
110
111
            if msg:
112
                warnings.warn(msg)
113
                continue
114
115
            for warning in warning_msgs:
116
                msg = f'{periodic_set.name} {warning.message}'
117
                warnings.warn(msg, category=warning.category)
118
119
            return periodic_set
120
121
    def read(self):
122
        """Reads the crystal(s), returns one
123
        :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` if there is
124
        only one, otherwise returns a list.
125
        """
126
127
        l = list(self)
128
        if len(l) == 1:
129
            return l[0]
130
        return l
131
132
133
class CifReader(_Reader):
134
    """Read all structures in a .cif file or all files in a folder
135
    with ase or csd-python-api (if installed), yielding
136
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
137
138
    Parameters
139
    ----------
140
    path : str
141
        Path to a .cif file or directory. (Other files are accepted when
142
        using ``reader='ccdc'``, if csd-python-api is installed.)
143
    reader : str, optional
144
        The backend package used for parsing. Default is :code:`ase`,
145
        to use csd-python-api change to :code:`ccdc`. The ccdc reader
146
        should be able to read any format accepted by
147
        :class:`ccdc.io.EntryReader`, though only cifs have been tested.
148
    remove_hydrogens : bool, optional
149
        Remove Hydrogens from the crystal.
150
    disorder : str, optional
151
        Controls how disordered structures are handled. Default is
152
        ``skip`` which skips any crystal with disorder, since disorder
153
        conflicts with the periodic set model. To read disordered
154
        structures anyway, choose either :code:`ordered_sites` to remove
155
        atoms with disorder or :code:`all_sites` include all atoms
156
        regardless of disorder.
157
    heaviest_component : bool, optional
158
        csd-python-api only. Removes all but the heaviest molecule in
159
        the asymmeric unit, intended for removing solvents.
160
    molecular_centres : bool, default False
161
        csd-python-api only. Extract the centres of molecules in the
162
        unit cell and store in the attribute molecular_centres.
163
    show_warnings : bool, optional
164
        Controls whether warnings that arise during reading are printed.
165
166
    Yields
167
    ------
168
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
169
        Represents the crystal as a periodic set, consisting of a finite
170
        set of points (motif) and lattice (unit cell). Contains other
171
        data, e.g. the crystal's name and information about the
172
        asymmetric unit.
173
174
    Examples
175
    --------
176
177
        ::
178
179
            # Put all crystals in a .CIF in a list
180
            structures = list(amd.CifReader('mycif.cif'))
181
182
            # Can also accept path to a directory, reading all files inside
183
            structures = list(amd.CifReader('path/to/folder'))
184
185
            # Reads just one if the .CIF has just one crystal
186
            periodic_set = amd.CifReader('mycif.cif').read()
187
188
            # List of AMDs (k=100) of crystals in a .CIF
189
            amds = [amd.AMD(periodic_set, 100) for periodic_set in amd.CifReader('mycif.cif')]
190
    """
191
192
    def __init__(
193
            self,
194
            path,
195
            reader='ase',
196
            remove_hydrogens=False,
197
            disorder='skip',
198
            heaviest_component=False,
199
            molecular_centres=False,
200
            show_warnings=True,
201
    ):
202
203
        if disorder not in CifReader._DISORDER_OPTIONS:
204
            msg = 'disorder parameter must be one of ' \
205
                  f'{CifReader._DISORDER_OPTIONS} (passed "{disorder}")'
206
            raise ValueError(msg)
207
208
        if reader != 'ccdc':
209
            if heaviest_component:
210
                msg = 'Parameter heaviest_component only implemented for ' \
211
                      'reader="ccdc".'
212
                raise NotImplementedError(msg)
213
214
            if molecular_centres:
215
                msg = 'Parameter molecular_centres only implemented for ' \
216
                      'reader="ccdc".'
217
                raise NotImplementedError(msg)
218
219
        if reader in ('ase', 'pycodcif'):
220
            from ase.io.cif import parse_cif
221
            extensions = {'cif'}
222
            file_parser = functools.partial(parse_cif, reader=reader)
223
            converter = functools.partial(
224
                periodicset_from_ase_cifblock,
225
                remove_hydrogens=remove_hydrogens,
226
                disorder=disorder
227
            )
228
229
        elif reader == 'pymatgen':
230
            extensions = {'cif'}
231
            file_parser = CifReader._pymatgen_cifblock_generator
232
            converter = functools.partial(
233
                periodicset_from_pymatgen_cifblock,
234
                remove_hydrogens=remove_hydrogens,
235
                disorder=disorder
236
            )
237
238
        elif reader == 'gemmi':
239
            import gemmi
240
            extensions = {'cif'}
241
            file_parser = gemmi.cif.read_file
242
            converter = functools.partial(
243
                periodicset_from_gemmi_block,
244
                remove_hydrogens=remove_hydrogens,
245
                disorder=disorder
246
            )
247
248
        elif reader == 'ccdc':
249
            try:
250
                import ccdc.io
251
            except (ImportError, RuntimeError) as e:
252
                raise ImportError(_Reader._CCDC_IMPORT_ERR_MSG) from e
253
254
            extensions = ccdc.io.EntryReader.known_suffixes
255
            file_parser = ccdc.io.EntryReader
256
            converter = functools.partial(
257
                periodicset_from_ccdc_entry,
258
                remove_hydrogens=remove_hydrogens,
259
                disorder=disorder,
260
                molecular_centres=molecular_centres,
261
                heaviest_component=heaviest_component
262
            )
263
264
        else:
265
            raise ValueError(f'Unknown reader "{reader}".')
266
267
        if os.path.isfile(path):
268
            iterable = file_parser(path)
269
        elif os.path.isdir(path):
270
            iterable = CifReader._dir_generator(path, file_parser, extensions)
271
        else:
272
            raise FileNotFoundError(
273
                errno.ENOENT, os.strerror(errno.ENOENT), path)
274
275
        super().__init__(iterable, converter, show_warnings=show_warnings)
276
277
    @staticmethod
278
    def _dir_generator(path, callable, extensions):
279
        for file in os.listdir(path):
280
            suff = os.path.splitext(file)[1][1:]
281
            if suff.lower() in extensions:
282
                yield from callable(os.path.join(path, file))
283
284
    @staticmethod
285
    def _pymatgen_cifblock_generator(path):
286
        """Path to .cif --> generator of pymatgen CifBlock objects."""
287
        from pymatgen.io.cif import CifFile
288
        yield from CifFile.from_file(path).data.values()
289
290
291
class CSDReader(_Reader):
292
    """Read structures from the CSD with csd-python-api, yielding
293
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
294
295
    Parameters
296
    ----------
297
    refcodes : str or List[str], optional
298
        Single or list of CSD refcodes to read. If None or 'CSD',
299
        iterates over the whole CSD.
300
    families : bool, optional
301
        Read all entries whose refcode starts with the given strings, or
302
        'families' (e.g. giving 'DEBXIT' reads all entries starting with
303
        DEBXIT).
304
    remove_hydrogens : bool, optional
305
        Remove hydrogens from the crystal.
306
    disorder : str, optional
307
        Controls how disordered structures are handled. Default is
308
        ``skip`` which skips any crystal with disorder, since disorder
309
        conflicts with the periodic set model. To read disordered
310
        structures anyway, choose either :code:`ordered_sites` to remove
311
        atoms with disorder or :code:`all_sites` include all atoms
312
        regardless of disorder.
313
    heaviest_component : bool, optional
314
        Removes all but the heaviest molecule in the asymmeric unit,
315
        intended for removing solvents.
316
    molecular_centres : bool, default False
317
        Extract the centres of molecules in the unit cell and store in
318
        attribute molecular_centres.
319
    show_warnings : bool, optional
320
        Controls whether warnings that arise during reading are printed.
321
322
    Yields
323
    ------
324
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
325
        Represents the crystal as a periodic set, consisting of a finite
326
        set of points (motif) and lattice (unit cell). Contains other
327
        useful data, e.g. the crystal's name and information about the
328
        asymmetric unit for calculation.
329
330
    Examples
331
    --------
332
333
        ::
334
335
            # Put these entries in a list
336
            refcodes = ['DEBXIT01', 'DEBXIT05', 'HXACAN01']
337
            structures = list(amd.CSDReader(refcodes))
338
339
            # Read refcode families (any whose refcode starts with strings in the list)
340
            refcode_families = ['ACSALA', 'HXACAN']
341
            structures = list(amd.CSDReader(refcode_families, families=True))
342
343
            # Get AMDs (k=100) for crystals in these families
344
            refcodes = ['ACSALA', 'HXACAN']
345
            amds = []
346
            for periodic_set in amd.CSDReader(refcodes, families=True):
347
                amds.append(amd.AMD(periodic_set, 100))
348
349
            # Giving the reader nothing reads from the whole CSD.
350
            for periodic_set in amd.CSDReader():
351
                ...
352
    """
353
354
    def __init__(
355
            self,
356
            refcodes=None,
357
            families=False,
358
            remove_hydrogens=False,
359
            disorder='skip',
360
            heaviest_component=False,
361
            molecular_centres=False,
362
            show_warnings=True,
363
    ):
364
365
        try:
366
            import ccdc.io
367
        except (ImportError, RuntimeError) as _:
368
            raise ImportError(_Reader._CCDC_IMPORT_ERR_MSG)
369
370
        if disorder not in _Reader._DISORDER_OPTIONS:
371
            msg = 'disorder parameter must be one of ' \
372
                  f'{_Reader._DISORDER_OPTIONS} (passed {disorder})'
373
            raise ValueError(msg)
374
375
        if isinstance(refcodes, str) and refcodes.lower() == 'csd':
376
            refcodes = None
377
378
        if refcodes is None:
379
            families = False
380
        else:
381
            if isinstance(refcodes, str):
382
                refcodes = [refcodes]
383
            else:
384
                refcodes = list(refcodes)
385
386
        if families:
387
            refcodes = self._refcodes_from_families_ccdc(refcodes)
388
389
        entry_reader = ccdc.io.EntryReader('CSD')
390
        converter = functools.partial(
391
            periodicset_from_ccdc_entry,
392
            remove_hydrogens=remove_hydrogens,
393
            disorder=disorder,
394
            molecular_centres=molecular_centres,
395
            heaviest_component=heaviest_component
396
        )
397
        iterable = self._ccdc_generator(refcodes, entry_reader)
398
        super().__init__(iterable, converter, show_warnings=show_warnings)
399
400
    @staticmethod
401
    def _ccdc_generator(refcodes, entry_reader):
402
        """Generates ccdc Entries from CSD refcodes.
403
        """
404
405
        if refcodes is None:
406
            for entry in entry_reader:
407
                yield entry
408
        else:
409
            for refcode in refcodes:
410
                try:
411
                    entry = entry_reader.entry(refcode)
412
                    yield entry
413
                except RuntimeError:
414
                    warnings.warn(f'{refcode} not found in database')
415
416
    @staticmethod
417
    def _refcodes_from_families_ccdc(refcode_families):
418
        """List of strings --> all CSD refcodes starting with any of the
419
        strings. Intended to be passed a list of families and return all
420
        refcodes in them.
421
        """
422
423
        try:
424
            import ccdc.search
425
        except (ImportError, RuntimeError) as _:
426
            raise ImportError(_Reader._CCDC_IMPORT_ERR_MSG)
427
428
        all_refcodes = []
429
        for refcode in refcode_families:
430
            query = ccdc.search.TextNumericSearch()
431
            query.add_identifier(refcode)
432
            hits = [hit.identifier for hit in query.search()]
433
            all_refcodes.extend(hits)
434
435
        # filter to unique refcodes
436
        seen = set()
437
        seen_add = seen.add
438
        refcodes = [
439
            refcode for refcode in all_refcodes
440
            if not (refcode in seen or seen_add(refcode))]
441
442
        return refcodes
443
444
445
class ParseError(ValueError):
446
    """Raised when an item cannot be parsed into a periodic set.
447
    """
448
    pass
449
450
451
def periodicset_from_ase_cifblock(
452
        block,
453
        remove_hydrogens=False,
454
        disorder='skip'
455
) -> PeriodicSet:
456
    """:class:`ase.io.cif.CIFBlock` --> 
457
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
458
    :class:`ase.io.cif.CIFBlock` is the type returned by
459
    :class:`ase.io.cif.parse_cif`.
460
461
    Parameters
462
    ----------
463
    block : :class:`ase.io.cif.CIFBlock`
464
        An ase :class:`ase.io.cif.CIFBlock` object representing a
465
        crystal.
466
    remove_hydrogens : bool, optional
467
        Remove Hydrogens from the crystal.
468
    disorder : str, optional
469
        Controls how disordered structures are handled. Default is
470
        ``skip`` which skips any crystal with disorder, since disorder
471
        conflicts with the periodic set model. To read disordered
472
        structures anyway, choose either :code:`ordered_sites` to remove
473
        atoms with disorder or :code:`all_sites` include all atoms
474
        regardless of disorder.
475
476
    Returns
477
    -------
478
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
479
        Represents the crystal as a periodic set, consisting of a finite
480
        set of points (motif) and lattice (unit cell). Contains other
481
        useful data, e.g. the crystal's name and information about the
482
        asymmetric unit for calculation.
483
484
    Raises
485
    ------
486
    ParseError
487
        Raised if the structure fails to be parsed for any of the
488
        following: 1. Required data is missing (e.g. cell parameters),
489
        2. The motif is empty after removing H or disordered sites,
490
        3. :code:``disorder == 'skip'`` and disorder is found on any
491
        atom.
492
    """
493
494
    import ase
495
496
    # Unit cell
497
    cellpar = [block.get(tag) for tag in CIF_TAGS['cellpar']]
498
    if None in cellpar:
499
        raise ParseError(f'{block.name} has missing cell data')
500
    cell = cellpar_to_cell(*cellpar)
501
502
    # Asymmetric unit coordinates. ase removes uncertainty brackets
503
    cartesian = False # flag needed for later
504
    asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_fract']]
505
    if None in asym_unit: # missing scaled coords, try Cartesian
506
        asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_cartn']]
507
        if None in asym_unit:
508
            raise ParseError(f'{block.name} has missing coordinates')
509
        cartesian = True
510
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
511
512
    # Atomic types
513
    asym_symbols = block._get_any(CIF_TAGS['atom_symbol'])
514
    if asym_symbols is None:
515
        warnings.warn('missing atomic types will be labelled 0')
516
        asym_types = [0] * len(asym_unit)
517
    else:
518
        asym_types = []
519
        for label in asym_symbols:
520
            if label in ('.', '?'):
521
                warnings.warn('missing atomic types will be labelled 0')
522
                num = 0
523
            else:
524
                sym = re.search(r'([A-Z][a-z]?)', label).group(0)
525
                if sym == 'D':
526
                    sym = 'H'
527
                num = ase.data.atomic_numbers[sym]
528
            asym_types.append(num)
529
530
    # Find where sites have disorder if necassary
531
    has_disorder = []
532
    if disorder != 'all_sites':
533
        occupancies = block.get('_atom_site_occupancy')
534
        if occupancies is None:
535
            occupancies = np.ones((len(asym_unit), ))
536
        labels = block.get('_atom_site_label')
537
        if labels is None:
538
            labels = [''] * len(asym_unit)
539
        for lab, occ in zip(labels, occupancies):
540
            has_disorder.append(_has_disorder(lab, occ))
541
542
    # Remove sites with ?, . or other invalid string for coordinates
543
    invalid = []
544
    for i, xyz in enumerate(asym_unit):
545
        if not all(isinstance(coord, (int, float)) for coord in xyz):
546
            invalid.append(i)
547
    if invalid:
548
        warnings.warn('atoms without sites or missing data will be removed')
549
        asym_unit = [c for i, c in enumerate(asym_unit) if i not in invalid]
550
        asym_types = [t for i, t in enumerate(asym_types) if i not in invalid]
551
        if disorder != 'all_sites':
552
            has_disorder = [d for i, d in enumerate(has_disorder)
553
                            if i not in invalid]
554
555
    remove_sites = []
556
557
    if remove_hydrogens:
558
        remove_sites.extend(i for i, num in enumerate(asym_types) if num == 1)
559
560
    # Remove atoms with fractional occupancy or raise ParseError
561 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
562
        for i, dis in enumerate(has_disorder):
563
            if i in remove_sites:
564
                continue
565
            if dis:
566
                if disorder == 'skip':
567
                    msg = f"{block.name} has disorder, pass " \
568
                           "disorder='ordered_sites'or 'all_sites' to " \
569
                           "remove/ignore disorder"
570
                    raise ParseError(msg)
571
                elif disorder == 'ordered_sites':
572
                    remove_sites.append(i)
573
574
    # Asymmetric unit
575
    asym_unit = [c for i, c in enumerate(asym_unit) if i not in remove_sites]
576
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
577
    if len(asym_unit) == 0:
578
        raise ParseError(f'{block.name} has no valid sites')
579
    asym_unit = np.array(asym_unit)
580
581
    # If Cartesian coords were given, convert to scaled
582
    if cartesian:
583
        asym_unit = asym_unit @ np.linalg.inv(cell)
584
    asym_unit = np.mod(asym_unit, 1)
585
    # asym_unit = _snap_small_prec_coords(asym_unit, 1e-4) # recommended by pymatgen
586
587
    # Remove overlapping sites unless disorder == 'all_sites'
588
    if disorder != 'all_sites':
589
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
590
        if not np.all(keep_sites):
591
            msg = 'may have overlapping sites, duplicates will be removed'
592
            warnings.warn(msg)
593
            asym_unit = asym_unit[keep_sites]
594
            asym_types = [t for t, keep in zip(asym_types, keep_sites) if keep]
595
596
    # Symmetry operations
597
    sitesym = block._get_any(CIF_TAGS['symop'])
598
    if sitesym is None: # no symops, use spacegroup
599
        try:
600
            spg = block.get_spacegroup(True)
601
            rot, trans = spg.rotations, spg.translations
602
        except: # no spacegroup, assume P1
603
            rot, trans = _parse_sitesym(['x,y,z'])
604
    else:
605
        if isinstance(sitesym, str):
606
            sitesym = [sitesym]
607
        rot, trans = _parse_sitesym(sitesym)
608
609
    # Apply symmetries to asymmetric unit
610
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
611
    types = np.array([asym_types[i] for i in inverses])
612
    motif = frac_motif @ cell
613
614
    return PeriodicSet(
615
        motif=motif,
616
        cell=cell,
617
        name=block.name,
618
        asymmetric_unit=asym_inds,
619
        wyckoff_multiplicities=wyc_muls,
620
        types=types
621
    )
622
623
624
def periodicset_from_ase_atoms(
625
        atoms,
626
        remove_hydrogens=False
627
) -> PeriodicSet:
628
    """:class:`ase.atoms.Atoms` -->
629
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`. Does not have
630
    the option to remove disorder.
631
632
    Parameters
633
    ----------
634
    atoms : :class:`ase.atoms.Atoms`
635
        An ase :class:`ase.atoms.Atoms` object representing a crystal.
636
    remove_hydrogens : bool, optional
637
        Remove Hydrogens from the crystal.
638
639
    Returns
640
    -------
641
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
642
        Represents the crystal as a periodic set, consisting of a finite
643
        set of points (motif) and lattice (unit cell). Contains other
644
        useful data, e.g. the crystal's name and information about the
645
        asymmetric unit for calculation.
646
647
    Raises
648
    ------
649
    ParseError
650
        Raised if there are no valid sites in atoms.
651
    """
652
653
    from ase.spacegroup import get_basis
654
655
    cell = atoms.get_cell().array
656
657
    remove_inds = []
658
    if remove_hydrogens:
659
        for i in np.where(atoms.get_atomic_numbers() == 1)[0]:
660
            remove_inds.append(i)
661
    for i in sorted(remove_inds, reverse=True):
662
        atoms.pop(i)
663
664
    if len(atoms) == 0:
665
        raise ParseError(f'ase Atoms object has no valid sites')
666
667
    # Symmetry operations from spacegroup 
668
    spg = None
669
    if 'spacegroup' in atoms.info:
670
        spg = atoms.info['spacegroup']
671
        rot, trans = spg.rotations, spg.translations
672
    # else assume no symmetries?
673
674
    # Asymmetric unit. ase default tol is 1e-5
675
    asym_unit = get_basis(atoms, spacegroup=spg, tol=_EQUIV_SITE_TOL)
676
    frac_motif, asym_inds, wyc_muls, _ = _expand_asym_unit(asym_unit, rot, trans)
0 ignored issues
show
introduced by
The variable trans does not seem to be defined in case 'spacegroup' in atoms.info on line 669 is False. Are you sure this can never be the case?
Loading history...
introduced by
The variable rot does not seem to be defined in case 'spacegroup' in atoms.info on line 669 is False. Are you sure this can never be the case?
Loading history...
677
    motif = frac_motif @ cell
678
679
    return PeriodicSet(
680
        motif=motif,
681
        cell=cell,
682
        asymmetric_unit=asym_inds,
683
        wyckoff_multiplicities=wyc_muls,
684
        types=atoms.get_atomic_numbers()
685
    )
686
687
688
def periodicset_from_pymatgen_cifblock(
689
        block,
690
        remove_hydrogens=False,
691
        disorder='skip'
692
) -> PeriodicSet:
693
    """:class:`pymatgen.io.cif.CifBlock` -->
694
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
695
696
    Parameters
697
    ----------
698
    block : :class:`pymatgen.io.cif.CifBlock`
699
        A pymatgen CifBlock object representing a crystal.
700
    remove_hydrogens : bool, optional
701
        Remove Hydrogens from the crystal.
702
    disorder : str, optional
703
        Controls how disordered structures are handled. Default is
704
        ``skip`` which skips any crystal with disorder, since disorder
705
        conflicts with the periodic set model. To read disordered
706
        structures anyway, choose either :code:`ordered_sites` to remove
707
        atoms with disorder or :code:`all_sites` include all atoms
708
        regardless of disorder.
709
710
    Returns
711
    -------
712
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
713
        Represents the crystal as a periodic set, consisting of a finite
714
        set of points (motif) and lattice (unit cell). Contains other
715
        useful data, e.g. the crystal's name and information about the
716
        asymmetric unit for calculation.
717
718
    Raises
719
    ------
720
    ParseError
721
        Raised if the structure can/should not be parsed for the
722
        following reasons: 1. No sites found or motif is empty after
723
        removing Hydrogens & disorder, 2. A site has missing
724
        coordinates, 3. :code:``disorder == 'skip'`` and disorder is
725
        found on any atom.
726
    """
727
728
    from pymatgen.io.cif import str2float
729
    import pymatgen.core.periodic_table as periodic_table
730
731
    odict = block.data
732
733
    # Unit cell
734
    cellpar = [odict.get(tag) for tag in CIF_TAGS['cellpar']]
735
    if None in cellpar:
736
        raise ParseError(f'{block.header} has missing cell data')
737
    cellpar = [str2float(v) for v in cellpar]
738
    cell = cellpar_to_cell(*cellpar)
739
740
    # Asymmetric unit coordinates
741
    cartesian = False
742
    asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_fract']]
743
744
    if None in asym_unit: # missing scaled coordinates, try Cartesian
745
        asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_cartn']]
746
        if None in asym_unit:
747
            raise ParseError(f'{block.header} has no coordinates')
748
        cartesian = True
749
750
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
751
    asym_unit = [[str2float(coord) for coord in xyz] 
752
                 for xyz in asym_unit]
753
754
    # Atomic types
755
    for tag in CIF_TAGS['atom_symbol']:
756
        asym_symbols = odict.get(tag)
757
        if asym_symbols is not None:
758
            asym_types = []
759
            for label in asym_symbols:
760
                if label in ('.', '?'):
761
                    warnings.warn('missing atomic types will be labelled 0')
762
                    num = 0
763
                else:
764
                    sym = re.search(r'([A-Z][a-z]?)', label).group(0)
765
                    if sym == 'D':
766
                        sym = 'H'
767
                    num = periodic_table.Element[sym].number
768
                asym_types.append(num)
769
            break
770
    else:
771
        warnings.warn('missing atomic types will be labelled 0')
772
        asym_types = [0] * len(asym_unit)
773
774
    # Find where sites have disorder if necassary
775
    has_disorder = []
776
    if disorder != 'all_sites':
777
        occupancies = odict.get('_atom_site_occupancy')
778
        if occupancies is None:
779
            occupancies = np.ones((len(asym_unit), ))
780
        labels = odict.get('_atom_site_label')
781
        if labels is None:
782
            labels = [''] * len(asym_unit)
783
        for lab, occ in zip(labels, occupancies):
784
            has_disorder.append(_has_disorder(lab, occ))
785
786
    # Remove sites with ?, . or other invalid string for coordinates
787
    invalid = []
788
    for i, xyz in enumerate(asym_unit):
789
        if not all(isinstance(coord, (int, float)) for coord in xyz):
790
            invalid.append(i)
791
792
    if invalid:
793
        warnings.warn('atoms without sites or missing data will be removed')
794
        asym_unit = [c for i, c in enumerate(asym_unit) if i not in invalid]
795
        asym_types = [c for i, c in enumerate(asym_types) if i not in invalid]
796
        if disorder != 'all_sites':
797
            has_disorder = [d for i, d in enumerate(has_disorder) if i not in invalid]
798
799
    remove_sites = []
800
801
    if remove_hydrogens:
802
        remove_sites.extend((i for i, num in enumerate(asym_types) if num == 1))
0 ignored issues
show
introduced by
The variable i does not seem to be defined in case the for loop on line 788 is not entered. Are you sure this can never be the case?
Loading history...
803
804
    # Remove atoms with fractional occupancy or raise ParseError
805 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
806
        for i, dis in enumerate(has_disorder):
807
            if i in remove_sites:
808
                continue
809
            if dis:
810
                if disorder == 'skip':
811
                    msg = f"{block.header} has disorder, pass " \
812
                            "disorder='ordered_sites' or 'all_sites' to " \
813
                            "remove/ignore disorder"
814
                    raise ParseError(msg)
815
                elif disorder == 'ordered_sites':
816
                    remove_sites.append(i)
817
818
    # Asymmetric unit
819
    asym_unit = [c for i, c in enumerate(asym_unit) if i not in remove_sites]
820
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
821
    if len(asym_unit) == 0:
822
        raise ParseError(f'{block.header} has no valid sites')
823
    asym_unit = np.array(asym_unit)
824
825
    # If Cartesian coords were given, convert to scaled
826
    if cartesian:
827
        asym_unit = asym_unit @ np.linalg.inv(cell)
828
    asym_unit = np.mod(asym_unit, 1)
829
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
830
831
    # Remove overlapping sites unless disorder == 'all_sites'
832
    if disorder != 'all_sites':      
833
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
834
        if not np.all(keep_sites):
835
            msg = 'may have overlapping sites; duplicates will be removed'
836
            warnings.warn(msg)
837
        asym_unit = asym_unit[keep_sites]
838
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
839
840
    # Apply symmetries to asymmetric unit
841
    rot, trans = _parse_sitesym_pymatgen(odict)
842
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
843
    types = np.array([asym_types[i] for i in inverses])
844
    motif = frac_motif @ cell
845
846
    return PeriodicSet(
847
        motif=motif,
848
        cell=cell,
849
        name=block.header,
850
        asymmetric_unit=asym_inds,
851
        wyckoff_multiplicities=wyc_muls,
852
        types=types
853
    )
854
855
856
def periodicset_from_pymatgen_structure(
857
        structure,
858
        remove_hydrogens=False,
859
        disorder='skip'
860
) -> PeriodicSet:
861
    """:class:`pymatgen.core.structure.Structure` -->
862
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
863
    Does not set the name of the periodic set, as there seems to be no
864
    name attribute in the pymatgen Structure object.
865
866
    Parameters
867
    ----------
868
    structure : :class:`pymatgen.core.structure.Structure`
869
        A pymatgen Structure object representing a crystal.
870
    remove_hydrogens : bool, optional
871
        Remove Hydrogens from the crystal.
872
    disorder : str, optional
873
        Controls how disordered structures are handled. Default is
874
        ``skip`` which skips any crystal with disorder, since disorder
875
        conflicts with the periodic set model. To read disordered
876
        structures anyway, choose either :code:`ordered_sites` to remove
877
        atoms with disorder or :code:`all_sites` include all atoms
878
        regardless of disorder.
879
880
    Returns
881
    -------
882
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
883
        Represents the crystal as a periodic set, consisting of a finite
884
        set of points (motif) and lattice (unit cell). Contains other
885
        useful data, e.g. the crystal's name and information about the
886
        asymmetric unit for calculation.
887
    
888
    Raises
889
    ------
890
    ParseError
891
        Raised if the :code:`disorder == 'skip'` and 
892
        :code:`not structure.is_ordered`
893
    """
894
895
    from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
896
897
    if remove_hydrogens:
898
        structure.remove_species(['H', 'D'])
899
900
    # Disorder
901
    if disorder == 'skip':
902
        if not structure.is_ordered:
903
            msg = f"pymatgen Structure has disorder, pass " \
904
                   "disorder='ordered_sites' or 'all_sites' to " \
905
                   "remove/ignore disorder"
906
            raise ParseError(msg)
907
    elif disorder == 'ordered_sites':
908
        remove_inds = []
909
        for i, comp in enumerate(structure.species_and_occu):
910
            if comp.num_atoms < 1:
911
                remove_inds.append(i)
912
        structure.remove_sites(remove_inds)
913
914
    motif = structure.cart_coords
915
    cell = structure.lattice.matrix
916
    sym_structure = SpacegroupAnalyzer(structure).get_symmetrized_structure()
917
    asym_unit = np.array([l[0] for l in sym_structure.equivalent_indices])
918
    wyc_muls = np.array([len(l) for l in sym_structure.equivalent_indices])
919
    types = np.array(sym_structure.atomic_numbers)
920
921
    return PeriodicSet(
922
        motif=motif,
923
        cell=cell,
924
        asymmetric_unit=asym_unit,
925
        wyckoff_multiplicities=wyc_muls,
926
        types=types
927
    )
928
929
930
def periodicset_from_ccdc_entry(
931
        entry,
932
        remove_hydrogens=False,
933
        disorder='skip',
934
        heaviest_component=False,
935
        molecular_centres=False
936
) -> PeriodicSet:
937
    """:class:`ccdc.entry.Entry` -->
938
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
939
    Entry is the type returned by :class:`ccdc.io.EntryReader`.
940
941
    Parameters
942
    ----------
943
    entry : :class:`ccdc.entry.Entry`
944
        A ccdc Entry object representing a database entry.
945
    remove_hydrogens : bool, optional
946
        Remove Hydrogens from the crystal.
947
    disorder : str, optional
948
        Controls how disordered structures are handled. Default is
949
        ``skip`` which skips any crystal with disorder, since disorder
950
        conflicts with the periodic set model. To read disordered
951
        structures anyway, choose either :code:`ordered_sites` to remove
952
        atoms with disorder or :code:`all_sites` include all atoms
953
        regardless of disorder.
954
    heaviest_component : bool, optional
955
        Removes all but the heaviest molecule in the asymmeric unit,
956
        intended for removing solvents.
957
    molecular_centres : bool, default False
958
        Extract the centres of molecules in the unit cell and store in
959
        the attribute molecular_centres of the returned PeriodicSet.
960
961
    Returns
962
    -------
963
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
964
        Represents the crystal as a periodic set, consisting of a finite
965
        set of points (motif) and lattice (unit cell). Contains other
966
        useful data, e.g. the crystal's name and information about the
967
        asymmetric unit for calculation.
968
969
    Raises
970
    ------
971
    ParseError
972
        Raised if the structure fails parsing for any of the following:
973
        1. entry.has_3d_structure is False, 2.
974
        :code:``disorder == 'skip'`` and disorder is found on any atom,
975
        3. entry.crystal.molecule.all_atoms_have_sites is False,
976
        4. a.fractional_coordinates is None for any a in
977
        entry.crystal.disordered_molecule, 5. The motif is empty after
978
        removing Hydrogens and disordered sites.
979
    """
980
981
    # Entry specific flags
982
    if not entry.has_3d_structure:
983
        raise ParseError(f'{entry.identifier} has no 3D structure')
984
985
    # Disorder
986
    if disorder == 'skip' and entry.has_disorder:
987
        msg = f"{entry.identifier} has disorder, pass " \
988
                "disorder='ordered_sites' or 'all_sites' to remove/ignore" \
989
                "disorder"
990
        raise ParseError(msg)
991
992
    return periodicset_from_ccdc_crystal(
993
        entry.crystal,
994
        remove_hydrogens=remove_hydrogens,
995
        disorder=disorder,
996
        heaviest_component=heaviest_component,
997
        molecular_centres=molecular_centres
998
    )
999
1000
1001
def periodicset_from_ccdc_crystal(
1002
        crystal,
1003
        remove_hydrogens=False,
1004
        disorder='skip',
1005
        heaviest_component=False,
1006
        molecular_centres=False
1007
) -> PeriodicSet:
1008
    """:class:`ccdc.crystal.Crystal` -->
1009
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
1010
    Crystal is the type returned by :class:`ccdc.io.CrystalReader`.
1011
1012
    Parameters
1013
    ----------
1014
    crystal : :class:`ccdc.crystal.Crystal`
1015
        A ccdc Crystal object representing a crystal structure.
1016
    remove_hydrogens : bool, optional
1017
        Remove Hydrogens from the crystal.
1018
    disorder : str, optional
1019
        Controls how disordered structures are handled. Default is
1020
        ``skip`` which skips any crystal with disorder, since disorder
1021
        conflicts with the periodic set model. To read disordered
1022
        structures anyway, choose either :code:`ordered_sites` to remove
1023
        atoms with disorder or :code:`all_sites` include all atoms
1024
        regardless of disorder.
1025
    heaviest_component : bool, optional
1026
        Removes all but the heaviest molecule in the asymmeric unit,
1027
        intended for removing solvents.
1028
    molecular_centres : bool, default False
1029
        Extract the centres of molecules in the unit cell and store in
1030
        the attribute molecular_centres of the returned PeriodicSet.
1031
1032
    Returns
1033
    -------
1034
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1035
        Represents the crystal as a periodic set, consisting of a finite
1036
        set of points (motif) and lattice (unit cell). Contains other
1037
        useful data, e.g. the crystal's name and information about the
1038
        asymmetric unit for calculation.
1039
1040
    Raises
1041
    ------
1042
    ParseError
1043
        Raised if the structure fails parsing for any of the following:
1044
        1. :code:``disorder == 'skip'`` and disorder is found on any
1045
        atom, 2. crystal.molecule.all_atoms_have_sites is False,
1046
        3. a.fractional_coordinates is None for any a in
1047
        crystal.disordered_molecule, 4. The motif is empty after
1048
        removing H, disordered sites or solvents.
1049
    """
1050
1051
    molecule = crystal.disordered_molecule
1052
1053
    # Disorder
1054
    if disorder == 'skip':
1055
        if crystal.has_disorder or \
1056
         any(_has_disorder(a.label, a.occupancy) for a in molecule.atoms):
1057
            msg = f"{crystal.identifier} has disorder, pass " \
1058
                   "disorder='ordered_sites' or 'all_sites' to remove/ignore" \
1059
                   "disorder"
1060
            raise ParseError(msg)
1061
1062
    elif disorder == 'ordered_sites':
1063
        molecule.remove_atoms(a for a in molecule.atoms
1064
                              if _has_disorder(a.label, a.occupancy))
1065
1066
    if remove_hydrogens:
1067
        molecule.remove_atoms(
1068
            a for a in molecule.atoms if a.atomic_symbol in 'HD'
1069
        )
1070
1071
    if heaviest_component and len(molecule.components) > 1:
1072
        molecule = _heaviest_component_ccdc(molecule)
1073
1074
    # Remove atoms with missing coordinates and warn
1075
    is_missing = (a.fractional_coordinates is None for a in molecule.atoms)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable a does not seem to be defined.
Loading history...
1076
    if any(is_missing):
1077
        warnings.warn('atoms without sites or missing data will be removed')
1078
        molecule.remove_atoms(
1079
            a for a, missing in zip(molecule.atoms, is_missing) if missing
1080
        )
1081
1082
    if not molecule.all_atoms_have_sites:
1083
        raise ParseError(f'{crystal.identifier} has atoms without sites')
1084
1085
    crystal.molecule = molecule
1086
    cell = cellpar_to_cell(*crystal.cell_lengths, *crystal.cell_angles)
1087
1088
    if molecular_centres:
1089
        frac_centres = _frac_molecular_centres_ccdc(crystal)
1090
        mol_centres = frac_centres @ cell
1091
        return PeriodicSet(mol_centres, cell, name=crystal.identifier)
1092
1093
    asym_atoms = crystal.asymmetric_unit_molecule.atoms
1094
    # check for None?
1095
    asym_unit = np.array([tuple(a.fractional_coordinates) for a in asym_atoms])
1096
1097
    if asym_unit.shape[0] == 0:
1098
        raise ParseError(f'{crystal.identifier} has no valid sites')
1099
1100
    asym_unit = np.mod(asym_unit, 1)
1101
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
1102
    asym_types = [a.atomic_number for a in asym_atoms]
1103
1104
    # Disorder
1105
    if disorder != 'all_sites':
1106
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
1107
        if not np.all(keep_sites):
1108
            msg = 'may have overlapping sites; duplicates will be removed'
1109
            warnings.warn(msg)
1110
        asym_unit = asym_unit[keep_sites]
1111
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1112
1113
    # Symmetry operations
1114
    sitesym = crystal.symmetry_operators
1115
    # try spacegroup numbers?
1116
    if not sitesym:
1117
        sitesym = ['x,y,z']
1118
    rot = np.array([np.array(crystal.symmetry_rotation(op)).reshape((3, 3)) 
1119
                    for op in sitesym])
1120
    trans = np.array([crystal.symmetry_translation(op) for op in sitesym])
1121
1122
    # Apply symmetries to asymmetric unit
1123
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1124
    motif = frac_motif @ cell
1125
    types = np.array([asym_types[i] for i in inverses])
1126
1127
    return PeriodicSet(
1128
        motif=motif,
1129
        cell=cell,
1130
        name=crystal.identifier,
1131
        asymmetric_unit=asym_inds,
1132
        wyckoff_multiplicities=wyc_muls,
1133
        types=types
1134
    )
1135
1136
1137
# Not quite finished
1138
def periodicset_from_gemmi_block(
1139
        block,
1140
        remove_hydrogens=False,
1141
        disorder='skip'
1142
) -> PeriodicSet:
1143
    """:class:`gemmi.cif.Block` -->
1144
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
1145
    Block is the type returned by :class:`gemmi.cif.read_file`.
1146
1147
    Parameters
1148
    ----------
1149
    block : :class:`ase.io.cif.CIFBlock`
1150
        An ase CIFBlock object representing a crystal.
1151
    remove_hydrogens : bool, optional
1152
        Remove Hydrogens from the crystal.
1153
    disorder : str, optional
1154
        Controls how disordered structures are handled. Default is
1155
        ``skip`` which skips any crystal with disorder, since disorder
1156
        conflicts with the periodic set model. To read disordered
1157
        structures anyway, choose either :code:`ordered_sites` to remove
1158
        atoms with disorder or :code:`all_sites` include all atoms
1159
        regardless of disorder.
1160
1161
    Returns
1162
    -------
1163
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1164
        Represents the crystal as a periodic set, consisting of a finite
1165
        set of points (motif) and lattice (unit cell). Contains other
1166
        useful data, e.g. the crystal's name and information about the
1167
        asymmetric unit for calculation.
1168
1169
    Raises
1170
    ------
1171
    ParseError
1172
        Raised if the structure fails to be parsed for any of the
1173
        following: 1. Required data is missing (e.g. cell parameters),
1174
        2. :code:``disorder == 'skip'`` and disorder is found on any
1175
        atom, 3. The motif is empty after removing H or disordered
1176
        sites.
1177
    """
1178
1179
    import gemmi
1180
1181
    # Unit cell
1182
    cellpar = [gemmi.cif.as_number(block.find_value(tag))
1183
               for tag in CIF_TAGS['cellpar']]
1184
    if None in cellpar:
1185
        raise ParseError(f'{block.name} has missing cell information')
1186
    cell = cellpar_to_cell(*cellpar)
1187
1188
    xyz_loop = block.find(CIF_TAGS['atom_site_fract']).loop
1189
    if xyz_loop is None:
1190
        # check for Cartesian coordinates
1191
        raise ParseError(f'{block.name} has missing coordinate data')
1192
1193
    # Asymmetric unit coordinates
1194
    loop_dict = _loop_to_dict_gemmi(xyz_loop)
1195
    xyz_str = [loop_dict[t] for t in CIF_TAGS['atom_site_fract']]
1196
    asym_unit = [[gemmi.cif.as_number(c) for c in coords] for coords in xyz_str]
1197
    asym_unit = np.mod(np.array(asym_unit).T, 1)
1198
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
1199
1200
    # Asymmetric unit types
1201
    if '_atom_site_type_symbol' in loop_dict:
1202
        asym_syms = loop_dict['_atom_site_type_symbol']
1203
        asym_types = [gemmi.Element(s).atomic_number for s in asym_syms]
1204
    else:
1205
        warnings.warn('missing atomic types will be labelled 0')
1206
        asym_types = [0 for _ in range(len(asym_unit))]
1207
1208
    remove_sites = []
1209
1210
    # Disorder
1211
    if '_atom_site_label' in loop_dict:
1212
        labels = loop_dict['_atom_site_label']
1213
    else:
1214
        labels = [''] * xyz_loop.length()
1215
1216
    if '_atom_site_occupancy' in loop_dict:
1217
        occupancies = [gemmi.cif.as_number(occ)
1218
                       for occ in loop_dict['_atom_site_occupancy']]
1219
    else:
1220
        occupancies = [None for _ in range(xyz_loop.length())]
1221
1222
    if disorder == 'skip':
1223
        if any(_has_disorder(l, o) for l, o in zip(labels, occupancies)):
1224
            msg = f"{block.name} has disorder, pass " \
1225
                   "disorder='ordered_sites' or 'all_sites' to " \
1226
                   "remove/ignore disorder"
1227
            raise ParseError(msg)
1228
    elif disorder == 'ordered_sites':
1229
        for i, (lab, occ) in enumerate(zip(labels, occupancies)):
1230
            if _has_disorder(lab, occ):
1231
                remove_sites.append(i)
1232
1233
    if remove_hydrogens:
1234
        remove_sites.extend(
1235
            i for i, num in enumerate(asym_types) if num == 1
1236
        )
1237
1238
    # Asymmetric unit
1239
    asym_unit = np.delete(asym_unit, remove_sites, axis=0)
1240
    asym_types = [s for i, s in enumerate(asym_types) if i not in remove_sites]
1241
    if asym_unit.shape[0] == 0:
1242
        raise ParseError(f'{block.name} has no valid sites')
1243
1244
    # Remove overlapping sites unless disorder == 'all_sites'
1245
    if disorder != 'all_sites':
1246
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
1247
        if not np.all(keep_sites):
1248
            msg = 'may have overlapping sites; duplicates will be removed'
1249
            warnings.warn(msg)
1250
        asym_unit = asym_unit[keep_sites]
1251
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1252
1253
    # Symmetry operations
1254
    sitesym = []
1255
    for tag in CIF_TAGS['symop']:
1256
        symop_loop = block.find([tag]).loop
1257
        if symop_loop is not None:
1258
            symop_loop_dict = _loop_to_dict_gemmi(symop_loop)
1259
            sitesym = symop_loop_dict[tag]
1260
            break
1261
    # Try spacegroup names/numbers?
1262
    if not sitesym:
1263
        sitesym = ['x,y,z',]
1264
1265
    # Apply symmetries to asymmetric unit
1266
    rot, trans = _parse_sitesym(sitesym)
1267
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1268
    types = np.array([asym_types[i] for i in inverses])
1269
    motif = frac_motif @ cell
1270
1271
    return PeriodicSet(
1272
        motif=motif,
1273
        cell=cell,
1274
        name=block.name,
1275
        asymmetric_unit=asym_inds,
1276
        wyckoff_multiplicities=wyc_muls,
1277
        types=types
1278
    )
1279
1280
1281
def _parse_sitesym(symmetries):
1282
    """Parses a sequence of symmetries in xyz form and returns rotation
1283
    and translation arrays. Similar to function found in
1284
    ase.spacegroup.spacegroup.
1285
    """
1286
1287
    nsyms = len(symmetries)
1288
    rotations = np.zeros((nsyms, 3, 3))
1289
    translations = np.zeros((nsyms, 3))
1290
1291
    for i, sym in enumerate(symmetries):
1292
        for ind, element in enumerate(sym.split(',')):
1293
1294
            is_positive = True
1295
            is_fraction = False
1296
            sng_trans = None
1297
            fst_trans = []
1298
            snd_trans = []
1299
1300
            for char in element.lower():
1301
                if char == '+':
1302
                    is_positive = True
1303
                elif char == '-':
1304
                    is_positive = False
1305
                elif char == '/':
1306
                    is_fraction = True
1307
                elif char in 'xyz':
1308
                    rot_sgn = 1 if is_positive else -1
1309
                    rotations[i][ind][ord(char) - ord('x')] = rot_sgn
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable ord does not seem to be defined.
Loading history...
1310
                elif char.isdigit() or char == '.':
1311
                    if sng_trans is None:
1312
                        sng_trans = 1.0 if is_positive else -1.0
1313
                    if is_fraction:
1314
                        snd_trans.append(char)
1315
                    else:
1316
                        fst_trans.append(char)
1317
1318
            if not fst_trans:
1319
                e_trans = 0.0
1320
            else:
1321
                e_trans = sng_trans * float(''.join(fst_trans))
1322
1323
            if is_fraction:
1324
                e_trans /= float(''.join(snd_trans))
1325
1326
            translations[i][ind] = e_trans
1327
1328
    return rotations, translations
1329
1330
1331
def _expand_asym_unit(
1332
        asym_unit: np.ndarray,
1333
        rotations: np.ndarray,
1334
        translations: np.ndarray
1335
) -> Tuple[np.ndarray, ...]:
1336
    """
1337
    Asymmetric unit frac coords, list of rotations & translations -->
1338
    full fractional motif, asymmetric unit indices, multiplicities and
1339
    inverses (which motif points come from where in the asym unit).
1340
    """
1341
1342
    frac_motif = []     # Full motif
1343
    asym_inds = [0]     # Indices of asymmetric unit 
1344
    multiplicities = [] # Wyckoff multiplicities
1345
    inverses = []       # Motif -> Asymmetric unit
1346
    m, dims = asym_unit.shape
1347
1348
    # Apply all symmetries first
1349
    expanded_sites = np.zeros((m, len(rotations), dims))
1350
    for i in range(m):
1351
        expanded_sites[i] = np.dot(rotations, asym_unit[i]) + translations
1352
    expanded_sites = np.mod(expanded_sites, 1)
1353
1354
    for inv, sites in enumerate(expanded_sites):
1355
1356
        multiplicity = 0
1357
1358
        for site_ in sites:
1359
1360
            if not frac_motif:
1361
                frac_motif.append(site_)
1362
                inverses.append(inv)
1363
                multiplicity += 1
1364
                continue
1365
1366
            # check if site_ overlaps with existing sites
1367
            diffs1 = np.abs(site_ - frac_motif)
1368
            diffs2 = np.abs(diffs1 - 1)
1369
            mask = np.all((diffs1 <= _EQUIV_SITE_TOL) | 
1370
                          (diffs2 <= _EQUIV_SITE_TOL), axis=-1)
1371
1372
            if np.any(mask): # site is not new
1373
                where_equal = np.argwhere(mask).flatten()
1374
                for ind in where_equal:
1375
                    if inverses[ind] == inv: # site is invariant
1376
                        pass
1377
                    else: # equivalent to a different site
1378
                        msg = f'has equivalent sites at positions' \
1379
                              f'{inverses[ind]}, {inv}'
1380
                        warnings.warn(msg)
1381
            else: # new site
1382
                frac_motif.append(site_)
1383
                inverses.append(inv)
1384
                multiplicity += 1
1385
1386
        if multiplicity > 0:
1387
            multiplicities.append(multiplicity)
1388
            asym_inds.append(len(frac_motif))
1389
1390
    frac_motif = np.array(frac_motif)
1391
    asym_inds = np.array(asym_inds[:-1])
1392
    multiplicities = np.array(multiplicities)
1393
1394
    return frac_motif, asym_inds, multiplicities, inverses
1395
1396
1397
@numba.njit()
1398
def _unique_sites(asym_unit, tol):
1399
    """Uniquify (within tol) a list of fractional coordinates,
1400
    considering all points modulo 1. Returns an array of bools such that
1401
    asym_unit[_unique_sites(asym_unit, tol)] is the uniquified list.
1402
    """
1403
    
1404
    site_diffs1 = np.abs(np.expand_dims(asym_unit, 1) - asym_unit)
1405
    site_diffs2 = np.abs(site_diffs1 - 1)
1406
    sites_neq_mask = (site_diffs1 > tol) & (site_diffs2 > tol)
1407
    overlapping = np.triu(sites_neq_mask.sum(axis=-1) == 0, 1)
1408
    return overlapping.sum(axis=0) == 0
1409
1410
1411
def _has_disorder(label, occupancy):
1412
    """Return True if label ends with ? or occupancy is a number < 1.
1413
    """
1414
    return (np.isscalar(occupancy) and occupancy < 1) or label.endswith('?')
1415
1416
1417
def _parse_sitesym_pymatgen(data):
1418
    """Parse symmetry operations given data = block.data where block is
1419
    a pymatgen CifBlock object. If the symops are not present the space
1420
    group symbol is parsed and symops are generated.
1421
    """
1422
1423
    from pymatgen.symmetry.groups import SpaceGroup
1424
    from pymatgen.core.operations import SymmOp
1425
    import pymatgen.io.cif
1426
1427
    symops = []
1428
1429
    # Try to parse xyz symmetry operations
1430
    for symmetry_label in CIF_TAGS['symop']:
1431
1432
        xyz = data.get(symmetry_label)
1433
        if not xyz:
1434
            continue
1435
        if isinstance(xyz, str):
1436
            xyz = [xyz]
1437
        try:
1438
            symops = [SymmOp.from_xyz_string(s) for s in xyz]
1439
            break
1440
        except ValueError:
1441
            continue
1442
1443
    # Spacegroup symbol
1444
    if not symops:
1445
1446
        for symmetry_label in CIF_TAGS['spacegroup_name']:
1447
1448
            sg = data.get(symmetry_label)
1449
            if not sg:
1450
                continue
1451
            sg = re.sub(r'[\s_]', '', sg)
1452
1453
            try:
1454
                spg = pymatgen.io.cif.space_groups.get(sg)
1455
                if not spg:
1456
                    continue
1457
                symops = SpaceGroup(spg).symmetry_ops
1458
                break
1459
            except ValueError:
1460
                pass
1461
1462
            try:
1463
                for d in pymatgen.io.cif._get_cod_data():
1464
                    if sg == re.sub(r'\s+', '', d['hermann_mauguin']):
1465
                        xyz = d['symops']
1466
                        symops = [SymmOp.from_xyz_string(s) for s in xyz]
1467
                        break
1468
            except Exception:
1469
                continue
1470
1471
            if symops:
1472
                break
1473
1474
    # International number
1475
    if not symops:
1476
        for symmetry_label in CIF_TAGS['spacegroup_number']:
1477
            num = data.get(symmetry_label)
1478
            if not num:
1479
                continue
1480
1481
            try:
1482
                i = int(pymatgen.io.cif.str2float(num))
1483
                symops = SpaceGroup.from_int_number(i).symmetry_ops
1484
                break
1485
            except ValueError:
1486
                continue
1487
1488
    if not symops:
1489
        symops = [SymmOp.from_xyz_string(s) for s in ['x', 'y', 'z']]
1490
1491
    rotations = [op.rotation_matrix for op in symops]
1492
    translations = [op.translation_vector for op in symops]
1493
1494
    return rotations, translations
1495
1496
1497
def _frac_molecular_centres_ccdc(crystal):
1498
    """Returns the geometric centres of molecules in the unit cell.
1499
    Expects a ccdc Crystal object and returns fractional coordiantes.
1500
    """
1501
1502
    frac_centres = []
1503
    for comp in crystal.packing(inclusion='CentroidIncluded').components:
1504
        coords = [a.fractional_coordinates for a in comp.atoms]
1505
        frac_centres.append((sum(ax) / len(coords) for ax in zip(*coords)))
0 ignored issues
show
introduced by
The variable ax does not seem to be defined in case the for loop on line 1503 is not entered. Are you sure this can never be the case?
Loading history...
1506
    frac_centres = np.mod(np.array(frac_centres), 1)
1507
    return frac_centres[_unique_sites(frac_centres, _EQUIV_SITE_TOL)]
1508
1509
1510
def _heaviest_component_ccdc(molecule):
1511
    """Removes all but the heaviest component of the asymmetric unit.
1512
    Intended for removing solvents. Expects and returns a ccdc Molecule
1513
    object.
1514
    """
1515
1516
    component_weights = []
1517
    for component in molecule.components:
1518
        weight = 0
1519
        for a in component.atoms:
1520
            if isinstance(a.atomic_weight, (float, int)):
1521
                if isinstance(a.occupancy, (float, int)):
1522
                    weight += a.occupancy * a.atomic_weight
1523
                else:
1524
                    weight += a.atomic_weight
1525
        component_weights.append(weight)
1526
    largest_component_ind = np.argmax(np.array(component_weights))
1527
    molecule = molecule.components[largest_component_ind]
1528
    return molecule
1529
1530
1531
def _loop_to_dict_gemmi(gemmi_loop):
1532
    """gemmi Loop object --> dict, tags: values
1533
    """
1534
1535
    tablified_loop = [[] for _ in range(len(gemmi_loop.tags))]
1536
    n_cols = gemmi_loop.width()
1537
    for i, item in enumerate(gemmi_loop.values):
1538
        tablified_loop[i % n_cols].append(item)
1539
    return {tag: l for tag, l in zip(gemmi_loop.tags, tablified_loop)}
1540
1541
1542
def _snap_small_prec_coords(frac_coords, tol):
1543
    """Find where frac_coords is within 1e-4 of 1/3 or 2/3, change to
1544
    1/3 and 2/3. Recommended by pymatgen's CIF parser.
1545
    """
1546
    frac_coords[np.abs(1 - 3 * frac_coords) < tol] = 1 / 3.
1547
    frac_coords[np.abs(1 - 3 * frac_coords / 2) < tol] = 2 / 3.
1548
    return frac_coords
1549