Passed
Push — master ( f4ca5c...e2877c )
by Daniel
07:55
created

amd.io._refcodes_from_families_ccdc()   A

Complexity

Conditions 3

Size

Total Lines 28
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 18
dl 0
loc 28
rs 9.5
c 0
b 0
f 0
cc 3
nop 1
1
"""Tools for reading crystals from files, or from the CSD with
2
``csd-python-api``. The readers return
3
:class:`amd.PeriodicSet <.periodicset.PeriodicSet>` objects representing
4
the crystal which can be passed to :func:`amd.AMD() <.calculate.AMD>`
5
and :func:`amd.PDD() <.calculate.PDD>` to get their invariants.
6
"""
7
8
import os
9
import re
10
import functools
11
import warnings
12
from typing import Tuple
13
14
import numpy as np
15
import numba
16
17
from .utils import cellpar_to_cell
18
from .periodicset import PeriodicSet
19
20
21
def _custom_warning(message, category, filename, lineno, *args, **kwargs):
22
    return f'{category.__name__}: {message}\n'
23
warnings.formatwarning = _custom_warning
24
25
_EQUIV_SITE_TOL = 1e-3
26
27
CIF_TAGS = {
28
    'cellpar': [
29
        '_cell_length_a',
30
        '_cell_length_b',
31
        '_cell_length_c',
32
        '_cell_angle_alpha',
33
        '_cell_angle_beta',
34
        '_cell_angle_gamma',],
35
36
    'atom_site_fract': [
37
        '_atom_site_fract_x',
38
        '_atom_site_fract_y',
39
        '_atom_site_fract_z',],
40
41
    'atom_site_cartn': [
42
        '_atom_site_cartn_x',
43
        '_atom_site_cartn_y',
44
        '_atom_site_cartn_z',],
45
46
    'atom_symbol': [
47
        '_atom_site_type_symbol',
48
        '_atom_site_label',],
49
50
    'symop': [
51
        '_symmetry_equiv_pos_as_xyz',
52
        '_space_group_symop_operation_xyz',
53
        '_space_group_symop.operation_xyz',
54
        '_symmetry_equiv_pos_as_xyz_',
55
        '_space_group_symop_operation_xyz_',],
56
57
    'spacegroup_name': [
58
        '_space_group_name_Hall',
59
        '_symmetry_space_group_name_hall',
60
        '_space_group_name_H-M_alt',
61
        '_symmetry_space_group_name_H-M',
62
        '_symmetry_space_group_name_H_M',
63
        '_symmetry_space_group_name_h-m',],
64
65
    'spacegroup_number': [
66
        '_space_group_IT_number',
67
        '_symmetry_Int_Tables_number',
68
        '_space_group_IT_number_',
69
        '_symmetry_Int_Tables_number_',],
70
}
71
72
73
class _Reader:
74
75
    _DISORDER_OPTIONS = {'skip', 'ordered_sites', 'all_sites'}
76
77
    def __init__(self, iterable, converter, show_warnings=True):
78
        self._iterator = iter(iterable)
79
        self._converter = converter
80
        self.show_warnings = show_warnings
81
82
    def __iter__(self):
83
        if self._iterator is None or self._converter is None:
84
            raise RuntimeError(f'{self.__class__.__name__} not initialized.')
85
        return self
86
87
    def __next__(self):
88
        """Iterates over self._iterator, passing items through
89
        self._converter. Catches :class:`ParseError <.io.ParseError>`
90
        and warnings raised in self._converter, optionally printing
91
        them.
92
        """
93
94
        if not self.show_warnings:
95
            warnings.simplefilter('ignore')
96
97
        while True:
98
99
            item = next(self._iterator)
100
101
            with warnings.catch_warnings(record=True) as warning_msgs:
102
                msg = None
103
                try:
104
                    periodic_set = self._converter(item)
105
                except ParseError as err:
106
                    msg = str(err)
107
108
            if msg:
109
                warnings.warn(msg)
110
                continue
111
112
            for warning in warning_msgs:
113
                msg = f'{periodic_set.name} {warning.message}'
114
                warnings.warn(msg, category=warning.category)
115
116
            return periodic_set
117
118
    def read(self):
119
        """Reads the crystal(s), returns one
120
        :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` if there is
121
        only one, otherwise returns a list.
122
        """
123
124
        l = list(self)
125
        if len(l) == 1:
126
            return l[0]
127
        return l
128
129
130
class CifReader(_Reader):
131
    """Read all structures in a .cif file or all files in a folder
132
    with ase or csd-python-api (if installed), yielding
133
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
134
135
    Parameters
136
    ----------
137
    path : str
138
        Path to a .cif file or directory. (Other files are accepted when
139
        using ``reader='ccdc'``, if csd-python-api is installed.)
140
    reader : str, optional
141
        The backend package used for parsing. Default is :code:`ase`,
142
        to use csd-python-api change to :code:`ccdc`. The ccdc reader
143
        should be able to read any format accepted by
144
        :class:`ccdc.io.EntryReader`, though only cifs have been tested.
145
    remove_hydrogens : bool, optional
146
        Remove Hydrogens from the crystal.
147
    disorder : str, optional
148
        Controls how disordered structures are handled. Default is
149
        ``skip`` which skips any crystal with disorder, since disorder
150
        conflicts with the periodic set model. To read disordered
151
        structures anyway, choose either :code:`ordered_sites` to remove
152
        atoms with disorder or :code:`all_sites` include all atoms
153
        regardless of disorder.
154
    heaviest_component : bool, optional
155
        csd-python-api only. Removes all but the heaviest molecule in
156
        the asymmeric unit, intended for removing solvents.
157
    molecular_centres : bool, default False
158
        csd-python-api only. Extract the centres of molecules in the
159
        unit cell and store in the attribute molecular_centres.
160
    show_warnings : bool, optional
161
        Controls whether warnings that arise during reading are printed.
162
163
    Yields
164
    ------
165
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
166
        Represents the crystal as a periodic set, consisting of a finite
167
        set of points (motif) and lattice (unit cell). Contains other
168
        data, e.g. the crystal's name and information about the
169
        asymmetric unit.
170
171
    Examples
172
    --------
173
174
        ::
175
176
            # Put all crystals in a .CIF in a list
177
            structures = list(amd.CifReader('mycif.cif'))
178
179
            # Can also accept path to a directory, reading all files inside
180
            structures = list(amd.CifReader('path/to/folder'))
181
182
            # Reads just one if the .CIF has just one crystal
183
            periodic_set = amd.CifReader('mycif.cif').read()
184
185
            # List of AMDs (k=100) of crystals in a .CIF
186
            amds = [amd.AMD(periodic_set, 100) for periodic_set in amd.CifReader('mycif.cif')]
187
    """
188
189
    def __init__(
190
            self,
191
            path,
192
            reader='ase',
193
            remove_hydrogens=False,
194
            disorder='skip',
195
            heaviest_component=False,
196
            molecular_centres=False,
197
            show_warnings=True,
198
    ):
199
200
        if disorder not in CifReader._DISORDER_OPTIONS:
201
            msg = 'disorder parameter must be one of ' \
202
                  f'{_Reader._DISORDER_OPTIONS} (passed {disorder})'
203
            raise ValueError(msg)
204
205
        if reader != 'ccdc':
206
            if heaviest_component:
207
                msg = 'Parameter heaviest_component only implemented for ' \
208
                      'reader="ccdc".'
209
                raise NotImplementedError(msg)
210
211
            if molecular_centres:
212
                msg = 'Parameter molecular_centres only implemented for ' \
213
                      'reader="ccdc".'
214
                raise NotImplementedError(msg)
215
216
        if reader in ('ase', 'pycodcif'):
217
            from ase.io.cif import parse_cif
218
            extensions = {'cif'}
219
            file_parser = functools.partial(parse_cif, reader=reader)
220
            converter = functools.partial(
221
                periodicset_from_ase_cifblock,
222
                remove_hydrogens=remove_hydrogens,
223
                disorder=disorder
224
            )
225
226
        elif reader == 'pymatgen':
227
            extensions = {'cif'}
228
            file_parser = CifReader._pymatgen_cifblock_generator
229
            converter = functools.partial(
230
                periodicset_from_pymatgen_cifblock,
231
                remove_hydrogens=remove_hydrogens,
232
                disorder=disorder
233
            )
234
235
        elif reader == 'gemmi':
236
            import gemmi
237
            extensions = {'cif'}
238
            file_parser = gemmi.cif.read_file
239
            converter = functools.partial(
240
                periodicset_from_gemmi_block,
241
                remove_hydrogens=remove_hydrogens,
242
                disorder=disorder
243
            )
244
245
        elif reader == 'ccdc':
246
            try:
247
                import ccdc.io
248
            except (ImportError, RuntimeError) as e:
249
                msg = 'Failed to import csd-python-api, please check it is' \
250
                      'installed and licensed.'
251
                raise ImportError(msg) from e
252
253
            extensions = ccdc.io.EntryReader.known_suffixes
254
            file_parser = ccdc.io.EntryReader
255
            converter = functools.partial(
256
                periodicset_from_ccdc_entry,
257
                remove_hydrogens=remove_hydrogens,
258
                disorder=disorder,
259
                molecular_centres=molecular_centres,
260
                heaviest_component=heaviest_component
261
            )
262
263
        else:
264
            raise ValueError(f'Unknown reader {reader}.')
265
266
        if os.path.isfile(path):
267
            iterable = file_parser(path)
268
        elif os.path.isdir(path):
269
            iterable = CifReader._dir_generator(path, file_parser, extensions)
270
        else:
271
            raise FileNotFoundError(f'No such file or directory: {path}')
272
273
        super().__init__(iterable, converter, show_warnings=show_warnings)
274
275
    @staticmethod
276
    def _dir_generator(path, callable, extensions):
277
        for file in os.listdir(path):
278
            suff = os.path.splitext(file)[1][1:]
279
            if suff.lower() in extensions:
280
                yield from callable(os.path.join(path, file))
281
282
    @staticmethod
283
    def _pymatgen_cifblock_generator(path):
284
        """Path to .cif --> generator of pymatgen CifBlock objects."""
285
        from pymatgen.io.cif import CifFile
286
        yield from CifFile.from_file(path).data.values()
287
288
289
class CSDReader(_Reader):
290
    """Read structures from the CSD with csd-python-api, yielding
291
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
292
293
    Parameters
294
    ----------
295
    refcodes : str or List[str], optional
296
        Single or list of CSD refcodes to read. If None or 'CSD',
297
        iterates over the whole CSD.
298
    families : bool, optional
299
        Read all entries whose refcode starts with the given strings, or
300
        'families' (e.g. giving 'DEBXIT' reads all entries starting with
301
        DEBXIT).
302
    remove_hydrogens : bool, optional
303
        Remove hydrogens from the crystal.
304
    disorder : str, optional
305
        Controls how disordered structures are handled. Default is
306
        ``skip`` which skips any crystal with disorder, since disorder
307
        conflicts with the periodic set model. To read disordered
308
        structures anyway, choose either :code:`ordered_sites` to remove
309
        atoms with disorder or :code:`all_sites` include all atoms
310
        regardless of disorder.
311
    heaviest_component : bool, optional
312
        Removes all but the heaviest molecule in the asymmeric unit,
313
        intended for removing solvents.
314
    molecular_centres : bool, default False
315
        Extract the centres of molecules in the unit cell and store in
316
        attribute molecular_centres.
317
    show_warnings : bool, optional
318
        Controls whether warnings that arise during reading are printed.
319
320
    Yields
321
    ------
322
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
323
        Represents the crystal as a periodic set, consisting of a finite
324
        set of points (motif) and lattice (unit cell). Contains other
325
        useful data, e.g. the crystal's name and information about the
326
        asymmetric unit for calculation.
327
328
    Examples
329
    --------
330
331
        ::
332
333
            # Put these entries in a list
334
            refcodes = ['DEBXIT01', 'DEBXIT05', 'HXACAN01']
335
            structures = list(amd.CSDReader(refcodes))
336
337
            # Read refcode families (any whose refcode starts with strings in the list)
338
            refcode_families = ['ACSALA', 'HXACAN']
339
            structures = list(amd.CSDReader(refcode_families, families=True))
340
341
            # Get AMDs (k=100) for crystals in these families
342
            refcodes = ['ACSALA', 'HXACAN']
343
            amds = []
344
            for periodic_set in amd.CSDReader(refcodes, families=True):
345
                amds.append(amd.AMD(periodic_set, 100))
346
347
            # Giving the reader nothing reads from the whole CSD.
348
            for periodic_set in amd.CSDReader():
349
                ...
350
    """
351
352
    def __init__(
353
            self,
354
            refcodes=None,
355
            families=False,
356
            remove_hydrogens=False,
357
            disorder='skip',
358
            heaviest_component=False,
359
            molecular_centres=False,
360
            show_warnings=True,
361
    ):
362
363
        try:
364
            import ccdc.io
365
        except (ImportError, RuntimeError) as _:
366
            msg = 'Failed to import csd-python-api, please check it is ' \
367
                  'installed and licensed.'
368
            raise ImportError(msg)
369
370
        if disorder not in CSDReader._DISORDER_OPTIONS:
371
            msg = 'disorder parameter must be one of ' \
372
                  f'{_Reader._DISORDER_OPTIONS} (passed {disorder})'
373
            raise ValueError(msg)
374
375
        if isinstance(refcodes, str) and refcodes.lower() == 'csd':
376
            refcodes = None
377
378
        if refcodes is None:
379
            families = False
380
        else:
381
            if isinstance(refcodes, str):
382
                refcodes = [refcodes]
383
            else:
384
                refcodes = list(refcodes)
385
386
        if families:
387
            refcodes = _refcodes_from_families_ccdc(refcodes)
388
389
        entry_reader = ccdc.io.EntryReader('CSD')
390
        converter = functools.partial(
391
            periodicset_from_ccdc_entry,
392
            remove_hydrogens=remove_hydrogens,
393
            disorder=disorder,
394
            molecular_centres=molecular_centres,
395
            heaviest_component=heaviest_component
396
        )
397
        iterable = self._ccdc_generator(refcodes, entry_reader)
398
        super().__init__(iterable, converter, show_warnings=show_warnings)
399
400
    @staticmethod
401
    def _ccdc_generator(refcodes, entry_reader):
402
        """Generates ccdc Entries from CSD refcodes.
403
        """
404
405
        if refcodes is None:
406
            for entry in entry_reader:
407
                yield entry
408
        else:
409
            for refcode in refcodes:
410
                try:
411
                    entry = entry_reader.entry(refcode)
412
                    yield entry
413
                except RuntimeError:
414
                    warnings.warn(f'{refcode} not found in database')
415
416
417
class ParseError(ValueError):
418
    """Raised when an item cannot be parsed into a periodic set.
419
    """
420
    pass
421
422
423
def periodicset_from_ase_cifblock(
424
        block,
425
        remove_hydrogens=False,
426
        disorder='skip'
427
) -> PeriodicSet:
428
    """:class:`ase.io.cif.CIFBlock` --> 
429
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
430
    :class:`ase.io.cif.CIFBlock` is the type returned by
431
    :class:`ase.io.cif.parse_cif`.
432
433
    Parameters
434
    ----------
435
    block : :class:`ase.io.cif.CIFBlock`
436
        An ase :class:`ase.io.cif.CIFBlock` object representing a
437
        crystal.
438
    remove_hydrogens : bool, optional
439
        Remove Hydrogens from the crystal.
440
    disorder : str, optional
441
        Controls how disordered structures are handled. Default is
442
        ``skip`` which skips any crystal with disorder, since disorder
443
        conflicts with the periodic set model. To read disordered
444
        structures anyway, choose either :code:`ordered_sites` to remove
445
        atoms with disorder or :code:`all_sites` include all atoms
446
        regardless of disorder.
447
448
    Returns
449
    -------
450
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
451
        Represents the crystal as a periodic set, consisting of a finite
452
        set of points (motif) and lattice (unit cell). Contains other
453
        useful data, e.g. the crystal's name and information about the
454
        asymmetric unit for calculation.
455
456
    Raises
457
    ------
458
    ParseError
459
        Raised if the structure fails to be parsed for any of the
460
        following: 1. Required data is missing (e.g. cell parameters),
461
        2. The motif is empty after removing H or disordered sites,
462
        3. :code:``disorder == 'skip'`` and disorder is found on any
463
        atom.
464
    """
465
466
    import ase
467
    from ase.spacegroup.spacegroup import parse_sitesym
468
469
    # Unit cell
470
    cellpar = [block.get(tag) for tag in CIF_TAGS['cellpar']]
471
    if None in cellpar:
472
        raise ParseError(f'{block.name} has missing cell data')
473
    cell = cellpar_to_cell(*cellpar)
474
475
    # Asymmetric unit coordinates. ase removes uncertainty brackets
476
    cartesian = False # flag needed for later
477
    asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_fract']]
478
    if None in asym_unit: # missing scaled coords, try Cartesian
479
        asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_cartn']]
480
        if None in asym_unit:
481
            raise ParseError(f'{block.name} has missing coordinates')
482
        cartesian = True
483
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
484
485
    # Atomic types
486
    asym_symbols = block._get_any(CIF_TAGS['atom_symbol'])
487
    if asym_symbols is None:
488
        warnings.warn('missing atomic types will be labelled 0')
489
        asym_types = [0] * len(asym_unit)
490
    else:
491
        asym_types = []
492
        for label in asym_symbols:
493
            if label in ('.', '?'):
494
                warnings.warn('missing atomic types will be labelled 0')
495
                num = 0
496
            else:
497
                sym = re.search(r'([A-Z][a-z]?)', label).group(0)
498
                if sym == 'D':
499
                    sym = 'H'
500
                num = ase.data.atomic_numbers[sym]
501
            asym_types.append(num)
502
503
    # Find where sites have disorder if necassary
504
    has_disorder = []
505
    if disorder != 'all_sites':
506
        occupancies = block.get('_atom_site_occupancy')
507
        if occupancies is None:
508
            occupancies = np.ones((len(asym_unit), ))
509
        labels = block.get('_atom_site_label')
510
        if labels is None:
511
            labels = [''] * len(asym_unit)
512
        for lab, occ in zip(labels, occupancies):
513
            has_disorder.append(_has_disorder(lab, occ))
514
515
    # Remove sites with ?, . or other invalid string for coordinates
516
    invalid = []
517
    for i, xyz in enumerate(asym_unit):
518
        if not all(isinstance(coord, (int, float)) for coord in xyz):
519
            invalid.append(i)
520
    if invalid:
521
        warnings.warn('atoms without sites or missing data will be removed')
522
        asym_unit = [c for i, c in enumerate(asym_unit) if i not in invalid]
523
        asym_types = [t for i, t in enumerate(asym_types) if i not in invalid]
524
        if disorder != 'all_sites':
525
            has_disorder = [d for i, d in enumerate(has_disorder)
526
                            if i not in invalid]
527
528
    remove_sites = []
529
530
    if remove_hydrogens:
531
        remove_sites.extend(i for i, num in enumerate(asym_types) if num == 1)
532
533
    # Remove atoms with fractional occupancy or raise ParseError
534 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
535
        for i, dis in enumerate(has_disorder):
536
            if i in remove_sites:
537
                continue
538
            if dis:
539
                if disorder == 'skip':
540
                    msg = f"{block.name} has disorder, pass " \
541
                           "disorder='ordered_sites'or 'all_sites' to " \
542
                           "remove/ignore disorder"
543
                    raise ParseError(msg)
544
                elif disorder == 'ordered_sites':
545
                    remove_sites.append(i)
546
547
    # Asymmetric unit
548
    asym_unit = [c for i, c in enumerate(asym_unit) if i not in remove_sites]
549
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
550
    if len(asym_unit) == 0:
551
        raise ParseError(f'{block.name} has no valid sites')
552
    asym_unit = np.array(asym_unit)
553
554
    # If Cartesian coords were given, convert to scaled
555
    if cartesian:
556
        asym_unit = asym_unit @ np.linalg.inv(cell)
557
    asym_unit = np.mod(asym_unit, 1)
558
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
559
560
    # Remove overlapping sites unless disorder == 'all_sites'
561
    if disorder != 'all_sites':
562
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
563
        if not np.all(keep_sites):
564
            msg = 'may have overlapping sites, duplicates will be removed'
565
            warnings.warn(msg)
566
            asym_unit = asym_unit[keep_sites]
567
            asym_types = [t for t, keep in zip(asym_types, keep_sites) if keep]
568
569
    # Symmetry operations
570
    sitesym = block._get_any(CIF_TAGS['symop'])
571
    if sitesym is None: # no symops, use spacegroup
572
        try:
573
            spg = block.get_spacegroup(True)
574
            rot, trans = spg.rotations, spg.translations
575
        except: # no spacegroup, assume P1
576
            rot, trans = parse_sitesym(['x,y,z'])
577
    else:
578
        if isinstance(sitesym, str):
579
            sitesym = [sitesym]
580
        rot, trans = parse_sitesym(sitesym)
581
582
    # Apply symmetries to asymmetric unit
583
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
584
    types = np.array([asym_types[i] for i in inverses])
585
    motif = frac_motif @ cell
586
587
    return PeriodicSet(
588
        motif=motif,
589
        cell=cell,
590
        name=block.name,
591
        asymmetric_unit=asym_inds,
592
        wyckoff_multiplicities=wyc_muls,
593
        types=types
594
    )
595
596
597
def periodicset_from_ase_atoms(
598
        atoms,
599
        remove_hydrogens=False
600
) -> PeriodicSet:
601
    """:class:`ase.atoms.Atoms` -->
602
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`. Does not have
603
    the option to remove disorder.
604
605
    Parameters
606
    ----------
607
    atoms : :class:`ase.atoms.Atoms`
608
        An ase :class:`ase.atoms.Atoms` object representing a crystal.
609
    remove_hydrogens : bool, optional
610
        Remove Hydrogens from the crystal.
611
612
    Returns
613
    -------
614
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
615
        Represents the crystal as a periodic set, consisting of a finite
616
        set of points (motif) and lattice (unit cell). Contains other
617
        useful data, e.g. the crystal's name and information about the
618
        asymmetric unit for calculation.
619
620
    Raises
621
    ------
622
    ParseError
623
        Raised if there are no valid sites in atoms.
624
    """
625
626
    from ase.spacegroup import get_basis
627
628
    cell = atoms.get_cell().array
629
630
    remove_inds = []
631
    if remove_hydrogens:
632
        for i in np.where(atoms.get_atomic_numbers() == 1)[0]:
633
            remove_inds.append(i)
634
    for i in sorted(remove_inds, reverse=True):
635
        atoms.pop(i)
636
637
    if len(atoms) == 0:
638
        raise ParseError(f'ase Atoms object has no valid sites')
639
640
    # Symmetry operations from spacegroup 
641
    spg = None
642
    if 'spacegroup' in atoms.info:
643
        spg = atoms.info['spacegroup']
644
        rot, trans = spg.rotations, spg.translations
645
    # else assume no symmetries?
646
647
    # Asymmetric unit. ase default tol is 1e-5
648
    asym_unit = get_basis(atoms, spacegroup=spg, tol=_EQUIV_SITE_TOL)
649
    frac_motif, asym_inds, wyc_muls, _ = _expand_asym_unit(asym_unit, rot, trans)
0 ignored issues
show
introduced by
The variable trans does not seem to be defined in case 'spacegroup' in atoms.info on line 642 is False. Are you sure this can never be the case?
Loading history...
introduced by
The variable rot does not seem to be defined in case 'spacegroup' in atoms.info on line 642 is False. Are you sure this can never be the case?
Loading history...
650
    motif = frac_motif @ cell
651
652
    return PeriodicSet(
653
        motif=motif,
654
        cell=cell,
655
        asymmetric_unit=asym_inds,
656
        wyckoff_multiplicities=wyc_muls,
657
        types=atoms.get_atomic_numbers()
658
    )
659
660
661
def periodicset_from_pymatgen_cifblock(
662
        block,
663
        remove_hydrogens=False,
664
        disorder='skip'
665
) -> PeriodicSet:
666
    """:class:`pymatgen.io.cif.CifBlock` -->
667
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
668
669
    Parameters
670
    ----------
671
    block : :class:`pymatgen.io.cif.CifBlock`
672
        A pymatgen CifBlock object representing a crystal.
673
    remove_hydrogens : bool, optional
674
        Remove Hydrogens from the crystal.
675
    disorder : str, optional
676
        Controls how disordered structures are handled. Default is
677
        ``skip`` which skips any crystal with disorder, since disorder
678
        conflicts with the periodic set model. To read disordered
679
        structures anyway, choose either :code:`ordered_sites` to remove
680
        atoms with disorder or :code:`all_sites` include all atoms
681
        regardless of disorder.
682
683
    Returns
684
    -------
685
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
686
        Represents the crystal as a periodic set, consisting of a finite
687
        set of points (motif) and lattice (unit cell). Contains other
688
        useful data, e.g. the crystal's name and information about the
689
        asymmetric unit for calculation.
690
691
    Raises
692
    ------
693
    ParseError
694
        Raised if the structure can/should not be parsed for the
695
        following reasons: 1. No sites found or motif is empty after
696
        removing Hydrogens & disorder, 2. A site has missing
697
        coordinates, 3. :code:``disorder == 'skip'`` and disorder is
698
        found on any atom.
699
    """
700
701
    from pymatgen.io.cif import str2float
702
    import pymatgen.core.periodic_table as periodic_table
703
704
    odict = block.data
705
706
    # Unit cell
707
    cellpar = [odict.get(tag) for tag in CIF_TAGS['cellpar']]
708
    if None in cellpar:
709
        raise ParseError(f'{block.header} has missing cell data')
710
    cellpar = [str2float(v) for v in cellpar]
711
    cell = cellpar_to_cell(*cellpar)
712
713
    # Asymmetric unit coordinates
714
    cartesian = False
715
    asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_fract']]
716
717
    if None in asym_unit: # missing scaled coordinates, try Cartesian
718
        asym_unit = [odict.get(tag) for tag in CIF_TAGS['atom_site_cartn']]
719
        if None in asym_unit:
720
            raise ParseError(f'{block.header} has no coordinates')
721
        cartesian = True
722
723
    asym_unit = list(zip(*asym_unit)) # transpose [xs,ys,zs] -> [p1,p2,...]
724
    asym_unit = [[str2float(coord) for coord in xyz] 
725
                 for xyz in asym_unit]
726
727
    # Atomic types
728
    for tag in CIF_TAGS['atom_symbol']:
729
        asym_symbols = odict.get(tag)
730
        if asym_symbols is not None:
731
            asym_types = []
732
            for label in asym_symbols:
733
                if label in ('.', '?'):
734
                    warnings.warn('missing atomic types will be labelled 0')
735
                    num = 0
736
                else:
737
                    sym = re.search(r'([A-Z][a-z]?)', label).group(0)
738
                    if sym == 'D':
739
                        sym = 'H'
740
                    num = periodic_table.Element[sym].number
741
                asym_types.append(num)
742
            break
743
    else:
744
        warnings.warn('missing atomic types will be labelled 0')
745
        asym_types = [0] * len(asym_unit)
746
747
    # Find where sites have disorder if necassary
748
    has_disorder = []
749
    if disorder != 'all_sites':
750
        occupancies = odict.get('_atom_site_occupancy')
751
        if occupancies is None:
752
            occupancies = np.ones((len(asym_unit), ))
753
        labels = odict.get('_atom_site_label')
754
        if labels is None:
755
            labels = [''] * len(asym_unit)
756
        for lab, occ in zip(labels, occupancies):
757
            has_disorder.append(_has_disorder(lab, occ))
758
759
    # Remove sites with ?, . or other invalid string for coordinates
760
    invalid = []
761
    for i, xyz in enumerate(asym_unit):
762
        if not all(isinstance(coord, (int, float)) for coord in xyz):
763
            invalid.append(i)
764
765
    if invalid:
766
        warnings.warn('atoms without sites or missing data will be removed')
767
        asym_unit = [c for i, c in enumerate(asym_unit) if i not in invalid]
768
        asym_types = [c for i, c in enumerate(asym_types) if i not in invalid]
769
        if disorder != 'all_sites':
770
            has_disorder = [d for i, d in enumerate(has_disorder) if i not in invalid]
771
772
    remove_sites = []
773
774
    if remove_hydrogens:
775
        remove_sites.extend((i for i, num in enumerate(asym_types) if num == 1))
0 ignored issues
show
introduced by
The variable i does not seem to be defined in case the for loop on line 761 is not entered. Are you sure this can never be the case?
Loading history...
776
777
    # Remove atoms with fractional occupancy or raise ParseError
778 View Code Duplication
    if disorder != 'all_sites':
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
779
        for i, dis in enumerate(has_disorder):
780
            if i in remove_sites:
781
                continue
782
            if dis:
783
                if disorder == 'skip':
784
                    msg = f"{block.header} has disorder, pass " \
785
                            "disorder='ordered_sites' or 'all_sites' to " \
786
                            "remove/ignore disorder"
787
                    raise ParseError(msg)
788
                elif disorder == 'ordered_sites':
789
                    remove_sites.append(i)
790
791
    # Asymmetric unit
792
    asym_unit = [c for i, c in enumerate(asym_unit) if i not in remove_sites]
793
    asym_types = [t for i, t in enumerate(asym_types) if i not in remove_sites]
794
    if len(asym_unit) == 0:
795
        raise ParseError(f'{block.header} has no valid sites')
796
    asym_unit = np.array(asym_unit)
797
798
    # If Cartesian coords were given, convert to scaled
799
    if cartesian:
800
        asym_unit = asym_unit @ np.linalg.inv(cell)
801
    asym_unit = np.mod(asym_unit, 1)
802
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
803
804
    # Remove overlapping sites unless disorder == 'all_sites'
805
    if disorder != 'all_sites':      
806
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
807
        if not np.all(keep_sites):
808
            msg = 'may have overlapping sites; duplicates will be removed'
809
            warnings.warn(msg)
810
        asym_unit = asym_unit[keep_sites]
811
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
812
813
    # Apply symmetries to asymmetric unit
814
    rot, trans = _parse_sitesym_pymatgen(odict)
815
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
816
    types = np.array([asym_types[i] for i in inverses])
817
    motif = frac_motif @ cell
818
819
    return PeriodicSet(
820
        motif=motif,
821
        cell=cell,
822
        name=block.header,
823
        asymmetric_unit=asym_inds,
824
        wyckoff_multiplicities=wyc_muls,
825
        types=types
826
    )
827
828
829
def periodicset_from_pymatgen_structure(
830
        structure,
831
        remove_hydrogens=False,
832
        disorder='skip'
833
) -> PeriodicSet:
834
    """:class:`pymatgen.core.structure.Structure` -->
835
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
836
    Does not set the name of the periodic set, as there seems to be no
837
    name attribute in the pymatgen Structure object.
838
839
    Parameters
840
    ----------
841
    structure : :class:`pymatgen.core.structure.Structure`
842
        A pymatgen Structure object representing a crystal.
843
    remove_hydrogens : bool, optional
844
        Remove Hydrogens from the crystal.
845
    disorder : str, optional
846
        Controls how disordered structures are handled. Default is
847
        ``skip`` which skips any crystal with disorder, since disorder
848
        conflicts with the periodic set model. To read disordered
849
        structures anyway, choose either :code:`ordered_sites` to remove
850
        atoms with disorder or :code:`all_sites` include all atoms
851
        regardless of disorder.
852
853
    Returns
854
    -------
855
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
856
        Represents the crystal as a periodic set, consisting of a finite
857
        set of points (motif) and lattice (unit cell). Contains other
858
        useful data, e.g. the crystal's name and information about the
859
        asymmetric unit for calculation.
860
    
861
    Raises
862
    ------
863
    ParseError
864
        Raised if the :code:`disorder == 'skip'` and 
865
        :code:`not structure.is_ordered`
866
    """
867
868
    from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
869
870
    if remove_hydrogens:
871
        structure.remove_species(['H', 'D'])
872
873
    # Disorder
874
    if disorder == 'skip':
875
        if not structure.is_ordered:
876
            msg = f"pymatgen Structure has disorder, pass " \
877
                   "disorder='ordered_sites' or 'all_sites' to " \
878
                   "remove/ignore disorder"
879
            raise ParseError(msg)
880
    elif disorder == 'ordered_sites':
881
        remove_inds = []
882
        for i, comp in enumerate(structure.species_and_occu):
883
            if comp.num_atoms < 1:
884
                remove_inds.append(i)
885
        structure.remove_sites(remove_inds)
886
887
    motif = structure.cart_coords
888
    cell = structure.lattice.matrix
889
    sym_structure = SpacegroupAnalyzer(structure).get_symmetrized_structure()
890
    asym_unit = np.array([l[0] for l in sym_structure.equivalent_indices])
891
    wyc_muls = np.array([len(l) for l in sym_structure.equivalent_indices])
892
    types = np.array(sym_structure.atomic_numbers)
893
894
    return PeriodicSet(
895
        motif=motif,
896
        cell=cell,
897
        asymmetric_unit=asym_unit,
898
        wyckoff_multiplicities=wyc_muls,
899
        types=types
900
    )
901
902
903
def periodicset_from_ccdc_entry(
904
        entry,
905
        remove_hydrogens=False,
906
        disorder='skip',
907
        heaviest_component=False,
908
        molecular_centres=False
909
) -> PeriodicSet:
910
    """:class:`ccdc.entry.Entry` -->
911
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
912
    Entry is the type returned by :class:`ccdc.io.EntryReader`.
913
914
    Parameters
915
    ----------
916
    entry : :class:`ccdc.entry.Entry`
917
        A ccdc Entry object representing a database entry.
918
    remove_hydrogens : bool, optional
919
        Remove Hydrogens from the crystal.
920
    disorder : str, optional
921
        Controls how disordered structures are handled. Default is
922
        ``skip`` which skips any crystal with disorder, since disorder
923
        conflicts with the periodic set model. To read disordered
924
        structures anyway, choose either :code:`ordered_sites` to remove
925
        atoms with disorder or :code:`all_sites` include all atoms
926
        regardless of disorder.
927
    heaviest_component : bool, optional
928
        Removes all but the heaviest molecule in the asymmeric unit,
929
        intended for removing solvents.
930
    molecular_centres : bool, default False
931
        Extract the centres of molecules in the unit cell and store in
932
        the attribute molecular_centres of the returned PeriodicSet.
933
934
    Returns
935
    -------
936
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
937
        Represents the crystal as a periodic set, consisting of a finite
938
        set of points (motif) and lattice (unit cell). Contains other
939
        useful data, e.g. the crystal's name and information about the
940
        asymmetric unit for calculation.
941
942
    Raises
943
    ------
944
    ParseError
945
        Raised if the structure fails parsing for any of the following:
946
        1. entry.has_3d_structure is False, 2.
947
        :code:``disorder == 'skip'`` and disorder is found on any atom,
948
        3. entry.crystal.molecule.all_atoms_have_sites is False,
949
        4. a.fractional_coordinates is None for any a in
950
        entry.crystal.disordered_molecule, 5. The motif is empty after
951
        removing Hydrogens and disordered sites.
952
    """
953
954
    # Entry specific flags
955
    if not entry.has_3d_structure:
956
        raise ParseError(f'{entry.identifier} has no 3D structure')
957
958
    # Disorder
959
    if disorder == 'skip' and entry.has_disorder:
960
        msg = f"{entry.identifier} has disorder, pass " \
961
                "disorder='ordered_sites' or 'all_sites' to remove/ignore" \
962
                "disorder"
963
        raise ParseError(msg)
964
965
    return periodicset_from_ccdc_crystal(
966
        entry.crystal,
967
        remove_hydrogens=remove_hydrogens,
968
        disorder=disorder,
969
        heaviest_component=heaviest_component,
970
        molecular_centres=molecular_centres
971
    )
972
973
974
def periodicset_from_ccdc_crystal(
975
        crystal,
976
        remove_hydrogens=False,
977
        disorder='skip',
978
        heaviest_component=False,
979
        molecular_centres=False
980
) -> PeriodicSet:
981
    """:class:`ccdc.crystal.Crystal` -->
982
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
983
    Crystal is the type returned by :class:`ccdc.io.CrystalReader`.
984
985
    Parameters
986
    ----------
987
    crystal : :class:`ccdc.crystal.Crystal`
988
        A ccdc Crystal object representing a crystal structure.
989
    remove_hydrogens : bool, optional
990
        Remove Hydrogens from the crystal.
991
    disorder : str, optional
992
        Controls how disordered structures are handled. Default is
993
        ``skip`` which skips any crystal with disorder, since disorder
994
        conflicts with the periodic set model. To read disordered
995
        structures anyway, choose either :code:`ordered_sites` to remove
996
        atoms with disorder or :code:`all_sites` include all atoms
997
        regardless of disorder.
998
    heaviest_component : bool, optional
999
        Removes all but the heaviest molecule in the asymmeric unit,
1000
        intended for removing solvents.
1001
    molecular_centres : bool, default False
1002
        Extract the centres of molecules in the unit cell and store in
1003
        the attribute molecular_centres of the returned PeriodicSet.
1004
1005
    Returns
1006
    -------
1007
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1008
        Represents the crystal as a periodic set, consisting of a finite
1009
        set of points (motif) and lattice (unit cell). Contains other
1010
        useful data, e.g. the crystal's name and information about the
1011
        asymmetric unit for calculation.
1012
1013
    Raises
1014
    ------
1015
    ParseError
1016
        Raised if the structure fails parsing for any of the following:
1017
        1. :code:``disorder == 'skip'`` and disorder is found on any
1018
        atom, 2. crystal.molecule.all_atoms_have_sites is False,
1019
        3. a.fractional_coordinates is None for any a in
1020
        crystal.disordered_molecule, 4. The motif is empty after
1021
        removing H, disordered sites or solvents.
1022
    """
1023
1024
    molecule = crystal.disordered_molecule
1025
1026
    # Disorder
1027
    if disorder == 'skip':
1028
        if crystal.has_disorder or \
1029
         any(_has_disorder(a.label, a.occupancy) for a in molecule.atoms):
1030
            msg = f"{crystal.identifier} has disorder, pass " \
1031
                   "disorder='ordered_sites' or 'all_sites' to remove/ignore" \
1032
                   "disorder"
1033
            raise ParseError(msg)
1034
1035
    elif disorder == 'ordered_sites':
1036
        molecule.remove_atoms(a for a in molecule.atoms
1037
                              if _has_disorder(a.label, a.occupancy))
1038
1039
    if remove_hydrogens:
1040
        molecule.remove_atoms(
1041
            a for a in molecule.atoms if a.atomic_symbol in 'HD'
1042
        )
1043
1044
    if heaviest_component and len(molecule.components) > 1:
1045
        molecule = _heaviest_component_ccdc(molecule)
1046
1047
    # Remove atoms with missing coordinates and warn
1048
    is_missing = (a.fractional_coordinates is None for a in molecule.atoms)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable a does not seem to be defined.
Loading history...
1049
    if any(is_missing):
1050
        warnings.warn('atoms without sites or missing data will be removed')
1051
        molecule.remove_atoms(
1052
            a for a, missing in zip(molecule.atoms, is_missing) if missing
1053
        )
1054
1055
    if not molecule.all_atoms_have_sites:
1056
        raise ParseError(f'{crystal.identifier} has atoms without sites')
1057
1058
    crystal.molecule = molecule
1059
    cell = cellpar_to_cell(*crystal.cell_lengths, *crystal.cell_angles)
1060
1061
    if molecular_centres:
1062
        frac_centres = _frac_molecular_centres_ccdc(crystal)
1063
        mol_centres = frac_centres @ cell
1064
        return PeriodicSet(mol_centres, cell, name=crystal.identifier)
1065
1066
    asym_atoms = crystal.asymmetric_unit_molecule.atoms
1067
    # check for None?
1068
    asym_unit = np.array([tuple(a.fractional_coordinates) for a in asym_atoms])
1069
1070
    if asym_unit.shape[0] == 0:
1071
        raise ParseError(f'{crystal.identifier} has no valid sites')
1072
1073
    asym_unit = np.mod(asym_unit, 1)
1074
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
1075
    asym_types = [a.atomic_number for a in asym_atoms]
1076
1077
    # Disorder
1078
    if disorder != 'all_sites':
1079
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
1080
        if not np.all(keep_sites):
1081
            msg = 'may have overlapping sites; duplicates will be removed'
1082
            warnings.warn(msg)
1083
        asym_unit = asym_unit[keep_sites]
1084
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1085
1086
    # Symmetry operations
1087
    sitesym = crystal.symmetry_operators
1088
    # try spacegroup numbers?
1089
    if not sitesym:
1090
        sitesym = ['x,y,z']
1091
    rot = np.array([np.array(crystal.symmetry_rotation(op)).reshape((3, 3)) 
1092
                    for op in sitesym])
1093
    trans = np.array([crystal.symmetry_translation(op) for op in sitesym])
1094
1095
    # Apply symmetries to asymmetric unit
1096
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1097
    motif = frac_motif @ cell
1098
    types = np.array([asym_types[i] for i in inverses])
1099
1100
    return PeriodicSet(
1101
        motif=motif,
1102
        cell=cell,
1103
        name=crystal.identifier,
1104
        asymmetric_unit=asym_inds,
1105
        wyckoff_multiplicities=wyc_muls,
1106
        types=types
1107
    )
1108
1109
1110
# Not quite finished.
1111
def periodicset_from_gemmi_block(
1112
        block,
1113
        remove_hydrogens=False,
1114
        disorder='skip'
1115
) -> PeriodicSet:
1116
    """:class:`gemmi.cif.Block` -->
1117
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
1118
    Block is the type returned by :class:`gemmi.cif.read_file`.
1119
1120
    Parameters
1121
    ----------
1122
    block : :class:`ase.io.cif.CIFBlock`
1123
        An ase CIFBlock object representing a crystal.
1124
    remove_hydrogens : bool, optional
1125
        Remove Hydrogens from the crystal.
1126
    disorder : str, optional
1127
        Controls how disordered structures are handled. Default is
1128
        ``skip`` which skips any crystal with disorder, since disorder
1129
        conflicts with the periodic set model. To read disordered
1130
        structures anyway, choose either :code:`ordered_sites` to remove
1131
        atoms with disorder or :code:`all_sites` include all atoms
1132
        regardless of disorder.
1133
1134
    Returns
1135
    -------
1136
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1137
        Represents the crystal as a periodic set, consisting of a finite
1138
        set of points (motif) and lattice (unit cell). Contains other
1139
        useful data, e.g. the crystal's name and information about the
1140
        asymmetric unit for calculation.
1141
1142
    Raises
1143
    ------
1144
    ParseError
1145
        Raised if the structure fails to be parsed for any of the
1146
        following: 1. Required data is missing (e.g. cell parameters),
1147
        2. :code:``disorder == 'skip'`` and disorder is found on any
1148
        atom, 3. The motif is empty after removing H or disordered
1149
        sites.
1150
    """
1151
1152
    import gemmi
1153
    from ase.spacegroup.spacegroup import parse_sitesym
1154
1155
    # Unit cell
1156
    cellpar = [gemmi.cif.as_number(block.find_value(tag))
1157
               for tag in CIF_TAGS['cellpar']]
1158
    if None in cellpar:
1159
        raise ParseError(f'{block.name} has missing cell information')
1160
    cell = cellpar_to_cell(*cellpar)
1161
1162
    xyz_loop = block.find(CIF_TAGS['atom_site_fract']).loop
1163
    if xyz_loop is None:
1164
        # check for Cartesian coordinates
1165
        raise ParseError(f'{block.name} has missing coordinate data')
1166
1167
    # Asymmetric unit coordinates
1168
    loop_dict = _loop_to_dict_gemmi(xyz_loop)
1169
    xyz_str = [loop_dict[t] for t in CIF_TAGS['atom_site_fract']]
1170
    asym_unit = [[gemmi.cif.as_number(c) for c in coords] for coords in xyz_str]
1171
    asym_unit = np.mod(np.array(asym_unit).T, 1)
1172
    # asym_unit = _snap_small_prec_coords(asym_unit) # recommended by pymatgen
1173
1174
    # Asymmetric unit types
1175
    if '_atom_site_type_symbol' in loop_dict:
1176
        asym_syms = loop_dict['_atom_site_type_symbol']
1177
        asym_types = [gemmi.Element(s).atomic_number for s in asym_syms]
1178
    else:
1179
        warnings.warn('missing atomic types will be labelled 0')
1180
        asym_types = [0 for _ in range(len(asym_unit))]
1181
1182
    remove_sites = []
1183
1184
    # Disorder
1185
    if '_atom_site_label' in loop_dict:
1186
        labels = loop_dict['_atom_site_label']
1187
    else:
1188
        labels = [''] * xyz_loop.length()
1189
1190
    if '_atom_site_occupancy' in loop_dict:
1191
        occupancies = [gemmi.cif.as_number(occ)
1192
                       for occ in loop_dict['_atom_site_occupancy']]
1193
    else:
1194
        occupancies = [None for _ in range(xyz_loop.length())]
1195
1196
    if disorder == 'skip':
1197
        if any(_has_disorder(l, o) for l, o in zip(labels, occupancies)):
1198
            msg = f"{block.name} has disorder, pass " \
1199
                   "disorder='ordered_sites' or 'all_sites' to " \
1200
                   "remove/ignore disorder"
1201
            raise ParseError(msg)
1202
    elif disorder == 'ordered_sites':
1203
        for i, (lab, occ) in enumerate(zip(labels, occupancies)):
1204
            if _has_disorder(lab, occ):
1205
                remove_sites.append(i)
1206
1207
    if remove_hydrogens:
1208
        remove_sites.extend(
1209
            i for i, num in enumerate(asym_types) if num == 1
1210
        )
1211
1212
    # Asymmetric unit
1213
    asym_unit = np.delete(asym_unit, remove_sites, axis=0)
1214
    asym_types = [s for i, s in enumerate(asym_types) if i not in remove_sites]
1215
    if asym_unit.shape[0] == 0:
1216
        raise ParseError(f'{block.name} has no valid sites')
1217
1218
    # Remove overlapping sites unless disorder == 'all_sites'
1219
    if disorder != 'all_sites':
1220
        keep_sites = _unique_sites(asym_unit, _EQUIV_SITE_TOL)
1221
        if not np.all(keep_sites):
1222
            msg = 'may have overlapping sites; duplicates will be removed'
1223
            warnings.warn(msg)
1224
        asym_unit = asym_unit[keep_sites]
1225
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1226
1227
    # Symmetry operations
1228
    sitesym = []
1229
    for tag in CIF_TAGS['symop']:
1230
        symop_loop = block.find([tag]).loop
1231
        if symop_loop is not None:
1232
            symop_loop_dict = _loop_to_dict_gemmi(symop_loop)
1233
            sitesym = symop_loop_dict[tag]
1234
            break
1235
    # Try spacegroup names/numbers?
1236
    if not sitesym:
1237
        sitesym = ['x,y,z',]
1238
1239
    # Apply symmetries to asymmetric unit
1240
    rot, trans = parse_sitesym(sitesym)
1241
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1242
    types = np.array([asym_types[i] for i in inverses])
1243
    motif = frac_motif @ cell
1244
1245
    return PeriodicSet(
1246
        motif=motif,
1247
        cell=cell,
1248
        name=block.name,
1249
        asymmetric_unit=asym_inds,
1250
        wyckoff_multiplicities=wyc_muls,
1251
        types=types
1252
    )
1253
1254
1255
def _expand_asym_unit(
1256
        asym_unit: np.ndarray,
1257
        rotations: np.ndarray,
1258
        translations: np.ndarray
1259
) -> Tuple[np.ndarray, ...]:
1260
    """
1261
    Asymmetric unit frac coords, list of rotations, list of translations
1262
    -->
1263
    full fractional motif, asymmetric unit indices, multiplicities,
1264
    inverses.
1265
    """
1266
1267
    frac_motif = []     # Full motif
1268
    asym_inds = [0]     # Indices of asymmetric unit 
1269
    multiplicities = [] # Wyckoff multiplicities
1270
    inverses = []       # Motif -> Asymmetric unit
1271
    m, dims = asym_unit.shape
1272
1273
    # Apply all symmetries first
1274
    expanded_sites = np.zeros((m, len(rotations), dims))
1275
    for i in range(m):
1276
        expanded_sites[i] = np.dot(rotations, asym_unit[i]) + translations
1277
    expanded_sites = np.mod(expanded_sites, 1)
1278
1279
    for inv, sites in enumerate(expanded_sites):
1280
1281
        multiplicity = 0
1282
1283
        for site_ in sites:
1284
1285
            if not frac_motif:
1286
                frac_motif.append(site_)
1287
                inverses.append(inv)
1288
                multiplicity += 1
1289
                continue
1290
1291
            # check if site_ overlaps with existing sites
1292
            diffs1 = np.abs(site_ - frac_motif)
1293
            diffs2 = np.abs(diffs1 - 1)
1294
            mask = np.all((diffs1 <= _EQUIV_SITE_TOL) | 
1295
                          (diffs2 <= _EQUIV_SITE_TOL), axis=-1)
1296
1297
            if np.any(mask): # site is not new
1298
                where_equal = np.argwhere(mask).flatten()
1299
                for ind in where_equal:
1300
                    if inverses[ind] == inv: # site is invariant
1301
                        pass
1302
                    else: # equivalent to a different site
1303
                        msg = f'has equivalent sites at positions' \
1304
                              f'{inverses[ind]}, {inv}'
1305
                        warnings.warn(msg)
1306
            else: # new site
1307
                frac_motif.append(site_)
1308
                inverses.append(inv)
1309
                multiplicity += 1
1310
1311
        if multiplicity > 0:
1312
            multiplicities.append(multiplicity)
1313
            asym_inds.append(len(frac_motif))
1314
1315
    frac_motif = np.array(frac_motif)
1316
    asym_inds = np.array(asym_inds[:-1])
1317
    multiplicities = np.array(multiplicities)
1318
1319
    return frac_motif, asym_inds, multiplicities, inverses
1320
1321
1322
@numba.njit()
1323
def _unique_sites(asym_unit, tol):
1324
    """Uniquify (within tol) a list of fractional coordinates,
1325
    considering all points modulo 1. Returns an array of bools such that
1326
    asym_unit[_unique_sites(asym_unit, tol)] is the uniquified list.
1327
    """
1328
1329
    site_diffs1 = np.abs(np.expand_dims(asym_unit, 1) - asym_unit)
1330
    site_diffs2 = np.abs(site_diffs1 - 1)
1331
    sites_neq_mask = np.logical_and((site_diffs1 > tol), (site_diffs2 > tol))
1332
    overlapping = np.triu(sites_neq_mask.sum(axis=-1) == 0, 1)
1333
    return overlapping.sum(axis=0) == 0
1334
1335
1336
def _has_disorder(label, occupancy):
1337
    """Return True if label ends with ? or occupancy is a number < 1.
1338
    """
1339
    return label.endswith('?') or (np.isscalar(occupancy) and occupancy < 1)
1340
1341
1342
def _parse_sitesym_pymatgen(data):
1343
    """Parse symmetry operations given data = block.data where block is
1344
    a pymatgen CifBlock object. If the symops are not present the space
1345
    group symbol is parsed and symops are generated.
1346
    """
1347
1348
    from pymatgen.symmetry.groups import SpaceGroup
1349
    from pymatgen.core.operations import SymmOp
1350
    import pymatgen.io.cif
1351
1352
    symops = []
1353
1354
    # Try to parse xyz symmetry operations
1355
    for symmetry_label in CIF_TAGS['symop']:
1356
1357
        xyz = data.get(symmetry_label)
1358
        if not xyz:
1359
            continue
1360
        if isinstance(xyz, str):
1361
            xyz = [xyz]
1362
        try:
1363
            symops = [SymmOp.from_xyz_string(s) for s in xyz]
1364
            break
1365
        except ValueError:
1366
            continue
1367
1368
    # Spacegroup symbol
1369
    if not symops:
1370
1371
        for symmetry_label in CIF_TAGS['spacegroup_name']:
1372
1373
            sg = data.get(symmetry_label)
1374
            if not sg:
1375
                continue
1376
            sg = re.sub(r'[\s_]', '', sg)
1377
1378
            try:
1379
                spg = pymatgen.io.cif.space_groups.get(sg)
1380
                if not spg:
1381
                    continue
1382
                symops = SpaceGroup(spg).symmetry_ops
1383
                break
1384
            except ValueError:
1385
                pass
1386
1387
            try:
1388
                for d in pymatgen.io.cif._get_cod_data():
1389
                    if sg == re.sub(r'\s+', '', d['hermann_mauguin']):
1390
                        xyz = d['symops']
1391
                        symops = [SymmOp.from_xyz_string(s) for s in xyz]
1392
                        break
1393
            except Exception:
1394
                continue
1395
1396
            if symops:
1397
                break
1398
1399
    # International number
1400
    if not symops:
1401
        for symmetry_label in CIF_TAGS['spacegroup_number']:
1402
            num = data.get(symmetry_label)
1403
            if not num:
1404
                continue
1405
1406
            try:
1407
                i = int(pymatgen.io.cif.str2float(num))
1408
                symops = SpaceGroup.from_int_number(i).symmetry_ops
1409
                break
1410
            except ValueError:
1411
                continue
1412
1413
    if not symops:
1414
        symops = [SymmOp.from_xyz_string(s) for s in ['x', 'y', 'z']]
1415
1416
    rotations = [op.rotation_matrix for op in symops]
1417
    translations = [op.translation_vector for op in symops]
1418
1419
    return rotations, translations
1420
1421
1422
def _frac_molecular_centres_ccdc(crystal):
1423
    """Returns the geometric centres of molecules in the unit cell.
1424
    Expects a ccdc Crystal object and returns fractional coordiantes.
1425
    """
1426
1427
    frac_centres = []
1428
    for comp in crystal.packing(inclusion='CentroidIncluded').components:
1429
        coords = [a.fractional_coordinates for a in comp.atoms]
1430
        x, y, z = zip(*coords)
1431
        m = len(coords)
1432
        frac_centres.append((sum(x) / m, sum(y) / m, sum(z) / m))
1433
    frac_centres = np.mod(np.array(frac_centres), 1)
1434
    return frac_centres[_unique_sites(frac_centres, _EQUIV_SITE_TOL)]
1435
1436
1437
def _heaviest_component_ccdc(molecule):
1438
    """Removes all but the heaviest component of the asymmetric unit.
1439
    Intended for removing solvents. Expects and returns a ccdc Molecule
1440
    object.
1441
    """
1442
1443
    component_weights = []
1444
    for component in molecule.components:
1445
        weight = 0
1446
        for a in component.atoms:
1447
            if isinstance(a.atomic_weight, (float, int)):
1448
                if isinstance(a.occupancy, (float, int)):
1449
                    weight += a.occupancy * a.atomic_weight
1450
                else:
1451
                    weight += a.atomic_weight
1452
        component_weights.append(weight)
1453
    largest_component_ind = np.argmax(np.array(component_weights))
1454
    molecule = molecule.components[largest_component_ind]
1455
    return molecule
1456
1457
1458
def _refcodes_from_families_ccdc(refcode_families):
1459
    """List of strings --> all CSD refcodes starting with any of the
1460
    strings. Intended to be passed a list of families and return all
1461
    refcodes in them.
1462
    """
1463
1464
    try:
1465
        import ccdc.search
1466
    except (ImportError, RuntimeError) as _:
1467
        msg = 'Failed to import csd-python-api, please check it is ' \
1468
              'installed and licensed.'
1469
        raise ImportError(msg)
1470
1471
    all_refcodes = []
1472
    for refcode in refcode_families:
1473
        query = ccdc.search.TextNumericSearch()
1474
        query.add_identifier(refcode)
1475
        hits = [hit.identifier for hit in query.search()]
1476
        all_refcodes.extend(hits)
1477
1478
    # filter to unique refcodes
1479
    seen = set()
1480
    seen_add = seen.add
1481
    refcodes = [
1482
        refcode for refcode in all_refcodes
1483
        if not (refcode in seen or seen_add(refcode))]
1484
1485
    return refcodes
1486
1487
1488
def _loop_to_dict_gemmi(gemmi_loop):
1489
    """gemmi Loop object --> dict, tags: values
1490
    """
1491
1492
    tablified_loop = [[] for _ in range(len(gemmi_loop.tags))]
1493
    n_cols = gemmi_loop.width()
1494
    for i, item in enumerate(gemmi_loop.values):
1495
        tablified_loop[i % n_cols].append(item)
1496
    return {tag: l for tag, l in zip(gemmi_loop.tags, tablified_loop)}
1497
1498
1499
def _snap_small_prec_coords(frac_coords):
1500
    """Find where frac_coords is within 1e-4 of 1/3 or 2/3, change to
1501
    1/3 and 2/3. Recommended by pymatgen's CIF parser.
1502
    """
1503
    frac_coords[np.abs(1 - 3 * frac_coords) < 1e-4] = 1 / 3.
1504
    frac_coords[np.abs(1 - 3 * frac_coords / 2) < 1e-4] = 2 / 3.
1505
    return frac_coords
1506