Passed
Push — master ( 3151ff...9779af )
by Daniel
08:02
created

amd.io._cif_val_to_float()   B

Complexity

Conditions 6

Size

Total Lines 15
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 15
rs 8.6666
c 0
b 0
f 0
cc 6
nop 1
1
"""Tools for reading crystals from files, or from the CSD with
0 ignored issues
show
coding-style introduced by
Too many lines in module (1449/1000)
Loading history...
2
``csd-python-api``. The readers return
3
:class:`amd.PeriodicSet <.periodicset.PeriodicSet>` objects representing the
4
crystal which can be passed to :func:`amd.AMD() <.calculate.AMD>` and
5
:func:`amd.PDD() <.calculate.PDD>` to get their invariants.
6
"""
7
8
import os
9
import re
10
import functools
11
import warnings
12
from typing import Tuple
13
14
import numpy as np
15
import numba
16
17
import ase.io.cif
18
import ase.data
19
import ase.spacegroup
20
from ase.spacegroup.spacegroup import parse_sitesym as ase_parse_sitesym
21
22
from .utils import cellpar_to_cell
23
from .periodicset import PeriodicSet
24
25
26
def _custom_warning(message, category, filename, lineno, *args, **kwargs):
0 ignored issues
show
Unused Code introduced by
The argument filename seems to be unused.
Loading history...
Unused Code introduced by
The argument lineno seems to be unused.
Loading history...
Unused Code introduced by
The argument args seems to be unused.
Loading history...
Unused Code introduced by
The argument kwargs seems to be unused.
Loading history...
27
    return f'{category.__name__}: {message}\n'
28
29
warnings.formatwarning = _custom_warning
30
31
_EQUIV_SITE_TOL = 1e-3
32
_DISORDER_OPTIONS = {'skip', 'ordered_sites', 'all_sites'}
33
34
CIF_TAGS = {
35
    'cellpar': [
36
        '_cell_length_a', 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
37
        '_cell_length_b', 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
38
        '_cell_length_c',
39
        '_cell_angle_alpha', 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
40
        '_cell_angle_beta', 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
41
        '_cell_angle_gamma',],
42
43
    'atom_site_fract': [
44
        '_atom_site_fract_x',
45
        '_atom_site_fract_y',
46
        '_atom_site_fract_z',],
47
48
    'atom_site_cartn': [
49
        '_atom_site_cartn_x',
50
        '_atom_site_cartn_y',
51
        '_atom_site_cartn_z',],
52
53
    'atom_symbol': [
54
        '_atom_site_type_symbol',
55
        '_atom_site_label',],
56
57
    'symop': [
58
        '_symmetry_equiv_pos_as_xyz',
59
        '_space_group_symop_operation_xyz',
60
        '_space_group_symop.operation_xyz',
61
        '_symmetry_equiv_pos_as_xyz_',
62
        '_space_group_symop_operation_xyz_',],
63
64
    'spacegroup_name': [
65
        '_space_group_name_Hall',
66
        '_space_group_name_Hall_',
67
        '_symmetry_space_group_name_hall',
68
        '_symmetry_space_group_name_hall_',
69
        '_space_group_name_H-M_alt',
70
        '_space_group_name_H-M_alt_',
71
        '_symmetry_space_group_name_H-M',
72
        '_symmetry_space_group_name_H-M_',
73
        '_symmetry_space_group_name_H_M',
74
        '_symmetry_space_group_name_H_M_',
75
        '_symmetry_space_group_name_h-m',
76
        '_symmetry_space_group_name_h-m_',],
77
78
    'spacegroup_number': [
79
        '_space_group_IT_number',
80
        '_symmetry_Int_Tables_number',
81
        '_space_group_IT_number_',
82
        '_symmetry_Int_Tables_number_',],
83
}
84
85
86
class _Reader:
87
88
    def __init__(
0 ignored issues
show
best-practice introduced by
Too many arguments (6/5)
Loading history...
89
            self,
90
            remove_hydrogens=False,
91
            disorder='skip',
92
            heaviest_component=False,
93
            molecular_centres=False,
94
            show_warnings=True
95
    ):
96
97
        if disorder not in _DISORDER_OPTIONS:
98
            msg = 'disorder parameter must be one of ' \
99
                  f'{_DISORDER_OPTIONS} (passed {disorder})'
100
            raise ValueError(msg)
101
102
        self.remove_hydrogens = remove_hydrogens
103
        self.disorder = disorder
104
        self.heaviest_component = heaviest_component
105
        self.molecular_centres = molecular_centres
106
        self.show_warnings = show_warnings
107
        self._backend_generator = None
108
        self._converter = None
109
110
    def __iter__(self):
111
        if self._backend_generator is None or self._converter is None:
112
            raise RuntimeError(f'{self.__class__.__name__} not initialized.')
113
        return self
114
115
    def __next__(self):
116
        """Iterates over self._backend_generator, passing items through self._converter. 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
117
        Catches ParseError + warnings raised in self._converter, optionally printing them.
118
        """
119
120
        if not self.show_warnings:
121
            warnings.simplefilter('ignore')
122
123
        while True:
124
125
            item = next(self._backend_generator) # will raise StopIteration when done
126
127
            with warnings.catch_warnings(record=True) as warning_msgs:
128
                msg = None
129
                try:
130
                    periodic_set = self._converter(item)
0 ignored issues
show
Bug introduced by
self._converter does not seem to be callable.
Loading history...
131
                except ParseError as err:
132
                    msg = str(err)
133
134
            if msg:
135
                warnings.warn(msg)
136
                continue
137
138
            for warning in warning_msgs:
139
                msg = f'{periodic_set.name} {warning.message}'
140
                warnings.warn(msg, category=warning.category)
141
142
            return periodic_set
143
144
    def read(self):
145
        """Reads the crystal(s), returns one 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
146
        :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` if there is only
147
        one, otherwise returns a list. (Note the return type is not
148
        consistent!)"""
149
        l = list(self)
150
        if len(l) == 1:
0 ignored issues
show
unused-code introduced by
Unnecessary "else" after "return"
Loading history...
151
            return l[0]
152
        else:
153
            return l
154
155
156
class CifReader(_Reader):
157
    """Read all structures in a .cif file or all files in a folder
158
    with ase or csd-python-api (if installed), yielding
159
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
160
161
    Parameters
162
    ----------
163
    path : str
164
        Path to a .cif file or directory. (Other files are accepted when using
165
        ``reader='ccdc'``, if csd-python-api is installed.)
166
    reader : str, optional
167
        The backend package used for parsing. Default is :code:`ase`,
168
        to use csd-python-api change to :code:`ccdc`. The ccdc reader should 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
169
        be able to read any format accepted by :class:`ccdc.io.EntryReader`,
170
        though only cifs have been tested.
171
    remove_hydrogens : bool, optional
172
        Remove Hydrogens from the crystal.
173
    disorder : str, optional
174
        Controls how disordered structures are handled. Default is ``skip``
175
        which skips any crystal with disorder, since disorder conflicts with
176
        the periodic set model. To read disordered structures anyway, choose
177
        either :code:`ordered_sites` to remove atoms with disorder or
178
        :code:`all_sites` include all atoms regardless of disorder.
179
    heaviest_component : bool, optional
180
        csd-python-api only. Removes all but the heaviest molecule in the
181
        asymmeric unit, intended for removing solvents.
182
    molecular_centres : bool, default False
183
        csd-python-api only. Extract the centres of molecules in the unit cell
184
        and store in the attribute molecular_centres.
185
    show_warnings : bool, optional
186
        Controls whether warnings that arise during reading are printed.
187
188
    Yields
189
    ------
190
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
191
        Represents the crystal as a periodic set, consisting of a finite set
192
        of points (motif) and lattice (unit cell). Contains other data, e.g.
193
        the crystal's name and information about the asymmetric unit.
194
195
    Examples
196
    --------
197
    
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
198
        ::
199
200
            # Put all crystals in a .CIF in a list
201
            structures = list(amd.CifReader('mycif.cif'))
202
203
            # Can also accept path to a directory, reading all files inside
204
            structures = list(amd.CifReader('path/to/folder'))
205
206
            # Reads just one if the .CIF has just one crystal
207
            periodic_set = amd.CifReader('mycif.cif').read()
208
209
            # List of AMDs (k=100) of crystals in a .CIF
210
            amds = [amd.AMD(periodic_set, 100) for periodic_set in amd.CifReader('mycif.cif')]
211
    """
212
213
    def __init__(
0 ignored issues
show
best-practice introduced by
Too many arguments (8/5)
Loading history...
214
            self,
215
            path,
216
            reader='ase',
217
            remove_hydrogens=False,
218
            disorder='skip',
219
            heaviest_component=False,
220
            molecular_centres=False,
221
            show_warnings=True,
222
    ):
223
224
        super().__init__(
225
            remove_hydrogens=remove_hydrogens,
226
            disorder=disorder,
227
            heaviest_component=heaviest_component,
228
            molecular_centres=molecular_centres,
229
            show_warnings=show_warnings
230
        )
231
232
        # file_parser: Callable(path) -> Iterable[External object]
233
        # self._converter: Callable(External object) -> PeriodicSet
234
        # set options in self._converter with functools.partial
235
236
        if reader != 'ccdc':
237
238
            if heaviest_component:
239
                msg = 'Parameter heaviest_component ' \
240
                      'only implemented for reader="ccdc".'
241
                raise NotImplementedError(msg)
242
243
            if molecular_centres:
244
                msg = 'Parameter molecular_centres ' \
245
                      'only implemented for reader="ccdc".'
246
                raise NotImplementedError(msg)
247
248
        if reader in ('ase', 'pycodcif'):
249
250
            extensions = {'cif'}
251
            file_parser = functools.partial(ase.io.cif.parse_cif, reader=reader)
252
            self._converter = functools.partial(
253
                periodicset_from_ase_cifblock,
254
                remove_hydrogens=self.remove_hydrogens,
255
                disorder=self.disorder
256
            )
257
258
        elif reader == 'pymatgen':
259
260
            extensions = {'cif'}
261
            file_parser = CifReader._pymatgen_cifblock_generator
262
            self._converter = functools.partial(
263
                periodicset_from_pymatgen_cifblock,
264
                remove_hydrogens=self.remove_hydrogens,
265
                disorder=self.disorder
266
            )
267
268
        elif reader == 'gemmi':
269
270
            import gemmi.cif
0 ignored issues
show
introduced by
Import outside toplevel (gemmi.cif)
Loading history...
introduced by
Unable to import 'gemmi.cif'
Loading history...
271
272
            extensions = {'cif'}
273
            file_parser = gemmi.cif.read_file
274
            self._converter = functools.partial(
275
                periodicset_from_gemmi_block,
276
                remove_hydrogens=self.remove_hydrogens,
277
                disorder=self.disorder
278
            )
279
280
        elif reader == 'ccdc':
281
282
            try:
283
                import ccdc.io
0 ignored issues
show
introduced by
Import outside toplevel (ccdc.io)
Loading history...
284
            except (ImportError, RuntimeError) as e:
285
                msg = 'Failed to import csd-python-api, please' \
286
                      'check it is installed and licensed.'
287
                raise ImportError(msg) from e
288
289
            extensions = ccdc.io.EntryReader.known_suffixes
290
            file_parser = ccdc.io.EntryReader
291
            self._converter = functools.partial(
292
                periodicset_from_ccdc_entry,
293
                remove_hydrogens=self.remove_hydrogens,
294
                disorder=self.disorder,
295
                molecular_centres=self.molecular_centres,
296
                heaviest_component=self.heaviest_component
297
            )
298
299
        else:
300
            raise ValueError(f'Unknown reader {reader}.')
301
302
        if os.path.isfile(path):
303
            self._backend_generator = iter(file_parser(path))
304
        elif os.path.isdir(path):
305
            self._backend_generator = iter(self._generate_from_dir(path, file_parser, extensions))
306
        else:
307
            raise FileNotFoundError(f'No such file or directory: {path}')
308
309
    def _generate_from_dir(self, path, file_parser, extensions):
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
310
        for file in os.listdir(path):
311
            suff = os.path.splitext(file)[1][1:]
312
            if suff.lower() in extensions:
313
                yield from file_parser(os.path.join(path, file))
314
315
    @staticmethod
316
    def _pymatgen_cifblock_generator(path):
317
        """Path to .cif --> generator of pymatgen CifBlocks."""
318
        from pymatgen.io.cif import CifFile
0 ignored issues
show
introduced by
Unable to import 'pymatgen.io.cif'
Loading history...
introduced by
Import outside toplevel (pymatgen.io.cif.CifFile)
Loading history...
319
        yield from CifFile.from_file(path).data.values()
320
321
322
class CSDReader(_Reader):
323
    """Read structures from the CSD with csd-python-api, yielding 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
324
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>` s.
325
326
    Parameters
327
    ----------
328
    refcodes : str or List[str], optional
329
        Single or list of CSD refcodes to read. If None or 'CSD', iterates
330
        over the whole CSD.
331
    families : bool, optional
332
        Read all entries whose refcode starts with the given strings, or
333
        'families' (e.g. giving 'DEBXIT' reads all entries starting with
334
        DEBXIT).
335
    remove_hydrogens : bool, optional
336
        Remove hydrogens from the crystal.
337
    disorder : str, optional
338
        Controls how disordered structures are handled. Default is ``skip``
339
        which skips any crystal with disorder, since disorder conflicts with
340
        the periodic set model. To read disordered structures anyway, choose
341
        either :code:`ordered_sites` to remove atoms with disorder or
342
        :code:`all_sites` include all atoms regardless of disorder.
343
    heaviest_component : bool, optional
344
        Removes all but the heaviest molecule in the asymmeric unit, intended
345
        for removing solvents.
346
    molecular_centres : bool, default False
347
        Extract the centres of molecules in the unit cell and store in
348
        attribute molecular_centres.
349
    show_warnings : bool, optional
350
        Controls whether warnings that arise during reading are printed.
351
352
    Yields
353
    ------
354
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
355
        Represents the crystal as a periodic set, consisting of a finite set
356
        of points (motif) and lattice (unit cell). Contains other useful data,
357
        e.g. the crystal's name and information about the asymmetric unit for
358
        calculation.
359
360
    Examples
361
    --------
362
    
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
363
        ::
364
365
            # Put these entries in a list
366
            refcodes = ['DEBXIT01', 'DEBXIT05', 'HXACAN01']
367
            structures = list(amd.CSDReader(refcodes))
368
369
            # Read refcode families (any whose refcode starts with strings in the list)
370
            refcode_families = ['ACSALA', 'HXACAN']
371
            structures = list(amd.CSDReader(refcode_families, families=True))
372
373
            # Get AMDs (k=100) for crystals in these families
374
            refcodes = ['ACSALA', 'HXACAN']
375
            amds = []
376
            for periodic_set in amd.CSDReader(refcodes, families=True):
377
                amds.append(amd.AMD(periodic_set, 100))
378
379
            # Giving the reader nothing reads from the whole CSD.
380
            for periodic_set in amd.CSDReader():
381
                ...
382
    """
383
384
    def __init__(
0 ignored issues
show
best-practice introduced by
Too many arguments (8/5)
Loading history...
385
            self,
386
            refcodes=None,
387
            families=False,
388
            remove_hydrogens=False,
389
            disorder='skip',
390
            heaviest_component=False,
391
            molecular_centres=False,
392
            show_warnings=True,
393
    ):
394
395
        super().__init__(
396
            remove_hydrogens=remove_hydrogens,
397
            disorder=disorder,
398
            heaviest_component=heaviest_component,
399
            molecular_centres=molecular_centres,
400
            show_warnings=show_warnings,
401
        )
402
403
        try:
404
            import ccdc.io
0 ignored issues
show
introduced by
Import outside toplevel (ccdc.io)
Loading history...
405
        except (ImportError, RuntimeError) as _:
406
            msg = 'Failed to import csd-python-api, please' \
407
                  'check it is installed and licensed.'
408
            raise ImportError(msg)
409
410
        if isinstance(refcodes, str) and refcodes.lower() == 'csd':
411
            refcodes = None
412
413
        if refcodes is None:
414
            families = False
415
        else:
416
            if isinstance(refcodes, str):
417
                refcodes = [refcodes]
418
            else:
419
                refcodes = list(refcodes)
420
421
        if families:
422
            refcodes = _refcodes_from_families(refcodes)
423
424
        # self._entry_reader = ccdc.io.EntryReader('CSD')
425
        entry_reader = ccdc.io.EntryReader('CSD')
426
        self._converter = functools.partial(
427
            periodicset_from_ccdc_entry,
428
            remove_hydrogens=self.remove_hydrogens,
429
            disorder=self.disorder,
430
            molecular_centres=self.molecular_centres,
431
            heaviest_component=self.heaviest_component
432
        )
433
        self._backend_generator = iter(self._ccdc_generator(refcodes, entry_reader))
434
435
    @staticmethod
436
    def _ccdc_generator(refcodes, entry_reader):
437
        """Generates ccdc Entries from CSD refcodes."""
438
439
        if refcodes is None:
440
            for entry in entry_reader:
441
                yield entry
442
        else:
443
            for refcode in refcodes:
444
                try:
445
                    entry = entry_reader.entry(refcode)
446
                    yield entry
447
                except RuntimeError:
448
                    warnings.warn(f'{refcode} not found in database')
449
450
451
class ParseError(ValueError):
452
    """Raised when an item cannot be parsed into a periodic set."""
453
    pass
0 ignored issues
show
Unused Code introduced by
Unnecessary pass statement
Loading history...
454
455
456
def periodicset_from_ase_cifblock(
457
        block,
458
        remove_hydrogens=False,
459
        disorder='skip'
460
) -> PeriodicSet:
461
    """:class:`ase.io.cif.CIFBlock` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
462
    CIFBlock is the type returned by :class:`ase.io.cif.parse_cif`.
463
464
    Parameters
465
    ----------
466
    block : :class:`ase.io.cif.CIFBlock`
467
        An ase CIFBlock object representing a crystal.
468
    remove_hydrogens : bool, optional
469
        Remove Hydrogens from the crystal.
470
    disorder : str, optional
471
        Controls how disordered structures are handled. Default is ``skip``
472
        which raises a ParseError if the input has any disorder, since
473
        disorder conflicts with the periodic set model. To read disordered
474
        structures anyway, choose either :code:`ordered_sites` to remove sites
475
        with disorder or :code:`all_sites` include all sites regardless.
476
477
    Returns
478
    -------
479
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
480
        Represents the crystal as a periodic set, consisting of a finite set
481
        of points (motif) and lattice (unit cell). Contains other useful data,
482
        e.g. the crystal's name and information about the asymmetric unit for
483
        calculation.
484
485
    Raises
486
    ------
487
    ParseError :
488
        Raised if the structure fails to be parsed for any of the following:
489
        1. Required data is missing (e.g. cell parameters),
490
        2. disorder == 'skip' and any of:
491
            (a) any atom has fractional occupancy,
492
            (b) any atom's label ends with '?',
493
        3. The motif is empty after removing H or disordered sites.
494
    """
495
496
    # get cell
497
    try:
498
        cellpar = [block[tag] for tag in CIF_TAGS['cellpar']]
499
    except KeyError:
500
        raise ParseError(f'{block.name} has missing cell data')
501
    cell = cellpar_to_cell(*cellpar)
502
503
    # asymmetric unit fractional coords
504
    asym_unit = [block.get(name) for name in CIF_TAGS['atom_site_fract']]
505
    if None in asym_unit: # missing fractional coords, try cartesian
506
        asym_motif = [block.get(name) for name in CIF_TAGS['atom_site_cartn']]
507
        if None in asym_motif:
508
            raise ParseError(f'{block.name} has missing coordinate data')
509
        asym_unit = np.array(asym_motif) @ np.linalg.inv(cell)
510
511
    elif any(None in coords for coords in asym_unit):
512
        raise ParseError(f'{block.name} has missing coordinate data')
513
514
    # if there's a ?, . or other string, remove site
515
    if any(isinstance(c, str) for coords in asym_unit for c in coords):
516
        warnings.warn(f'atoms without sites or unparsable data will be removed')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
517
        asym_unit_ = [[], [], []]
518
        for xyz in zip(*asym_unit):
519
            if not any(isinstance(coord, str) for coord in xyz):
520
                for i in range(3):
521
                    asym_unit_[i].append(xyz[i])
522
        asym_unit = asym_unit_
523
524
    asym_unit = np.mod(np.array(asym_unit).T, 1)
525
    # asym_unit = _snap_small_prec_coords(asym_unit)
526
527
    # atomic types
528
    for tag in CIF_TAGS['atom_symbol']:
529
        if tag in block:
530
            asym_types = []
531
            for label in block[tag]:
532
                if label in ('.', '?'):
533
                    warnings.warn(f'missing atomic type data')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
534
                    asym_types.append(0)
535
                # remove additional labels
536
                sym = re.search(r'([A-Z][a-z]?)', label).group(0)
537
                if sym == 'D': sym = 'H'
0 ignored issues
show
Coding Style introduced by
More than one statement on a single line
Loading history...
538
                asym_types.append(ase.data.atomic_numbers[sym])
539
            break
540
    else:
541
        warnings.warn(f'missing atomic types')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
542
        asym_types = [0 for _ in range(len(asym_unit))]
543
544
    # symmetry operations
545
    sitesym = []
546
    for tag in CIF_TAGS['symop']: # try symop tags first
547
        if tag in block:
548
            sitesym = block[tag]
549
            break
550
    if isinstance(sitesym, str):
551
        sitesym = [sitesym]
552
    if not sitesym: # if no symops, get ops from spacegroup
553
        try:
554
            spg = block.get_spacegroup(True)
555
            rot, trans = spg.rotations, spg.translations
556
        except:
0 ignored issues
show
Coding Style Best Practice introduced by
General except handlers without types should be used sparingly.

Typically, you would use general except handlers when you intend to specifically handle all types of errors, f.e. when logging. Otherwise, such general error handlers can mask errors in your application that you want to know of.

Loading history...
557
            rot, trans = ase_parse_sitesym(['x,y,z'])
558
    else:
559
        rot, trans = ase_parse_sitesym(sitesym)
560
561
    # (if needed) remove Hydrogens, fract occupancy atoms
562
    remove_sites = [] # indices of asymmetric unit sites to remove
563
    occupancies = block.get('_atom_site_occupancy')
564
    labels = block.get('_atom_site_label')
565
    if occupancies is not None:
566
        if disorder == 'skip':
567
            if any(_atom_has_disorder(lab, occ) for lab, occ in zip(labels, occupancies)):
568
                raise ParseError(f'{block.name} has disorder')
569
        elif disorder == 'ordered_sites':
570
            remove_sites.extend(
571
                (i for i, (lab, occ) in enumerate(zip(labels, occupancies))
0 ignored issues
show
introduced by
The variable i does not seem to be defined for all execution paths.
Loading history...
572
                 if _atom_has_disorder(lab, occ)))
573
574
    if remove_hydrogens:
575
        remove_sites.extend((i for i, num in enumerate(asym_types) if num == 1))
576
577
    asym_unit = np.delete(asym_unit, remove_sites, axis=0)
578
    asym_types = [s for i, s in enumerate(asym_types) if i not in remove_sites]
579
580
    # if disorder == 'all_sites', don't remove overlapping sites
581
    if disorder != 'all_sites':
582
        keep_sites = _unique_sites(asym_unit)
583
        if not np.all(keep_sites):
584
            warnings.warn(f'may have overlapping sites; duplicates will be removed')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
585
        asym_unit = asym_unit[keep_sites]
586
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
587
588
    if asym_unit.shape[0] == 0:
589
        raise ParseError(f'{block.name} has no valid sites')
590
591
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
592
    full_types = np.array([asym_types[i] for i in inverses])
593
    motif = frac_motif @ cell
594
595
    kwargs = {
596
        'name': block.name,
597
        'asymmetric_unit': asym_inds,
598
        'wyckoff_multiplicities': wyc_muls,
599
        'types': full_types
600
    }
601
602
    return PeriodicSet(motif, cell, **kwargs)
603
604
605
# TODO: function needs to be finished.
0 ignored issues
show
Coding Style introduced by
TODO and FIXME comments should generally be avoided.
Loading history...
606
def periodicset_from_ase_atoms(
607
        atoms,
608
        remove_hydrogens=False,
609
        disorder='skip'
610
) -> PeriodicSet:
611
    """:class:`ase.atoms.Atoms` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
612
613
    Parameters
614
    ----------
615
    atoms : :class:`ase.atoms.Atoms`
616
        An ase Atoms object representing a crystal.
617
    remove_hydrogens : bool, optional
618
        Remove Hydrogens from the crystal.
619
    disorder : str, optional
620
        Controls how disordered structures are handled. Default is ``skip``
621
        which raises a ParseError if the input has any disorder, since
622
        disorder conflicts with the periodic set model. To read disordered
623
        structures anyway, choose either :code:`ordered_sites` to remove sites
624
        with disorder or :code:`all_sites` include all sites regardless.
625
626
    Returns
627
    -------
628
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
629
        Represents the crystal as a periodic set, consisting of a finite set
630
        of points (motif) and lattice (unit cell). Contains other useful data,
631
        e.g. the crystal's name and information about the asymmetric unit for
632
        calculation.
633
634
    Raises
635
    ------
636
    ParseError      ## rewrite!!
637
        Raised if the structure can/should not be parsed for the following reasons:
638
        1. no sites found or motif is empty after removing H or disordered sites,
639
        2. a site has missing coordinates,
640
        3. disorder == 'skip' and any of:
641
            (a) any atom has fractional occupancy,
642
            (b) any atom's label ends with '?'.
643
    """
644
645
    if remove_hydrogens:
646
        for i in np.where(atoms.get_atomic_numbers() == 1)[0][::-1]:
647
            atoms.pop(i)
648
649
    if disorder == 'skip':
650
        occ_info = atoms.info['occupancy']
651
        if occ_info is not None:
652
            for d in occ_info.values():
653
                for occ in d.values():
654
                    if float(occ) < 1:
655
                        raise ParseError(f'Ase atoms object has disorder')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
656
657
    elif disorder == 'ordered_sites':
658
        occ_info = atoms.info['occupancy']
659
        if occ_info is not None:
660
            remove_inds = []
661
            for i, d in occ_info.items():
662
                for occ in d.values():
663
                    if float(occ) < 1:
664
                        remove_inds.append(int(i))
665
666
    # .pop removes just one atom. 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
667
668
    # if disorder == 'skip':
669
    #     if not structure.is_ordered:
670
    #         raise ParseError(f'{name} has disorder')
671
    # elif disorder == 'ordered_sites':
672
    #     remove_inds = []
673
    #     for i, comp in enumerate(structure.species_and_occu):
674
    #         if comp.num_atoms < 1:
675
    #             remove_inds.append(i)
676
    #     structure.remove_sites(remove_inds)
677
678
    cell = atoms.get_cell().array
679
    sg = atoms.info['spacegroup']
680
    asym_unit = ase.spacegroup.get_basis(atoms)
681
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, sg.rotations, sg.translations)
0 ignored issues
show
Unused Code introduced by
The variable inverses seems to be unused.
Loading history...
682
    motif = frac_motif @ cell
683
    
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
684
    kwargs = {
685
        'name': None,
686
        'asymmetric_unit': asym_inds,
687
        'wyckoff_multiplicities': wyc_muls,
688
        'types': atoms.get_atomic_numbers()
689
    }
690
691
    return PeriodicSet(motif, cell, **kwargs)
692
693
694
def periodicset_from_pymatgen_cifblock(
695
        block,
696
        remove_hydrogens=False,
697
        disorder='skip',
698
) -> PeriodicSet:
699
    """:class:`pymatgen.io.cif.CifBlock` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
700
701
    Parameters
702
    ----------
703
    block : :class:`pymatgen.core.cif.CifBlock`
704
        A pymatgen CifBlock object representing a crystal.
705
    remove_hydrogens : bool, optional
706
        Remove Hydrogens from the crystal.
707
    disorder : str, optional
708
        Controls how disordered structures are handled. Default is ``skip``
709
        which raises a ParseError if the input has any disorder, since
710
        disorder conflicts with the periodic set model. To read disordered
711
        structures anyway, choose either :code:`ordered_sites` to remove sites
712
        with disorder or :code:`all_sites` include all sites regardless.
713
714
    Returns
715
    -------
716
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
717
        Represents the crystal as a periodic set, consisting of a finite set
718
        of points (motif) and lattice (unit cell). Contains other useful data,
719
        e.g. the crystal's name and information about the asymmetric unit for
720
        calculation.
721
722
    Raises
723
    ------
724
    ParseError
725
        Raised if the structure can/should not be parsed for the following reasons:
726
        1. no sites found or motif is empty after removing H or disordered sites,
727
        2. a site has missing coordinates,
728
        3. disorder == 'skip' and any of:
729
            (a) any atom has fractional occupancy,
730
            (b) any atom's label ends with '?'.
731
    """
732
733
    from pymatgen.core.periodic_table import _pt_data
0 ignored issues
show
introduced by
Unable to import 'pymatgen.core.periodic_table'
Loading history...
introduced by
Import outside toplevel (pymatgen.core.periodic_table._pt_data)
Loading history...
734
735
    odict = block.data
736
737
    try:
738
        cellpar = [_cif_val_to_float(odict[tag]) for tag in CIF_TAGS['cellpar']]
739
        cell = cellpar_to_cell(*cellpar)
740
    except KeyError:
741
        raise ParseError(f'{block.header} has missing cell information')
742
743
    # check for missing data ?
744
    asym_unit = [[_cif_val_to_float(s) for s in odict.get(name)] 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
745
                 for name in CIF_TAGS['atom_site_fract']]
746
    if None in asym_unit:
747
        asym_motif = [[_cif_val_to_float(s) for s in odict.get(name)] 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
748
                      for name in CIF_TAGS['atom_site_cartn']]
749
        if None in asym_motif:
750
            raise ParseError(f'{block.header} has no sites')
751
        asym_unit = np.array(asym_motif) @ np.linalg.inv(cell)
752
753
    asym_unit = np.mod(np.array(asym_unit).T, 1)
754
    # asym_unit = _snap_small_prec_coords(asym_unit)
755
756
    # check _atom_site_label as well?
757
    symbols = odict.get('_atom_site_type_symbol')
758
    if symbols is None:
759
        warnings.warn(f'missing atomic types')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
760
        asym_types = [0 for _ in range(len(asym_unit))]
761
    else:
762
        asym_types = [_pt_data[s]['Atomic no'] for s in symbols]
763
764
    remove_sites = []
765
766
    occupancies = odict.get('_atom_site_occupancy')
767
    labels = odict.get('_atom_site_label')
768
    if occupancies is not None:
769
        if disorder == 'skip':
770
            if any(_atom_has_disorder(lab, occ) for lab, occ in zip(labels, occupancies)):
771
                raise ParseError(f'{block.header} has disorder')
772
        elif disorder == 'ordered_sites':
773
            remove_sites.extend(
774
                (i for i, (lab, occ) in enumerate(zip(labels, occupancies))
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable i does not seem to be defined.
Loading history...
775
                 if _atom_has_disorder(lab, occ)))
776
777
    if remove_hydrogens:
778
        remove_sites.extend((i for i, num in enumerate(asym_types) if num == 1))
779
780
    asym_unit = np.delete(asym_unit, remove_sites, axis=0)
781
    asym_types = [s for i, s in enumerate(asym_types) if i not in remove_sites]
782
783
    if disorder != 'all_sites':
784
        keep_sites = _unique_sites(asym_unit)
785
        if not np.all(keep_sites):
786
            warnings.warn(f'may have overlapping sites; duplicates will be removed')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
787
        asym_unit = asym_unit[keep_sites]
788
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
789
790
    if asym_unit.shape[0] == 0:
791
        raise ParseError(f'{block.header} has no valid sites')
792
793
    rot, trans = _parse_sitesym_pymatgen(odict)
794
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
795
    full_types = np.array([asym_types[i] for i in inverses])
796
    motif = frac_motif @ cell
797
798
    kwargs = {
799
        'name': block.header,
800
        'asymmetric_unit': asym_inds,
801
        'wyckoff_multiplicities': wyc_muls,
802
        'types': full_types
803
    }
804
805
    return PeriodicSet(motif, cell, **kwargs)
806
807
808
# TODO: add Raises to docstr
0 ignored issues
show
Coding Style introduced by
TODO and FIXME comments should generally be avoided.
Loading history...
809
def periodicset_from_pymatgen_structure(
810
        structure,
811
        remove_hydrogens=False,
812
        disorder='skip'
813
) -> PeriodicSet:
814
    """:class:`pymatgen.core.structure.Structure` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
815
    Does not set the name of the periodic set, as there seems to be no such attribute in 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
816
    the pymatgen Structure object.
817
818
    Parameters
819
    ----------
820
    structure : :class:`pymatgen.core.structure.Structure`
821
        A pymatgen Structure object representing a crystal.
822
    remove_hydrogens : bool, optional
823
        Remove Hydrogens from the crystal.
824
    disorder : str, optional
825
        Controls how disordered structures are handled. Default is ``skip``
826
        which raises a ParseError if the input has any disorder, since
827
        disorder conflicts with the periodic set model. To read disordered
828
        structures anyway, choose either :code:`ordered_sites` to remove sites
829
        with disorder or :code:`all_sites` include all sites regardless.
830
831
    Returns
832
    -------
833
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
834
        Represents the crystal as a periodic set, consisting of a finite set
835
        of points (motif) and lattice (unit cell). Contains other useful data,
836
        e.g. the crystal's name and information about the asymmetric unit for
837
        calculation.
838
    """
839
840
    from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
0 ignored issues
show
introduced by
Unable to import 'pymatgen.symmetry.analyzer'
Loading history...
introduced by
Import outside toplevel (pymatgen.symmetry.analyzer.SpacegroupAnalyzer)
Loading history...
841
842
    if remove_hydrogens:
843
        structure.remove_species(['H', 'D'])
844
845
    if disorder == 'skip':
846
        if not structure.is_ordered:
847
            raise ParseError(f'Pymatgen structure object has disorder')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
848
    elif disorder == 'ordered_sites':
849
        remove_inds = []
850
        for i, comp in enumerate(structure.species_and_occu):
851
            if comp.num_atoms < 1:
852
                remove_inds.append(i)
853
        structure.remove_sites(remove_inds)
854
855
    motif = structure.cart_coords
856
    cell = structure.lattice.matrix
857
    sym_structure = SpacegroupAnalyzer(structure).get_symmetrized_structure()
858
    equiv_inds = sym_structure.equivalent_indices
859
860
    kwargs = {
861
        'asymmetric_unit': np.array([l[0] for l in equiv_inds]),
862
        'wyckoff_multiplicities': np.array([len(l) for l in equiv_inds]),
863
        'types': np.array(sym_structure.atomic_numbers)
864
    }
865
866
    return PeriodicSet(motif, cell, **kwargs)
867
868
869
def periodicset_from_ccdc_entry(
870
        entry,
871
        remove_hydrogens=False,
872
        disorder='skip',
873
        heaviest_component=False,
874
        molecular_centres=False
875
) -> PeriodicSet:
876
    """:class:`ccdc.entry.Entry` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
877
    Entry is the type returned by :class:`ccdc.io.EntryReader`.
878
879
    Parameters
880
    ----------
881
    entry : :class:`ccdc.entry.Entry`
882
        A ccdc Entry object representing a database entry.
883
    remove_hydrogens : bool, optional
884
        Remove Hydrogens from the crystal.
885
    disorder : str, optional
886
        Controls how disordered structures are handled. Default is ``skip``
887
        which raises a ParseError if the input has any disorder, since
888
        disorder conflicts with the periodic set model. To read disordered
889
        structures anyway, choose either :code:`ordered_sites` to remove sites
890
        with disorder or :code:`all_sites` include all sites regardless.
891
    heaviest_component : bool, optional
892
        Removes all but the heaviest molecule in the asymmeric unit, intended
893
        for removing solvents.
894
    molecular_centres : bool, default False
895
        Extract the centres of molecules in the unit cell and store in 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
896
        the attribute molecular_centres of the returned PeriodicSet.
897
898
    Returns
899
    -------
900
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
901
        Represents the crystal as a periodic set, consisting of a finite set
902
        of points (motif) and lattice (unit cell). Contains other useful data,
903
        e.g. the crystal's name and information about the asymmetric unit for
904
        calculation.
905
906
    Raises
907
    ------
908
    ParseError :
909
        Raised if the structure fails parsing for any of the following:
910
        1. entry.has_3d_structure is False,
911
        2. disorder == 'skip' and any of:
912
            (a) any disorder flag is True,
913
            (b) any atom has fractional occupancy,
914
            (c) any atom's label ends with '?',
915
        3. entry.crystal.molecule.all_atoms_have_sites is False,
916
        4. a.fractional_coordinates is None for any a in entry.crystal.disordered_molecule,
917
        5. The motif is empty after removing H, disordered sites or solvents.
918
    """
919
920
    # Entry specific flags
921
    if not entry.has_3d_structure:
922
        raise ParseError(f'{entry.identifier} has no 3D structure')
923
924
    if disorder == 'skip' and entry.has_disorder:
925
        raise ParseError(f'{entry.identifier} has disorder')
926
927
    return periodicset_from_ccdc_crystal(
928
        entry.crystal,
929
        remove_hydrogens=remove_hydrogens,
930
        disorder=disorder,
931
        heaviest_component=heaviest_component,
932
        molecular_centres=molecular_centres
933
    )
934
935
936
def periodicset_from_ccdc_crystal(
937
        crystal,
938
        remove_hydrogens=False,
939
        disorder='skip',
940
        heaviest_component=False,
941
        molecular_centres=False
942
) -> PeriodicSet:
943
    """:class:`ccdc.crystal.Crystal` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
944
    Crystal is the type returned by :class:`ccdc.io.CrystalReader`.
945
946
    Parameters
947
    ----------
948
    crystal : :class:`ccdc.crystal.Crystal`
949
        A ccdc Crystal object representing a crystal structure.
950
    remove_hydrogens : bool, optional
951
        Remove Hydrogens from the crystal.
952
    disorder : str, optional
953
        Controls how disordered structures are handled. Default is ``skip``
954
        which raises a ParseError if the input has any disorder, since
955
        disorder conflicts with the periodic set model. To read disordered
956
        structures anyway, choose either :code:`ordered_sites` to remove sites
957
        with disorder or :code:`all_sites` include all sites regardless.
958
    heaviest_component : bool, optional
959
        Removes all but the heaviest molecule in the asymmeric unit,
960
        intended for removing solvents.
961
    molecular_centres : bool, default False
962
        Extract the centres of molecules in the unit cell and store in 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
963
        the attribute molecular_centres of the returned PeriodicSet.
964
965
    Returns
966
    -------
967
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
968
        Represents the crystal as a periodic set, consisting of a finite set
969
        of points (motif) and lattice (unit cell). Contains other useful data,
970
        e.g. the crystal's name and information about the asymmetric unit for
971
        calculation.
972
973
    Raises
974
    ------
975
    ParseError :
976
        Raised if the structure fails parsing for any of the following:
977
        1. disorder == 'skip' and any of:
978
            (a) any disorder flag is True,
979
            (b) any atom has fractional occupancy,
980
            (c) any atom's label ends with '?',
981
        2. crystal.molecule.all_atoms_have_sites is False,
982
        3. a.fractional_coordinates is None for any a in crystal.disordered_molecule,
983
        4. The motif is empty after removing H, disordered sites or solvents.
984
    """
985
986
    molecule = crystal.disordered_molecule
987
988
    # if skipping disorder, check disorder flags and then all atoms
989
    if disorder == 'skip':
990
        if crystal.has_disorder or \
991
         any(_atom_has_disorder(a.label, a.occupancy) for a in molecule.atoms):
992
            raise ParseError(f'{crystal.identifier} has disorder')
993
994
    elif disorder == 'ordered_sites':
995
        molecule.remove_atoms(a for a in molecule.atoms
996
                              if _atom_has_disorder(a.label, a.occupancy))
997
998
    if remove_hydrogens:
999
        molecule.remove_atoms(a for a in molecule.atoms if a.atomic_symbol in 'HD')
1000
1001
    if heaviest_component and len(molecule.components) > 1:
1002
        molecule = _heaviest_component(molecule)
1003
1004
    # remove atoms with missing coord data and warn
1005
    if any(a.fractional_coordinates is None for a in molecule.atoms):
1006
        warnings.warn(f'trying to remove atoms without sites')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
1007
        molecule.remove_atoms(a for a in molecule.atoms 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
1008
                              if a.fractional_coordinates is None)
1009
1010
    if not molecule.all_atoms_have_sites:
1011
        raise ParseError(f'{crystal.identifier} has atoms without sites')
1012
1013
    crystal.molecule = molecule
1014
    cell = cellpar_to_cell(*crystal.cell_lengths, *crystal.cell_angles)
1015
1016
    if molecular_centres:
1017
        frac_centres = _frac_molecular_centres_ccdc(crystal)
1018
        mol_centres = frac_centres @ cell
1019
        return PeriodicSet(mol_centres, cell, name=crystal.identifier)
1020
1021
    asym_atoms = crystal.asymmetric_unit_molecule.atoms
1022
    # check for None?
1023
    asym_unit = np.array([tuple(a.fractional_coordinates) for a in asym_atoms])
1024
    asym_unit = np.mod(asym_unit, 1)
1025
    # asym_unit = _snap_small_prec_coords(asym_unit)
1026
    asym_types = [a.atomic_number for a in asym_atoms]
1027
1028
    if disorder != 'all_sites':
1029
        keep_sites = _unique_sites(asym_unit)
1030
        if not np.all(keep_sites):
1031
            warnings.warn(f'may have overlapping sites; duplicates will be removed')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
1032
        asym_unit = asym_unit[keep_sites]
1033
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1034
1035
    if asym_unit.shape[0] == 0:
1036
        raise ParseError(f'{crystal.identifier} has no valid sites')
1037
1038
    sitesym = crystal.symmetry_operators
1039
    # try spacegroup numbers?
1040
    if not sitesym:
1041
        sitesym = ['x,y,z']
1042
1043
    rot = np.array([np.array(crystal.symmetry_rotation(op)).reshape((3,3)) 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
Coding Style introduced by
Exactly one space required after comma
Loading history...
1044
                    for op in sitesym])
1045
    trans = np.array([crystal.symmetry_translation(op) for op in sitesym])
1046
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1047
    motif = frac_motif @ cell
1048
1049
    kwargs = {
1050
        'name': crystal.identifier,
1051
        'asymmetric_unit': asym_inds,
1052
        'wyckoff_multiplicities': wyc_muls,
1053
        'types': np.array([asym_types[i] for i in inverses])
1054
    }
1055
1056
    periodic_set = PeriodicSet(motif, cell, **kwargs)
1057
1058
    # if molecular_centres:
1059
    #     frac_centres = _frac_molecular_centres_ccdc(crystal)
1060
    #     periodic_set.molecular_centres = frac_centres @ cell
1061
1062
    return periodic_set
1063
1064
1065
def periodicset_from_gemmi_block(
1066
        block,
1067
        remove_hydrogens=False,
1068
        disorder='skip'
1069
) -> PeriodicSet:
1070
    """:class:`gemmi.cif.Block` --> :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`.
1071
    Block is the type returned by :class:`gemmi.cif.read_file`.
1072
1073
    Parameters
1074
    ----------
1075
    block : :class:`ase.io.cif.CIFBlock`
1076
        An ase CIFBlock object representing a crystal.
1077
    remove_hydrogens : bool, optional
1078
        Remove Hydrogens from the crystal.
1079
    disorder : str, optional
1080
        Controls how disordered structures are handled. Default is ``skip``
1081
        which raises a ParseError if the input has any disorder, since
1082
        disorder conflicts with the periodic set model. To read disordered
1083
        structures anyway, choose either :code:`ordered_sites` to remove sites
1084
        with disorder or :code:`all_sites` include all sites regardless.
1085
1086
    Returns
1087
    -------
1088
    :class:`amd.PeriodicSet <.periodicset.PeriodicSet>`
1089
        Represents the crystal as a periodic set, consisting of a finite set
1090
        of points (motif) and lattice (unit cell). Contains other useful data,
1091
        e.g. the crystal's name and information about the asymmetric unit for
1092
        calculation.
1093
1094
    Raises
1095
    ------
1096
    ParseError :
1097
        Raised if the structure fails to be parsed for any of the following:
1098
        1. Required data is missing (e.g. cell parameters),
1099
        2. disorder == 'skip' and any of:
1100
            (a) any atom has fractional occupancy,
1101
            (b) any atom's label ends with '?',
1102
        3. The motif is empty after removing H or disordered sites.
1103
    """
1104
1105
    cellpar = [ase.io.cif.convert_value(block.find_value(tag)) 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
1106
               for tag in CIF_TAGS['cellpar']]
1107
    if None in cellpar:
1108
        raise ParseError(f'{block.name} has missing cell information')
1109
1110
    cell = cellpar_to_cell(*cellpar)
1111
1112
    xyz_loop = block.find(CIF_TAGS['atom_site_fract']).loop
1113
    if xyz_loop is None:
1114
        # check for crtsn!
1115
        raise ParseError(f'{block.name} has missing coordinate data')
1116
1117
    # loop will contain all xyz cols, maybe labels, maybe types, maybe occs
1118
1119
    loop_dict = _gemmi_loop_to_dict(xyz_loop)
1120
    xyz_str = [loop_dict[t] for t in CIF_TAGS['atom_site_fract']]
1121
    asym_unit = [[ase.io.cif.convert_value(c) for c in coords] for coords in xyz_str]
1122
1123
    asym_unit = np.mod(np.array(asym_unit).T, 1)
1124
    # asym_unit = _snap_small_prec_coords(asym_unit)
1125
1126
    if '_atom_site_type_symbol' in loop_dict:
1127
        asym_syms = loop_dict['_atom_site_type_symbol']
1128
        asym_types = [ase.data.atomic_numbers[s] for s in asym_syms]
1129
    else:
1130
        warnings.warn(f'missing atomic types')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
1131
        asym_types = [0 for _ in range(len(asym_unit))]
1132
1133
    remove_sites = []
1134
    
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
1135
    # if labels exist, check them for disorder
1136
    if '_atom_site_label' in loop_dict:
1137
        labels = loop_dict['_atom_site_label']
1138
    else:
1139
        labels = ['' for _ in range(xyz_loop.length())]
1140
1141
    # if occupancies exist, check them for disorder
1142
    if '_atom_site_occupancy' in loop_dict:
1143
        occupancies = [ase.io.cif.convert_value(occ) 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
1144
                  for occ in loop_dict['_atom_site_occupancy']]
0 ignored issues
show
Coding Style introduced by
Wrong continued indentation (add 5 spaces).
Loading history...
1145
    else:
1146
        occupancies = [None for _ in range(xyz_loop.length())]
1147
1148
    if disorder == 'skip':
1149
        if any(_atom_has_disorder(lab, occ) for lab, occ in zip(labels, occupancies)):
1150
            raise ParseError(f'{block.name} has disorder')
1151
    elif disorder == 'ordered_sites':
1152
        remove_sites.extend(
1153
            (i for i, (lab, occ) in enumerate(zip(labels, occupancies))
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable i does not seem to be defined.
Loading history...
1154
                if _atom_has_disorder(lab, occ)))
0 ignored issues
show
Coding Style introduced by
Wrong continued indentation (remove 3 spaces).
Loading history...
1155
1156
    if remove_hydrogens:
1157
        remove_sites.extend((i for i, num in enumerate(asym_types) if num == 1))
1158
1159
    asym_unit = np.delete(asym_unit, remove_sites, axis=0)
1160
    asym_types = [s for i, s in enumerate(asym_types) if i not in remove_sites]
1161
1162
    if disorder != 'all_sites':
1163
        keep_sites = _unique_sites(asym_unit)
1164
        if not np.all(keep_sites):
1165
            warnings.warn(f'may have overlapping sites; duplicates will be removed')
0 ignored issues
show
introduced by
Using an f-string that does not have any interpolated variables
Loading history...
1166
        asym_unit = asym_unit[keep_sites]
1167
        asym_types = [sym for sym, keep in zip(asym_types, keep_sites) if keep]
1168
1169
    if asym_unit.shape[0] == 0:
1170
        raise ParseError(f'{block.name} has no valid sites')
1171
1172
    # get symops
1173
    sitesym = []
1174
    for tag in CIF_TAGS['symop']:
1175
        symop_loop = block.find([tag]).loop
1176
        if symop_loop is not None:
1177
            symop_loop_dict = _gemmi_loop_to_dict(symop_loop)
1178
            sitesym = symop_loop_dict[tag]
1179
            break
1180
    # try spacegroup names/nums?
1181
    if not sitesym:
1182
        sitesym = ['x,y,z',]
1183
1184
    rot, trans = ase_parse_sitesym(sitesym)
1185
    frac_motif, asym_inds, wyc_muls, inverses = _expand_asym_unit(asym_unit, rot, trans)
1186
    full_types = np.array([asym_types[i] for i in inverses])
1187
    motif = frac_motif @ cell
1188
1189
    kwargs = {
1190
        'name': block.name,
1191
        'asymmetric_unit': asym_inds,
1192
        'wyckoff_multiplicities': wyc_muls,
1193
        'types': full_types
1194
    }
1195
1196
    return PeriodicSet(motif, cell, **kwargs)
1197
1198
1199
def _expand_asym_unit(
1200
        asym_unit: np.ndarray,
1201
        rotations: np.ndarray,
1202
        translations: np.ndarray
1203
) -> Tuple[np.ndarray, ...]:
1204
    """
1205
    Asymmetric unit frac coords, list of rotations, list of translations
1206
    -->
1207
    full fractional motif, asymmetric unit indices, multiplicities, inverses.
1208
    """
1209
1210
    all_sites = []
1211
    asym_inds = [0]
1212
    multiplicities = []
1213
    inverses = []
1214
    m, dims = asym_unit.shape
1215
1216
    expanded_sites = np.zeros((m, len(rotations), dims))
1217
    for i in range(m):
1218
        expanded_sites[i] = np.dot(rotations, asym_unit[i]) + translations
1219
    expanded_sites = np.mod(expanded_sites, 1)
1220
1221
    for inv, sites in enumerate(expanded_sites):
1222
1223
        multiplicity = 0
1224
1225
        for site_ in sites:
1226
1227
            if not all_sites:
1228
                all_sites.append(site_)
1229
                inverses.append(inv)
1230
                multiplicity += 1
1231
                continue
1232
1233
            # check if site_ overlaps with existing sites
1234
            diffs1 = np.abs(site_ - all_sites)
1235
            diffs2 = np.abs(diffs1 - 1)
1236
            mask = np.all((diffs1 <= _EQUIV_SITE_TOL) | (diffs2 <= _EQUIV_SITE_TOL), axis=-1)
1237
1238
            if np.any(mask):
1239
                where_equal = np.argwhere(mask).flatten()
1240
                for ind in where_equal:
1241
                    if inverses[ind] == inv: # invarnant 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
1242
                        pass
1243
                    else: # equivalent to a different site
1244
                        warnings.warn(f'has equivalent sites at positions {inverses[ind]}, {inv}')
1245
            else:
1246
                all_sites.append(site_)
1247
                inverses.append(inv)
1248
                multiplicity += 1
1249
1250
        if multiplicity > 0:
1251
            multiplicities.append(multiplicity)
1252
            asym_inds.append(len(all_sites))
1253
1254
    frac_motif = np.array(all_sites)
1255
    asym_inds = np.array(asym_inds[:-1])
1256
    multiplicities = np.array(multiplicities)
1257
1258
    return frac_motif, asym_inds, multiplicities, inverses
1259
1260
1261
@numba.njit()
1262
def _unique_sites(asym_unit):
1263
    """Uniquify (within _EQUIV_SITE_TOL) a list of fractional coordinates,
1264
    considering all points modulo 1. Returns an array of bools such that
1265
    asym_unit[_unique_sites(asym_unit)] is a uniquified list."""
1266
    site_diffs1 = np.abs(np.expand_dims(asym_unit, 1) - asym_unit)
1267
    site_diffs2 = np.abs(site_diffs1 - 1)
1268
    sites_neq_mask = np.logical_and((site_diffs1 > _EQUIV_SITE_TOL), 
0 ignored issues
show
Coding Style introduced by
Trailing whitespace
Loading history...
1269
                                    (site_diffs2 > _EQUIV_SITE_TOL))
1270
    overlapping = np.triu(sites_neq_mask.sum(axis=-1) == 0, 1)
1271
    return overlapping.sum(axis=0) == 0
1272
1273
1274
def _parse_sitesym_pymatgen(data):
1275
    """In order to generate symmetry equivalent positions, the symmetry
1276
    operations are parsed. If the symops are not present, the space
1277
    group symbol is parsed, and symops are generated."""
1278
1279
    from pymatgen.symmetry.groups import SpaceGroup
0 ignored issues
show
introduced by
Unable to import 'pymatgen.symmetry.groups'
Loading history...
introduced by
Import outside toplevel (pymatgen.symmetry.groups.SpaceGroup)
Loading history...
1280
    from pymatgen.core.operations import SymmOp
0 ignored issues
show
introduced by
Unable to import 'pymatgen.core.operations'
Loading history...
introduced by
Import outside toplevel (pymatgen.core.operations.SymmOp)
Loading history...
1281
    import pymatgen.io.cif
0 ignored issues
show
introduced by
Import outside toplevel (pymatgen.io.cif)
Loading history...
introduced by
Unable to import 'pymatgen.io.cif'
Loading history...
1282
1283
    symops = []
1284
1285
    # try to parse xyz symops
1286
    for symmetry_label in CIF_TAGS['symop']:
1287
1288
        xyz = data.get(symmetry_label)
1289
        if not xyz:
1290
            continue
1291
        if isinstance(xyz, str):
1292
            xyz = [xyz]
1293
        try:
1294
            symops = [SymmOp.from_xyz_string(s) for s in xyz]
1295
            break
1296
        except ValueError:
1297
            continue
1298
1299
    # try to parse symbol
1300
    if not symops:
1301
1302
        for symmetry_label in CIF_TAGS['spacegroup_name']:
1303
1304
            sg = data.get(symmetry_label)
1305
            if not sg:
1306
                continue
1307
            sg = re.sub(r'[\s_]', '', sg)
1308
1309
            try:
1310
                spg = pymatgen.io.cif.space_groups.get(sg)
1311
                if not spg:
1312
                    continue
1313
                symops = SpaceGroup(spg).symmetry_ops
1314
                break
1315
            except ValueError:
1316
                pass
1317
1318
            try:
1319
                for d in pymatgen.io.cif._get_cod_data():
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like _get_cod_data was declared protected and should not be accessed from this context.

Prefixing a member variable _ is usually regarded as the equivalent of declaring it with protected visibility that exists in other languages. Consequentially, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1;
        self.y = 2;

class MyChild(MyParent):
    def some_method(self):
        return self._x    # Ok, since accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged as AnotherClass is not
                                         # a child class of MyParent
Loading history...
1320
                    if sg == re.sub(r'\s+', '', d['hermann_mauguin']):
1321
                        xyz = d['symops']
1322
                        symops = [SymmOp.from_xyz_string(s) for s in xyz]
1323
                        break
1324
            except Exception:
0 ignored issues
show
Best Practice introduced by
Catching very general exceptions such as Exception is usually not recommended.

Generally, you would want to handle very specific errors in the exception handler. This ensure that you do not hide other types of errors which should be fixed.

So, unless you specifically plan to handle any error, consider adding a more specific exception.

Loading history...
1325
                continue
1326
1327
            if symops:
1328
                break
1329
1330
    # try to parse international number
1331
    if not symops:
1332
        for symmetry_label in CIF_TAGS['spacegroup_number']:
1333
            num = data.get(symmetry_label)
1334
            if not num:
1335
                continue
1336
1337
            try:
1338
                i = int(_cif_val_to_float(num))
1339
                symops = SpaceGroup.from_int_number(i).symmetry_ops
1340
                break
1341
            except ValueError:
1342
                continue
1343
1344
    if not symops:
1345
        symops = [SymmOp.from_xyz_string(s) for s in ['x', 'y', 'z']]
1346
1347
    rotations = [op.rotation_matrix for op in symops]
1348
    translations = [op.translation_vector for op in symops]
1349
1350
    return rotations, translations
1351
1352
1353
def _frac_molecular_centres_ccdc(crystal):
1354
    """Returns any geometric centres of molecules in the unit cell.
1355
    Expects a ccdc Crystal object and returns fractional coordiantes."""
1356
1357
    frac_centres = []
1358
    for comp in crystal.packing(inclusion='CentroidIncluded').components:
1359
        coords = [a.fractional_coordinates for a in comp.atoms]
1360
        x, y, z = zip(*coords)
1361
        m = len(coords)
1362
        frac_centres.append((sum(x) / m, sum(y) / m, sum(z) / m))
1363
    frac_centres = np.mod(np.array(frac_centres), 1)
1364
    return frac_centres[_unique_sites(frac_centres)]
1365
1366
1367
def _atom_has_disorder(label, occupancy):
1368
    """Return True if label ends with ? or occupancy is a number < 1."""
1369
    return label.endswith('?') or (np.isscalar(occupancy) and occupancy < 1)
1370
1371
1372
# def _snap_small_prec_coords(frac_coords):
1373
#     """Find where frac_coords is within 1e-4 of 1/3 or 2/3, change to 1/3 and 2/3.
1374
#     Recommended by pymatgen's CIF parser. May use later."""
1375
#     frac_coords[np.abs(1 - 3 * frac_coords) < 1e-4] = 1 / 3.
1376
#     frac_coords[np.abs(1 - 3 * frac_coords / 2) < 1e-4] = 2 / 3.
1377
#     return frac_coords
1378
1379
1380
def _heaviest_component(molecule):
1381
    """Heaviest component (removes all but the heaviest component of the asym unit).
1382
    Intended for removing solvents. Probably doesn't play well with disorder"""
1383
    component_weights = []
1384
    for component in molecule.components:
1385
        weight = 0
1386
        for a in component.atoms:
1387
            if isinstance(a.atomic_weight, (float, int)):
1388
                if isinstance(a.occupancy, (float, int)):
1389
                    weight += a.occupancy * a.atomic_weight
1390
                else:
1391
                    weight += a.atomic_weight
1392
        component_weights.append(weight)
1393
    largest_component_ind = np.argmax(np.array(component_weights))
1394
    molecule = molecule.components[largest_component_ind]
1395
    return molecule
1396
1397
1398
def _refcodes_from_families(refcode_families):
1399
    """List of strings --> all CSD refcodes starting with one of the strings.
1400
    Intended to be passed a list of families and return all refcodes in them."""
1401
1402
    try:
1403
        import ccdc.search
0 ignored issues
show
introduced by
Import outside toplevel (ccdc.search)
Loading history...
1404
    except (ImportError, RuntimeError) as _:
1405
        msg = 'Failed to import csd-python-api, please'\
1406
              'check it is installed and licensed.'
1407
        raise ImportError(msg)
1408
1409
    all_refcodes = []
1410
    for refcode in refcode_families:
1411
        query = ccdc.search.TextNumericSearch()
1412
        query.add_identifier(refcode)
1413
        hits = [hit.identifier for hit in query.search()]
1414
        all_refcodes.extend(hits)
1415
1416
    # filter to unique refcodes
1417
    seen = set()
1418
    seen_add = seen.add
1419
    refcodes = [
1420
        refcode for refcode in all_refcodes
1421
        if not (refcode in seen or seen_add(refcode))]
1422
1423
    return refcodes
1424
1425
1426
def _gemmi_loop_to_dict(loop):
1427
    """gemmi Loop object --> dict, tags: values"""
1428
    tablified_loop = [[] for _ in range(len(loop.tags))]
1429
    n_cols = loop.width()
1430
    for i, item in enumerate(loop.values):
1431
        tablified_loop[i % n_cols].append(item)
1432
    return {tag: l for tag, l in zip(loop.tags, tablified_loop)}
0 ignored issues
show
Unused Code introduced by
Unnecessary use of a comprehension
Loading history...
1433
1434
1435
def _cif_val_to_float(text):
1436
    """Remove uncertainty brackets from strings and return float."""
1437
1438
    try:
1439
        # Note that the ending ) is sometimes missing. That is why the code has
1440
        # been modified to treat it as optional. Same logic applies to lists.
1441
        return float(re.sub(r'\(.+\)*', '', text))
1442
    except TypeError:
1443
        if isinstance(text, list) and len(text) == 1:
1444
            return float(re.sub(r'\(.+\)*', '', text[0]))
1445
    except ValueError as err:
1446
        if text.strip() == '.':
1447
            return 0
1448
        raise err
1449
    raise ValueError(f'{text} cannot be converted to float')
1450