Source code for tme.structure

"""
Implements class Structure to represent atomic structures.

Copyright (c) 2023 European Molecular Biology Laboratory

Author: Valentin Maurer <valentin.maurer@embl-hamburg.de>
"""

import warnings
from copy import deepcopy
from itertools import groupby
from dataclasses import dataclass
from collections import namedtuple
from typing import List, Dict, Tuple
from os.path import splitext, basename

import numpy as np

from .types import NDArray
from .rotations import align_to_axis
from .preprocessor import atom_profile, Preprocessor
from .parser import PDBParser, MMCIFParser, GROParser
from .matching_utils import rigid_transform, minimum_enclosing_box

__all__ = ["Structure"]



[docs]
@dataclass(repr=False)
class Structure:
    """
    Represents atomic structures per the Protein Data Bank (PDB) specification.

    Examples
    --------
    The following achieves the definition of a :py:class:`Structure` instance

    >>> from tme import Structure
    >>> structure = Structure(
    >>>     record_type=["ATOM", "ATOM", "ATOM"],
    >>>     atom_serial_number=[0, 1, 2] ,
    >>>     atom_name=["C", "N", "H"],
    >>>     atom_coordinate=[[30,15,10], [35, 20, 15], [35,25,20]],
    >>>     alternate_location_indicator=[".", ".", "."],
    >>>     residue_name=["GLY", "GLY", "HIS"],
    >>>     chain_identifier=["A", "A", "B"],
    >>>     residue_sequence_number=[0, 0, 1],
    >>>     code_for_residue_insertion=["?", "?", "?"],
    >>>     occupancy=[0, 0, 0],
    >>>     temperature_factor=[0, 0, 0],
    >>>     segment_identifier=["1", "1", "1"],
    >>>     element_symbol=["C", "N", "C"],
    >>>     charge=["?", "?", "?"],
    >>>     metadata={},
    >>> )
    >>> structure
    Unique Chains: A-B, Atom Range: 0-2 [N = 3], Residue Range: 0-1 [N = 3]

    :py:class:`Structure` instances support a range of subsetting operations based on
    atom indices

    >>> structure[1]
    Unique Chains: A, Atom Range: 1-1 [N = 1], Residue Range: 0-0 [N = 1]
    >>> structure[(False, False, True)]
    Unique Chains: B, Atom Range: 2-2 [N = 1], Residue Range: 1-1 [N = 1]
    >>> structure[(1,2)]
    Unique Chains: A-B, Atom Range: 1-2 [N = 2], Residue Range: 0-1 [N = 2]

    They can be written to disk in a range of formats using :py:meth:`Structure.to_file`

    >>> structure.to_file("test.pdb") # Writes a PDB file to disk
    >>> structure.to_file("test.cif") # Writes a mmCIF file to disk

    New instances can be created from a range of formats using
    :py:meth:`Structure.from_file`

    >>> Structure.from_file("test.pdb") # Reads PDB file from disk
    Unique Chains: A-B, Atom Range: 0-2 [N = 3], Residue Range: 0-1 [N = 3]
    >>> Structure.from_file("test.cif") # Reads mmCIF file from disk
    Unique Chains: A-B, Atom Range: 0-2 [N = 3], Residue Range: 0-1 [N = 3]

    Class instances can be discretized on grids and converted to
    :py:class:`tme.density.Density` instances using :py:meth:`Structure.to_volume`
    or :py:meth:`tme.density.Density.from_structure`.

    >>> volume, origin, sampling_rate = structure.to_volume(shape=(50,40,30))

    References
    ----------
    .. [1] https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
    .. [2] https://www.ccp4.ac.uk/html/mmcifformat.html

    """

    #: Array of record types, e.g.ATOM.
    record_type: NDArray

    #: Array of serial numbers.
    atom_serial_number: NDArray

    #: Array of atom names.
    atom_name: NDArray

    #: Array of x,y,z atom coordinates.
    atom_coordinate: NDArray

    #: Array of alternate location indices.
    alternate_location_indicator: NDArray

    #: Array of residue names.
    residue_name: NDArray

    #: Array of chain identifiers.
    chain_identifier: NDArray

    #: Array of residue ids.
    residue_sequence_number: NDArray

    #: Array of insertion information.
    code_for_residue_insertion: NDArray

    #: Array of occupancy factors.
    occupancy: NDArray

    #: Array of B-factors.
    temperature_factor: NDArray

    #: Array of segment identifiers.
    segment_identifier: NDArray

    #: Array of element symbols.
    element_symbol: NDArray

    #: Array of charges.
    charge: NDArray

    #: Metadata dictionary.
    metadata: dict

    def __post_init__(self, *args, **kwargs):
        """
        Initialize the structure and populate header metadata.

        Raises
        ------
        ValueError
            If NDArray attributes does not match the number of atoms.
        """
        for attribute in self.__dict__:
            value = getattr(self, attribute)
            target_type = self.__annotations__.get(attribute, None)
            if target_type == NDArray:
                setattr(self, attribute, np.atleast_1d(np.array(value)))

        n_atoms = self.atom_coordinate.shape[0]
        for attribute in self.__dict__:
            value = getattr(self, attribute)
            if not isinstance(value, np.ndarray):
                continue
            if value.shape[0] != n_atoms:
                raise ValueError(
                    f"Expected shape of {attribute}: {n_atoms}, got {value.shape[0]}."
                )

        self._elements = _Elements()
        self.metadata = self._populate_metadata(self.metadata)

    def __getitem__(self, indices: List[int]) -> "Structure":
        """
        Get a Structure instance for specified indices.

        Parameters
        ----------
        indices : Union[int, bool, NDArray]
            The indices to get.

        Returns
        -------
        Structure
            The Structure instance for the given indices.
        """
        if type(indices) in (int, bool):
            indices = (indices,)

        indices = np.asarray(indices)
        attributes = (
            "record_type",
            "atom_serial_number",
            "atom_name",
            "atom_coordinate",
            "alternate_location_indicator",
            "residue_name",
            "chain_identifier",
            "residue_sequence_number",
            "code_for_residue_insertion",
            "occupancy",
            "temperature_factor",
            "segment_identifier",
            "element_symbol",
            "charge",
        )
        kwargs = {attr: getattr(self, attr)[indices] for attr in attributes}
        ret = self.__class__(**kwargs, metadata={})
        return ret

    def __repr__(self):
        """
        Return a string representation of the Structure.
        """
        unique_chains = "-".join(
            [
                ",".join([str(x) for x in entity])
                for entity in self.metadata["unique_chains"]
            ]
        )
        min_atom = np.min(self.atom_serial_number)
        max_atom = np.max(self.atom_serial_number)
        n_atom = self.atom_serial_number.size

        min_residue = np.min(self.residue_sequence_number)
        max_residue = np.max(self.residue_sequence_number)
        n_residue = np.unique(self.residue_sequence_number).size

        repr_str = (
            f"Structure object at {id(self)}\n"
            f"Unique Chains: {unique_chains}, "
            f"Atom Range: {min_atom}-{max_atom} [N = {n_atom}], "
            f"Residue Range: {min_residue}-{max_residue} [N = {n_residue}]"
        )
        return repr_str


[docs]
    def copy(self) -> "Structure":
        """
        Returns a copy of the Structure instance.

        Returns
        -------
        :py:class:`Structure`
            The copied Structure instance.

        Examples
        --------
        >>> import numpy as np
        >>> structure_copy = structure.copy()
        >>> np.allclose(structure_copy.atom_coordinate, structure.atom_coordinate)
        True
        """
        return deepcopy(self)


    def _populate_metadata(self, metadata: Dict = {}) -> Dict:
        """
        Populate the metadata dictionary with the data from the Structure instance.

        Parameters
        ----------
        metadata : dict, optional
            The initial metadata dictionary, by default {}.

        Returns
        -------
        dict
            The populated metadata dictionary.
        """
        metadata["weight"] = np.sum(
            [self._elements[atype].atomic_weight for atype in self.element_symbol]
        )

        label, idx, chain = np.unique(
            self.chain_identifier, return_inverse=True, return_index=True
        )
        chain_weight = np.bincount(
            chain,
            [self._elements[atype].atomic_weight for atype in self.element_symbol],
        )
        labels = self.chain_identifier[idx]
        metadata["chain_weight"] = {key: val for key, val in zip(labels, chain_weight)}

        # Group non-unique chains in separate lists in metadata["unique_chains"]
        metadata["unique_chains"], temp = [], {}
        for chain_label in label:
            index = len(metadata["unique_chains"])
            chain_sequence = "".join(
                [
                    str(y)
                    for y in self.element_symbol[
                        np.where(self.chain_identifier == chain_label)
                    ]
                ]
            )
            if chain_sequence not in temp:
                temp[chain_sequence] = index
                metadata["unique_chains"].append([chain_label])
                continue
            idx = temp.get(chain_sequence)
            metadata["unique_chains"][idx].append(chain_label)

        filtered_data = [
            (label, integer)
            for label, integer in zip(
                self.chain_identifier, self.residue_sequence_number
            )
        ]
        filtered_data = sorted(filtered_data, key=lambda x: x[0])
        metadata["chain_range"] = {}
        for label, values in groupby(filtered_data, key=lambda x: x[0]):
            values = [int(x[1]) for x in values]
            metadata["chain_range"][label] = (min(values), max(values))

        return metadata


[docs]
    @classmethod
    def from_file(
        cls,
        filename: str,
        keep_non_atom_records: bool = False,
        filter_by_elements: set = None,
        filter_by_residues: set = None,
    ) -> "Structure":
        """
        Reads an atomic structure file and into a :py:class:`Structure` instance.

        Parameters
        ----------
        filename : str
            Input file. Supported extensions are:

            +------+-------------------------------------------------------------+
            | .pdb | Reads a PDB file                                            |
            +------+-------------------------------------------------------------+
            | .cif | Reads an mmCIF file                                         |
            +------+-------------------------------------------------------------+
            | .gro | Reads a Gromos87 formated file                              |
            +------+-------------------------------------------------------------+
        keep_non_atom_records : bool, optional
            Wheter to keep residues that are not labelled ATOM.
        filter_by_elements: set, optional
            Which elements to keep. Default corresponds to all elements.
        filter_by_residues: set, optional
            Which residues to keep. Default corresponds to all residues.

        Raises
        ------
        NotImplementedError
            If the extension is not supported.

        Returns
        -------
        :py:class:`Structure`
            Structure instance representing the read in file.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> structure
        Unique Chains: A-B, Atom Range: 1-1564 [N = 1564], Residue Range: 142-239 [N = 1564]

        We can include non ATOM entries and restrict the considered elements
        and residues

        >>> structure = Structure.from_file(
        >>>     filename=fname,
        >>>     keep_non_atom_records=True,
        >>>     filter_by_elements = {"C"},
        >>>     filter_by_residues = {"GLY"},
        >>> )
        >>> structure
        Unique Chains: A,B, Atom Range: 96-1461 [N = 44], Residue Range: 154-228 [N = 44]
        """
        _, file_extension = splitext(basename(filename.lower()))
        _formats = {
            ".pdb": cls._load_pdb,
            ".cif": cls._load_mmcif,
            ".gro": cls._load_gro,
        }
        func = _formats.get(file_extension)
        if func is None:
            formats = ",".join([f"'{x}'" for x in _formats.keys()])
            raise NotImplementedError(
                f"Files with extension {file_extension} are not supported. "
                f"Supported filetypes are {formats}."
            )

        data = func(cls, filename)
        keep = np.ones(data["element_symbol"].size, dtype=bool)
        if filter_by_elements:
            keep = np.logical_and(
                keep,
                np.isin(data["element_symbol"], np.array(list(filter_by_elements))),
            )
        if filter_by_residues:
            keep = np.logical_and(
                keep, np.isin(data["residue_name"], np.array(list(filter_by_residues)))
            )
        if not keep_non_atom_records:
            keep = np.logical_and(keep, data["record_type"] == "ATOM")

        for key in data:
            if keep.sum() == keep.size:
                break

            if key == "metadata":
                continue
            if isinstance(data[key], np.ndarray):
                data[key] = data[key][keep]
            else:
                data[key] = [x for x, flag in zip(data[key], keep) if flag]

        data["metadata"]["filepath"] = filename

        return cls(**data)


    @staticmethod
    def _convert_dtypes(data: Dict[str, List], mapping: Dict):
        """
        Convert key values in data according to mapping.

        Parameters
        ----------
        data : Dict
            Mapping of keys to list of values
        mapping : Dict
            Mapping of key in return dict to (key, dtype) in data.

        Returns
        -------
        dict
            Key-value map using key-dtype pairs in mapping on data.
        """
        out = {}
        max_len = max([len(t) for t in data.values() if hasattr(t, "__len__")])

        missing_keys = set()
        for out_key, (inner_key, dtype) in mapping.items():
            default = "." if dtype is str else 0
            if inner_key in data:
                continue
            missing_keys.add(inner_key)
            out[out_key] = np.repeat(default, max_len).astype(dtype)

        if len(missing_keys):
            msg = ", ".join([f"'{x}'" for x in missing_keys])
            warnings.warn(
                f"Missing keys: ({msg}) in data - filling with default value."
            )

        for out_key, (inner_key, dtype) in mapping.items():
            default = "." if dtype is str else 0

            # Avoid modifying input dictionaries
            if inner_key in missing_keys:
                continue

            out_data = data[inner_key]
            if isinstance(data[inner_key][0], str):
                out_data = [str(x).strip() for x in data[inner_key]]

            out_data = np.asarray(out_data)
            if dtype is int:
                out_data = np.where(out_data == ".", "0", out_data)
            elif dtype == "base-36":
                dtype = int
                base36_offset = int("A0000", 36) - 100000
                out_data = np.where(
                    np.char.isdigit(out_data),
                    out_data,
                    np.vectorize(lambda x: int(x, 36) - base36_offset)(out_data),
                )
            try:
                out[out_key] = np.asarray(out_data, dtype=dtype)
            except ValueError:
                print(
                    f"Converting {out_key} to {dtype} failed. Setting {out_key} to {default}."
                )
                out[out_key] = np.repeat(default, max_len).astype(dtype)
        return out

    def _load_mmcif(self, filename: str) -> Dict:
        """
        Parses a macromolecular Crystallographic Information File (mmCIF)
        and returns the data in a dictionary format.

        Parameters
        ----------
        filename : str
            The filename of the mmCIF to load.

        Returns
        -------
        dict
            A dictionary of numpy arrays. Keys are the names of the PDB
            coordinate section. In addition, some details about the parsed
            structure are included. In case of conversion failure, the failing
            attribute is set to 0 if its supposed to be an integer value.
        """
        result = MMCIFParser(filename)

        atom_site_mapping = {
            "record_type": ("group_PDB", str),
            "atom_serial_number": ("id", int),
            "atom_name": ("label_atom_id", str),
            "alternate_location_indicator": ("label_alt_id", str),
            "residue_name": ("label_comp_id", str),
            # "chain_identifier": ("auth_asym_id", str),
            "chain_identifier": ("label_asym_id", str),
            "residue_sequence_number": ("label_seq_id", int),
            "code_for_residue_insertion": ("pdbx_PDB_ins_code", str),
            "occupancy": ("occupancy", float),
            "temperature_factor": ("B_iso_or_equiv", float),
            "segment_identifier": ("label_entity_id", str),
            "element_symbol": ("type_symbol", str),
            "charge": ("pdbx_formal_charge", str),
        }

        out = self._convert_dtypes(result["atom_site"], atom_site_mapping)
        number_entries = len(max(out.values(), key=len))
        for key, value in out.items():
            if value.size != 1:
                continue
            out[key] = np.repeat(value, number_entries // value.size)

        out["metadata"] = {}
        out["atom_coordinate"] = np.transpose(
            np.array(
                [
                    result["atom_site"]["Cartn_x"],
                    result["atom_site"]["Cartn_y"],
                    result["atom_site"]["Cartn_z"],
                ],
                dtype=np.float32,
            )
        )

        detail_mapping = {
            "resolution": ("em_3d_reconstruction", "resolution", np.nan),
            "resolution_method": ("em_3d_reconstruction", "resolution_method", np.nan),
            "method": ("exptl", "method", np.nan),
            "electron_source": ("em_imaging", "electron_source", np.nan),
            "illumination_mode": ("em_imaging", "illumination_mode", np.nan),
            "microscope_model": ("em_imaging", "microscope_model", np.nan),
        }
        for out_key, (base_key, inner_key, default) in detail_mapping.items():
            if base_key not in result:
                continue
            out["metadata"][out_key] = result[base_key].get(inner_key, default)

        return out

    def _load_pdb(self, filename: str) -> Dict:
        """
        Parses a Protein Data Bank (PDB) file and returns the data
        in a dictionary format.

        Parameters
        ----------
        filename : str
            The filename of the PDB file to load.

        Returns
        -------
        dict
            A dictionary of numpy arrays. Keys are the names of the PDB
            coordinate section. In addition, some details about the parsed
            structure are included. In case of conversion failure, the failing
            attribute is set to 0 if its supposed to be an integer value.
        """
        result = PDBParser(filename)

        atom_site_mapping = {
            "record_type": ("record_type", str),
            "atom_serial_number": ("atom_serial_number", "base-36"),
            "atom_name": ("atom_name", str),
            "alternate_location_indicator": ("alternate_location_indicator", str),
            "residue_name": ("residue_name", str),
            "chain_identifier": ("chain_identifier", str),
            "residue_sequence_number": ("residue_sequence_number", int),
            "code_for_residue_insertion": ("code_for_residue_insertion", str),
            "occupancy": ("occupancy", float),
            "temperature_factor": ("temperature_factor", float),
            "segment_identifier": ("segment_identifier", str),
            "element_symbol": ("element_symbol", str),
            "charge": ("charge", str),
        }

        out = self._convert_dtypes(result, atom_site_mapping)

        out["metadata"] = result["details"]
        out["atom_coordinate"] = np.array(result["atom_coordinate"], dtype=np.float32)

        return out

    def _load_gro(self, filename):
        result = GROParser(filename)

        atom_site_mapping = {
            "record_type": ("record_type", str),
            "atom_serial_number": ("atom_number", int),
            "atom_name": ("atom_name", str),
            "alternate_location_indicator": ("label_alt_id", str),
            "residue_name": ("residue_name", str),
            "chain_identifier": ("segment_identifier", str),
            "residue_sequence_number": ("residue_number", int),
            "code_for_residue_insertion": ("pdbx_PDB_ins_code", str),
            "occupancy": ("occupancy", float),
            "temperature_factor": ("B_iso_or_equiv", float),
            "segment_identifier": ("segment_identifier", str),
            "element_symbol": ("type_symbol", str),
            "charge": ("pdbx_formal_charge", str),
        }

        out = self._convert_dtypes(result, atom_site_mapping)

        unique_chains = np.unique(out["segment_identifier"])
        if len(unique_chains) > 1:
            warnings.warn(
                "Multiple GRO files detected - treating them as a single Structure. "
                "GRO file number is given by segment_identifier according to the "
                "input file. Note: You need to subset the Structure to operate on "
                "individual GRO files."
            )

        mkeys = ("title", "time", "box_vectors")
        out["metadata"] = {key: result.get(key) for key in mkeys}
        out["atom_coordinate"] = np.asarray(result["atom_coordinate"], dtype=np.float32)
        return out


[docs]
    def to_file(self, filename: str) -> None:
        """
        Writes the :py:class:`Structure` instance to disk.

        Parameters
        ----------
        filename : str
            The name of the file to be created. Supported extensions are

            +------+-------------------------------------------------------------+
            | .pdb | Creates a PDB file                                          |
            +------+-------------------------------------------------------------+
            | .cif | Creates an mmCIF file                                       |
            +------+-------------------------------------------------------------+
            | .gro | Creates an Gromos87 file                                    |
            +------+-------------------------------------------------------------+
        Raises
        ------
        NotImplementedError
            If the extension is not supported.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tempfile import NamedTemporaryFile
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> oname = NamedTemporaryFile().name
        >>> structure = Structure.from_file(filename=fname)
        >>> structure.to_file(f"{oname}.cif") # Writes an mmCIF file to disk
        >>> structure.to_file(f"{oname}.pdb") # Writes a PDB file to disk

        """
        _, file_extension = splitext(basename(filename.lower()))
        _formats = {
            ".pdb": self._write_pdb,
            ".cif": self._write_mmcif,
            ".gro": self._write_gro,
        }
        func = _formats.get(file_extension)
        if func is None:
            formats = ",".join([f"'{x}'" for x in _formats.keys()])
            raise NotImplementedError(
                f"Files with extension {file_extension} are not supported. "
                f"Supported filetypes are {formats}."
            )

        if np.any(np.vectorize(len)(self.chain_identifier) > 2):
            warnings.warn("Chain identifiers longer than one will be shortened.")

        if self.atom_coordinate.shape[0] > 10**5 and func == self._write_pdb:
            warnings.warn(
                "The structure contains more than 100,000 atoms. Consider using mmcif."
            )

        with open(filename, mode="w", encoding="utf-8") as ofile:
            ofile.write(func())


    def _write_pdb(self) -> str:
        """
        Returns a PDB string representation of the structure instance.

        Returns
        -------
        str
            String containing PDB file coordine lines.
        """
        data_out = []
        for index in range(self.atom_coordinate.shape[0]):
            x, y, z = self.atom_coordinate[index, :]
            line = list(" " * 80)
            line[0:6] = f"{self.record_type[index]:<6}"
            line[6:11] = f"{self.atom_serial_number[index]:>5}"
            line[12:16] = f"{self.atom_name[index]:<4}"
            line[16] = f"{self.alternate_location_indicator[index]:<1}"
            line[17:20] = f"{self.residue_name[index]:<3}"
            line[21] = f"{self.chain_identifier[index][0]:<1}"
            line[22:26] = f"{self.residue_sequence_number[index]:>4}"
            line[26] = f"{self.code_for_residue_insertion[index]:<1}"
            line[30:38] = f"{x:>8.3f}"
            line[38:46] = f"{y:>8.3f}"
            line[46:54] = f"{z:>8.3f}"
            line[54:60] = f"{self.occupancy[index]:>6.2f}"
            line[60:66] = f"{self.temperature_factor[index]:>6.2f}"
            line[72:76] = f"{self.segment_identifier[index]:>4}"
            line[76:78] = f"{self.element_symbol[index]:<2}"
            line[78:80] = f"{self.charge[index]:>2}"
            data_out.append("".join(line))
        data_out.append("END")
        data_out = "\n".join(data_out)
        return data_out

    def _write_mmcif(self) -> str:
        """
        Returns a MMCIF string representation of the structure instance.

        Returns
        -------
        str
            String containing MMCIF file coordinate lines.
        """
        model_num, entity_id = 1, 1
        data = {
            "group_PDB": [],
            "id": [],
            "type_symbol": [],
            "label_atom_id": [],
            "label_alt_id": [],
            "label_comp_id": [],
            "label_asym_id": [],
            "label_entity_id": [],
            "label_seq_id": [],
            "pdbx_PDB_ins_code": [],
            "Cartn_x": [],
            "Cartn_y": [],
            "Cartn_z": [],
            "occupancy": [],
            "B_iso_or_equiv": [],
            "pdbx_formal_charge": [],
            "auth_seq_id": [],
            "auth_comp_id": [],
            "auth_asym_id": [],
            "auth_atom_id": [],
            "pdbx_PDB_model_num": [],
        }

        for index in range(self.atom_coordinate.shape[0]):
            x, y, z = self.atom_coordinate[index, :]
            data["group_PDB"].append(self.record_type[index])
            data["id"].append(str(self.atom_serial_number[index]))
            data["type_symbol"].append(self.element_symbol[index])
            data["label_atom_id"].append(self.atom_name[index])
            data["label_alt_id"].append(self.alternate_location_indicator[index])
            data["label_comp_id"].append(self.residue_name[index])
            data["label_asym_id"].append(self.chain_identifier[index][0])
            data["label_entity_id"].append(str(entity_id))
            data["label_seq_id"].append(str(self.residue_sequence_number[index]))
            data["pdbx_PDB_ins_code"].append(self.code_for_residue_insertion[index])
            data["Cartn_x"].append(f"{x:.3f}")
            data["Cartn_y"].append(f"{y:.3f}")
            data["Cartn_z"].append(f"{z:.3f}")
            data["occupancy"].append(f"{self.occupancy[index]:.2f}")
            data["B_iso_or_equiv"].append(f"{self.temperature_factor[index]:.2f}")
            data["pdbx_formal_charge"].append(self.charge[index])
            data["auth_seq_id"].append(str(self.residue_sequence_number[index]))
            data["auth_comp_id"].append(self.residue_name[index])
            data["auth_asym_id"].append(self.chain_identifier[index][0])
            data["auth_atom_id"].append(self.atom_name[index])
            data["pdbx_PDB_model_num"].append(str(model_num))

        output_data = {"atom_site": data}
        original_file = self.metadata.get("filepath", "")
        try:
            new_data = {k: v for k, v in MMCIFParser(original_file).items()}
            index = self.atom_serial_number - 1
            new_data["atom_site"] = {
                k: [v[i] for i in index] for k, v in new_data["atom_site"].items()
            }
            new_data["atom_site"]["Cartn_x"] = data["Cartn_x"]
            new_data["atom_site"]["Cartn_y"] = data["Cartn_y"]
            new_data["atom_site"]["Cartn_z"] = data["Cartn_z"]
            output_data = new_data
        except Exception:
            pass

        ret = ""
        for category, subdict in output_data.items():
            if not len(subdict):
                continue

            ret += "#\n"
            is_loop = isinstance(subdict[list(subdict.keys())[0]], list)
            if not is_loop:
                for k in subdict:
                    ret += f"_{category}.{k}\t{subdict[k]}\n"
            else:
                ret += "loop_\n"
                ret += "".join([f"_{category}.{k}\n" for k in subdict])

                subdict = {
                    k: [_format_string(s) for s in v] for k, v in subdict.items()
                }
                key_length = {
                    key: len(max(value, key=lambda x: len(x), default=""))
                    for key, value in subdict.items()
                }
                padded_subdict = {
                    key: [s.ljust(key_length[key] + 1) for s in values]
                    for key, values in subdict.items()
                }

                data = [
                    "".join([str(x) for x in content])
                    for content in zip(*padded_subdict.values())
                ]
                ret += "\n".join([entry for entry in data]) + "\n"

        return ret

    def _write_gro(self) -> str:
        """
        Generate a GRO format string representation of the structure.

        Returns
        -------
        str
            String representation of the structure in GRO format.
        """
        ret = ""
        gro_files = np.unique(self.segment_identifier)
        for index, gro_file in enumerate(gro_files):
            subset = self[self.segment_identifier == gro_file]

            title = self.metadata.get("title", "Missing title")
            box_vectors = self.metadata.get("box_vectors")
            try:
                title = title[index]
                box_vectors = box_vectors[index]
            except Exception:
                pass

            if box_vectors is None:
                box_vectors = [0.0, 0.0, 0.0]

            num_atoms = subset.atom_coordinate.shape[0]
            lines = [title, f"{num_atoms}"]
            for i in range(num_atoms):
                res_num = subset.residue_sequence_number[i]
                res_name = subset.residue_name[i]
                atom_name = subset.atom_name[i]
                atom_num = subset.atom_serial_number[i]

                x, y, z = subset.atom_coordinate[i]
                coord = f"{atom_num % 100000:5d}{x:8.3f}{y:8.3f}{z:8.3f}"
                line = f"{res_num % 100000:5d}{res_name:5s}{atom_name:5s}{coord}"

                if "velocity" in subset.metadata:
                    vx, vy, vz = subset.metadata["velocity"][i]
                    line += f"{vx:8.4f}{vy:8.4f}{vz:8.4f}"

                lines.append(line)

            lines.append(" ".join(f"{v:.5f}" for v in box_vectors))
            ret += "\n".join(lines) + "\n"
        return ret


[docs]
    def subset_by_chain(self, chain: str = None) -> "Structure":
        """
        Return a subset of the structure that contains only atoms belonging to
        a specific chain. If no chain is specified, all chains are returned.

        Parameters
        ----------
        chain : str, optional
            The chain identifier. If multiple chains should be selected they need
            to be a comma separated string, e.g. 'A,B,CE'. If chain None,
            all chains are returned. Default is None.

        Returns
        -------
        :py:class:`Structure`
            A subset of the class instance containing only the specified chains.

        Raises
        ------
        ValueError
            If none of the specified chains exist.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> structure.subset_by_chain(chain="A")   # Keep A
        >>> structure.subset_by_chain(chain="A,B") # Keep A and B
        >>> structure.subset_by_chain(chain="B,C") # Keep B, C does not exist
        """
        chain = np.unique(self.chain_identifier) if chain is None else chain.split(",")
        keep = np.isin(self.chain_identifier, chain)
        return self[keep]



[docs]
    def subset_by_range(
        self,
        start: int,
        stop: int,
        chain: str = None,
    ) -> "Structure":
        """
        Return a subset of the structure within a specific range of residues.

        Parameters
        ----------
        start : int
            The starting residue sequence number.
        stop : int
            The ending residue sequence number.
        chain : str, optional
            The chain identifier. If multiple chains should be selected they need
            to be a comma separated string, e.g. 'A,B,CE'. If chain None,
            all chains are returned. Default is None.

        Returns
        -------
        :py:class:`Structure`
            A subset of the original structure within the specified residue range.

        Raises
        ------
        ValueError
            If none of the specified residue chain combinations exist.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> structure.subset_by_range(chain="A",start=150,stop=180)
        """
        ret = self.subset_by_chain(chain=chain)
        keep = np.logical_and(
            ret.residue_sequence_number >= start, ret.residue_sequence_number <= stop
        )
        return ret[keep]



[docs]
    def center_of_mass(self, weight_type: str = "atomic_weight") -> NDArray:
        """
        Calculate the center of mass of the structure.

        Parameters
        ----------
        weight_type : str, optional
            The type of weights to return. This can either be 'atomic_weight',
            'atomic_number', or 'euqual'. Defaults to 'atomic_weight'


        Returns
        -------
        NDArray
            The center of mass of the structure.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> structure.center_of_mass()
        array([-0.89391639, 29.94908928, -2.64736741])
        """
        atoms = self.element_symbol
        match weight_type:
            case "atomic_weight":
                weights = [self._elements[atom].atomic_weight for atom in atoms]
            case "atomic_number":
                weights = [self._elements[atom].atomic_number for atom in atoms]
            case "equal":
                weights = np.ones((len(atoms)))
            case _:
                raise NotImplementedError(
                    "weight_type can be 'atomic_weight', 'atomic_number' or 'equal."
                )
        return np.dot(self.atom_coordinate.T, weights) / np.sum(weights)



[docs]
    def rigid_transform(
        self,
        rotation_matrix: NDArray = None,
        translation: NDArray = None,
        use_geometric_center: bool = False,
    ) -> "Structure":
        """
        Performs a rigid transform of internal structure coordinates.

        Parameters
        ----------
        rotation_matrix : NDArray, optional
            The rotation matrix to apply to the coordinates, defaults to identity.
        translation : NDArray, optional
            The vector to translate the coordinates by, defaults to 0.
        use_geometric_center : bool, optional
            Whether to use geometric or coordinate center.

        Returns
        -------
        Structure
            The transformed instance of :py:class:`Structure`.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> from tme.matching_utils import get_rotation_matrices
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> structure.rigid_transform(
        >>>     rotation_matrix = get_rotation_matrices(60)[2],
        >>>     translation = (0, 1, -5)
        >>> )
        """
        out = np.empty_like(self.atom_coordinate.T)
        if translation is None:
            translation = np.zeros((self.atom_coordinate.shape[1]))
        if rotation_matrix is None:
            rotation_matrix = np.eye(self.atom_coordinate.shape[1])

        rigid_transform(
            coordinates=self.atom_coordinate.T,
            rotation_matrix=rotation_matrix,
            translation=translation,
            out=out,
            use_geometric_center=use_geometric_center,
        )
        ret = self.copy()
        ret.atom_coordinate = out.T.copy()
        return ret



[docs]
    def centered(self) -> Tuple["Structure", NDArray]:
        """
        Shifts the structure analogous to :py:meth:`tme.density.Density.centered`.

        Returns
        -------
        Structure
            A copy of the class instance whose data center of mass is in the
            center of the data array.
        NDArray
            The coordinate translation.

        See Also
        --------
        :py:meth:`tme.density.Density.centered`

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> centered_structure, translation = structure.centered()
        >>> translation
        array([34.89391639,  4.05091072, 36.64736741])
        """
        center_of_mass = self.center_of_mass()
        enclosing_box = minimum_enclosing_box(coordinates=self.atom_coordinate.T)
        shift = np.subtract(np.divide(enclosing_box, 2), center_of_mass)

        transformed_structure = self.rigid_transform(
            translation=shift, rotation_matrix=np.eye(shift.size)
        )

        return transformed_structure, shift


    def _coordinate_to_position(
        self,
        shape: Tuple[int],
        sampling_rate: Tuple[float],
        origin: Tuple[float],
    ) -> (NDArray, Tuple[str], Tuple[int], float, Tuple[float]):
        """
        Converts coordinates to positions.

        Parameters
        ----------
        shape : Tuple[int,]
            The desired shape of the output array.
        sampling_rate : float
            The sampling rate of the output array in unit of self.atom_coordinate.
        origin : Tuple[float,]
            The origin of the coordinate system.

        Returns
        -------
        Tuple[NDArray, List[str], Tuple[int, ], float, Tuple[float,]]
            Returns positions, atom_types, shape, sampling_rate, and origin.
        """
        coordinates = self.atom_coordinate.copy()
        atom_types = self.element_symbol.copy()

        coordinates = coordinates
        sampling_rate = 1 if sampling_rate is None else sampling_rate
        adjust_origin = origin is not None and shape is None
        origin = coordinates.min(axis=0) if origin is None else origin
        positions = (coordinates - origin) / sampling_rate
        positions = np.rint(positions).astype(int)

        if adjust_origin:
            left_shift = positions.min(axis=0)
            positions -= left_shift
            shape = positions.max(axis=0) + 1
            origin = origin + np.multiply(left_shift, sampling_rate)

        if shape is None:
            shape = positions.max(axis=0) + 1

        valid_positions = np.sum(
            np.logical_and(positions < shape, positions >= 0), axis=1
        )

        positions = positions[valid_positions == positions.shape[1], :]
        atom_types = atom_types[valid_positions == positions.shape[1]]

        self.metadata["nAtoms_outOfBound"] = 0
        if positions.shape[0] != coordinates.shape[0]:
            out_of_bounds = coordinates.shape[0] - positions.shape[0]
            print(f"{out_of_bounds}/{coordinates.shape[0]} atoms were out of bounds.")
            self.metadata["nAtoms_outOfBound"] = out_of_bounds

        return positions, atom_types, shape, sampling_rate, origin

    def _position_to_vdw_sphere(
        self,
        positions: Tuple[float],
        atoms: Tuple[str],
        sampling_rate: Tuple[float],
        volume: NDArray,
    ) -> None:
        """
        Updates a volume with van der Waals spheres.

        Parameters
        ----------
        positions : Tuple[float, float, float]
            The positions of the atoms.
        atoms : Tuple[str]
            The types of the atoms.
        sampling_rate : float
            The desired sampling rate in unit of self.atom_coordinate of the
            output array.
        volume : NDArray
            The volume to update.
        """
        index_dict, vdw_rad, shape = {}, {}, volume.shape
        for atom_index, atom_position in enumerate(positions):
            atom_type = atoms[atom_index]
            if atom_type not in index_dict.keys():
                atom_vdwr = np.ceil(
                    np.divide(self._elements[atom_type].vdwr, (sampling_rate * 100))
                ).astype(int)

                vdw_rad[atom_type] = atom_vdwr
                atom_slice = tuple(slice(-k, k + 1) for k in atom_vdwr)
                distances = np.linalg.norm(
                    np.divide(
                        np.mgrid[atom_slice],
                        atom_vdwr.reshape((-1,) + (1,) * volume.ndim),
                    ),
                    axis=0,
                )
                index_dict[atom_type] = (distances <= 1).astype(volume.dtype)

            footprint = index_dict[atom_type]
            start = np.maximum(np.subtract(atom_position, vdw_rad[atom_type]), 0)
            stop = np.minimum(np.add(atom_position, vdw_rad[atom_type]) + 1, shape)
            volume_slice = tuple(slice(*coord) for coord in zip(start, stop))

            start_index = np.maximum(-np.subtract(atom_position, vdw_rad[atom_type]), 0)
            stop_index = np.add(
                footprint.shape,
                np.minimum(
                    np.subtract(shape, np.add(atom_position, vdw_rad[atom_type]) + 1), 0
                ),
            )
            index_slice = tuple(slice(*coord) for coord in zip(start_index, stop_index))
            volume[volume_slice] += footprint[index_slice]

    def _position_to_scattering_factors(
        self,
        positions: NDArray,
        atoms: NDArray,
        sampling_rate: NDArray,
        volume: NDArray,
        lowpass_filter: bool = True,
        downsampling_factor: float = 1.35,
        source: str = "peng1995",
    ) -> None:
        """
        Updates a volume with scattering factors.

        Parameters
        ----------
        positions : NDArray
            The positions of the atoms.
        atoms : NDArray
            Element symbols.
        sampling_rate : float
            Sampling rate that was used to convert coordinates to positions.
        volume : NDArray
            The volume to update.
        lowpass_filter : NDArray
            Whether the scattering factors should be lowpass filtered.
        downsampling_factor : NDArray
            Downsampling factor for scattering factor computation.
        source : str
            Which scattering factors to use

        Reference
        ---------
        https://github.com/I2PC/xmipp.
        """
        scattering_profiles, shape = dict(), volume.shape
        for atom_index, point in enumerate(positions):
            if atoms[atom_index] not in scattering_profiles:
                spline = atom_profile(
                    atom=atoms[atom_index],
                    M=downsampling_factor,
                    method=source,
                    lfilter=lowpass_filter,
                )
                scattering_profiles.update({atoms[atom_index]: spline})

            atomic_radius = np.divide(
                self._elements[atoms[atom_index]].vdwr, sampling_rate * 100
            )
            starts = np.maximum(np.ceil(point - atomic_radius), 0).astype(int)
            stops = np.minimum(np.floor(point + atomic_radius), shape).astype(int)

            grid_index = np.meshgrid(
                *[range(start, stop) for start, stop in zip(starts, stops)]
            )
            distances = np.einsum(
                "aijk->ijk",
                np.array([(grid_index[i] - point[i]) ** 2 for i in range(len(point))]),
                dtype=np.float64,
            )
            distances = np.sqrt(distances)
            if not len(distances):
                grid_index, distances = point, 0
            np.add.at(
                volume,
                tuple(grid_index),
                scattering_profiles[atoms[atom_index]](distances),
            )

    @staticmethod
    def _position_to_molmap(
        positions: NDArray,
        weights: Tuple[float],
        resolution: float = 4,
        sigma_factor: float = 1 / (np.pi * np.sqrt(2)),
        cutoff_value: float = 4.0,
        sampling_rate: float = None,
    ) -> NDArray:
        """
        Simulates electron densities analogous to Chimera's molmap function [1]_.

        Parameters
        ----------
        positions : NDArray
            Array containing atomic positions in x,y,z format (n,d).
        weights : tuple of float
            The weights to use for the entries in positions.
        resolution : float, optional
            The product of resolution and sigma_factor gives the sigma used to
            compute the discretized Gaussian.
        sigma_factor : float, optional
            The factor used with resolution to compute sigma. Default is 1 / (π√2).
        cutoff_value : float, optional
            The cutoff value for the Gaussian kernel. Default is 4.0.
        sampling_rate : float, optional
            Sampling rate along each dimension. One third of resolution by default.

        References
        ----------
        ..[1] https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/molmap.html

        Returns
        -------
        NDArray
            A numpy array containing the simulated electron densities.
        """
        if sampling_rate is None:
            sampling_rate = resolution / 3

        pad = int(3 * resolution)
        sigma = sigma_factor * resolution
        sigma_grid = sigma / sampling_rate

        # Limit padding to numerically stable values
        smax = np.max(sigma_grid)
        arr = np.arange(0, pad)
        gaussian = (
            np.exp(-0.5 * (arr / smax) ** 2)
            * np.power(2 * np.pi, -1.5)
            * np.power(sigma, -3.0)
        )
        pad_cutoff = np.max(arr[gaussian > 1e-8])
        if arr.size != 0:
            pad = int(pad_cutoff) + 1

        positions = positions
        origin = positions.min(axis=0) - pad * sampling_rate
        positions = np.rint(np.divide((positions - origin), sampling_rate)).astype(int)

        shape = positions.max(axis=0).astype(int) + pad + 1
        out = np.zeros(shape, dtype=np.float32)
        np.add.at(out, tuple(positions.T), weights)

        out = Preprocessor().gaussian_filter(
            template=out, sigma=sigma_grid, cutoff_value=cutoff_value
        )
        return out, origin

    def _get_atom_weights(
        self, atoms: Tuple[str] = None, weight_type: str = "atomic_weight"
    ) -> Tuple[float]:
        """
        Returns weights of individual atoms according to a specified weight type.

        Parameters
        ----------
        atoms : Tuple of strings, optional
            The atoms to get the weights for. If None, weights for all atoms
            are used. Default is None.

        weight_type : str, optional
            The type of weights to return. This can either be 'atomic_weight',
            'atomic_number', or 'van_der_waals_radius'. Default is 'atomic_weight'.

        Returns
        -------
        List[float]
            A list containing the weights of the atoms.
        """
        atoms = self.element_symbol if atoms is None else atoms
        match weight_type:
            case "atomic_weight":
                weight = [self._elements[atom].atomic_weight for atom in atoms]
            case "atomic_number":
                weight = [self._elements[atom].atomic_number for atom in atoms]
            case _:
                raise NotImplementedError(
                    "weight_type can either be 'atomic_weight' or 'atomic_number'"
                )
        return weight


[docs]
    def to_volume(
        self,
        shape: Tuple[int] = None,
        sampling_rate: Tuple[float] = None,
        origin: Tuple[float] = None,
        chain: str = None,
        weight_type: str = "atomic_weight",
        weight_type_args: Dict = dict(),
    ) -> Tuple[NDArray, NDArray, NDArray]:
        """
        Maps class instance to a volume.

        Parameters
        ----------
        shape : tuple of ints, optional
            Output array shape in (x,y,z) form.
        sampling_rate : tuple of float, optional
            Sampling rate of the output array in units of
            :py:attr:`Structure.atom_coordinate`
        origin : tuple of floats, optional
            Origin of the coordinate system in (x,y,z) form.
        chain : str, optional
            Chains to be included. Either single or comma separated string of chains.
            Defaults to None which returns all chains.
        weight_type : str, optional
            Weight given to individual atoms. Supported weights are:

            +----------------------------+---------------------------------------+
            | atomic_weight              | Using element weight point mass       |
            +----------------------------+---------------------------------------+
            | atomic_number              | Using atomic number point mass        |
            +----------------------------+---------------------------------------+
            | gaussian                   | Using element weighted Gaussian mass  |
            +----------------------------+---------------------------------------+
            | van_der_waals_radius       | Using binary van der waal spheres     |
            +----------------------------+---------------------------------------+
            | scattering_factors         | Using experimental scattering factors |
            +----------------------------+---------------------------------------+
            | lowpass_scattering_factors | Lowpass filtered scattering_factors   |
            +----------------------------+---------------------------------------+
        weight_type_args : dict, optional
            Additional arguments used for individual weight_types. `gaussian`
            accepts ``resolution``, `scattering` accepts ``method``.

        Returns
        -------
        Tuple[NDArray, NDArray, NDArray]
            Volume, origin and sampling_rate.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> vol, origin, sampling = structure.to_volume()
        >>> vol.shape, origin, sampling
        ((59, 35, 53), array([-30.71,  12.42, -27.15]), array([1., 1., 1.]))
        >>> vol, origin, sampling = structure.to_volume(sampling_rate=(2.2,1,3))
        ((27, 35, 18), array([-30.71,  12.42, -27.15]), array([2.2, 1. , 3. ]))

        ``sampling_rate`` and ``origin`` can be set to ensure correct alignment
        with corresponding density maps such as the ones at EMDB. Analogous to
        :py:meth:`Structure.subset_by_chain` only parts of the structure can be
        mapped onto grids using a variety of weighting schemes

        >>> structure.to_volume(weight_type="van_der_waals_radius")
        >>> structure.to_volume(
        >>>     weight_type="lowpass_scattering_factors",
        >>>     method_args={"source" : "dt1969", "downsampling_factor" : 1.35},
        >>> )
        """
        _weight_types = {
            "gaussian",
            "atomic_weight",
            "atomic_number",
            "van_der_waals_radius",
            "scattering_factors",
            "lowpass_scattering_factors",
        }
        _weight_string = ",".join([f"'{x}'" for x in _weight_types])
        if weight_type not in _weight_types:
            raise NotImplementedError(f"weight_type needs to be in {_weight_string}")

        if sampling_rate is None:
            sampling_rate = np.ones(self.atom_coordinate.shape[1])
        sampling_rate = np.array(sampling_rate)
        if sampling_rate.size == 1:
            sampling_rate = np.repeat(sampling_rate, self.atom_coordinate.shape[1])
        elif sampling_rate.size != self.atom_coordinate.shape[1]:
            raise ValueError(
                "sampling_rate should either be single value of array with"
                f"size {self.atom_coordinate.shape[1]}."
            )

        temp = self.subset_by_chain(chain=chain)
        positions, atoms, _shape, sampling_rate, origin = temp._coordinate_to_position(
            shape=shape, sampling_rate=sampling_rate, origin=origin
        )
        volume = np.zeros(_shape, dtype=np.float32)
        if weight_type in ("atomic_weight", "atomic_number"):
            weights = temp._get_atom_weights(atoms=atoms, weight_type=weight_type)
            np.add.at(volume, tuple(positions.T), weights)
        elif weight_type == "van_der_waals_radius":
            self._position_to_vdw_sphere(positions, atoms, sampling_rate, volume)
        elif weight_type == "scattering_factors":
            self._position_to_scattering_factors(
                positions,
                atoms,
                sampling_rate,
                volume,
                lowpass_filter=False,
                **weight_type_args,
            )
        elif weight_type == "lowpass_scattering_factors":
            self._position_to_scattering_factors(
                positions,
                atoms,
                sampling_rate,
                volume,
                lowpass_filter=True,
                **weight_type_args,
            )
        elif weight_type == "gaussian":
            volume, origin = self._position_to_molmap(
                positions=temp.atom_coordinate,
                weights=temp._get_atom_weights(
                    atoms=atoms, weight_type="atomic_number"
                ),
                sampling_rate=sampling_rate,
                **weight_type_args,
            )

        self.metadata.update(temp.metadata)
        return volume, origin, sampling_rate



[docs]
    @classmethod
    def compare_structures(
        cls,
        structure1: "Structure",
        structure2: "Structure",
        origin: NDArray = None,
        sampling_rate: float = None,
        weighted: bool = False,
    ) -> float:
        """
        Compute root mean square deviation (RMSD) between two structures with the
        same number of atoms.

        Parameters
        ----------
        structure1, structure2 : :py:class:`Structure`
            Structure instances to compare.
        origin : tuple of floats, optional
            Coordinate system origin. For computing RMSD on discretized grids.
        sampling_rate : tuple of floats, optional
            Sampling rate in units of :py:attr:`atom_coordinate`.
            For computing RMSD on discretized grids.
        weighted : bool, optional
            Whether atoms should be weighted acoording to their atomic weight.

        Returns
        -------
        float
            Root Mean Square Deviation between input structures.

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme.matching_utils import get_rotation_matrices
        >>> from tme import Structure
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> transformed = structure.rigid_transform(
        >>>     rotation_matrix = get_rotation_matrices(60)[2],
        >>>     translation = (0, 1, -5)
        >>> )
        >>> Structure.compare_structures(structure, transformed)
        31.35238
        >>> Structure.compare_structures(structure, structure)
        0.0
        """
        if origin is None:
            origin = np.zeros(structure1.atom_coordinate.shape[1])

        coordinates1 = structure1.atom_coordinate
        coordinates2 = structure2.atom_coordinate
        atoms1, atoms2 = structure1.element_symbol, structure2.element_symbol
        if sampling_rate is not None:
            coordinates1 = np.rint(
                np.divide(np.subtract(coordinates1, origin), sampling_rate)
            ).astype(int)
            coordinates2 = np.rint(
                np.divide(np.subtract(coordinates2, origin), sampling_rate)
            ).astype(int)

        weights1 = np.ones_like(structure1.atom_coordinate.shape[0])
        weights2 = np.ones_like(structure2.atom_coordinate.shape[0])
        if weighted:
            weights1 = np.array(structure1._get_atom_weights(atoms=atoms1))
            weights2 = np.array(structure2._get_atom_weights(atoms=atoms2))

        if not np.allclose(coordinates1.shape, coordinates2.shape):
            raise ValueError(
                "Input structures need to have the same number of coordinates."
            )
        if not np.allclose(weights1.shape, weights2.shape):
            raise ValueError("Input structures need to have the same number of atoms.")

        squared_diff = np.sum(np.square(coordinates1 - coordinates2), axis=1)
        weighted_quared_diff = squared_diff * ((weights1 + weights2) / 2)
        rmsd = np.sqrt(np.mean(weighted_quared_diff))

        return rmsd



[docs]
    @classmethod
    def align_structures(
        cls,
        structure1: "Structure",
        structure2: "Structure",
        origin: NDArray = None,
        sampling_rate: float = None,
        weighted: bool = False,
    ) -> Tuple["Structure", float]:
        """
        Align ``structure2`` to ``structure1`` using the Kabsch Algorithm. Both
        structures need to have the same number of atoms.

        Parameters
        ----------
        structure1, structure2 : :py:class:`Structure`
            Structure instances to align.
        origin : tuple of floats, optional
            Coordinate system origin. For computing RMSD on discretized grids.
        sampling_rate : tuple of floats, optional
            Sampling rate in units of :py:attr:`atom_coordinate`.
            For computing RMSD on discretized grids.
        weighted : bool, optional
            Whether atoms should be weighted by their atomic weight.

        Returns
        -------
        :py:class:`Structure`
            ``structure2`` aligned to ``structure1``.
        float
            Alignment RMSD

        Examples
        --------
        >>> from importlib_resources import files
        >>> from tme import Structure
        >>> from tme.matching_utils import get_rotation_matrices
        >>> fname = str(files("tests.data").joinpath("Structures/5khe.cif"))
        >>> structure = Structure.from_file(filename=fname)
        >>> transformed = structure.rigid_transform(
        >>>     rotation_matrix = get_rotation_matrices(60)[2],
        >>>     translation = (0, 1, -5)
        >>> )
        >>> aligned, rmsd = Structure.align_structures(structure, transformed)
        Initial RMSD: 31.07189 - Final RMSD: 0.00000
        """
        if origin is None:
            origin = np.minimum(
                structure1.atom_coordinate.min(axis=0),
                structure2.atom_coordinate.min(axis=0),
            ).astype(int)

        initial_rmsd = cls.compare_structures(
            structure1=structure1,
            structure2=structure2,
            origin=origin,
            sampling_rate=sampling_rate,
            weighted=weighted,
        )

        reference = structure1.atom_coordinate.copy()
        query = structure2.atom_coordinate.copy()
        if sampling_rate is not None:
            reference, atoms1, shape, _, _ = structure1._coordinate_to_position(
                shape=None, sampling_rate=sampling_rate, origin=origin
            )
            query, atoms2, shape, _, _ = structure2._coordinate_to_position(
                shape=None, sampling_rate=sampling_rate, origin=origin
            )

        reference_mean = reference.mean(axis=0)
        query_mean = query.mean(axis=0)

        reference = reference - reference_mean
        query = query - query_mean

        corr = np.dot(query.T, reference)
        U, S, Vh = np.linalg.svd(corr)

        rotation = np.dot(Vh.T, U.T).T
        if np.linalg.det(rotation) < 0:
            Vh[2, :] *= -1
            rotation = np.dot(Vh.T, U.T).T

        translation = reference_mean - np.dot(query_mean, rotation)

        temp = structure1.copy()
        temp.atom_coordinate = reference + reference_mean
        ret = structure2.copy()
        ret.atom_coordinate = np.dot(query + query_mean, rotation) + translation

        final_rmsd = cls.compare_structures(
            structure1=temp,
            structure2=ret,
            origin=origin,
            sampling_rate=None,
            weighted=weighted,
        )

        print(f"Initial RMSD: {initial_rmsd:.5f} - Final RMSD: {final_rmsd:.5f}")

        return ret, final_rmsd



[docs]
    def align_to_axis(
        self, coordinates: NDArray = None, axis: int = 2, flip: bool = False, **kwargs
    ):
        if coordinates is None:
            coordinates = self.atom_coordinate

        return align_to_axis(coordinates, axis=axis, flip=flip, **kwargs)




@dataclass(frozen=True, repr=True)
class _Elements:
    """Lookup table for chemical elements."""

    Atom = namedtuple(
        "Atom",
        [
            "atomic_number",
            "atomic_radius",
            "lattice_constant",
            "lattice_structure",
            "vdwr",
            "covalent_radius_bragg",
            "atomic_weight",
        ],
    )
    _default = Atom(0, 0, 0, "Atom does not exist in ressource.", 0, 0, 0)
    _elements = {
        "H": Atom(1, 25, 3.75, "HEX", 110, np.nan, 1.008),
        "HE": Atom(2, 120, 3.57, "HEX", 140, np.nan, 4.002602),
        "LI": Atom(3, 145, 3.49, "BCC", 182, 150, 6.94),
        "BE": Atom(4, 105, 2.29, "HEX", 153, 115, 9.0121831),
        "B": Atom(5, 85, 8.73, "TET", 192, np.nan, 10.81),
        "C": Atom(6, 70, 3.57, "DIA", 170, 77, 12.011),
        "N": Atom(7, 65, 4.039, "HEX", 155, 65, 14.007),
        "O": Atom(8, 60, 6.83, "CUB", 152, 65, 15.999),
        "F": Atom(9, 50, np.nan, "MCL", 147, 67, 18.998403163),
        "NE": Atom(10, 160, 4.43, "FCC", 154, np.nan, 20.1797),
        "NA": Atom(11, 180, 4.23, "BCC", 227, 177, 22.98976928),
        "MG": Atom(12, 150, 3.21, "HEX", 173, 142, 24.305),
        "AL": Atom(13, 125, 4.05, "FCC", 184, 135, 26.9815385),
        "SI": Atom(14, 110, 5.43, "DIA", 210, 117, 28.085),
        "P": Atom(15, 100, 7.17, "CUB", 180, np.nan, 30.973761998),
        "S": Atom(16, 100, 10.47, "ORC", 180, 102, 32.06),
        "CL": Atom(17, 100, 6.24, "ORC", 175, 105, 35.45),
        "AR": Atom(18, 71, 5.26, "FCC", 188, np.nan, 39.948),
        "K": Atom(19, 220, 5.23, "BCC", 275, 207, 39.0983),
        "CA": Atom(20, 180, 5.58, "FCC", 231, 170, 40.078),
        "SC": Atom(21, 160, 3.31, "HEX", 215, np.nan, 44.955908),
        "TI": Atom(22, 140, 2.95, "HEX", 211, 140, 47.867),
        "V": Atom(23, 135, 3.02, "BCC", 207, np.nan, 50.9415),
        "CR": Atom(24, 140, 2.88, "BCC", 206, 140, 51.9961),
        "MN": Atom(25, 140, 8.89, "CUB", 205, 147, 54.938044),
        "FE": Atom(26, 140, 2.87, "BCC", 204, 140, 55.845),
        "CO": Atom(27, 135, 2.51, "HEX", 200, 137, 58.933194),
        "NI": Atom(28, 135, 3.52, "FCC", 197, 135, 58.6934),
        "CU": Atom(29, 135, 3.61, "FCC", 196, 137, 63.546),
        "ZN": Atom(30, 135, 2.66, "HEX", 201, 132, 65.38),
        "GA": Atom(31, 130, 4.51, "ORC", 187, np.nan, 69.723),
        "GE": Atom(32, 125, 5.66, "DIA", 211, np.nan, 72.63),
        "AS": Atom(33, 115, 4.13, "RHL", 185, 126, 74.921595),
        "SE": Atom(34, 115, 4.36, "HEX", 190, 117, 78.971),
        "BR": Atom(35, 115, 6.67, "ORC", 185, 119, 79.904),
        "KR": Atom(36, np.nan, 5.72, "FCC", 202, np.nan, 83.798),
        "RB": Atom(37, 235, 5.59, "BCC", 303, 225, 85.4678),
        "SR": Atom(38, 200, 6.08, "FCC", 249, 195, 87.62),
        "Y": Atom(39, 180, 3.65, "HEX", 232, np.nan, 88.90584),
        "ZR": Atom(40, 155, 3.23, "HEX", 223, np.nan, 91.224),
        "NB": Atom(41, 145, 3.3, "BCC", 218, np.nan, 92.90637),
        "MO": Atom(42, 145, 3.15, "BCC", 217, np.nan, 95.95),
        "TC": Atom(43, 135, 2.74, "HEX", 216, np.nan, 97.90721),
        "RU": Atom(44, 130, 2.7, "HEX", 213, np.nan, 101.07),
        "RH": Atom(45, 135, 3.8, "FCC", 210, np.nan, 102.9055),
        "PD": Atom(46, 140, 3.89, "FCC", 210, np.nan, 106.42),
        "AG": Atom(47, 160, 4.09, "FCC", 211, 177, 107.8682),
        "CD": Atom(48, 155, 2.98, "HEX", 218, 160, 112.414),
        "IN": Atom(49, 155, 4.59, "TET", 193, np.nan, 114.818),
        "SN": Atom(50, 145, 5.82, "TET", 217, 140, 118.71),
        "SB": Atom(51, 145, 4.51, "RHL", 206, 140, 121.76),
        "TE": Atom(52, 140, 4.45, "HEX", 206, 133, 127.6),
        "I": Atom(53, 140, 7.72, "ORC", 198, 140, 126.90447),
        "XE": Atom(54, np.nan, 6.2, "FCC", 216, np.nan, 131.293),
        "CS": Atom(55, 260, 6.05, "BCC", 343, 237, 132.90545196),
        "BA": Atom(56, 215, 5.02, "BCC", 268, 210, 137.327),
        "LA": Atom(57, 195, 3.75, "HEX", 243, np.nan, 138.90547),
        "CE": Atom(58, 185, 5.16, "FCC", 242, np.nan, 140.116),
        "PR": Atom(59, 185, 3.67, "HEX", 240, np.nan, 140.90766),
        "ND": Atom(60, 185, 3.66, "HEX", 239, np.nan, 144.242),
        "PM": Atom(61, 185, np.nan, "", 238, np.nan, 144.91276),
        "SM": Atom(62, 185, 9, "RHL", 236, np.nan, 150.36),
        "EU": Atom(63, 185, 4.61, "BCC", 235, np.nan, 151.964),
        "GD": Atom(64, 180, 3.64, "HEX", 234, np.nan, 157.25),
        "TB": Atom(65, 175, 3.6, "HEX", 233, np.nan, 158.92535),
        "DY": Atom(66, 175, 3.59, "HEX", 231, np.nan, 162.5),
        "HO": Atom(67, 175, 3.58, "HEX", 230, np.nan, 164.93033),
        "ER": Atom(68, 175, 3.56, "HEX", 229, np.nan, 167.259),
        "TM": Atom(69, 175, 3.54, "HEX", 227, np.nan, 168.93422),
        "YB": Atom(70, 175, 5.49, "FCC", 226, np.nan, 173.045),
        "LU": Atom(71, 175, 3.51, "HEX", 224, np.nan, 174.9668),
        "HF": Atom(72, 155, 3.2, "HEX", 223, np.nan, 178.49),
        "TA": Atom(73, 145, 3.31, "BCC", 222, np.nan, 180.94788),
        "W": Atom(74, 135, 3.16, "BCC", 218, np.nan, 183.84),
        "RE": Atom(75, 135, 2.76, "HEX", 216, np.nan, 186.207),
        "OS": Atom(76, 130, 2.74, "HEX", 216, np.nan, 190.23),
        "IR": Atom(77, 135, 3.84, "FCC", 213, np.nan, 192.217),
        "PT": Atom(78, 135, 3.92, "FCC", 213, np.nan, 195.084),
        "AU": Atom(79, 135, 4.08, "FCC", 214, np.nan, 196.966569),
        "HG": Atom(80, 150, 2.99, "RHL", 223, np.nan, 200.592),
        "TL": Atom(81, 190, 3.46, "HEX", 196, 190, 204.38),
        "PB": Atom(82, 180, 4.95, "FCC", 202, np.nan, 207.2),
        "BI": Atom(83, 160, 4.75, "RHL", 207, 148, 208.9804),
        "PO": Atom(84, 190, 3.35, "SC", 197, np.nan, 209),
        "AT": Atom(85, np.nan, np.nan, "", 202, np.nan, 210),
        "RN": Atom(86, np.nan, np.nan, "FCC", 220, np.nan, 222),
        "FR": Atom(87, np.nan, np.nan, "BCC", 348, np.nan, 223),
        "RA": Atom(88, 215, np.nan, "", 283, np.nan, 226),
        "AC": Atom(89, 195, 5.31, "FCC", 247, np.nan, 227),
        "TH": Atom(90, 180, 5.08, "FCC", 245, np.nan, 232.0377),
        "PA": Atom(91, 180, 3.92, "TET", 243, np.nan, 231.03588),
        "U": Atom(92, 175, 2.85, "ORC", 241, np.nan, 238.02891),
        "NP": Atom(93, 175, 4.72, "ORC", 239, np.nan, 237),
        "PU": Atom(94, 175, np.nan, "MCL", 243, np.nan, 244),
        "AM": Atom(95, 175, np.nan, "", 244, np.nan, 243),
        "CM": Atom(96, np.nan, np.nan, "", 245, np.nan, 247),
        "BK": Atom(97, np.nan, np.nan, "", 244, np.nan, 247),
        "CF": Atom(98, np.nan, np.nan, "", 245, np.nan, 251),
        "ES": Atom(99, np.nan, np.nan, "", 245, np.nan, 252),
        "FM": Atom(100, np.nan, np.nan, "", 245, np.nan, 257),
        "MD": Atom(101, np.nan, np.nan, "", 246, np.nan, 258),
        "NO": Atom(102, np.nan, np.nan, "", 246, np.nan, 259),
        "LR": Atom(103, np.nan, np.nan, "", 246, np.nan, 262),
        "RF": Atom(104, np.nan, np.nan, "", np.nan, np.nan, 267),
        "DB": Atom(105, np.nan, np.nan, "", np.nan, np.nan, 268),
        "SG": Atom(106, np.nan, np.nan, "", np.nan, np.nan, 271),
        "BH": Atom(107, np.nan, np.nan, "", np.nan, np.nan, 274),
        "HS": Atom(108, np.nan, np.nan, "", np.nan, np.nan, 269),
        "MT": Atom(109, np.nan, np.nan, "", np.nan, np.nan, 276),
        "DS": Atom(110, np.nan, np.nan, "", np.nan, np.nan, 281),
        "RG": Atom(111, np.nan, np.nan, "", np.nan, np.nan, 281),
        "CN": Atom(112, np.nan, np.nan, "", np.nan, np.nan, 285),
        "NH": Atom(113, np.nan, np.nan, "", np.nan, np.nan, 286),
        "FL": Atom(114, np.nan, np.nan, "", np.nan, np.nan, 289),
        "MC": Atom(115, np.nan, np.nan, "", np.nan, np.nan, 288),
        "LV": Atom(116, np.nan, np.nan, "", np.nan, np.nan, 293),
        "TS": Atom(117, np.nan, np.nan, "", np.nan, np.nan, 294),
        "OG": Atom(118, np.nan, np.nan, "", np.nan, np.nan, 294),
    }

    def __getitem__(self, key: str):
        """
        Retrieve a value from the internal data using a given key.

        Parameters
        ----------
        key : str
            Key to retrieve the corresponding value for.

        Returns
        -------
        namedtuple
            The Atom tuple associated with the provided key.
        """
        return self._elements.get(key, self._default)


def _format_string(string: str) -> str:
    """
    Formats a string by adding quotation marks if it contains white spaces.

    Parameters
    ----------
    string : str
        Input string to be formatted.

    Returns
    -------
    str
        Formatted string with added quotation marks if needed.
    """
    if " " in string:
        return f"'{string}'"
    # Occurs e.g. for C1' atoms. The trailing whitespace is necessary.
    if string.count("'") == 1:
        return f'"{string}"'
    return string