Source code for tme.parser

"""
Implements parsers for atomic structure file formats.

Copyright (c) 2023 European Molecular Biology Laboratory

Author: Valentin Maurer <valentin.maurer@embl-hamburg.de>
"""

import re
import xml.etree.ElementTree as ET

from collections import deque
from abc import ABC, abstractmethod
from typing import List, Dict, Union


import numpy as np

__all__ = [
    "PDBParser",
    "MMCIFParser",
    "GROParser",
    "StarParser",
    "XMLParser",
    "MDOCParser",
]



[docs]
class Parser(ABC):
    """
    Base class for structure file parsers.

    Classes inheriting from :py:class:`Parser` need to define a function
    :py:meth:`Parser.parse_input` that creates a dictionary representation
    of the given file. The input is a deque of all lines in the file.
    """

    def __init__(self, filename: str, mode: str = "r", **kwargs) -> None:
        """
        Initialize a Parser object.

        Parameters
        ----------
        filename : str
            File name to parse data from.
        mode : str, optional
            Mode to open the file. Default is 'r' for read.
        kwargs : Dict, optional
            Optional keyword arguments passed to the child's parse_input method.

        """
        try:
            with open(filename, mode) as infile:
                data = infile.read()
        except UnicodeDecodeError:
            with open(filename, mode, encoding="utf-16") as infile:
                data = infile.read()

        data = deque(filter(lambda line: line and line[0] != "#", data.split("\n")))
        self._data = self.parse_input(data, **kwargs)

    def __getitem__(self, key: str):
        """
        Retrieve a value from the internal data using a given key.

        Parameters
        ----------
        key : str
            The key to use for retrieving the corresponding value from
            the internal data.

        Returns
        -------
        value
            The value associated with the provided key in the internal data.
        """
        return self._data[key]

    def __contains__(self, key) -> bool:
        """
        Check if a given key exists in the internal data.

        Parameters
        ----------
        key : str
            The key to check for in the internal data.

        Returns
        -------
        bool
            True if the key exists in the internal data, False otherwise.
        """
        return key in self._data

    def __repr__(self) -> str:
        """
        String representation of the Parser showing available keys and their lengths.

        Returns
        -------
        str
            A formatted string showing each key and the length of its value.
        """
        if not self._data:
            return f"{self.__class__.__name__}(empty)"

        lines = [f"{self.__class__.__name__}:"]
        try:
            for key, value in sorted(self._data.items()):
                if isinstance(value, (list, tuple)):
                    lines.append(f"  {key}: length {len(value)}")
                elif isinstance(value, dict):
                    lines.append(f"  {key}: dict with {len(value)} keys")
                elif isinstance(value, str):
                    lines.append(f"  {key}: str")
                else:
                    lines.append(f"  {key}: {type(value).__name__}")
        except Exception:
            pass

        return "\n".join(lines)


[docs]
    def get(self, key, default=None):
        """
        Retrieve a value from the internal data using a given key. If the
        key does not exist, return a default value.

        Parameters
        ----------
        key : str
            The key to use for retrieving the corresponding value from
             the internal data.
        default : Any
            The value to return if the key does not exist in the
            internal data, defaults to None.

        Returns
        -------
        value
            The value associated with the provided key in the internal data,
            or the default value if the key does not exist.
        """
        return self._data.get(key, default)



[docs]
    def keys(self):
        """
        List keys available in internal dictionary.
        """
        return self._data.keys()



[docs]
    def values(self):
        """
        List values available in internal dictionary.
        """
        return self._data.values()



[docs]
    def items(self):
        """
        List items available in internal dictionary.
        """
        return self._data.items()



[docs]
    @abstractmethod
    def parse_input(self, lines: List[str]) -> Dict:
        """
        Parse a list of lines from a file and convert the data into a dictionary.

        This function is not intended to be called directly, but should rather be
        defined by classes inheriting from :py:class:`Parser` to parse a given
        file format.

        Parameters
        ----------
        lines : list of str
            The lines of a structure file to parse.

        Returns
        -------
        dict
            A dictionary containing the parsed data.
        """





[docs]
class PDBParser(Parser):
    """
    Convert PDB file data into a dictionary representation [1]_.

    References
    ----------
    .. [1]  https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
    """


[docs]
    def parse_input(self, lines: deque) -> Dict:
        """
        Parse a list of lines from a PDB file and convert the data into a dictionary.

        Parameters
        ----------
        lines : deque of str
            The lines of a PDB file to parse.

        Returns
        -------
        dict
            A dictionary containing the parsed data from the PDB file.
        """
        metadata = {
            "resolution": re.compile(
                r"(.)+?(EFFECTIVE RESOLUTION\s+\(ANGSTROMS\)){1}(.)+?(\d+\.\d+)(\s)*$"
            ),
            "reconstruction_method": re.compile(
                r"(.)+?(RECONSTRUCTION METHOD)+(.)+?(\w+\s*\w+)(\s)*$"
            ),
            "electron_source": re.compile(r"(.)+?(SOURCE)+(.)+?(\w+\s*\w+)(\s)*$"),
            "illumination_mode": re.compile(
                r"(.)+?(ILLUMINATION MODE)+(.)+?(\w+\s*\w+)(\s)*$"
            ),
            "microscope_mode": re.compile(
                r"(.)+?(IMAGING MODE)+(.)+?(\w+\s*\w+)(\s)*$"
            ),
            "microscope_model": re.compile(
                r"(.)+?(MICROSCOPE MODEL)+(.+?:\s+)+?(.+)(\s)*$"
            ),
        }

        data = {
            "record_type": [],
            "atom_serial_number": [],
            "atom_name": [],
            "alternate_location_indicator": [],
            "residue_name": [],
            "chain_identifier": [],
            "residue_sequence_number": [],
            "code_for_residue_insertion": [],
            "atom_coordinate": [],
            "occupancy": [],
            "temperature_factor": [],
            "segment_identifier": [],
            "element_symbol": [],
            "charge": [],
            "details": {},
        }
        data["details"]["resolution"] = np.nan

        for line in lines:
            if line.startswith("REMARK"):
                matches = [(key, metadata[key].match(line)) for key in metadata]
                matches = [match for match in matches if match[1]]
                for key, match in matches:
                    data["details"][key] = match.group(4)
                    _ = metadata.pop(key)
            elif line.startswith("ATOM") or line.startswith("HETATM"):
                data["record_type"].append(line[0:6])
                data["atom_serial_number"].append(line[6:11])
                data["atom_name"].append(line[12:16])
                data["alternate_location_indicator"].append(line[16])
                data["residue_name"].append(line[17:20])

                data["chain_identifier"].append(line[21])
                data["residue_sequence_number"].append(line[22:26])
                data["code_for_residue_insertion"].append(line[26])
                data["atom_coordinate"].append((line[30:38], line[38:46], line[46:54]))
                data["occupancy"].append(line[54:60])
                data["temperature_factor"].append(line[60:66])
                data["segment_identifier"].append(line[74:76])
                data["element_symbol"].append(line[76:78])
                data["charge"].append(line[78:80])

        data["details"]["resolution"] = float(data["details"]["resolution"])

        return data





[docs]
class MMCIFParser(Parser):
    """
    Convert MMCIF file data into a dictionary representation. This implementation
    heavily relies on the atomium library [1]_.

    References
    ----------
    .. [1]  Ireland, S. M., & Martin, A. C. R. (2020). atomium (Version 1.0.0)
            [Computer software]. https://doi.org/10.1093/bioinformatics/btaa072
    """


[docs]
    def parse_input(self, lines: deque) -> Dict:
        lines = self._consolidate_strings(lines)
        blocks = self._split_in_blocks(lines)
        mmcif_dict = {}
        for block in blocks:
            if block["lines"][0] == "loop_":
                mmcif_dict[block["category"]] = self._loop_block_to_dict(block)
            else:
                mmcif_dict[block["category"]] = self._non_loop_block_to_dict(block)
        return mmcif_dict


    @staticmethod
    def _consolidate_strings(lines: List[str]) -> List[str]:
        """
        Consolidate multi-line strings that have been separated by semicolons in a
        list of strings.

        Parameters
        ----------
        lines : deque of str
            Deque of strings where each string is a line from an MMCIF file.

        Returns
        -------
        deque of str
            A deque of consolidated strings from the given input.
        """
        new_lines = deque()
        while lines:
            line = lines.popleft()
            if line.startswith(";"):
                string = [line[1:].strip()]
                while not lines[0].startswith(";"):
                    string.append(lines.popleft())
                lines.popleft()
                new_lines[-1] += ' "{}"'.format(
                    " ".join(string).replace('"', "").replace("'", "'")
                )
            else:
                new_lines.append(line.replace('"', "").replace("'", "'"))
        return new_lines

    @staticmethod
    def _split_in_blocks(lines: List[str]) -> List[Dict]:
        """
        Split a deque of consolidated strings into a list of dictionaries,
        each representing a block of data.

        Parameters
        ----------
        lines : deque of str
            Deque of consolidated strings where each string is a line from
            an MMCIF file.

        Returns
        -------
        list of dict
            A list of dictionaries where each dictionary represents a block
            of data from the MMCIF file.
        """
        category = None
        block, blocks = [], []
        while lines:
            line = lines.popleft()
            if line.startswith("data_"):
                continue
            if line.startswith("_"):
                line_category = line.split(".")[0]
                if line_category != category:
                    if category:
                        blocks.append({"category": category[1:], "lines": block})
                    category = line_category
                    block = []
            if line.startswith("loop_"):
                if category:
                    blocks.append({"category": category[1:], "lines": block})
                category = lines[0].split(".")[0]
                block = []
            block.append(line)
        if block:
            blocks.append({"category": category[1:], "lines": block})
        return blocks

    @staticmethod
    def _non_loop_block_to_dict(block: Dict) -> Dict:
        """
        Convert a non-loop block of data into a dictionary.

        Parameters
        ----------
        block : dict
            A dictionary representing a non-loop block of data from an MMCIF file.

        Returns
        -------
        dict
            A dictionary representing the parsed data from the given non-loop block.
        """
        d = {}
        # category = block["lines"][0].split(".")[0]
        for index in range(len(block["lines"]) - 1):
            if block["lines"][index + 1][0] != "_":
                block["lines"][index] += " " + block["lines"][index + 1]
        block["lines"] = [line for line in block["lines"] if line[0] == "_"]
        for line in block["lines"]:
            name = line.split(".")[1].split()[0]
            value = " ".join(line.split()[1:])
            d[name] = value
        return d

    def _loop_block_to_dict(self, block: Dict) -> Dict:
        """
        Convert a loop block of data into a dictionary.

        Parameters
        ----------
        block : dict
            A dictionary representing a loop block of data from an MMCIF file.

        Returns
        -------
        dict
            A dictionary representing the parsed data from the given loop block.
        """
        names, lines = [], []
        body_start = 0
        for index, line in enumerate(block["lines"][1:], start=1):
            if not line.startswith("_" + block["category"]):
                body_start = index
                break
        names = [line.split(".")[1].rstrip() for line in block["lines"][1:body_start]]
        lines = [self._split_line(line) for line in block["lines"][body_start:]]
        # reunites broken lines
        for n in range(len(lines) - 1):
            while n < len(lines) - 1 and len(lines[n]) + len(lines[n + 1]) <= len(
                names
            ):
                lines[n] += lines.pop(n + 1)
        res = {name: [] for name in names}
        for line in lines:
            for name, value in zip(names, line):
                res[name].append(value)
        return res

    @staticmethod
    def _split_line(line: str) -> List[str]:
        """
        Split a string into substrings, ignoring quotation marks within the string.

        Parameters
        ----------
        line : str
            The string to be split.

        Returns
        -------
        list of str
            A list of substrings resulting from the split operation on the given string.
        """
        if not re.search("['\"]", line):
            return line.split()

        chars = deque(line.strip())
        values, value, in_string = [], [], False
        while chars:
            char = chars.popleft()
            if char == " " and not in_string:
                values.append("".join(value))
                value = []
            elif char == '"':
                in_string = not in_string
                value.append(char)
            else:
                value.append(char)

        values.append(value)
        return ["".join(v) for v in values if v]



class GROParser(Parser):
    """
    Convert GRO file in Gromos87 format into a dictionary representation [1]_.

    References
    ----------
    .. [1] https://manual.gromacs.org/archive/5.0.4/online/gro.html
    """

    def parse_input(self, lines, **kwargs) -> Dict:
        data = {
            "title": [],
            "num_atoms": [],
            "record_type": [],
            "residue_number": [],
            "residue_name": [],
            "atom_name": [],
            "atom_number": [],
            "atom_coordinate": [],
            "segment_identifier": [],
            "velocity": [],
            "box_vectors": [],
            "time": [],
        }

        if not lines:
            return data

        time_pattern = re.compile(r"t=\s*(\d+\.?\d*)")
        file_index = -1

        # GRO files can be concatenated. Parse one per invocation
        while lines:
            file_index += 1
            str_file_index = str(file_index)

            title = lines.popleft()
            data["title"].append(title)

            time_match = time_pattern.search(title)
            if time_match:
                data["time"].append(float(time_match.group(1)))

            try:
                num_atoms = int(lines.popleft())
                data["num_atoms"].append(num_atoms)
            except (ValueError, IndexError):
                return data

            if num_atoms <= 0:
                continue

            valid_atoms = 0
            for _ in range(num_atoms):
                if not lines:
                    break

                line = lines.popleft()

                try:
                    res_num = int(line[:5])
                    res_name = line[5:10].strip()
                    atom_name = line[10:15].strip()
                    atom_num = int(line[15:20])

                    coord = (float(line[20:28]), float(line[28:36]), float(line[36:44]))

                    vel = None
                    if len(line) >= 68:
                        vel = (
                            float(line[44:52]),
                            float(line[52:60]),
                            float(line[60:68]),
                        )
                except (ValueError, IndexError):
                    continue

                valid_atoms += 1
                data["residue_number"].append(res_num)
                data["residue_name"].append(res_name)
                data["atom_name"].append(atom_name)
                data["atom_number"].append(atom_num)
                data["atom_coordinate"].append(coord)
                data["velocity"].append(vel)

            data["segment_identifier"].extend([str_file_index] * valid_atoms)
            data["record_type"].extend(["ATOM"] * valid_atoms)

            if lines:
                box_line = lines.popleft()
                try:
                    data["box_vectors"].append([float(val) for val in box_line.split()])
                except ValueError:
                    pass

        return data


class StarParser(MMCIFParser):
    """
    Convert STAR file data into a dictionary representation [1]_.

    References
    ----------
    .. [1]  https://www.iucr.org/__data/assets/file/0013/11416/star.5.html
    """

    def parse_input(self, lines: List[str], delimiter: str = None) -> Dict:
        pattern = re.compile(r"\s*#.*")

        ret, category, block = {}, None, []
        while lines:
            line = lines.popleft()

            if line.startswith("data") and not line.startswith("_"):
                if category != line and category is not None:
                    headers = list(ret[category].keys())
                    headers = [pattern.sub("", x) for x in headers]
                    ret[category] = {
                        header: list(column)
                        for header, column in zip(headers, zip(*block))
                    }
                    block.clear()
                category = line
                if category not in ret:
                    ret[category] = {}
                continue

            if line.startswith("_"):
                ret[category][line] = []
                continue

            if line.startswith("loop"):
                continue

            line_split = line.split(delimiter)
            if len(line_split):
                block.append(line_split)

        headers = list(ret[category].keys())
        headers = [pattern.sub("", x) for x in headers]
        ret[category] = {
            header: list(column) for header, column in zip(headers, zip(*block))
        }
        return ret


class XMLParser(Parser):
    """
    Parser for XML files.
    """

    def parse_input(self, lines: deque, **kwargs) -> Dict:
        root = ET.fromstring("\n".join(lines))
        return self._element_to_dict(root)

    def _element_to_dict(self, element) -> Dict:
        """
        Convert an XML element and its children to a dictionary.

        Parameters
        ----------
        element : xml.etree.ElementTree.Element
            The XML element to convert.

        Returns
        -------
        Dict
            Dictionary representation of the element.
        """
        result = {}

        if element.attrib:
            result["@attributes"] = {
                k: self._convert_value(v) for k, v in element.attrib.items()
            }

        children = list(element)
        if not children:
            if element.text and element.text.strip():
                text = element.text.strip()
                if "\n" in text:
                    result = [
                        self._convert_value(line.strip())
                        for line in text.split("\n")
                        if line.strip()
                    ]
                else:
                    result = self._convert_value(text)
        else:
            for child in children:
                child_dict = self._element_to_dict(child)

                if child.tag not in result:
                    result[child.tag] = child_dict
                else:
                    if not isinstance(result[child.tag], list):
                        result[child.tag] = [result[child.tag]]
                    result[child.tag].append(child_dict)

        return result

    def _convert_value(self, value_str: str) -> Union[int, float, bool, str]:
        """
        Convert a string value to an appropriate data type.

        Parameters
        ----------
        value_str : str
            String value to convert.

        Returns
        -------
        Union[int, float, bool, str]
            Converted value in appropriate data type.
        """
        if value_str.lower() in ("true", "false"):
            return value_str.lower() == "true"

        try:
            return int(value_str)
        except ValueError:
            pass

        try:
            return float(value_str)
        except ValueError:
            pass

        return value_str


class MDOCParser(Parser):
    """
    Convert MDOC file (SerialEM metadata) into a dictionary representation.

    MDOC files contain global parameters and per-tilt metadata for cryo-ET
    tilt series, with sections marked by [ZValue = N] for individual tilts.
    """

    def parse_input(self, lines: deque, **kwargs) -> Dict:
        data = {}
        global_params = {}
        in_zvalue_section = False
        zvalue_pattern = re.compile(r"\[ZValue\s*=\s*(\d+)\]")
        section_pattern = re.compile(r"\[T\s*=\s*(.*?)\]")

        if not lines:
            return data

        while lines:
            line = lines.popleft().strip()

            if not line:
                continue

            # Check for ZValue section header
            zvalue_match = zvalue_pattern.match(line)
            if zvalue_match:
                in_zvalue_section = True

                zvalue = int(zvalue_match.group(1))
                if "ZValue" not in data:
                    data["ZValue"] = []
                data["ZValue"].append(zvalue)
                continue

            # Check for T section header (comments/metadata)
            section_match = section_pattern.match(line)
            if section_match:
                section_content = section_match.group(1)
                if "sections" not in global_params:
                    global_params["sections"] = []
                global_params["sections"].append(section_content)
                continue

            # Parse key-value pairs
            if "=" in line:
                try:
                    key, value = line.split("=", 1)
                    key = key.strip()
                    value = value.strip()

                    try:
                        if "." not in value and "e" not in value.lower():
                            parsed_value = int(value)
                        else:
                            parsed_value = float(value)
                    except ValueError:
                        parsed_value = value

                    if not in_zvalue_section:
                        global_params[key] = parsed_value
                    else:
                        if key not in data:
                            data[key] = []
                        data[key].append(parsed_value)

                except ValueError:
                    continue

        data.update(global_params)
        return data