""" Implements parsers for atomic structure file formats.
Copyright (c) 2023 European Molecular Biology Laboratory
Author: Valentin Maurer <valentin.maurer@embl-hamburg.de>
"""
import re
import xml.etree.ElementTree as ET
from collections import deque
from abc import ABC, abstractmethod
from typing import List, Dict, Union
import numpy as np
__all__ = ["PDBParser", "MMCIFParser", "GROParser", "StarParser", "XMLParser"]
class Parser(ABC):
    """
    Base class for structure file parsers.

    Classes inheriting from :py:class:`Parser` need to define a function
    :py:meth:`Parser.parse_input` that creates a dictionary representation
    of the given file. The input is a deque of all lines in the file.
    """

    def __init__(self, filename: str, mode: str = "r", **kwargs) -> None:
        """
        Initialize a Parser object.

        Parameters
        ----------
        filename : str
            File name to parse data from.
        mode : str, optional
            Mode to open the file. Default is 'r' for read.
        kwargs : Dict, optional
            Optional keyword arguments passed to the child's parse_input method.
        """
        try:
            with open(filename, mode) as infile:
                data = infile.read()
        except UnicodeDecodeError:
            # Fall back to UTF-16 for files exported by tools that do not
            # write UTF-8 (raised on the first read of such a file).
            with open(filename, mode, encoding="utf-16") as infile:
                data = infile.read()

        # Strip empty lines and '#' comment lines before handing the deque
        # of remaining lines to the subclass parser.
        data = deque(filter(lambda line: line and line[0] != "#", data.split("\n")))
        self._data = self.parse_input(data, **kwargs)

    @abstractmethod
    def parse_input(self, lines: deque, **kwargs) -> Dict:
        """
        Create a dictionary representation of the given file lines.

        Parameters
        ----------
        lines : deque of str
            All non-empty, non-comment lines of the file.
        kwargs : Dict, optional
            Optional keyword arguments.

        Returns
        -------
        dict
            Dictionary representation of the file.
        """

    def __getitem__(self, key: str):
        """
        Retrieve a value from the internal data using a given key.

        Parameters
        ----------
        key : str
            The key to use for retrieving the corresponding value from
            the internal data.

        Returns
        -------
        value
            The value associated with the provided key in the internal data.

        Raises
        ------
        KeyError
            If the key does not exist in the internal data.
        """
        return self._data[key]

    def __contains__(self, key) -> bool:
        """
        Check if a given key exists in the internal data.

        Parameters
        ----------
        key : str
            The key to check for in the internal data.

        Returns
        -------
        bool
            True if the key exists in the internal data, False otherwise.
        """
        return key in self._data

    def get(self, key, default=None):
        """
        Retrieve a value from the internal data using a given key. If the
        key does not exist, return a default value.

        Parameters
        ----------
        key : str
            The key to use for retrieving the corresponding value from
            the internal data.
        default : Any
            The value to return if the key does not exist in the
            internal data, defaults to None.

        Returns
        -------
        value
            The value associated with the provided key in the internal data,
            or the default value if the key does not exist.
        """
        return self._data.get(key, default)

    def keys(self):
        """
        List keys available in internal dictionary.
        """
        return self._data.keys()

    def values(self):
        """
        List values available in internal dictionary.
        """
        return self._data.values()

    def items(self):
        """
        List items available in internal dictionary.
        """
        return self._data.items()
class PDBParser(Parser):
    """
    Convert PDB file data into a dictionary representation [1]_.

    References
    ----------
    .. [1] https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
    """

    # NOTE(review): no ``parse_input`` override is visible in this class body;
    # :py:class:`Parser` requires one, so confirm it is defined elsewhere
    # before instantiating this class.
class MMCIFParser(Parser):
    """
    Convert MMCIF file data into a dictionary representation. This implementation
    heavily relies on the atomium library [1]_.

    References
    ----------
    .. [1] Ireland, S. M., & Martin, A. C. R. (2020). atomium (Version 1.0.0)
       [Computer software]. https://doi.org/10.1093/bioinformatics/btaa072
    """

    @staticmethod
    def _consolidate_strings(lines: List[str]) -> List[str]:
        """
        Consolidate multi-line strings that have been separated by semicolons in a
        list of strings.

        Parameters
        ----------
        lines : deque of str
            Deque of strings where each string is a line from an MMCIF file.

        Returns
        -------
        deque of str
            A deque of consolidated strings from the given input.
        """
        new_lines = deque()
        while lines:
            line = lines.popleft()
            if line.startswith(";"):
                # Semicolon-delimited multi-line value: gather every line up
                # to the closing ';' and fold it, double-quoted, onto the
                # preceding (item) line.
                string = [line[1:].strip()]
                while not lines[0].startswith(";"):
                    string.append(lines.popleft())
                # Discard the closing ';' delimiter line.
                lines.popleft()
                # NOTE(review): .replace("'", "'") is a no-op as written —
                # possibly a transcoded character from the original source;
                # confirm the intended replacement.
                new_lines[-1] += ' "{}"'.format(
                    " ".join(string).replace('"', "").replace("'", "'")
                )
            else:
                new_lines.append(line.replace('"', "").replace("'", "'"))
        return new_lines

    @staticmethod
    def _split_in_blocks(lines: List[str]) -> List[Dict]:
        """
        Split a deque of consolidated strings into a list of dictionaries,
        each representing a block of data.

        Parameters
        ----------
        lines : deque of str
            Deque of consolidated strings where each string is a line from
            an MMCIF file.

        Returns
        -------
        list of dict
            A list of dictionaries where each dictionary represents a block
            of data from the MMCIF file.
        """
        category = None
        block, blocks = [], []
        while lines:
            line = lines.popleft()
            # 'data_' lines only name the data block; they carry no items.
            if line.startswith("data_"):
                continue
            if line.startswith("_"):
                # Item line: the text before '.' is the category. A category
                # change closes the block accumulated so far.
                line_category = line.split(".")[0]
                if line_category != category:
                    if category:
                        # category[1:] strips the leading underscore.
                        blocks.append({"category": category[1:], "lines": block})
                    category = line_category
                    block = []
            if line.startswith("loop_"):
                # A loop header also closes the current block; the loop's own
                # category comes from the first item line that follows it.
                if category:
                    blocks.append({"category": category[1:], "lines": block})
                category = lines[0].split(".")[0]
                block = []
            block.append(line)
        if block:
            # Flush the trailing block after the deque is exhausted.
            blocks.append({"category": category[1:], "lines": block})
        return blocks

    @staticmethod
    def _non_loop_block_to_dict(block: Dict) -> Dict:
        """
        Convert a non-loop block of data into a dictionary.

        Parameters
        ----------
        block : dict
            A dictionary representing a non-loop block of data from an MMCIF file.

        Returns
        -------
        dict
            A dictionary representing the parsed data from the given non-loop block.
        """
        d = {}
        # Fold continuation lines (those not starting with '_') into the item
        # line that precedes them, then keep only the item lines.
        for index in range(len(block["lines"]) - 1):
            if block["lines"][index + 1][0] != "_":
                block["lines"][index] += " " + block["lines"][index + 1]
        block["lines"] = [line for line in block["lines"] if line[0] == "_"]
        for line in block["lines"]:
            # '_category.name value ...' -> d[name] = 'value ...'
            name = line.split(".")[1].split()[0]
            value = " ".join(line.split()[1:])
            d[name] = value
        return d

    def _loop_block_to_dict(self, block: Dict) -> Dict:
        """
        Convert a loop block of data into a dictionary.

        Parameters
        ----------
        block : dict
            A dictionary representing a loop block of data from an MMCIF file.

        Returns
        -------
        dict
            A dictionary representing the parsed data from the given loop block.
        """
        names, lines = [], []
        body_start = 0
        # block["lines"][0] is the 'loop_' line; the header lines that follow
        # all start with '_<category>'. body_start marks the first data row.
        for index, line in enumerate(block["lines"][1:], start=1):
            if not line.startswith("_" + block["category"]):
                body_start = index
                break
        names = [line.split(".")[1].rstrip() for line in block["lines"][1:body_start]]
        lines = [self._split_line(line) for line in block["lines"][body_start:]]
        # Reunite rows broken across physical lines: keep merging a row with
        # its successor while the combined field count still fits the header.
        for n in range(len(lines) - 1):
            while n < len(lines) - 1 and len(lines[n]) + len(lines[n + 1]) <= len(
                names
            ):
                lines[n] += lines.pop(n + 1)
        # Transpose rows into one list of values per column name.
        res = {name: [] for name in names}
        for line in lines:
            for name, value in zip(names, line):
                res[name].append(value)
        return res

    @staticmethod
    def _split_line(line: str) -> List[str]:
        """
        Split a string into substrings, ignoring quotation marks within the string.

        Parameters
        ----------
        line : str
            The string to be split.

        Returns
        -------
        list of str
            A list of substrings resulting from the split operation on the given string.
        """
        # Fast path: no quote characters anywhere, plain whitespace split.
        if not re.search("['\"]", line):
            return line.split()
        chars = deque(line.strip())
        values, value, in_string = [], [], False
        while chars:
            char = chars.popleft()
            if char == " " and not in_string:
                values.append("".join(value))
                value = []
            elif char == '"':
                # NOTE(review): only double quotes toggle the in-string state
                # although the regex above also matches single quotes, and the
                # quote character is kept in the output — confirm intent.
                in_string = not in_string
                value.append(char)
            else:
                value.append(char)
        # The final value is appended as a list of characters; the join below
        # handles both str and list entries uniformly.
        values.append(value)
        return ["".join(v) for v in values if v]
class GROParser(Parser):
    """
    Convert GRO file in Gromos87 format into a dictionary representation [1]_.

    References
    ----------
    .. [1] https://manual.gromacs.org/archive/5.0.4/online/gro.html
    """

    def parse_input(self, lines, **kwargs) -> Dict:
        """
        Parse a list of lines from a GRO file and convert the data into a dictionary.

        Parameters
        ----------
        lines : deque of str
            The lines of a GRO file to parse.
        kwargs : Dict, optional
            Optional keyword arguments.

        Returns
        -------
        dict
            A dictionary containing the parsed data from the GRO file.
        """
        ret = {
            "title": [],
            "num_atoms": [],
            "record_type": [],
            "residue_number": [],
            "residue_name": [],
            "atom_name": [],
            "atom_number": [],
            "atom_coordinate": [],
            "segment_identifier": [],
            "velocity": [],
            "box_vectors": [],
            "time": [],
        }
        if not lines:
            return ret

        time_regex = re.compile(r"t=\s*(\d+\.?\d*)")
        frame = -1
        # GRO files may be concatenated; each loop iteration consumes one frame.
        while lines:
            frame += 1
            frame_id = str(frame)

            header = lines.popleft()
            ret["title"].append(header)
            match = time_regex.search(header)
            if match is not None:
                ret["time"].append(float(match.group(1)))

            try:
                atom_total = int(lines.popleft())
            except (ValueError, IndexError):
                # Truncated or malformed atom-count line: keep what we have.
                return ret
            ret["num_atoms"].append(atom_total)

            if atom_total <= 0:
                continue

            kept = 0
            for _ in range(atom_total):
                if not lines:
                    break
                raw = lines.popleft()
                try:
                    residue_number = int(raw[:5])
                    residue_name = raw[5:10].strip()
                    atom_name = raw[10:15].strip()
                    atom_number = int(raw[15:20])
                    position = (
                        float(raw[20:28]),
                        float(raw[28:36]),
                        float(raw[36:44]),
                    )
                    # Velocity columns are optional; only present when the
                    # line is long enough to hold all three fields.
                    velocity = (
                        (float(raw[44:52]), float(raw[52:60]), float(raw[60:68]))
                        if len(raw) >= 68
                        else None
                    )
                except (ValueError, IndexError):
                    # Skip malformed atom records rather than aborting the frame.
                    continue
                kept += 1
                ret["residue_number"].append(residue_number)
                ret["residue_name"].append(residue_name)
                ret["atom_name"].append(atom_name)
                ret["atom_number"].append(atom_number)
                ret["atom_coordinate"].append(position)
                ret["velocity"].append(velocity)

            # Per-atom bookkeeping shared by every record of this frame.
            ret["segment_identifier"].extend([frame_id] * kept)
            ret["record_type"].extend(["ATOM"] * kept)

            if lines:
                box = lines.popleft()
                try:
                    ret["box_vectors"].append([float(v) for v in box.split()])
                except ValueError:
                    pass
        return ret
class StarParser(MMCIFParser):
    """
    Convert STAR file data into a dictionary representation [1]_.

    References
    ----------
    .. [1] https://www.iucr.org/__data/assets/file/0013/11416/star.5.html
    """

    def parse_input(self, lines: List[str], delimiter: str = "\t") -> Dict:
        """
        Parse STAR file lines into a per-data-block dictionary of columns.

        Parameters
        ----------
        lines : deque of str
            The lines of a STAR file to parse.
        delimiter : str, optional
            Field delimiter for data rows, defaults to a tab character.

        Returns
        -------
        dict
            Mapping of data-block name to a dictionary of header -> column values.
        """
        comment_pattern = re.compile(r"\s*#.*")

        def flush(name, rows):
            # Pair each header (trailing comment stripped) with its column.
            keys = [comment_pattern.sub("", k) for k in ret[name].keys()]
            ret[name] = {
                key: list(column) for key, column in zip(keys, zip(*rows))
            }

        ret = {}
        current, rows = None, []
        while lines:
            entry = lines.popleft()
            if entry.startswith("data") and not entry.startswith("_"):
                # New data block: consolidate the rows gathered for the
                # previous block before switching to the new one.
                if current is not None and current != entry:
                    flush(current, rows)
                    rows.clear()
                current = entry
                ret.setdefault(current, {})
                continue
            if entry.startswith("_"):
                # Header line; its column values are filled in by flush().
                ret[current][entry] = []
                continue
            if entry.startswith("loop"):
                continue
            fields = entry.split(delimiter)
            if fields:
                rows.append(fields)
        # Consolidate the final block.
        flush(current, rows)
        return ret
class XMLParser(Parser):
    """
    Parser for XML files.
    """

    def parse_input(self, lines: deque, **kwargs) -> Dict:
        """
        Parse XML lines into a nested dictionary keyed by element tags.

        Parameters
        ----------
        lines : deque of str
            The lines of an XML document.
        kwargs : Dict, optional
            Optional keyword arguments.

        Returns
        -------
        dict
            Dictionary representation of the XML document's root element.
        """
        document = "\n".join(lines)
        return self._element_to_dict(ET.fromstring(document))

    def _element_to_dict(self, element) -> Dict:
        """
        Convert an XML element and its children to a dictionary.

        Parameters
        ----------
        element : xml.etree.ElementTree.Element
            The XML element to convert.

        Returns
        -------
        Dict
            Dictionary representation of the element (a scalar or list for
            text-only leaf elements).
        """
        out = {}
        if element.attrib:
            out["@attributes"] = {
                key: self._convert_value(val) for key, val in element.attrib.items()
            }

        subelements = list(element)
        if subelements:
            for sub in subelements:
                converted = self._element_to_dict(sub)
                if sub.tag not in out:
                    out[sub.tag] = converted
                elif isinstance(out[sub.tag], list):
                    out[sub.tag].append(converted)
                else:
                    # Second occurrence of this tag: promote to a list.
                    out[sub.tag] = [out[sub.tag], converted]
            return out

        if element.text and element.text.strip():
            content = element.text.strip()
            # NOTE(review): a leaf element that has both attributes and text
            # discards the "@attributes" entry built above — confirm this is
            # the intended contract.
            if "\n" in content:
                out = [
                    self._convert_value(part.strip())
                    for part in content.split("\n")
                    if part.strip()
                ]
            else:
                out = self._convert_value(content)
        return out

    def _convert_value(self, value_str: str) -> Union[int, float, bool, str]:
        """
        Convert a string value to an appropriate data type.

        Parameters
        ----------
        value_str : str
            String value to convert.

        Returns
        -------
        Union[int, float, bool, str]
            Converted value in appropriate data type.
        """
        lowered = value_str.lower()
        if lowered in ("true", "false"):
            return lowered == "true"
        # Prefer int over float; fall back to the raw string.
        for caster in (int, float):
            try:
                return caster(value_str)
            except ValueError:
                pass
        return value_str