""" Implements parsers for atomic structure file formats.
Copyright (c) 2023 European Molecular Biology Laboratory
Author: Valentin Maurer <valentin.maurer@embl-hamburg.de>
"""
import re
from collections import deque
from typing import List, Dict
from abc import ABC, abstractmethod
import numpy as np
class Parser(ABC):
    """
    Base class for structure file parsers.

    Classes inheriting from :py:class:`Parser` need to define
    a ``parse_input`` method that accepts a list of lines and returns a
    dictionary representation of the data.
    """

    def __init__(self, filename: str, mode: str = "r") -> None:
        """
        Initialize a Parser object.

        Parameters
        ----------
        filename : str
            File name to parse data from.
        mode : str, optional
            Mode to open the file. Default is 'r' for read. Binary read
            modes (e.g. 'rb') are accepted; their content is decoded as UTF-8.

        Raises
        ------
        ValueError
            If ``mode`` requests writing, appending or exclusive creation.
        """
        # A parser only ever reads. Reject modes that could truncate or create
        # the file before it is opened.
        if any(flag in mode for flag in "wax"):
            raise ValueError(f"Parser requires a read mode, got {mode!r}.")

        with open(filename, mode) as infile:
            data = infile.read()

        if isinstance(data, bytes):
            # Binary reads skip newline translation, so normalise line endings
            # by hand to match what text mode would have produced.
            data = data.decode("utf-8").replace("\r\n", "\n").replace("\r", "\n")

        # Drop empty lines and comment lines (first character '#').
        data = deque(line for line in data.split("\n") if line and line[0] != "#")
        self._data = self.parse_input(data)

    @abstractmethod
    def parse_input(self, lines: List[str]) -> Dict:
        """
        Convert the lines of a file into a dictionary representation.

        Parameters
        ----------
        lines : deque of str
            Non-empty, non-comment lines of the parsed file.

        Returns
        -------
        dict
            Dictionary representation of the data, stored by
            :py:meth:`__init__` as the internal data of the parser.
        """

    def __getitem__(self, key: str):
        """
        Retrieve a value from the internal data using a given key.

        Parameters
        ----------
        key : str
            The key to use for retrieving the corresponding value from
            the internal data.

        Returns
        -------
        value
            The value associated with the provided key in the internal data.
        """
        return self._data[key]

    def __contains__(self, key) -> bool:
        """
        Check if a given key exists in the internal data.

        Parameters
        ----------
        key : str
            The key to check for in the internal data.

        Returns
        -------
        bool
            True if the key exists in the internal data, False otherwise.
        """
        return key in self._data

    def get(self, key, default=None):
        """
        Retrieve a value from the internal data using a given key. If the
        key does not exist, return a default value.

        Parameters
        ----------
        key : str
            The key to use for retrieving the corresponding value from
            the internal data.
        default : Any, optional
            The value to return if the key does not exist in the internal data.
            Defaults to None.

        Returns
        -------
        value
            The value associated with the provided key in the internal data,
            or the default value if the key does not exist.
        """
        # Go through ``self[key]`` so subclasses overriding ``__getitem__``
        # keep control over how values are returned.
        if key in self._data:
            return self[key]
        return default

    def keys(self):
        """
        List keys available in internal dictionary.
        """
        return self._data.keys()

    def values(self):
        """
        List values available in internal dictionary.
        """
        return self._data.values()

    def items(self):
        """
        List items available in internal dictionary.
        """
        return self._data.items()
class PDBParser(Parser):
    """
    Convert PDB file data into a dictionary representation [1]_.

    References
    ----------
    .. [1] https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
    """

    # NOTE(review): no ``parse_input`` implementation for this class is visible
    # in the reviewed text — confirm it exists before instantiating.
class MMCIFParser(Parser):
    """
    Convert MMCIF file data into a dictionary representation. This implementation
    heavily relies on the atomium library [1]_.

    References
    ----------
    .. [1] Ireland, S. M., & Martin, A. C. R. (2020). atomium (Version 1.0.0)
           [Computer software]. https://doi.org/10.1093/bioinformatics/btaa072
    """

    def parse_input(self, lines: List[str]) -> Dict:
        """
        Parse the lines of an MMCIF file into a dictionary.

        Multi-line strings are consolidated, the lines are grouped into
        category blocks and every block is converted with the loop or
        non-loop converter, depending on whether it starts with ``loop_``.

        NOTE(review): this composition is inferred from the helper methods of
        this class — confirm it matches the intended entry point.

        Parameters
        ----------
        lines : deque of str
            Lines of an MMCIF file.

        Returns
        -------
        dict
            Maps each category name (without leading underscore) to the
            dictionary produced for its block.
        """
        # Work on a copy, the helpers consume their input from the left.
        consolidated = self._consolidate_strings(deque(lines))
        mmcif = {}
        for block in self._split_in_blocks(consolidated):
            if block["lines"][0].startswith("loop_"):
                mmcif[block["category"]] = self._loop_block_to_dict(block)
            else:
                mmcif[block["category"]] = self._non_loop_block_to_dict(block)
        return mmcif

    @staticmethod
    def _consolidate_strings(lines: List[str]) -> List[str]:
        """
        Consolidate multi-line strings that have been separated by semicolons in a
        list of strings.

        Parameters
        ----------
        lines : deque of str
            Deque of strings where each string is a line from an MMCIF file.
            The deque is consumed.

        Returns
        -------
        deque of str
            A deque of consolidated strings from the given input. Double quotes
            are removed from the data; a consolidated multi-line string is
            appended to the preceding line wrapped in double quotes.
        """
        if not isinstance(lines, deque):
            lines = deque(lines)

        new_lines = deque()
        while lines:
            line = lines.popleft()
            if not line.startswith(";"):
                new_lines.append(line.replace('"', ""))
                continue

            # Gather everything up to the terminating ';' line. A truncated
            # file without terminator ends the string at the end of input.
            string = [line[1:].strip()]
            while lines and not lines[0].startswith(";"):
                string.append(lines.popleft())
            if lines:
                lines.popleft()  # discard the terminating ';' line

            quoted = '"{}"'.format(" ".join(string).replace('"', ""))
            if new_lines:
                new_lines[-1] += " " + quoted
            else:
                # No preceding line to attach the string to.
                new_lines.append(quoted)
        return new_lines

    @staticmethod
    def _split_in_blocks(lines: List[str]) -> List[Dict]:
        """
        Split a deque of consolidated strings into a list of dictionaries,
        each representing a block of data.

        Parameters
        ----------
        lines : deque of str
            Deque of consolidated strings where each string is a line from
            an MMCIF file. The deque is consumed.

        Returns
        -------
        list of dict
            A list of dictionaries where each dictionary represents a block
            of data from the MMCIF file. Each dictionary holds the block's
            ``category`` (without leading underscore) and its ``lines``.
        """
        if not isinstance(lines, deque):
            lines = deque(lines)

        category = None
        block, blocks = [], []
        while lines:
            line = lines.popleft()
            if line.startswith("data_"):
                continue
            if line.startswith("_"):
                line_category = line.split(".")[0]
                if line_category != category:
                    if category:
                        blocks.append({"category": category[1:], "lines": block})
                    category = line_category
                    block = []
            if line.startswith("loop_"):
                if category:
                    blocks.append({"category": category[1:], "lines": block})
                # The category of a loop is defined by the line following it.
                category = lines[0].split(".")[0] if lines else None
                block = []
            block.append(line)

        # Lines that never got a category assigned can not form a block.
        if block and category:
            blocks.append({"category": category[1:], "lines": block})
        return blocks

    @staticmethod
    def _non_loop_block_to_dict(block: Dict) -> Dict:
        """
        Convert a non-loop block of data into a dictionary.

        Parameters
        ----------
        block : dict
            A dictionary representing a non-loop block of data from an MMCIF file.
            The passed dictionary is not modified.

        Returns
        -------
        dict
            A dictionary representing the parsed data from the given non-loop block.
        """
        # Attach continuation lines (not starting with '_') to the preceding
        # entry. Continuation lines before the first entry are dropped.
        entries = []
        for line in block["lines"]:
            if line.startswith("_"):
                entries.append(line)
            elif entries:
                entries[-1] += " " + line

        d = {}
        for entry in entries:
            name = entry.split(".")[1].split()[0]
            value = " ".join(entry.split()[1:])
            d[name] = value
        return d

    def _loop_block_to_dict(self, block: Dict) -> Dict:
        """
        Convert a loop block of data into a dictionary.

        Parameters
        ----------
        block : dict
            A dictionary representing a loop block of data from an MMCIF file.
            The first line is expected to be the ``loop_`` line, followed by
            the column names and the data rows.

        Returns
        -------
        dict
            A dictionary representing the parsed data from the given loop block.
            Maps each column name to the list of its values.
        """
        block_lines = block["lines"]
        header_prefix = "_" + block["category"]

        # The body begins at the first line that is not a column name. A loop
        # without data rows consists of column names only.
        body_start = len(block_lines)
        for index, line in enumerate(block_lines[1:], start=1):
            if not line.startswith(header_prefix):
                body_start = index
                break

        names = [line.split(".")[1].rstrip() for line in block_lines[1:body_start]]
        lines = [self._split_line(line) for line in block_lines[body_start:]]

        # reunites broken lines: a row is extended by the following row as
        # long as the combined number of values fits the number of columns
        rows = []
        for values in lines:
            if rows and len(rows[-1]) + len(values) <= len(names):
                rows[-1] += values
            else:
                rows.append(values)

        res = {name: [] for name in names}
        for row in rows:
            for name, value in zip(names, row):
                res[name].append(value)
        return res

    @staticmethod
    def _split_line(line: str) -> List[str]:
        """
        Split a string into substrings, ignoring quotation marks within the string.

        Parameters
        ----------
        line : str
            The string to be split.

        Returns
        -------
        list of str
            A list of substrings resulting from the split operation on the given string.
            Spaces enclosed in double quotes do not split.
        """
        if not re.search("['\"]", line):
            return line.split()

        values, value, in_string = [], [], False
        for char in line.strip():
            if char == " " and not in_string:
                values.append(value)
                value = []
                continue
            if char == '"':
                in_string = not in_string
            # NOTE(review): the quote character is kept in the value, like in
            # the non-loop conversion — confirm this is intended.
            value.append(char)

        values.append(value)
        # Consecutive spaces yield empty values, which are discarded.
        return ["".join(v) for v in values if v]