""" Implements parsers for atomic structure file formats.
Copyright (c) 2023 European Molecular Biology Laboratory
Author: Valentin Maurer <valentin.maurer@embl-hamburg.de>
"""
import re
from collections import deque
from typing import List, Dict
from abc import ABC, abstractmethod
import numpy as np
[docs]
class Parser(ABC):
"""
Base class for structure file parsers.
Classes inheriting from :py:class:`Parser` need to define
a ``parse_input`` method that accepts a list of lines and returns a
dictionary representation of the data.
"""
def __init__(self, filename: str, mode: str = "r") -> None:
"""
Initialize a Parser object.
Parameters
----------
filename : str
File name to parse data from.
mode : str, optional
Mode to open the file. Default is 'r' for read.
"""
with open(filename, "r") as infile:
data = infile.read()
data = deque(filter(lambda line: line and line[0] != "#", data.split("\n")))
self._data = self.parse_input(data)
def __getitem__(self, key: str):
"""
Retrieve a value from the internal data using a given key.
Parameters
----------
key : str
The key to use for retrieving the corresponding value from
the internal data.
Returns
-------
value
The value associated with the provided key in the internal data.
"""
return self._data[key]
def __contains__(self, key) -> bool:
"""
Check if a given key exists in the internal data.
Parameters
----------
key : str
The key to check for in the internal data.
Returns
-------
bool
True if the key exists in the internal data, False otherwise.
"""
return key in self._data
[docs]
def get(self, key, default):
"""
Retrieve a value from the internal data using a given key. If the
key does not exist, return a default value.
Parameters
----------
key : str
The key to use for retrieving the corresponding value from
the internal data.
default : Any
The value to return if the key does not exist in the internal data.
Returns
-------
value
The value associated with the provided key in the internal data,
or the default value if the key does not exist.
"""
if key in self._data:
return self[key]
return default
[docs]
def keys(self):
"""
List keys available in internal dictionary.
"""
return self._data.keys()
[docs]
def values(self):
"""
List values available in internal dictionary.
"""
return self._data.values()
[docs]
def items(self):
"""
List items available in internal dictionary.
"""
return self._data.items()
[docs]
class PDBParser(Parser):
"""
Convert PDB file data into a dictionary representation [1]_.
References
----------
.. [1] https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
"""
[docs]
class MMCIFParser(Parser):
"""
Convert MMCIF file data into a dictionary representation. This implementation
heavily relies on the atomium library [1]_.
References
----------
.. [1] Ireland, S. M., & Martin, A. C. R. (2020). atomium (Version 1.0.0)
[Computer software]. https://doi.org/10.1093/bioinformatics/btaa072
"""
@staticmethod
def _consolidate_strings(lines: List[str]) -> List[str]:
"""
Consolidate multi-line strings that have been separated by semicolons in a
list of strings.
Parameters
----------
lines : deque of str
Deque of strings where each string is a line from an MMCIF file.
Returns
-------
deque of str
A deque of consolidated strings from the given input.
"""
new_lines = deque()
while lines:
line = lines.popleft()
if line.startswith(";"):
string = [line[1:].strip()]
while not lines[0].startswith(";"):
string.append(lines.popleft())
lines.popleft()
new_lines[-1] += ' "{}"'.format(
" ".join(string).replace('"', "").replace("'", "'")
)
else:
new_lines.append(line.replace('"', "").replace("'", "'"))
return new_lines
@staticmethod
def _split_in_blocks(lines: List[str]) -> List[Dict]:
"""
Split a deque of consolidated strings into a list of dictionaries,
each representing a block of data.
Parameters
----------
lines : deque of str
Deque of consolidated strings where each string is a line from
an MMCIF file.
Returns
-------
list of dict
A list of dictionaries where each dictionary represents a block
of data from the MMCIF file.
"""
category = None
block, blocks = [], []
while lines:
line = lines.popleft()
if line.startswith("data_"):
continue
if line.startswith("_"):
line_category = line.split(".")[0]
if line_category != category:
if category:
blocks.append({"category": category[1:], "lines": block})
category = line_category
block = []
if line.startswith("loop_"):
if category:
blocks.append({"category": category[1:], "lines": block})
category = lines[0].split(".")[0]
block = []
block.append(line)
if block:
blocks.append({"category": category[1:], "lines": block})
return blocks
@staticmethod
def _non_loop_block_to_dict(block: Dict) -> Dict:
"""
Convert a non-loop block of data into a dictionary.
Parameters
----------
block : dict
A dictionary representing a non-loop block of data from an MMCIF file.
Returns
-------
dict
A dictionary representing the parsed data from the given non-loop block.
"""
d = {}
# category = block["lines"][0].split(".")[0]
for index in range(len(block["lines"]) - 1):
if block["lines"][index + 1][0] != "_":
block["lines"][index] += " " + block["lines"][index + 1]
block["lines"] = [line for line in block["lines"] if line[0] == "_"]
for line in block["lines"]:
name = line.split(".")[1].split()[0]
value = " ".join(line.split()[1:])
d[name] = value
return d
def _loop_block_to_dict(self, block: Dict) -> Dict:
"""
Convert a loop block of data into a dictionary.
Parameters
----------
block : dict
A dictionary representing a loop block of data from an MMCIF file.
Returns
-------
dict
A dictionary representing the parsed data from the given loop block.
"""
names, lines = [], []
body_start = 0
for index, line in enumerate(block["lines"][1:], start=1):
if not line.startswith("_" + block["category"]):
body_start = index
break
names = [line.split(".")[1].rstrip() for line in block["lines"][1:body_start]]
lines = [self._split_line(line) for line in block["lines"][body_start:]]
# reunites broken lines
for n in range(len(lines) - 1):
while n < len(lines) - 1 and len(lines[n]) + len(lines[n + 1]) <= len(
names
):
lines[n] += lines.pop(n + 1)
res = {name: [] for name in names}
for line in lines:
for name, value in zip(names, line):
res[name].append(value)
return res
@staticmethod
def _split_line(line: str) -> List[str]:
"""
Split a string into substrings, ignoring quotation marks within the string.
Parameters
----------
line : str
The string to be split.
Returns
-------
list of str
A list of substrings resulting from the split operation on the given string.
"""
if not re.search("['\"]", line):
return line.split()
chars = deque(line.strip())
values, value, in_string = [], [], False
while chars:
char = chars.popleft()
if char == " " and not in_string:
values.append("".join(value))
value = []
elif char == '"':
in_string = not in_string
value.append(char)
else:
value.append(char)
values.append(value)
return ["".join(v) for v in values if v]