# -*- coding: utf-8 -*-
# The turbomoleio package, a python interface to Turbomole
# for preparing inputs, parsing outputs and other related tools.
#
# Copyright (C) 2018-2022 BASF SE, Matgenix SRL.
#
# This file is part of turbomoleio.
#
# Turbomoleio is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Turbomoleio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with turbomoleio (see ~turbomoleio/COPYING). If not,
# see <https://www.gnu.org/licenses/>.
"""Utility module for the "data group" model of TurboMole.
This module contains utility functions to parse, manipulate and write files
formatted using the "data group" convention of TurboMole. Examples of files
that are written in this fashion are the "control" file, the "coord" file, the
"basis" file and many other files written by TurboMole.
Files using the "data group" model of TurboMole are human readable ASCII files
in which data is formally split into consistent "groups". Each "data group"
consists of a keyword starting with a dollar ("$") sign (e.g. "$symmetry",
"$open shell", "$optimize" ...) and its associated data. The associated data
can be empty or just one value/word but it can also be very large (e.g.
molecular orbitals : "$scfmo" data group contained in the "mos" file).
A complete description of the data group model of TurboMole can be found in
the user's manual, as well as a description of all the keywords available.
"""
import os
import re
from monty.json import MSONable
def cleanup_string(string, cleanup_types=None):
    """Tidy up a string written in the TurboMole "data group" format.

    The requested cleanups are applied one after the other, in the order in
    which they are listed in ``cleanup_types``:

    - "BEFORE_FIRST_DOLLAR": drop every line located before the first line
      starting with a dollar ("$") sign, except lines that start with a hash
      ("#") sign, which are kept.
    - "BLANK_LINES": collapse runs of blank lines into a single line break.
    - "LEADING_SPACES_DOLLAR": remove spaces and tabs located before a "$"
      sign at the beginning of a line.
    - "LEADING_SPACES_HASH": remove spaces and tabs located before a "#"
      sign at the beginning of a line.

    Args:
        string (str): A string that should be in the "data group" format of
            TurboMole. Typically, this string is read from file (e.g. control,
            coord, basis, ...).
        cleanup_types (list of str, optional): Cleanups to perform, chosen
            among the four names listed above. Defaults to None, in which
            case "BLANK_LINES", "LEADING_SPACES_DOLLAR", "LEADING_SPACES_HASH"
            and "BEFORE_FIRST_DOLLAR" are applied, in that order.

    Returns:
        str: The cleaned up string.

    Raises:
        ValueError: If the string contains no dollar ("$") sign, if one of
            the cleanup types is not valid, if (after cleanup) a "$" sign is
            preceded on its line by any character (spaces and tabs included)
            and that line is not a "#" comment, or if the cleaned string does
            not contain "$end".
    """
    if "$" not in string:
        raise ValueError('No dollar ("$") sign in the string.')

    def _drop_preamble(text):
        # Keep "#" comment lines found before the first "$" line, discard any
        # other line found there, left-strip the first "$" line itself and
        # keep everything that follows it untouched.
        remaining = iter(text.split("\n"))
        kept = []
        for raw_line in remaining:
            head = raw_line.strip()
            if head.startswith("#"):
                kept.append(raw_line)
            elif head.startswith("$"):
                kept.append(raw_line.lstrip())
                kept.extend(remaining)
                break
        return "\n".join(kept)

    handlers = {
        "BEFORE_FIRST_DOLLAR": _drop_preamble,
        # Note: this pattern also swallows the spaces/tabs that start the
        # line following a run of blank lines.
        "BLANK_LINES": lambda text: re.sub(
            r"(\n[ \t]*){2,}", "\n", text, flags=re.MULTILINE
        ),
        "LEADING_SPACES_DOLLAR": lambda text: re.sub(
            r"^[ \t]*\$", "$", text, flags=re.MULTILINE
        ),
        "LEADING_SPACES_HASH": lambda text: re.sub(
            r"^[ \t]*#", "#", text, flags=re.MULTILINE
        ),
    }

    if cleanup_types is None:
        requested = [
            "BLANK_LINES",
            "LEADING_SPACES_DOLLAR",
            "LEADING_SPACES_HASH",
            "BEFORE_FIRST_DOLLAR",
        ]
    else:
        requested = cleanup_types

    for kind in requested:
        try:
            handler = handlers[kind]
        except (KeyError, TypeError):
            raise ValueError(
                'Cleanup of type "{}" is not a valid type.'.format(kind)
            ) from None
        string = handler(string)

    # Every "$" must open its line, unless the line is a "#" comment.
    prefixed_dollars = re.findall(
        pattern=r"^[ \t\S]+\$", string=string, flags=re.MULTILINE
    )
    if any(not found.strip().startswith("#") for found in prefixed_dollars):
        raise ValueError('Some character(s) are preceding a dollar ("$") sign.')

    if "$end" not in string:
        raise ValueError('No "$end" in the string.')
    return string
def split_string_to_dg_list(string, cleanup_types=None, remove_comment_types=None):
    r"""Split a data group formatted string into a list of data groups.

    The string is first passed through ``cleanup_string`` and then through
    ``remove_comments``. The result is cut at every dollar ("$") sign and
    each piece is returned with its leading "$" restored. Whatever precedes
    the first "$" sign is discarded.

    Args:
        string (str): A string that should be in the "data group" format of
            TurboMole. Typically, this string is read from file (e.g. control,
            coord, basis, ...).
        cleanup_types (list of str, optional): Forwarded unchanged to
            ``cleanup_string``. Valid types are "BEFORE_FIRST_DOLLAR",
            "BLANK_LINES", "LEADING_SPACES_DOLLAR" and "LEADING_SPACES_HASH".
            Defaults to None, in which case all four cleanups are performed.
        remove_comment_types (list of str, optional): Forwarded unchanged as
            ``comment_types`` to ``remove_comments``; see that function for
            the valid values and for the meaning of None.

    Returns:
        list: The data groups found in the string, each one starting with a
        dollar ("$") sign.

    Raises:
        ValueError: Propagated from ``cleanup_string``, e.g. if there is no
            dollar ("$") sign or no "$end" in the string.

    Examples:
        >>> split_string_to_dg_list("$title\n$end\n")
        ['$title\n', '$end\n']
        >>> split_string_to_dg_list("no dollar sign")
        Traceback (most recent call last):
            ...
        ValueError: No dollar ("$") sign in the string.
    """
    cleaned = cleanup_string(string=string, cleanup_types=cleanup_types)
    uncommented = remove_comments(string=cleaned, comment_types=remove_comment_types)
    # str.split always yields at least one item: the text before the first "$".
    _before_first_dollar, *pieces = uncommented.split("$")
    return ["$" + piece for piece in pieces]
def remove_dg_from_list(dg_to_remove, dg_list, strict=True):
    """Return a new list with a data group removed from the initial list.

    Every occurrence of the data group is removed, so duplicated data groups
    all disappear. The input list is left untouched.

    The name is always matched literally: characters that are special in
    regular expressions (e.g. the parentheses accepted in data group names)
    are escaped before matching.

    Args:
        dg_to_remove (str): Name of the data group to be removed from the
            list. Can be given with or without the dollar ("$") sign.
        dg_list (list): List of strings. Each string should be a data group,
            i.e. should start with a dollar ("$") sign and should not contain
            any other dollar sign.
        strict (bool): If True (default), a data group is removed only if its
            name is exactly `dg_to_remove`, i.e. the name is followed by a
            whitespace character (space, tab, new line, return, ...) or ends
            the string. If False, all the data groups whose text starts with
            "$" followed by `dg_to_remove` are removed.

    Returns:
        list: A new list containing the data groups that were not removed.
    """
    if dg_to_remove.startswith("$"):
        name = dg_to_remove[1:]
    else:
        name = dg_to_remove

    # Escape the name: it is a plain keyword, not a regular expression.
    pattern = r"^\$" + re.escape(name)
    if strict:
        # Also accept the end of the string, so that a data group stored
        # without trailing whitespace (e.g. a final "$end") still matches.
        pattern += r"(?:\s|\Z)"
    matcher = re.compile(pattern)

    return [dg for dg in dg_list if not matcher.match(dg)]
def compare_datagroup_string(dg1, dg2, tol=None):
    """Compare two datagroup strings as split from the DataGroups object.

    The two strings match if they have the same number of lines and each
    pair of lines has the same number of chunks, where chunks are obtained by
    splitting a line on runs of whitespace and on "=". Each pair of chunks
    should be identical as strings. If `tol` is given, two different chunks
    are also accepted when both can be converted to float (after replacing
    the Fortran exponent marker "D" with "E") and their values are equal or
    differ by at most `tol`.

    A chunk that parses as NaN never matches a different chunk; two textually
    identical chunks always match.

    Args:
        dg1 (str): the first datagroup string to be compared.
        dg2 (str): the second datagroup string to be compared.
        tol (float): the tolerance allowed when comparing numbers. If None
            even the numbers should match as strings.

    Returns:
        bool: True if the strings match.
    """
    lines1 = dg1.splitlines()
    lines2 = dg2.splitlines()
    if len(lines1) != len(lines2):
        return False

    splitter = re.compile(r"\s+|=")
    for l1, l2 in zip(lines1, lines2):
        chunks1 = splitter.split(l1)
        chunks2 = splitter.split(l2)
        if len(chunks1) != len(chunks2):
            return False

        for s1, s2 in zip(chunks1, chunks2):
            if s1 == s2:
                continue
            if tol is None:
                return False
            try:
                f1 = float(s1.replace("D", "E"))
                f2 = float(s2.replace("D", "E"))
            except ValueError:
                # At least one chunk is not a number and the texts differ.
                return False
            # Written as "not <=" so that a NaN difference is a mismatch:
            # every ordering comparison involving NaN is False. The equality
            # test keeps equal infinities (whose difference is NaN) matching.
            if f1 != f2 and not abs(f1 - f2) <= tol:
                return False

    return True
class DataGroups(MSONable):
    """Generic class for data group formatted files and strings.

    DataGroups is a generic class for parsing, manipulating and generating
    strings/files using the data group format of TurboMole.
    """

    def __init__(self, string=None, dg_list=None):
        """Initialize a `DataGroups` class.

        The DataGroups object defines a set of key-value-like pairs. A data
        group is a key starting with a dollar ("$") sign. The value
        corresponding to the data group is called the data block. When
        initializing `DataGroups` from a string (e.g. reading a file), comments
        and/or unnecessary lines/spaces/... may be present in that string. In
        that case, the initial string is kept as a reference. Note that if
        changes are applied to the DataGroups object, the initial string is
        not updated as comments may interfere with the modifications. The
        actual string corresponding to the data group list (`dg_list`) is
        always in line with the data group list.

        Args:
            string (str): A string in the data group format.
            dg_list (list of str): A list of the data groups. If given, it is
                used as is (the list object itself is stored, not a copy) and
                `string` is only kept as the initial string.

        Raises:
            ValueError: If both `string` and `dg_list` are None.
        """
        if dg_list is None:
            if string is None:
                raise ValueError('Both "string" and "dg_list" are None.')
            # Reference text as provided by the caller; never updated.
            self.initial_string = string
            # Current list of data groups, one string per data group.
            self.dg_list = split_string_to_dg_list(
                string=string, cleanup_types=None, remove_comment_types=None
            )
        else:
            self.dg_list = dg_list
            if string is None:
                self.initial_string = "".join(dg_list)
            else:
                self.initial_string = string

    def kill_data_group(self, data_group, strict=True):
        """Remove `data_group` from this `DataGroups` object.

        Args:
            data_group (str): Data group to be removed.
            strict (bool): If True `data_group` should be an exact match.
                If False, any Data group starting with `data_group` will be
                removed.
        """
        self.dg_list = remove_dg_from_list(
            dg_to_remove=data_group, dg_list=self.dg_list, strict=strict
        )

    kdg = kill_data_group

    def add_data_group(self, data_group, data_block):
        """Add `data_group`->`data_block` to this `DataGroups` object.

        The new data group is inserted just before the last element of the
        data group list (expected to be "$end"). A line break is appended to
        the data block if it does not end with one, and a space separates the
        key from the data block unless the block already starts with a space,
        a tab or a line break.

        Args:
            data_group (str): Data group (key) to be added. The dollar ("$")
                sign will be automatically added if not present.
            data_block (str): Data block corresponding to the data group.

        Raises:
            RuntimeError: if data_group already exists.
            ValueError: if the name of the data group is not a letter followed
                by alphanumeric characters, spaces, "-", "_", "(" or ")".
        """
        if not data_group.startswith("$"):
            data_group = "$" + data_group

        # Only the data groups of this object matter here: do not follow a
        # "file=FILENAME" reference, which would read from disk and could hide
        # an existing data group whose subfile does not define it.
        existing = self.show_data_group(
            data_group=data_group, strict=True, show_from_subfile=False
        )
        if existing is not None:
            raise RuntimeError(
                'Data group "{}" already exists in this '
                "DataGroups object.".format(data_group)
            )

        if not re.fullmatch(pattern=r"^\$[a-zA-Z][ \-\w\(\)]*", string=data_group):
            raise ValueError(
                "Data group should start with a letter and be "
                "followed by alphanumeric characters, a space, "
                '"-", "_", "(" or ")".'
            )

        if not data_block.endswith("\n"):
            data_block = data_block + "\n"

        if data_block.startswith((" ", "\t", "\n")):
            self.dg_list.insert(-1, "{}{}".format(data_group, data_block))
        else:
            self.dg_list.insert(-1, "{} {}".format(data_group, data_block))

    adg = add_data_group

    def show_data_group(
        self,
        data_group,
        strict=True,
        default=None,
        show_from_subfile=True,
        raise_if_multiple_subfiles=False,
        raise_if_missing_subfile=False,
        raise_if_regular_and_subfile=False,
    ):
        """Show `data_group` from this `DataGroups` object.

        The name of the data group is matched literally (characters that are
        special in regular expressions are escaped).

        Args:
            data_group (str): Data group (key) to be shown. Can be given with
                or without the dollar ("$") sign.
            strict (bool): Whether `data_group` should be an exact match (the
                name is followed by "=", a whitespace character or the end of
                the string) or if data groups starting with `data_group` are
                allowed.
            default (str): the default value that will be returned if the
                data_group is not present in the list. Default is None.
            show_from_subfile (bool): If True, will show `data_group` from
                within the "subfile" if the data block contains a
                "file=FILENAME". If False the "file=FILENAME" block is simply
                returned. This supposes that file exists in the current
                directory.
            raise_if_multiple_subfiles (bool): Whether to raise an error if
                multiple "file=" directives are present in this data group. If
                False, just returns the standard control data block.
            raise_if_missing_subfile (bool): Whether to raise an error if the
                subfile does not exist in the current directory or exists but
                is empty. If False, just returns the standard data block.
            raise_if_regular_and_subfile (bool): Whether to raise an error if
                data group contains both a reference to a file with a
                "file=FILENAME" and regular data block options. If
                False, just returns the standard data block.

        Returns:
            str: the value of the selected datagroup. `default` if no
            `data_group` is found.

        Raises:
            RuntimeError: if multiple occurrences of `data_group` are found,
                if multiple "file=" directives are present (and
                raise_if_multiple_subfiles is True), if the subfile is
                missing or empty (and raise_if_missing_subfile is True) or if
                regular data block options coexist with a reference to a
                subfile (and raise_if_regular_and_subfile is True).
        """
        if data_group.startswith("$"):
            data_group = data_group[1:]
        dollar_data_group = "$" + data_group

        # The name is a plain keyword: escape it before building the pattern
        # so that names containing e.g. "(" or ")" match themselves.
        escaped = re.escape(data_group)
        if strict:
            pattern = r"^\$" + escaped + r"(?:[=\s]|\Z)"
        else:
            pattern = r"^\$" + escaped

        matches = [dg for dg in self.dg_list if re.match(pattern=pattern, string=dg)]
        if not matches:
            return default
        if len(matches) > 1:
            raise RuntimeError(
                "Found multiple occurrences of data group "
                '"{}".'.format(dollar_data_group)
            )

        if strict:
            # The match is anchored at the start: drop exactly the key.
            data_block = matches[0][len(dollar_data_group):]
        else:
            # Drop the key together with the rest of the matched keyword, then
            # a single separating space if there is one.
            remainder = re.sub(r"^\$" + escaped + r"[-\w]*", "", matches[0])
            data_block = re.sub(r"^ ", "", remainder)

        if not show_from_subfile:
            return data_block

        subfiles_count = data_block.count("file=")
        if subfiles_count == 0:
            return data_block

        if subfiles_count == 1:
            filename = self._get_subfile_fname(
                data_block=data_block,
                raise_if_regular_and_subfile=raise_if_regular_and_subfile,
            )
            # filename is None when "file=" is not followed by a name.
            if filename is not None and os.path.exists(filename):
                with open(filename, "r") as f:
                    subfile_string = f.read()
                if subfile_string.strip():
                    subfile = DataGroups(subfile_string)
                    # Look the data group up in the subfile with the same
                    # matching mode and fallback value as requested here.
                    return subfile.show_data_group(
                        data_group=data_group, strict=strict, default=default
                    )
            if raise_if_missing_subfile:
                raise RuntimeError(
                    'File "{}" for data group "{}" is '
                    "missing or empty.".format(filename, data_group)
                )
            return data_block

        if raise_if_multiple_subfiles:
            raise RuntimeError('Multiple "file=FILENAME" directives.')
        return data_block

    sdg = show_data_group

    def change_data_group(self, data_group, data_block):
        """Change the value of the `data_group` to the new `data_block`.

        If the key is not present will be added.
        It will first apply the kill_data_group and then add_data_group. Note
        that in the current implementation, the data group will be moved to
        the end of the data groups object (just before $end).
        If `data_block` is None it is equivalent to kill_data_group. To set
        a data group with no explicit value use an empty string for `data_block`
        (e.g. to add $uhf the input should be data_group="uhf" and data_block="").

        Args:
            data_group (str): Data group (key) to be changed. The dollar ("$")
                sign will be automatically added if not present.
            data_block (str): Data block corresponding to the data group.
                If None it will simply kill_data_group for the specified
                data_group value.
        """
        self.kill_data_group(data_group=data_group, strict=True)
        if data_block is not None:
            self.add_data_group(data_group=data_group, data_block=data_block)

    cdg = change_data_group

    def modify_data_group_options(self, data_group, options):
        """Modify a data group option.

        Given a data group that allows several options on separate line
        (e.g. $dft), updates the values of the options according to the
        dictionary provided.
        The option dictionary should have the form:

        .. code-block:: text

            {"option_name1": "option_name1 option_value",
             "option_name2": "option_name2=option_value"}

        The key will be used to identify the line to be modified (the first
        remaining key that is contained in the line is used) and that line
        would be entirely replaced by the value.
        Since different options may be defined in different ways, no
        attempt is made here to identify the suitable format for the option.
        It is responsibility of the caller to specify the line for the option
        in the correct format.
        If the datagroup is not present will be created with the specified options.
        If the entire data group should be modified it would be safer to use
        change_data_group.

        Args:
            data_group (str): Data group (key) to be changed. The dollar ("$")
                sign will be automatically added if not present.
            options (dict): The options that should be added or updated. If the
                value of one key is None the option will be removed if present.
        """
        dg = self.show_data_group(data_group, strict=True)
        # start from a line with a space if dg is not present (should not be empty)
        if dg is None:
            dg = " "

        # copy the dict as it will be modified
        options = dict(options)

        new_lines = []
        for line in dg.splitlines():
            # check if one of the lines matches the options. If yes replace
            # the line and remove it from the options dict, otherwise keep
            # the original line.
            for opt in list(options):
                if opt in line:
                    opt_kv = options.pop(opt)
                    # if None skip the line
                    if opt_kv is not None:
                        new_lines.append(" " + opt_kv)
                    break
            else:
                new_lines.append(line)

        # add all the options that were not present in the original set
        # except for None. Sorted so that the results are deterministic
        new_values = set(options.values())
        new_values.discard(None)
        for opt_kv in sorted(new_values):
            new_lines.append(" " + opt_kv)

        self.change_data_group(data_group, "\n".join(new_lines))

    mdgo = modify_data_group_options

    def show_data_group_option(self, data_group, option, default=None):
        """Show a data group option.

        Given a data group that allows several options on separate line
        (e.g. $dft), returns the value of the option provided.
        Since each value may be defined in a different way, the returned
        value will include anything after the option specified. It is up
        to the caller determine if there are symbols like "=" that should
        be removed, depending on the data group and option that is queried.

        Args:
            data_group (str): Data group (key) to be shown. The dollar ("$")
                sign will be automatically added if not present.
            option (str): The option that should be returned.
            default (str): the default value that will be returned if the
                data_group or the option are not present in the list.

        Returns:
            str: The value of the option, taken from the first line of the
            data block that starts (after optional whitespace) with `option`.
        """
        dg = self.show_data_group(
            data_group,
            strict=True,
            show_from_subfile=True,
            raise_if_missing_subfile=False,
        )
        if dg is None:
            return default

        # NOTE(review): `option` is inserted into the pattern unescaped, so
        # regex metacharacters in it (e.g. the parentheses of
        # "dim(fock,dens)=") are interpreted as regex syntax. Left unchanged
        # in case callers rely on it - confirm before switching to re.escape.
        compiled_re = re.compile(r"\s*{}(.*)$".format(option), re.DOTALL)
        for line in dg.splitlines():
            match = compiled_re.match(line)
            if match:
                return match.group(1)
        return default

    sdgo = show_data_group_option

    @property
    def number_of_data_groups(self):
        """Get the number of datagroups in this DataGroups object.

        The count is the length of the data group list minus one, i.e. one
        entry (expected to be "$end") is not counted.
        """
        return len(self.dg_list) - 1

    ndg = number_of_data_groups

    @classmethod
    def empty(cls):
        """Create an empty DataGroups object.

        An empty DataGroups object only contains the "$end" data group.
        """
        return cls(dg_list=["$end\n"])

    def as_dict(self):
        """Return a dictionary representing the DataGroups object."""
        return {
            "@module": self.__class__.__module__,
            "@class": self.__class__.__name__,
            "string": self.initial_string,
            "dg_list": self.dg_list,
        }

    @classmethod
    def from_dict(cls, d):
        """Create DataGroups object from a dictionary.

        Args:
            d (dict): A dictionary with the "string" and "dg_list" keys, as
                produced by as_dict.
        """
        return cls(string=d["string"], dg_list=d["dg_list"])

    def __str__(self):
        """Return a string representation of this DataGroups object.

        The string representation of the DataGroups object is supposed to be
        written to a file and used as an input to TurboMole executables.
        """
        return "".join(self.dg_list)

    def to_file(self, filename):
        """Write this DataGroups object to a file.

        If the file exists it will be overwritten.

        Args:
            filename (str): Name of the file to which this DataGroups object
                should be written.
        """
        with open(filename, "w") as f:
            f.write(str(self))

    @classmethod
    def from_file(cls, filename):
        """Create DataGroups object reading from a given file.

        Args:
            filename (str): Name of the file from which this DataGroups object
                should be read.
        """
        with open(filename, "r") as f:
            string = f.read()
        return cls(string=string)

    @staticmethod
    def _get_subfile_fname(data_block, raise_if_regular_and_subfile=False):
        """Extract the name of the file from a datagroup string.

        Args:
            data_block (str): the string with the content of the data group.
                May be None (e.g. for a data group that is not present).
            raise_if_regular_and_subfile (bool): if True will raise an error if the
                string contains both a file=FILENAME and other data. If False will
                return the filename anyway.

        Returns:
            (str): the name of the file, None if `data_block` is None or if no
            match for "file=filename" is found.

        Raises:
            RuntimeError: if raise_if_regular_and_subfile is True and file=FILENAME
                is not the only content of the string.
        """
        # A missing data group has no data block, hence no subfile.
        if data_block is None:
            return None

        match = re.search(pattern=r"file=([^\s]+)", string=data_block)
        if not match:
            return None

        # Whatever is left once the directive is removed is regular data.
        leftover = data_block.replace(match.group(0), "").strip()
        if leftover != "" and raise_if_regular_and_subfile:
            raise RuntimeError(
                "Both a reference to a file "
                '("file=FILENAME") and regular data '
                "blocks are in the data group."
            )
        return match.group(1)

    def show_subfile_fname(self, data_group, raise_if_regular_and_subfile=False):
        """Extract the name of the file referenced by a datagroup.

        Args:
            data_group (str): the name of the data group, with or without the
                dollar ("$") sign.
            raise_if_regular_and_subfile (bool): if True will raise an error if the
                data block contains both a file=FILENAME and other data. If False
                will return the filename anyway.

        Returns:
            (str): the name of the file, None if the data group is not present
            or if no match for "file=filename" is found.

        Raises:
            RuntimeError: if raise_if_regular_and_subfile is True and file=FILENAME
                is not the only content of the data block.
        """
        return self._get_subfile_fname(
            self.sdg(data_group, show_from_subfile=False),
            raise_if_regular_and_subfile=raise_if_regular_and_subfile,
        )

    def compare(self, datagroups, tol=None, ignored_dg=None, return_all_diffs=False):
        """Compare the current object with another DataGroups.

        Aside from the datagroups that should be ignored, (define in the
        ignored_dg argument), the two should contain the same amount of elements in
        dg_list and each one from the current object should have a matching elements
        in the one that is passed as argument.
        To match two strings should have the same number of lines. Each line should
        have the same number of chunks. Each chunk should match exactly as a string,
        except if they are numbers. In that, if a tol is specified they will be
        converted to float and their difference should be lower than the tolerance.
        If the two objects match None will be returned, otherwise a message describing
        the property that caused the match to fail.
        Note that, given the variety of notations supported by Turbomole, this
        function is to be intended more for testing purposes rather than as a complete
        tool to guarantee the equivalence of two control file.

        Args:
            datagroups (DataGroups): a Datagroups that should match with the
                current instance.
            tol (float): the tolerance allowed when comparing numbers. If None
                even the number should match as strings.
            ignored_dg (list): a list of datagroups that should be ignored for
                the comparison. It is expected to be the name of the datagroup,
                thus starting with a "$".
                If the "$" symbol is not present will be added before trying to
                match the datagroup to skip. A data group is ignored if its
                text starts with one of these names.
            return_all_diffs (bool): If True, a list of all differences is returned.
                If False (default), a string describing the first source of
                difference found is returned.

        Returns:
            None if the results match or a string describing a source of difference
            otherwise (if return_all_diffs is False) or a list of all differences
            (if return_all_diffs is True).
        """
        # Work on a copy: matched entries are removed as they are found.
        dg_other_list = list(datagroups.dg_list)
        diffs = []

        if ignored_dg:
            # add the initial $ to each datagroup if not already present.
            ignored_dg = [
                "$" + dg if not dg.startswith("$") else dg for dg in ignored_dg
            ]

        for dg1 in self.dg_list:
            # skip the check if dg1 matches the list of datagroups that should
            # be ignored.
            if ignored_dg and any(dg1.startswith(dg) for dg in ignored_dg):
                continue

            for i, dg2 in enumerate(dg_other_list):
                if compare_datagroup_string(dg1, dg2, tol):
                    # each reference data group can be matched only once
                    dg_other_list.pop(i)
                    break
            else:
                msg = "Datagroup does not match to any of the references: {}".format(
                    dg1
                )
                if return_all_diffs:
                    diffs.append(msg)
                else:
                    return msg

        # check that all the remaining datagroups in the second control belong
        # to the ignore list.
        for dg2 in dg_other_list:
            if not ignored_dg or not any(dg2.startswith(dg) for dg in ignored_dg):
                msg = (
                    "Datagroup in the reference does not match to any of the current "
                    "control: {}".format(dg2)
                )
                if return_all_diffs:
                    diffs.append(msg)
                else:
                    return msg

        if return_all_diffs:
            return diffs if diffs else None
        return None