Source code for turbomoleio.core.datagroups

# -*- coding: utf-8 -*-
# The turbomoleio package, a python interface to Turbomole
# for preparing inputs, parsing outputs and other related tools.
#
# Copyright (C) 2018-2022 BASF SE, Matgenix SRL.
#
# This file is part of turbomoleio.
#
# Turbomoleio is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Turbomoleio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with turbomoleio (see ~turbomoleio/COPYING). If not,
# see <https://www.gnu.org/licenses/>.

"""Utility module for the "data group" model of TurboMole.

This module contains utility functions to parse, manipulate and write files
formatted using the "data group" convention of TurboMole. Files written in
this fashion include the "control", "coord" and "basis" files, as well as
many others produced by TurboMole.

Files using the "data group" model of TurboMole are human readable ASCII files
in which data is formally split into consistent "groups". Each "data group"
consists of a keyword starting with a dollar ("$") sign (e.g. "$symmetry",
"$open shell", "$optimize" ...) and its associated data. The associated data
can be empty or a single value/word, but it can also be very large (e.g. the
molecular orbitals in the "$scfmo" data group contained in the "mos" file).

A complete description of the data group model of TurboMole can be found in
the user's manual, as well as a description of all the keywords available.
"""

import os
import re

from monty.json import MSONable
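
# For illustration, a minimal string in the data group format described in the
# module docstring (the keywords and values below are arbitrary examples, not
# taken from a real calculation) could look like:
#
#   $title
#   $symmetry c1
#   $scfiterlimit 60
#   $end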


def cleanup_string(string, cleanup_types=None):
    """Return a cleaned data groups string.

    This function cleans up a data groups string. It can:

    - remove everything before the first dollar ("$") sign,
    - remove blank lines,
    - remove spaces and tabs before "#" comments,
    - remove spaces and tabs before "$" signs.

    Args:
        string (str): A string that should be in the "data group" format of
            TurboMole. Typically, this string is read from file (e.g. control,
            coord, basis, ...).
        cleanup_types (list of str, optional): List of the cleanup types that
            should be performed. Valid types are "BEFORE_FIRST_DOLLAR",
            "BLANK_LINES", "LEADING_SPACES_DOLLAR" and "LEADING_SPACES_HASH".
            Defaults to None, in which case all four types of cleanup will be
            performed.

    Returns:
        str: A cleaned up string.

    Raises:
        ValueError: If one of the cleanup_types is not valid, if there is no
            dollar ("$") sign, if one of the dollar ("$") or hash ("#") signs
            is preceded by characters other than a space or a tab in a line
            or if there is no "$end" at the end of the file.
    """
    if "$" not in string:
        raise ValueError('No dollar ("$") sign in the string.')

    if cleanup_types is None:
        cleanup_types = [
            "BLANK_LINES",
            "LEADING_SPACES_DOLLAR",
            "LEADING_SPACES_HASH",
            "BEFORE_FIRST_DOLLAR",
        ]

    for cleanup_type in cleanup_types:
        # Removing lines before the first dollar sign (except if this dollar
        # sign is in a hash comment)
        if cleanup_type == "BEFORE_FIRST_DOLLAR":
            newlines = []
            after_first_dollar = False
            for line in string.split("\n"):
                if after_first_dollar or line.strip().startswith("#"):
                    newlines.append(line)
                elif line.strip().startswith("$"):
                    after_first_dollar = True
                    newlines.append(line.lstrip())
            string = "\n".join(newlines)
        # Removing all blank lines
        elif cleanup_type == "BLANK_LINES":
            string = re.sub(r"(\n[ \t]*){2,}", "\n", string, flags=re.MULTILINE)
        # Removing all spaces and tabs before dollar signs
        elif cleanup_type == "LEADING_SPACES_DOLLAR":
            string = re.sub(r"^[ \t]*\$", "$", string, flags=re.MULTILINE)
        # Removing all spaces before hash signs
        elif cleanup_type == "LEADING_SPACES_HASH":
            string = re.sub(r"^[ \t]*#", "#", string, flags=re.MULTILINE)
        else:
            raise ValueError(
                'Cleanup of type "{}" is not a valid type.'.format(cleanup_type)
            )

    for match in re.findall(pattern=r"^[ \t\S]+\$", string=string, flags=re.MULTILINE):
        if not match.strip().startswith("#"):
            raise ValueError('Some character(s) are preceding a dollar ("$") sign.')

    if "$end" not in string:
        raise ValueError('No "$end" in the string.')

    return string
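
# A minimal usage sketch of cleanup_string (the input below is an arbitrary
# example, not taken from a real calculation): text before the first "$",
# blank lines and leading spaces are stripped.
#
#   >>> cleanup_string("header\n  $title\n\n  $symmetry c1\n$end\n")
#   '$title\n$symmetry c1\n$end\n'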

def remove_comments(string, comment_types=None):
    """Return a data group string with comment lines removed.

    Comments or additional information in TurboMole is typically specified in
    four different ways: a line starting with a hash ("#") symbol, a data
    group with the keyword "$dummy" (can span multiple lines), anything after
    the "$end" keyword or anything after a hash ("#") symbol (so-called
    "inline" comments). This function removes all types of comments in a data
    group formatted string. Note also that a newline is added after the
    "$end" keyword if not present.

    Args:
        string (str): A string that should be in the "data group" format of
            TurboMole. Typically, this string is read from file (e.g. control,
            coord, basis, ...).
        comment_types (:obj:`list`, optional): List of the types of comments
            that should be removed. Valid types are "HASH_START", "DUMMY",
            "AFTER_END" and "HASH_INLINE". Defaults to None, in which case
            all four types of comments will be removed.

    Returns:
        str: A string without the comment lines.

    Raises:
        ValueError: If one of the comment_types is not valid.
    """
    if comment_types is None:
        comment_types = ["HASH_START", "DUMMY", "AFTER_END", "HASH_INLINE"]

    for comment_type in comment_types:
        # Removing lines starting with a hash symbol
        if comment_type == "HASH_START":
            string = re.sub(r"^#.*\n?", "", string, flags=re.MULTILINE)
        # Removing portions of the file corresponding to a $dummy data group
        elif comment_type == "DUMMY":
            old = None
            # Loop is needed because replaced characters are not used in
            # consecutive regex searches
            while old != string:
                old = string
                string = re.sub(r"^\$dummy[^$]*\$", "$", string, flags=re.MULTILINE)
            # Dummy data group at the end of the file
            string = re.sub(r"^\$dummy[\s\S]*\Z", "", string, flags=re.MULTILINE)
        # Removing lines after $end
        elif comment_type == "AFTER_END":
            string = re.sub(r"^\$end[\s\S]*\Z", "$end\n", string, flags=re.MULTILINE)
        # Removing inline comments i.e. #-starting comments in the middle of
        # a line (e.g.: "$symmetry c1 # Symmetry of the system")
        # Note that lines starting with a hash symbol will also be replaced
        # by a blank line
        elif comment_type == "HASH_INLINE":
            # Only matches if there is something other than white spaces
            # before the hash ("#") symbol. Only the first capturing group
            # is reinserted with the "\1", while the lookahead ("(?=$)") is
            # checking for the end of the line but leaving it as it is, be it
            # a newline or an end of file.
            string = re.sub(
                r"(^\s*\S.*?)(#.*)(?=$)", r"\1", string, flags=re.MULTILINE
            )
        else:
            raise ValueError(
                'Comment of type "{}" is not a valid type.'.format(comment_type)
            )

    return string
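
# A minimal usage sketch of remove_comments (arbitrary example input): the
# full-line "#" comment and the text after "$end" are removed.
#
#   >>> remove_comments("# a comment\n$symmetry c1\n$end\nignored trailing text\n")
#   '$symmetry c1\n$end\n'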

def split_string_to_dg_list(string, cleanup_types=None, remove_comment_types=None):
    r"""Split a string into a list of data groups.

    Args:
        string (str): A string that should be in the "data group" format of
            TurboMole. Typically, this string is read from file (e.g. control,
            coord, basis, ...).
        cleanup_types (list of str, optional): List of the cleanup types that
            should be performed before splitting the string into a list of
            data groups. Valid types are "BEFORE_FIRST_DOLLAR", "BLANK_LINES",
            "LEADING_SPACES_DOLLAR" and "LEADING_SPACES_HASH" (see
            cleanup_string). Defaults to None, in which case all four types
            of cleanup will be performed.
        remove_comment_types (list of str, optional): List of the types of
            comments that should be removed before splitting the string into
            a list of data groups. Valid types are "HASH_START", "DUMMY",
            "AFTER_END" and "HASH_INLINE" (see remove_comments). Defaults to
            None, in which case all four types of comments will be removed.

    Returns:
        list: The list of data groups in the string.

    Raises:
        ValueError: If there is no dollar ("$") sign in the string, i.e. the
            string is not or incorrectly formatted with respect to the data
            group format of TurboMole.

    Examples:
        >>> split_string_to_dg_list("$title\n$end\n")
        ['$title\n', '$end\n']
        >>> split_string_to_dg_list("no dollar sign")
        []
        >>> split_string_to_dg_list("\n\n$title\n"
        ...                         "$optimize\n"
        ...                         " internal on\n"
        ...                         " redundant on\n"
        ...                         " cartesian off\n"
        ...                         " global off\n"
        ...                         " basis off\n")
        ['$title\n', '$optimize\n internal on\n redundant on\n cartesian off\n global off\n basis off\n']
    """  # noqa: E501
    string = cleanup_string(string=string, cleanup_types=cleanup_types)
    string = remove_comments(string=string, comment_types=remove_comment_types)
    return ["$" + dg for dg in string.split("$")[1:]]

def remove_dg_from_list(dg_to_remove, dg_list, strict=True):
    """Return a new list with a data group removed from the initial list.

    This function will remove dg_to_remove from a list of data groups. If the
    data group is duplicated, all occurrences are removed. The standard usage
    is to remove the data group if it is an exact match (i.e. if the data
    group to be removed is followed by a space, a tab, a new line or a
    return). One can also remove all data groups starting with `dg_to_remove`
    by using strict=False.

    Args:
        dg_to_remove (str): Data group to be removed from the list. Can be
            given with or without the dollar ("$") sign.
        dg_list (list): List of strings. Each string should be a data group,
            i.e. should start with a dollar ("$") sign and should not contain
            any other dollar sign.
        strict (bool): If False, all the data groups starting with
            `dg_to_remove` will be removed from the list. Otherwise, the data
            group is removed only if it is an exact match of `dg_to_remove`.

    Returns:
        list: The list of data groups with `dg_to_remove` removed.
    """
    if dg_to_remove[0] == "$":
        dg_to_remove = dg_to_remove[1:]

    new_dg_list = []
    if strict:
        pattern = r"^\$" + dg_to_remove + r"\s"
    else:
        pattern = r"^\$" + dg_to_remove
    for dg in dg_list:
        if not re.match(pattern=pattern, string=dg):
            new_dg_list.append(dg)

    return new_dg_list
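
# A minimal usage sketch of remove_dg_from_list (arbitrary data groups chosen
# for illustration):
#
#   >>> remove_dg_from_list("scfiterlimit", ["$title\n", "$scfiterlimit 60\n", "$end\n"])
#   ['$title\n', '$end\n']
#   >>> remove_dg_from_list("scf", ["$scfiterlimit 60\n", "$scfconv 7\n", "$end\n"],
#   ...                     strict=False)
#   ['$end\n']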

def compare_datagroup_string(dg1, dg2, tol=None):
    """Compare two datagroup strings as split from the DataGroups object.

    Return True if the two match. The two strings should have the same number
    of lines and each line should have the same number of chunks. Chunks are
    separated using spaces and "=". Each chunk should match exactly as a
    string, except if they are numbers. In that case, if a tol is specified
    they will be converted to float and their difference should be lower than
    the tolerance.

    Args:
        dg1 (str): the first datagroup string to be compared.
        dg2 (str): the second datagroup string to be compared.
        tol (float): the tolerance allowed when comparing numbers. If None,
            even numbers should match exactly as strings.

    Returns:
        bool: True if the strings match.
    """
    lines1 = dg1.splitlines()
    lines2 = dg2.splitlines()
    if len(lines1) != len(lines2):
        return False

    for l1, l2 in zip(lines1, lines2):
        strings1 = re.split(r"\s+|=", l1)
        strings2 = re.split(r"\s+|=", l2)
        if len(strings1) != len(strings2):
            return False
        for s1, s2 in zip(strings1, strings2):
            if tol is not None:
                try:
                    f1 = float(s1.replace("D", "E"))
                    f2 = float(s2.replace("D", "E"))
                    if abs(f1 - f2) > tol:
                        return False
                except ValueError:
                    if s1 != s2:
                        return False
            elif s1 != s2:
                return False

    return True
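
# A minimal usage sketch of compare_datagroup_string (arbitrary values chosen
# for illustration): with a tolerance the numeric chunks are compared as
# floats, without it they must match as strings.
#
#   >>> compare_datagroup_string("$scfconv 1.0e-7\n", "$scfconv 1.000001e-7\n", tol=1e-10)
#   True
#   >>> compare_datagroup_string("$scfconv 1.0e-7\n", "$scfconv 1.000001e-7\n")
#   False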

class DataGroups(MSONable):
    """Generic class for data group formatted files and strings.

    DataGroups is a generic class for parsing, manipulating and generating
    strings/files using the data group format of TurboMole.
    """

    def __init__(self, string=None, dg_list=None):
        """Initialize a `DataGroups` object.

        The DataGroups object defines a set of key-value-like pairs. A data
        group is a key starting with a dollar ("$") sign. The value
        corresponding to the data group is called the data block.

        When initializing `DataGroups` from a string (e.g. reading a file),
        comments and/or unnecessary lines/spaces/... may be present in that
        string. In that case, the initial string is kept as a reference. Note
        that if changes are applied to the DataGroups object, the initial
        string is not updated, as comments may interfere with the
        modifications. The string representation of the object is instead
        always generated from, and thus in line with, the data group list
        (`dg_list`).

        Args:
            string (str): A string in the data group format.
            dg_list (list of str): A list of the data groups.
        """
        if dg_list is None:
            if string is None:
                raise ValueError('Both "string" and "dg_list" are None.')
            self.initial_string = string
            self.dg_list = split_string_to_dg_list(
                string=string, cleanup_types=None, remove_comment_types=None
            )
        else:
            self.dg_list = dg_list
            if string is None:
                self.initial_string = "".join(dg_list)
            else:
                self.initial_string = string

    def kill_data_group(self, data_group, strict=True):
        """Remove `data_group` from this `DataGroups` object.

        Args:
            data_group (str): Data group to be removed.
            strict (bool): If True, `data_group` should be an exact match.
                If False, any data group starting with `data_group` will be
                removed.
        """
        self.dg_list = remove_dg_from_list(
            dg_to_remove=data_group, dg_list=self.dg_list, strict=strict
        )

    kdg = kill_data_group

    def add_data_group(self, data_group, data_block):
        """Add `data_group`->`data_block` to this `DataGroups` object.

        Args:
            data_group (str): Data group (key) to be added. The dollar ("$")
                sign will be automatically added if not present.
            data_block (str): Data block corresponding to the data group.

        Raises:
            RuntimeError: if data_group already exists.
            ValueError: if data_group is not a valid data group name.
        """
        if data_group[0] != "$":
            data_group = "$" + data_group
        if self.show_data_group(data_group=data_group, strict=True) is not None:
            raise RuntimeError(
                'Data group "{}" already exists in this '
                "DataGroups object.".format(data_group)
            )
        if not re.fullmatch(pattern=r"^\$[a-zA-Z][ \-\w\(\)]*", string=data_group):
            raise ValueError(
                "Data group should start with a letter and be "
                "followed by alphanumeric characters, a space, "
                '"-", "_", "(" or ")".'
            )
        if not data_block.endswith("\n"):
            data_block = data_block + "\n"
        if data_block.startswith((" ", "\t", "\n")):
            self.dg_list.insert(-1, "{}{}".format(data_group, data_block))
        else:
            self.dg_list.insert(-1, "{} {}".format(data_group, data_block))
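
    # A minimal usage sketch of add_data_group (arbitrary data group chosen
    # for illustration): the new data group is inserted just before "$end".
    #
    #   >>> dg = DataGroups.empty()
    #   >>> dg.add_data_group("scfiterlimit", "60")
    #   >>> str(dg)
    #   '$scfiterlimit 60\n$end\n'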

    adg = add_data_group

    def show_data_group(
        self,
        data_group,
        strict=True,
        default=None,
        show_from_subfile=True,
        raise_if_multiple_subfiles=False,
        raise_if_missing_subfile=False,
        raise_if_regular_and_subfile=False,
    ):
        """Show `data_group` from this `DataGroups` object.

        Args:
            data_group (str): Data group (key) to be shown. The dollar ("$")
                sign will be automatically added if not present.
            strict (bool): Whether `data_group` should be an exact match or
                if data groups starting with `data_group` are allowed.
            default (str): the default value that will be returned if the
                data_group is not present in the list. Default is None.
            show_from_subfile (bool): If True, will show `data_group` from
                within the "subfile" if the data block contains a
                "file=FILENAME". If False, the "file=FILENAME" block is
                simply returned. This supposes that the file exists in the
                current directory.
            raise_if_multiple_subfiles (bool): Whether to raise an error if
                multiple "file=" directives are present in this data group.
                If False, just returns the standard data block.
            raise_if_missing_subfile (bool): Whether to raise an error if the
                subfile does not exist in the current directory or exists but
                is empty. If False, just returns the standard data block.
            raise_if_regular_and_subfile (bool): Whether to raise an error if
                the data group contains both a reference to a file with a
                "file=FILENAME" and regular data block options. If False,
                just returns the standard data block.

        Returns:
            str: the value of the selected data group, or `default` if the
                data group is not found.

        Raises:
            RuntimeError: if multiple occurrences of `data_group` are found,
                if multiple "file=" directives are present (and
                raise_if_multiple_subfiles is True), if the subfile is
                missing or empty (and raise_if_missing_subfile is True) or if
                regular data block options coexist with a reference to a
                subfile (and raise_if_regular_and_subfile is True).
        """
        if data_group[0] == "$":
            data_group = data_group[1:]

        if strict:
            pattern = r"^\$" + data_group + r"[=\s]"
        else:
            pattern = r"^\$" + data_group

        matches = []
        for dg in self.dg_list:
            if re.match(pattern=pattern, string=dg):
                matches.append(dg)

        if len(matches) == 0:
            return default

        dollar_data_group = "$" + data_group
        if len(matches) > 1:
            raise RuntimeError(
                "Found multiple occurrences of data group "
                '"{}".'.format(dollar_data_group)
            )

        if strict:
            data_block = matches[0].replace(dollar_data_group, "")
        else:
            match = re.sub(r"^\$" + data_group + r"[-\w]*", "", matches[0])
            data_block = re.sub(r"^ ", "", match)

        if not show_from_subfile:
            return data_block

        subfiles_count = data_block.count("file=")
        if subfiles_count == 0:
            return data_block
        elif subfiles_count == 1:
            filename = self._get_subfile_fname(
                data_block=data_block,
                raise_if_regular_and_subfile=raise_if_regular_and_subfile,
            )
            if os.path.exists(filename):
                with open(filename, "r") as f:
                    subfile_string = f.read()
                if subfile_string.strip():
                    subfile = DataGroups(subfile_string)
                    return subfile.show_data_group(data_group=data_group)
            if raise_if_missing_subfile:
                raise RuntimeError(
                    'File "{}" for data group "{}" is '
                    "missing or empty.".format(filename, data_group)
                )
            return data_block
        elif raise_if_multiple_subfiles:
            raise RuntimeError('Multiple "file=FILENAME" directives.')
        return data_block
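
    # A minimal usage sketch of show_data_group (arbitrary data group chosen
    # for illustration); note that the returned data block keeps its leading
    # space and trailing newline.
    #
    #   >>> dg = DataGroups("$scfiterlimit 60\n$end\n")
    #   >>> dg.show_data_group("scfiterlimit")
    #   ' 60\n'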

    sdg = show_data_group

    def change_data_group(self, data_group, data_block):
        """Change the value of the `data_group` to the new `data_block`.

        If the key is not present, it will be added. This method first
        applies kill_data_group and then add_data_group. Note that in the
        current implementation, the data group will be moved to the end of
        the data groups object (just before $end).

        If `data_block` is None, this is equivalent to kill_data_group. To
        set a data group with no explicit value, use an empty string for
        `data_block` (e.g. to add $uhf the input should be data_group="uhf"
        and data_block="").

        Args:
            data_group (str): Data group (key) to be changed. The dollar
                ("$") sign will be automatically added if not present.
            data_block (str): Data block corresponding to the data group. If
                None, it will simply kill_data_group for the specified
                data_group value.
        """
        self.kill_data_group(data_group=data_group, strict=True)
        if data_block is not None:
            self.add_data_group(data_group=data_group, data_block=data_block)
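
    # A minimal usage sketch of change_data_group (arbitrary data group chosen
    # for illustration):
    #
    #   >>> dg = DataGroups("$scfiterlimit 60\n$end\n")
    #   >>> dg.change_data_group("scfiterlimit", "100")
    #   >>> str(dg)
    #   '$scfiterlimit 100\n$end\n'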

    cdg = change_data_group

    def modify_data_group_options(self, data_group, options):
        """Modify the options of a data group.

        Given a data group that allows several options on separate lines
        (e.g. $dft), updates the values of the options according to the
        dictionary provided. The option dictionary should have the form:

        .. code-block:: text

            {"option_name1": "option_name1 option_value",
             "option_name2": "option_name2=option_value"}

        The key will be used to identify the line to be modified and that
        line will be entirely replaced by the value. Since different options
        may be defined in different ways, no attempt is made here to identify
        the suitable format for the option. It is the responsibility of the
        caller to specify the line for the option in the correct format.

        If the data group is not present, it will be created with the
        specified options. If the entire data group should be modified, it
        would be safer to use change_data_group.

        Args:
            data_group (str): Data group (key) to be changed. The dollar
                ("$") sign will be automatically added if not present.
            options (dict): The options that should be added or updated. If
                the value of one key is None, the option will be removed if
                present.
        """
        dg = self.show_data_group(data_group, strict=True)
        # start from a line with a space if dg is not present (should not be empty)
        if dg is None:
            dg = " "

        # copy the dict as it will be modified
        options = dict(options)
        new_lines = []
        for line in dg.splitlines():
            # check if one of the lines matches the options. If yes replace
            # the line and remove it from the options dict, otherwise keep
            # the original line.
            for opt in list(options.keys()):
                if opt in line:
                    opt_kv = options.pop(opt)
                    # if None skip the line
                    if opt_kv is not None:
                        new_lines.append(" " + opt_kv)
                    break
            else:
                new_lines.append(line)

        # add all the options that were not present in the original set
        # except for None. Sorted so that the results are deterministic
        new_values = set(options.values())
        new_values.discard(None)
        for opt_kv in sorted(new_values):
            new_lines.append(" " + opt_kv)

        self.change_data_group(data_group, "\n".join(new_lines))
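
    # A minimal usage sketch of modify_data_group_options (arbitrary $dft
    # options chosen for illustration): the "gridsize" line is replaced by
    # the value given in the dictionary, the other lines are kept.
    #
    #   >>> dg = DataGroups("$dft\n   functional b-p\n   gridsize m3\n$end\n")
    #   >>> dg.modify_data_group_options("dft", {"gridsize": "gridsize m4"})
    #   >>> str(dg)
    #   '$dft\n   functional b-p\n gridsize m4\n$end\n'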

    mdgo = modify_data_group_options

    def show_data_group_option(self, data_group, option, default=None):
        """Show a data group option.

        Given a data group that allows several options on separate lines
        (e.g. $dft), returns the value of the option provided. Since each
        value may be defined in a different way, the returned value will
        include anything after the option specified. It is up to the caller
        to determine if there are symbols like "=" that should be removed,
        depending on the data group and option that is queried.

        Args:
            data_group (str): Data group (key) to be shown. The dollar ("$")
                sign will be automatically added if not present.
            option (str): The option that should be returned.
            default (str): the default value that will be returned if the
                data_group or the option are not present in the list.

        Returns:
            str: The value of the option.
        """
        dg = self.show_data_group(
            data_group,
            strict=True,
            show_from_subfile=True,
            raise_if_missing_subfile=False,
        )
        if dg is None:
            return default

        compiled_re = re.compile(r"\s*{}(.*)$".format(option), re.DOTALL)
        for line in dg.splitlines():
            match = compiled_re.match(line)
            if match:
                return match.group(1)

        return default
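
    # A minimal usage sketch of show_data_group_option (arbitrary $dft options
    # chosen for illustration); anything after the option name is returned.
    #
    #   >>> dg = DataGroups("$dft\n   functional b-p\n   gridsize m3\n$end\n")
    #   >>> dg.show_data_group_option("dft", "gridsize")
    #   ' m3'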

    sdgo = show_data_group_option

    @property
    def number_of_data_groups(self):
        """Get the number of datagroups in this DataGroups object."""
        return len(self.dg_list) - 1

    ndg = number_of_data_groups

    @classmethod
    def empty(cls):
        """Create an empty DataGroups object.

        An empty DataGroups object only contains the "$end" data group.
        """
        return cls(dg_list=["$end\n"])

    def as_dict(self):
        """Return a dictionary representing the DataGroups object."""
        return {
            "@module": self.__class__.__module__,
            "@class": self.__class__.__name__,
            "string": self.initial_string,
            "dg_list": self.dg_list,
        }

    @classmethod
    def from_dict(cls, d):
        """Create DataGroups object from a dictionary."""
        return cls(string=d["string"], dg_list=d["dg_list"])

    def __str__(self):
        """Return a string representation of this DataGroups object.

        The string representation of the DataGroups object is supposed to be
        written to a file and used as an input to TurboMole executables.
        """
        return "".join(self.dg_list)

    def to_file(self, filename):
        """Write this DataGroups object to a file.

        If the file exists it will be overwritten.

        Args:
            filename (str): Name of the file to which this DataGroups object
                should be written.
        """
        with open(filename, "w") as f:
            f.write(self.__str__())

    @classmethod
    def from_file(cls, filename):
        """Create DataGroups object reading from a given file.

        Args:
            filename (str): Name of the file from which this DataGroups
                object should be read.
        """
        with open(filename, "r") as f:
            string = f.read()
        return cls(string=string)

    @staticmethod
    def _get_subfile_fname(data_block, raise_if_regular_and_subfile=False):
        """Extract the name of the file from a datagroup string.

        Args:
            data_block (str): the string with the content of the data group.
            raise_if_regular_and_subfile (bool): if True will raise an error
                if the string contains both a file=FILENAME and other data.
                If False will return the filename anyway.

        Returns:
            str: the name of the file, None if no match for "file=FILENAME"
                is found.

        Raises:
            RuntimeError: if raise_if_regular_and_subfile is True and
                file=FILENAME is not the only content of the string.
        """
        pattern = r"file=([^\s]+)"
        match = re.search(pattern=pattern, string=data_block)
        if not match:
            return None

        data_block_test = data_block.replace(match.group(0), "")
        data_block_test = data_block_test.strip()
        if data_block_test != "" and raise_if_regular_and_subfile:
            raise RuntimeError(
                "Both a reference to a file "
                '("file=FILENAME") and regular data '
                "blocks are in the data group."
            )

        return match.group(1)

    def show_subfile_fname(self, data_group, raise_if_regular_and_subfile=False):
        """Extract the name of the file from a datagroup.

        Args:
            data_group (str): the data group (key) from which the file name
                should be extracted. The dollar ("$") sign will be
                automatically added if not present.
            raise_if_regular_and_subfile (bool): if True will raise an error
                if the data block contains both a file=FILENAME and other
                data. If False will return the filename anyway.

        Returns:
            str: the name of the file, None if no match for "file=FILENAME"
                is found.

        Raises:
            RuntimeError: if raise_if_regular_and_subfile is True and
                file=FILENAME is not the only content of the data block.
        """
        return self._get_subfile_fname(
            self.sdg(data_group, show_from_subfile=False),
            raise_if_regular_and_subfile=raise_if_regular_and_subfile,
        )
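
    # A minimal usage sketch of show_subfile_fname (arbitrary data group
    # chosen for illustration): the file name is extracted from the "file="
    # directive without reading the file itself.
    #
    #   >>> dg = DataGroups("$coord file=coord\n$end\n")
    #   >>> dg.show_subfile_fname("coord")
    #   'coord'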

    def compare(self, datagroups, tol=None, ignored_dg=None, return_all_diffs=False):
        """Compare the current object with another DataGroups object.

        Aside from the data groups that should be ignored (defined in the
        ignored_dg argument), the two should contain the same number of
        elements in dg_list, and each element of the current object should
        have a matching element in the object passed as argument.

        To match, two strings should have the same number of lines and each
        line should have the same number of chunks. Each chunk should match
        exactly as a string, except if they are numbers. In that case, if a
        tol is specified they will be converted to float and their difference
        should be lower than the tolerance.

        If the two objects match, None will be returned; otherwise a message
        describing what caused the match to fail.

        Note that, given the variety of notations supported by Turbomole,
        this function is intended more for testing purposes than as a
        complete tool to guarantee the equivalence of two control files.

        Args:
            datagroups (DataGroups): a DataGroups object that should match
                the current instance.
            tol (float): the tolerance allowed when comparing numbers. If
                None, even numbers should match exactly as strings.
            ignored_dg (list): a list of data groups that should be ignored
                for the comparison. Each element is expected to be the name
                of a data group, thus starting with a "$". If the "$" symbol
                is not present, it will be added before trying to match the
                data group to skip.
            return_all_diffs (bool): If True, a list of all differences is
                returned. If False (default), a string describing the first
                source of difference found is returned.

        Returns:
            None if the two objects match; otherwise a string describing a
            source of difference (if return_all_diffs is False) or a list of
            all differences (if return_all_diffs is True).
        """
        dg_other_list = list(datagroups.dg_list)
        diffs = []
        if ignored_dg:
            # add the initial $ to each datagroup if not already present.
            ignored_dg = [
                "$" + dg if not dg.startswith("$") else dg for dg in ignored_dg
            ]

        for dg1 in self.dg_list:
            # skip the check if dg1 matches the list of datagroups that should
            # be ignored.
            if ignored_dg and any(dg1.startswith(dg) for dg in ignored_dg):
                continue
            for i, dg2 in enumerate(dg_other_list):
                if compare_datagroup_string(dg1, dg2, tol):
                    dg_other_list.pop(i)
                    break
            else:
                msg = "Datagroup does not match to any of the references: {}".format(
                    dg1
                )
                if return_all_diffs:
                    diffs.append(msg)
                else:
                    return msg

        # check that all the remaining datagroups in the second control belong
        # to the ignore list.
        for dg2 in dg_other_list:
            if not ignored_dg or not any(dg2.startswith(dg) for dg in ignored_dg):
                msg = (
                    "Datagroup in the reference does not match to any of the current "
                    "control: {}".format(dg2)
                )
                if return_all_diffs:
                    diffs.append(msg)
                else:
                    return msg

        if return_all_diffs:
            return diffs if len(diffs) > 0 else None

        return None
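
    # A minimal usage sketch of compare (arbitrary values chosen for
    # illustration): with a tolerance the two objects are considered
    # equivalent, so None is returned.
    #
    #   >>> dg1 = DataGroups("$scfconv 1.0e-7\n$end\n")
    #   >>> dg2 = DataGroups("$scfconv 1.000001e-7\n$end\n")
    #   >>> dg1.compare(dg2, tol=1e-10) is None
    #   True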