Source code for solvation_analysis.residence

"""
================
Residence
================
:Author: Orion Cohen, Tingzheng Hou, Kara Fong
:Year: 2021
:Copyright: GNU Public License v3

Understand the dynamic coordination of solvents with the solute.

Residence times for all solvents are automatically calculated from autocovariance
of the solvent-solute adjacency matrix.

While ``residence`` can be used in isolation, it is meant to be used
as an attribute of the Solute class. This makes instantiating it and calculating the
solvation data a non-issue.
"""

import math
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acovf
from scipy.optimize import curve_fit

import solvation_analysis
from solvation_analysis._column_names import (
    SOLVENT,
    SOLUTE_ATOM_IX,
    SOLVENT_ATOM_IX,
    SOLUTE_IX,
)
from solvation_analysis._utils import calculate_adjacency_dataframe



[docs]
class Residence:
    """
    Calculate the residence times of solvents.

    This class calculates the residence time of each solvent on the solute.
    The residence time is in units of Solute frames, so if the Solute object
    has 1000 frames over 1 nanosecond, then each frame will be 1 picosecond.
    Thus a residence time of 100 would translate to 100 picoseconds.

    Two residence time implementations are available. Both calculate the
    solute-solvent autocorrelation function for each solute-solvent pair,
    take and take the mean over the solvents of each type, this should yield
    an exponentially decaying autocorrelation function.

    The first implementation fits an exponential curve to the autocorrelation
    function and extract the time constant, which is inversely proportional to the
    residence time. This result is saved in the ``residence_times_fit`` attribute.
    Unfortunately, the fit often fails to converge (value is set to np.nan),
    making this method unreliable.

    Instead, the default implementation is to simply find point where the
    value of the autocorrelation function is 1/e, which is the time constant
    of an exact exponential. These values are saved in ``residence_times``.

    It is recommended that the user visually inspect the autocorrelation function
    with ``Residence.plot_autocorrelation_function`` to ensure an approximately
    exponential decay. The residence times are only valid insofar as the autocorrelation
    function resembles an exact exponential, it should decays to zero with a long tail.
    If the exponential does not decay to zero or its slope does not level off, increasing
    the simulation time may help. For this technique to be appropriate, the simulation time
    should exceed the residence time.

    A fuller description of the method can be found in
    `Self, Fong, and Persson <https://pubs-acs-org.libproxy.berkeley.edu/doi/full/10.1021/acsenergylett.9b02118>`_

    Parameters
    ----------
    solvation_data : pandas.DataFrame
        The solvation data frame output by Solute.
    step : int
        The spacing of frames in solvation_data. This should be equal
        to solute.step.

    Examples
    --------

     .. code-block:: python

        # first define Li, BN, and FEC AtomGroups
        >>> solute = Solute(Li, {'BN': BN, 'FEC': FEC, 'PF6': PF6})
        >>> residence = Residence.from_solute(solute)
        >>> residence.residence_times_cutoff
        {'BN': 4.02, 'FEC': 3.79, 'PF6': 1.15}
    """


[docs]
    def __init__(self, solvation_data: pd.DataFrame, step: int) -> None:
        self.solvation_data = solvation_data
        self._auto_covariances = self._calculate_auto_covariance_dict()
        self._residence_times_cutoff = self._calculate_residence_times_with_cutoff(
            self._auto_covariances, step
        )
        self._residence_times_fit, self._fit_parameters = (
            self._calculate_residence_times_with_fit(self._auto_covariances, step)
        )



[docs]
    @staticmethod
    def from_solute(solute: "solvation_analysis.Solute") -> "Residence":
        """
        Generate a Residence object from a solute.

        Parameters
        ----------
        solute : Solute

        Returns
        -------
        Residence
        """
        assert solute.has_run, "The solute must be run before calling from_solute"
        return Residence(solute.solvation_data, solute.step)


    def _calculate_auto_covariance_dict(self) -> dict[str, np.ndarray]:
        partial_index = self.solvation_data.index.droplevel(SOLVENT_ATOM_IX)
        unique_indices = np.unique(partial_index)
        frame_solute_index = pd.MultiIndex.from_tuples(
            unique_indices, names=partial_index.names
        )
        auto_covariance_dict = {}
        for res_name, res_solvation_data in self.solvation_data.groupby([SOLVENT]):
            if isinstance(res_name, tuple):
                res_name = res_name[0]
            adjacency_mini = calculate_adjacency_dataframe(res_solvation_data)
            adjacency_df = adjacency_mini.reindex(frame_solute_index, fill_value=0)
            auto_covariance = Residence._calculate_auto_covariance(adjacency_df)
            # normalize
            auto_covariance = auto_covariance / np.max(auto_covariance)
            auto_covariance_dict[res_name] = auto_covariance
        return auto_covariance_dict

    @staticmethod
    def _calculate_residence_times_with_cutoff(
        auto_covariances: dict[str, np.ndarray],
        step: int,
        convergence_cutoff: float = 0.1,
    ) -> dict[str, float]:
        residence_times = {}
        for res_name, auto_covariance in auto_covariances.items():
            if np.min(auto_covariance) > convergence_cutoff:
                residence_times[res_name] = np.nan
                warnings.warn(
                    f"the autocovariance for {res_name} does not converge to zero "
                    "so a residence time cannot be calculated. A longer simulation "
                    "is required to get a valid estimate of the residence time."
                )
            unassigned = True
            for frame, val in enumerate(auto_covariance):
                if val < 1 / math.e:
                    residence_times[res_name] = frame * step
                    unassigned = False
                    break
            if unassigned:
                residence_times[res_name] = np.nan
        return residence_times

    @staticmethod
    def _calculate_residence_times_with_fit(
        auto_covariances: dict[str, np.ndarray], step: int
    ) -> tuple[dict[str, float], dict[str, tuple[float, float, float]]]:
        # calculate the residence times
        residence_times = {}
        fit_parameters = {}
        for res_name, auto_covariance in auto_covariances.items():
            res_time, params = Residence._fit_exponential(auto_covariance, res_name)
            residence_times[res_name], fit_parameters[res_name] = (
                round(res_time * step, 2),
                params,
            )
        return residence_times, fit_parameters


[docs]
    def plot_auto_covariance(self, res_name: str) -> tuple[plt.Figure, plt.Axes]:
        """
        Plot the autocovariance of a solvent on the solute.

        See the discussion in the class docstring for more information.

        Parameters
        ----------
        res_name : str
            the name of a solvent in the solute.

        Returns
        -------
        fig : matplotlib.Figure
        ax : matplotlib.Axes
        """
        auto_covariance = self.auto_covariances[res_name]
        frames = np.arange(len(auto_covariance))
        params = self.fit_parameters[res_name]

        def exp_func(x):
            return self._exponential_decay(x, *params)

        exp_fit = np.array(map(exp_func, frames))
        fig, ax = plt.subplots()
        ax.plot(frames, auto_covariance, "b-", label="auto covariance")
        try:
            ax.scatter(frames, exp_fit, label="exponential fit")
        except RuntimeError:
            warnings.warn(
                f"The fit for {res_name} failed so the exponential "
                f"fit will not be plotted."
            )
        ax.hlines(y=1 / math.e, xmin=frames[0], xmax=frames[-1], label="1/e cutoff")
        ax.set_xlabel("Timestep (frames)")
        ax.set_ylabel("Normalized Autocovariance")
        ax.set_ylim(0, 1)
        ax.legend()
        return fig, ax


    @staticmethod
    def _exponential_decay(x: np.ndarray, a: float, b: float, c: float) -> np.ndarray:
        """
        An exponential decay function.

        Args:
            x: Independent variable.
            a: Initial quantity.
            b: Exponential decay constant.
            c: Constant.

        Returns:
            The acf
        """
        return a * np.exp(-b * x) + c

    @staticmethod
    def _fit_exponential(
        auto_covariance: np.ndarray, res_name: str
    ) -> tuple[float, tuple[float, float, float]]:
        auto_covariance_norm = auto_covariance / auto_covariance[0]
        try:
            params, param_covariance = curve_fit(
                Residence._exponential_decay,
                np.arange(len(auto_covariance_norm)),
                auto_covariance_norm,
                p0=(1, 0.1, 0.01),
            )
            tau = 1 / params[1]  # p
        except RuntimeError:
            warnings.warn(
                f"The fit for {res_name} failed so its values in"
                f"residence_time_fits and fit_parameters will be"
                f"set to np.nan."
            )
            tau, params = np.nan, (np.nan, np.nan, np.nan)
        return tau, params

    @staticmethod
    def _calculate_auto_covariance(adjacency_matrix: pd.DataFrame) -> np.ndarray:
        auto_covariances = []
        timesteps = adjacency_matrix.index.levels[0]

        for solute_ix, solute_df in adjacency_matrix.groupby(
            [SOLUTE_IX, SOLUTE_ATOM_IX]
        ):
            # this is needed to make sure auto-covariances can be concatenated later
            new_solute_df = solute_df.droplevel([SOLUTE_IX, SOLUTE_ATOM_IX]).reindex(
                timesteps, fill_value=0
            )
            non_zero_cols = new_solute_df.loc[:, (solute_df != 0).any(axis=0)]
            auto_covariance_df = non_zero_cols.apply(
                acovf,
                axis=0,
                result_type="expand",
                demean=False,
                adjusted=True,
                fft=True,
            )
            # timesteps with no binding are getting skipped, we need to make sure to include all timesteps
            auto_covariances.append(auto_covariance_df.values)

        auto_covariance = np.mean(np.concatenate(auto_covariances, axis=1), axis=1)
        return auto_covariance

    @property
    def auto_covariances(self) -> dict[str, np.ndarray]:
        """
        A dictionary where keys are residue names and values are the
        autocovariance of the that residue on the solute.
        """
        return self._auto_covariances

    @property
    def residence_times_cutoff(self) -> dict[str, float]:
        """
        A dictionary where keys are residue names and values are the
        residence times of the that residue on the solute, calculated
        with the 1/e cutoff method.
        """
        return self._residence_times_cutoff

    @property
    def residence_times_fit(self) -> dict[str, float]:
        """
        A dictionary where keys are residue names and values are the
        residence times of the that residue on the solute, calculated
        with the exponential fit method.
        """
        return self._residence_times_fit

    @property
    def fit_parameters(self) -> dict[str, tuple[float, float, float]]:
        """
        A dictionary where keys are residue names and values are the
        parameters for the exponential fit to the autocorrelation function.
        """
        return self._fit_parameters