# Source code for solvation_analysis.speciation

"""
================
Speciation
================
:Author: Orion Cohen, Tingzheng Hou, Kara Fong
:Year: 2021
:Copyright: GNU Public License v3

Explore the precise solvation shell of every solute.

Speciation tabulates the unique solvation shell compositions, their fraction,
and their temporal locations.

From this, it provides search functionality to query for specific solvation shell
compositions. Extremely convenient for visualization.

While ``speciation`` can be used in isolation, it is meant to be used
as an attribute of the Solute class. This makes instantiating it and calculating the
solvation data a non-issue.
"""

import pandas as pd

import solvation_analysis
from solvation_analysis._column_names import (
    FRAME,
    SOLUTE_IX,
    SOLVENT,
    SOLVENT_IX,
    COUNT,
)



class Speciation:
    """
    Calculate the solvation shells of every solute.

    Speciation organizes the solvation data by the type of residue
    coordinated with the central solute. It collects this information in a
    pandas.DataFrame indexed by the frame and solute number. Each column is
    one of the solvents in the solvent_name column of the solvation data. The
    column value is how many residues of that type are in the solvation shell.

    Speciation provides the speciation of each solute in the speciation_data
    attribute, it also calculates the fraction of each unique
    shell and makes it available in the speciation_fraction attribute.

    Additionally, there are methods for finding solvation shells of
    interest and computing how common certain shell configurations are.

    Parameters
    ----------
    solvation_data : pandas.DataFrame
        The solvation dataframe output by Solute.
    n_frames : int
        The number of frames in solvation_data.
    n_solutes : int
        The number of solutes in solvation_data.
    """

    def __init__(
        self, solvation_data: pd.DataFrame, n_frames: int, n_solutes: int
    ) -> None:
        self.solvation_data = solvation_data
        self.n_frames = n_frames
        self.n_solutes = n_solutes
        self._speciation_df, self._speciation_fraction = self._compute_speciation()
        # NOTE: the right-hand side resolves to the bound method and is called
        # before the assignment; afterwards this instance attribute shadows the
        # method of the same name and holds its result (read by the
        # ``solvent_co_occurrence`` property).
        self._solvent_co_occurrence = self._solvent_co_occurrence()

    @staticmethod
    def from_solute(solute: "solvation_analysis.Solute") -> "Speciation":
        """
        Generate a Speciation object from a solute.

        Parameters
        ----------
        solute : Solute
            A solute whose ``has_run`` attribute is truthy; its
            ``solvation_data``, ``n_frames`` and ``n_solutes`` attributes
            are passed to the constructor.

        Returns
        -------
        Speciation

        Raises
        ------
        AssertionError
            If ``solute.has_run`` is falsy.
        """
        assert solute.has_run, "The solute must be run before calling from_solute"
        return Speciation(
            solute.solvation_data,
            solute.n_frames,
            solute.n_solutes,
        )

    def _compute_speciation(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Build the per-solute shell compositions and their fractions.

        Returns
        -------
        speciation_data : pandas.DataFrame
            One row per (frame, solute) present in solvation_data, one column
            per solvent, values are the number of that solvent in the shell.
        speciation_fraction : pandas.DataFrame
            One row per unique shell composition, sorted from most to least
            common, with the COUNT column holding the number of occurrences
            divided by ``n_frames * n_solutes``.
        """
        # number of coordinating solvents of each type for every (frame, solute)
        counts = self.solvation_data.groupby([FRAME, SOLUTE_IX, SOLVENT]).count()[
            SOLVENT_IX
        ]
        counts_re = counts.reset_index([SOLVENT])
        # solvent names become columns; a solvent absent from a shell counts as 0
        speciation_data = counts_re.pivot(columns=[SOLVENT]).fillna(0).astype(int)
        # the pivot leaves two-level columns; keep only the solvent-name level
        res_names = speciation_data.columns.levels[1]
        speciation_data.columns = res_names
        # tally identical shell compositions, most common first
        sum_series = speciation_data.groupby(speciation_data.columns.to_list()).size()
        sum_sorted = sum_series.sort_values(ascending=False)
        # the unnamed size series becomes column 0 after reset_index
        speciation_fraction = sum_sorted.reset_index().rename(columns={0: COUNT})
        speciation_fraction[COUNT] = speciation_fraction[COUNT] / (
            self.n_frames * self.n_solutes
        )
        return speciation_data, speciation_fraction

    @classmethod
    def _mean_speciation(
        cls, speciation_frames: pd.DataFrame, solute_number: int, frame_number: int
    ) -> pd.Series:
        """
        Sum each row of speciation_frames and divide by
        ``solute_number * frame_number``.
        """
        means = speciation_frames.sum(axis=1) / (solute_number * frame_number)
        return means

    @staticmethod
    def _select_shells(
        shells: pd.DataFrame, shell_dict: dict[str, int]
    ) -> pd.DataFrame:
        """
        Return the rows of shells whose columns equal the counts in shell_dict.

        Column names are backtick-quoted so that solvent names which are not
        valid Python identifiers (e.g. ``'PF6-'`` or names with spaces) can be
        queried. An empty shell_dict places no constraint and returns a copy
        of every row.
        """
        if not shell_dict:
            return shells.copy()
        query = " and ".join(
            f"`{name}` == {count}" for name, count in shell_dict.items()
        )
        return shells.query(query)

    def calculate_shell_fraction(self, shell_dict: dict[str, int]) -> float:
        """
        Calculate the fraction of shells matching shell_dict.

        This function computes the fraction of solvation shells that exist with a particular
        composition. The composition is specified by the shell_dict. The fraction
        will be of all shells that match that specification.

        Parameters
        ----------
        shell_dict : dict of {str: int}
            a specification for a shell composition. Keys are residue names (str)
            and values are the number of desired residues. e.g. if shell_dict =
            {'mol1': 4} then the function will return the fraction of shells
            that have 4 mol1. Note that this may include shells with 4 mol1 and
            any number of other solvents. To specify a shell with 4 mol1 and nothing
            else, enter a dict such as {'mol1': 4, 'mol2': 0, 'mol3': 0}.
            An empty dict matches every shell.

        Returns
        -------
        float
            the fraction of shells

        Examples
        --------

         .. code-block:: python

            # first define Li, BN, and FEC AtomGroups
            >>> solute = Solute(Li, {'BN': BN, 'FEC': FEC, 'PF6': PF6})
            >>> solute.run()
            >>> solute.speciation.calculate_shell_fraction({'BN': 4, 'PF6': 1})
            0.0898
        """
        matching = self._select_shells(self.speciation_fraction, shell_dict)
        return matching[COUNT].sum()

    def get_shells(self, shell_dict: dict[str, int]) -> pd.DataFrame:
        """
        Find all solvation shells that match shell_dict.

        This returns the frame, solute index, and composition of all solutes
        that match the composition given in shell_dict.

        Parameters
        ----------
        shell_dict : dict of {str: int}
            a specification for a shell composition. Keys are residue names (str)
            and values are the number of desired residues. e.g. if shell_dict =
            {'mol1': 4} then the function will return all shells
            that have 4 mol1. Note that this may include shells with 4 mol1 and
            any number of other solvents. To specify a shell with 4 mol1 and nothing
            else, enter a dict such as {'mol1': 4, 'mol2': 0, 'mol3': 0}.
            An empty dict matches every shell.

        Returns
        -------
        pandas.DataFrame
            the index and composition of all shells that match shell_dict
        """
        return self._select_shells(self.speciation_data, shell_dict)

    def _solvent_co_occurrence(self) -> pd.DataFrame:
        """
        Calculate the co-occurrence of solvent molecules.

        Returns
        -------
        pandas.DataFrame
            actual / expected number of coordinating solvents. Each column
            corresponds to the solvent that is required to be in the shell,
            each row to the solvent found alongside it. Empty if
            speciation_data has no solvent columns.
        """
        speciation = self.speciation_data
        # share of all coordinated solvents that each solvent type accounts for;
        # independent of the solvent being examined, so computed once
        coordination_fraction = speciation.sum() / speciation.sum().sum()
        expected_solvents_list = []
        actual_solvents_list = []
        for solvent in speciation.columns.values:
            # shells that contain at least one molecule of this solvent
            shells_w_solvent = speciation.loc[speciation[solvent] > 0]
            n_shells = len(shells_w_solvent)
            n_solvents = shells_w_solvent.sum()
            # one slot per shell is taken by the required solvent itself;
            # the remainder are the available coordinating solvent slots
            n_coordination_slots = n_solvents.sum() - n_shells
            # expected number of coordinating solvents in those slots
            expected_solvents = coordination_fraction * n_coordination_slots
            # actual number of coordinating solvents, again excluding the one
            # required molecule per shell
            actual_solvents = n_solvents.copy()
            actual_solvents[solvent] = actual_solvents[solvent] - n_shells
            # name series so they become columns labelled by this solvent
            expected_solvents.name = solvent
            actual_solvents.name = solvent
            expected_solvents_list.append(expected_solvents)
            actual_solvents_list.append(actual_solvents)
        if not actual_solvents_list:
            # we return this if nothing is solvated
            return pd.DataFrame()
        # make DataFrames
        actual_df = pd.concat(actual_solvents_list, axis=1)
        expected_df = pd.concat(expected_solvents_list, axis=1)
        # calculate correlation matrix
        correlation = actual_df / expected_df
        return correlation

    @property
    def speciation_data(self) -> pd.DataFrame:
        """
        A dataframe containing the speciation of every solute at
        every trajectory frame. Indexed by timestep and solute numbers.
        Columns are the solvent molecules and values are the number
        of solvent in the shell.
        """
        return self._speciation_df

    @property
    def speciation_fraction(self) -> pd.DataFrame:
        """
        The fraction of shells of each type. Columns are the solvent
        molecules and values are the number of solvent in the shell.
        The final column is the fraction of total shell of that
        particular composition.
        """
        return self._speciation_fraction

    @property
    def solvent_co_occurrence(self) -> pd.DataFrame:
        """
        The actual co-occurrence of solvents divided by the expected co-occurrence.
        In other words, given one molecule of solvent i in the shell, what is the
        probability of finding a solvent j relative to choosing a solvent at random
        from the pool of all coordinated solvents. This matrix will
        likely not be symmetric.
        """
        return self._solvent_co_occurrence