Source code for solvation_analysis.speciation

"""
================
Speciation
================
:Author: Orion Cohen, Tingzheng Hou, Kara Fong
:Year: 2021
:Copyright: GNU Public License v3

Explore the precise solvation shell of every solute.

Speciation tabulates the unique solvation shell compositions, their fraction,
and their temporal locations.

From this, it provides search functionality to query for specific solvation shell
compositions. Extremely convenient for visualization.

While ``speciation`` can be used in isolation, it is meant to be used
as an attribute of the Solute class. This makes instantiating it and calculating the
solvation data a non-issue.
"""

import pandas as pd

import solvation_analysis
from solvation_analysis._column_names import (
    FRAME,
    SOLUTE_IX,
    SOLVENT,
    SOLVENT_IX,
    COUNT,
)


class Speciation:
    """
    Calculate the solvation shells of every solute.

    Speciation organizes the solvation data by the type of residue
    coordinated with the central solute. It collects this information in a
    pandas.DataFrame indexed by the frame and solute number. Each column is
    one of the solvents in the solvent_name column of the solvation data.
    The column value is how many residues of that type are in the solvation
    shell.

    Speciation provides the speciation of each solute in the
    ``speciation_data`` attribute, it also calculates the fraction of each
    unique shell and makes it available in the ``speciation_fraction``
    attribute.

    Additionally, there are methods for finding solvation shells of interest
    and computing how common certain shell configurations are.

    Parameters
    ----------
    solvation_data : pandas.DataFrame
        The solvation dataframe output by Solute.
    n_frames : int
        The number of frames in solvation_data.
    n_solutes : int
        The number of solutes in solvation_data.
    """

    def __init__(
        self, solvation_data: pd.DataFrame, n_frames: int, n_solutes: int
    ) -> None:
        self.solvation_data = solvation_data
        self.n_frames = n_frames
        self.n_solutes = n_solutes
        self._speciation_df, self._speciation_fraction = self._compute_speciation()
        # The co-occurrence calculation reads self.speciation_data, so it must
        # run after the speciation tables have been stored.
        self._solvent_co_occurrence = self._compute_solvent_co_occurrence()

    @staticmethod
    def from_solute(solute: "solvation_analysis.Solute") -> "Speciation":
        """
        Generate a Speciation object from a solute.

        Parameters
        ----------
        solute : Solute
            A solute that has already been run.

        Returns
        -------
        Speciation

        Raises
        ------
        AssertionError
            If ``solute.has_run`` is falsy.
        """
        assert solute.has_run, "The solute must be run before calling from_solute"
        return Speciation(
            solute.solvation_data,
            solute.n_frames,
            solute.n_solutes,
        )

    def _compute_speciation(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Build the per-shell composition table and the shell fraction table.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.DataFrame)
            The first frame has one row per (frame, solute) and one column per
            solvent holding the number of that solvent in the shell. The
            second frame has one row per unique composition with its count
            divided by ``n_frames * n_solutes`` in the COUNT column.
        """
        # count the coordinated solvents of each type around each solute
        counts = self.solvation_data.groupby([FRAME, SOLUTE_IX, SOLVENT]).count()[
            SOLVENT_IX
        ]
        counts_re = counts.reset_index([SOLVENT])
        # one column per solvent; a solvent missing from a shell counts as 0
        speciation_data = counts_re.pivot(columns=[SOLVENT]).fillna(0).astype(int)
        # drop the outer column level left over from the pivot
        res_names = speciation_data.columns.levels[1]
        speciation_data.columns = res_names
        # tally identical compositions, most common first
        sum_series = speciation_data.groupby(speciation_data.columns.to_list()).size()
        sum_sorted = sum_series.sort_values(ascending=False)
        speciation_fraction = sum_sorted.reset_index().rename(columns={0: COUNT})
        speciation_fraction[COUNT] = speciation_fraction[COUNT] / (
            self.n_frames * self.n_solutes
        )
        return speciation_data, speciation_fraction

    @classmethod
    def _mean_speciation(
        cls, speciation_frames: pd.DataFrame, solute_number: int, frame_number: int
    ) -> pd.Series:
        """
        Sum each row of speciation_frames and divide by the number of
        solute-frames (``solute_number * frame_number``).
        """
        means = speciation_frames.sum(axis=1) / (solute_number * frame_number)
        return means

    @staticmethod
    def _shell_query(shell_dict: dict[str, int]) -> str:
        """
        Translate shell_dict into a ``DataFrame.query`` expression.

        Column names are backtick-quoted so that solvent names which are not
        valid Python identifiers (e.g. ``PF6-``) can still be queried.
        """
        return " and ".join(
            f"`{name}` == {count}" for name, count in shell_dict.items()
        )

    def calculate_shell_fraction(self, shell_dict: dict[str, int]) -> float:
        """
        Calculate the fraction of shells matching shell_dict.

        This function computes the fraction of solvation shells that exist
        with a particular composition. The composition is specified by the
        shell_dict. The fraction will be of all shells that match that
        specification.

        Parameters
        ----------
        shell_dict : dict of {str: int}
            a specification for a shell composition. Keys are residue names
            (str) and values are the number of desired residues. e.g. if
            shell_dict = {'mol1': 4} then the function will return the
            fraction of shells that have 4 mol1. Note that this may include
            shells with 4 mol1 and any number of other solvents. To specify a
            shell with 4 mol1 and nothing else, enter a dict such as
            {'mol1': 4, 'mol2': 0, 'mol3': 0}.

        Returns
        -------
        float
            the fraction of shells

        Examples
        --------

        .. code-block:: python

            # first define Li, BN, and FEC AtomGroups
            >>> solute = Solute(Li, {'BN': BN, 'FEC': FEC, 'PF6': PF6})
            >>> solute.run()
            >>> solute.speciation.calculate_shell_fraction({'BN': 4, 'PF6': 1})
            0.0898

        """
        query = self._shell_query(shell_dict)
        query_counts = self.speciation_fraction.query(query)
        return query_counts[COUNT].sum()

    def get_shells(self, shell_dict: dict[str, int]) -> pd.DataFrame:
        """
        Find all solvation shells that match shell_dict.

        This returns the frame, solute index, and composition of all solutes
        that match the composition given in shell_dict.

        Parameters
        ----------
        shell_dict : dict of {str: int}
            a specification for a shell composition. Keys are residue names
            (str) and values are the number of desired residues. e.g. if
            shell_dict = {'mol1': 4} then the function will return all shells
            that have 4 mol1. Note that this may include shells with 4 mol1
            and any number of other solvents. To specify a shell with 4 mol1
            and nothing else, enter a dict such as
            {'mol1': 4, 'mol2': 0, 'mol3': 0}.

        Returns
        -------
        pandas.DataFrame
            the index and composition of all shells that match shell_dict
        """
        query = self._shell_query(shell_dict)
        return self.speciation_data.query(query)

    def _compute_solvent_co_occurrence(self) -> pd.DataFrame:
        """
        Calculate the co-occurrence of solvent molecules.

        Returns
        -------
        pandas.DataFrame
            The actual number of co-coordinating solvents divided by the
            expected number, with one column per reference solvent. An empty
            DataFrame is returned if speciation_data has no solvent columns.
        """
        expected_solvents_list = []
        actual_solvents_list = []
        # the overall share of each solvent among all coordinated solvents
        # does not depend on the reference solvent, so compute it once
        solvent_totals = self.speciation_data.sum()
        coordination_fraction = solvent_totals / solvent_totals.sum()
        for solvent in self.speciation_data.columns.values:
            # calculate number of available coordinating solvent slots
            shells_w_solvent = self.speciation_data.query(f"`{solvent}` > 0")
            n_shells = len(shells_w_solvent)
            n_solvents = shells_w_solvent.sum()
            # one slot per shell is taken by the reference solvent itself
            n_coordination_slots = n_solvents.sum() - n_shells
            # calculate expected number of coordinating solvents
            expected_solvents = coordination_fraction * n_coordination_slots
            # calculate actual number of coordinating solvents
            actual_solvents = n_solvents.copy()
            actual_solvents[solvent] = actual_solvents[solvent] - n_shells
            # name series and append to list
            expected_solvents.name = solvent
            actual_solvents.name = solvent
            expected_solvents_list.append(expected_solvents)
            actual_solvents_list.append(actual_solvents)
        if not actual_solvents_list:
            # we return this if nothing is solvated
            return pd.DataFrame()
        # make DataFrames
        actual_df = pd.concat(actual_solvents_list, axis=1)
        expected_df = pd.concat(expected_solvents_list, axis=1)
        # calculate correlation matrix
        correlation = actual_df / expected_df
        return correlation

    @property
    def speciation_data(self) -> pd.DataFrame:
        """
        A dataframe containing the speciation of every solute at
        every trajectory frame. Indexed by timestep and solute numbers.
        Columns are the solvent molecules and values are the number of
        solvent in the shell.
        """
        return self._speciation_df

    @property
    def speciation_fraction(self) -> pd.DataFrame:
        """
        The fraction of shells of each type. Columns are the solvent
        molecules and values are the number of solvent in the shell.
        The final column is the fraction of total shell of that
        particular composition.
        """
        return self._speciation_fraction

    @property
    def solvent_co_occurrence(self) -> pd.DataFrame:
        """
        The actual co-occurrence of solvents divided by the expected
        co-occurrence. In other words, given one molecule of solvent i in the
        shell, what is the probability of finding a solvent j relative to
        choosing a solvent at random from the pool of all coordinated
        solvents. This matrix will likely not be symmetric.
        """
        return self._solvent_co_occurrence