Source code for solvation_analysis.networking

"""
================
Networking
================
:Author: Orion Cohen, Tingzheng Hou, Kara Fong
:Year: 2021
:Copyright: GNU Public License v3

Study the topology and structure of solute-solvent networks.

Networking yields a complete description of coordinated solute-solvent networks
in the solute, regardless of identify. This could include cation-anion networks
or hydrogen bond networks.

While ``networking`` can be used in isolation, it is meant to be used
as an attribute of the Solute class. This makes instantiating it and calculating the
solvation data a non-issue.
"""

from typing import Union

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

import solvation_analysis
from solvation_analysis._utils import calculate_adjacency_dataframe
from solvation_analysis._column_names import (
    FRAME,
    SOLVENT,
    SOLVENT_IX,
    SOLUTE_IX,
    NETWORK,
    PAIRED,
    NETWORKED,
    ISOLATED,
)


[docs] class Networking: """ Calculate the number and size of solute-solvent networks. A network is defined as a bipartite graph of solutes and solvents, where edges are defined by coordination in the solvation_data DataFrame. A single solvent or multiple solvents can be selected, but coordination between solvents will not be included, only coordination between solutes and solvents. Networking uses the solvation_data to construct an adjacency matrix and then extracts the connected subgraphs within it. These connected subgraphs are stored in a DataFrame in Networking.network_df. Several other representations of the networking data are included in the attributes. Parameters ---------- solvents : str or list[str] the solvents to include in the solute-solvent network. solvation_data : pandas.DataFrame a dataframe of solvation data with columns "frame", "solute_atom", "solvent_atom", "distance", "solvent_name", and "solvent". solute_res_ix : np.ndarray the residue indices of the solutes in solvation_data res_name_map : pd.Series a mapping between residue indices and the solute & solvent names in a Solute. Examples -------- .. code-block:: python # first define Li, BN, and FEC AtomGroups >>> solute = Solute.from_atoms(Li, {'BN': BN, 'FEC': FEC, 'PF6': PF6}) >>> networking = Networking.from_solute(solute, 'PF6') """
[docs] def __init__( self, solvents: Union[str, list[str]], solvation_data: pd.DataFrame, solute_res_ix: np.ndarray, res_name_map: pd.Series, ) -> None: self.solvents = solvents self.solvation_data = solvation_data # TODO: we need all analysis classes to run when there is no solvation_data # solvent_present = np.isin(self.solvents, self.solvation_data[SOLVENT].unique()) # if not solvent_present.all(): # raise Exception(f"Solvent(s) {np.array(self.solvents)[~solvent_present]} not found in solvation data.") self.solute_res_ix = solute_res_ix self.res_name_map = res_name_map self.n_solute = len(solute_res_ix) self._network_df = self._generate_networks() self._network_sizes = self._calculate_network_sizes() self._solute_status, self._solute_status_by_frame = ( self._calculate_solute_status() ) self._solute_status = self._solute_status.to_dict()
[docs] @staticmethod def from_solute( solute: "solvation_analysis.Solute", solvents: Union[str, list[str]] ) -> "Networking": """ Generate a Networking object from a solute and solvent names. Parameters ---------- solute : Solute solvents : str or list of str the strings should be the name of solvents in the Solute. The strings must match exactly for Networking to work properly. The selected solvents will be used to construct the networking graph that is described in documentation for the Networking class. Returns ------- Networking """ return Networking( solvents, solute.solvation_data, solute.solute_res_ix, solute.res_name_map, )
@staticmethod def _unwrap_adjacency_dataframe(df: pd.DataFrame) -> csr_matrix: # this class will transform the biadjacency matrix into a proper adjacency matrix connections = df.reset_index(FRAME).drop(columns=FRAME) idx = connections.columns.append(connections.index) directed = connections.reindex(index=idx, columns=idx, fill_value=0) undirected = directed.values + directed.values.T adjacency_matrix = csr_matrix(undirected) return adjacency_matrix def _generate_networks(self) -> pd.DataFrame: """ This function generates a dataframe containing all the solute-solvent networks in every frame of the simulation. The rough approach is as follows: 1. transform the solvation_data DataFrame into an adjacency matrix 2. determine the connected subgraphs in the adjacency matrix 3. tabulate the solvent involved in each network and store in a DataFrame """ solvents = [self.solvents] if isinstance(self.solvents, str) else self.solvents solvation_subset = self.solvation_data[ np.isin(self.solvation_data[SOLVENT], solvents) ] # create adjacency matrix from solvation_subset graph = calculate_adjacency_dataframe(solvation_subset) network_arrays = [] # loop through each time step / frame for frame, df in graph.groupby(FRAME): # drop empty columns df = df.loc[:, (df != 0).any(axis=0)] # save map from local index to residue index solute_map = df.index.get_level_values(SOLUTE_IX).values solvent_map = df.columns.values ix_to_res_ix = np.concatenate([solvent_map, solute_map]) adjacency_df = Networking._unwrap_adjacency_dataframe(df) _, network = connected_components( csgraph=adjacency_df, directed=False, return_labels=True ) network_array = np.vstack( [ np.full(len(network), frame), # frame network, # network self.res_name_map[ix_to_res_ix], # res_names ix_to_res_ix, # res index ] ).T network_arrays.append(network_array) # create and return network dataframe if len(network_arrays) == 0: all_clusters = [] else: all_clusters = np.concatenate(network_arrays) cluster_df = ( pd.DataFrame(all_clusters, columns=[FRAME, NETWORK, SOLVENT, SOLVENT_IX]) .set_index([FRAME, NETWORK]) .sort_values([FRAME, NETWORK]) ) return cluster_df def _calculate_network_sizes(self) -> pd.DataFrame: # This utility calculates the network sizes and returns a convenient dataframe. cluster_df = self.network_df cluster_sizes = cluster_df.groupby([FRAME, NETWORK]).count() size_counts = ( cluster_sizes.groupby([FRAME, SOLVENT]).count().unstack(fill_value=0) ) size_counts.columns = size_counts.columns.droplevel( None ) # the column value is None return size_counts def _calculate_solute_status(self) -> tuple[pd.Series, pd.DataFrame]: """ This utility calculates the fraction of each solute with a given "status". Namely, whether the solvent is "isolated", "paired" (with a single solvent), or "networked" of > 2 species. """ # an empty df with the right index status = self.network_sizes.iloc[:, 0:0] status[PAIRED] = self.network_sizes.iloc[:, 0:1].sum(axis=1).astype(int) status[NETWORKED] = self.network_sizes.iloc[:, 1:].sum(axis=1).astype(int) status[ISOLATED] = self.n_solute - status.loc[:, [PAIRED, NETWORKED]].sum( axis=1 ) status = status.loc[:, [ISOLATED, PAIRED, NETWORKED]] solute_status_by_frame = status / self.n_solute solute_status = solute_status_by_frame.mean() return solute_status, solute_status_by_frame
[docs] def get_network_res_ix(self, network_index: int, frame: int) -> np.ndarray: """ Return the indexes of all residues in a selected network. The network_index and frame must be provided to fully specify the network. Once the indexes are returned, they can be used to select an AtomGroup with the species of interest, see Examples. Parameters ---------- network_index : int The index of the network of interest frame : int the frame in the trajectory to perform selection at. Defaults to the current trajectory frame. Returns ------- res_ix : np.ndarray Examples -------- .. code-block:: python # first define Li, BN, and FEC AtomGroups >>> solute = Solute(Li, {'BN': BN, 'FEC': FEC, 'PF6': PF6}) >>> networking = Networking.from_solute(solute, 'PF6') >>> res_ix = networking.get_network_res_ix(1, 5) >>> solute.u.residues[res_ix].atoms <AtomGroup with 126 Atoms> """ res_ix = self.network_df.loc[ pd.IndexSlice[frame, network_index], SOLVENT_IX ].values return res_ix.astype(int)
@property def network_df(self) -> pd.DataFrame: """ The dataframe containing all networking data. the indices are the frame and network index, respectively. the columns are the solvent_name and res_ix. """ return self._network_df @property def network_sizes(self) -> pd.DataFrame: """ A dataframe of network sizes. the index is the frame. the column headers are network sizes, or the number of solutes + solvents in the network, so the columns might be [2, 3, 4, ...]. the values in each column are the number of networks with that size in each frame. """ return self._network_sizes @property def solute_status(self) -> dict[str, float]: """ A dictionary where the keys are the "status" of the solute and the values are the fraction of solute with that status, averaged over all frames. "isolated" means that the solute not coordinated with any of the networking solvents, network size is 1. "paired" means the solute and is coordinated with a single networking solvent and that solvent is not coordinated to any other solutes, network size is 2. "networked" means that the solute is coordinated to more than one solvent or its solvent is coordinated to more than one solute, network size >= 3. """ return self._solute_status @property def solute_status_by_frame(self) -> pd.DataFrame: """ As described above, except organized into a dataframe where each row is a unique frame and the columns are "isolated", "paired", and "networked". """ return self._solute_status_by_frame