Source code for pylipid.func.binding_site

##############################################################################
# PyLipID: A python module for analysing protein-lipid interactions
#
# Author: Wanling Song
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
##############################################################################

"""This module contains functions for calculating binding sites"""

import os
from collections import defaultdict
from itertools import product
import community
import networkx as nx
import numpy as np
import pandas as pd
import mdtraj as md
from statsmodels.nonparametric.kernel_density import KDEMultivariate as kde
from sklearn.decomposition import PCA
from ..util import check_dir, rmsd, get_traj_info
from .clusterer import cluster_DBSCAN, cluster_KMeans


__all__ = ["get_node_list", "collect_bound_poses", "vectorize_poses", "calculate_scores", "write_bound_poses",
           "calculate_surface_area_wrapper", "analyze_pose_wrapper"]


[docs]def get_node_list(corrcoef, threshold=4):
    r"""Calculate community structures from interaction network.

    The interaction network is built using the correlation coeffient matrix, in which the edges are the Pearson correlation
    of the two connecting nodes. The community structures of this network is calculated using the Louvain algorithm [1]_,
    which find high modularity network partitions. The modularity is defined as [2]_:

    .. math::
            Q=\frac{1}{2 m} \sum_{i, j}\left[A_{i j}-\frac{k_{i} k_{j}}{2 m}\right] \delta\left(c_{i}, c_{j}\right)

    where :math:`A_{i j}` is the weight of the edge between node i and node j; :math:`k_{i}` is the sum of weights
    of the nodes attached to the node i, i.e. the degree of the node; :math:`c_{i}` is the community to which node i
    assigned; :math:`\delta\left(c_{i}, c_{j}\right)` is 1 if i=j and 0 otherwise; and
    :math:`m=\frac{1}{2} \sum_{i j} A_{i j}` is the number of edges. In the modularity optimization, the Louvain
    algorithm orders the nodes in the network, and then, one by one, removes and inserts each node in a different
    community c_i until no significant increase in modularity. After modularity optimization, all the nodes that
    belong to the same community are merged into a single node, of which the edge weights are the sum of the weights
    of the comprising nodes. This optimization-aggregation loop is iterated until all nodes are collapsed into one.

    By default, this method returns communities containing at least 4 nodes. This setting can be changed by using
    the parameter ``threshold``.

    Parameters
    -----------
    corrcoef : ndarray(n, n)
        The Pearson correlation matrix.

    threshold : int, default=4
        Size of communities. Only communities with more nodes than the threshold will be returned.

    Returns
    --------
    node_list : list of lists
        A list of community nodes.

    modularity : float or None
        The modularity of network partition. It measure the quality of network partition. The value is between 1 and
        -1. The bigger the modularity, the better the partition.

    References
    ----------
    .. [1] Blondel, V. D.; Guillaume, J.-L.; Lambiotte, R.; Lefebvre, E., Fast unfolding of communities in large
           networks. Journal of Statistical Mechanics: Theory and Experiment 2008, 2008 (10), P10008

    .. [2] Newman, M. E. J., Analysis of weighted networks. Physical Review E 2004, 70 (5), 056131.

    """
    # TODO: check negative values in corrcoef_matrix. Come up with better solutions.
    corrcoef[corrcoef < 0.0] = 0.0 # network edge can't take negative values. Residues with
                                   # negative correlationsare are forced to separate to different binding sites.
    graph = nx.Graph(corrcoef)
    partition = community.best_partition(graph, weight='weight')
    values = [partition.get(node) for node in graph.nodes()]
    node_list = []
    for value in range(max(values)):
        nodes = [k for k, v in partition.items() if v == value]
        if len(nodes) >= threshold:
            node_list.append(nodes)
    if len(node_list) > 0:
        modularity = community.modularity(partition, graph)
    else:
        modularity = None
    return node_list, modularity


[docs]def collect_bound_poses(binding_site_map, contact_residue_index, trajfile_list, topfile_list,
                        lipid, protein_ref, lipid_ref, stride=1, nprot=1):
    """Collected the bound poses from trajectories.

    The bound poses of binding sites specified in the ``binding_site_map`` are collected from trajectories based on the
    contacting lipid residue index that is provided by ``contact_residue_index``.

    In implementation, the ``binding_site_map provides`` a dictionary that informs of what residues constitues the selected
    binding sites. The ``contact_residue_index`` is the lipid index generated by
    :meth:`pylipid.api.LipidInteraction.collect_residue_contacts`, which is a python dictionary that records,
    for each residue, the contacting lipid indices at each trajectory frame. The contacting lipids at each trajectory frame
    are calculated by merging, with duplicates removed, the contacting lipid molecules of the binding site residues. The
    coordinates of the contacting lipid molecules, together with the protein coordinates that the lipid poses bind are taken
    from the trajectories and made into a MDtraj.Trajectory object, which contains coordinates and topology information of
    the protein-lipid complex and the unitcell information of the simulation box at the time the poses were taken.

    ``protein_ref`` and ``lipid_ref`` are the MDtraj topology object (mdtraj.Topology) of a copy of the  protein molecule
    and a copy of the lipid molecule respectively. These two are combined to generate the topology object for the
    protein-lipid complex which is required for mdtraj.Trajectory object of the bound poses.

    The simulation trajectories are read in this function to obtain the lipid and receptor coordinates. Therefore, the
    order of the ``trajfile_list`` and ``topfile_list`` and the settings of ``stride`` and ``nprot`` should keep consistent
    with the settings that generated the contacting lipid index ``contact_residue_index``.

    Parameters
    ----------
    binding_site_map : dict
        A python dictionary with the selected binding site IDs as its keys and the binding site residue indices as the
        corresponding values. {binding site ID: a list of binding site residue indices}

    contact_residue_index : dict
        A python dictionary that records contacting lipid molecule indices for residues. It should has residue indices
        as its keys and a list of contacting lipid molecule residue indices for each of the trajectory frame as their
        corresponding values.

    trajfile_list : list of str
        A list of trajectory files.

    topfile_list : list of str
        A list of topology files.

    lipid : str
        Residue name of the lipid to check.

    protein_ref : mdtraj.Topology or mdtraj.Trajectory
        A MDtraj object of a copy of the protein molecule that contains topology information of the protein.

    lipid_ref : mdtraj.Trajectory
        A MDtraj object of a copy of the lipid molecule that contains topology information of the lipid molecule.

    stride : int,default=1
        Analyze every stride-th frame of the trajectories. Should keep consistent with the settings that generated the
        contacting lipid index ``contact_residue_index``.

    nprot : int, default=1
        Number of copies of proteins in the simulation systems. Should keep consistent with the settings that generated
        the contacting lipid index ``contact_residue_index``.

    Returns
    ---------
    pose_traj : dict
        A python dictionary that stores bound pose coordinates. This python dictionary has the selected binding site IDs
        as its keys and each corresponding value contains a mdtraj.Trajectory object that contains the bound lipid coordinates
        with the receptor coordinates the lipid bind to.

    pose_info : dict
        A python dictionary that stores the corresponding information to the lipid poses in ``pose_traj``. The pose
        information is a tuple of four figures, which correspond to traj_idx, protein_idx, lipid_residue_index,
        frame_id*traj.timestep.

    See also
    --------
    pylipid.func.cal_contact_residues
        The function that calculates the contact residues from distance matrix
    pylipid.api.LipidInteraction.collect_residue_contacts
        The function that creates contact lipid index.
    pylipid.api.LipidInteraction.analyze_bound_poses
        The function that analyzes lipid bound poses.

    """
    joined_top = protein_ref.top.join(lipid_ref.top)
    trajfile_list = np.atleast_1d(trajfile_list)
    topfile_list = np.atleast_1d(topfile_list)
    stride = int(stride)
    nprot = int(nprot)
    pose_pool = defaultdict(list)
    pose_info = defaultdict(list)
    for traj_idx in np.arange(len(trajfile_list)):
        traj = md.load(trajfile_list[traj_idx], top=topfile_list[traj_idx], stride=stride)
        protein_indices_all = traj.top.select("protein")
        natoms_per_protein = int(len(protein_indices_all) / nprot)
        # obtain lipid residue index
        lipid_residue_index_set = set()
        for atom_idx in traj.top.select("resn {}".format(lipid)):
            lipid_residue_index_set.add(traj.top.atom(atom_idx).residue.index)
        lipid_residue_index_set = list(lipid_residue_index_set)
        lipid_residue_index_set.sort()
        # store collect poses
        for protein_idx in np.arange(nprot):
            protein_indices = protein_indices_all[
                              protein_idx * natoms_per_protein:(protein_idx + 1) * natoms_per_protein]
            for bs_id, nodes in binding_site_map.items():
                list_to_take = traj_idx * nprot + protein_idx
                contact_BS = [np.unique(np.concatenate(
                    [contact_residue_index[node][list_to_take][frame_idx] for node in nodes]))
                    for frame_idx in range(traj.n_frames)]
                for frame_id in range(len(contact_BS)):
                    if len(contact_BS[frame_id]) > 0:
                        for lipid_id in contact_BS[frame_id]:
                            lipid_residue_index = lipid_residue_index_set[int(lipid_id)]
                            lipid_atom_indices = np.sort(
                                [atom.index for atom in traj.top.residue(lipid_residue_index).atoms])
                            pose_pool[bs_id].append(
                                [np.copy(traj.xyz[frame_id, np.hstack([protein_indices, lipid_atom_indices])]),
                                 np.copy(traj.unitcell_angles[frame_id]), np.copy(traj.unitcell_lengths[frame_id])])
                            pose_info[bs_id].append((traj_idx, protein_idx, lipid_residue_index, frame_id*traj.timestep))
    pose_traj = {}
    for bs_id in pose_pool.keys():
        pose_traj[bs_id] = md.Trajectory([frame[0] for frame in pose_pool[bs_id]], joined_top,
                                          time=None,
                                          unitcell_angles=[frame[1] for frame in pose_pool[bs_id]],
                                          unitcell_lengths=[frame[2] for frame in pose_pool[bs_id]])

    return pose_traj, pose_info


[docs]def vectorize_poses(bound_poses, binding_nodes, protein_atom_indices, lipid_atom_indices):
    """Convert the bound poses to distance vectors.

    This function takes the protein-lipid complex structure, and translate the bound lipid poses to distance vectors,
    which contains the minimum distances of each of the lipid molecule atom to the binding site residues. It uses
    the function `mdtraj.compute_distances <https://mdtraj.org/1.9.4/api/generated/mdtraj.compute_distances.html>`_
    to calculate atom-wise distances.

    Parameters
    ----------
    bound_poses : mdtraj.Trajectory
        mdtraj.Trajectory object that contain protein-lipid complex coordinates.

    binding_nodes : list
        Binding site residue indices.

    protein_atom_indices : list
        A list of residue atom indices in the protein-lipid complex structure, i.e. [[0,1,2],[3,4,5],[6,7]] means the
        first residue contains atom 0,1,2, and the second residue contains atom 3,4,5.

    lipid_atom_indcies : list
        The atom indices of the lipid molecule in the protein-lipid complex structure.

    Returns
    -------
    dist_matrix : numpy.ndarray
        The distance matrix of the bound poses, in the shape of [n_lipid_atoms, n_poses, n_binding_site_residues]

    """
    # calculate distance to binding site residues for each lipid atom
    dist_per_atom = np.array(
        [np.array([md.compute_distances(bound_poses,
                                        list(product([lipid_atom_indices[idx]], protein_atom_indices[node])),
                                        periodic=True).min(axis=1) for node in binding_nodes]).T
         for idx in np.arange(len(lipid_atom_indices))])  # shape: [n_lipid_atoms, n_poses, n_BS_residues]
    return dist_per_atom


[docs]def calculate_scores(dist_matrix, kde_bw=0.15, pca_component=0.90, score_weights=None):
    r"""Calculate scores based on probability density.

    This function first lower the dimension of dist_matrix by using a
    `PCA <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_. Then the distribution of
    the distance vectors for each atom is estimated using
    `KDEMultivariate <https://www.statsmodels.org/devel/generated/statsmodels.nonparametric.kernel_density.KDEMultivariate.html>`_.

    The score of a lipid pose is calculated based on the probability density function of the atom positions in the binding
    site and weights given to the atoms:

    .. math::
        \text { score }=\sum_{i} W_{i} \cdot \hat{f}_{i, H}(D)

    where :math:`W_{i}` is the weight given to atom i of the lipid molecule, H is the bandwidth and
    :math:`\hat{f}_{i, H}(D)` is a multivariate kernel density etimation of the position of atom i in the specified
    binding site. :math:`\hat{f}_{i, H}(D)` is calculated from all the bound lipid poses in that binding site.

    Parameters
    ----------
    dist_matrix : numpy.ndarray, shape=(n_lipid_atoms, n_poses, n_binding_site_residues)
        The distance vectors describing the position of bound poses in the binding site. This dist_matrix can be
        generated by :meth:`~vectorize_poses`.

    kde_bw : scalar, default=0.15
        The bandwidth for kernel density estimation. Used by
        `KDEMultivariate <https://www.statsmodels.org/devel/generated/statsmodels.nonparametric.kernel_density.KDEMultivariate.html>`_.
        By default, the bandwidth is set to 0.15nm which roughly corresponds to the vdw radius of MARTINI 2 beads.

    pca_component : scalar, default=0.9
        The number of components to keep. if ``0 < pca_component<1``, select the number of components such that the
        amount of variance that needs to be explained is greater than the percentage specified by n_components. It is used
        by `PCA <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_.

    score_weights : None or dict
        A dictionary that contains the weight for n_lipid_atoms, {idx_atom: weight}

    Returns
    ---------
    scores : numpy.ndarray, shape=(n_samples,)
        Scores for bound poses.

    See also
    --------
    pylipid.func.collect_bound_poses
        Collect bound poses from trajectories.
    pylipid.func.vectorize_poses
        Convert bound poses to distance vectors.

    """
    weights = {atom_idx: 1 for atom_idx in np.arange(np.shape(dist_matrix)[0])}
    if score_weights is not None:
        weights.update(score_weights)

    kde_funcs = {}
    try:
        for atom_idx in np.arange(np.shape(dist_matrix)[0]):
            transformed_data = PCA(n_components=pca_component).fit_transform(dist_matrix[atom_idx])
            var_type = ""
            bw = []
            for dummy in range(len(transformed_data[0])):
                var_type += "c"
                bw.append(kde_bw)
            kde_funcs[atom_idx] = kde(data=transformed_data, var_type=var_type, bw=bw)
        # evaluate binding poses
        scores = np.sum([weights[atom_idx] * kde_funcs[atom_idx].pdf()
                         for atom_idx in np.arange(np.shape(dist_matrix)[0])], axis=0)
        return scores

    except ValueError:
        print("Pose generation error -- possibly due to insufficient number of binding event.")


[docs]def write_bound_poses(pose_traj, pose_indices, save_dir, pose_prefix="BoundPose", pose_format="pdb"):
    """Write selected bound poses to disc.

    The ``pose_traj`` is a mdtraj.Trajectory object. The selected bound poses, i.e. selected frames of this mdtraj.Trajectory
    object, are written to disc by mdtraj.Trajectory.save() function.

    Parameters
    ----------
    pose_traj : mdtraj.Trajectory
        Bound poses coordinates object.

    pose_indices : array_like
        The indices of the poses, i.e. the indices of the frames, to be written.

    save_dir : str
        The directory to save the written coordinate files.

    pose_prefix : str, optional, default=BoundPose"
        The prefix of the coordinate files.

    pose_format : str, optional, default="gro"
        The format of the coordinate files. Support formats included by mdtraj.Trajectory.save().

    """
    for idx, pose_id in enumerate(pose_indices):
        pose_traj[pose_id].save(os.path.join(save_dir, "{}{}.{}".format(pose_prefix, idx, pose_format)))
    return


[docs]def analyze_pose_wrapper(bs_id, bound_poses, binding_nodes, pose_info, pose_dir=None,
                         n_top_poses=3, protein_atom_indices=None, lipid_atom_indices=None, atom_weights=None,
                         kde_bw=0.15, pca_component=0.90, pose_format="gro", n_clusters="auto",
                         eps=None, min_samples=None, metric="euclidean", trajfile_list=None):
    r"""A wrapper function that ranks poses, clusters poses and calculates pose RMSD.

    A wrapper function is to assist the calculation using `p_tqdm <https://github.com/swansonk14/p_tqdm>`_, which is
    a multiprocessing library incorporated with progress bars. So some of the setting or parameters are used to
    assist the use of multiprocessing.

    This function uses :meth:`~vectorize_poses` to convert bound poses into distance vectors, which measure the relative
    position of lipid atoms in their binding site. Then it uses :meth:`~calculate_scores` to calculate the density distributions
    of lipid atoms and score the bound poses based on the probability density function of the position of lipid atoms. Based on
    the scores, a couple of top-ranked poses (determined by ``n_top_poses``) are written out in a coordinate format
    determined by ``pose_format``. The top-ranked poses for a binding site are saved in the directory of BSidP{bs_id}_rank.

    This function also clusters the bound poses. The bound poses are clustered using their distance vectoris, i.e. their
    relative positions in the binding site. If ``n_clusters`` is set to `auto`, this function will cluster the bound poses
    using :meth:`pylipid.func.cluster_DBSCAN`. DBSCAN finds clusters of core samples of high density. If ``n_clusters``
    is given an integer larger than 0, this function will cluster the lipid bound poses using :meth:`pylipid.func.cluster_KMeans`.
    The KMeans cluster separates the samples into `n` clusters of equal variances, via minimizing the `inertia`.

    For writing out the cluster poses, this function will randomly select one pose from each cluster in the case of
    using KMeans or one from the core samples of each cluster when DBSCAN is used, and writes the selected protein-lipid
    coordinates in the provided pose format ``pose_format``. The clustered poses are saved in the directory BSid{bs_id}_clusters.

    The root mean square deviation (RMSD) of a lipid bound pose in a binding site is calculated from the relative
    position of the pose in the binding site compared to the average position of the bound poses. Thus, the pose
    RMSD is defined as:

    .. math::
        RMSD=\sqrt{\frac{\sum_{i=0}^{N} \sum_{j=0}^{M}\left(D_{i j}-\bar{D}_{j}\right)^{2}}{N}}

    where :math:`D_{i j}` is the distance of atom `i` to the residue `j` in the binding site; :math:`\bar{D}_{j}` is the
    average distance of atom `i` from all bound poses in the binding site to residue `j`; `N` is the number of atoms in
    the lipid molecule and `M` is the number of residues in the binding site.

    Parameters
    ----------
    bs_id : int
        Binding Site ID. Used in creating directory for stoing poses.

    bound_poses : mdtraj.Trajectory
        A mdtraj.Trajectory object that contains protein-lipid complex coordinates.

    binding_nodes : list
        A list of residue indices of binding site residues.

    pose_info : list
        A list of tuples of information of the bound poses in ``bound_poses``. A info tuple contains four figures,
        which correspond to traj_idx, protein_idx, lipid_residue_index, frame_id*traj.timestep. These pose_info tuples
        and the bound poses are generated by :meth:`~collect_bound_poses`.

    pose_dir : str or None
        The directory to save representative bound poses and clustered poses. If None, save at the current working directory.

    n_top_poses : int, default=3
        Number of representative bound poses selected to write to disc.

    protein_atom_indices : list
        A list of residue atom indices in the protein-lipid complex structure, i.e. [[0,1,2],[3,4,5],[6,7]] means the
        first residue contains atom 0,1,2, and the second residue contains atom 3,4,5. Used when vectorizing the bound poses
        by :meth:`~vectorize_poses`.

    lipid_atom_indcies : list
        The atom indices of the lipid molecule in the protein-lipid complex structure. Used when vectorizing the bound poses
        by :meth:`~vectorize_poses`.

    atom_weights : None or dict
        A dictionary that contains the weight for n_lipid_atoms, {idx_atom: weight}. Used when scoring the bound poses by
        :meth:`~calculate_scores`.

    kde_bw : calar, default=0.15
        The bandwidth for kernel density estimation. Used when estimating the kernel density of lipid atom positions by
        :meth:`~calculate_scores`. By default, the bandwidth is set to 0.15nm which roughly corresponds  to the vdw
        radius of MARTINI 2 beads.

    pca_component : scalar, default=0.9
        The number of components to keep. if ``0 < pca_component<1``, select the number of components such that the
        amount of variance that needs to be explained is greater than the percentage specified by n_components. It is used
        to decrease the dimentions of distance vectors when estimating the kernel density, by :meth:`~calculate_scores`.

    pose_format : str, optional, default="gro"
        The format of the coordinate files. Support formats included by mdtraj.Trajectory.save(). Used by
        :meth:`~write_bound_poses`.

    n_clusters : int or "auto", default="auto"
        Number of clusters to find in the bound poses. When set to `auto`, :meth:`pylipid.func.cluster_DBSCAN` will be
        used for clustering. When a integer is given, :meth:`pylipid.func.cluster_KMeans` will be used.

    eps : float or None, default=None
        The maximum distance between two samples for one to be considered as in the neighborhood of the other. Used by
        :meth:`pylipid.func.cluster_DBSCAN`.

    min_samples : int or None, default=None
        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. Used by
        :meth:`pylipid.func.cluster_DBSCAN`.

    metric : string, or callable, default=’euclidean’
        The metric to use when calculating distance between instances in a feature array. Used by :meth:`pylipid.func.cluster_DBSCAN`.

    trajfile_list : str or None, default=None
        Used to write pose information, i.e. what trajectory, which timestep and which lipid molecule the written pose
        was taken from.

    Returns
    -------
    pose_rmsds : list
        A list of bound pose RMSDs.

    See Also
    --------
    pylipid.func.vectorize_poses
        Convert the bound poses to distance vectors.
    pylipid.func.calculate_scores
        Calculate scores based on probability density.
    pylipid.func.cluster_DBSCAN
        Cluster data using DBSCAN.
    pylipid.func.cluster_KMeans
        Cluster data using KMeans.

    """
    if len(bound_poses) == 0:
        print(f"No Bound pose collected from Binding Site {bs_id}! Possibly due to insufficient sampling.")
        return []
    ## rank poses ##
    pose_dir_rank = check_dir(pose_dir, "BSid{}_rank".format(bs_id), print_info=False)
    # lipid_dist_per_atom shape: [n_lipid_atoms, n_bound_poses, n_BS_residues]
    lipid_dist_per_atom = vectorize_poses(bound_poses, binding_nodes,
                                          protein_atom_indices, lipid_atom_indices)
    scores = calculate_scores(lipid_dist_per_atom, kde_bw=kde_bw, pca_component=pca_component,
                              score_weights=atom_weights)
    num_of_poses = min(n_top_poses, bound_poses.n_frames)
    selected_pose_indices = np.argsort(scores)[::-1][:num_of_poses]
    if len(selected_pose_indices) > 0:
        write_bound_poses(bound_poses, selected_pose_indices, pose_dir_rank, pose_prefix="BSid{}_top".format(bs_id),
                          pose_format=pose_format)
        _write_pose_info([pose_info[int(pose_idx)] for pose_idx in selected_pose_indices],
                         f"{pose_dir_rank}/pose_info.txt", trajfile_list)
    ## cluster poses ##
    lipid_dist_per_pose = np.array([lipid_dist_per_atom[:, pose_id, :].ravel()
                                    for pose_id in np.arange(lipid_dist_per_atom.shape[1])])
    pose_dir_clustered = check_dir(pose_dir, "BSid{}_clusters".format(bs_id), print_info=False)
    transformed_data = PCA(n_components=pca_component).fit_transform(lipid_dist_per_pose)
    if n_clusters == 'auto':
        _, core_sample_indices = cluster_DBSCAN(transformed_data, eps=eps, min_samples=min_samples,
                                                metric=metric)
        selected_pose_indices = [np.random.choice(i_core_sample, 1)[0] for i_core_sample in core_sample_indices]
    elif n_clusters > 0:
        cluster_labels = cluster_KMeans(transformed_data, n_clusters=n_clusters)
        cluster_id_set = np.unique(cluster_labels)
        selected_pose_indices = [np.random.choice(np.where(cluster_labels == cluster_id)[0], 1)[0]
                            for cluster_id in cluster_id_set]
    if len(selected_pose_indices) > 0:
        write_bound_poses(bound_poses, selected_pose_indices, pose_dir_clustered,
                          pose_prefix="BSid{}_cluster".format(bs_id), pose_format=pose_format)
        _write_pose_info([pose_info[int(pose_idx)] for pose_idx in selected_pose_indices],
                              f"{pose_dir_clustered}/pose_info.txt", trajfile_list)
    ## calculate RMSD ##
    dist_mean = np.mean(lipid_dist_per_pose, axis=0)
    pose_rmsds = [rmsd(lipid_dist_per_pose[pose_id], dist_mean)
                  for pose_id in np.arange(len(lipid_dist_per_pose))]
    return pose_rmsds


[docs]def calculate_surface_area_wrapper(trajfile, topfile, traj_idx, binding_site_map={}, nprot=1, timeunit="us",
                                   stride=1, dt_traj=None, radii=None):
    """A wrapper function for calculating surface area.

    A wrapper function is to assist the calculation using `p_tqdm <https://github.com/swansonk14/p_tqdm>`_, which is
    a multiprocessing library incorporated with progress bars. So some of the setting or parameters are used to
    assist the use of multiprocessing.

    In this function, the provided trajectory is read by mdtraj.read() and the solvent accessible surface of each residue
    is calculated by `mdtraj.shrake_rupley <https://mdtraj.org/1.9.4/api/generated/mdtraj.shrake_rupley.html>`_. The
    binding site surface area is calculated as the sum of surface areas of its comprising residues. The binding sites are
    defined by ``binding_site_map`` which is a python dictionary object that includes binding site IDs as its keys and
    the residue indices of the binding site residues as the corresponding values.

    The calculation of surface area requires a definition of the protein atom radius. MDtraj defines the radii for common
    atoms (see `here <https://github.com/mdtraj/mdtraj/blob/master/mdtraj/geometry/sasa.py#L56>`_). The radius of the BB
    bead in MARTINI2 is defined as 0.26 nm, the SC1/SC2/SC3 are defined as 0.23 nm in this function. Use the param
    ``radii`` to define or change of definition of atom radius.

    The returned data ``surface_area`` is a list of pandas.DataFrame objects, and each of these pandas.DataFrame object
    contains surface area data for one copy of the proteins in the simulation system. That is, if the system has N copies
    of the receptor, ``surface_area`` will have N pandas.DataFrame objects. Each pandas.DataFrame contains the surface
    areas as a function of time for the selected binding site are shown by column with the column name of
    "Binding Site {idx}". This pandas.DataFrame object also has a "Time" column which records the timestep at which the
    surface areas are measured. The other returned data ``data_keys`` contains a list of tuples of (traj_idx, protein_idx),
    which corresponds to the list of pandas.DataFrame objects of ``surface_area``. The reason for having this arrangement
    is to assist the generation of a big pandas.DataFrame that includes surface area data from all trajectories in
    :meth:`pylipid.api.LipidInteraction.compute_surface_area` when used in combination with ``p_tqdm``.

    Parameters
    ----------
    trajfile : str
        Trajectory filename.

    topfile : str
        Topology filename.

    traj_idx : idx
        The index of the given trajectory in the original trajectory list.

    binding_site_map : dict
        A python dictionary to defines the selected binding site IDs and the residue indices of the binding site residues

    nprot : int, default=1
        Number of copies of the receptor in the simulation system.

    timeunit : {"us", "ns"}
        The time unit used for reporting the timestpes.

    stride : int, default=1
        Analyze every stride-th frame from the trajectory.

    dt_traj : int or None
        Timestep of trajectories. It is required when trajectories do not have timestep information. Not needed for
        trajectory formats of e.g. xtc, trr etc. If None, timestep information will take from trajectories.

    radii : dict or None
        Protein atom radii {atom name: radius}. The radius is reported in nanometer.

    Returns
    -------
    surface_area : list
        A list of {nprot} pandas.DataFrame objects. Each pandas.DataFrame object records the surface areas as a function
        of time for the selected binding sites on one copy of the receptor in the system.

    data_keys : list
        A list of tuples (traj_idx, protein_idx) which are the traj index and protein index information of the
        pandas.DataFrame objects in ``surface_area``.

    See Also
    --------
    pylipid.api.LipidInteraction.compute_surface_area
        Calculate binding site surface area from a list of trajectories.

    """
    MARTINI_CG_radii = {"BB": 0.26, "SC1": 0.23, "SC2": 0.23, "SC3": 0.23}
    if radii is None:
        radii_book = MARTINI_CG_radii
    else:
        radii_book = {**MARTINI_CG_radii, **radii}
    traj = md.load(trajfile, top=topfile, stride=stride)
    traj_info, _, _ = get_traj_info(traj, nprot=nprot, protein_ref="A") # Assign random value to protein_ref to
    # prevent generation of protein Topology file to save time.
    if dt_traj is None:
        traj_times = traj.time / 1000000.0 if timeunit == "us" else traj.time / 1000.0
    else:
        traj_times = float(dt_traj * stride) * np.arange(traj.n_frames)
    surface_data = []
    data_keys = []
    for protein_idx in np.arange(nprot):
        protein_indices = np.hstack(traj_info["protein_residue_atomid_list"][protein_idx])
        new_traj = traj.atom_slice(protein_indices, inplace=False)
        traj_info_new, _, _ = get_traj_info(new_traj, nprot=1, protein_ref="A") # only one copy of protein in this
        # new_traj. Assign random value to protein_ref to prevent generation of protein Topology file to save time.
        binding_site_map_atoms = {
            bs_id: np.hstack([traj_info_new["protein_residue_atomid_list"][0][node] for node in nodes])
            for bs_id, nodes in binding_site_map.items()}
        area_all = md.shrake_rupley(new_traj, mode='atom', change_radii=radii_book)
        selected_data = [area_all[:, atomids].sum(axis=1) for bs_id, atomids in binding_site_map_atoms.items()]
        selected_data.append(traj_times)
        column_names = ["Binding Site {}".format(bs_id) for bs_id, atomids in binding_site_map_atoms.items()]
        column_names.append("Time")
        surface_data.append(pd.DataFrame(np.array(selected_data).T, columns=column_names))
        data_keys.append((traj_idx, protein_idx))
    return surface_data, data_keys


def _write_pose_info(selected_pose_info, fn, trajfile_list):
    """Write pose information for each selected pose. """
    with open(fn, "w") as f:
        for idx, info in enumerate(selected_pose_info):
            f.write("POSE ID : {}\n".format(int(idx)))
            f.write("TRAJ FN : {}\n".format(trajfile_list[info[0]]))
            f.write("PROT ID : {}\n".format(info[1]))
            f.write("LIPID ID: {}\n".format(info[2]))
            f.write("TIME    : {:8.3f} ps\n".format(info[3]))
            f.write("\n")
            f.write("\n")