##############################################################################
# PyLipID: A python module for analysing protein-lipid interactions
#
# Author: Wanling Song
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
##############################################################################
"""This module contains functions for clustering the bound poses. """

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from kneebow.rotor import Rotor

__all__ = ["cluster_DBSCAN", "cluster_KMeans"]
def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    r"""Cluster data using DBSCAN.

    This function clusters the samples using a density-based cluster
    `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_
    provided by scikit. DBSCAN finds clusters of core samples of high density. A sample point
    is a core sample if at least `min_samples` points are within distance :math:`\varepsilon`
    of it. A cluster is defined as a set of sample points that are mutually density-connected
    and density-reachable, i.e. there is a path
    :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` where each :math:`p_{i+1}`
    is within distance :math:`\varepsilon` of :math:`p_{i}` for any two p in the two.

    The values of `min_samples` and :math:`\varepsilon` determine the performance of this
    cluster. If `min_samples` is None, a heuristic based on the number of samples and the
    number of dimensions is used. If :math:`\varepsilon` is None, it is set as the value at
    the knee of the k-distance plot.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.
    eps : None or scalar, default=None
        The maximum distance between two samples for one to be considered as in the
        neighborhood of the other. This is not a maximum bound on the distances of points
        within a cluster. This is the most important DBSCAN parameter to choose appropriately
        for your data set and distance function. If None, it is set as the value at the knee
        of the k-distance plot.
    min_samples : None or scalar, default=None
        The number of samples (or total weight) in a neighborhood for a point to be
        considered as a core point. This includes the point itself. If None, a heuristic
        based on the number of samples and the number of dimensions is used.
    metric : string or callable, default='euclidean'
        The metric to use when calculating distance between instances in a feature array.
        If metric is a string or callable, it must be one of the options allowed by
        `sklearn.metrics.pairwise_distances` for its metric parameter.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.
    core_sample_indices : array_like, shape=(n_clusters,)
        Indices of core samples, grouped by cluster label (noise, label -1, excluded).

    """
    # Degenerate case: no more samples than dimensions -- density estimation is
    # meaningless, so place everything in a single cluster of core samples.
    if len(data) <= len(data[0]):
        return np.array([0 for dummy in data]), np.arange(len(data))[np.newaxis, :]
    # BUG FIX: the heuristic previously ran unconditionally, silently discarding a
    # caller-supplied ``min_samples``. Only apply it when the caller left it as None.
    if min_samples is None:
        if 2 * len(data[0]) > len(data):
            min_samples = np.min([len(data[0]), 4])
        elif len(data) < 1000:
            min_samples = np.min([2 * len(data[0]), len(data)])
        else:  # len(data) >= 1000
            min_samples = np.min([5 * len(data[0]), len(data)])
    if eps is None:
        # Estimate eps from the knee of the sorted k-distance plot (kneebow's Rotor
        # locates the elbow of the (index, distance) curve).
        nearest_neighbors = NearestNeighbors(n_neighbors=min_samples)
        nearest_neighbors.fit(data)
        distances, indices = nearest_neighbors.kneighbors(data)
        distances = np.sort(distances, axis=0)[:, 1]
        data_vstacked = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(data_vstacked)
        elbow_index = rotor.get_elbow_index()
        eps = distances[elbow_index]
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)
    # Group core-sample indices by their cluster label; noise (-1) has no core samples.
    core_sample_indices = [[] for label in np.unique(dbscan.labels_) if label != -1]
    for core_sample_index in dbscan.core_sample_indices_:
        core_sample_indices[dbscan.labels_[core_sample_index]].append(core_sample_index)
    return dbscan.labels_, core_sample_indices
def cluster_KMeans(data, n_clusters):
    r"""Cluster data using KMeans.

    This function clusters the samples using
    `KMeans <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html>`_
    provided by scikit. The KMeans cluster separates the samples into `n` clusters of equal
    variances, via minimizing the `inertia`, which is defined as:

    .. math::
        \sum_{i=0}^{n} \min _{u_{i} \in C}\left(\left\|x_{i}-u_{i}\right\|^{2}\right)

    where :math:`u_{i}` is the `centroid` of cluster i. KMeans scales well with large dataset
    but performs poorly with clusters of varying sizes and density.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.
    n_clusters : int
        The number of clusters to form as well as the number of centroids to generate.

    Returns
    -----------
    labels : array_like, shape=(n_samples)
        Cluster labels for each data point.

    """
    # Too few samples for the requested number of clusters -- fall back to DBSCAN.
    if len(data) < n_clusters:
        # BUG FIX: cluster_DBSCAN returns (labels, core_sample_indices); the raw tuple
        # was previously returned here, violating the documented single-array return.
        labels, _ = cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean")
        return labels
    model = KMeans(n_clusters=n_clusters).fit(data)
    labels = model.predict(data)
    return labels