Source code for pylipid.func.clusterer

##############################################################################
# PyLipID: A Python module for analysing protein-lipid interactions
#
# Author: Wanling Song
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
##############################################################################

"""This module contains functions for clustering the bound poses. """

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from kneebow.rotor import Rotor


__all__ = ["cluster_DBSCAN", "cluster_KMeans"]


def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    r"""Cluster data using DBSCAN.

    This function clusters the samples using the density-based clustering method
    `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_
    provided by scikit-learn. DBSCAN finds clusters of core samples of high
    density. A sample point is a core sample if at least `min_samples` points
    are within distance :math:`\varepsilon` of it. A cluster is defined as a set
    of sample points that are mutually density-connected and density-reachable,
    i.e. for any two points in the cluster there is a path
    :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` connecting
    them in which each :math:`p_{i+1}` is within distance :math:`\varepsilon`
    of :math:`p_{i}`.

    The values of `min_samples` and :math:`\varepsilon` determine the
    performance of this clustering method. If `min_samples` is None, it is set
    heuristically from the data shape (roughly 2 * n_dims for data sets of
    fewer than 1000 samples and 5 * n_dims for larger ones). If `eps` is None,
    it is set to the value at the knee of the k-distance plot.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.
    eps : scalar, optional, default=None
        The maximum distance between two samples for one to be considered as in
        the neighborhood of the other. This is not a maximum bound on the
        distances of points within a cluster. This is the most important DBSCAN
        parameter to choose appropriately for your data set and distance
        function. If None, it is set to the value at the knee of the k-distance
        plot.
    min_samples : int, optional, default=None
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself. If
        None, it is set heuristically from the data shape as described above.
    metric : str or callable, default="euclidean"
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of the
        options allowed by `sklearn.metrics.pairwise_distances` for its metric
        parameter.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.
    core_sample_indices : list of lists, len=n_clusters
        Indices of core samples, grouped by cluster label.

    """
    # Too few samples to estimate densities; return a single cluster that
    # contains all sample points.
    if len(data) <= len(data[0]):
        return np.array([0 for dummy in data]), np.arange(len(data))[np.newaxis, :]
    if min_samples is None:
        # Heuristic choice of min_samples based on the data shape.
        if 2 * len(data[0]) > len(data):
            min_samples = np.min([len(data[0]), 4])
        elif len(data) < 1000:
            min_samples = np.min([2 * len(data[0]), len(data)])
        else:
            min_samples = np.min([5 * len(data[0]), len(data)])
    if eps is None:
        # Estimate eps from the knee of the sorted nearest-neighbor distances.
        nearest_neighbors = NearestNeighbors(n_neighbors=min_samples)
        nearest_neighbors.fit(data)
        distances, indices = nearest_neighbors.kneighbors(data)
        # Column 0 holds the zero self-distances; column 1 holds the distance
        # from each point to its nearest neighbor.
        distances = np.sort(distances, axis=0)[:, 1]
        data_vstacked = np.vstack([np.arange(len(distances)), distances]).T
        rotor = Rotor()
        rotor.fit_rotate(data_vstacked)
        elbow_index = rotor.get_elbow_index()
        eps = distances[elbow_index]
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)
    # Group the indices of core samples by their cluster label.
    core_sample_indices = [[] for label in np.unique(dbscan.labels_) if label != -1]
    for core_sample_index in dbscan.core_sample_indices_:
        core_sample_indices[dbscan.labels_[core_sample_index]].append(core_sample_index)
    return dbscan.labels_, core_sample_indices
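
# The helper below is a minimal usage sketch for cluster_DBSCAN, not part of
# the PyLipID API: the function name, the random seed, and the two synthetic
# Gaussian blobs (stand-ins for bound-pose coordinates) are illustrative
# assumptions. It lets the function pick eps and min_samples automatically.
def _demo_cluster_DBSCAN():
    """Illustrative only: cluster two well-separated Gaussian blobs."""
    rng = np.random.default_rng(42)
    # 200 samples around the origin and 200 samples around (5, 5, 5).
    blob_a = rng.normal(loc=0.0, scale=0.3, size=(200, 3))
    blob_b = rng.normal(loc=5.0, scale=0.3, size=(200, 3))
    data = np.concatenate([blob_a, blob_b])
    labels, core_sample_indices = cluster_DBSCAN(data)
    # labels holds one integer per sample, with -1 marking noise points;
    # core_sample_indices holds one list of core-sample indices per cluster.
    print("DBSCAN cluster labels:", np.unique(labels))
    print("core samples per cluster:", [len(idx) for idx in core_sample_indices])
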
def cluster_KMeans(data, n_clusters):
    r"""Cluster data using KMeans.

    This function clusters the samples using
    `KMeans <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html>`_
    provided by scikit-learn. KMeans separates the samples into `n_clusters`
    clusters of equal variance by minimizing the `inertia`, which is defined as:

    .. math::
        \sum_{i=0}^{n} \min_{\mu_{j} \in C}\left(\left\|x_{i}-\mu_{j}\right\|^{2}\right)

    where :math:`\mu_{j}` is the `centroid` of cluster j. KMeans scales well to
    large data sets but performs poorly on clusters of varying sizes and
    densities.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.
    n_clusters : int
        The number of clusters to form as well as the number of centroids to
        generate.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.

    """
    # Fall back to DBSCAN when there are fewer samples than requested clusters,
    # keeping only the labels so the return type stays consistent.
    if len(data) < n_clusters:
        labels, _ = cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean")
        return labels
    model = KMeans(n_clusters=n_clusters).fit(data)
    labels = model.predict(data)
    return labels
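
# A minimal sketch of cluster_KMeans on the same kind of synthetic data; as
# above, the helper name, seed, and blob parameters are illustrative
# assumptions rather than part of the PyLipID API.
def _demo_cluster_KMeans():
    """Illustrative only: split two Gaussian blobs into two clusters."""
    rng = np.random.default_rng(7)
    blob_a = rng.normal(loc=0.0, scale=0.3, size=(150, 3))
    blob_b = rng.normal(loc=5.0, scale=0.3, size=(150, 3))
    data = np.concatenate([blob_a, blob_b])
    labels = cluster_KMeans(data, n_clusters=2)
    # With two well-separated blobs, each cluster should hold ~150 samples.
    print("KMeans cluster sizes:", np.bincount(labels))


if __name__ == "__main__":
    _demo_cluster_DBSCAN()
    _demo_cluster_KMeans()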