# Source code for pylipid.func.clusterer

##############################################################################
# PyLipID: A python module for analysing protein-lipid interactions
#
# Author: Wanling Song
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
##############################################################################

"""This module contains functions for clustering the bound poses. """

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from kneebow.rotor import Rotor


__all__ = ["cluster_DBSCAN", "cluster_KMeans"]


def cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean"):
    r"""Cluster data using DBSCAN.

    This function clusters the samples using the density-based
    `DBSCAN <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html>`_ algorithm provided by
    scikit-learn. DBSCAN finds clusters of core samples of high density. A sample point is a core sample if at least
    `min_samples` points are within distance :math:`\varepsilon` of it. A cluster is a set of sample points that are
    density-connected, i.e. for any two points in the cluster there is a path
    :math:`\left\langle p_{1}, p_{2}, \ldots, p_{n}\right\rangle` between them where each :math:`p_{i+1}` is within
    distance :math:`\varepsilon` of :math:`p_{i}`. The values of `min_samples` and :math:`\varepsilon` determine the
    performance of this clustering.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.

    eps : None or scalar, default=None
        The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is
        not a maximum bound on the distances of points within a cluster. If None, it is set to the value at the
        knee of the sorted nearest-neighbour distance curve, computed with ``metric``.

    min_samples : None or scalar, default=None
        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This
        includes the point itself. If None, it is chosen from the data size:

        * ``min(n_dims, 4)`` when ``2 * n_dims > n_samples``;
        * ``min(2 * n_dims, n_samples)`` when ``n_samples < 1000``;
        * ``min(5 * n_dims, n_samples)`` otherwise.

    metric : string or callable, default="euclidean"
        The metric to use when calculating distance between instances in a feature array. It must be one of the
        options allowed by `sklearn.metrics.pairwise_distances` for its metric parameter.

    Returns
    -------
    labels : numpy.ndarray, shape=(n_samples,)
        Cluster labels for each data point. Noisy samples are labelled -1 by DBSCAN.

    core_sample_indices : list of lists (or a 2D array with a single row)
        Indices of core samples, grouped by cluster label; entry ``i`` holds the core samples of cluster ``i``.
        When ``n_samples <= n_dims`` no clustering is run: every sample gets label 0 and a ``(1, n_samples)`` array
        holding all sample indices is returned. For empty input an empty list is returned.

    """
    n_samples = len(data)
    if n_samples == 0:
        # Nothing to cluster; avoid indexing data[0] on empty input.
        return np.array([], dtype=int), []
    n_dims = len(data[0])
    if n_samples <= n_dims:
        # Too few samples to estimate density: put everything in one cluster.
        return np.zeros(n_samples, dtype=int), np.arange(n_samples)[np.newaxis, :]

    # Only fall back to the size-based heuristic when the caller did not
    # provide min_samples; a user-supplied value must be honoured.
    if min_samples is None:
        if 2 * n_dims > n_samples:
            min_samples = min(n_dims, 4)
        elif n_samples < 1000:
            min_samples = min(2 * n_dims, n_samples)
        else:
            min_samples = min(5 * n_dims, n_samples)

    if eps is None:
        # At least 2 neighbours are needed (column 0 is the query point itself)
        # and no more than n_samples can be requested.
        n_neighbors = max(2, min(int(min_samples), n_samples))
        # Use the same metric as DBSCAN so eps is expressed in the right units.
        neighbors = NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
        neighbors.fit(data)
        distances, _ = neighbors.kneighbors(data)
        # NOTE: the curve is built from the distance to the nearest other
        # sample (column 1), sorted ascending, not from the k-th neighbour.
        nn_distances = np.sort(distances[:, 1])
        curve = np.vstack([np.arange(len(nn_distances)), nn_distances]).T
        rotor = Rotor()
        rotor.fit_rotate(curve)
        eps = nn_distances[rotor.get_elbow_index()]

    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    dbscan.fit(data)
    labels = dbscan.labels_

    # Group core samples by their cluster label; noise (-1) never contains
    # core samples, so labels index directly into the list.
    n_clusters = np.unique(labels[labels != -1]).size
    core_sample_indices = [[] for _ in range(n_clusters)]
    for core_index in dbscan.core_sample_indices_:
        core_sample_indices[labels[core_index]].append(core_index)
    return labels, core_sample_indices


def cluster_KMeans(data, n_clusters):
    r"""Cluster data using KMeans.

    This function clusters the samples
    using `KMeans <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html>`_
    provided by scikit-learn. KMeans separates the samples into `n` clusters of equal variances, via minimizing
    the `inertia`, which is defined as:

    .. math::
        \sum_{i=0}^{n} \min _{u_{i} \in C}\left(\left\|x_{i}-u_{i}\right\|^{2}\right)

    where :math:`u_{i}` is the `centroid` of cluster i. KMeans scales well with large datasets but performs poorly
    with clusters of varying sizes and density.

    If there are fewer samples than ``n_clusters``, KMeans cannot be run; the labels produced by
    ``cluster_DBSCAN`` with its default settings are returned instead.

    Parameters
    ----------
    data : numpy.ndarray, shape=(n_samples, n_dims)
        Sample data to find clusters.

    n_clusters : int
        The number of clusters to form as well as the number of centroids to generate.

    Returns
    -------
    labels : array_like, shape=(n_samples,)
        Cluster labels for each data point.

    """
    if len(data) < n_clusters:
        # cluster_DBSCAN returns (labels, core_sample_indices); only the labels
        # are part of this function's contract.
        labels, _ = cluster_DBSCAN(data, eps=None, min_samples=None, metric="euclidean")
        return labels
    model = KMeans(n_clusters=n_clusters).fit(data)
    return model.predict(data)