##############################################################################
# PyLipID: A python module for analysing protein-lipid interactions
#
# Author: Wanling Song
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
##############################################################################
"""This module contains functions that plot interactions as a function of residue index.
"""
import os
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import numpy as np
import pandas as pd
import logomaker
__all__ = ["plot_residue_data", "plot_residue_data_logo",
"plot_binding_site_data", "plot_surface_area", "AxisIndex"]
[docs]def plot_residue_data(residue_index, interactions, gap=200, ylabel=None,
fn=None, title=None, fig_close=False):
"""Plot interactions as a function of residue index
Parameters
----------
residue_index : list
Residue indices in an ascending order. If a residue index is smaller than its preceding one,
the plotting function will consider it as the start of a new chain and will plot the following data
in a new figure. A gap in residue index that is less than ``gap`` will be marked as gray areas in
the figure, but a gap that is larger than ``gap`` will start a new figure.
interactions : list
Values to plot. In the same order as ``residue_index``.
gap : int, optional, default=200
The number of missing residues in ``residue_index`` that initiate a new figure. The gap between two adjacent
index in ``residue_index`` that is smaller than the provided value will be considered as missing residues
and will be marked as gray areas in the figure, whereas a gap that is larger than the provided value
will start a new figure and plot the following data in that new figure. This can help to make figures
more compressed. The default gap is 200.
ylabel : str, optional, default=None
y axis label. Default is "Interactions".
fn : str, optional, default=None
Figure name. By default the figure is saved as "Figure_interactions.pdf" as the current
working directory.
title : str, optional, default=None
Figure title.
fig_close : bool, optional, default=False
Use plt.close() to close the figure. Can be used to save memory if many figures are opened.
"""
bar_color = "#176BA0"
if ylabel is None:
ylabel = "Interactions"
if fn is None:
fn = os.path.join(os.getcwd(), "Figure_interactions.pdf")
# check for chain breaks
gray_areas = defaultdict(list) # show grey area to indicate missing residues
chain_starts = [0] # plot in separate figures if the gap between two adjacent residues is larger than 50
for idx in np.arange(1, len(residue_index)):
if residue_index[idx] - residue_index[idx - 1] < 0:
chain_starts.append(idx)
elif residue_index[idx] - residue_index[idx - 1] > gap:
chain_starts.append(idx)
elif 1 < residue_index[idx] - residue_index[idx - 1] <= gap:
gray_areas[chain_starts[-1]].append([residue_index[idx - 1] + 1, residue_index[idx] - 1])
chain_starts.append(len(residue_index))
# plot
for chain_idx in np.arange(len(chain_starts[:-1])):
df = interactions[chain_starts[chain_idx]:chain_starts[chain_idx + 1]]
resi_selected = residue_index[chain_starts[chain_idx]:chain_starts[chain_idx + 1]]
if 0 < len(df) <= 20:
fig, ax = plt.subplots(1, 1, figsize=(2.8, 1.5))
ax.xaxis.set_major_locator(MultipleLocator(5))
ax.xaxis.set_minor_locator(MultipleLocator(1))
elif 20 < len(df) <= 50:
fig, ax = plt.subplots(1, 1, figsize=(3.2, 1.5))
ax.xaxis.set_major_locator(MultipleLocator(10))
ax.xaxis.set_minor_locator(MultipleLocator(1))
elif 50 < len(df) <= 300:
fig, ax = plt.subplots(1, 1, figsize=(3.8, 1.8))
ax.xaxis.set_major_locator(MultipleLocator(50))
ax.xaxis.set_minor_locator(MultipleLocator(10))
elif 300 < len(df) <= 1000:
fig, ax = plt.subplots(1, 1, figsize=(4.5, 1.8))
ax.xaxis.set_major_locator(MultipleLocator(100))
ax.xaxis.set_minor_locator(MultipleLocator(10))
elif 1000 < len(df) <= 2000:
fig, ax = plt.subplots(1, 1, figsize=(6.0, 1.8))
ax.xaxis.set_major_locator(MultipleLocator(200))
ax.xaxis.set_minor_locator(MultipleLocator(50))
elif len(df) > 2000:
fig, ax = plt.subplots(1, 1, figsize=(7.5, 1.8))
ax.xaxis.set_major_locator(MultipleLocator(500))
ax.xaxis.set_minor_locator(MultipleLocator(100))
ax.bar(resi_selected, df, 1.0, linewidth=0, color=bar_color)
# plot missing residue area
if chain_starts[chain_idx] in gray_areas.keys():
for gray_area in gray_areas[chain_starts[chain_idx]]:
ax.axvspan(gray_area[0], gray_area[1], facecolor="#c0c0c0", alpha=0.3)
# axis setting
ax.set_ylim(0, df.max() * 1.05)
ax.set_xlim(resi_selected.min() - 1, resi_selected.max() + 1)
ax.set_ylabel(ylabel, fontsize=8, weight="bold")
ax.set_xlabel("Residue Index", fontsize=8, weight="bold")
for label in ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels():
plt.setp(label, fontsize=8, weight="bold")
if title is not None:
ax.set_title(title, fontsize=8, weight="bold")
plt.tight_layout()
if len(chain_starts) == 2:
fig.savefig(fn, dpi=300)
else:
name, ext = os.path.splitext(fn)
fig.savefig("{}_{}{}".format(name, chain_idx, ext), dpi=300)
if fig_close:
plt.close()
return
[docs]def plot_residue_data_logo(residue_index, logos, interactions, gap=1000, letter_map=None,
color_scheme="chemistry", ylabel=None, title=None, fn=None, fig_close=False):
"""Plot interactions using `logomaker.Logo
<https://logomaker.readthedocs.io/en/latest/implementation.html#logo-class>`_.
Parameters
-----------
residue_index : list
Residue indices in an ascending order. If a residue index is smaller than its preceding one,
the plotting function will consider it as the start of a new chain and will plot the following data
in a new figure. A gap in residue index that is less than ``gap`` will be marked as gray areas in
the figure, but a gap that is larger than ``gap`` will start a new figure.
logos : list of str
Single letter logos in the corresponding order as ``residue_index``. The height of logos in the figure
will be determined by values given to ``interactions``. Three-letter name of the 20 common amino acids
are accepted and will be converted to their corresponding single-letter names in this function by
the default. Other mappings can be defined via ``letter_map``.
interactions : list
Plotting values in the corresponding order as ``residue_index``.
gap : int, optional, default=1000
The number of missing residues in ``residue_index`` that starts a new figure. A gap between two adjacent
index in ``residue_index`` that is smaller than the provided value will be considered as missing residues
and will be marked as gray areas in the figure, whereas a gap that is larger than the provided value
will start a new figure and plot the following data in that new figure. This can help to make figures
more compressed. The gap needs to be greater than 1000. The default is 1000.
letter_map : dict, optional, default=None
A dictionary that maps provided names to single-letter logos in the form of
{"provided name": "single_letter logo"}.
color_scheme : str, optional, default="chemistry"
The color scheme used by logomaker.Logo(). See
`Color Schemes <https://logomaker.readthedocs.io/en/latest/examples.html#color-schemes>`_ for accepted values.
Default is "chemistry".
ylabel : str, optional, default=None
y axis label. Default is "Interactions".
fn : str, optional, default=None
Figure name. By default the figure is saved as "Figure_interactions_logo.pdf" as the current
working directory.
fig_close : bool, optional, default=False
Use plt.close() to close the figure. Can be used to save memory if many figures are opened.
"""
# single-letter dictionary
single_letter = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'}
if letter_map is not None:
single_letter.update(letter_map)
logos_checked = []
for name in logos:
if len(name) == 1:
logos_checked.append(name)
else:
logos_checked.append(single_letter[name])
if ylabel is None:
ylabel = "Interactions"
if fn is None:
fn = os.path.join(os.getcwd(), "Figure_interactions_logo.pdf")
length = 100
# check for chain breaks, gray_areas and axis breaks
axis_obj = AxisIndex(residue_index, logos_checked, interactions, length, gap)
axis_obj.sort()
# plot
for page_idx in axis_obj.breaks.keys():
n_rows = len(axis_obj.breaks[page_idx])
fig, axes = plt.subplots(n_rows, 1, figsize=(4.5, 1.3 * n_rows), sharey=True)
plt.subplots_adjust(hspace=0.5, left=0.2)
ymax = []
for ax_idx, ax in enumerate(np.atleast_1d(axes)):
resi_selected = [item[0] for item in axis_obj.breaks[page_idx][ax_idx]]
logos_selected = [item[1] for item in axis_obj.breaks[page_idx][ax_idx]]
interaction_selected = [item[2] for item in axis_obj.breaks[page_idx][ax_idx]]
ymax.append(np.max(interaction_selected))
if np.sum(interaction_selected) > 0:
df = pd.DataFrame({"Resid": resi_selected, "Resn": logos_selected, "Data": interaction_selected})
matrix = df.pivot(index="Resid", columns='Resn', values="Data").fillna(0)
logomaker.Logo(matrix, color_scheme=color_scheme, ax=ax)
if ax_idx == (n_rows - 1):
ax.set_xlabel("Residue Index", fontsize=8, weight="bold")
ax.xaxis.set_major_locator(MultipleLocator(20))
ax.xaxis.set_minor_locator(MultipleLocator(1))
ax.set_xlim(resi_selected[0] - 0.5, resi_selected[-1] + 0.5)
ax.set_ylabel(ylabel, fontsize=8, weight="bold", va="center")
for label in ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels():
plt.setp(label, fontsize=8, weight="bold")
np.atleast_1d(axes)[-1].set_ylim(0, np.max(ymax) * 1.05)
# plot missing areas
if page_idx in axis_obj.gray_areas.keys():
for item in axis_obj.gray_areas[page_idx]:
np.atleast_1d(axes)[item[0]].axvspan(item[1], item[2], facecolor="#c0c0c0", alpha=0.3)
if title is not None:
np.atleast_1d(axes)[0].set_title(title, fontsize=10, weight="bold")
plt.tight_layout()
if len(axis_obj.breaks.keys()) == 1:
fig.savefig(fn, dpi=300)
else:
name, ext = os.path.splitext(fn)
fig.savefig("{}_{}{}".format(name, page_idx, ext), dpi=300)
if fig_close:
plt.close()
return
[docs]def plot_binding_site_data(data, fig_fn, ylabel=None, title=None, fig_close=False):
"""Plot binding site data in a matplotlib violin plot.
The provided ``data`` needs to be a pandas.DataFrame object which has "Binding Site {idx}" as its column names and
records binding site information by column.
Parameters
----------
data : padnas.DataFrame
Data to plot. It needs to have "Binding Site {idx}" as its column names and records binding site information by
column.
fig_fn : str
Figure name.
ylabel : str, optional, default=None
Y label.
title : str, optional, default=None
Figure title.
fig_close : bool, optional, default=False
Use plt.close() to close the figure. Can be used to save memory if many figures are opened.
"""
from itertools import cycle as _cycle
def adjacent_values(vals, q1, q3):
upper_adjacent_value = q3 + (q3 - q1) * 1.5
upper_adjacent_value = np.clip(upper_adjacent_value, q3, vals[-1])
lower_adjacent_value = q1 - (q3 - q1) * 1.5
lower_adjacent_value = np.clip(lower_adjacent_value, vals[0], q1)
return lower_adjacent_value, upper_adjacent_value
if ylabel is None:
ylabel = ""
if title is None:
title = ""
color_set = _cycle(plt.get_cmap("tab10").colors)
plt.rcParams["font.size"] = 10
plt.rcParams["font.weight"] = "bold"
BS_names = [col for col in data.columns]
BS_id_set = [int(name.split()[-1]) for name in BS_names]
BS_id_set.sort()
data_processed = [np.sort(data["Binding Site {}".format(bs_id)].dropna().tolist())
for bs_id in BS_id_set]
colors = [next(color_set) for dummy in BS_id_set]
fig, ax = plt.subplots(1, 1, figsize=(len(BS_id_set)*0.6, 2.8))
plt.subplots_adjust(bottom=0.20, top=0.83)
ax.set_title(title, fontsize=10, weight="bold")
parts = ax.violinplot(data_processed, showmeans=False, showmedians=False, showextrema=False)
for pc_idx, pc in enumerate(parts['bodies']):
pc.set_facecolors(colors[pc_idx])
pc.set_edgecolor('black')
pc.set_alpha(1)
# deal with the situation in which the columns in data have different lengths.
quartile1, medians, quartile3 = np.array([np.percentile(d, [25, 50, 75]) for d in data_processed]).T
whiskers = np.array([
adjacent_values(sorted_array, q1, q3)
for sorted_array, q1, q3 in zip(data_processed, quartile1, quartile3)])
whiskers_min, whiskers_max = whiskers[:, 0], whiskers[:, 1]
inds = np.arange(1, len(medians) + 1)
ax.scatter(inds, medians, marker='o', color='white', s=3, zorder=3)
ax.vlines(inds, quartile1, quartile3, color='k', linestyle='-', lw=5)
ax.vlines(inds, whiskers_min, whiskers_max, color='k', linestyle='-', lw=1)
ax.get_xaxis().set_tick_params(direction='out')
ax.xaxis.set_ticks_position('bottom')
ax.set_xticks(np.arange(1, len(BS_id_set) + 1))
ax.set_xticklabels(BS_id_set, fontsize=10, weight="bold")
ax.set_xlim(0.25, len(BS_id_set) + 0.75)
ax.set_xlabel('Binding Site', fontsize=10, weight="bold")
ax.set_ylabel(ylabel, fontsize=10, weight="bold")
plt.tight_layout()
fig.savefig(fig_fn, dpi=200)
if fig_close:
plt.close()
return
[docs]def plot_surface_area(surface_area, fig_fn, timeunit=None, fig_close=False):
"""Plot binding site surface area as a function of time.
The provided ``surface_area`` needs to be a pandas.DataFrame object, which has columns named as "Binding Site {idx}"
to record binding site surface areas and a column named "Time" to record the timesteps at which the surface area data
are taken.
Parameters
----------
surface_area : pandas.DataFrame
A pandas.DataFrame object which has columns named as "Binding Site {idx}"
to record binding site surface areas and a column named "Time" to record the timesteps at which the surface area data
fig_fn : str
Figure filename.
timeunit : {"ns", "us"} or None, optional, default=None
Time unit shown in x label.
fig_close : bool, optional, default=False
Use plt.close() to close the figure. Can be used to save memory if many figures are opened.
See also
---------
pylipid.func.calculate_surface_area
The function that generates surface_area data.
"""
from itertools import cycle as _cycle
color_set = _cycle(plt.get_cmap("tab10").colors)
plt.rcParams["font.size"] = 10
plt.rcParams["font.weight"] = "normal"
if timeunit is None:
timeunit = ""
elif timeunit == "ns":
timeunit = " (ns)"
elif timeunit == "us":
timeunit = r" ($\mu$s)"
row_set = list(set([ind[:2] for ind in surface_area.index]))
row_set.sort()
col_set = [col for col in surface_area.columns if col != "Time"]
colors = [next(color_set) for dummy in col_set]
fig, axes = plt.subplots(len(row_set), len(col_set), figsize=(len(col_set)*2.4, len(row_set)*1.6),
sharex=True, sharey=True)
plt.subplots_adjust(wspace=0.2, hspace=0.16)
if len(col_set) == 1:
axes = np.atleast_1d(axes)[:, np.newaxis]
else:
axes = np.atleast_2d(axes)
for row_idx, row in enumerate(row_set):
df = surface_area.loc[row]
for col_idx, bs_name in enumerate(col_set):
axes[row_idx, col_idx].plot(df["Time"], df[bs_name], color=colors[col_idx],
label="traj {} prot {}".format(row[0], row[1]))
if row_idx == len(row_set)-1:
axes[row_idx, col_idx].set_xlabel("Time{}".format(timeunit), fontsize=10)
if col_idx == 0:
axes[row_idx, col_idx].set_ylabel(r"Area (nm$^2$)", fontsize=10)
if row_idx == 0:
axes[row_idx, col_idx].set_title(bs_name, fontsize=10)
axes[row_idx, col_idx].legend(loc="best", frameon=False)
fig.tight_layout()
fig.savefig(fig_fn, dpi=200)
if fig_close:
plt.close()
return
class AxisIndex:
"""Build axes for logo figure."""
def __init__(self, residue_index, logos, interactions, length, gap):
self.page_idx = 0
self.length = length
self.gap = gap
self.residue_index = residue_index
self.logos = logos
self.interactions = interactions
self.axis_start = (residue_index[0] // length) * length
self.breaks = defaultdict(list)
self.breaks[self.page_idx].append([])
self.gray_areas = defaultdict(list)
def fill_missing(self, start_value, end_value):
for xloci in np.arange(start_value, end_value + 1):
self.breaks[self.page_idx][-1].append((xloci, "A", 0))
self.gray_areas[self.page_idx].append((len(self.breaks[self.page_idx]) - 1, start_value, end_value))
def new_axis(self, pointer):
self.breaks[self.page_idx].append([])
self.axis_start = self.residue_index[pointer]
self.breaks[self.page_idx][-1].append(
(self.residue_index[pointer], self.logos[pointer], self.interactions[pointer]))
def new_page(self, pointer):
if len(self.breaks[self.page_idx][-1]) < self.length:
self.fill_missing(self.axis_start + len(self.breaks[self.page_idx][-1]), self.axis_start + self.length - 1)
self.page_idx += 1
self.breaks[self.page_idx].append([])
self.axis_start = (self.residue_index[pointer] // self.length) * self.length
if self.axis_start != self.residue_index[pointer]:
self.fill_missing(self.axis_start, self.residue_index[pointer] - 1)
self.breaks[self.page_idx][-1].append(
(self.residue_index[pointer], self.logos[pointer], self.interactions[pointer]))
def new_gap(self, pointer):
gray_start = self.residue_index[pointer - 1] + 1
for xloci in np.arange(self.residue_index[pointer - 1] + 1, self.residue_index[pointer]):
if xloci - self.axis_start < self.length:
self.breaks[self.page_idx][-1].append((xloci, "A", 0))
else:
self.gray_areas[self.page_idx].append((len(self.breaks[self.page_idx]) - 1, gray_start, xloci - 1))
self.breaks[self.page_idx].append([])
self.breaks[self.page_idx][-1].append((xloci, "A", 0))
self.axis_start = xloci
gray_start = xloci
self.gray_areas[self.page_idx].append(
(len(self.breaks[self.page_idx]) - 1, gray_start, self.residue_index[pointer] - 1))
self.breaks[self.page_idx][-1].append(
(self.residue_index[pointer], self.logos[pointer], self.interactions[pointer]))
def sort(self):
end = False
if self.axis_start != self.residue_index[0]:
self.fill_missing(self.axis_start, self.residue_index[0] - 1)
self.breaks[self.page_idx][-1].append((self.residue_index[0], self.logos[0], self.interactions[0]))
pointer = 1
while not end:
if self.residue_index[pointer] - self.residue_index[pointer - 1] == 1 and self.residue_index[
pointer] - self.axis_start < self.length:
self.breaks[self.page_idx][-1].append(
(self.residue_index[pointer], self.logos[pointer], self.interactions[pointer]))
pointer += 1
elif self.residue_index[pointer] - self.residue_index[pointer - 1] == 1 and self.residue_index[
pointer] - self.axis_start >= self.length:
self.new_axis(pointer)
pointer += 1
elif self.residue_index[pointer] - self.residue_index[pointer - 1] < 0:
self.new_page(pointer)
pointer += 1
elif 1 < self.residue_index[pointer] - self.residue_index[pointer - 1] <= self.gap:
self.new_gap(pointer)
pointer += 1
elif self.residue_index[pointer] - self.residue_index[pointer - 1] > self.gap:
self.new_page(pointer)
pointer += 1
if pointer == len(self.residue_index):
end = True
if len(self.breaks[self.page_idx][-1]) < self.length:
self.fill_missing(self.axis_start + len(self.breaks[self.page_idx][-1]), self.axis_start + self.length - 1)