====
Demo
====

Here we provide a ready-to-use demo script for lipid interaction analysis using PyLipID. The script works
with PyLipID versions later than 1.4, so please update PyLipID to the latest version before running it ::

    import numpy as np
    import matplotlib.pyplot as plt
    from pylipid.api import LipidInteraction
    from pylipid.util import check_dir

    ##################################################################
    ##### This part needs changes according to your setting ##########
    ##################################################################
    trajfile_list = ["run1/md.xtc", "run2/md.xtc"]
    topfile_list = ["run1/md.gro", "run2/md.gro"]  # a topology file is needed when the trajectory format does
                                                   # not provide topology information. See mdtraj.load() for
                                                   # more information.
    dt_traj = None  # the timestep of the trajectories. Set this when the trajectories are in a format
                    # with no timestep information. Not necessary for trajectory formats such as xtc or trr.
    stride = 1   # tell PyLipID to analyze every stride-th frame. Can be used to reduce memory use
                 # and speed up the calculation.

    lipid = "CHOL"   # residue name of the lipid in the topology.
    lipid_atoms = None  # None means all lipid atoms will be considered for the interaction calculation.
    cutoffs = [0.5, 0.8]  # dual-cutoff scheme for coarse-grained simulations. A single-cutoff scheme can be
                          # achieved by using the same value for both cutoffs.

    nprot = 1   # if the simulation system has N copies of the receptor, "nprot=N" will report interactions
                # averaged over the N copies, whereas "nprot=1" will ask PyLipID to report interactions for
                # each copy.

    binding_site_size = 4  # a binding site should contain at least four residues.

    n_top_poses = 3     # number of representative bound poses to write out for each binding site.
    n_clusters = "auto"  # number of clusters to group the bound poses of a binding site into. PyLipID
                         # will write out a pose conformation for each cluster. By default, i.e.
                         # "auto", PyLipID will use a density-based clusterer to find possible clusters.

    save_dir = None  # save in the current working directory if it is None.
    save_pose_format = "gro"  # format that the poses are written in.
    save_pose_traj = True  # save all the bound poses in a trajectory for each binding site. The generated
                           # trajectories can take some disk space (up to a couple of GB depending on your system).
    save_pose_traj_format = "xtc"  # the format of the saved pose trajectories. Can be any format that is supported
                                   # by mdtraj.

    timeunit = "us"  # microsecond; "ns" is nanosecond. Time unit used for reporting the results.
    resi_offset = 0  # shift the residue index, useful for MARTINI models.

    radii = None  # radii of protein atoms/beads, given as a python dictionary {atom_name: radius}.
                  # Used for the calculation of binding site surface area. The van der Waals radii of common atoms
                  # are defined by mdtraj (https://github.com/mdtraj/mdtraj/blob/master/mdtraj/geometry/sasa.py#L56).
                  # The radii of MARTINI 2.2 beads are included in PyLipID.

    pdb_file_to_map = None   # if a pdb coordinate file of the receptor is provided, a python script
                             # "show_binding_site_info.py" will be generated which maps the binding
                             # site information onto the structure in PyMol. As PyMol cannot recognize
                             # coarse-grained structures, an atomistic structure of the receptor is needed.

    fig_format = "pdf"  # format for all PyLipID-produced figures. Any format supported by
                        # matplotlib.pyplot.savefig() is allowed.

    num_cpus = None  # the number of CPUs to use when functions use multiprocessing. By default,
                     # i.e. None, the functions will use all the CPUs available, which can exhaust the memory in
                     # some cases.

    #####################################
    ###### no changes needed below ######
    #####################################

    #### calculate lipid interactions
    # Every user setting is forwarded to LipidInteraction. ``nprot`` in particular must be passed
    # through (not hard-coded) so that the configured number of receptor copies takes effect.
    li = LipidInteraction(trajfile_list, topfile_list=topfile_list, cutoffs=cutoffs, lipid=lipid,
                          lipid_atoms=lipid_atoms, nprot=nprot, resi_offset=resi_offset,
                          timeunit=timeunit, save_dir=save_dir, stride=stride, dt_traj=dt_traj)

    # Per-residue analysis; residue_id=None is passed to every step, as in the settings-free default.
    li.collect_residue_contacts()
    li.compute_residue_duration(residue_id=None)
    li.compute_residue_occupancy(residue_id=None)
    li.compute_residue_lipidcount(residue_id=None)
    li.show_stats_per_traj(write_log=True, print_log=True)
    li.compute_residue_koff(residue_id=None, plot_data=True, fig_close=True,
                            fig_format=fig_format, num_cpus=num_cpus)

    # Binding-site detection. All binding-site analyses are skipped when nothing is found.
    li.compute_binding_nodes(threshold=binding_site_size, print_data=False)
    if len(li.node_list) == 0:
        print("*"*50)
        print("No binding site detected! Skip analysis for binding sites.")
        print("*"*50)
    else:
        li.compute_site_duration(binding_site_id=None)
        li.compute_site_occupancy(binding_site_id=None)
        li.compute_site_lipidcount(binding_site_id=None)
        li.compute_site_koff(binding_site_id=None, plot_data=True, fig_close=True,
                             fig_format=fig_format, num_cpus=num_cpus)
        pose_traj, pose_rmsd_data = li.analyze_bound_poses(binding_site_id=None, pose_format=save_pose_format,
                                                           n_top_poses=n_top_poses, n_clusters=n_clusters,
                                                           fig_format=fig_format, num_cpus=num_cpus)
        # save pose trajectories, one file per binding site
        if save_pose_traj:
            pose_dir = "{}/Bound_Poses_{}".format(li.save_dir, li.lipid)
            for bs_id, bs_traj in pose_traj.items():
                bs_traj.save("{}/Pose_traj_BSid{}.{}".format(pose_dir, bs_id, save_pose_traj_format))
        del pose_traj  # save memory space

        # pose_rmsd_data and surface_area_data are kept: the binding site comparison plots reuse them.
        surface_area_data = li.compute_surface_area(binding_site_id=None, radii=radii, fig_format=fig_format)
        data_dir = check_dir(li.save_dir, "Dataset_{}".format(li.lipid))
        pose_rmsd_data.to_csv("{}/Pose_RMSD_data.csv".format(data_dir), index=False, header=True)
        surface_area_data.to_csv("{}/Surface_Area_data.csv".format(data_dir), index=True, header=True)
        li.write_site_info(sort_residue="Residence Time")

    if pdb_file_to_map is not None:
        li.save_pymol_script(pdb_file_to_map)

    #### write and save data
    residue_items = ["Residence Time", "Duration", "Occupancy", "Lipid Count"]
    for item in ["Dataset", "Duration", "Occupancy", "Lipid Count", "CorrCoef"]:
        li.save_data(item=item)
    for item in residue_items:
        li.save_coordinate(item=item)
    for item in residue_items:
        li.plot(item=item, fig_close=True, fig_format=fig_format)
        li.plot_logo(item=item, fig_close=True, fig_format=fig_format)

    #### plot binding site comparison.
    def _style_and_save(fig, ax, fig_path):
        """Apply the shared tick-label style to ``ax``, then save ``fig`` to ``fig_path`` and close it."""
        for label in ax.xaxis.get_ticklabels()+ax.yaxis.get_ticklabels():
            plt.setp(label, fontsize=12, weight="normal")
        fig.tight_layout()
        fig.savefig(fig_path, dpi=200)
        plt.close(fig)

    def _plot_ranked_sites(values, site_ids, ylabel, fig_path, figsize):
        """Scatter ``values`` in descending order, labelling each x tick with the matching entry of ``site_ids``.

        ``values`` and ``site_ids`` are numpy arrays of equal length; the same ordering is applied to both.
        """
        order = np.argsort(values)[::-1]
        fig, ax = plt.subplots(1, 1, figsize=figsize)
        ax.scatter(np.arange(len(values)), values[order], s=50, color="red")
        ax.set_xticks(np.arange(len(values)))
        ax.set_xticklabels(site_ids[order])
        ax.set_xlabel("Binding Site ID", fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        _style_and_save(fig, ax, fig_path)

    def _plot_site_scatter(x_values, y_values, xlabel, ylabel, fig_path, figsize):
        """Scatter one per-site quantity against another and save the figure to ``fig_path``."""
        fig, ax = plt.subplots(1, 1, figsize=figsize)
        ax.scatter(x_values, y_values, s=50, color="red")
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        _style_and_save(fig, ax, fig_path)

    if len(li.node_list) > 0:
        for item in ["Duration BS", "Occupancy BS"]:
            li.save_data(item=item)

        # The figures below are produced once, after (not inside) the save_data loop.
        ylabel_timeunit = 'ns' if li.timeunit == "ns" else r"$\mu$s"
        ylabel_dict = {"Residence Time": "Residence Time ({})".format(ylabel_timeunit),
                       "Duration": "Duration ({})".format(ylabel_timeunit),
                       "Occupancy": "Occupancy (100%)",
                       "Lipid Count": "Lipid Count (num.)"}
        bs_figsize = (len(li.node_list)*0.5, 2.6)
        fig_prefix = "{}/{}".format(li.save_dir, li.lipid)

        def _site_value(bs_id, column):
            """Return the first unique value of ``column`` among the li.dataset rows of binding site ``bs_id``."""
            return li.dataset[li.dataset["Binding Site ID"] == bs_id][column].unique()[0]

        # Rows with Binding Site ID -1 are skipped (presumably residues not assigned to a site -- confirm).
        binding_site_IDs = np.sort(
                 [int(bs_id) for bs_id in li.dataset["Binding Site ID"].unique() if bs_id != -1])

        # plot No. 1: each binding site metric, ranked across binding sites
        for item in ["Residence Time", "Duration", "Occupancy", "Lipid Count"]:
            item_values = np.array(
                      [_site_value(bs_id, "Binding Site {}".format(item)) for bs_id in binding_site_IDs])
            _plot_ranked_sites(item_values, binding_site_IDs, ylabel_dict[item],
                               "{}_{}_v_binding_site.{}".format(fig_prefix, "_".join(item.split()), fig_format),
                               bs_figsize)

        # plot No. 2: mean pose RMSD, ranked. Only sites that have a column in pose_rmsd_data are used.
        binding_site_IDs_RMSD = np.sort([int(bs_id) for bs_id in binding_site_IDs
                                        if f"Binding Site {bs_id}" in pose_rmsd_data.columns])
        RMSD_averages = np.array(
                     [pose_rmsd_data[f"Binding Site {bs_id}"].dropna().mean()
                      for bs_id in binding_site_IDs_RMSD])
        _plot_ranked_sites(RMSD_averages, binding_site_IDs_RMSD, "RMSD (nm)",
                           "{}_RMSD_v_binding_site.{}".format(fig_prefix, fig_format), bs_figsize)

        # plot No. 3: mean surface area, ranked
        surface_area_averages = np.array(
                       [surface_area_data["Binding Site {}".format(bs_id)].dropna().mean()
                        for bs_id in binding_site_IDs])
        _plot_ranked_sites(surface_area_averages, binding_site_IDs, r"Surface Area (nm$^2$)",
                           "{}_surface_area_v_binding_site.{}".format(fig_prefix, fig_format), bs_figsize)

        # plot No. 4: residence time vs. mean pose RMSD (same site subset as plot No. 2)
        res_time_BS = np.array(
                  [_site_value(bs_id, "Binding Site Residence Time") for bs_id in binding_site_IDs_RMSD])
        _plot_site_scatter(res_time_BS, RMSD_averages, ylabel_dict["Residence Time"], "RMSD (nm)",
                           "{}_Residence_Time_v_RMSD.{}".format(fig_prefix, fig_format), bs_figsize)

        # plot No. 5: residence time vs. mean surface area (all sites)
        res_time_BS = np.array(
                  [_site_value(bs_id, "Binding Site Residence Time") for bs_id in binding_site_IDs])
        _plot_site_scatter(res_time_BS, surface_area_averages, ylabel_dict["Residence Time"],
                           r"Surface Area (nm$^2$)",
                           "{}_Residence_Time_v_surface_area.{}".format(fig_prefix, fig_format), bs_figsize)