Commit 5f1dc14c authored by Hu, Anan's avatar Hu, Anan
Browse files

Upload New File

parent f4ca7176
#!/usr/bin/env python
"""
BIF30806 Project group: Araformatic
Script to parse gene expression data and visualize heatmap and PCA plot
Based on Marnix H. Medema, "hclustering_answer.py",
Bioinformatics Group, Wageningen University
Usage: python3 cluster.py gene_expression_file
gene_expression_file: txt file with list of genes as rows, leaf names as columns,
values are log2FoldChange
"""
import sys
import numpy
import scipy
import scipy.cluster.hierarchy as sch
import pandas as pd
import matplotlib.pylab as plt
from matplotlib import cm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns; sns.set()
def parse_expression_data(lines):
"""Parse input file with gene expression data into Pandas dataframe.
lines: list, lines of the output of differential expression analysis
Returns:
data_frame: Pandas dataframe, expression data
gene_names: list, names of all genes
leaf_names: list, names of all leaves
"""
gene_names = []
fc_dict = {}
for line in lines:
line = line.strip()
if not line:
continue
if line.startswith("Gene"):
leaf_names = line.split("\t")[1:]
for leaf in leaf_names:
fc_dict[leaf] = []
else:
gene_names.append(line.split("\t")[0])
for i in range(len(leaf_names)):
fc_value = float(line.split("\t")[i+1])
fc_dict[leaf_names[i]].append(fc_value) # key: leaf name, value: log2FoldChange
data_frame = pd.DataFrame(fc_dict, index=gene_names)
return data_frame, gene_names, leaf_names
def cluster_by_leaves(data_frame):
"""Cluster by leaves.
data_frame: Pandas dataframe, expression data
"""
distances = sch.distance.pdist(data_frame, 'correlation') # Calculate pairwise distances
# to change method, google "sch.linkage documentation"
clustering = sch.linkage(distances, method='complete') # Perform hierarchical clustering
tree = sch.dendrogram(clustering, labels=data_frame.index.values) # Generate clustering dendrogram
plt.gcf().clear()
def cluster_by_genes(transposed_df, gene_names):
"""Cluster by genes.
transposed_df: Pandas dataframe, expression data
gene_names: list, names of all genes
Returns:
clustering: ndarray, hierarchical clustering linkage matrix
"""
distances = sch.distance.pdist(transposed_df, 'correlation') # Calculate pairwise distances
clustering = sch.linkage(distances,method='complete') # Perform hierarchical clustering
tree = sch.dendrogram(clustering, leaf_font_size=2, color_threshold=1,\
labels = gene_names)
# plt.savefig('dendrogram_gene.pdf', format="PDF")
plt.gcf().clear()
return clustering
def generate_heatmap(transposed_df):
"""Generate 2-D heatmap.
transposed_df: Pandas dataframe, expression data
"""
cluster = sns.clustermap(transposed_df, method='complete', metric='correlation',\
col_cluster=True, figsize=(31,31), cmap="vlag")
cluster.savefig('heatmap of DEGs.pdf', format="PDF")
plt.show()
plt.gcf().clear()
def get_clustercolors_from_data(clustering):
"""Get cluster color assignments from the data.
clustering: ndarray, hierarchical clustering linkage matrix
Returns:
colors: list of color codes for clusters from jet color map
"""
assignments = sch.fcluster(clustering, t=1, criterion='distance')
# Divide color palette into colors equally far apart according to nr of clusters
division = int(256 / len(list(set(assignments))))
color_indices = [idx*division for idx in assignments]
colors = [cm.jet((idx*division)) for idx in assignments]
return colors
def perform_PCA(transposed_df, colors, leaf_names):
"""Perform PCA plot.
transposed_df: Pandas dataframe, expression data
colors: list of color codes for clusters from jet color map
leaf_names: list, names of all leaves
"""
#Standardize
X_std = StandardScaler().fit_transform(transposed_df)
#PCA
sklearn_pca = PCA(n_components=2)
X_transf = sklearn_pca.fit_transform(X_std)
#Plot the data
plt.scatter(X_transf[:,0], X_transf[:,1], c=colors, s=10)
# adjust axis
# plt.axis([-100,100,-100,100])
# add labels for axes
plt.xlabel("PC1")
plt.ylabel("PC2")
# add labels for dots
for i in range(len(X_transf)):
plt.annotate(leaf_names[i], xy=(X_transf[i, 0], X_transf[i, 1]),\
xytext=(X_transf[i, 0] + 0.1, X_transf[i, 1] + 0.1), fontsize=5)
plt.title('PCA Gene Expression', fontsize=15)
plt.savefig('PCA.pdf', format="PDF")
def main():
# grab input file names from the command line
INPUT_FN = sys.argv[1]
input_file = open(INPUT_FN, "r").readlines()
# parse data into DataFrame
data_frame, gene_names, leaf_names = parse_expression_data(input_file)
data_frame = data_frame.transpose()
print(leaf_names)
# cluster
cluster_by_leaves(data_frame)
transposed_df = data_frame.transpose()
clustering = cluster_by_genes(transposed_df, gene_names)
# generate clustermap
generate_heatmap(transposed_df)
# perform PCA
colors = get_clustercolors_from_data(clustering)
transposed_pca = transposed_df.transpose() # PCA plot with leaves
perform_PCA(transposed_pca, colors, leaf_names)
if __name__ == "__main__":
main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment