Commit 3432d5d4 authored by Hu, Anan
Browse files

heatmap & PCA

parent 7d689b4f
......@@ -6,9 +6,12 @@ BIF30806 Project group: Araformatic
Script to parse gene expression data and visualize heatmap and PCA plot
Based on Marnix H. Medema, "hclustering_answer.py",
Bioinformatics Group, Wageningen University
Usage: python3 cluster.py gene_expression_file
gene_expression_file: txt file with list of genes as rows, leaf names as columns,
Usage: python3 cluster.py gene_expression_file.txt heatmap_name.pdf PCA_plot_name.pdf
gene_expression_file.txt: tab-separated txt file with genes as rows and leaf names as columns;
values are log2FoldChange
heatmap_name.pdf: string, name of the output heatmap file
PCA_plot_name.pdf: string, name of the output PCA plot file
Output: two PDF files (the heatmap and the PCA plot)
"""
import sys
......@@ -46,6 +49,9 @@ def parse_expression_data(lines):
for i in range(len(leaf_names)):
fc_value = float(line.split("\t")[i+1])
fc_dict[leaf_names[i]].append(fc_value) # key: leaf name, value: log2FoldChange
data_frame = pd.DataFrame(fc_dict, index=gene_names)
return data_frame, gene_names, leaf_names
......@@ -72,18 +78,17 @@ def cluster_by_genes(transposed_df, gene_names):
clustering = sch.linkage(distances,method='complete') # Perform hierarchical clustering
tree = sch.dendrogram(clustering, leaf_font_size=2, color_threshold=1,\
labels = gene_names)
# plt.savefig('dendrogram_gene.pdf', format="PDF")
plt.gcf().clear()
return clustering
def generate_heatmap(transposed_df, heatmap_name='heatmap of DEGs.pdf'):
    """Draw a seaborn clustermap of the expression data and save it as a PDF.

    transposed_df: Pandas dataframe, expression data
    heatmap_name: string, name of the output heatmap file; defaults to
        'heatmap of DEGs.pdf', the name that was hard-coded before the
        output name became an argument, so one-argument calls still work
    """
    # The open parenthesis already continues the call across lines, so no
    # trailing backslash is needed.
    cluster = sns.clustermap(
        transposed_df,
        method='complete',
        metric='correlation',
        col_cluster=True,
        figsize=(31, 31),
        cmap="vlag",
    )
    # Save exactly once, to the caller-supplied file name.
    cluster.savefig(heatmap_name, format="PDF")
    plt.show()
    # Clear the current figure so later plotting calls start from an empty one.
    plt.gcf().clear()
......@@ -101,7 +106,7 @@ def get_clustercolors_from_data(clustering):
colors = [cm.jet((idx*division)) for idx in assignments]
return colors
def perform_PCA(transposed_df, colors, leaf_names):
def perform_PCA(transposed_df, colors, leaf_names, pca_name):
"""Perform PCA plot.
transposed_df: Pandas dataframe, expression data
......@@ -125,12 +130,14 @@ def perform_PCA(transposed_df, colors, leaf_names):
plt.annotate(leaf_names[i], xy=(X_transf[i, 0], X_transf[i, 1]),\
xytext=(X_transf[i, 0] + 0.1, X_transf[i, 1] + 0.1), fontsize=5)
plt.title('PCA Gene Expression', fontsize=15)
plt.savefig('PCA.pdf', format="PDF")
plt.savefig(pca_name, format="PDF")
def main():
# grab input file names from the command line
# grab file names from the command line
INPUT_FN = sys.argv[1]
input_file = open(INPUT_FN, "r").readlines()
heatmap_name = sys.argv[2]
pca_name = sys.argv[3]
# parse data into DataFrame
data_frame, gene_names, leaf_names = parse_expression_data(input_file)
......@@ -143,12 +150,12 @@ def main():
clustering = cluster_by_genes(transposed_df, gene_names)
# generate clustermap
generate_heatmap(transposed_df)
generate_heatmap(transposed_df, heatmap_name)
# perform PCA
colors = get_clustercolors_from_data(clustering)
transposed_pca = transposed_df.transpose() # PCA plot with leaves
perform_PCA(transposed_pca, colors, leaf_names)
perform_PCA(transposed_pca, colors, leaf_names, pca_name)
if __name__ == "__main__":
    # Run the pipeline only when the file is executed as a script, not when
    # it is imported as a module.
    main()
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment