Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Araformatics group
Araformatics project
Commits
5f1dc14c
Commit
5f1dc14c
authored
Dec 06, 2021
by
Hu, Anan
Browse files
Upload New File
parent
f4ca7176
Changes
1
Hide whitespace changes
Inline
Side-by-side
scripts/cluster_final.py
0 → 100644
View file @
5f1dc14c
#!/usr/bin/env python
"""
BIF30806 Project group: Araformatic
Script to parse gene expression data and visualize heatmap and PCA plot
Based on Marnix H. Medema, "hclustering_answer.py",
Bioinformatics Group, Wageningen University
Usage: python3 cluster.py gene_expression_file
gene_expression_file: txt file with list of genes as rows, leaf names as columns,
values are log2FoldChange
"""
import
sys
import
numpy
import
scipy
import
scipy.cluster.hierarchy
as
sch
import
pandas
as
pd
import
matplotlib.pylab
as
plt
from
matplotlib
import
cm
from
sklearn.decomposition
import
PCA
from
sklearn.preprocessing
import
StandardScaler
import
seaborn
as
sns
;
sns
.
set
()
def
parse_expression_data
(
lines
):
"""Parse input file with gene expression data into Pandas dataframe.
lines: list, lines of the output of differential expression analysis
Returns:
data_frame: Pandas dataframe, expression data
gene_names: list, names of all genes
leaf_names: list, names of all leaves
"""
gene_names
=
[]
fc_dict
=
{}
for
line
in
lines
:
line
=
line
.
strip
()
if
not
line
:
continue
if
line
.
startswith
(
"Gene"
):
leaf_names
=
line
.
split
(
"
\t
"
)[
1
:]
for
leaf
in
leaf_names
:
fc_dict
[
leaf
]
=
[]
else
:
gene_names
.
append
(
line
.
split
(
"
\t
"
)[
0
])
for
i
in
range
(
len
(
leaf_names
)):
fc_value
=
float
(
line
.
split
(
"
\t
"
)[
i
+
1
])
fc_dict
[
leaf_names
[
i
]].
append
(
fc_value
)
# key: leaf name, value: log2FoldChange
data_frame
=
pd
.
DataFrame
(
fc_dict
,
index
=
gene_names
)
return
data_frame
,
gene_names
,
leaf_names
def
cluster_by_leaves
(
data_frame
):
"""Cluster by leaves.
data_frame: Pandas dataframe, expression data
"""
distances
=
sch
.
distance
.
pdist
(
data_frame
,
'correlation'
)
# Calculate pairwise distances
# to change method, google "sch.linkage documentation"
clustering
=
sch
.
linkage
(
distances
,
method
=
'complete'
)
# Perform hierarchical clustering
tree
=
sch
.
dendrogram
(
clustering
,
labels
=
data_frame
.
index
.
values
)
# Generate clustering dendrogram
plt
.
gcf
().
clear
()
def
cluster_by_genes
(
transposed_df
,
gene_names
):
"""Cluster by genes.
transposed_df: Pandas dataframe, expression data
gene_names: list, names of all genes
Returns:
clustering: ndarray, hierarchical clustering linkage matrix
"""
distances
=
sch
.
distance
.
pdist
(
transposed_df
,
'correlation'
)
# Calculate pairwise distances
clustering
=
sch
.
linkage
(
distances
,
method
=
'complete'
)
# Perform hierarchical clustering
tree
=
sch
.
dendrogram
(
clustering
,
leaf_font_size
=
2
,
color_threshold
=
1
,
\
labels
=
gene_names
)
# plt.savefig('dendrogram_gene.pdf', format="PDF")
plt
.
gcf
().
clear
()
return
clustering
def
generate_heatmap
(
transposed_df
):
"""Generate 2-D heatmap.
transposed_df: Pandas dataframe, expression data
"""
cluster
=
sns
.
clustermap
(
transposed_df
,
method
=
'complete'
,
metric
=
'correlation'
,
\
col_cluster
=
True
,
figsize
=
(
31
,
31
),
cmap
=
"vlag"
)
cluster
.
savefig
(
'heatmap of DEGs.pdf'
,
format
=
"PDF"
)
plt
.
show
()
plt
.
gcf
().
clear
()
def
get_clustercolors_from_data
(
clustering
):
"""Get cluster color assignments from the data.
clustering: ndarray, hierarchical clustering linkage matrix
Returns:
colors: list of color codes for clusters from jet color map
"""
assignments
=
sch
.
fcluster
(
clustering
,
t
=
1
,
criterion
=
'distance'
)
# Divide color palette into colors equally far apart according to nr of clusters
division
=
int
(
256
/
len
(
list
(
set
(
assignments
))))
color_indices
=
[
idx
*
division
for
idx
in
assignments
]
colors
=
[
cm
.
jet
((
idx
*
division
))
for
idx
in
assignments
]
return
colors
def
perform_PCA
(
transposed_df
,
colors
,
leaf_names
):
"""Perform PCA plot.
transposed_df: Pandas dataframe, expression data
colors: list of color codes for clusters from jet color map
leaf_names: list, names of all leaves
"""
#Standardize
X_std
=
StandardScaler
().
fit_transform
(
transposed_df
)
#PCA
sklearn_pca
=
PCA
(
n_components
=
2
)
X_transf
=
sklearn_pca
.
fit_transform
(
X_std
)
#Plot the data
plt
.
scatter
(
X_transf
[:,
0
],
X_transf
[:,
1
],
c
=
colors
,
s
=
10
)
# adjust axis
# plt.axis([-100,100,-100,100])
# add labels for axes
plt
.
xlabel
(
"PC1"
)
plt
.
ylabel
(
"PC2"
)
# add labels for dots
for
i
in
range
(
len
(
X_transf
)):
plt
.
annotate
(
leaf_names
[
i
],
xy
=
(
X_transf
[
i
,
0
],
X_transf
[
i
,
1
]),
\
xytext
=
(
X_transf
[
i
,
0
]
+
0.1
,
X_transf
[
i
,
1
]
+
0.1
),
fontsize
=
5
)
plt
.
title
(
'PCA Gene Expression'
,
fontsize
=
15
)
plt
.
savefig
(
'PCA.pdf'
,
format
=
"PDF"
)
def
main
():
# grab input file names from the command line
INPUT_FN
=
sys
.
argv
[
1
]
input_file
=
open
(
INPUT_FN
,
"r"
).
readlines
()
# parse data into DataFrame
data_frame
,
gene_names
,
leaf_names
=
parse_expression_data
(
input_file
)
data_frame
=
data_frame
.
transpose
()
print
(
leaf_names
)
# cluster
cluster_by_leaves
(
data_frame
)
transposed_df
=
data_frame
.
transpose
()
clustering
=
cluster_by_genes
(
transposed_df
,
gene_names
)
# generate clustermap
generate_heatmap
(
transposed_df
)
# perform PCA
colors
=
get_clustercolors_from_data
(
clustering
)
transposed_pca
=
transposed_df
.
transpose
()
# PCA plot with leaves
perform_PCA
(
transposed_pca
,
colors
,
leaf_names
)
if
__name__
==
"__main__"
:
main
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment