Commit ef27ec0c authored by Molemaker, Maureen's avatar Molemaker, Maureen
Browse files

Merge branch 'main' of git.wur.nl:araformatics-group/araformatics-project into main

parents 576f63d8 57f4f8cd
......@@ -7,7 +7,6 @@
# import
library(DESeq2)
# Functions
run_analysis <- function(file_name) {
data <- read.table(file_name, row.names = 1, header=TRUE)
......@@ -29,8 +28,8 @@ run_analysis <- function(file_name) {
# do glm test
dds = nbinomWaldTest(dds)
res = results(dds)
write.table(res, col.names=NA, row.names=T, file =paste(file_name, ".tsv"), sep ="\t")
res = results(dds, lfcThreshold=1, altHypothesis="greaterAbs")
write.table(res, col.names=NA, row.names=T, file =paste(file_name,".tsv", collapse="", sep=""), sep ="\t", quote=T)
}
......
#!/usr/bin/env Rscript
#BIF30806 Project group: Araformatic
#(2021)
#GO enrichment
#Script to run topGO package for testing GO terms of differential gene
#expression analysis
# Script written based on:
# https://ucdavis-bioinformatics-training.github.io/2018-June-RNA-Seq-Workshop/friday/enrichment.html
# Documentation available of topGO usage
# https://www.r-bloggers.com/2015/09/passing-arguments-to-an-r-script-from-command-lines/
#Usage: Rscript GOenrichment_proj.R input_file.txt
# input_file: txt file, genes as rows, leafs as columns and values
# are adjusted p value from DGE
#Take arguments from command line
# Read positional arguments supplied on the command line (after --args).
args <- commandArgs(trailingOnly = TRUE)
# Test if there is at least one argument: if not, return an error.
if (length(args) == 0) {
  stop("Argument missing", call. = FALSE)
} else if (length(args) == 1) {
  # Exactly one argument: treat it as the input file.
  print("Input file specified")
}
# Install Bioconductor dependencies.
# NOTE: the historical biocLite.R bootstrap (source("https://bioconductor.org/biocLite.R"))
# was removed from Bioconductor (deprecated since Bioconductor 3.8), so the old
# biocLite() call errors on modern R. BiocManager is the supported installer.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Only install packages that are not already available, to avoid
# re-downloading on every run of the script.
needed <- c("topGO", "KEGGREST", "org.At.tair.db", "Rgraphviz", "multtest")
missing <- needed[!vapply(needed, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing) > 0) {
  BiocManager::install(missing)
}
library(topGO)
library("org.At.tair.db")
library(KEGGREST)
library(AnnotationDbi)
library(Rgraphviz)
library(multtest, verbose = FALSE)
####### ADJUSTED P-VALUE #http://rileylab.org/wp-content/uploads/2016/08/multtest.pdf
#useful info: #multiple_dexp[,c(1,which(colnames(multiple_dexp) == current_leaf))]
#GO enrichment on multiple leafs(samples)
#list with samples names
# Input table layout: first column (AGI) holds gene identifiers, remaining
# columns hold adjusted p-values from the DGE analysis, one column per leaf.
multiple_dexp<- read.delim(args[1],stringsAsFactors = F)
#multiple_dexp<- read.delim("multiple_input.txt",stringsAsFactors = F)
sample_names <-colnames(multiple_dexp)# all names
# Columns 2..n are the leaf/sample columns (column 1 is the gene id).
sample_names[2:length(sample_names)]
# Significance threshold applied to the adjusted p-values.
pcutoff <- 0.05
#gene names from DGE analysis output
genes <- multiple_dexp$AGI
# Run the whole GO enrichment + FDR correction + graph export once per leaf.
for(leaf in sample_names[2:length(sample_names)]){
current_leaf = leaf
#filtering genes.replace to 1 if above threshold, otherwise 0 (significant)
# NOTE(review): the ifelse() actually marks genes with p < pcutoff as 1,
# i.e. 1 = significant, which contradicts the wording of the line above.
filter_multiple <- ifelse(multiple_dexp[,which(colnames(multiple_dexp) == current_leaf)] < pcutoff,1,0) # adj.P.Val name of the column
names(filter_multiple) <- genes
geneList<- filter_multiple
# create topGOdata object
# Biological Process ontology; the "interesting" gene set is everything
# scored 1 by the filter above, selected via geneSelectionFun.
GOdata_sample <- new("topGOdata",
ontology = "BP",
allGenes = geneList,
geneSelectionFun = function(x)(x == 1),
annot = annFUN.org, mapping = "org.At.tair.db")
#Run the Fishers exact Test (raw p-vals)
# "elim" removes genes of significant child GO terms before testing their
# parents, reducing redundant hits compared with the classic algorithm.
fisher_sample <- runTest(GOdata_sample, algorithm = "elim", statistic = "fisher")
# topNodes = number of scored GO terms, so the table contains every term.
sample_result <- GenTable(GOdata_sample, raw.p.value = fisher_sample,
topNodes = length(fisher_sample@score), numChar = 120)
# print(sample_result) #shows some statistics
#multiple testing correction (FDR p value)
#http://www.bioconductor.org/packages/release/bioc/html/multtest.html
raw_pvalues <- as.double(sample_result$raw.p.value)
# Benjamini-Yekutieli ("BY") correction, valid under arbitrary dependence.
cor_pval <- mt.rawp2adjp(raw_pvalues, "BY")
adjp <-cor_pval$adjp #if want order_adjp <- res_FDR$adjp[order(res_FDR$index), ]
#Add FDR p value to topGO output
# NOTE(review): mt.rawp2adjp returns adjp sorted by raw p-value, not in the
# original row order of sample_result -- the commented $index reordering hint
# above suggests the cbind alignment should be verified.
FDR_by_sample <- sample_result
FDR_by_sample <- cbind(FDR_by_sample, adjp)
#GOgraph -> with raw pval
par(cex = 0.3)
showSigOfNodes(GOdata_sample, score(fisher_sample), firstSigNodes = 5, useInfo = 'all')
printGraph(GOdata_sample, fisher_sample, firstSigNodes = 5, fn.prefix = current_leaf, useInfo = "def", pdfSW = TRUE)#termsP.value named vector of p-values.
#dev.off()
#write table with information per leaf
table = paste(current_leaf,"txt",sep=".")
write.table(FDR_by_sample, col.names=T, row.names=F, file = table, sep ="\t",
quote = F)
}
#!/usr/bin/env python
"""
BIF30806 Project group: Araformatic
Script to parse gene expression data and visualize heatmap and PCA plot
Based on Marnix H. Medema, "hclustering_answer.py",
Bioinformatics Group, Wageningen University
Usage: python3 cluster.py gene_expression_file
gene_expression_file: txt file with list of genes as rows, leaf names as columns,
values are log2FoldChange
"""
import sys
import numpy
import scipy
import scipy.cluster.hierarchy as sch
import pandas as pd
import matplotlib.pylab as plt
from matplotlib import cm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns; sns.set()
def parse_expression_data(lines):
    """Parse gene expression data into a Pandas dataframe.

    lines: list of str, lines of the differential expression output; the
        header line starts with "Gene" and names one leaf per column, every
        other non-empty line is "<gene>\\t<value>..." with one log2FoldChange
        per leaf.

    Returns:
        data_frame: Pandas dataframe, genes as rows, leaves as columns
        gene_names: list, names of all genes
        leaf_names: list, names of all leaves
    """
    gene_names = []
    leaf_names = []  # initialized so a missing header fails loudly below
    fc_dict = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue
        fields = line.split("\t")  # split once per line, not once per column
        if line.startswith("Gene"):
            leaf_names = fields[1:]
            for leaf in leaf_names:
                fc_dict[leaf] = []
        else:
            if not leaf_names:
                raise ValueError("data line encountered before 'Gene' header")
            gene_names.append(fields[0])
            for i, leaf in enumerate(leaf_names):
                # key: leaf name, value: log2FoldChange
                fc_dict[leaf].append(float(fields[i + 1]))
    data_frame = pd.DataFrame(fc_dict, index=gene_names)
    return data_frame, gene_names, leaf_names
def cluster_by_leaves(data_frame):
    """Hierarchically cluster the rows (leaves) of the expression table.

    data_frame: Pandas dataframe, expression data
    """
    # Pairwise correlation distances between rows
    pairwise = sch.distance.pdist(data_frame, 'correlation')
    # Complete-linkage hierarchical clustering
    # (to change method, google "sch.linkage documentation")
    linkage_matrix = sch.linkage(pairwise, method='complete')
    # Draw the clustering dendrogram, labelled with the row names
    sch.dendrogram(linkage_matrix, labels=data_frame.index.values)
    plt.gcf().clear()
def cluster_by_genes(transposed_df, gene_names):
    """Hierarchically cluster the genes of the expression table.

    transposed_df: Pandas dataframe, expression data
    gene_names: list, names of all genes

    Returns:
        clustering: ndarray, hierarchical clustering linkage matrix
    """
    # Pairwise correlation distances between rows of the transposed table
    pairwise = sch.distance.pdist(transposed_df, 'correlation')
    # Complete-linkage hierarchical clustering
    linkage_matrix = sch.linkage(pairwise, method='complete')
    # Dendrogram with tiny labels so many genes stay readable
    sch.dendrogram(linkage_matrix, leaf_font_size=2, color_threshold=1,
                   labels=gene_names)
    # plt.savefig('dendrogram_gene.pdf', format="PDF")
    plt.gcf().clear()
    return linkage_matrix
def generate_heatmap(transposed_df):
    """Draw and save a clustered 2-D heatmap of the expression data.

    transposed_df: Pandas dataframe, expression data
    """
    # Cluster both axes with complete linkage on correlation distance
    grid = sns.clustermap(
        transposed_df,
        method='complete',
        metric='correlation',
        col_cluster=True,
        figsize=(31, 31),
        cmap="vlag",
    )
    grid.savefig('heatmap of DEGs.pdf', format="PDF")
    plt.show()
    plt.gcf().clear()
def get_clustercolors_from_data(clustering):
    """Assign a jet-colormap colour to every clustered item.

    clustering: ndarray, hierarchical clustering linkage matrix

    Returns:
        colors: list of color codes for clusters from jet color map
    """
    # Cut the tree at distance 1 to obtain flat cluster labels (1..k)
    assignments = sch.fcluster(clustering, t=1, criterion='distance')
    # Divide color palette into colors equally far apart according to nr
    # of clusters (the jet palette has 256 entries).
    division = 256 // len(set(assignments))
    # bug fix: the old code also built an identical, unused `color_indices`
    # list; only the colour lookup is needed.
    colors = [cm.jet(label * division) for label in assignments]
    return colors
def perform_PCA(transposed_df, colors, leaf_names):
    """Draw and save a 2-component PCA scatter plot of the expression data.

    transposed_df: Pandas dataframe, expression data
    colors: list of color codes for clusters from jet color map
    leaf_names: list, names of all leaves
    """
    # Standardize each feature to zero mean / unit variance
    standardized = StandardScaler().fit_transform(transposed_df)
    # Project onto the first two principal components
    pca_model = PCA(n_components=2)
    projected = pca_model.fit_transform(standardized)
    # Scatter plot of the projected samples
    plt.scatter(projected[:, 0], projected[:, 1], c=colors, s=10)
    # adjust axis
    # plt.axis([-100,100,-100,100])
    # add labels for axes
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    # Label every dot with its leaf name, slightly offset from the point
    for idx in range(len(projected)):
        plt.annotate(leaf_names[idx],
                     xy=(projected[idx, 0], projected[idx, 1]),
                     xytext=(projected[idx, 0] + 0.1, projected[idx, 1] + 0.1),
                     fontsize=5)
    plt.title('PCA Gene Expression', fontsize=15)
    plt.savefig('PCA.pdf', format="PDF")
def main():
    """Parse expression data, cluster it, and draw heatmap and PCA plots."""
    # grab input file name from the command line
    input_fn = sys.argv[1]
    # bug fix: the old open(...).readlines() never closed the file handle;
    # the with-statement guarantees it is closed
    with open(input_fn, "r") as in_file:
        input_lines = in_file.readlines()
    # parse data into DataFrame
    data_frame, gene_names, leaf_names = parse_expression_data(input_lines)
    data_frame = data_frame.transpose()
    print(leaf_names)
    # cluster
    cluster_by_leaves(data_frame)
    transposed_df = data_frame.transpose()
    clustering = cluster_by_genes(transposed_df, gene_names)
    # generate clustermap
    generate_heatmap(transposed_df)
    # perform PCA
    colors = get_clustercolors_from_data(clustering)
    transposed_pca = transposed_df.transpose()  # PCA plot with leaves
    perform_PCA(transposed_pca, colors, leaf_names)


if __name__ == "__main__":
    main()
......@@ -3,7 +3,7 @@
Author: Joran Schoorlemmer
Student nr: 1004586
Description: Extract counts from coverage data in gtf files and write to csv
Usage: python3 extractcounts.py <reference.gtf> <listofsamplegtffiles.txt>
Usage: python3 extractcounts.py <listofreffiles.txt> <listofsamplegtffiles.txt>
"""
......@@ -32,9 +32,11 @@ def parse_gtf(in_file, read_length = 100):
# turn into into int to do calculations
trans_len = abs(int(line_cont[4])-int(line_cont[3]))
gene_info = line_cont[8].split(' ')
gene_name = gene_info[5].lstrip('"').rstrip('";')
cov = float(gene_info[11].lstrip('"').rstrip('";'))
genes[gene_name] = round(cov * trans_len /read_length)
if len(gene_info) == 12:
gene_name = gene_info[5].lstrip('"').rstrip('";')
cov = float(gene_info[7].lstrip('"').rstrip('";'))
if cov > 0.01:
genes[gene_name] = round(cov * trans_len /read_length)
return genes
......@@ -50,7 +52,8 @@ def group_replicates(sample_names):
with open(sample_names) as f_names:
for f in f_names:
f = f.strip("\n")
rep_dict_key = f[:-5]
# strip redundant info out of string
rep_dict_key = f[9:-12]
if not rep_dict_key in rep_dict:
rep_dict[rep_dict_key] = {}
rep_dict[rep_dict_key][f] = parse_gtf('samples/'+ str(f))
......@@ -61,25 +64,30 @@ def group_replicates(sample_names):
def write_csv(ref_dict, group_dict, out_f):
"""write a tab delim csv file with genes and count values
ref_dict: dict, dict with gene name as keys and counts as values
ref_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
group_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
out_f: str, output file name in csv format
"""
with open(out_f, 'w') as f:
f.write("gene_name\treference\t")
f.write("gene_name\t")
# create set of gene names to make sure all rows are present and unique
gene_names = set(list(ref_dict.keys()))
gene_names = set()
for ref in ref_dict:
gene_names.update(list(ref_dict[ref].keys()))
f.write("ref_" + str(ref)+"\t")
for replicate in group_dict:
gene_names.update(list(group_dict[replicate].keys()))
f.write(str(replicate)+"\t")
f.write("\n")
for gene in gene_names:
f.write(str(gene)+"\t")
try:
f.write(str(ref_dict[gene])+'\t')
except: # if gene is not present in gtf, write 0 counts
f.write('0\t')
for ref in ref_dict:
try:
f.write(str(ref_dict[ref][gene])+'\t')
except: # if gene is not present in gtf, write 0 counts
f.write('0\t')
for replicate in group_dict:
try:
f.write(str(group_dict[replicate][gene])+'\t')
......@@ -91,20 +99,22 @@ def write_csv(ref_dict, group_dict, out_f):
def main(ref_name, sample_names):
    """Run functions and loop through sample files

    ref_name: str, filename of txt file containing filenames of references
    sample_names: str, filename of txt file containing filenames of samples
    """
    # NOTE(review): this span contained interleaved pre- and post-merge diff
    # lines; this is the post-merge version (multi-reference ref_dict).
    # Parse all reference gtf files, then keep only the first reference group
    ref_dict = group_replicates(ref_name)
    ref_key = list(ref_dict.keys())
    ref_dict = ref_dict[ref_key[0]]
    print("parsed: reference")
    # Parse every sample gtf file, grouped by replicate
    dict_all = group_replicates(sample_names)
    print("all gtf files parsed\n")
    for group in dict_all:  # loop through groups
        # write files to output folder counts/
        out_file = 'counts/' + str(group) + 'counts.csv'
        write_csv(ref_dict, dict_all[group], out_file)
        print("csv written: " + str(group))


# Run script
if __name__ == "__main__":
    main(argv[1], argv[2])
#!/usr/bin/env python3
"""
Author: Joran Schoorlemmer
Student nr: 1004586
Description: Write log2fold and padj values from tsv files into two csv files
Usage: python3 extractdiffanalysis.py <listofsamplefiles.txt> \
<log2foldoutputfilename.txt> <padjoutputfilename.txt>
"""
# Import
from sys import argv
# Functions
def extractlogfold(csv_file):
    """Extract log2 fold changes from a tab-separated results file.

    csv_file: str, name of input sample csv file (header on the first line,
        gene name in column 0, log2FoldChange in column 2)

    Returns:
        fold_dict: dict, dict with gene names as keys and l2f changes as
            values
    """
    fold_dict = {}  # init dict
    with open(csv_file, 'r') as logf:
        logf.readline()  # skip header line
        for line in logf:
            # bug fix: str.strip() returns a new string; the old code
            # discarded the result, leaving the trailing newline in place
            line = line.strip()
            if not line:
                continue  # tolerate blank/trailing lines
            line_info = line.split('\t')  # write tsv line to list
            gene_name = line_info[0]
            log2fold = float(line_info[2])
            fold_dict[gene_name] = log2fold
    return fold_dict
def extractpadj(csv_file):
    """Extract adjusted p-values from a tab-separated results file.

    csv_file: str, name of input sample csv file (header on the first line,
        gene name in column 0, padj in column 6)

    Returns:
        p_dict: dict, dict with gene names as keys and padj values as values
    """
    p_dict = {}  # init dict
    with open(csv_file, 'r') as logf:
        logf.readline()  # skip header line
        for line in logf:
            # bug fix: str.strip() returns a new string; the old code
            # discarded the result
            line = line.strip()
            if not line:
                continue  # tolerate blank/trailing lines
            line_info = line.split('\t')  # write tsv line to list
            gene_name = line_info[0]
            padj_field = line_info[6]
            # DESeq2 writes a literal NA for genes excluded from the padj
            # calculation; skip them instead of crashing on float("NA")
            if padj_field in ("NA", ""):
                continue
            p_dict[gene_name] = float(padj_field)
    return p_dict
def writecsv(sample_dict, out_file):
    """Write sample dict into a tab-delimited csv file.

    sample_dict: dict, dict with samples as keys and dicts with gene names
        as keys as values
    out_file: str, filename of output csv file
    """
    with open(out_file, 'w') as out_f:
        out_f.write('Gene\t')
        gene_names = set()
        # write sample columns and collect the union of all gene names
        for sample in sample_dict:
            gene_names.update(sample_dict[sample].keys())
            out_f.write(str(sample[0:4]) + "\t")  # parse sample name
        out_f.write('\n')
        for gene in gene_names:
            out_f.write(str(gene) + "\t")
            for sample in sample_dict:
                try:
                    out_f.write(str(sample_dict[sample][gene]) + '\t')
                # bug fix: narrowed the bare except, which could also hide
                # unrelated errors (e.g. typos), to the lookup failure it
                # is meant to handle
                except KeyError:
                    # if gene is not present in this sample, write NA
                    out_f.write('NA\t')
            out_f.write('\n')
# Main
def main(csv_names, out_file_fold, out_file_p):
    """Run extraction functions and loop through the sample files

    csv_names: str, filename of txt file with sample file names
    out_file_fold: str, filename of l2f output csv file
    out_file_p: str, filename of padj output csv file
    """
    # init dicts: one entry per listed sample file
    all_fold = {}
    all_padj = {}
    with open(csv_names, 'r') as name_file:
        # loop through the listed sample files
        for raw_name in name_file:
            sample_fn = raw_name.strip('\n')
            all_fold[sample_fn] = extractlogfold(sample_fn)
            all_padj[sample_fn] = extractpadj(sample_fn)
    # write csv files
    writecsv(all_fold, out_file_fold)
    writecsv(all_padj, out_file_p)


# Run script
if __name__ == "__main__":
    main(argv[1], argv[2], argv[3])
\ No newline at end of file
"""
Author: Maureen Molemaker, studentnumber
Author: Liset Eendebak, studentnumber 1014355
script to map the fastq reads onto the reference genome
and to annotate the transcripts
Usage: snakemake -s mapping -c 1 # use 1 core
"""
SAMPLES = ["20180613.A-13R1", "20180613.A-13R2", "20180613.A-13R3",
"20180613.A-13R4","20180613.A-13R5","20180605.A-8R1", "20180605.A-8R2",
"20180605.A-8R3","20180605.A-8R4","20180605.A-8R5", "20180605.A-12R1",
"20180605.A-12R2","20180613.A-12R3","20180605.A-12R4","20180605.A-12R5",
"20180613.A-5R1","20180605.A-5R2", "20180613.A-5R3","20180605.A-5R4",
"20180605.A-5R5","20180613.A-6R1","20180605.A-6R2","20180605.A-6R3",
"20180613.A-6R4","20180605.A-6R5", "20180613.A-1R1", "20180605.A-1R2",
"20180613.A-1R3", "20180605.A-1R4", "20180605.A-1R5", "20180605.A-30R1",
"20180605.A-30R2","20180605.A-30R3", "20180605.A-30R4",
"20180605.A-30R5"]
rule all:
input:
expand("{sample}_hisat2.gtf", sample=SAMPLES)
rule mapping:
input:
reads = "../transcriptome/{sample}_R1.fastq.gz",
index = "../Arabidopsis_thaliana/TAIR.1.ht2",
output:
temp("{sample}_hisat2.sam"),
params:
index="../Arabidopsis_thaliana/TAIR",
shell:
"hisat2 -p 1 -x {params.index} -U {input.reads} -S {output}"
rule sort_sam: # works for both bowtie2 and hisat2
input:
"{sample}_hisat2.sam"
output:
"{sample}_hisat2.bam"
shell:
"samtools sort -o {output} {input}"
rule gene_annotation:
input:
bamfile = "{sample}_hisat2.bam",
annotation = "../Arabidopsis_thaliana/genes.gtf",
output:
"{sample}_hisat2.gtf"
params:
label="{sample}"
shell:
"stringtie -G {input.annotation} -o {output} -l {params.label} {input.bamfile}"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment