Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Araformatics group
Araformatics project
Commits
004b0591
Commit
004b0591
authored
Dec 07, 2021
by
Schoorlemmer, Joran
Browse files
Upload parsing py script for clustering
parent
3e1efcc9
Changes
1
Hide whitespace changes
Inline
Side-by-side
scripts/extractdiffanalysis.py
0 → 100644
View file @
004b0591
#!/usr/bin/env python3
"""
Author: Joran Schoorlemmer
Student nr: 1004586
Description: Write log2fold and padj values from tsv files into two csv files
Usage: python3 extractdiffanalysis.py <listofsamplefiles.txt>
\
<log2foldoutputfilename.txt> <padjoutputfilename.txt>
"""
# Import
from
sys
import
argv
# Functions
def extractlogfold(csv_file):
    """Extract log2 fold changes from a tab-separated sample file.

    The first line of the file is treated as a header and skipped. For
    every other non-blank line, column 0 is used as the gene name and
    column 2 is parsed as the log2 fold change.

    csv_file: str, name of input sample file (tab-separated)
    return: dict, dict with gene names as keys and l2f changes (float) as
        values

    Raises IndexError if a data line has fewer than three columns and
    ValueError if the fold change is not a valid float (e.g. 'NA').
    """
    fold_dict = {}
    with open(csv_file, 'r') as logf:
        logf.readline()  # skip the header line
        for line in logf:
            # Remove only the line terminator: stripping all whitespace
            # would drop leading/trailing empty fields and shift columns.
            line = line.rstrip('\r\n')
            if not line.strip():
                # tolerate blank lines, e.g. an empty line at end of file
                continue
            line_info = line.split('\t')
            fold_dict[line_info[0]] = float(line_info[2])
    return fold_dict
def extractpadj(csv_file):
    """Extract adjusted p-values (padj) from a tab-separated sample file.

    The first line of the file is treated as a header and skipped. For
    every other non-blank line, column 0 is used as the gene name and
    column 6 is parsed as the padj value.

    csv_file: str, name of input sample file (tab-separated)
    return: dict, dict with gene names as keys and padj values (float) as
        values

    Raises IndexError if a data line has fewer than seven columns and
    ValueError if the padj field is not a valid float (e.g. 'NA').
    """
    p_dict = {}
    with open(csv_file, 'r') as logf:
        logf.readline()  # skip the header line
        for line in logf:
            # Remove only the line terminator: stripping all whitespace
            # would drop leading/trailing empty fields and shift columns.
            line = line.rstrip('\r\n')
            if not line.strip():
                # tolerate blank lines, e.g. an empty line at end of file
                continue
            line_info = line.split('\t')
            p_dict[line_info[0]] = float(line_info[6].strip())
    return p_dict
def writecsv(sample_dict, out_file):
    """Write a gene-by-sample table to a tab-separated file.

    The header row is 'Gene' followed by the first four characters of each
    sample key. One row is written per gene that occurs in at least one
    sample; 'NA' is written where a sample has no value for that gene.
    Rows are written in sorted gene order so the output is identical
    between runs (iterating the set directly gives an arbitrary order).
    Every row, including the header, ends with a tab before the newline;
    this layout is kept unchanged for existing consumers of the file.

    sample_dict: dict, dict with samples as keys and dicts with gene names
        as keys as values
    out_file: str, filename of output csv file
    """
    samples = list(sample_dict)

    # collect every gene name that occurs in any sample
    gene_names = set()
    for sample in samples:
        gene_names.update(sample_dict[sample])

    with open(out_file, 'w') as out_f:
        # header: sample names are shortened to their first four characters
        header = ['Gene'] + [str(sample[0:4]) for sample in samples]
        out_f.write('\t'.join(header) + '\t\n')

        # sort on the string form, which is also what is written as label
        for gene in sorted(gene_names, key=str):
            row = [str(gene)]
            for sample in samples:
                # if gene is not present in this sample, write NA
                row.append(str(sample_dict[sample].get(gene, 'NA')))
            out_f.write('\t'.join(row) + '\t\n')
# Main
def main(csv_names, out_file_fold, out_file_p):
    """Run functions and loop through sample files.

    Reads one sample file name per line from csv_names, extracts the log2
    fold changes and padj values of each sample file, and writes one
    combined table for each of the two measures.

    csv_names: str, filename of txt file with sample file names
    out_file_fold: str, filename of l2f output csv file
    out_file_p: str, filename of padj output csv file
    """
    # init dicts: sample file name -> {gene name: value}
    dict_all_fold = {}
    dict_all_padj = {}

    # loop through files
    with open(csv_names, 'r') as f_names:
        for fn in f_names:
            # strip all surrounding whitespace (also a stray '\r'), and
            # skip blank lines so they are not opened as a file named ''
            fn = fn.strip()
            if not fn:
                continue
            dict_all_fold[fn] = extractlogfold(fn)
            dict_all_padj[fn] = extractpadj(fn)

    # write csv files
    writecsv(dict_all_fold, out_file_fold)
    writecsv(dict_all_padj, out_file_p)
# Run script
if __name__ == "__main__":
    # Exit with a usage message instead of an IndexError traceback when
    # one or more of the three required arguments is missing.
    if len(argv) < 4:
        raise SystemExit(
            "Usage: python3 extractdiffanalysis.py <listofsamplefiles.txt> "
            "<log2foldoutputfilename.txt> <padjoutputfilename.txt>"
        )
    main(argv[1], argv[2], argv[3])
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment