Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Araformatics group
Araformatics project
Commits
dd6e6131
Commit
dd6e6131
authored
Dec 06, 2021
by
Eendebak, Liset
Browse files
Merge branch 'main' of git.wur.nl:araformatics-group/araformatics-project into main
parents
40a0c942
be6de690
Changes
1
Hide whitespace changes
Inline
Side-by-side
scripts/extractcounts.py
View file @
dd6e6131
...
...
@@ -3,7 +3,7 @@
Author: Joran Schoorlemmer
Student nr: 1004586
Description: Extract counts from coverage data in gtf files and write to csv
Usage: python3 extractcounts.py <
reference.gtf
> <listofsamplegtffiles.txt>
Usage: python3 extractcounts.py <
listofreffiles.txt
> <listofsamplegtffiles.txt>
"""
...
...
@@ -32,9 +32,10 @@ def parse_gtf(in_file, read_length = 100):
# turn into into int to do calculations
trans_len
=
abs
(
int
(
line_cont
[
4
])
-
int
(
line_cont
[
3
]))
gene_info
=
line_cont
[
8
].
split
(
' '
)
gene_name
=
gene_info
[
5
].
lstrip
(
'"'
).
rstrip
(
'";'
)
cov
=
float
(
gene_info
[
11
].
lstrip
(
'"'
).
rstrip
(
'";'
))
genes
[
gene_name
]
=
round
(
cov
*
trans_len
/
read_length
)
if
len
(
gene_info
)
==
16
:
gene_name
=
gene_info
[
9
].
lstrip
(
'"'
).
rstrip
(
'";'
)
cov
=
float
(
gene_info
[
11
].
lstrip
(
'"'
).
rstrip
(
'";'
))
genes
[
gene_name
]
=
round
(
cov
*
trans_len
/
read_length
)
return
genes
...
...
@@ -50,7 +51,8 @@ def group_replicates(sample_names):
with
open
(
sample_names
)
as
f_names
:
for
f
in
f_names
:
f
=
f
.
strip
(
"
\n
"
)
rep_dict_key
=
f
[:
-
5
]
# strip redundant info out of string
rep_dict_key
=
f
[
9
:
-
12
]
if
not
rep_dict_key
in
rep_dict
:
rep_dict
[
rep_dict_key
]
=
{}
rep_dict
[
rep_dict_key
][
f
]
=
parse_gtf
(
'samples/'
+
str
(
f
))
...
...
@@ -61,25 +63,30 @@ def group_replicates(sample_names):
def
write_csv
(
ref_dict
,
group_dict
,
out_f
):
"""write a tab delim csv file with genes and count values
ref_dict: dict, dict with gene name as keys and counts as values
ref_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
group_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
out_f: str, output file name in csv format
"""
with
open
(
out_f
,
'w'
)
as
f
:
f
.
write
(
"gene_name
\
t
reference
\
t
"
)
f
.
write
(
"gene_name
\t
"
)
# create set of gene names to make sure all rows are present and unique
gene_names
=
set
(
list
(
ref_dict
.
keys
()))
gene_names
=
set
()
for
ref
in
ref_dict
:
gene_names
.
update
(
list
(
ref_dict
[
ref
].
keys
()))
f
.
write
(
"ref_"
+
str
(
ref
)
+
"
\t
"
)
for
replicate
in
group_dict
:
gene_names
.
update
(
list
(
group_dict
[
replicate
].
keys
()))
f
.
write
(
str
(
replicate
)
+
"
\t
"
)
f
.
write
(
"
\n
"
)
for
gene
in
gene_names
:
f
.
write
(
str
(
gene
)
+
"
\t
"
)
try
:
f
.
write
(
str
(
ref_dict
[
gene
])
+
'
\t
'
)
except
:
# if gene is not present in gtf, write 0 counts
f
.
write
(
'0
\t
'
)
for
ref
in
ref_dict
:
try
:
f
.
write
(
str
(
ref_dict
[
ref
][
gene
])
+
'
\t
'
)
except
:
# if gene is not present in gtf, write 0 counts
f
.
write
(
'0
\t
'
)
for
replicate
in
group_dict
:
try
:
f
.
write
(
str
(
group_dict
[
replicate
][
gene
])
+
'
\t
'
)
...
...
@@ -91,20 +98,22 @@ def write_csv(ref_dict, group_dict, out_f):
def
main
(
ref_name
,
sample_names
):
"""Run functions and loop trough sample files
ref_name: str, filename of
reference gtf file
ref_name: str, filename of
txt file containing filenames of references
sample_names: str, filename of txt file containing filenames of samples
"""
ref
=
parse_gtf
(
ref_name
)
ref_dict
=
group_replicates
(
ref_name
)
ref_key
=
list
(
ref_dict
.
keys
())
ref_dict
=
ref_dict
[
ref_key
[
0
]]
print
(
"parsed: reference"
)
dict_all
=
group_replicates
(
sample_names
)
print
(
"all gtf files parsed
\n
"
)
for
group
in
dict_all
:
#loop trough
# write files to output folder counts/
out_file
=
'counts/'
+
str
(
group
)
+
'counts.csv'
write_csv
(
ref
,
dict_all
[
group
],
out_file
)
write_csv
(
ref
_dict
,
dict_all
[
group
],
out_file
)
print
(
"csv written: "
+
str
(
group
))
# Run script
if
__name__
==
"__main__"
:
main
(
argv
[
1
],
argv
[
2
])
\ No newline at end of file
main
(
argv
[
1
],
argv
[
2
])
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment