Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Kautsar, Satria
BiG-SCAPE
Commits
61f9766d
Commit
61f9766d
authored
Aug 02, 2018
by
Jorge Navarro Muñoz
Browse files
Merge branch 'master' of
https://git.wageningenur.nl/medema-group/BiG-SCAPE
parents
31792e92
86bd2cbc
Changes
1
Hide whitespace changes
Inline
Side-by-side
bigscape.py
View file @
61f9766d
...
...
@@ -436,7 +436,7 @@ def get_gbk_files(inputpath, outputdir, bgc_fasta_folder, min_bgc_size, exclude_
for
f
in
sorted
(
files_no_proteins
):
noseqs
.
write
(
"{}
\n
"
.
format
(
f
))
if
len
(
files_no_biosynthetic_genes
)
>
0
and
(
mode
==
"
lcs
"
or
mode
==
"
local
"
):
if
len
(
files_no_biosynthetic_genes
)
>
0
and
(
mode
==
"
glocal
"
or
mode
==
"
auto
"
):
print
(
" Warning: Input set has files with no Biosynthetic Genes (affects alignment mode)"
)
print
(
" See no_biosynthetic_genes_list.txt"
)
with
open
(
os
.
path
.
join
(
outputdir
,
"logs"
,
"no_biosynthetic_genes_list.txt"
),
"w"
)
as
nobiogenes
:
...
...
@@ -837,7 +837,7 @@ def cluster_distance_lcs(A, B, A_domlist, B_domlist, dcg_A, dcg_b, core_pos_A, c
##print(sliceStartA, sliceStartB, sliceLengthA)
#print("")
if
mode
==
"
lcs
"
or
(
mode
==
"auto"
and
(
bgc_info
[
A
].
contig_edge
or
bgc_info
[
B
].
contig_edge
)):
if
mode
==
"
glocal
"
or
(
mode
==
"auto"
and
(
bgc_info
[
A
].
contig_edge
or
bgc_info
[
B
].
contig_edge
)):
#X: bgc that drive expansion
#Y: the other bgc
# forward: True if expansion is to the right
...
...
@@ -1689,7 +1689,8 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
except
UnboundLocalError
:
# Noticed this could happen if the sequences are exactly
# the same and all distances == 0
print
(
" Warning: Unable to root at midpoint file {}"
.
format
(
newick_file_path
))
if
verbose
:
print
(
" Warning: Unable to root at midpoint file {}"
.
format
(
newick_file_path
))
pass
newick
=
tree
.
format
(
"newick"
)
...
...
@@ -1699,7 +1700,7 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
newick_trees
[
exemplar_idx
]
=
newick
###
Use the 0.5 distance cutoff to cluster clans by default
###
- - - GCC - - -
bs_similarity_families
=
[]
if
clusterClans
and
cutoff
==
clanClassificationCutoff
:
# Detect if there's only 1 GCF. It makes pySAPC crash
...
...
@@ -1750,7 +1751,8 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
# affinity propagation can fail in some circumstances (e.g. only singletons)
if
exemplarsClans
is
not
None
:
clanLabels
=
[
exemplarsClans
[
labelsClans
[
i
]]
for
i
in
range
(
len
(
familyIdx
))]
# translate and record GCF label instead of GCF number
clanLabels
=
[
familyIdx
[
exemplarsClans
[
labelsClans
[
i
]]]
for
i
in
range
(
len
(
familyIdx
))]
else
:
clanLabels
=
[]
...
...
@@ -2028,17 +2030,17 @@ def CMD_parser():
classes to each subclass (e.g. a 'terpene-nrps' BGC from
\
Others would be added to the Terpene and NRPS classes)"
)
parser
.
add_argument
(
"--mode"
,
dest
=
"mode"
,
default
=
"glo
b
al"
,
choices
=
[
"global"
,
"
lcs
"
,
"auto"
],
help
=
"Alignment mode for each pair of
\
gene clusters. 'global'
(default)
the whole list of
\
domains
of each BGC are compared; '
lcs
': Longest
\
Common
Subcluster mode. Redefine the subset of the
\
domains
used to calculate distance by trying to find
\
the
longest slice of common domain content per gene
\
in
both BGCs, then expand each slice.
\
'auto' use LCS
when at least one of the BGCs in each
\
pair has the
'contig_edge' annotation from antiSMASH
\
v4+,
otherwise use global mode on that pair"
)
parser
.
add_argument
(
"--mode"
,
dest
=
"mode"
,
default
=
"glo
c
al"
,
choices
=
[
"global"
,
"
glocal
"
,
"auto"
],
help
=
"Alignment mode for each pair of
\
gene clusters. 'global'
:
the whole list of
domains
\
of each BGC are compared; '
glocal
': Longest
Common
\
Subcluster mode. Redefine the subset of the
domains
\
used to calculate distance by trying to find
the
\
longest slice of common domain content per gene
in
\
both BGCs, then expand each slice.
'auto': use glocal
\
when at least one of the BGCs in each
pair has the
\
'contig_edge' annotation from antiSMASH
v4+,
\
otherwise use global mode on that pair"
)
parser
.
add_argument
(
"--anchorfile"
,
dest
=
"anchorfile"
,
default
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
)),
"anchor_domains.txt"
),
...
...
@@ -2201,11 +2203,11 @@ if __name__=="__main__":
if
mode
==
"auto"
:
networks_folder_all
+=
"_auto"
run_mode_string
+=
"_auto"
elif
mode
==
"
lcs
"
:
networks_folder_all
+=
"_
lcs
"
elif
mode
==
"
glocal
"
:
networks_folder_all
+=
"_
glocal
"
run_mode_string
+=
"_glocal"
else
:
run_mode_string
+=
"_
ful
l"
run_mode_string
+=
"_
globa
l"
time1
=
time
.
time
()
...
...
@@ -2230,7 +2232,7 @@ if __name__=="__main__":
for
line
in
open
(
os
.
path
.
join
(
bigscape_path
,
"domain_whitelist.txt"
),
"r"
):
if
line
[
0
]
==
"#"
:
continue
domain_whitelist
.
add
(
line
.
split
(
"
\t
"
)[
0
])
domain_whitelist
.
add
(
line
.
split
(
"
\t
"
)[
0
]
.
strip
()
)
if
len
(
domain_whitelist
)
==
0
:
print
(
"Error: --domain_whitelist used, but no domains found in the file"
)
else
:
...
...
@@ -3244,12 +3246,9 @@ if __name__=="__main__":
identifier
=
""
if
len
(
bgc_info
[
bgc
].
organism
)
>
1
:
identifier
=
bgc_info
[
bgc
].
organism
elif
len
(
bgc_info
[
bgc
].
accession_id
)
>
1
:
if
(
bgc_info
[
bgc
].
accession_id
[
2
]
==
"_"
):
# is a refseq accession
identifier
=
bgc_info
[
bgc
].
accession_id
[
2
].
split
(
"."
)[
0
]
elif
len
(
bgc_info
[
bgc
].
accession_id
)
>
6
:
# *assume* a genbank WGS accession
# todo: use more robust check / assumption e.g. other types of genbank data?
identifier
=
bgc_info
[
bgc
].
accession_id
[
0
:
6
]
else
:
# use original genome file name (i.e. exclude "..clusterXXX from antiSMASH run")
file_name_base
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
genbankDict
[
bgc
][
0
]))[
0
]
identifier
=
file_name_base
.
rsplit
(
".cluster"
,
1
)[
0
]
if
len
(
identifier
)
<
1
:
identifier
=
"Unknown Genome {}"
.
format
(
len
(
genomes
))
if
identifier
not
in
genomes
:
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment