Commit 61f9766d authored by Jorge Navarro Muñoz's avatar Jorge Navarro Muñoz
parents 31792e92 86bd2cbc
......@@ -436,7 +436,7 @@ def get_gbk_files(inputpath, outputdir, bgc_fasta_folder, min_bgc_size, exclude_
for f in sorted(files_no_proteins):
noseqs.write("{}\n".format(f))
if len(files_no_biosynthetic_genes) > 0 and (mode == "lcs" or mode == "local"):
if len(files_no_biosynthetic_genes) > 0 and (mode == "glocal" or mode == "auto"):
print(" Warning: Input set has files with no Biosynthetic Genes (affects alignment mode)")
print(" See no_biosynthetic_genes_list.txt")
with open(os.path.join(outputdir, "logs", "no_biosynthetic_genes_list.txt"), "w") as nobiogenes:
......@@ -837,7 +837,7 @@ def cluster_distance_lcs(A, B, A_domlist, B_domlist, dcg_A, dcg_b, core_pos_A, c
##print(sliceStartA, sliceStartB, sliceLengthA)
#print("")
if mode=="lcs" or (mode=="auto" and (bgc_info[A].contig_edge or bgc_info[B].contig_edge)):
if mode=="glocal" or (mode=="auto" and (bgc_info[A].contig_edge or bgc_info[B].contig_edge)):
#X: bgc that drive expansion
#Y: the other bgc
# forward: True if expansion is to the right
......@@ -1689,7 +1689,8 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
except UnboundLocalError:
# Noticed this could happen if the sequences are exactly
# the same and all distances == 0
print(" Warning: Unable to root at midpoint file {}".format(newick_file_path))
if verbose:
print(" Warning: Unable to root at midpoint file {}".format(newick_file_path))
pass
newick = tree.format("newick")
......@@ -1699,7 +1700,7 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
newick_trees[exemplar_idx] = newick
### Use the 0.5 distance cutoff to cluster clans by default
### - - - GCC - - -
bs_similarity_families = []
if clusterClans and cutoff == clanClassificationCutoff:
# Detect if there's only 1 GCF. It makes pySAPC crash
......@@ -1750,7 +1751,8 @@ def clusterJsonBatch(bgcs, pathBase, className, matrix, pos_alignments, cutoffs=
# affinity propagation can fail in some circumstances (e.g. only singletons)
if exemplarsClans is not None:
clanLabels = [exemplarsClans[labelsClans[i]] for i in range(len(familyIdx))]
# translate and record GCF label instead of GCF number
clanLabels = [familyIdx[exemplarsClans[labelsClans[i]]] for i in range(len(familyIdx))]
else:
clanLabels = []
......@@ -2028,17 +2030,17 @@ def CMD_parser():
classes to each subclass (e.g. a 'terpene-nrps' BGC from\
Others would be added to the Terpene and NRPS classes)")
parser.add_argument("--mode", dest="mode", default="global", choices=["global",
"lcs", "auto"], help="Alignment mode for each pair of\
gene clusters. 'global' (default) the whole list of\
domains of each BGC are compared; 'lcs': Longest\
Common Subcluster mode. Redefine the subset of the \
domains used to calculate distance by trying to find\
the longest slice of common domain content per gene\
in both BGCs, then expand each slice.\
'auto' use LCS when at least one of the BGCs in each\
pair has the 'contig_edge' annotation from antiSMASH\
v4+, otherwise use global mode on that pair")
parser.add_argument("--mode", dest="mode", default="glocal", choices=["global",
"glocal", "auto"], help="Alignment mode for each pair of\
gene clusters. 'global': the whole list of domains \
of each BGC are compared; 'glocal': Longest Common \
Subcluster mode. Redefine the subset of the domains \
used to calculate distance by trying to find the \
longest slice of common domain content per gene in \
both BGCs, then expand each slice. 'auto': use glocal\
when at least one of the BGCs in each pair has the \
'contig_edge' annotation from antiSMASH v4+, \
otherwise use global mode on that pair")
parser.add_argument("--anchorfile", dest="anchorfile",
default=os.path.join(os.path.dirname(os.path.realpath(__file__)),"anchor_domains.txt"),
......@@ -2201,11 +2203,11 @@ if __name__=="__main__":
if mode == "auto":
networks_folder_all += "_auto"
run_mode_string += "_auto"
elif mode == "lcs":
networks_folder_all += "_lcs"
elif mode == "glocal":
networks_folder_all += "_glocal"
run_mode_string += "_glocal"
else:
run_mode_string += "_full"
run_mode_string += "_global"
time1 = time.time()
......@@ -2230,7 +2232,7 @@ if __name__=="__main__":
for line in open(os.path.join(bigscape_path,"domain_whitelist.txt"), "r"):
if line[0] == "#":
continue
domain_whitelist.add(line.split("\t")[0])
domain_whitelist.add(line.split("\t")[0].strip())
if len(domain_whitelist) == 0:
print("Error: --domain_whitelist used, but no domains found in the file")
else:
......@@ -3244,12 +3246,9 @@ if __name__=="__main__":
identifier = ""
if len(bgc_info[bgc].organism) > 1:
identifier = bgc_info[bgc].organism
elif len(bgc_info[bgc].accession_id) > 1:
if (bgc_info[bgc].accession_id[2] == "_"): # is a refseq accession
identifier = bgc_info[bgc].accession_id[2].split(".")[0]
elif len(bgc_info[bgc].accession_id) > 6: # *assume* a genbank WGS accession
# todo: use more robust check / assumption e.g. other types of genbank data?
identifier = bgc_info[bgc].accession_id[0:6]
else : # use original genome file name (i.e. exclude "..clusterXXX from antiSMASH run")
file_name_base = os.path.splitext(os.path.basename(genbankDict[bgc][0]))[0]
identifier = file_name_base.rsplit(".cluster",1)[0]
if len(identifier) < 1:
identifier = "Unknown Genome {}".format(len(genomes))
if identifier not in genomes:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment