Commit 1250fc9f authored by Koehorst, Jasper
Browse files

ENA improvements

parent c25958d6
......@@ -14,6 +14,7 @@ import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.impl.ModelCom;
import org.apache.jena.tdb.store.Hash;
import org.apache.log4j.Logger;
import org.irods.jargon.core.pub.DataObjectAO;
import org.irods.jargon.core.pub.DataTransferOperations;
......@@ -27,6 +28,8 @@ import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Scanner;
public class ENA {
......@@ -41,25 +44,32 @@ public class ENA {
// Obtained via https://www.ebi.ac.uk/ena/browser/advanced-search
String command = "curl -X POST -H \"Content-Type: application/x-www-form-urlencoded\" -d 'result=assembly&query=tax_tree(10239)&fields=accession%2Cassembly_level%2Cgenome_representation%2Csample_accession%2Ctax_id&format=tsv' \"https://www.ebi.ac.uk/ena/portal/api/search\"";
command = "curl -X POST -H \"Content-Type: application/x-www-form-urlencoded\" -d 'result=assembly&query=tax_tree(303)&fields=accession%2Cassembly_level%2Cassembly_name%2Cassembly_title%2Cassembly_type%2Cbase_count%2Cgenome_representation%2Clast_updated%2Csample_accession%2Cscientific_name%2Csecondary_sample_accession%2Cstrain%2Cstudy_accession%2Cstudy_description%2Cstudy_name%2Cstudy_title%2Ctax_id%2Cversion&format=tsv' \"https://www.ebi.ac.uk/ena/portal/api/search\"";
command = "curl -X POST -H \"Content-Type: application/x-www-form-urlencoded\" -d 'result=assembly&query=tax_tree(2039240)&fields=accession%2Cassembly_level%2Cassembly_name%2Cassembly_title%2Cassembly_type%2Cbase_count%2Cgenome_representation%2Clast_updated%2Csample_accession%2Cscientific_name%2Csecondary_sample_accession%2Cstrain%2Cstudy_accession%2Cstudy_description%2Cstudy_name%2Cstudy_title%2Ctax_id%2Cversion&format=tsv' \"https://www.ebi.ac.uk/ena/portal/api/search\"";
// Proteo bacteria and DSM strain
command = "curl -X POST -H \"Content-Type: application/x-www-form-urlencoded\" -d 'result=assembly&query=tax_tree(1224)%20AND%20assembly_title%3D%22*%20DSM%20*%22&fields=accession%2Cassembly_level%2Cassembly_name%2Cassembly_title%2Cassembly_type%2Cbase_count%2Cgenome_representation%2Clast_updated%2Csample_accession%2Cscientific_name%2Csecondary_sample_accession%2Cstrain%2Cstudy_accession%2Cstudy_description%2Cstudy_name%2Cstudy_title%2Ctax_id%2Cversion&format=tsv' \"https://www.ebi.ac.uk/ena/portal/api/search\"";
command = "curl -X POST -H \"Content-Type: application/x-www-form-urlencoded\" -d 'result=assembly&query=tax_tree(2)&fields=accession%2Cassembly_level%2Cassembly_name%2Cassembly_title%2Cassembly_type%2Cbase_count%2Cgenome_representation%2Clast_updated%2Csample_accession%2Cscientific_name%2Csecondary_sample_accession%2Cstrain%2Cstudy_accession%2Cstudy_description%2Cstudy_name%2Cstudy_title%2Ctax_id%2Cversion&format=tsv' \"https://www.ebi.ac.uk/ena/portal/api/search\"";
// No issue with just copying the ENA curl request
command = command.replaceFirst("curl", "curl --output curl.txt");
PrintWriter printWriter = new PrintWriter(new File("curl.sh"));
printWriter.write(command);
printWriter.close();
ExecCommand execCommand = new ExecCommand("sh curl.sh");
logger.warn("ECHO!, not executing the curl...");
ExecCommand execCommand = new ExecCommand("echo sh curl.sh");
if (execCommand.getExit() > 0) {
throw new Exception("Execution of curl failed");
}
Scanner scanner = new Scanner(new File("curl.txt"));
String[] header = scanner.nextLine().split("\t");
int taxid = -1;
int accession = -1;
HashMap<String, Integer> lookup = new HashMap<>();
for (int index = 0; index < header.length; index++) {
if (header[index].matches("accession")) {
accession = index;
}
if (header[index].matches("tax_id")) {
taxid = index;
}
lookup.put(header[index], index);
}
logger.info("Loading HDT file");
......@@ -77,67 +87,128 @@ public class ENA {
HDTGraph graph = new HDTGraph(hdt);
Model model = new ModelCom(graph);
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
// Tax 2 codon file
Scanner tax2codonFile = new Scanner(new File("tax2codon.txt"));
HashMap<Integer, Integer> tax2codon = new HashMap<>();
commandOptions.id = line.split("\t")[accession];
commandOptions.taxon = Integer.parseInt(line.split("\t")[taxid]);
logger.info("Loading tax 2 codon file");
WorkflowGenomeSync workflow = new WorkflowGenomeSync();
// Set everything to default values and overwrite otherwise (constructor does not work?)
workflow = setGenomeSyncWorkflowsDefaults(workflow);
workflow.threads = commandOptions.threads;
workflow.memory = commandOptions.memory;
workflow.setGca(commandOptions.id);
while (tax2codonFile.hasNextLine()) {
String line = tax2codonFile.nextLine();
String tax = line.split(" ")[0];
String codon = line.split(" ")[1];
tax2codon.put(Integer.parseInt(tax), Integer.parseInt(codon));
}
tax2codonFile.close();
String lineage = getLineage(model, commandOptions.taxon);
if (lineage == null) {
continue;
}
while (scanner.hasNextLine()) {
String[] line = scanner.nextLine().split("\t");
lineage = lineage.toLowerCase().replaceAll(" ", "_");
boolean pass = filter(lookup, line);
if (lineage.startsWith("bacteria")) {
workflow.bacteria = true;
if (lineage.contains("pseudomonas")) {
workflow.codon = 11;
}
}
if (pass) {
// Destination
workflow.destination = "/" + connection.irodsAccount.getZone() + "/references/genomes/" + lineage + "/" + commandOptions.id;
commandOptions.id = line[lookup.get("accession")];
commandOptions.taxon = Integer.parseInt(line[lookup.get("tax_id")]);
logger.info(workflow.destination);
WorkflowGenomeSync workflow = new WorkflowGenomeSync();
// Set everything to default values and overwrite otherwise (constructor does not work?)
workflow = setGenomeSyncWorkflowsDefaults(workflow);
workflow.threads = commandOptions.threads;
workflow.memory = commandOptions.memory;
workflow.setGca(commandOptions.id);
String yamlFileName = commandOptions.id + ".yaml";
YamlWriter writer = new YamlWriter(new FileWriter(yamlFileName));
writer.write(workflow);
writer.close();
String lineage = getLineage(model, commandOptions.taxon);
if (lineage == null) {
continue;
}
// Prepare for upload
IRODSFile destFile = connection.fileFactory.instanceIRODSFile(workflow.destination + "/" + yamlFileName);
lineage = lineage.toLowerCase().replaceAll(" ", "_");
if (!connection.fileFactory.instanceIRODSFile(workflow.destination).exists()) {
connection.fileFactory.instanceIRODSFile(workflow.destination).mkdirs();
}
if (lineage.startsWith("bacteria")) {
workflow.codon = tax2codon.get(commandOptions.taxon);
logger.info("Codon table " + workflow.codon + " will be used");
}
// workflow.bacteria = true;
// // Codon table 4 for sure
// if (lineage.contains("entomoplasmatales") || lineage.contains("mycoplasmatales")) {
// workflow.codon = 4;
// }
// // The unsure ones
// else if (lineage.contains("spirochaetes")) {
// continue;
// }
// // codon table 11 for sure?...
// else if (
// lineage.contains("firmicutes") ||
// lineage.contains("actinobacteria") ||
// lineage.contains("proteobacteria") ||
// lineage.contains("chlamydiae") ||
// lineage.contains("chlorobi") ||
// lineage.contains("fusobacteria") ||
// lineage.contains("bacteroidetes")
// )
// {
// workflow.codon = 11;
// } else {
// throw new Exception("Codon table not set for anything in " + lineage);
// }
// }
// Destination
workflow.destination = "/" + connection.irodsAccount.getZone() + "/references/genomes/" + lineage + "/" + commandOptions.id;
logger.info(workflow.destination);
String yamlFileName = commandOptions.id + ".yaml";
YamlWriter writer = new YamlWriter(new FileWriter(yamlFileName));
writer.write(workflow);
writer.close();
// Prepare for upload
IRODSFile destFile = connection.fileFactory.instanceIRODSFile(workflow.destination + "/" + yamlFileName);
if (!connection.fileFactory.instanceIRODSFile(workflow.destination).exists()) {
connection.fileFactory.instanceIRODSFile(workflow.destination).mkdirs();
}
// if (destFile.exists()) continue;
while (destFile.exists()) {
destFile.delete();
}
while (destFile.exists()) {
destFile.delete();
}
dataTransferOperationsAO.putOperation(new File(yamlFileName), destFile, null, null);
dataTransferOperationsAO.putOperation(new File(yamlFileName), destFile, null, null);
// Add metadata tag...
DataObjectAO dataObjectAO = connection.irodsFileSystem.getIRODSAccessObjectFactory().getDataObjectAO(connection.irodsAccount);
AvuData avuMetaData = new AvuData("cwl", "/unlock/infrastructure/cwl/workflows/" + commandOptions.cwl, "waiting");
dataObjectAO.setAVUMetadata(destFile.getAbsolutePath(), avuMetaData);
// Add metadata tag...
DataObjectAO dataObjectAO = connection.irodsFileSystem.getIRODSAccessObjectFactory().getDataObjectAO(connection.irodsAccount);
AvuData avuMetaData = new AvuData("cwl", "/unlock/infrastructure/cwl/workflows/" + commandOptions.cwl, "waiting");
dataObjectAO.setAVUMetadata(destFile.getAbsolutePath(), avuMetaData);
// delete local file
new File(yamlFileName).delete();
}
}
}
/**
 * Decides whether one row of the ENA search result should be processed.
 *
 * <p>A row is accepted only when the header provides an {@code assembly_title}
 * column and that cell, upper-cased, contains {@code " DSM "} or {@code " ATCC "}
 * (the token surrounded by single spaces). Every other row is rejected, including
 * rows that are too short to carry the {@code assembly_title} cell.
 *
 * @param lookup column name to zero-based column index, as built from the TSV header
 * @param line   the cells of one TSV row
 * @return {@code true} when the assembly title names a DSM or ATCC strain,
 *         {@code false} otherwise
 */
private static boolean filter(HashMap<String, Integer> lookup, String[] line) {
    // Single map lookup instead of containsKey + repeated get.
    Integer titleColumn = lookup.get("assembly_title");

    // Rows produced by String.split("\t") lose trailing empty cells, so a row
    // can be shorter than the header; guard the index instead of throwing.
    if (titleColumn != null && line != null && titleColumn >= 0 && titleColumn < line.length) {
        String title = line[titleColumn].toUpperCase();
        // Accept all DSM and ATCC strains
        if (title.contains(" DSM ") || title.contains(" ATCC ")) {
            return true;
        }
    }

    // Disabled: optional restriction to complete genomes.
    // if (lookup.containsKey("assembly_level")) {
    //     if (!line[lookup.get("assembly_level")].endsWith("complete genome"))
    //         return false;
    // }

    return false;
}
private static String getLineage(Model model, int taxonid) {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment