run.py 4.21 KB
Newer Older
Koehorst, Jasper's avatar
Koehorst, Jasper committed
1
2
from joblib import Parallel, delayed
import multiprocessing
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import sys
import shlex

curl = """curl -X POST -H "Content-Type: application/x-www-form-urlencoded" -d 'result=sample&query=secondary_sample_accession%3D%22XXXXXX%22&format=tsv' "https://www.ebi.ac.uk/ena/portal/api/search\""""
xml = "https://www.ebi.ac.uk/ena/browser/api/xml/"
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001.fna
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001.gff
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001_InterProScan.tsv
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-02750/downloads/MGYG-HGUT-02750_eggNOG.tsv

source = "https://www.ebi.ac.uk/metagenomics/api/v1/genomes/XXX/downloads/XXX.type"

def retrieval(identifier, accession):
    print("Processing", identifier)
    path = source.replace("XXX", identifier)
    fna = path.replace(".type",".fna")
    gff = path.replace(".type",".gff")
    interpro = path.replace(".type","_InterProScan.tsv")
    egg = path.replace(".type", "_eggNOG.tsv")
    
    # Curl command
    xml_destination = "./xml/" + accession + ".xml"
    if not os.path.isfile(xml_destination):
        command = curl.replace("XXXXXX", accession)
        output = run_command(command)
        os.system("wget -nc " + xml + output.split()[2] + " --output-document=" + xml_destination)
        
    # Wget command
    for element in [fna, gff, interpro, egg]:
        # Create the folder
        path = "./genome/" + identifier + "/"
        if not os.path.isdir(path):
            os.makedirs(path)
        # Download the files
        path = "./genome/" + identifier + "/" + element.split("/")[-1]
        if not os.path.isfile(path):
            command = "wget -nc " + element + " --output-document=" + path
            os.system(command)

def run_command(command):
    import subprocess
    result = subprocess.run(shlex.split(command), capture_output=True)
    return result.stdout.decode('ascii')

def conversion(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
49
    if os.path.isfile("./genome/" + identifier + "/" + identifier + ".conversion.ttl"): return
50
    # "-tool", "conversion", "-f", fasta.getAbsolutePath(), "-gff2rdf", "-i", gff.getAbsolutePath(), "-o", output.getAbsolutePath(), "-id", "MGYG-HGUT-00001", "-codon", "11", "-topology", "linear"
Koehorst, Jasper's avatar
update    
Koehorst, Jasper committed
51
    command = "java -jar ./MGnifyParser.jar -tool conversion -f ./genome/" + identifier + "/" + identifier + ".fna -gff2rdf -i ./genome/" + identifier + "/" + identifier + ".gff -id " + identifier + " -codon 11 -topology linear -o ./genome/" + identifier + "/" + identifier + ".conversion.ttl"
52
    print(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
53
    os.system(command)
54

Koehorst, Jasper's avatar
Koehorst, Jasper committed
55
def interproscan(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
56
57
    if os.path.isfile("./genome/" + identifier + "/" + identifier + ".interproscan.ttl"): return
    command = "java -jar ./MGnifyParser.jar -tool interpro -i ./genome/" + identifier + "/" + identifier + ".conversion.ttl -o ./genome/" + identifier + "/" + identifier + ".interproscan.ttl -tsv ./genome/" +identifier +"/" + identifier + "_InterProScan.tsv -version InterProScan-v5_35-74_0"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
58
    print(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
59
    os.system(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
60
61

def eggnog(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
62
63
    if os.path.isfile("./genome/" + identifier + "/" + identifier + ".interproscan.eggnog.ttl"): return
    command = "java -jar ./MGnifyParser.jar -tool eggnog -i ./genome/" + identifier + "/" + identifier + ".interproscan.ttl -o ./genome/" + identifier + "/" + identifier + ".interproscan.eggnog.ttl -tsv ./genome/" +identifier +"/" + identifier + "_eggNOG.tsv -v v2-db5.0"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
64
    print(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
65
    os.system(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
66

Koehorst, Jasper's avatar
Koehorst, Jasper committed
67
68
69
70
71
72

def conversion_stage(identifier):
    conversion(identifier)
    interproscan(identifier)
    eggnog(identifier)

73
if __name__ == "__main__":
Koehorst, Jasper's avatar
Koehorst, Jasper committed
74
    identifiers = set()
75
    for line in open("identifiers.tsv"):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
76
        if "MGYG" not in line: continue
77
78
79
80
        identifier = line.strip().split("\t")[1]
        accession =  line.strip().split("\t")[2]
        if (identifier.startswith("MGYG")):
            retrieval(identifier, accession)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
81
82
83
84
85
            identifiers.add(identifier)

    # Number of cores
    num_cores = 3 # multiprocessing.cpu_count()
    results = Parallel(n_jobs=num_cores)(delayed(conversion_stage)(identifier) for identifier in identifiers)