run.py 4.4 KB
Newer Older
Koehorst, Jasper's avatar
Koehorst, Jasper committed
1
2
from joblib import Parallel, delayed
import multiprocessing
3
4
5
import os
import sys
import shlex
Koehorst, Jasper's avatar
Koehorst, Jasper committed
6
import gzip
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49

curl = """curl -X POST -H "Content-Type: application/x-www-form-urlencoded" -d 'result=sample&query=secondary_sample_accession%3D%22XXXXXX%22&format=tsv' "https://www.ebi.ac.uk/ena/portal/api/search\""""
xml = "https://www.ebi.ac.uk/ena/browser/api/xml/"
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001.fna
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001.gff
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001_InterProScan.tsv
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-02750/downloads/MGYG-HGUT-02750_eggNOG.tsv

source = "https://www.ebi.ac.uk/metagenomics/api/v1/genomes/XXX/downloads/XXX.type"

def retrieval(identifier, accession):
    print("Processing", identifier)
    path = source.replace("XXX", identifier)
    fna = path.replace(".type",".fna")
    gff = path.replace(".type",".gff")
    interpro = path.replace(".type","_InterProScan.tsv")
    egg = path.replace(".type", "_eggNOG.tsv")
    
    # Curl command
    xml_destination = "./xml/" + accession + ".xml"
    if not os.path.isfile(xml_destination):
        command = curl.replace("XXXXXX", accession)
        output = run_command(command)
        os.system("wget -nc " + xml + output.split()[2] + " --output-document=" + xml_destination)
        
    # Wget command
    for element in [fna, gff, interpro, egg]:
        # Create the folder
        path = "./genome/" + identifier + "/"
        if not os.path.isdir(path):
            os.makedirs(path)
        # Download the files
        path = "./genome/" + identifier + "/" + element.split("/")[-1]
        if not os.path.isfile(path):
            command = "wget -nc " + element + " --output-document=" + path
            os.system(command)

def run_command(command):
    import subprocess
    result = subprocess.run(shlex.split(command), capture_output=True)
    return result.stdout.decode('ascii')

def conversion(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
50
51
    output_file = "./genome/" + identifier + "/" + identifier + ".conversion.ttl"
    if os.path.isfile(output_file + ".gz"): return
52
    # "-tool", "conversion", "-f", fasta.getAbsolutePath(), "-gff2rdf", "-i", gff.getAbsolutePath(), "-o", output.getAbsolutePath(), "-id", "MGYG-HGUT-00001", "-codon", "11", "-topology", "linear"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
53
    command = "java -jar ./MGnifyParser.jar -tool conversion -f ./genome/" + identifier + "/" + identifier + ".fna -gff2rdf -i ./genome/" + identifier + "/" + identifier + ".gff -id " + identifier + " -codon 11 -topology linear -o " + output_file
54
    print(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
55
    os.system(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
56
    gzip(output_file)
57

Koehorst, Jasper's avatar
Koehorst, Jasper committed
58
def interproscan(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
59
60
61
    output_file = "./genome/" + identifier + "/" + identifier + ".interproscan.ttl"
    if os.path.isfile(output_file + ".gz"): return
    command = "java -jar ./MGnifyParser.jar -tool interpro -i ./genome/" + identifier + "/" + identifier + ".conversion.ttl -o " + output_file + " -tsv ./genome/" +identifier +"/" + identifier + "_InterProScan.tsv -version InterProScan-v5_35-74_0"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
62
    print(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
63
    os.system(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
64
    gzip(output_file)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
65
66

def eggnog(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
67
68
69
    output_file = "./genome/" + identifier + "/" + identifier + ".interproscan.eggnog.ttl"
    if os.path.isfile(output_file + ".gz"): return
    command = "java -jar ./MGnifyParser.jar -tool eggnog -i ./genome/" + identifier + "/" + identifier + ".interproscan.ttl -o " + output_file + " -tsv ./genome/" +identifier +"/" + identifier + "_eggNOG.tsv -v v2-db5.0"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
70
    print(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
71
    os.system(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
72
    gzip(output_file)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
73

Koehorst, Jasper's avatar
Koehorst, Jasper committed
74
75
76
77
78
79
def gzip(file):
    f_in = open(file)
    f_out = gzip.open(file + '.gz', 'wb')
    f_out.writelines(f_in)
    f_out.close()
    f_in.close()
Koehorst, Jasper's avatar
Koehorst, Jasper committed
80
81
82
83
84
85

def conversion_stage(identifier):
    conversion(identifier)
    interproscan(identifier)
    eggnog(identifier)

86
if __name__ == "__main__":
Koehorst, Jasper's avatar
Koehorst, Jasper committed
87
    identifiers = set()
88
    for line in open("identifiers.tsv"):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
89
        if "MGYG" not in line: continue
90
91
92
93
        identifier = line.strip().split("\t")[1]
        accession =  line.strip().split("\t")[2]
        if (identifier.startswith("MGYG")):
            retrieval(identifier, accession)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
94
95
96
97
98
            identifiers.add(identifier)

    # Number of cores
    num_cores = 3 # multiprocessing.cpu_count()
    results = Parallel(n_jobs=num_cores)(delayed(conversion_stage)(identifier) for identifier in identifiers)