run.py 4.62 KB
Newer Older
Koehorst, Jasper's avatar
Koehorst, Jasper committed
1
2
from joblib import Parallel, delayed
import multiprocessing
3
4
5
import os
import sys
import shlex
Koehorst, Jasper's avatar
Koehorst, Jasper committed
6
import gzip
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49

curl = """curl -X POST -H "Content-Type: application/x-www-form-urlencoded" -d 'result=sample&query=secondary_sample_accession%3D%22XXXXXX%22&format=tsv' "https://www.ebi.ac.uk/ena/portal/api/search\""""
xml = "https://www.ebi.ac.uk/ena/browser/api/xml/"
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001.fna
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001.gff
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-00001/downloads/MGYG-HGUT-00001_InterProScan.tsv
# wget https://www.ebi.ac.uk/metagenomics/api/v1/genomes/MGYG-HGUT-02750/downloads/MGYG-HGUT-02750_eggNOG.tsv

source = "https://www.ebi.ac.uk/metagenomics/api/v1/genomes/XXX/downloads/XXX.type"

def retrieval(identifier, accession):
    print("Processing", identifier)
    path = source.replace("XXX", identifier)
    fna = path.replace(".type",".fna")
    gff = path.replace(".type",".gff")
    interpro = path.replace(".type","_InterProScan.tsv")
    egg = path.replace(".type", "_eggNOG.tsv")
    
    # Curl command
    xml_destination = "./xml/" + accession + ".xml"
    if not os.path.isfile(xml_destination):
        command = curl.replace("XXXXXX", accession)
        output = run_command(command)
        os.system("wget -nc " + xml + output.split()[2] + " --output-document=" + xml_destination)
        
    # Wget command
    for element in [fna, gff, interpro, egg]:
        # Create the folder
        path = "./genome/" + identifier + "/"
        if not os.path.isdir(path):
            os.makedirs(path)
        # Download the files
        path = "./genome/" + identifier + "/" + element.split("/")[-1]
        if not os.path.isfile(path):
            command = "wget -nc " + element + " --output-document=" + path
            os.system(command)

def run_command(command):
    import subprocess
    result = subprocess.run(shlex.split(command), capture_output=True)
    return result.stdout.decode('ascii')

def conversion(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
50
51
    fasta_file = "./genome/" + identifier + "/" + identifier + ".fna"
    gff_file = "./genome/" + identifier + "/" + identifier + ".gff"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
52
    output_file = "./genome/" + identifier + "/" + identifier + ".conversion.ttl"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
53
    
Koehorst, Jasper's avatar
Koehorst, Jasper committed
54
    if os.path.isfile(output_file + ".gz"): return
Koehorst, Jasper's avatar
Koehorst, Jasper committed
55
56
    
    command("java -jar ./MGnifyParser.jar -tool conversion -f " + fasta_file + " -gff2rdf -i " + gff_file + " -id " + identifier + " -codon 11 -topology linear -o " + output_file)
57

Koehorst, Jasper's avatar
Koehorst, Jasper committed
58
def interproscan(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
59
    input_file = "./genome/" + identifier + "/" + identifier + ".conversion.ttl"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
60
    output_file = "./genome/" + identifier + "/" + identifier + ".interproscan.ttl"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
61
62
    tsv_file = "./genome/" +identifier +"/" + identifier + "_InterProScan.tsv"

Koehorst, Jasper's avatar
Koehorst, Jasper committed
63
    if os.path.isfile(output_file + ".gz"): return
Koehorst, Jasper's avatar
Koehorst, Jasper committed
64
65

    command("java -jar ./MGnifyParser.jar -tool interpro -i " + input_file + " -o " + output_file + " -tsv " + tsv_file + " -version InterProScan-v5_35-74_0")
Koehorst, Jasper's avatar
Koehorst, Jasper committed
66
67

def eggnog(identifier):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
68
69
    input_file = "./genome/" + identifier + "/" + identifier + ".interproscan.ttl"
    tsv_file = "./genome/" +identifier +"/" + identifier + "_eggNOG.tsv"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
70
    output_file = "./genome/" + identifier + "/" + identifier + ".interproscan.eggnog.ttl"
Koehorst, Jasper's avatar
Koehorst, Jasper committed
71
    
Koehorst, Jasper's avatar
Koehorst, Jasper committed
72
    if os.path.isfile(output_file + ".gz"): return
Koehorst, Jasper's avatar
Koehorst, Jasper committed
73
74
75
76
77

    command("java -jar ./MGnifyParser.jar -tool eggnog -i " + input_file + " -o " + output_file + " -tsv " + tsv_file + " -v v2-db5.0")
    

def gzip(input_file):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
78
79
80
    if os.path.isfile(input_file):
        command = "gzip " + input_file
        os.system(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
81

Koehorst, Jasper's avatar
Koehorst, Jasper committed
82
83
84
def command(command):
    print(command)
    os.system(command)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
85
86
87
88
89

def conversion_stage(identifier):
    conversion(identifier)
    interproscan(identifier)
    eggnog(identifier)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
90
91
92
93
94
        # Perform gzip compression
    gzip("./genome/" + identifier + "/" + identifier + ".conversion.ttl")
    gzip("./genome/" + identifier + "/" + identifier + ".interproscan.ttl")
    gzip("./genome/" + identifier + "/" + identifier + ".interproscan.eggnog.ttl")

Koehorst, Jasper's avatar
Koehorst, Jasper committed
95

96
if __name__ == "__main__":
Koehorst, Jasper's avatar
Koehorst, Jasper committed
97
    identifiers = set()
98
    for line in open("identifiers.tsv"):
Koehorst, Jasper's avatar
Koehorst, Jasper committed
99
        if "MGYG" not in line: continue
100
101
102
103
        identifier = line.strip().split("\t")[1]
        accession =  line.strip().split("\t")[2]
        if (identifier.startswith("MGYG")):
            retrieval(identifier, accession)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
104
            identifiers.add(identifier)
Koehorst, Jasper's avatar
Koehorst, Jasper committed
105
106
            conversion_stage(identifier)
            break
Koehorst, Jasper's avatar
Koehorst, Jasper committed
107
    # Number of cores
Koehorst, Jasper's avatar
Koehorst, Jasper committed
108
109
    # num_cores = 3 # multiprocessing.cpu_count()
    # results = Parallel(n_jobs=num_cores)(delayed(conversion_stage)(identifier) for identifier in identifiers)