Commit e5349de9 authored by Jasper Koehorst's avatar Jasper Koehorst
Browse files

excel creator simplified

parent fd39af12
# sra
# sra to metadata excel file
SRA XML crawler for datasets
\ No newline at end of file
SRA XML query and parser interface for datasets available through ENA.
To use it, provide three arguments: the query, your e-mail address, and the name of the Excel file to create.
For example:
To generate the Excel file for these three projects, use the following command:
python3 study_creator.py "DRP005906|DRP007222|DRP007099" example@email.nl myexcelfilename.xlsx
To create an Excel file for a single project, use:
python3 study_creator.py "DRP005906" example@email.nl myexcelfilename.xlsx
The first argument after the script name follows the Entrez query syntax, so you can build more sophisticated queries with the advanced search engine at https://www.ebi.ac.uk/ena/browser/advanced-search.
\ No newline at end of file
import os
import sys
from Bio import Entrez
from joblib import Parallel, delayed
from joblib.externals.loky.process_executor import TerminatedWorkerError
def processInput(identifier):
    """Download the SRA XML record for one identifier and cache it on disk.

    The record is written to ``<make_folder(identifier)>/<identifier>.xml``.
    If that file already exists, nothing is fetched and the existing path is
    returned.

    Args:
        identifier: SRA record UID; any value is accepted and converted with
            ``str`` before use.

    Returns:
        The path of the cached XML file.
    """
    identifier = str(identifier)
    folder = make_folder(identifier)
    path = folder + "/" + identifier + ".xml"
    if os.path.isfile(path):
        return path

    # Set the contact address here as well: this function is dispatched
    # through joblib and may not share state with the launching code.
    Entrez.email = "ja@nee.com"
    handle = Entrez.efetch(db="sra", id=identifier, rettype="xml", retmode="text")
    try:
        content = handle.read()
    finally:
        # Always release the HTTP connection, even when reading fails.
        handle.close()

    # Some handles yield bytes; decode so the file holds XML, not a bytes repr.
    if isinstance(content, bytes):
        content = content.decode("utf-8")

    # Do not rely on the caller having created the directory beforehand.
    os.makedirs(folder, exist_ok=True)
    print(path)
    with open(path, 'w', encoding="utf-8") as f:
        print(content, file=f)
    return path
def make_folder(identifier):
    """Return the nested cache directory path for an identifier.

    The characters of ``identifier`` are grouped three at a time and every
    group becomes one directory level below ``./sra``; for example
    ``"1234567"`` maps to ``./sra/123/456/7``. Nothing is created on disk.
    """
    pieces = ["./sra"]
    for position, character in enumerate(identifier):
        # Open a new directory level before every group of three characters.
        if position % 3 == 0:
            pieces.append("/")
        pieces.append(character)
    return "".join(pieces)
# Broad alternative query kept for reference: public paired-end Illumina
# amplicon runs for which FASTQ files are available.
AMPLICON_QUERY = "(((((\"paired\"[Layout]) AND ((\"instrument illumina hiseq 1000\"[Properties] OR \"instrument illumina hiseq 1500\"[Properties] OR \"instrument illumina hiseq 2000\"[Properties] OR \"instrument illumina hiseq 2500\"[Properties] OR \"instrument illumina hiseq 3000\"[Properties] OR \"instrument illumina hiseq 4000\"[Properties] OR \"instrument illumina hiseq x ten\"[Properties] OR \"instrument illumina miseq\"[Properties]))) AND \"illumina\"[Platform]) AND \"amplicon\"[Strategy]) AND \"filetype fastq\"[Properties]) AND \"cluster public\"[Properties]"
# Query that is actually submitted (alternative project: PRJNA517152).
QUERY = "PRJNA527973"
# Number of joblib workers used for downloading.
NUM_CORES = 2
# Pause between attempts so a failing run does not hammer the service.
RETRY_DELAY_SECONDS = 5


def main():
    """Search SRA for QUERY and download every record that is not cached yet.

    Each pass runs the search, collects the identifiers whose XML file is
    missing, and fetches them in parallel with ``processInput``. If the
    parallel download raises, the error is reported and the whole pass is
    repeated after a short pause; records fetched so far are skipped on the
    next pass because their files exist. The process exits with status 0
    once a pass completes without error.
    """
    import time  # local import: only the retry pause needs it

    Entrez.email = "A.N.Other@example.com"  # Always tell NCBI who you are
    while True:
        handle = Entrez.esearch(db="sra", term=QUERY, retmax="3000000")
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        identifiers = set()
        for identifier in record['IdList']:
            folder = make_folder(identifier)
            path = folder + "/" + identifier + ".xml"
            if os.path.exists(path):
                continue
            # Create the target directory up front for the workers.
            os.makedirs(folder, exist_ok=True)
            identifiers.add(int(identifier))

        print("Parsing", len(identifiers))
        try:
            Parallel(n_jobs=NUM_CORES)(delayed(processInput)(i) for i in identifiers)
        except Exception as error:
            # Best-effort retry is kept, but the failure is no longer silent.
            print("Download pass failed, retrying:", repr(error), file=sys.stderr)
            time.sleep(RETRY_DELAY_SECONDS)
            continue
        # Stop when finished
        sys.exit(0)


if __name__ == "__main__":
    main()
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment