Commit f1323363 authored by Overduin, Sam
Browse files

Upload cut_barcode_from_reads.py

parent f62a6ce7
#!/usr/bin/env python3
import gzip
import argparse
import datetime
import subprocess
#./cut_barcode_from_reads.py -i ../data/barcoded/enriched.fastq.gz -o ../data/barcoded/enriched_trimmed.fastq.gz
# Command-line interface: both the input and the output path are mandatory.
parser = argparse.ArgumentParser(
    description='Cut the BX:Z: barcode, and any bases preceding it, out of '
                'each read of a gzipped FASTQ file.')
parser.add_argument('-i', '--infile', required=True,
                    help='gzip-compressed FASTQ file to read')
parser.add_argument('-o', '--outfile', required=True,
                    help='FASTQ file to write; a name ending in .gz is '
                         'compressed with pigz after writing')
args = parser.parse_args()
def write_output(output_file, output_lines, threads=16):
    """Write lines to ``output_file``, compressing with pigz for ``.gz`` names.

    When ``output_file`` ends in ``.gz`` the lines are written to the same
    path without that suffix and ``pigz`` is then run on that plain file
    (pigz is expected to replace it with ``<file>.gz``). For any other name
    the lines are written uncompressed to ``output_file`` as given.

    Args:
        output_file: Destination path; a trailing ``.gz`` requests compression.
        output_lines: Iterable of strings, written verbatim (no newline is
            added).
        threads: Number of threads handed to pigz via ``-p``. Defaults to 16,
            the value that was previously hard-coded.
    """
    # Always bind the flag: leaving it unset for names without '.gz' made the
    # check further down raise UnboundLocalError after the file was written.
    compress_file = output_file.endswith('.gz')
    if compress_file:
        output_file = output_file[:-len('.gz')]
    with open(output_file, 'w') as f:
        f.writelines(output_lines)
    print('Done writing')
    if not compress_file:
        return
    print('Starting compression')
    # Pass an argument list rather than a shell string so that quotes, spaces
    # or shell metacharacters in the path cannot break or alter the command.
    cmd = ['pigz', '-p' + str(threads), output_file]
    try:
        returncode = subprocess.call(cmd)
    except OSError as err:
        # e.g. pigz is not installed; the uncompressed file is left in place.
        print('Could not run pigz:', err)
        return
    if returncode != 0:
        print('pigz exited with status', returncode)
        return
    print('Done compression')
# The barcode is read from a "BX:Z:<barcode>" tag on the record's header line.
BARCODE_TAG = 'BX:Z:'
BARCODE_LENGTH = 16
LINES_PER_RECORD = 4  # header, sequence, separator, quality

# Every output line is collected here and written in one go at the end, so
# the whole (trimmed) file is held in memory.
new_fasta = []
corrected_instances = 0  # records whose sequence contained their barcode
records = 0              # complete 4-line records seen
line_count = 0
barcode = None
cut_pos = 0
print(datetime.datetime.now(), 'started reading fastq.gz')
with gzip.open(args.infile, 'rt') as f:
    for line_count, line in enumerate(f, start=1):
        position = (line_count - 1) % LINES_PER_RECORD
        if position == 0:
            # Header line: copied unchanged; remember its barcode, if any.
            new_fasta.append(line)
            barcode = None
            tag_start = line.find(BARCODE_TAG)
            if tag_start != -1:
                start_bar = tag_start + len(BARCODE_TAG)
                candidate = line[start_bar:start_bar + BARCODE_LENGTH]
                # Only accept a full-length barcode without whitespace. A
                # short slice that swallowed the line's newline could match
                # the end of the sequence line and make the cut run past it,
                # dropping the newline and corrupting the record.
                if (len(candidate) == BARCODE_LENGTH
                        and not any(ch.isspace() for ch in candidate)):
                    barcode = candidate
        elif position == 1:
            # Sequence line: drop everything up to and including the barcode.
            cut_pos = 0
            if barcode:
                bar_in_seq = line.find(barcode)
                if bar_in_seq != -1:
                    cut_pos = bar_in_seq + len(barcode)
                    corrected_instances += 1
            new_fasta.append(line[cut_pos:])
        elif position == 2:
            # Separator line: copied unchanged.
            new_fasta.append(line)
        else:
            # Quality line: cut at the same offset as the sequence so both
            # lines keep the same length.
            new_fasta.append(line[cut_pos:])
            records += 1
            if records % 100000000 == 0:
                print(datetime.datetime.now(), records, 'records parsed')
if line_count % LINES_PER_RECORD != 0:
    # The leftover lines are still written, but the last record is incomplete.
    print(datetime.datetime.now(),
          'Warning: input ended in the middle of a record; line count',
          line_count, 'is not a multiple of', LINES_PER_RECORD)
print(datetime.datetime.now(), 'Trimmed', corrected_instances, 'out of', records, 'records')
print(datetime.datetime.now(), 'Writing file')
write_output(args.outfile, new_fasta)
print(datetime.datetime.now(), 'Done writing')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment