Commit 63fb3517 authored by Haarst, Jan van's avatar Haarst, Jan van
Browse files

scripts to work with sff files

parent e9b45830
#!/usr/bin/python
'''
This software adds information to the identifiers of reads in an sff file
This way, different sources of reads can be easily tracked within assemblies,
mappings, etc.
'''
# copyright Jan van Haarst
# Plant Research International, Wageningen UR
# this was build upon sff_extract.py,
#copyright Jose Blanca and Bastien Chevreux
#COMAV institute, Universidad Politecnica de Valencia (UPV)
#Valencia, Spain
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#You should have received a copy of the GNU General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
__author__ = 'Jan van Haarst'
__copyright__ = 'Copyright 2009, Jan van Haarst, Plant Research International, Wageningen UR'
__license__ = 'GPLv3 or later'
__version__ = '0.0.1'
__email__ = 'jan.vanhaarst@wur.nl'
__status__ = 'beta'
import struct
import sys
import os
import subprocess
import tempfile
fake_sff_name = 'fake_sff_name'
# readname as key: lines with matches from SSAHA, one best match
ssahapematches = {}
# linker readname as key: length of linker sequence
linkerlengths = {}
# set to true if something really fishy is going on with the sequences
stern_warning = True
def read_bin_fragment(struct_def, fileh, offset=0, data=None,
byte_padding=None):
'''It reads a chunk of a binary file.
You have to provide the struct, a file object, the offset (where to start
reading).
Also you can provide an optional dict that will be populated with the
extracted data.
If a byte_padding is given the number of bytes read will be a multiple of
that number, adding the required pad at the end.
It returns the number of bytes reads and the data dict.
'''
if data is None:
data = {}
#we read each item
bytes_read = 0
for item in struct_def:
#we go to the place and read
fileh.seek(offset + bytes_read)
n_bytes = struct.calcsize(item[1])
buffer = fileh.read(n_bytes)
read = struct.unpack('>' + item[1], buffer)
if len(read) == 1:
read = read[0]
data[item[0]] = read
bytes_read += n_bytes
#if there is byte_padding the bytes_to_read should be a multiple of the
#byte_padding
if byte_padding is not None:
pad = byte_padding
bytes_read = ((bytes_read + pad - 1) // pad) * pad
return (bytes_read, data)
def check_magic(magic):
''' It checks that the magic number of the file matches the sff magic.
(".sff"=0x2E736666=779314790)
'''
if magic != 779314790:
raise RuntimeError('This file does not seems to be an sff file.')
def check_version(version):
'''It checks that the version is supported, otherwise it raises an error.'''
supported = ('\x00', '\x00', '\x00', '\x01')
i = 0
for item in version:
if version[i] != supported[i]:
raise RuntimeError('SFF version not supported. Please contact the author of the software.')
i += 1
def read_header(fileh):
'''It reads the header from the sff file and returns a dict with the
information'''
#first we read the first part of the header
head_struct = [
('magic_number', 'I'),
('version', 'cccc'),
('index_offset', 'Q'),
('index_length', 'I'),
('number_of_reads', 'I'),
('header_length', 'H'),
('key_length', 'H'),
('number_of_flows_per_read', 'H'),
('flowgram_format_code', 'B'),
]
data = {}
first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh,
offset=0, data=data)
check_magic(data['magic_number'])
check_version(data['version'])
#now that we know the number_of_flows_per_read and the key_length
#we can read the second part of the header
struct2 = [
('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
('key_sequence', str(data['key_length']) + 'c')
]
read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes,
data=data)
return data
def read_sequence(header, fileh, fposition):
'''It reads one read from the sff file located at the fposition and
returns a dict with the information.'''
header_length = header['header_length']
index_offset = header['index_offset']
index_length = header['index_length']
#the sequence struct
read_header_1 = [
('read_header_length', 'H'),
('name_length', 'H'),
('number_of_bases', 'I'),
('clip_qual_left', 'H'),
('clip_qual_right', 'H'),
('clip_adapter_left', 'H'),
('clip_adapter_right', 'H'),
]
def read_header_2(name_length):
'''It returns the struct definition for the second part of the header'''
return [('name', str(name_length) +'c')]
def read_data(number_of_bases):
'''It returns the struct definition for the read data section.'''
#size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8}
if header['flowgram_format_code'] == 1:
flow_type = 'H'
else:
raise Error('file version not supported')
number_of_bases = str(number_of_bases)
return [
('flowgram_values', str(header['number_of_flows_per_read']) +
flow_type),
('flow_index_per_base', number_of_bases + 'B'),
('bases', number_of_bases + 'c'),
('quality_scores', number_of_bases + 'B'),
]
data = {}
#we read the first part of the header
bytes_read, data = read_bin_fragment(struct_def=read_header_1,
fileh=fileh, offset=fposition, data=data)
read_bin_fragment(struct_def=read_header_2(data['name_length']),
fileh=fileh, offset=fposition + bytes_read, data=data)
#we join the letters of the name
data['name'] = ''.join(data['name'])
offset = data['read_header_length']
#we read the sequence and the quality
read_data_st = read_data(data['number_of_bases'])
bytes_read, data = read_bin_fragment(struct_def=read_data_st,
fileh=fileh, offset=fposition + offset,
data=data, byte_padding=8)
#we join the bases
data['bases'] = ''.join(data['bases'])
return data['read_header_length'] + bytes_read, data
def sequences(fileh, header):
'''It returns a generator with the data for each read.'''
#now we can read all the sequences
fposition = header['header_length'] #position in the file
reads_read = 0
while True:
if fposition == header['index_offset']:
#we have to skip the index section
fposition += index_length
continue
else:
bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
fposition=fposition)
yield seq_data
fposition += bytes_read
reads_read += 1
if reads_read >= header['number_of_reads']:
break
def extract_read_info(data, fname):
'''Given the data for one read it returns 2 strs with the fasta seq, fasta
qual .'''
#seq and qual
seq = data['bases']
qual = data['quality_scores']
name_line = ''.join(('>', data['name'],'\n'))
seq = ''.join((name_line, seq, '\n'))
qual_line = ' '.join([str(q) for q in qual])
qual = ''.join((name_line, qual_line, '\n'))
return seq, qual
def get_read_data(data):
'''Given the data for one read it returns 2 strs with the fasta seq
and fasta qual.'''
#seq and qual
if config['mix_case']:
seq = sequence_case(data)
qual = data['quality_scores']
else :
seq = data['bases']
qual = data['quality_scores']
return seq, qual
def extract_reads_from_sff(config):
'''Given the configuration and the list of sff_files it renames the id's and
writes an new sff file with that data.
'''
sff_file=config['input_fname']
#check whether we can read the file
try:
if not os.path.getsize(sff_file):
raise RuntimeError('Empty file? : ' + sff_file)
fh = open(sff_file, 'r')
fh.close()
except:
raise RuntimeError('Unreadable file? : ' + str(sff_file))
sff_fh = open(sff_file, 'rb')
print "Working on '" + sff_file + "':"
seqcheckstore = ([])
sys.stdout.flush()
header_data = read_header(fileh=sff_fh)
print "Read Header done:", header_data
nseqs_sff=0
for seq_data in sequences(fileh=sff_fh, header=header_data):
seq, qual = extract_read_info(seq_data, sff_fh.name)
print seq
print qual
nseqs_sff += 1
print "Seq Header done:",nseqs_sff
sff_fh.close()
#sff_out_fh = open(config['output_fname'], w)
#seqcheckstore = ([])
#sff_out_fh.close()
##########################################################################
#
# BaCh: This block was shamelessly copied from
# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
# and then subsequently modified to read fasta correctly
# It's still not fool proof, but should be good enough
#
##########################################################################
class Fasta:
def __init__(self, name, sequence):
self.name = name
self.sequence = sequence
def read_fasta(file):
items = []
aninstance = Fasta('', '')
linenum = 0
for line in file:
linenum += 1
if line.startswith(">"):
if len(aninstance.sequence):
items.append(aninstance)
# name == all characters until the first whitespace
# (split()[0]) but without the starting ">" ([1:])
aninstance.name = line.split()[0][1:]
aninstance.sequence = ''
if len(aninstance.name) == 0:
raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?')
else:
if len(aninstance.name) == 0:
raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?')
aninstance.sequence += line.strip()
if len(aninstance.name) and len(aninstance.sequence):
items.append(aninstance)
return items
##########################################################################
def version_string ():
return "sff_change_id " + __version__
def read_config():
'''It reads the configuration options from the command line arguments and
it returns a dict with them.'''
from optparse import OptionParser, OptionGroup
usage = "usage: %prog [options] sff_file"
desc = "Rename read in a single SFF file"
parser = OptionParser(usage = usage, version = version_string(), description = desc)
parser.add_option("-i", "--in_name", dest="input_fname",
help="name for output file")
parser.add_option("-o", "--out_name", dest="output_fname",
help="name for output file")
#we parse the cmd line
(options, args) = parser.parse_args()
#we put the result in a dict
global config
config = {}
for property in dir(options):
if property[0] == '_' or property in ('ensure_value', 'read_file',
'read_module'):
continue
config[property] = getattr(options, property)
return config, args
##########################################################################
def main():
argv = sys.argv
if len(argv) == 1:
sys.argv.append('-h')
read_config()
sys.exit()
try:
config, args = read_config()
extract_reads_from_sff(config)
except (OSError, IOError, RuntimeError), errval:
print errval
return 1
if stern_warning:
return 1
return 0
if __name__ == "__main__":
sys.exit(main())
#!/usr/bin/python
""""
Script to append a suffix to an identifier.
see http://seqanswers.com/forums/showthread.php?p=8106
(c) Peter Cock, Jan van Haarst
"""
import sys
from optparse import OptionParser
sys.path = ["/home/jvh/code/biopython.nobackup/peterjc-biopython-fd061a56f32f16ae92fc8d076a371d869fb53c41"] + sys.path
#sys.path.append("/home/jvh/code/biopython.nobackup/peterjc-biopython-fd061a56f32f16ae92fc8d076a371d869fb53c41")
from Bio import SeqIO
def rename(record,suffix) :
"""Function to alter the record's identifier."""
record.id += suffix
return record
def rename_split(record,suffix) :
"""Function to alter the record's identifier."""
record.id += record.id.split(suffix)[0]
return record
def main():
parser = OptionParser(usage="%prog --input=input_filename --output=output_filename --ID-suffix=string_to_add_to_identifier", version="%prog 1.0")
parser.add_option("-i", "--input",
dest="input_filename",
help="Input filename",
metavar="FILE")
parser.add_option("-o", "--output",
dest="output_filename",
help="Output filename",
metavar="FILE")
parser.add_option("-s", "--ID-suffix",
dest="suffix",
help="String_to_add_to_identifier",
metavar="STRING")
(options, args) = parser.parse_args()
if not options.output_filename or not options.input_filename or not options.suffix:
parser.print_help()
sys.exit(1)
print 'Input:\t', options.input_filename
print 'Output:\t', options.output_filename
print 'Suffix:\t', options.suffix
#Python generator expression, only one record in memory at a time:
records = (rename(rec,options.suffix) for rec in SeqIO.parse(open(options.input_filename,"rb"),"sff"))
#records = (rename_split(rec,options.suffix) for rec in SeqIO.parse(open(options.input_filename,"rb"),"sff"))
#This will not write the Roche XML manifest!
handle = open(options.output_filename, "wb")
SeqIO.write(records, handle, "sff")
handle.close()
if __name__ == "__main__":
main()
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment