Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
sysop-bioinformatics
management_scripts
Commits
63fb3517
Commit
63fb3517
authored
Sep 09, 2009
by
Haarst, Jan van
Browse files
scripts to work with sff files
parent
e9b45830
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
sff/sff_change_id.py
0 → 100755
View file @
63fb3517
#!/usr/bin/python
'''
This software adds information to the identifiers of reads in an sff file
This way, different sources of reads can be easily tracked within assemblies,
mappings, etc.
'''
# copyright Jan van Haarst
# Plant Research International, Wageningen UR
# this was build upon sff_extract.py,
#copyright Jose Blanca and Bastien Chevreux
#COMAV institute, Universidad Politecnica de Valencia (UPV)
#Valencia, Spain
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#You should have received a copy of the GNU General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
__author__
=
'Jan van Haarst'
__copyright__
=
'Copyright 2009, Jan van Haarst, Plant Research International, Wageningen UR'
__license__
=
'GPLv3 or later'
__version__
=
'0.0.1'
__email__
=
'jan.vanhaarst@wur.nl'
__status__
=
'beta'
import
struct
import
sys
import
os
import
subprocess
import
tempfile
fake_sff_name
=
'fake_sff_name'
# readname as key: lines with matches from SSAHA, one best match
ssahapematches
=
{}
# linker readname as key: length of linker sequence
linkerlengths
=
{}
# set to true if something really fishy is going on with the sequences
stern_warning
=
True
def
read_bin_fragment
(
struct_def
,
fileh
,
offset
=
0
,
data
=
None
,
byte_padding
=
None
):
'''It reads a chunk of a binary file.
You have to provide the struct, a file object, the offset (where to start
reading).
Also you can provide an optional dict that will be populated with the
extracted data.
If a byte_padding is given the number of bytes read will be a multiple of
that number, adding the required pad at the end.
It returns the number of bytes reads and the data dict.
'''
if
data
is
None
:
data
=
{}
#we read each item
bytes_read
=
0
for
item
in
struct_def
:
#we go to the place and read
fileh
.
seek
(
offset
+
bytes_read
)
n_bytes
=
struct
.
calcsize
(
item
[
1
])
buffer
=
fileh
.
read
(
n_bytes
)
read
=
struct
.
unpack
(
'>'
+
item
[
1
],
buffer
)
if
len
(
read
)
==
1
:
read
=
read
[
0
]
data
[
item
[
0
]]
=
read
bytes_read
+=
n_bytes
#if there is byte_padding the bytes_to_read should be a multiple of the
#byte_padding
if
byte_padding
is
not
None
:
pad
=
byte_padding
bytes_read
=
((
bytes_read
+
pad
-
1
)
//
pad
)
*
pad
return
(
bytes_read
,
data
)
def
check_magic
(
magic
):
''' It checks that the magic number of the file matches the sff magic.
(".sff"=0x2E736666=779314790)
'''
if
magic
!=
779314790
:
raise
RuntimeError
(
'This file does not seems to be an sff file.'
)
def
check_version
(
version
):
'''It checks that the version is supported, otherwise it raises an error.'''
supported
=
(
'
\x00
'
,
'
\x00
'
,
'
\x00
'
,
'
\x01
'
)
i
=
0
for
item
in
version
:
if
version
[
i
]
!=
supported
[
i
]:
raise
RuntimeError
(
'SFF version not supported. Please contact the author of the software.'
)
i
+=
1
def
read_header
(
fileh
):
'''It reads the header from the sff file and returns a dict with the
information'''
#first we read the first part of the header
head_struct
=
[
(
'magic_number'
,
'I'
),
(
'version'
,
'cccc'
),
(
'index_offset'
,
'Q'
),
(
'index_length'
,
'I'
),
(
'number_of_reads'
,
'I'
),
(
'header_length'
,
'H'
),
(
'key_length'
,
'H'
),
(
'number_of_flows_per_read'
,
'H'
),
(
'flowgram_format_code'
,
'B'
),
]
data
=
{}
first_bytes
,
data
=
read_bin_fragment
(
struct_def
=
head_struct
,
fileh
=
fileh
,
offset
=
0
,
data
=
data
)
check_magic
(
data
[
'magic_number'
])
check_version
(
data
[
'version'
])
#now that we know the number_of_flows_per_read and the key_length
#we can read the second part of the header
struct2
=
[
(
'flow_chars'
,
str
(
data
[
'number_of_flows_per_read'
])
+
'c'
),
(
'key_sequence'
,
str
(
data
[
'key_length'
])
+
'c'
)
]
read_bin_fragment
(
struct_def
=
struct2
,
fileh
=
fileh
,
offset
=
first_bytes
,
data
=
data
)
return
data
def
read_sequence
(
header
,
fileh
,
fposition
):
'''It reads one read from the sff file located at the fposition and
returns a dict with the information.'''
header_length
=
header
[
'header_length'
]
index_offset
=
header
[
'index_offset'
]
index_length
=
header
[
'index_length'
]
#the sequence struct
read_header_1
=
[
(
'read_header_length'
,
'H'
),
(
'name_length'
,
'H'
),
(
'number_of_bases'
,
'I'
),
(
'clip_qual_left'
,
'H'
),
(
'clip_qual_right'
,
'H'
),
(
'clip_adapter_left'
,
'H'
),
(
'clip_adapter_right'
,
'H'
),
]
def
read_header_2
(
name_length
):
'''It returns the struct definition for the second part of the header'''
return
[(
'name'
,
str
(
name_length
)
+
'c'
)]
def
read_data
(
number_of_bases
):
'''It returns the struct definition for the read data section.'''
#size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8}
if
header
[
'flowgram_format_code'
]
==
1
:
flow_type
=
'H'
else
:
raise
Error
(
'file version not supported'
)
number_of_bases
=
str
(
number_of_bases
)
return
[
(
'flowgram_values'
,
str
(
header
[
'number_of_flows_per_read'
])
+
flow_type
),
(
'flow_index_per_base'
,
number_of_bases
+
'B'
),
(
'bases'
,
number_of_bases
+
'c'
),
(
'quality_scores'
,
number_of_bases
+
'B'
),
]
data
=
{}
#we read the first part of the header
bytes_read
,
data
=
read_bin_fragment
(
struct_def
=
read_header_1
,
fileh
=
fileh
,
offset
=
fposition
,
data
=
data
)
read_bin_fragment
(
struct_def
=
read_header_2
(
data
[
'name_length'
]),
fileh
=
fileh
,
offset
=
fposition
+
bytes_read
,
data
=
data
)
#we join the letters of the name
data
[
'name'
]
=
''
.
join
(
data
[
'name'
])
offset
=
data
[
'read_header_length'
]
#we read the sequence and the quality
read_data_st
=
read_data
(
data
[
'number_of_bases'
])
bytes_read
,
data
=
read_bin_fragment
(
struct_def
=
read_data_st
,
fileh
=
fileh
,
offset
=
fposition
+
offset
,
data
=
data
,
byte_padding
=
8
)
#we join the bases
data
[
'bases'
]
=
''
.
join
(
data
[
'bases'
])
return
data
[
'read_header_length'
]
+
bytes_read
,
data
def
sequences
(
fileh
,
header
):
'''It returns a generator with the data for each read.'''
#now we can read all the sequences
fposition
=
header
[
'header_length'
]
#position in the file
reads_read
=
0
while
True
:
if
fposition
==
header
[
'index_offset'
]:
#we have to skip the index section
fposition
+=
index_length
continue
else
:
bytes_read
,
seq_data
=
read_sequence
(
header
=
header
,
fileh
=
fileh
,
fposition
=
fposition
)
yield
seq_data
fposition
+=
bytes_read
reads_read
+=
1
if
reads_read
>=
header
[
'number_of_reads'
]:
break
def
extract_read_info
(
data
,
fname
):
'''Given the data for one read it returns 2 strs with the fasta seq, fasta
qual .'''
#seq and qual
seq
=
data
[
'bases'
]
qual
=
data
[
'quality_scores'
]
name_line
=
''
.
join
((
'>'
,
data
[
'name'
],
'
\n
'
))
seq
=
''
.
join
((
name_line
,
seq
,
'
\n
'
))
qual_line
=
' '
.
join
([
str
(
q
)
for
q
in
qual
])
qual
=
''
.
join
((
name_line
,
qual_line
,
'
\n
'
))
return
seq
,
qual
def
get_read_data
(
data
):
'''Given the data for one read it returns 2 strs with the fasta seq
and fasta qual.'''
#seq and qual
if
config
[
'mix_case'
]:
seq
=
sequence_case
(
data
)
qual
=
data
[
'quality_scores'
]
else
:
seq
=
data
[
'bases'
]
qual
=
data
[
'quality_scores'
]
return
seq
,
qual
def
extract_reads_from_sff
(
config
):
'''Given the configuration and the list of sff_files it renames the id's and
writes an new sff file with that data.
'''
sff_file
=
config
[
'input_fname'
]
#check whether we can read the file
try
:
if
not
os
.
path
.
getsize
(
sff_file
):
raise
RuntimeError
(
'Empty file? : '
+
sff_file
)
fh
=
open
(
sff_file
,
'r'
)
fh
.
close
()
except
:
raise
RuntimeError
(
'Unreadable file? : '
+
str
(
sff_file
))
sff_fh
=
open
(
sff_file
,
'rb'
)
print
"Working on '"
+
sff_file
+
"':"
seqcheckstore
=
([])
sys
.
stdout
.
flush
()
header_data
=
read_header
(
fileh
=
sff_fh
)
print
"Read Header done:"
,
header_data
nseqs_sff
=
0
for
seq_data
in
sequences
(
fileh
=
sff_fh
,
header
=
header_data
):
seq
,
qual
=
extract_read_info
(
seq_data
,
sff_fh
.
name
)
print
seq
print
qual
nseqs_sff
+=
1
print
"Seq Header done:"
,
nseqs_sff
sff_fh
.
close
()
#sff_out_fh = open(config['output_fname'], w)
#seqcheckstore = ([])
#sff_out_fh.close()
##########################################################################
#
# BaCh: This block was shamelessly copied from
# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
# and then subsequently modified to read fasta correctly
# It's still not fool proof, but should be good enough
#
##########################################################################
class
Fasta
:
def
__init__
(
self
,
name
,
sequence
):
self
.
name
=
name
self
.
sequence
=
sequence
def
read_fasta
(
file
):
items
=
[]
aninstance
=
Fasta
(
''
,
''
)
linenum
=
0
for
line
in
file
:
linenum
+=
1
if
line
.
startswith
(
">"
):
if
len
(
aninstance
.
sequence
):
items
.
append
(
aninstance
)
# name == all characters until the first whitespace
# (split()[0]) but without the starting ">" ([1:])
aninstance
.
name
=
line
.
split
()[
0
][
1
:]
aninstance
.
sequence
=
''
if
len
(
aninstance
.
name
)
==
0
:
raise
RuntimeError
(
file
.
name
+
': no name in line '
+
str
(
linenum
)
+
'?'
)
else
:
if
len
(
aninstance
.
name
)
==
0
:
raise
RuntimeError
(
file
.
name
+
': no sequence header at line '
+
str
(
linenum
)
+
'?'
)
aninstance
.
sequence
+=
line
.
strip
()
if
len
(
aninstance
.
name
)
and
len
(
aninstance
.
sequence
):
items
.
append
(
aninstance
)
return
items
##########################################################################
def
version_string
():
return
"sff_change_id "
+
__version__
def
read_config
():
'''It reads the configuration options from the command line arguments and
it returns a dict with them.'''
from
optparse
import
OptionParser
,
OptionGroup
usage
=
"usage: %prog [options] sff_file"
desc
=
"Rename read in a single SFF file"
parser
=
OptionParser
(
usage
=
usage
,
version
=
version_string
(),
description
=
desc
)
parser
.
add_option
(
"-i"
,
"--in_name"
,
dest
=
"input_fname"
,
help
=
"name for output file"
)
parser
.
add_option
(
"-o"
,
"--out_name"
,
dest
=
"output_fname"
,
help
=
"name for output file"
)
#we parse the cmd line
(
options
,
args
)
=
parser
.
parse_args
()
#we put the result in a dict
global
config
config
=
{}
for
property
in
dir
(
options
):
if
property
[
0
]
==
'_'
or
property
in
(
'ensure_value'
,
'read_file'
,
'read_module'
):
continue
config
[
property
]
=
getattr
(
options
,
property
)
return
config
,
args
##########################################################################
def
main
():
argv
=
sys
.
argv
if
len
(
argv
)
==
1
:
sys
.
argv
.
append
(
'-h'
)
read_config
()
sys
.
exit
()
try
:
config
,
args
=
read_config
()
extract_reads_from_sff
(
config
)
except
(
OSError
,
IOError
,
RuntimeError
),
errval
:
print
errval
return
1
if
stern_warning
:
return
1
return
0
if
__name__
==
"__main__"
:
sys
.
exit
(
main
())
sff/sff_change_id_biopython.py
0 → 100755
View file @
63fb3517
#!/usr/bin/python
""""
Script to append a suffix to an identifier.
see http://seqanswers.com/forums/showthread.php?p=8106
(c) Peter Cock, Jan van Haarst
"""
import
sys
from
optparse
import
OptionParser
sys
.
path
=
[
"/home/jvh/code/biopython.nobackup/peterjc-biopython-fd061a56f32f16ae92fc8d076a371d869fb53c41"
]
+
sys
.
path
#sys.path.append("/home/jvh/code/biopython.nobackup/peterjc-biopython-fd061a56f32f16ae92fc8d076a371d869fb53c41")
from
Bio
import
SeqIO
def
rename
(
record
,
suffix
)
:
"""Function to alter the record's identifier."""
record
.
id
+=
suffix
return
record
def
rename_split
(
record
,
suffix
)
:
"""Function to alter the record's identifier."""
record
.
id
+=
record
.
id
.
split
(
suffix
)[
0
]
return
record
def
main
():
parser
=
OptionParser
(
usage
=
"%prog --input=input_filename --output=output_filename --ID-suffix=string_to_add_to_identifier"
,
version
=
"%prog 1.0"
)
parser
.
add_option
(
"-i"
,
"--input"
,
dest
=
"input_filename"
,
help
=
"Input filename"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"-o"
,
"--output"
,
dest
=
"output_filename"
,
help
=
"Output filename"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"-s"
,
"--ID-suffix"
,
dest
=
"suffix"
,
help
=
"String_to_add_to_identifier"
,
metavar
=
"STRING"
)
(
options
,
args
)
=
parser
.
parse_args
()
if
not
options
.
output_filename
or
not
options
.
input_filename
or
not
options
.
suffix
:
parser
.
print_help
()
sys
.
exit
(
1
)
print
'Input:
\t
'
,
options
.
input_filename
print
'Output:
\t
'
,
options
.
output_filename
print
'Suffix:
\t
'
,
options
.
suffix
#Python generator expression, only one record in memory at a time:
records
=
(
rename
(
rec
,
options
.
suffix
)
for
rec
in
SeqIO
.
parse
(
open
(
options
.
input_filename
,
"rb"
),
"sff"
))
#records = (rename_split(rec,options.suffix) for rec in SeqIO.parse(open(options.input_filename,"rb"),"sff"))
#This will not write the Roche XML manifest!
handle
=
open
(
options
.
output_filename
,
"wb"
)
SeqIO
.
write
(
records
,
handle
,
"sff"
)
handle
.
close
()
if
__name__
==
"__main__"
:
main
()
sff/sff_extract
0 → 100755
View file @
63fb3517
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment