Commit 8fef00b8 authored by Kruistum's avatar Kruistum
Browse files

Initial commit

\ No newline at end of file
Whole genome alignment pipeline
MUMmer 4.x (; make sure you added the locations of binaries to your PATH
SLURM (made to run on a cluster with SLURM as queue manager)
in your working directory, you will need fasta files of your genomes, fasta indexes of these files (for f in *.fa; do samtools faidx $f; done), and a seqfile with this format:
*species_name fastafile.fa
species_name2 fastafile2.fa
species_name3 fastafile3.fa
the star is to indicate the reference species.
Note that the pipeline does not handle dots in the fasta headers well, so if you have them please replace with something else (like _)
then, inside this directory, do:
ln -s /path/to/whole_genome_alignment/* .
to fetch all the necessary scripts
after this you can submit a whole genome alignment by using:
python --seqfile your_seqfile.txt --prefix output_prefix --split_in_this_amount max_amount_of_jobs
Note that the theoretical maximum amount of jobs is determined by the amount of fasta entries in your reference fasta (chromosomes if these are completely assembled)
the script will submit jobs to SLURM corresponding to the different pipeline stages, with dependencies so they are scheduled after the previous stage has finished.
Stages are:
1. alignment (MUMmer)
2. merging then splitting the alignments by scaffolds of the reference species
3. merging pairwise alignments into multi alignments for these scaffolds (e.g. local re-alignment)
4. concatenate results and cleanup
you can keep an eye on the jobs by typing squeue -u yourusername; if you see any jobs with reason DependencyNeverSatisfied then something went wrong
good luck ;)
# -*- coding: utf-8 -*-
Created on Tue Jul 30 15:40:58 2019
@author: kruis015
import sys
from functions import parse_maf, write_maf
import argparse
parser = argparse.ArgumentParser(description='add species names to a maf file')
parser.add_argument("--prefix", help="multi alignmer prefix", type=str)
parser.add_argument("--seqfile", help="multi aligner seqfile", type=str)
parser.add_argument("--ref", help="reference species", type=str)
args = parser.parse_args()
seqfile = args.seqfile
maf = args.prefix+'_sorted.maf'
mafout = args.prefix+'_sorted_speciesnames.maf'
def add_speciesname(parsedmaf, scafdict):
for block in parsedmaf:
block.reference.contig = scafdict[block.reference.contig]+'.'+block.reference.contig
except KeyError:
block.reference.contig = 'unknown.'+block.reference.contig
for line in block.lines:
line.contig = scafdict[line.contig]+'.'+line.contig
except KeyError:
line.contig = 'unknown.'+line.contig
yield block
scaflist = {}
with open(seqfile) as f:
for line in f:
species, fasta = line.strip().split()
species = species.strip('*')
faidx = fasta+'.fai'
with open(faidx) as f2:
for line in f2:
scaflist[line.split()[0]] = species
parsedmaf = parse_maf(maf, args.ref)
write_maf(add_speciesname(parsedmaf, scaflist), mafout)
\ No newline at end of file
File added
#SBATCH --job-name=joinmaf
#SBATCH --time=4-0
#SBATCH --mem=8000
#SBATCH --tmp=1000
#SBATCH --constraint=gen2
#SBATCH --output=joinmaflog_%j.txt
#SBATCH --error=error_output_joinmaflog_%j.txt
while getopts "i:o:r:" opt; do
case $opt in
echo "Invalid option: -$OPTARG"
python --ref $REF --infile $INFILE --out $OUT
# -*- coding: utf-8 -*-
Created on Thu Jul 11 11:17:55 2019
@author: kruis015
rewrite two or more maf files that are concatenated and sorted
partially rewritten from the LAST maf-join utility to allow local realignment
import sys
from functions import *
import subprocess
import argparse
parser = argparse.ArgumentParser(description='rewrite two maf files that are concatenated and sorted (based on maf-join)')
parser.add_argument("--infile", help="maf file", type=str)
parser.add_argument("--out", help="output file", type=str)
parser.add_argument("--ref", help="name of reference species")
args = parser.parse_args()
class cd:
"""Context manager for changing the current working directory"""
def __init__(self, newPath):
self.newPath = os.path.expanduser(newPath)
def __enter__(self):
self.savedPath = os.getcwd()
def __exit__(self, etype, value, traceback):
def countNonGaps(s): return len(s) - s.count('-')
def nthNonGap(s, n):
'''Get the start position of the n-th non-gap.'''
for i, x in enumerate(s):
if x != '-':
if n == 0: return i
n -= 1
raise ValueError('non-gap not found')
def nthLastNonGap(s, n):
'''Get the end position of the n-th last non-gap.'''
return len(s) - nthNonGap(s[::-1], n)
def mafSlice(block, alnBeg, alnEnd):
'''Return a slice of a MAF block, using coordinates in the alignment.'''
newblock = AlignmentBlock('newblock')
refstart = block.reference.startpos + countNonGaps(block.reference.seq[:alnBeg])
refend = block.reference.endpos - countNonGaps(block.reference.seq[alnEnd:])
refseq = block.reference.seq[alnBeg:alnEnd]
newblock.reference = AlignmentBlock.ReferenceLine(block.reference.species, block.reference.contig, refstart, refend - refstart,
block.reference.strand, block.reference.contiglength, refseq)
for line in block.lines:
linestart = line.startpos + countNonGaps(line.seq[:alnBeg])
lineend = line.endpos - countNonGaps(line.seq[alnEnd:])
lineseq = line.seq[alnBeg:alnEnd]
newblock.lines.append(AlignmentBlock.AlignmentLine(line.species, line.contig, linestart, lineend - linestart,
line.strand, line.contiglength, lineseq))
return newblock
def mafSliceTopSeq(block, newstartpos, newendpos):
'''Return a slice of a MAF block, using coordinates in the top sequence.'''
lettersFromBeg = newstartpos - block.reference.startpos
lettersFromEnd = block.reference.endpos - newendpos
alnBeg = nthNonGap(block.reference.seq, lettersFromBeg)
alnEnd = nthLastNonGap(block.reference.seq, lettersFromEnd)
return mafSlice(block, alnBeg, alnEnd)
def mafJoin(blocklist):
'''join two or more maf blocks into one big maf block
The blocks have to share their reference line coordinates'''
assert len(blocklist) > 1
firstblock = blocklist[0]
fstart = firstblock.reference.startpos
fend = firstblock.reference.endpos
fcontig = firstblock.reference.contig
fseq = firstblock.reference.seq.replace('-','')
for block in blocklist[1:]:
assert block.reference.startpos == fstart
assert block.reference.endpos == fend
assert block.reference.contig == fcontig
assert block.reference.seq.replace('-','') == fseq
for line in block.lines:
firstblock.lines.append(AlignmentBlock.AlignmentLine(line.species, line.contig, line.startpos, line.length,
line.strand, line.contiglength, line.seq))
#get sequences for realignment if block length > 1 nucleotide
realigndict = {}
realigndict['ref'] = firstblock.reference.seq.replace('-','')
for i, line in enumerate(firstblock.lines):
realigndict['line'+str(i)] = line.seq.replace('-','')
len1 = [len(s) == 1 for s in realigndict.values()]
if False in len1:
#now realign and store in block
write_fasta(realigndict, 'temp.fa')
aln = '/lustre/backup/WUR/ABGC/kruis015/heterandria/tools/mafft-linux64/mafft.bat --auto temp.fa > temp_aligned.fa', shell=True)
aligned_seqs = read_fasta('temp_aligned.fa')
firstblock.reference.seq = aligned_seqs['ref']
for i, line in enumerate(firstblock.lines):
line.seq = aligned_seqs['line'+str(i)]
return firstblock
def handle_overlaps(overlapping):
'''converts a set of overlapping blocks into a set of non-overlapping blocks'''
all_unique_boundaries = []
for block in overlapping:
if block.reference.startpos not in all_unique_boundaries:
if block.reference.endpos not in all_unique_boundaries:
all_unique_boundaries = sorted(all_unique_boundaries)
newblocks = []
for i, b in enumerate(all_unique_boundaries[:-1]):
slices = []
for block in overlapping:
if block.reference.startpos <= b and block.reference.endpos >= all_unique_boundaries[i+1]:
slices.append(mafSliceTopSeq(block, b, all_unique_boundaries[i+1]))
if len(slices) == 1:
elif len(slices) > 1:
print('something went wrong')
return newblocks
def yield_blocks(alignments):
#function that yields blocks after the respective portion of the genome is handled by the overlap handling function
#works with a sorted maf file and a cache system: blocks are on first instance added to the cache. New blocks are
#compared to blocks in the cache. If they overlap, the block pairs are replaced with sets of non-overlapping blocks
#blocks in cache that have lower coordinates that the current block are yielded and can be written to the new maf file
cache = []
for block in alignments:
if 'n' in block.reference.seq:
fl = block.reference
removelist = []
newblocks = []
for block3 in cache: #first check if blocks can be yielded
if (block3.reference.endpos < fl.startpos and block3.reference.contig == fl.contig) or block3.reference.contig != fl.contig:
currentblock = cache.pop(cache.index(block3))
yield currentblock
overlapping = [block] #then collect overlapping blocks
for i, block2 in enumerate(cache):
fl2 = block2.reference
if fl.startpos < fl2.endpos and fl.contig == fl2.contig:
if len(overlapping) > 1: #if there are overlapping blocks, rewrite;
newblocks = handle_overlaps(overlapping)
except (AssertionError, ValueError):
print('skipping one set of blocks')
if len(removelist) == 0:
elif len(removelist) > 0:
cache2 = [block for block in cache if block not in removelist]
cache = cache2
cache += newblocks
#in the end, clean cache
for block in cache:
yield block
newdir = args.infile.split('.')[0]
cmd0 = 'mkdir %s' % newdir, shell=True)
with cd('./'+newdir):
cmd1 = 'ln -s ../%s' % args.infile, shell=True)
alignments = parse_maf(args.infile, args.ref)
write_maf(yield_blocks(alignments), args.out)
cmd2 = 'rm %s' % args.infile, shell=True)
cmd3 = 'ln -s %s/%s' % (newdir, args.out), shell=True)
\ No newline at end of file
#! /bin/sh
# Sort MAF-format alignments by sequence name, then strand, then start
# position, then end position, of the top sequence. Also, merge
# identical alignments. Comment lines starting with "#" are written
# at the top, in unchanged order. If option "-d" is specified, then
# alignments that appear only once are omitted (like uniq -d).
# Minor flaws, that do not matter for typical MAF input:
# 1) It might not work if the input includes TABs.
# 2) Preceding whitespace is considered part of the sequence name. I
# want to use sort -b, but it seems to be broken in different ways for
# different versions of sort!
# 3) Alignments with differences in whitespace are considered
# non-identical.
# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.
# Make "sort" use a standard ordering:
export LC_ALL
while getopts hdn: opt
case $opt in
h) cat <<EOF
Usage: $(basename $0) [options] my-alignments.maf
-h show this help message and exit
-d only print duplicate alignments
-n sort by the n-th sequence (default: 1)
d) uniqOpt=2
n) whichSequence="$OPTARG"
shift $((OPTIND - 1))
baseField=$((6 * $whichSequence))
a=$(($baseField - 4))
b=$(($baseField - 1))
c=$(($baseField - 3))
d=$(($baseField - 2))
# 1) Add digits to "#" lines, so that sorting won't change their order.
# 2) Replace spaces, except in "s" lines.
# 3) Join each alignment into one big line.
perl -pe '
y/ /\a/ unless /^s/;
y/\n/\b/ if /^\w/;
' "$@" |
sort -k$a -k$b -k${c}n -k${d}n | # sort the lines
# Print only the first (or second) of each run of identical lines:
perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '$uniqOpt |
# 1) Remove the digits from "#" lines.
# 2) Restore spaces and newlines.
perl -pe '
y/\a\b/ \n/;
#SBATCH --job-name=merge_mafs
#SBATCH --time=100
#SBATCH --mem=4000
#SBATCH --tmp=1000
#SBATCH --constraint=gen2
#SBATCH --output=anctest_%j.txt
#SBATCH --error=error_output_anctest_%j.txt
while getopts "p:s:r:" opt; do
case $opt in
echo "Invalid option: -$OPTARG"
#concatenate results
grep -vh '#' *_rewritten.maf >> ${PREFIX}.maf
./maf-sort ${PREFIX}.maf > ${PREFIX}_sorted.maf
#cleanup intermediate files
rm ${PREFIX}.maf
rm *joinmaflog*
rm -r splitmaf_*
#add species names
python --prefix $PREFIX --seqfile $SEQFILE --ref $REF
\ No newline at end of file
# -*- coding: utf-8 -*-
Created on Thu Jul 11 16:28:27 2019
@author: kruis015
import sys
from functions import *
import subprocess
import argparse
parser = argparse.ArgumentParser(description='rewrite two maf files that are concatenated and sorted (based on maf-join)')
parser.add_argument("--ref", help="name of reference species")
parser.add_argument("--numjobs", help="amount of available jobs", type=int)
args = parser.parse_args()
def mindict(d):
rev = {}
for p in d:
rev[d[p]] = p
minimum = min([x for x in d.values()])
return rev[minimum]
def divide_work(maffile, refspecies, num_processes):
'''divide the work between processes, based on scaffold lengths'''
scafs = {}
for block in parse_maf(maffile, refspecies):
if block.reference.contig not in scafs:
scafs[block.reference.contig] = block.reference.contiglength
processes = {}
counter = {}
for i in range(num_processes):
processes[str(i)] = []
counter[str(i)] = 0
for s in scafs:
counter[mindict(counter)] += scafs[s]
return processes
cmd1 = 'grep -hv "#" *%s.maf >> alltogether.maf' % args.ref, shell=True)
cmd2 = './maf-sort alltogether.maf > alltogether_sorted.maf', shell=True)
cmd3 = 'rm alltogether.maf', shell=True)
processes = divide_work('alltogether_sorted.maf', args.ref, args.numjobs)
for p in processes:
alignments = parse_maf('alltogether_sorted.maf', args.ref, select=processes[p])
write_maf(alignments, 'splitmaf_'+str(p)+'.maf')
cmd4 = 'rm alltogether_sorted.maf', shell=True)
#SBATCH --job-name=splitmaf
#SBATCH --time=1-0
#SBATCH --mem=8000
#SBATCH --tmp=1000
#SBATCH --constraint=gen2
#SBATCH --output=split_%j.txt
#SBATCH --error=error_output_split_%j.txt
while getopts "n:r:" opt; do
case $opt in
echo "Invalid option: -$OPTARG"
python --ref $REF --numjobs $NUMJOBS
# -*- coding: utf-8 -*-
Created on Wed Jul 10 09:09:11 2019
@author: kruis015
import argparse
import os
import subprocess
import sys
import math
parser = argparse.ArgumentParser(description='pipeline for multi genome alignment')
parser.add_argument("--seqfile", help="sequence file in progressivecactus format (mandatory)", type=str)
parser.add_argument("--prefix", help="output prefix (mandatory)", type=str)
parser.add_argument("--split_in_this_amount", help="split maf merging in this amount of jobs (default: %(default)d)", type=int, default=25)
parser.add_argument("--alignment_length", help="filter out alignments shorter than this number (default: %(default)d)", type=int, default=500)
args = parser.parse_args()
cwd = os.getcwd()
ref_species = ''
ref_fasta = ''
others = {}
with open(args.seqfile) as f:
for line in f:
species, file = line.strip().split()
if line[0] == '*':
ref_species = species.strip('*')
ref_fasta = file
others[species] = file
except ValueError:
if ref_species == '':
print('please indicate your reference species with a * before the species name')
jobcounter = 0
jobids = []
for species in others: #launch alignment jobs
cmd = 'sbatch -r %s -q %s -l %s -o %s' % (ref_fasta, others[species], args.alignment_length, species+'_to_'+ref_species)
jobstr = subprocess.check_output(cmd, shell=True)
jobcounter += len(jobids)
depstr = 'afterok'
for jobid in jobids:
depstr += ':'+jobid
cmd2 = 'sbatch --dependency=%s -n %s -r %s' % (depstr, args.split_in_this_amount, ref_species)
jobstr = subprocess.check_output(cmd2, shell=True)
jobid2 = jobstr.strip().split()[-1]
jobcounter += 1
jobids3 = []
#join maf files:
for j in range(args.split_in_this_amount):
cmd3 = 'sbatch --dependency=afterok:%s -i %s -o %s -r %s' % (jobid2, 'splitmaf_'+str(j)+'.maf', 'splitmaf_'+str(j)+'_rewritten.maf', ref_species)
jobstr = subprocess.check_output(cmd3, shell=True)
jobcounter += len(jobids3)
#finally merge results
depstr = 'afterok'
for jobid in jobids3:
depstr += ':'+jobid
cmd2 = 'sbatch --dependency=%s -p %s -r %s -s %s' % (depstr, args.prefix, ref_species, args.seqfile), shell=True)
jobcounter += 1
print('submitted %s jobs to slurm' % jobcounter)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment