Runyasra.py
From OpenWetWare
Jump to navigationJump to search
Copy and past the following into a text editor and save it as runyasra.py to use the script.
#!/usr/bin/env python
#Usage: python runyasra.py [options] <Reads> <Reference>
#
#Options:
# --version show program's version number and exit
# -h, --help show this help message and exit
# -t READ_TYPE, --read-type=READ_TYPE
# Specify the type of reads. (Default: solexa)
# -o ORIENTATION, --orientation=ORIENTATION
# Specify orientation of the sequence. (Default:
# circular)
# -p PERCENT_IDENTITY, --percent-identity=PERCENT_IDENTITY
# The percent identity (PID in yasra). The settings
# correspond to different percent values depending on
# the read type (-t). (Defalt: same)
# -m PATH, --make-path=PATH
# Specify path to external makefile used by YASRA.
# (Default: use the makefile built in to runyasra)
# -s, --single-step Activate yasra's single_step option (Default: run
# yasra normally)
# -v, --verbose Print relevant statistics and progresss reports.
# (Default: run silently)
# -r, --remove-dots-reads
# Replace dots with Ns in the reads file before runnning
# yasra. The modified file is placed in the output
# filder.(Default: create a link to the original file)
# -f, --remove-dots-ref
# Replace dots with Ns in the reference file before
# runnning yasra. The modified file is placed in the
# output filder.(Default: create a link to the original
# file)
# -d, --dos2unix-ref Run dos2unix on the reference file before yasra. The
# modified file is placed in the output filder.
# (Default: create a link to the priginal reference)
def readFile(filePath):
fileHandle = open(filePath, 'r')
fileData = fileHandle.read()
fileHandle.close()
return fileData
def writeFile(filePath, fileData):
fileHandle = open(filePath, 'w')
fileHandle.write(fileData)
fileHandle.close()
#Error handling function#####################################################################################################
def errorExit(message=None,exitStatus=1):
'''Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)'''
if message:
print 'Error: ' + message
else:
for line in debugLog: print line #print debug log
sys.exit(exitStatus)
#############################################################################################################################
###Imports / Variable Initilization##########################################################################################
import os, sys, copy, string
from datetime import *
from optparse import *
from subprocess import *
timeStamp = datetime.now().ctime().replace(' ','-') #Ex: 'Mon-Jun-14-11:08:55-2010'
commandLine = copy.deepcopy(sys.argv)
yasraPath = '/local/cluster/YASRA/bin'
cwd = os.getcwd()
progName, progVersion = ('runyasra','1.02')
progUsage = 'python %s [options] <Reads> <Reference>' % progName
#############################################################################################################################
###Command Line Parser#######################################################################################################
cmndLineParser = OptionParser(usage=progUsage, version="Version %s" % progVersion)
cmndLineParser.add_option('-t', '--read-type', action='store', default='solexa', type='choice', choices=['solexa','454'], help="Specify the type of reads. (Default: solexa)")
cmndLineParser.add_option('-o', '--orientation', action='store', default='circular', type='choice', choices=['circular','linear'], help="Specify orientation of the sequence. (Default: circular")
cmndLineParser.add_option('-p', '--percent-identity', action='store', default='same', type='choice', choices=['same','high','medium','low','verylow','desperate'], help="The percent identity (PID in yasra). The settings correspond to different percent values depending on the read type (-t). (Defalt: same)")
cmndLineParser.add_option('-m', '--makefile-path', action='store', default=None, type='string', metavar='PATH', help="Specify path to external makefile used by YASRA. (Default: use the makefile built in to runyasra)")
cmndLineParser.add_option('-s', '--single-step', action='store_true', default=False, help="Activate yasra's single_step option (Default: run yasra normally)")
cmndLineParser.add_option('-v', '--verbose', action='store_true', default=False, help='Print relevant statistics and progresss reports. (Default: run silently)')
cmndLineParser.add_option('-r', '--remove-dots-reads', action='store_true', default=False, help='Replace dots with Ns in the reads file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)')
cmndLineParser.add_option('-f', '--remove-dots-ref', action='store_true', default=False, help='Replace dots with Ns in the reference file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)')
cmndLineParser.add_option('-d', '--dos2unix-ref', action='store_true', default=False, help='Run dos2unix on the reference file before yasra. The modified file is placed in the output filder. (Default: create a link to the priginal reference)')
(options, args) = cmndLineParser.parse_args(commandLine)
#############################################################################################################################
###Command Line Interpretation###############################################################################################
argNum = len(args) - 1 #counts the amount of arguments, negating the 'runyasra' at the start of the command line
if argNum != 2: errorExit('runyasra takes exactly 2 argumets; %d supplied' % argNum, 0)
readsPath = os.path.join(cwd, args[-2])
refPath = os.path.join(cwd, args[-1])
if os.path.exists(readsPath) == False: errorExit('Invalid path to the reads file: %s' % readsPath,0)
if os.path.exists(refPath) == False: errorExit('Invalid path to the reference file: %s' % refPath,0)
outDirName = '%s_%s_%s' % (os.path.basename(readsPath), os.path.basename(refPath), timeStamp)
outDirPath = os.path.join(os.getcwd(),outDirName) #output directory is located in the current working directory
os.mkdir(outDirPath) #create output directory
os.chdir(outDirPath) #move into the output directory
#############################################################################################################################
###Dot replacement / Dos2unix implimentation#################################################################################
newReadsPath = os.path.join(outDirPath, os.path.basename(readsPath))
newRefPath = os.path.join(outDirPath, os.path.basename(refPath))
if options.remove_dots_reads:
readsData = readFile(readsPath)
readsData.replace('.','N')
writeFile(newReadsPath, readsData)
else:
os.link(readsPath, newReadsPath)
if options.remove_dots_ref or options.dos2unix_ref:
if options.remove_dots_ref:
refData = readFile(refPath)
refData.replace('.','N')
writeFile(newRefPath, refData)
if options.dos2unix_ref:
dos2unixCmndLine = ['dos2unix','--quiet','--newfile',refPath,newRefPath]
dos2unixProcess = Popen(dos2unixCmndLine)
dos2unixProcess.wait()
else:
os.link(refPath, newRefPath)
#############################################################################################################################
###Makefile Creation#########################################################################################################
if options.makefile_path is None:
makefileData = '''
#for a full scale assisted assembly:
# keep files $(READS) Makefile and reference.fa in this directory, change the
# values of the variables C, READS and then type:
#
# make TYPE= ORIENT= PID= &> Summary
#
# The choices for TYPE are '454' and 'solexa'. '454' refers to reads from 454
# technology which are more than 150 bp long. 'solexa' refers to short reads
# which are expected to be less than 50 bps.
#
# The choices for ORIENT are 'linear' and 'circular'. It refers to the
# orientation of the reference genome. (example: mtDNA would be circular)
#
# The choices for PID are:
#
# --for 454 reads:
# 'same' : about 98% identity
# 'high' : about 95% identity
# 'medium : about 90% identity
# 'low' : about 85% identity
# 'verylow' : about 75% identity
# 'desperate' : really low scores, because we are despeate (rather slow)
#
# --for solexa reads:
# 'same' : about 95% identity
# 'medium' : about 85% identity
# 'desperate' : low scores, as we are desperate :) (rather slow)
#
# This mode of assembly is rather slow because we try to walk through the small
# gaps and close the contigs. However, if we want to ignore the gaps and want
# quicker results (and this is the mode I would advise most of the time) is by
# typing:
#
# make single_step TYPE= ORIENT= PID= &> Summary
#
# Final output of the process is the file 'Final_Assembly' which is the fasta
# file of contigs. 'Assembly.qual' which details the base profile for each of
# the positions in the consensus sequence. A section in this
# file is like:
#
# Contig 3
# .
# .
# .
#
# 302 C 0 181 0 0 0 181
# .
# .
#
# where the depicted line shows the base profile for the position 302 in Contig
# 3.
# The line is to be read as follows:
# "Base Position" "Consensus Base" "Number of reads supporting 'A'" "Number of
# reads supporting 'C'" "Number of reads supporting 'G'" "Number of reads
# supporting 'T'" "Number of reads supporting a gap" "Total coverage at that
# base"
#
# Another file Amb.txt is generated which lists the positions the assembler is
# not sure about. Contigs.ace is an ace file which is generated by the
# assembler. Use 'toAmos' to convert it to a format which can be viewed using
# Hawkeye. I have not tested it with Consed.
#
# Running
#
# make subs
# creates a file of differences between the target genome and the reference
# genome and gives some statistics about them.
#
# For more information regarding options and tools in YASRA please contact :
#
# ratan@bx.psu.edu
#
#this needs to be changed to point to the directory with the binaries.
C=''' + yasraPath + '''
#this is the file of reads to be used for the assembly
READS=''' + os.path.basename(readsPath) + '''
#this is a soft link/file which is to be used as a template
TEMPLATE=''' + os.path.basename(refPath) + '''
#this is the length of the ids of the reads.
WL=`cat $(READS) | grep '>' | awk '{print $$1}' | wc -L`
#is this 454 or solexa data
TYPE=''' + options.read_type + '''
#is this a circular or a linear genome
ORIENT=''' + options.orientation + '''
#maximum length of a read
MAX=`cat $(READS) | awk '{if(substr($$0,1,1) == ">"){if(a>max){max=a}; a=0} else{a = a+length($$0)}}; END{print max}'`
# Q gives parameters for finding weak matches to a rather distant species
# R gives parameters for finding high-identity matches to endogenous DNA
ifeq ($(TYPE), 454)
''' + '\t' + '''MAKE_TEMPLATE=min=150
''' + '\t' + '''ifeq ($(PID),same)
''' + '\t\t' + '''Q=--yasra98
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID),high)
''' + '\t\t' + '''Q=--yasra95
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID),medium)
''' + '\t\t' + '''Q=--yasra90
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), low)
''' + '\t\t' + '''Q=--yasra85
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), verylow)
''' + '\t\t' + '''Q=--yasra75
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), desperate)
''' + '\t\t' + '''Q=Y=2000 K=2200 L=3000
''' + '\t' + '''endif
''' + '\t' + '''R=--yasra98
endif
ifeq ($(TYPE), solexa)
''' + '\t' + '''MAKE_TEMPLATE=N=100 min=30
''' + '\t' + '''SOLEXA_INSERT=N=100
''' + '\t' + '''SOLEXA=-solexa
''' + '\t' + '''ifeq ($(PID),same)
''' + '\t\t' + '''Q=--yasra95short
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID),medium)
''' + '\t\t' + '''Q=--yasra85short
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), desperate)
''' + '\t\t' + '''Q=T=0 W=7 K=1200 L=1400
''' + '\t' + '''endif
''' + '\t' + '''R=--yasra95short
endif
ifeq ($(ORIENT), circular)
''' + '\t' + '''CIRCULAR=-circular
endif
TOOLS=''' + yasraPath + '''
COMPARE=$(TEMPLATE)
CON_INFO=info.txt
REJECT=hits_reject.fa
REJ=Rejects.txt
REP=repeats.txt
HITS=hits_final.fa
all:step1 step2 step3 step4 step5
single_step:
''' + '\t' + '''make final_assembly T=$(TEMPLATE) V=60 P="$Q" I=70 S="-sig=1"
''' + '\t' + '''rm MAlign ReMAlign
step1 :
''' + '\t' + '''#Assemble on the original template:
''' + '\t' + '''make assemble_hits T=$(TEMPLATE) V=60 I=70 P="$Q" N=1
step2:
''' + '\t' + '''touch fake.txt
''' + '\t' + '''$C/finisher $(REJ) $(REP)
''' + '\t' + '''@rm Assembly* hits* template[0-9]* $(REP) fake.txt
step3:
''' + '\t' + '''#Determine difference between reads and assembly:
''' + '\t' + '''lastz template $(READS) $R | $C/lav_processor | $C/template_hits template $(READS) cov=70 pct=90 stats discard=$(REJECT) > read_hits
''' + '\t' + '''make plot H=read_hits
step4:
''' + '\t' + '''#Trim Assembly:
''' + '\t' + '''if [ -s plot_rejects.eps ]; then mv plot_rejects.eps old_plot_rejects.eps; fi
''' + '\t' + '''$C/trim_assembly template read_hits $(REJECT) min=1 > AssemblyX
''' + '\t' + '''$C/make_template AssemblyX noends $(MAKE_TEMPLATE) info=$(CON_INFO) > ftemplate
''' + '\t' + '''make assemble_hits T=ftemplate V=70 I=90 P="$R" N=Y
''' + '\t' + '''$C/make_template AssemblyY noends $(MAKE_TEMPLATE) info=$(CON_INFO) > final_template
''' + '\t' + '''lastz final_template $(READS) $R | $C/lav_processor | $C/template_hits final_template $(READS) cov=70 pct=90 stats discard=$(REJECT) > read_hits
''' + '\t' + '''make plot H=read_hits
''' + '\t' + '''@rm read_hits AssemblyX AssemblyY ftemplate Assembly_ftemplate hits_ftemplate.fa
step5:
''' + '\t' + '''make final_assembly T=final_template V=70 P="$R" I=90 S="-sig=1"
''' + '\t' + '''@rm final_template template MAlign ReMAlign
final_assembly:
''' + '\t' + '''time lastz $T $(READS) $P | time $C/lav_processor $S | time $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT) > $(HITS)
''' + '\t' + '''cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n | uniq -w $(WL) -D | awk '{print $$2}' > $(REP)
''' + '\t' + '''time $C/assembler hits_final.fa -ace=Contigs.ace -rejects=$(REJ) -repeats=$(REP) $(SOLEXA) -max_length=$(MAX) > MAlign
''' + '\t' + '''time $C/realigner -ace=Contigs.ace < MAlign > ReMAlign
''' + '\t' + '''time $C/consense -ace=Contigs.ace -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Final_Assembly
plot:
''' + '\t' + '''make -f $(TOOLS)/tools.mk rejects H=$H R=$(REJECT) C=$(CON_INFO);
stepx:
''' + '\t' + '''#Assemble on the endogenous contigs:
''' + '\t' + '''cp fake.txt $(CON_INFO)
''' + '\t' + '''$C/make_template Assembly$W info=fake.txt $(MAKE_TEMPLATE)> template$X
''' + '\t' + '''make assemble_hits T=template$X V=70 P="$R" I=90 N=$X
assemble_hits :
''' + '\t' + '''time lastz $T $(READS) $P | $C/lav_processor | $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT) stats > hits_$T.fa
''' + '\t' + '''time make assemble HITS=hits_$T.fa
''' + '\t' + '''$C/welder Assembly_$T $(CIRCULAR) $(SOLEXA) | $C/consense > Assembly$N
assemble:
''' + '\t' + '''cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n | uniq -w $(WL) -D | awk '{print $$2}' > $(REP)
''' + '\t' + '''$C/assembler $(HITS) -repeats=$(REP) $(SOLEXA) -rejects=$(REJ) -max_length=$(MAX) | $C/realigner > ReMAlign
''' + '\t' + '''$C/consense -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Assembly_$T
subs:
''' + '\t' + '''lastz $(COMPARE) Final_Assembly B=0 --identity=95..100 > fake.bz
''' + '\t' + '''$C/substitutions fake.bz $(COMPARE) Final_Assembly substitutions gaps > subs.txt
clean:
''' + '\t' + '''rm -r `ls -1 | grep -v "Makefile\|$(READS)\|reference.fa\|README"`
'''
else:
makefileData = readFile(options.makefile_path)
makefilePath = os.path.join(outDirPath, 'Makefile')
makefileHandle = open(makefilePath, 'w')
makefileHandle.write(makefileData)
makefileHandle.close()
#############################################################################################################################
def printProcess(process):
'''Prints the standard ouput and error messages of a process while the process ia running.'''
outLen = 0
errLen = 0
out = ''
err = ''
while process.poll() is None: #while it hasnt finished...
(outData, errData) = process.communicate()
outDiff = len(outData) - outLen
errDiff = len(errData) - errLen
if outDiff > 0:
outLen += outDiff
out += outData[-outDiff:]
print outData[-outDiff:]
if errDiff > 0:
errLen += errDiff
err += errData[-errDiff:]
print errData[-errDiff:]
return (out, err)
###Run yasra#################################################################################################################
makeCmndLine = ['make','TYPE=' + options.read_type,'ORIENT=' + options.orientation,'PID=' + options.percent_identity]
if options.single_step: makeCmndLine.insert(1,'single_step')
makeStdOut = open(os.path.join(outDirPath, 'yasra_standard_output.txt'), 'w')
makeStdErr = open(os.path.join(outDirPath, 'yasra_standard_error.txt'), 'w')
makeProcess = Popen(makeCmndLine, stdout=PIPE, stderr=PIPE)
if options.verbose:
(outData, errData) = printProcess(makeProcess)
else:
(outData, errData) = makeProcess.communicate()
makeProcess.wait()
makeStdOut.write(outData)
makeStdErr.write(errData)
makeStdOut.close()
makeStdErr.close()
if makeProcess.returncode == 0: #yasra completed normally
if options.verbose: print 'yasra output saved in the directory: %s' % outDirPath
else:
errorExit('yasra returned a non-zero code; it may have not completed succesfully', 0)
dontChange = ('Makefile',os.path.basename(readsPath), os.path.basename(refPath))
outPaths = [os.path.join(outDirPath, path) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange]
renamedPaths = [os.path.join(outDirPath, '%s_%s_%s%s' % (os.path.splitext(path)[0], os.path.basename(readsPath), os.path.basename(refPath), os.path.splitext(path)[1])) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange]
for index in range(0,len(outPaths)): os.rename(outPaths[index],renamedPaths[index])
############################################################################################################################