Runyasra.py

Copy and past the following into a text editor and save it as runyasra.py to use the script.


 * 1) !/usr/bin/env python


 * 1) Usage: python runyasra.py [options]  
 * 2) Options:
 * 3)  --version             show program's version number and exit
 * 4)  -h, --help            show this help message and exit
 * 5)  -t READ_TYPE, --read-type=READ_TYPE
 * 6)                        Specify the type of reads. (Default: solexa)
 * 7)  -o ORIENTATION, --orientation=ORIENTATION
 * 8)                        Specify orientation of the sequence. (Default:
 * 9)                        circular)
 * 10)  -p PERCENT_IDENTITY, --percent-identity=PERCENT_IDENTITY
 * 11)                        The percent identity (PID in yasra). The settings
 * 12)                        correspond to different percent values depending on
 * 13)                        the read type (-t). (Defalt: same)
 * 14)  -m PATH, --make-path=PATH
 * 15)                        Specify path to external makefile used by YASRA.
 * 16)                        (Default: use the makefile built in to runyasra)
 * 17)  -s, --single-step     Activate yasra's single_step option (Default: run
 * 18)                        yasra normally)
 * 19)  -v, --verbose         Print relevant statistics and progresss reports.
 * 20)                        (Default: run silently)
 * 21)  -r, --remove-dots-reads
 * 22)                        Replace dots with Ns in the reads file before runnning
 * 23)                        yasra. The modified file is placed in the output
 * 24)                        filder.(Default: create a link to the original file)
 * 25)  -f, --remove-dots-ref
 * 26)                        Replace dots with Ns in the reference file before
 * 27)                        runnning yasra. The modified file is placed in the
 * 28)                        output filder.(Default: create a link to the original
 * 29)                        file)
 * 30)  -d, --dos2unix-ref    Run dos2unix on the reference file before yasra. The
 * 31)                        modified file is placed in the output filder.
 * 32)                        (Default: create a link to the priginal reference)
 * 1)                        (Default: create a link to the priginal reference)

def readFile(filePath): fileHandle = open(filePath, 'r') fileData = fileHandle.read fileHandle.close return fileData

def writeFile(filePath, fileData): fileHandle = open(filePath, 'w') fileHandle.write(fileData) fileHandle.close

def errorExit(message=None,exitStatus=1): '''Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)''' if message: print 'Error: ' + message else: for line in debugLog: print line  #print debug log sys.exit(exitStatus)
 * 1) Error handling function#####################################################################################################

import os, sys, copy, string from datetime import * from optparse import * from subprocess import * timeStamp = datetime.now.ctime.replace(' ','-') #Ex: 'Mon-Jun-14-11:08:55-2010' commandLine = copy.deepcopy(sys.argv) yasraPath = '/local/cluster/YASRA/bin' cwd = os.getcwd progName, progVersion = ('runyasra','1.02') progUsage = 'python %s [options]  ' % progName
 * 1) Imports / Variable Initilization##########################################################################################

cmndLineParser = OptionParser(usage=progUsage, version="Version %s" % progVersion) cmndLineParser.add_option('-t', '--read-type',         action='store',         default='solexa',   type='choice',  choices=['solexa','454'],                                       help="Specify the type of reads. (Default: solexa)") cmndLineParser.add_option('-o', '--orientation',       action='store',         default='circular', type='choice',  choices=['circular','linear'],                                  help="Specify orientation of the sequence. (Default: circular") cmndLineParser.add_option('-p', '--percent-identity',   action='store',         default='same',     type='choice',  choices=['same','high','medium','low','verylow','desperate'],   help="The percent identity (PID in yasra). The settings correspond to different percent values depending on the read type (-t). (Defalt: same)") cmndLineParser.add_option('-m', '--makefile-path',      action='store',         default=None,       type='string',  metavar='PATH',                                                 help="Specify path to external makefile used by YASRA. (Default: use the makefile built in to runyasra)") cmndLineParser.add_option('-s', '--single-step',        action='store_true',    default=False,                                                                                      help="Activate yasra's single_step option (Default: run yasra normally)") cmndLineParser.add_option('-v', '--verbose',           action='store_true',    default=False,                                                                                      help='Print relevant statistics and progresss reports. (Default: run silently)') cmndLineParser.add_option('-r', '--remove-dots-reads', action='store_true',    default=False,                                                                                      help='Replace dots with Ns in the reads file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)') cmndLineParser.add_option('-f', '--remove-dots-ref',   action='store_true',    default=False,                                                                                      help='Replace dots with Ns in the reference file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)') cmndLineParser.add_option('-d', '--dos2unix-ref',      action='store_true',    default=False,                                                                                      help='Run dos2unix on the reference file before yasra. The modified file is placed in the output filder. (Default: create a link to the priginal reference)') (options, args) = cmndLineParser.parse_args(commandLine)
 * 1) Command Line Parser#######################################################################################################

argNum = len(args) - 1  #counts the amount of arguments, negating the 'runyasra' at the start of the command line if argNum != 2: errorExit('runyasra takes exactly 2 argumets; %d supplied' % argNum, 0) readsPath = os.path.join(cwd, args[-2]) refPath = os.path.join(cwd, args[-1]) if os.path.exists(readsPath) == False: errorExit('Invalid path to the reads file: %s' % readsPath,0) if os.path.exists(refPath) == False: errorExit('Invalid path to the reference file: %s' % refPath,0) outDirName = '%s_%s_%s' % (os.path.basename(readsPath), os.path.basename(refPath), timeStamp) outDirPath = os.path.join(os.getcwd,outDirName)  #output directory is located in the current working directory os.mkdir(outDirPath)  #create output directory os.chdir(outDirPath)  #move into the output directory
 * 1) Command Line Interpretation###############################################################################################

newReadsPath = os.path.join(outDirPath, os.path.basename(readsPath)) newRefPath = os.path.join(outDirPath, os.path.basename(refPath)) if options.remove_dots_reads: readsData = readFile(readsPath) readsData.replace('.','N') writeFile(newReadsPath, readsData) else: os.link(readsPath, newReadsPath) if options.remove_dots_ref or options.dos2unix_ref: if options.remove_dots_ref: refData = readFile(refPath) refData.replace('.','N') writeFile(newRefPath, refData) if options.dos2unix_ref: dos2unixCmndLine = ['dos2unix','--quiet','--newfile',refPath,newRefPath] dos2unixProcess = Popen(dos2unixCmndLine) dos2unixProcess.wait else: os.link(refPath, newRefPath)
 * 1) Dot replacement / Dos2unix implimentation#################################################################################

if options.makefile_path is None: makefileData = ''' C= + yasraPath + 
 * 1) Makefile Creation#########################################################################################################
 * 1) for a full scale assisted assembly:
 * 2)  keep files $(READS) Makefile and reference.fa in this directory, change the
 * 3)  values of the variables C, READS and then type:
 * 4)  make TYPE= ORIENT= PID= &> Summary
 * 5)  The choices for TYPE are '454' and 'solexa'. '454' refers to reads from 454
 * 6)  technology which are more than 150 bp long. 'solexa' refers to short reads
 * 7)  which are expected to be less than 50 bps.
 * 8)  The choices for ORIENT are 'linear' and 'circular'. It refers to the
 * 9)  orientation of the reference genome. (example: mtDNA would be circular)
 * 10) The choices for PID are:
 * 11) --for 454 reads:
 * 12)    'same'       : about 98% identity
 * 13)    'high'       : about 95% identity
 * 14)    'medium      : about 90% identity
 * 15)    'low'        : about 85% identity
 * 16)    'verylow'    : about 75% identity
 * 17)    'desperate'  : really low scores, because we are despeate (rather slow)
 * 18) --for solexa reads:
 * 19) 	'same'       : about 95% identity
 * 20) 	'medium'     : about 85% identity
 * 21) 	'desperate'  : low scores, as we are desperate :) (rather slow)
 * 22) This mode of assembly is rather slow because we try to walk through the small
 * 23) gaps and close the contigs. However, if we want to ignore the gaps and want
 * 24) quicker results (and this is the mode I would advise most of the time) is by
 * 25) typing:
 * 26)  make single_step TYPE= ORIENT= PID= &> Summary
 * 27) Final output of the process is the file 'Final_Assembly' which is the fasta
 * 28) file of contigs. 'Assembly.qual' which details the base profile for each of
 * 29) the positions in the consensus sequence. A section in this
 * 30) file is like:
 * 31) Contig 3
 * 32) 302     C       0       181     0       0       0       181
 * 33) where the depicted line shows the base profile for the position 302 in Contig
 * 34) 3.
 * 35) The line is to be read as follows:
 * 36) "Base Position" "Consensus Base" "Number of reads supporting 'A'" "Number of
 * 37) reads supporting 'C'" "Number of reads supporting 'G'" "Number of reads
 * 38) supporting 'T'" "Number of reads supporting a gap" "Total coverage at that
 * 39) base"
 * 40) Another file Amb.txt is generated which lists the positions the assembler is
 * 41) not sure about. Contigs.ace is an ace file which is generated by the
 * 42) assembler. Use 'toAmos' to convert it to a format which can be viewed using
 * 43) Hawkeye. I have not tested it with Consed.
 * 44) Running
 * 45) make subs
 * 46) creates a file of differences between the target genome and the reference
 * 47) genome and gives some statistics about them.
 * 48) For more information regarding options and tools in YASRA please contact :
 * 49) ratan@bx.psu.edu
 * 50) this needs to be changed to point to the directory with the binaries.
 * 1) 3.
 * 2) The line is to be read as follows:
 * 3) "Base Position" "Consensus Base" "Number of reads supporting 'A'" "Number of
 * 4) reads supporting 'C'" "Number of reads supporting 'G'" "Number of reads
 * 5) supporting 'T'" "Number of reads supporting a gap" "Total coverage at that
 * 6) base"
 * 7) Another file Amb.txt is generated which lists the positions the assembler is
 * 8) not sure about. Contigs.ace is an ace file which is generated by the
 * 9) assembler. Use 'toAmos' to convert it to a format which can be viewed using
 * 10) Hawkeye. I have not tested it with Consed.
 * 11) Running
 * 12) make subs
 * 13) creates a file of differences between the target genome and the reference
 * 14) genome and gives some statistics about them.
 * 15) For more information regarding options and tools in YASRA please contact :
 * 16) ratan@bx.psu.edu
 * 17) this needs to be changed to point to the directory with the binaries.
 * 1) For more information regarding options and tools in YASRA please contact :
 * 2) ratan@bx.psu.edu
 * 3) this needs to be changed to point to the directory with the binaries.
 * 1) ratan@bx.psu.edu
 * 2) this needs to be changed to point to the directory with the binaries.
 * 1) this needs to be changed to point to the directory with the binaries.

READS= + os.path.basename(readsPath) +  TEMPLATE= + os.path.basename(refPath) +  WL=`cat $(READS) | grep '>' | awk '{print $$1}' | wc -L` TYPE= + options.read_type +  ORIENT= + options.orientation +  MAX=`cat $(READS) | awk '{if(substr($$0,1,1) == ">"){if(a>max){max=a}; a=0} else{a = a+length($$0)}}; END{print max}'`
 * 1) this is the file of reads to be used for the assembly
 * 1) this is a soft link/file which is to be used as a template
 * 1) this is the length of the ids of the reads.
 * 1) is this 454 or solexa data
 * 1) is this a circular or a linear genome
 * 1) maximum length of a read


 * 1) Q gives parameters for finding weak matches to a rather distant species
 * 2) R gives parameters for finding high-identity matches to endogenous DNA

ifeq ($(TYPE), 454)  + '\t' + MAKE_TEMPLATE=min=150  + '\t' + ifeq ($(PID),same)  + '\t\t' + Q=--yasra98  + '\t' + endif  + '\t' + ifeq ($(PID),high)  + '\t\t' + Q=--yasra95  + '\t' + endif  + '\t' + ifeq ($(PID),medium)  + '\t\t' + Q=--yasra90  + '\t' + endif  + '\t' + ifeq ($(PID), low)  + '\t\t' + Q=--yasra85  + '\t' + endif  + '\t' + ifeq ($(PID), verylow)  + '\t\t' + Q=--yasra75  + '\t' + endif  + '\t' + ifeq ($(PID), desperate)  + '\t\t' + Q=Y=2000 K=2200 L=3000  + '\t' + endif  + '\t' + R=--yasra98 endif

ifeq ($(TYPE), solexa)  + '\t' + MAKE_TEMPLATE=N=100 min=30  + '\t' + SOLEXA_INSERT=N=100  + '\t' + SOLEXA=-solexa  + '\t' + ifeq ($(PID),same)  + '\t\t' + Q=--yasra95short  + '\t' + endif  + '\t' + ifeq ($(PID),medium)  + '\t\t' + Q=--yasra85short  + '\t' + endif  + '\t' + ifeq ($(PID), desperate)  + '\t\t' + Q=T=0 W=7 K=1200 L=1400  + '\t' + endif  + '\t' + R=--yasra95short endif

ifeq ($(ORIENT), circular)  + '\t' + CIRCULAR=-circular endif

TOOLS= + yasraPath +  COMPARE=$(TEMPLATE)

CON_INFO=info.txt REJECT=hits_reject.fa REJ=Rejects.txt REP=repeats.txt HITS=hits_final.fa all:step1 step2 step3 step4 step5

single_step:  + '\t' + make final_assembly T=$(TEMPLATE) V=60 P="$Q" I=70 S="-sig=1"  + '\t' + rm MAlign ReMAlign

step1 :  + '\t' + #Assemble on the original template:  + '\t' + make assemble_hits T=$(TEMPLATE) V=60 I=70 P="$Q" N=1

step2:  + '\t' + touch fake.txt  + '\t' + $C/finisher $(REJ) $(REP)  + '\t' + @rm Assembly* hits* template[0-9]* $(REP) fake.txt

step3:  + '\t' + #Determine difference between reads and assembly:  + '\t' + lastz template $(READS) $R | $C/lav_processor | $C/template_hits template $(READS) cov=70 pct=90 stats   discard=$(REJECT) > read_hits  + '\t' + make plot H=read_hits

step4:  + '\t' + #Trim Assembly:  + '\t' + if [ -s plot_rejects.eps ]; then mv plot_rejects.eps old_plot_rejects.eps; fi  + '\t' + $C/trim_assembly template read_hits $(REJECT) min=1 > AssemblyX  + '\t' + $C/make_template AssemblyX noends $(MAKE_TEMPLATE) info=$(CON_INFO) > ftemplate  + '\t' + make assemble_hits T=ftemplate V=70 I=90 P="$R" N=Y  + '\t' + $C/make_template AssemblyY noends $(MAKE_TEMPLATE) info=$(CON_INFO) > final_template  + '\t' + lastz final_template $(READS) $R | $C/lav_processor | $C/template_hits final_template $(READS) cov=70 pct=90 stats discard=$(REJECT) > read_hits  + '\t' + make plot H=read_hits  + '\t' + @rm read_hits AssemblyX AssemblyY ftemplate Assembly_ftemplate hits_ftemplate.fa

step5:  + '\t' + make final_assembly T=final_template V=70 P="$R" I=90 S="-sig=1"  + '\t' + @rm final_template template MAlign ReMAlign

final_assembly:  + '\t' + time lastz $T $(READS) $P | time $C/lav_processor $S | time $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT)  > $(HITS)  + '\t' + cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n | uniq -w $(WL) -D | awk '{print $$2}' > $(REP)  + '\t' + time $C/assembler hits_final.fa -ace=Contigs.ace -rejects=$(REJ) -repeats=$(REP) $(SOLEXA) -max_length=$(MAX) > MAlign  + '\t' + time $C/realigner -ace=Contigs.ace < MAlign > ReMAlign  + '\t' + time $C/consense -ace=Contigs.ace -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Final_Assembly

plot:  + '\t' + make -f $(TOOLS)/tools.mk rejects H=$H R=$(REJECT) C=$(CON_INFO);

stepx:  + '\t' + #Assemble on the endogenous contigs:  + '\t' + cp fake.txt $(CON_INFO)  + '\t' + $C/make_template Assembly$W info=fake.txt $(MAKE_TEMPLATE)> template$X  + '\t' + make assemble_hits T=template$X V=70 P="$R" I=90 N=$X

assemble_hits :  + '\t' + time lastz $T $(READS) $P | $C/lav_processor | $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT) stats > hits_$T.fa  + '\t' + time make assemble HITS=hits_$T.fa  + '\t' + $C/welder Assembly_$T $(CIRCULAR) $(SOLEXA) | $C/consense > Assembly$N

assemble:  + '\t' + cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n | uniq -w $(WL) -D | awk '{print $$2}' > $(REP)  + '\t' + $C/assembler $(HITS) -repeats=$(REP) $(SOLEXA) -rejects=$(REJ) -max_length=$(MAX) | $C/realigner > ReMAlign  + '\t' + $C/consense -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Assembly_$T

subs:  + '\t' + lastz $(COMPARE) Final_Assembly B=0 --identity=95..100 > fake.bz  + '\t' + $C/substitutions fake.bz $(COMPARE) Final_Assembly substitutions gaps > subs.txt

clean:  + '\t' + rm -r `ls -1 | grep -v "Makefile\|$(READS)\|reference.fa\|README"` ''' else: makefileData = readFile(options.makefile_path) makefilePath = os.path.join(outDirPath, 'Makefile') makefileHandle = open(makefilePath, 'w') makefileHandle.write(makefileData) makefileHandle.close

def printProcess(process): Prints the standard ouput and error messages of a process while the process ia running. outLen = 0 errLen = 0 out = '' err = '' while process.poll is None: #while it hasnt finished... (outData, errData) = process.communicate outDiff = len(outData) - outLen errDiff = len(errData) - errLen if outDiff > 0: outLen += outDiff out += outData[-outDiff:] print outData[-outDiff:] if errDiff > 0: errLen += errDiff err += errData[-errDiff:] print errData[-errDiff:] return (out, err)

makeCmndLine = ['make','TYPE=' + options.read_type,'ORIENT=' + options.orientation,'PID=' + options.percent_identity] if options.single_step: makeCmndLine.insert(1,'single_step') makeStdOut = open(os.path.join(outDirPath, 'yasra_standard_output.txt'), 'w') makeStdErr = open(os.path.join(outDirPath, 'yasra_standard_error.txt'), 'w') makeProcess = Popen(makeCmndLine, stdout=PIPE, stderr=PIPE) if options.verbose: (outData, errData) = printProcess(makeProcess) else: (outData, errData) = makeProcess.communicate makeProcess.wait makeStdOut.write(outData) makeStdErr.write(errData) makeStdOut.close makeStdErr.close if makeProcess.returncode == 0:  #yasra completed normally if options.verbose: print 'yasra output saved in the directory: %s' % outDirPath else: errorExit('yasra returned a non-zero code; it may have not completed succesfully', 0) dontChange = ('Makefile',os.path.basename(readsPath), os.path.basename(refPath)) outPaths = [os.path.join(outDirPath, path) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange] renamedPaths = [os.path.join(outDirPath, '%s_%s_%s%s' % (os.path.splitext(path)[0], os.path.basename(readsPath), os.path.basename(refPath), os.path.splitext(path)[1])) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange] for index in range(0,len(outPaths)): os.rename(outPaths[index],renamedPaths[index])
 * 1) Run yasra#################################################################################################################