Runyasra.py
From OpenWetWare
Copy and past the following into a text editor and save it as runyasra.py to use the script.
#!/usr/bin/env python #Usage: python runyasra.py [options] <Reads> <Reference> # #Options: # --version show program's version number and exit # -h, --help show this help message and exit # -t READ_TYPE, --read-type=READ_TYPE # Specify the type of reads. (Default: solexa) # -o ORIENTATION, --orientation=ORIENTATION # Specify orientation of the sequence. (Default: # circular) # -p PERCENT_IDENTITY, --percent-identity=PERCENT_IDENTITY # The percent identity (PID in yasra). The settings # correspond to different percent values depending on # the read type (-t). (Defalt: same) # -m PATH, --make-path=PATH # Specify path to external makefile used by YASRA. # (Default: use the makefile built in to runyasra) # -s, --single-step Activate yasra's single_step option (Default: run # yasra normally) # -v, --verbose Print relevant statistics and progresss reports. # (Default: run silently) # -r, --remove-dots-reads # Replace dots with Ns in the reads file before runnning # yasra. The modified file is placed in the output # filder.(Default: create a link to the original file) # -f, --remove-dots-ref # Replace dots with Ns in the reference file before # runnning yasra. The modified file is placed in the # output filder.(Default: create a link to the original # file) # -d, --dos2unix-ref Run dos2unix on the reference file before yasra. The # modified file is placed in the output filder. # (Default: create a link to the priginal reference) def readFile(filePath): fileHandle = open(filePath, 'r') fileData = fileHandle.read() fileHandle.close() return fileData def writeFile(filePath, fileData): fileHandle = open(filePath, 'w') fileHandle.write(fileData) fileHandle.close() #Error handling function##################################################################################################### def errorExit(message=None,exitStatus=1): '''Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)''' if message: print 'Error: ' + message else: for line in debugLog: print line #print debug log sys.exit(exitStatus) ############################################################################################################################# ###Imports / Variable Initilization########################################################################################## import os, sys, copy, string from datetime import * from optparse import * from subprocess import * timeStamp = datetime.now().ctime().replace(' ','-') #Ex: 'Mon-Jun-14-11:08:55-2010' commandLine = copy.deepcopy(sys.argv) yasraPath = '/local/cluster/YASRA/bin' cwd = os.getcwd() progName, progVersion = ('runyasra','1.02') progUsage = 'python %s [options] <Reads> <Reference>' % progName ############################################################################################################################# ###Command Line Parser####################################################################################################### cmndLineParser = OptionParser(usage=progUsage, version="Version %s" % progVersion) cmndLineParser.add_option('-t', '--read-type', action='store', default='solexa', type='choice', choices=['solexa','454'], help="Specify the type of reads. (Default: solexa)") cmndLineParser.add_option('-o', '--orientation', action='store', default='circular', type='choice', choices=['circular','linear'], help="Specify orientation of the sequence. (Default: circular") cmndLineParser.add_option('-p', '--percent-identity', action='store', default='same', type='choice', choices=['same','high','medium','low','verylow','desperate'], help="The percent identity (PID in yasra). The settings correspond to different percent values depending on the read type (-t). (Defalt: same)") cmndLineParser.add_option('-m', '--makefile-path', action='store', default=None, type='string', metavar='PATH', help="Specify path to external makefile used by YASRA. (Default: use the makefile built in to runyasra)") cmndLineParser.add_option('-s', '--single-step', action='store_true', default=False, help="Activate yasra's single_step option (Default: run yasra normally)") cmndLineParser.add_option('-v', '--verbose', action='store_true', default=False, help='Print relevant statistics and progresss reports. (Default: run silently)') cmndLineParser.add_option('-r', '--remove-dots-reads', action='store_true', default=False, help='Replace dots with Ns in the reads file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)') cmndLineParser.add_option('-f', '--remove-dots-ref', action='store_true', default=False, help='Replace dots with Ns in the reference file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)') cmndLineParser.add_option('-d', '--dos2unix-ref', action='store_true', default=False, help='Run dos2unix on the reference file before yasra. The modified file is placed in the output filder. (Default: create a link to the priginal reference)') (options, args) = cmndLineParser.parse_args(commandLine) ############################################################################################################################# ###Command Line Interpretation############################################################################################### argNum = len(args) - 1 #counts the amount of arguments, negating the 'runyasra' at the start of the command line if argNum != 2: errorExit('runyasra takes exactly 2 argumets; %d supplied' % argNum, 0) readsPath = os.path.join(cwd, args[-2]) refPath = os.path.join(cwd, args[-1]) if os.path.exists(readsPath) == False: errorExit('Invalid path to the reads file: %s' % readsPath,0) if os.path.exists(refPath) == False: errorExit('Invalid path to the reference file: %s' % refPath,0) outDirName = '%s_%s_%s' % (os.path.basename(readsPath), os.path.basename(refPath), timeStamp) outDirPath = os.path.join(os.getcwd(),outDirName) #output directory is located in the current working directory os.mkdir(outDirPath) #create output directory os.chdir(outDirPath) #move into the output directory ############################################################################################################################# ###Dot replacement / Dos2unix implimentation################################################################################# newReadsPath = os.path.join(outDirPath, os.path.basename(readsPath)) newRefPath = os.path.join(outDirPath, os.path.basename(refPath)) if options.remove_dots_reads: readsData = readFile(readsPath) readsData.replace('.','N') writeFile(newReadsPath, readsData) else: os.link(readsPath, newReadsPath) if options.remove_dots_ref or options.dos2unix_ref: if options.remove_dots_ref: refData = readFile(refPath) refData.replace('.','N') writeFile(newRefPath, refData) if options.dos2unix_ref: dos2unixCmndLine = ['dos2unix','--quiet','--newfile',refPath,newRefPath] dos2unixProcess = Popen(dos2unixCmndLine) dos2unixProcess.wait() else: os.link(refPath, newRefPath) ############################################################################################################################# ###Makefile Creation######################################################################################################### if options.makefile_path is None: makefileData = ''' #for a full scale assisted assembly: # keep files $(READS) Makefile and reference.fa in this directory, change the # values of the variables C, READS and then type: # # make TYPE= ORIENT= PID= &> Summary # # The choices for TYPE are '454' and 'solexa'. '454' refers to reads from 454 # technology which are more than 150 bp long. 'solexa' refers to short reads # which are expected to be less than 50 bps. # # The choices for ORIENT are 'linear' and 'circular'. It refers to the # orientation of the reference genome. (example: mtDNA would be circular) # # The choices for PID are: # # --for 454 reads: # 'same' : about 98% identity # 'high' : about 95% identity # 'medium : about 90% identity # 'low' : about 85% identity # 'verylow' : about 75% identity # 'desperate' : really low scores, because we are despeate (rather slow) # # --for solexa reads: # 'same' : about 95% identity # 'medium' : about 85% identity # 'desperate' : low scores, as we are desperate :) (rather slow) # # This mode of assembly is rather slow because we try to walk through the small # gaps and close the contigs. However, if we want to ignore the gaps and want # quicker results (and this is the mode I would advise most of the time) is by # typing: # # make single_step TYPE= ORIENT= PID= &> Summary # # Final output of the process is the file 'Final_Assembly' which is the fasta # file of contigs. 'Assembly.qual' which details the base profile for each of # the positions in the consensus sequence. A section in this # file is like: # # Contig 3 # . # . # . # # 302 C 0 181 0 0 0 181 # . # . # # where the depicted line shows the base profile for the position 302 in Contig # 3. # The line is to be read as follows: # "Base Position" "Consensus Base" "Number of reads supporting 'A'" "Number of # reads supporting 'C'" "Number of reads supporting 'G'" "Number of reads # supporting 'T'" "Number of reads supporting a gap" "Total coverage at that # base" # # Another file Amb.txt is generated which lists the positions the assembler is # not sure about. Contigs.ace is an ace file which is generated by the # assembler. Use 'toAmos' to convert it to a format which can be viewed using # Hawkeye. I have not tested it with Consed. # # Running # # make subs # creates a file of differences between the target genome and the reference # genome and gives some statistics about them. # # For more information regarding options and tools in YASRA please contact : # # ratan@bx.psu.edu # #this needs to be changed to point to the directory with the binaries. C=''' + yasraPath + ''' #this is the file of reads to be used for the assembly READS=''' + os.path.basename(readsPath) + ''' #this is a soft link/file which is to be used as a template TEMPLATE=''' + os.path.basename(refPath) + ''' #this is the length of the ids of the reads. WL=`cat $(READS) | grep '>' | awk '{print $$1}' | wc -L` #is this 454 or solexa data TYPE=''' + options.read_type + ''' #is this a circular or a linear genome ORIENT=''' + options.orientation + ''' #maximum length of a read MAX=`cat $(READS) | awk '{if(substr($$0,1,1) == ">"){if(a>max){max=a}; a=0} else{a = a+length($$0)}}; END{print max}'` # Q gives parameters for finding weak matches to a rather distant species # R gives parameters for finding high-identity matches to endogenous DNA ifeq ($(TYPE), 454) ''' + '\t' + '''MAKE_TEMPLATE=min=150 ''' + '\t' + '''ifeq ($(PID),same) ''' + '\t\t' + '''Q=--yasra98 ''' + '\t' + '''endif ''' + '\t' + '''ifeq ($(PID),high) ''' + '\t\t' + '''Q=--yasra95 ''' + '\t' + '''endif ''' + '\t' + '''ifeq ($(PID),medium) ''' + '\t\t' + '''Q=--yasra90 ''' + '\t' + '''endif ''' + '\t' + '''ifeq ($(PID), low) ''' + '\t\t' + '''Q=--yasra85 ''' + '\t' + '''endif ''' + '\t' + '''ifeq ($(PID), verylow) ''' + '\t\t' + '''Q=--yasra75 ''' + '\t' + '''endif ''' + '\t' + '''ifeq ($(PID), desperate) ''' + '\t\t' + '''Q=Y=2000 K=2200 L=3000 ''' + '\t' + '''endif ''' + '\t' + '''R=--yasra98 endif ifeq ($(TYPE), solexa) ''' + '\t' + '''MAKE_TEMPLATE=N=100 min=30 ''' + '\t' + '''SOLEXA_INSERT=N=100 ''' + '\t' + '''SOLEXA=-solexa ''' + '\t' + '''ifeq ($(PID),same) ''' + '\t\t' + '''Q=--yasra95short ''' + '\t' + '''endif ''' + '\t' + '''ifeq ($(PID),medium) ''' + '\t\t' + '''Q=--yasra85short ''' + '\t' + '''endif ''' + '\t' + '''ifeq ($(PID), desperate) ''' + '\t\t' + '''Q=T=0 W=7 K=1200 L=1400 ''' + '\t' + '''endif ''' + '\t' + '''R=--yasra95short endif ifeq ($(ORIENT), circular) ''' + '\t' + '''CIRCULAR=-circular endif TOOLS=''' + yasraPath + ''' COMPARE=$(TEMPLATE) CON_INFO=info.txt REJECT=hits_reject.fa REJ=Rejects.txt REP=repeats.txt HITS=hits_final.fa all:step1 step2 step3 step4 step5 single_step: ''' + '\t' + '''make final_assembly T=$(TEMPLATE) V=60 P="$Q" I=70 S="-sig=1" ''' + '\t' + '''rm MAlign ReMAlign step1 : ''' + '\t' + '''#Assemble on the original template: ''' + '\t' + '''make assemble_hits T=$(TEMPLATE) V=60 I=70 P="$Q" N=1 step2: ''' + '\t' + '''touch fake.txt ''' + '\t' + '''$C/finisher $(REJ) $(REP) ''' + '\t' + '''@rm Assembly* hits* template[0-9]* $(REP) fake.txt step3: ''' + '\t' + '''#Determine difference between reads and assembly: ''' + '\t' + '''lastz template $(READS) $R | $C/lav_processor | $C/template_hits template $(READS) cov=70 pct=90 stats discard=$(REJECT) > read_hits ''' + '\t' + '''make plot H=read_hits step4: ''' + '\t' + '''#Trim Assembly: ''' + '\t' + '''if [ -s plot_rejects.eps ]; then mv plot_rejects.eps old_plot_rejects.eps; fi ''' + '\t' + '''$C/trim_assembly template read_hits $(REJECT) min=1 > AssemblyX ''' + '\t' + '''$C/make_template AssemblyX noends $(MAKE_TEMPLATE) info=$(CON_INFO) > ftemplate ''' + '\t' + '''make assemble_hits T=ftemplate V=70 I=90 P="$R" N=Y ''' + '\t' + '''$C/make_template AssemblyY noends $(MAKE_TEMPLATE) info=$(CON_INFO) > final_template ''' + '\t' + '''lastz final_template $(READS) $R | $C/lav_processor | $C/template_hits final_template $(READS) cov=70 pct=90 stats discard=$(REJECT) > read_hits ''' + '\t' + '''make plot H=read_hits ''' + '\t' + '''@rm read_hits AssemblyX AssemblyY ftemplate Assembly_ftemplate hits_ftemplate.fa step5: ''' + '\t' + '''make final_assembly T=final_template V=70 P="$R" I=90 S="-sig=1" ''' + '\t' + '''@rm final_template template MAlign ReMAlign final_assembly: ''' + '\t' + '''time lastz $T $(READS) $P | time $C/lav_processor $S | time $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT) > $(HITS) ''' + '\t' + '''cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n | uniq -w $(WL) -D | awk '{print $$2}' > $(REP) ''' + '\t' + '''time $C/assembler hits_final.fa -ace=Contigs.ace -rejects=$(REJ) -repeats=$(REP) $(SOLEXA) -max_length=$(MAX) > MAlign ''' + '\t' + '''time $C/realigner -ace=Contigs.ace < MAlign > ReMAlign ''' + '\t' + '''time $C/consense -ace=Contigs.ace -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Final_Assembly plot: ''' + '\t' + '''make -f $(TOOLS)/tools.mk rejects H=$H R=$(REJECT) C=$(CON_INFO); stepx: ''' + '\t' + '''#Assemble on the endogenous contigs: ''' + '\t' + '''cp fake.txt $(CON_INFO) ''' + '\t' + '''$C/make_template Assembly$W info=fake.txt $(MAKE_TEMPLATE)> template$X ''' + '\t' + '''make assemble_hits T=template$X V=70 P="$R" I=90 N=$X assemble_hits : ''' + '\t' + '''time lastz $T $(READS) $P | $C/lav_processor | $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT) stats > hits_$T.fa ''' + '\t' + '''time make assemble HITS=hits_$T.fa ''' + '\t' + '''$C/welder Assembly_$T $(CIRCULAR) $(SOLEXA) | $C/consense > Assembly$N assemble: ''' + '\t' + '''cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n | uniq -w $(WL) -D | awk '{print $$2}' > $(REP) ''' + '\t' + '''$C/assembler $(HITS) -repeats=$(REP) $(SOLEXA) -rejects=$(REJ) -max_length=$(MAX) | $C/realigner > ReMAlign ''' + '\t' + '''$C/consense -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Assembly_$T subs: ''' + '\t' + '''lastz $(COMPARE) Final_Assembly B=0 --identity=95..100 > fake.bz ''' + '\t' + '''$C/substitutions fake.bz $(COMPARE) Final_Assembly substitutions gaps > subs.txt clean: ''' + '\t' + '''rm -r `ls -1 | grep -v "Makefile\|$(READS)\|reference.fa\|README"` ''' else: makefileData = readFile(options.makefile_path) makefilePath = os.path.join(outDirPath, 'Makefile') makefileHandle = open(makefilePath, 'w') makefileHandle.write(makefileData) makefileHandle.close() ############################################################################################################################# def printProcess(process): '''Prints the standard ouput and error messages of a process while the process ia running.''' outLen = 0 errLen = 0 out = '' err = '' while process.poll() is None: #while it hasnt finished... (outData, errData) = process.communicate() outDiff = len(outData) - outLen errDiff = len(errData) - errLen if outDiff > 0: outLen += outDiff out += outData[-outDiff:] print outData[-outDiff:] if errDiff > 0: errLen += errDiff err += errData[-errDiff:] print errData[-errDiff:] return (out, err) ###Run yasra################################################################################################################# makeCmndLine = ['make','TYPE=' + options.read_type,'ORIENT=' + options.orientation,'PID=' + options.percent_identity] if options.single_step: makeCmndLine.insert(1,'single_step') makeStdOut = open(os.path.join(outDirPath, 'yasra_standard_output.txt'), 'w') makeStdErr = open(os.path.join(outDirPath, 'yasra_standard_error.txt'), 'w') makeProcess = Popen(makeCmndLine, stdout=PIPE, stderr=PIPE) if options.verbose: (outData, errData) = printProcess(makeProcess) else: (outData, errData) = makeProcess.communicate() makeProcess.wait() makeStdOut.write(outData) makeStdErr.write(errData) makeStdOut.close() makeStdErr.close() if makeProcess.returncode == 0: #yasra completed normally if options.verbose: print 'yasra output saved in the directory: %s' % outDirPath else: errorExit('yasra returned a non-zero code; it may have not completed succesfully', 0) dontChange = ('Makefile',os.path.basename(readsPath), os.path.basename(refPath)) outPaths = [os.path.join(outDirPath, path) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange] renamedPaths = [os.path.join(outDirPath, '%s_%s_%s%s' % (os.path.splitext(path)[0], os.path.basename(readsPath), os.path.basename(refPath), os.path.splitext(path)[1])) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange] for index in range(0,len(outPaths)): os.rename(outPaths[index],renamedPaths[index]) ############################################################################################################################