Runyasra.py

From OpenWetWare
Revision as of 12:09, 8 September 2010 by Zachary S. L. Foster (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search

Copy and past the following into a text editor and save it as runyasra.py to use the script.

#!/usr/bin/env python

#Usage: python runyasra.py [options] <Reads> <Reference>
#
#Options:
#  --version             show program's version number and exit
#  -h, --help            show this help message and exit
#  -t READ_TYPE, --read-type=READ_TYPE
#                        Specify the type of reads. (Default: solexa)
#  -o ORIENTATION, --orientation=ORIENTATION
#                        Specify orientation of the sequence. (Default:
#                        circular)
#  -p PERCENT_IDENTITY, --percent-identity=PERCENT_IDENTITY
#                        The percent identity (PID in yasra). The settings
#                        correspond to different percent values depending on
#                        the read type (-t). (Defalt: same)
#  -m PATH, --make-path=PATH
#                        Specify path to external makefile used by YASRA.
#                        (Default: use the makefile built in to runyasra)
#  -s, --single-step     Activate yasra's single_step option (Default: run
#                        yasra normally)
#  -v, --verbose         Print relevant statistics and progresss reports.
#                        (Default: run silently)
#  -r, --remove-dots-reads
#                        Replace dots with Ns in the reads file before runnning
#                        yasra. The modified file is placed in the output
#                        filder.(Default: create a link to the original file)
#  -f, --remove-dots-ref
#                        Replace dots with Ns in the reference file before
#                        runnning yasra. The modified file is placed in the
#                        output filder.(Default: create a link to the original
#                        file)
#  -d, --dos2unix-ref    Run dos2unix on the reference file before yasra. The
#                        modified file is placed in the output filder.
#                        (Default: create a link to the priginal reference)

def readFile(filePath):
    fileHandle = open(filePath, 'r')
    fileData = fileHandle.read()
    fileHandle.close()
    return fileData

def writeFile(filePath, fileData):
    fileHandle = open(filePath, 'w')
    fileHandle.write(fileData)
    fileHandle.close()

#Error handling function#####################################################################################################
def errorExit(message=None,exitStatus=1):
    '''Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)''' 
    if message:
        print 'Error: ' + message
    else:
        for line in debugLog: print line   #print debug log
    sys.exit(exitStatus)
#############################################################################################################################

###Imports / Variable Initilization##########################################################################################
import os, sys, copy, string
from datetime import *
from optparse import *
from subprocess import *
timeStamp = datetime.now().ctime().replace(' ','-')  #Ex: 'Mon-Jun-14-11:08:55-2010'
commandLine = copy.deepcopy(sys.argv)
yasraPath = '/local/cluster/YASRA/bin'
cwd = os.getcwd()
progName, progVersion = ('runyasra','1.02')
progUsage = 'python %s [options] <Reads> <Reference>' % progName
#############################################################################################################################

###Command Line Parser#######################################################################################################
cmndLineParser  = OptionParser(usage=progUsage, version="Version %s" % progVersion)
cmndLineParser.add_option('-t', '--read-type',          action='store',         default='solexa',   type='choice',  choices=['solexa','454'],                                       help="Specify the type of reads. (Default: solexa)")
cmndLineParser.add_option('-o', '--orientation',        action='store',         default='circular', type='choice',  choices=['circular','linear'],                                  help="Specify orientation of the sequence. (Default: circular")
cmndLineParser.add_option('-p', '--percent-identity',   action='store',         default='same',     type='choice',  choices=['same','high','medium','low','verylow','desperate'],   help="The percent identity (PID in yasra). The settings correspond to different percent values depending on the read type (-t). (Defalt: same)")
cmndLineParser.add_option('-m', '--makefile-path',      action='store',         default=None,       type='string',  metavar='PATH',                                                 help="Specify path to external makefile used by YASRA. (Default: use the makefile built in to runyasra)")
cmndLineParser.add_option('-s', '--single-step',        action='store_true',    default=False,                                                                                      help="Activate yasra's single_step option (Default: run yasra normally)")
cmndLineParser.add_option('-v', '--verbose',            action='store_true',    default=False,                                                                                      help='Print relevant statistics and progresss reports. (Default: run silently)')
cmndLineParser.add_option('-r', '--remove-dots-reads',  action='store_true',    default=False,                                                                                      help='Replace dots with Ns in the reads file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)')
cmndLineParser.add_option('-f', '--remove-dots-ref',    action='store_true',    default=False,                                                                                      help='Replace dots with Ns in the reference file before runnning yasra. The modified file is placed in the output filder.(Default: create a link to the original file)')
cmndLineParser.add_option('-d', '--dos2unix-ref',       action='store_true',    default=False,                                                                                      help='Run dos2unix on the reference file before yasra. The modified file is placed in the output filder. (Default: create a link to the priginal reference)')
(options, args) = cmndLineParser.parse_args(commandLine)
#############################################################################################################################

###Command Line Interpretation###############################################################################################
argNum = len(args) - 1   #counts the amount of arguments, negating the 'runyasra' at the start of the command line
if argNum != 2: errorExit('runyasra takes exactly 2 argumets; %d supplied' % argNum, 0)
readsPath = os.path.join(cwd, args[-2])
refPath = os.path.join(cwd, args[-1])
if os.path.exists(readsPath) == False: errorExit('Invalid path to the reads file: %s' % readsPath,0)
if os.path.exists(refPath) == False: errorExit('Invalid path to the reference file: %s' % refPath,0)
outDirName = '%s_%s_%s' % (os.path.basename(readsPath), os.path.basename(refPath), timeStamp)
outDirPath = os.path.join(os.getcwd(),outDirName)   #output directory is located in the current working directory
os.mkdir(outDirPath)   #create output directory
os.chdir(outDirPath)   #move into the output directory
#############################################################################################################################

###Dot replacement / Dos2unix implimentation#################################################################################
newReadsPath = os.path.join(outDirPath, os.path.basename(readsPath))
newRefPath = os.path.join(outDirPath, os.path.basename(refPath))
if options.remove_dots_reads:
    readsData = readFile(readsPath)
    readsData.replace('.','N')
    writeFile(newReadsPath, readsData)
else:
    os.link(readsPath, newReadsPath)
if options.remove_dots_ref or options.dos2unix_ref:
    if options.remove_dots_ref:
        refData = readFile(refPath)
        refData.replace('.','N')
        writeFile(newRefPath, refData)
    if options.dos2unix_ref:
        dos2unixCmndLine = ['dos2unix','--quiet','--newfile',refPath,newRefPath]
        dos2unixProcess = Popen(dos2unixCmndLine)
        dos2unixProcess.wait()
else:
    os.link(refPath, newRefPath)
#############################################################################################################################

###Makefile Creation#########################################################################################################
if options.makefile_path is None:
    makefileData = '''
#for a full scale assisted assembly:
#  keep files $(READS) Makefile and reference.fa in this directory, change the
#  values of the variables C, READS and then type:
#
#  make TYPE= ORIENT= PID= &> Summary
#  
#  The choices for TYPE are '454' and 'solexa'. '454' refers to reads from 454
#  technology which are more than 150 bp long. 'solexa' refers to short reads
#  which are expected to be less than 50 bps.
#	
#  The choices for ORIENT are 'linear' and 'circular'. It refers to the 
#  orientation of the reference genome. (example: mtDNA would be circular)
#
# The choices for PID are:
# 
# --for 454 reads:
#    'same'       : about 98% identity
#    'high'       : about 95% identity
#    'medium      : about 90% identity
#    'low'        : about 85% identity
#    'verylow'    : about 75% identity
#    'desperate'  : really low scores, because we are despeate (rather slow)
#
# --for solexa reads:
#	'same'        : about 95% identity
#	'medium'      : about 85% identity
#	'desperate'   : low scores, as we are desperate :) (rather slow)
#
# This mode of assembly is rather slow because we try to walk through the small
# gaps and close the contigs. However, if we want to ignore the gaps and want
# quicker results (and this is the mode I would advise most of the time) is by
# typing:
#
#  make single_step TYPE= ORIENT= PID= &> Summary
#
# Final output of the process is the file 'Final_Assembly' which is the fasta
# file of contigs. 'Assembly.qual' which details the base profile for each of 
# the positions in the consensus sequence. A section in this
# file is like:
#
# Contig 3
# .
# .
# .
#
# 302     C       0       181     0       0       0       181
# .
# .
#
# where the depicted line shows the base profile for the position 302 in Contig
# 3. 
# The line is to be read as follows:
# "Base Position" "Consensus Base" "Number of reads supporting 'A'" "Number of
# reads supporting 'C'" "Number of reads supporting 'G'" "Number of reads
# supporting 'T'" "Number of reads supporting a gap" "Total coverage at that
# base" 
#
# Another file Amb.txt is generated which lists the positions the assembler is
# not sure about. Contigs.ace is an ace file which is generated by the
# assembler. Use 'toAmos' to convert it to a format which can be viewed using
# Hawkeye. I have not tested it with Consed.
#
# Running 
#
# make subs 
# creates a file of differences between the target genome and the reference
# genome and gives some statistics about them.
#
# For more information regarding options and tools in YASRA please contact :
# 
# ratan@bx.psu.edu
#
#this needs to be changed to point to the directory with the binaries.
C=''' + yasraPath + '''

#this is the file of reads to be used for the assembly
READS=''' + os.path.basename(readsPath) + '''
#this is a soft link/file which is to be used as a template
TEMPLATE=''' + os.path.basename(refPath) + '''
#this is the length of the ids of the reads. 
WL=`cat $(READS) | grep '>' | awk '{print $$1}' | wc -L`
#is this 454 or solexa data
TYPE=''' + options.read_type + '''
#is this a circular or a linear genome
ORIENT=''' + options.orientation + '''
#maximum length of a read
MAX=`cat $(READS) | awk '{if(substr($$0,1,1) == ">"){if(a>max){max=a}; a=0} else{a = a+length($$0)}}; END{print max}'`

# Q gives parameters for finding weak matches to a rather distant species
# R gives parameters for finding high-identity matches to endogenous DNA

ifeq ($(TYPE), 454)
''' + '\t' + '''MAKE_TEMPLATE=min=150
''' + '\t' + '''ifeq ($(PID),same)
''' + '\t\t' + '''Q=--yasra98
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID),high)
''' + '\t\t' + '''Q=--yasra95
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID),medium)
''' + '\t\t' + '''Q=--yasra90
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), low)
''' + '\t\t' + '''Q=--yasra85
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), verylow)
''' + '\t\t' + '''Q=--yasra75
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), desperate)
''' + '\t\t' + '''Q=Y=2000 K=2200 L=3000
''' + '\t' + '''endif
''' + '\t' + '''R=--yasra98 
endif

ifeq ($(TYPE), solexa)
''' + '\t' + '''MAKE_TEMPLATE=N=100 min=30
''' + '\t' + '''SOLEXA_INSERT=N=100
''' + '\t' + '''SOLEXA=-solexa
''' + '\t' + '''ifeq ($(PID),same)
''' + '\t\t' + '''Q=--yasra95short
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID),medium)
''' + '\t\t' + '''Q=--yasra85short
''' + '\t' + '''endif
''' + '\t' + '''ifeq ($(PID), desperate)
''' + '\t\t' + '''Q=T=0 W=7 K=1200 L=1400
''' + '\t' + '''endif
''' + '\t' + '''R=--yasra95short 
endif

ifeq ($(ORIENT), circular)
''' + '\t' + '''CIRCULAR=-circular
endif

TOOLS=''' + yasraPath + '''
COMPARE=$(TEMPLATE)

CON_INFO=info.txt
REJECT=hits_reject.fa
REJ=Rejects.txt
REP=repeats.txt
HITS=hits_final.fa
all:step1 step2 step3 step4 step5

single_step:
''' + '\t' + '''make final_assembly T=$(TEMPLATE) V=60 P="$Q" I=70 S="-sig=1"
''' + '\t' + '''rm MAlign ReMAlign

step1 :
''' + '\t' + '''#Assemble on the original template:
''' + '\t' + '''make assemble_hits T=$(TEMPLATE) V=60 I=70 P="$Q" N=1

step2:
''' + '\t' + '''touch fake.txt 
''' + '\t' + '''$C/finisher $(REJ) $(REP)
''' + '\t' + '''@rm Assembly* hits* template[0-9]* $(REP) fake.txt

step3:
''' + '\t' + '''#Determine difference between reads and assembly:
''' + '\t' + '''lastz template $(READS) $R | $C/lav_processor | $C/template_hits template $(READS) cov=70 pct=90 stats    discard=$(REJECT) > read_hits
''' + '\t' + '''make plot H=read_hits

step4:
''' + '\t' + '''#Trim Assembly:
''' + '\t' + '''if [ -s plot_rejects.eps ]; then mv plot_rejects.eps old_plot_rejects.eps; fi
''' + '\t' + '''$C/trim_assembly template read_hits $(REJECT) min=1 > AssemblyX
''' + '\t' + '''$C/make_template AssemblyX noends $(MAKE_TEMPLATE) info=$(CON_INFO) > ftemplate
''' + '\t' + '''make assemble_hits T=ftemplate V=70 I=90 P="$R" N=Y
''' + '\t' + '''$C/make_template AssemblyY noends $(MAKE_TEMPLATE) info=$(CON_INFO) > final_template
''' + '\t' + '''lastz final_template $(READS) $R | $C/lav_processor | $C/template_hits final_template $(READS) cov=70 pct=90 stats discard=$(REJECT) > read_hits
''' + '\t' + '''make plot H=read_hits
''' + '\t' + '''@rm read_hits AssemblyX AssemblyY ftemplate Assembly_ftemplate hits_ftemplate.fa

step5:
''' + '\t' + '''make final_assembly T=final_template V=70 P="$R" I=90 S="-sig=1"
''' + '\t' + '''@rm final_template template MAlign ReMAlign

final_assembly:
''' + '\t' + '''time lastz $T $(READS) $P | time $C/lav_processor $S  | time $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT)  > $(HITS)
''' + '\t' + '''cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n |  uniq -w $(WL) -D | awk '{print $$2}' > $(REP)
''' + '\t' + '''time $C/assembler hits_final.fa -ace=Contigs.ace -rejects=$(REJ) -repeats=$(REP) $(SOLEXA) -max_length=$(MAX) > MAlign
''' + '\t' + '''time $C/realigner -ace=Contigs.ace < MAlign > ReMAlign
''' + '\t' + '''time $C/consense -ace=Contigs.ace -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Final_Assembly

plot:
''' + '\t' + '''make -f $(TOOLS)/tools.mk rejects H=$H R=$(REJECT) C=$(CON_INFO);

stepx:
''' + '\t' + '''#Assemble on the endogenous contigs:
''' + '\t' + '''cp fake.txt $(CON_INFO)
''' + '\t' + '''$C/make_template Assembly$W info=fake.txt $(MAKE_TEMPLATE)> template$X
''' + '\t' + '''make assemble_hits T=template$X V=70 P="$R" I=90 N=$X


assemble_hits : 
''' + '\t' + '''time lastz $T $(READS) $P | $C/lav_processor  | $C/template_hits $T $(READS) cov=$V pct=$I discard=$(REJECT) stats > hits_$T.fa
''' + '\t' + '''time make assemble HITS=hits_$T.fa
''' + '\t' + '''$C/welder Assembly_$T $(CIRCULAR) $(SOLEXA) | $C/consense > Assembly$N

assemble:
''' + '\t' + '''cat $(HITS) | grep '>Hit' | awk '{print $$4, $$1}' | sort -n |  uniq -w $(WL) -D | awk '{print $$2}' > $(REP)
''' + '\t' + '''$C/assembler $(HITS) -repeats=$(REP) $(SOLEXA) -rejects=$(REJ) -max_length=$(MAX) | $C/realigner > ReMAlign
''' + '\t' + '''$C/consense -amb=Amb.txt -profile=Assembly.qual < ReMAlign > Assembly_$T


subs:
''' + '\t' + '''lastz $(COMPARE) Final_Assembly B=0 --identity=95..100 > fake.bz
''' + '\t' + '''$C/substitutions fake.bz $(COMPARE) Final_Assembly substitutions gaps > subs.txt


clean:
''' + '\t' + '''rm -r `ls -1 | grep -v "Makefile\|$(READS)\|reference.fa\|README"`
'''
else:
    makefileData = readFile(options.makefile_path)
makefilePath = os.path.join(outDirPath, 'Makefile')
makefileHandle = open(makefilePath, 'w')
makefileHandle.write(makefileData)
makefileHandle.close()
#############################################################################################################################

def printProcess(process):
    '''Prints the standard ouput and error messages of a process while the process ia running.'''
    outLen = 0
    errLen = 0
    out = ''
    err = ''
    while process.poll() is None:  #while it hasnt finished...
        (outData, errData) = process.communicate()
        outDiff =  len(outData) - outLen
        errDiff =  len(errData) - errLen
        if outDiff > 0:
            outLen += outDiff
            out += outData[-outDiff:]
            print outData[-outDiff:]
        if errDiff > 0:
            errLen += errDiff
            err += errData[-errDiff:]
            print errData[-errDiff:]
    return (out, err)

###Run yasra#################################################################################################################
makeCmndLine = ['make','TYPE=' + options.read_type,'ORIENT=' + options.orientation,'PID=' + options.percent_identity]
if options.single_step: makeCmndLine.insert(1,'single_step')
makeStdOut = open(os.path.join(outDirPath, 'yasra_standard_output.txt'), 'w')
makeStdErr = open(os.path.join(outDirPath, 'yasra_standard_error.txt'), 'w')
makeProcess = Popen(makeCmndLine, stdout=PIPE, stderr=PIPE)
if options.verbose:
    (outData, errData) = printProcess(makeProcess)
else:
    (outData, errData) = makeProcess.communicate()  
makeProcess.wait()
makeStdOut.write(outData)
makeStdErr.write(errData)
makeStdOut.close()
makeStdErr.close()
if makeProcess.returncode == 0:   #yasra completed normally
    if options.verbose: print 'yasra output saved in the directory: %s' % outDirPath
else:
    errorExit('yasra returned a non-zero code; it may have not completed succesfully', 0)
dontChange = ('Makefile',os.path.basename(readsPath), os.path.basename(refPath))
outPaths = [os.path.join(outDirPath, path) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange]
renamedPaths = [os.path.join(outDirPath, '%s_%s_%s%s' % (os.path.splitext(path)[0], os.path.basename(readsPath), os.path.basename(refPath), os.path.splitext(path)[1])) for path in os.listdir(outDirPath) if os.path.isfile(path) and path not in dontChange]
for index in range(0,len(outPaths)): os.rename(outPaths[index],renamedPaths[index])
############################################################################################################################