Qualtofa.py

Copy and past the following into a text editor and save it as qualtofa.py to use the script.
#!/usr/bin/env python

###Imports / Variable Initilization##########################################################################################
import os, string, sys, time, copy
from datetime import *
from optparse import *
from subprocess import *
commandLine = copy.deepcopy(sys.argv)
cwd = os.getcwd()
timeStamp = datetime.now().ctime().replace(' ','-').replace(':','-').replace('--','-')  #Ex: 'Mon-Jun-14-11-08-55-2010'
cntgMatchRanges = None
progName, progVersion = ('qualtofa.py','2.52')
progUsage = 'python %s [options] <.qual file>' % progName
printLog = []
#############################################################################################################################

def errorExit(message=None,exitStatus=1):
    '''Version 1.0
    Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)'''
    
    if message:
        print '%s: Error: %s' % (progName, message)
    else:
        print progName +': Unknown error, printing debug log...'
        for line in debugLog: print line   #print debug log
    sys.exit(exitStatus)

def mkRange(numList):
    '''Function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)'''
    outList = []
    if len(numList) > 0:
        start   = numList[0]
        current = numList[0]
        numList.append(numList[-1])
        for num in numList[1:]:
            if (num - 1) != current:
                outList.append([start,current])
                start = num
            current = num
    return outList

def readLFile(filePath, numLinesToRead = None):
    fileHandle = open(filePath, 'r')
    if numLinesToRead is None:
        fileData = fileHandle.readlines()
    else:
        fileData = [fileHandle.readline() for count in range(0,numLinesToRead)]
    fileHandle.close()
    return fileData

def writeLFile(filePath, fileData):
    fileHandle = open(filePath, 'w')
    for line in fileData:
        fileHandle.write(fileData)
    fileHandle.close()

def saveQualStandard(filePath, contigs, qualValues, headers, schema=None):
    if schema is None: schema = ['Position in the contig(1 based)', 'Called base', "Number of reads suppoting 'A'", "Number of reads supporting 'C'", "Number of reads supporting 'G'", "Number of reads supporting 'T'", "Number of reads supporting '_'", 'Coverage']
    if len(contigs) == 0: errorExit('saveQualStandard: empty list supplied')
    if len(contigs) != len(qualValues) or len(contigs) != len(headers): errorExit('saveQualStandard: Unequal number of entries is input list')
    outHandle = open(filePath, 'w')
    outHandle.write('Schema:'+ '\n' + '\t'.join(schema) + '\n')
    for cIndex in range(0,len(contigs)):
        outHandle.write(headers[cIndex] + '\n')
        for bIndex in range(0,len(contigs[cIndex])): outHandle.write('\t'.join([contigs[cIdnex][bIndex]] + qualValues[cIndex]) + '\n')
    outHandle.close()
            
def parseQualStandard(filePath):
    columbWidth = 8
    fileData = [line.strip().split('\t') for line in readLFile(filePath)]   #fileData is a list of lists representing the tab-deliminated structure of the .qual file      
    if fileData[0] == ['Schema:']: del fileData[0]   #removes the unessesary first line of the file, if present
    schema = fileData[0]   #extract the tab-deliminated schema into a list
    headers, contigs, qualValues = ([], [], [])   #initilizes multiple lists at a time
    index = 0
    for line in fileData[1:]:
        if len(line) == 1:   #if it is a header
            headers.append(line[0])
            contigs.append([])
            qualValues.append([])
        elif len(line) == columbWidth:   #if it is  quality value
            contigs[-1].append(line[1])
            qualValues[-1].append([int(num) for num in line[2:]])   #converts all of the quality values into integer objects and saves them in qualvalues
        else: errorExit('parseQualStandard: Quality file at %s deviated from the expected format on line %d.' % (filePath, index))
        index += 1        
    return (contigs, qualValues, headers, schema)

def parseQualAlignment(filePath):
    columbWidth = 9
    headColNum = 2
    fileData = [[part.strip() for part in line.split('\t')] for line in readLFile(filePath)]   #fileData is a list of lists representing the tab-deliminated structure of the .qual file  
    if fileData[0] == ['Schema:']: del fileData[0]   #removes the unessesary first line of the file, if present
    schema = fileData[0]   #extract the tab-deliminated schema into a list
    refHeader = fileData[1][0]
    reference = [line[1] for line in fileData[2:]]   #extracts the reference from the second columb
    qualData = [line[headColNum:] for line in fileData[2:]]   #removes the first lines and columbs of fileData, which dont contain unextracted data
    headers, contigs, cntgStarts, qualValues = ([], [], [], [])   #initilizes multiple lists at a time
    refPos = 0
    for line in qualData:
        if len(line) > 0:
            for index in range(0,len(line),columbWidth):
                if headers.count(line[index]) == 0:   #if the contig header has not been encountered before
                    headers.append(line[index])
                    cntgStarts.append(refPos)
                    contigs.append([])
                    qualValues.append([])
                headIndex = headers.index(line[index])
                contigs[headIndex].append(line[index + 2])   #append the called base character to its contig
                qualValues[headIndex].append([int(num) for num in line[index + 3: index + columbWidth]])   #converts all of the quality values into integer objects and saves them in qualvalues 
        refPos += 1
    alignmentIndexs = [[cntgStarts[index], cntgStarts[index] + len(contigs[index]) - 1] for index in range(0,len(cntgStarts))]
    return (contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema)

def parseQual(filePath):
    columbWidth = 9
    headColNum = 2
    fileHead = readLFile(filePath, 2)
    if fileHead[0].strip() == 'Schema:': del fileHead[0]   #removes the unessesary first line of the file, if present
    schema = fileHead[0].strip().split('\t')
    if schema[1] == 'Reference' and len(schema) % columbWidth == headColNum:
        return parseQualAlignment(filePath)
    elif len(schema) == 8:
        return parseQualStandard(filePath)
    else: errorExit('parseQual: The quality file at %s has an unreconized format based on the schema on line 2.' % filePath)

def listToStr(aList):
    out = ''
    for char in aList: out += char
    return out

def saveFasta(filePath, headers, sequences):
    outHandle = open(filePath, 'w')
    if len(headers) != 0 and len(sequences) != 0:
        if len(headers) != len(sequences): errorExit('saveFastaFile: different number of headers and sequences')
        if type(headers) == str and type(sequences) == str:
            headers = [headers]
            sequences = [sequences]
        if type(sequences[0]) == list: sequences = [listToStr(seq) for seq in sequences]
        for index in range(0,len(headers)):
            outHandle.write('>%s\n' % headers[index])
            if type(sequences[index]) == str: sequences[index] = [sequences[index]]  #if there is only one line of sequence for a given header, make it a one-item list for the next loop
            for seq in sequences[index]:
                outHandle.write(seq + '\n')
    outHandle.close()

def parseFasta(filePath):
    headers = []
    seqs = []
    fileHandle = open(filePath, 'r')
    fileData = fileHandle.readlines() + ['>']   #the additional header line is there make to following for loop process the last sequence 
    fileHandle.close()
    currSeq = ''
    for line in fileData:
        if line[0] == '>':   #if the line is a fasta header
            headers.append(line[1:].strip() )   #the fisrt character is the '>' and the last is a newline '\n'
            seqs.append(currSeq)
            currSeq = ''
        else:
            currSeq += line.strip()
    return (headers[:-1], seqs[1:])

def removeRepeats(aList):
    for entry in aList:
        if aList.count(entry) > 1:
            aList.remove(entry)
            return removeRepeats(aList)
    return aList

def maskingCallback(option, opt_str, value, parser):
    value = []
    def floatable(str):
        try:
            float(str)
            return True
        except ValueError: return False
    for arg in parser.rargs:         
        if arg[:2] == "--" and len(arg) > 2: break   # stop on --foo like options             
        if arg[:1] == "-" and len(arg) > 1 and not floatable(arg): break   # stop on -a, but not on -3 or -3.0
        value.append(arg)
        if len(value) == 2: break 
    if len(value) == 0: errorExit("option '%s' requires one or two arguments; none supplied." % option.dest)
    del parser.rargs[:len(value)]
    if len(value) == 1: value.append(0)
    value[0] = int(value[0])
    value[1] = float(value[1])
    if value[0] < 0: errorExit("The first argument of the option '%s' must be a positive integer; %d supplied." % (option.dest,value[0]))
    if value[1] < 0 or value[1] > 1: errorExit("The second argument of the option '%s' must be a decimal between 0 and 1; %f supplied." % (option.dest,value[1]))
    setattr(parser.values, option.dest, value)

###Command Line Parser#######################################################################################################
cmndLineParser  = OptionParser(usage=progUsage, version="Version %s" % progVersion)
maksingGroup = OptionGroup(cmndLineParser, 'Coverage and Call Proportion Masking','The following options take one integer argument and one decimal argument between 0 and 1, if the second is not supplied it is assumed to be 0.')
cmndLineParser.add_option(   "-c",   "--include-contigs",    action="store_true",    default=False,                                                             help="Include each contig on its own line. (Default: only output the consensus)")
maksingGroup.add_option(   "-m",   "--mask-contigs",       action="callback",      default=[0,0],      metavar='INT FLOAT',   callback=maskingCallback,    dest='mask_contigs',      help="Set minimum coverage depth and call proportion for contig masking; independent of the consensus. Requires the use of the -c modifier.(Default: 0, 0)")
maksingGroup.add_option(   "-s",   "--mask-contig-SNPs",   action="callback",      default=[0,0],      metavar='INT FLOAT',   callback=maskingCallback,    dest='mask_contig_SNPs',  help="Set minimum coverage depth and call proportion for contig SNP masking; independent of the consensus. Requires the use of the -c modifier.(Default: 0, 0)")
maksingGroup.add_option(   "-M",   "--mask-consensus",     action="callback",      default=[0,0],      metavar='INT FLOAT',   callback=maskingCallback,    dest='mask_consensus',    help="Set minimum coverage depth and call proportion for the consensus; a new masked sequence will be added to the output file. (Default: 0, 0)")
maksingGroup.add_option(   "-S",   "--mask-SNPs",          action="callback",      default=[0,0],      metavar='INT FLOAT',   callback=maskingCallback,    dest='mask_SNPs',         help="Set minimum coverage depth and call proportion for SNPs in the consensus; a new masked sequence will be added to the output file. (Default: 0, 0)")
cmndLineParser.add_option_group(maksingGroup)
cmndLineParser.add_option(   "-t",   "--end-trim",           action="store",         default=0,          type="int",    metavar="INT",      help="Trim all the bases on either end of all contigs by a specified number of bases. (Default: 0)")
cmndLineParser.add_option(   "-q",   "--end-trim-qual",      action="store",         default=0,          type="int",    metavar="INT",      help="Trim all the bases on either end of all contigs that have a quality value less than the specified amount. This option is implemented after standard end trimming (see -t option). (Default: 0)")
cmndLineParser.add_option(   "-l",   "--min-match-length",   action="store",         default=0,          type="int",    metavar="INT",      help="Only include contigs that have a least one match longer or as long as the specified amount. This is implemented after any end trimming (see -q and -t options). (Default: 0)")
cmndLineParser.add_option(   "-n",   "--no-overlap",         action="store_true",    default=False,                                         help="Add deletions (i.e. -'s) to the reference to accommodate any overlapping sequence, including unmatched sequence. (Default: Condense all overlapping regions of the consensus into IUPAC ambiguity codes.)")
cmndLineParser.add_option(   "-a",   "--no-match-overlap",   action="store_true",    default=False,                                          help="Add deletions to the reference to accommodate overlapping matches. (Default: Condense all overlapping regions of the consensus into IUPAC ambiguity codes.)")
cmndLineParser.add_option(   "-k",   "--keep-contained",     action="store_true",    default=False,                                         help="Include contained contigs in the alignment and consensus. (Default: save sequences of contained contigs to a separate file)")
cmndLineParser.add_option(   "-p",   "--save-SNPs",          action="store_true",    default=False,                                         help="Save SNPs to a .qual file. (Default: Do not save SNP file)")
cmndLineParser.add_option(   "-d",   "--dont-align-contigs", action="store_true",    default=False,                                         help="Do not align contigs to the reference. Requires the use of the -c modifier. (Default: align contigs to the reference)")
cmndLineParser.add_option(   "-r",   "--external-ref",       action="store",         default=None,       type="string", metavar="PATH",     help="Specify a reference to use instead of the one contained in the quality file.")
cmndLineParser.add_option(   "-v",   "--verbose",            action="store_true",    default=False,                                         help="Print progress updates and relevant statistics to the standard output. (Default: run silently)")
cmndLineParser.add_option(   "-i",   "--save-run-info",      action="store_true",    default=False,     help="Save a txt file of anything printed to the screen. Requires the use of -v/--verbose. (Default: Do not save)")
(options, args) = cmndLineParser.parse_args(commandLine)
if options.include_contigs == False:
    if sum(options.mask_contigs) > 1: errorExit('The option -m/--mask-contigs requires the use of the option -c/--include-contigs.')
    if sum(options.mask_contig_SNPs) > 1: errorExit('The option -s/--mask-contig-SNPs requires the use of the option -c/--include-contigs.')
    if options.dont_align_contigs: errorExit('The option -d/--dont-align-contigs requires the use of the option -c/--include-contigs.')
if len(args) == 1:
    cmndLineParser.print_help()
    sys.exit(0)
argNum = len(args) - 1   #counts the amount of arguments, negating the 'qualtofa' at the start of the command line
if argNum != 1: errorExit('qualtofa takes exactly 1 argument; %d supplied' % argNum, 0)
qualPath = args[1]   #The file path to the input file supplied by the user
#############################################################################################################################

def verbose(toPrint):
    if options.verbose:
        print toPrint,
        printLog.append(toPrint)

###Quality file parsing######################################################################################################
verbose('qualtofa: Loading data from %s...' % qualPath)
qualData = parseQual(qualPath)
verbose('Complete\n')
if len(qualData) == 4:   #if the quality file contains a list of unaligned contigs (e.g. one directly from yasra)
    isAlignment = False
    if options.no_match_overlap: errorExit('The option -a/--no-match-overlap requires an aligned quality consensous file, like that given by sumqual.py.')
    if any(options.mask_consensus): errorExit('The option -M/--mask-consensus requires an aligned quality consensous file, like that given by sumqual.py.')
    if any(options.mask_SNPs): errorExit('The option -S/--mask-SNPs requires an aligned quality consensous file, like that given by sumqual.py.')
    if any(options.mask_contig_SNPs): errorExit('The option -s/--mask-contig-SNPs requires an aligned quality consensous file, like that given by sumqual.py.')
    if options.keep_contained: errorExit('The option -k/--keep-contained requires an aligned quality consensous file, like that given by sumqual.py.')
    if options.min_match_length: errorExit('The option -l/--min-match-length requires an aligned quality consensous file, like that given by sumqual.py.')
    if options.save_SNPs: errorExit('The option -p/--save-SNPs requires an aligned quality consensous file, like that given by sumqual.py.')
    if options.no_overlap: errorExit('The option -n/--no-overlap requires an aligned quality consensous file, like that given by sumqual.py.')
    if options.external_ref: errorExit('The option -r/--external-ref requires an aligned quality consensous file, like that given by sumqual.py.')
    contigs, qualities, headers, schema = qualData
elif len(qualData) == 7:   #if the quality file contains an alignment (i.e. from sumqual)
    isAlignment = True
    contigs, qualities, headers, alignmentIndexs, reference, refHeader, schema = qualData
else: errorExit()
#############################################################################################################################

###External reference parsing(-r option implimintation)######################################################################
if options.external_ref is not None:
    verbose('qualtofa: option -r/--external-ref: loading reference at %s...' % options.external_ref)
    (seqs,headers) = parseFasta(options.external_ref)
    reference, refHeader = seqs[0], headers[0]
    verbose('Complete\n')
#############################################################################################################################

def delContig(index):
    del contigs[index]
    del headers[index]
    del qualities[index]
    if isAlignment:
        del alignmentIndexs[index]
        if cntgMatchRanges is not None: del cntgMatchRanges[index]

def delContigs(indexes):
    if len(indexes) > 0:    
        indexes.sort()
        indexes.reverse()
        indexes = removeRepeats(indexes)
        for index in indexes: delContig(index)   #deletes all contigs

def delBase(cntgIndex, baseIndex):
    del qualities[cntgIndex][baseIndex]
    del contigs[cntgIndex][baseIndex]
    if isAlignment:
        if baseIndex == 0: alignmentIndexs[cntgIndex][0] += 1
        else: alignmentIndexs[cntgIndex][1] -= 1 

def delBases(cntgIndex, baseIndexes):
    baseIndexes.sort()
    baseIndexes.reverse()
    for index in baseIndexes: delBase(cntgIndex, index)

###Contig end Trimming (-q and -t option implimentation)#####################################################################
if options.end_trim > 0:
    verbose('qualtofa: option -t/--end-trim: Trimming %d bases from both ends of each contig...' % options.end_trim)
    rejectIndexes = [index for index in range(0,len(contigs)) if len(contigs[index]) <= options.end_trim * 2]   #makes a list of all the indexs of contigs that are too short to be trimmed. 
    delContigs(rejectIndexes)   #deletes all rejected contigs
    contigs = [cntg[options.end_trim : -options.end_trim] for cntg in contigs]   #trims all remaining contigs
    qualities = [qual[options.end_trim : -options.end_trim] for qual in qualities]   #trims all remaining contigs
    alignmentIndexs = [[indexRange[0] + options.end_trim,indexRange[1] - options.end_trim]  for indexRange in alignmentIndexs]
    if options.verbose:
        verbose('Complete\n')
        if len(rejectIndexes) > 0: verbose('option -t/--end-trim: %d contig(s) were removed from the analysis because they were shorter than twice the trim length of %d.\n' % (len(rejectIndexes),options.end_trim))

if options.end_trim_qual > 0:
    verbose('qualtofa: option -q/--end-trim-qual: Trimming contig ends until a base with a coverage of %d or more is reached...' % options.end_trim_qual)
    for index in range(0, len(contigs)):
        while len(qualities[index]) > 0 and qualities[index][0][-1] < options.end_trim_qual: delBase(index,0)
        while len(qualities[index]) > 0 and qualities[index][-1][-1] < options.end_trim_qual: delBase(index,-1)
    rejectIndexes = [index for index in range(0,len(qualities)) if len(qualities[index]) == 0] #makes a list of all the indexs of contigs that dont contain sequence after trimming
    delContigs(rejectIndexes)   #deletes all rejected contigs
    if options.verbose:
        verbose('Complete\n')
        if len(rejectIndexes) > 0: verbose('option -q/--end-trim-qual: %d contig(s) removed from the analysis because all of their sequence had quality values less than %d.\n' % (len(rejectIndexes),options.end_trim_qual))
#############################################################################################################################

###Contigs Match Coordinate Extraction (-l option implimentation)############################################################
if isAlignment:
    cntgMatchRanges = [mkRange([index for index in range(0,len(cntg)) if cntg[index].isupper()]) for cntg in contigs]
    if options.min_match_length > 0:
        verbose('qualtofa: option -l/--min-match-length: filtering contigs without a match of at least %d base pairs...' % options.min_match_length)
        maxMatchLens = [max([match[1] - match[0] + 1 for match in matchList] + [0]) for matchList in cntgMatchRanges]
        rejectIndexes = [index for index in range(0,len(contigs)) if maxMatchLens[index] < options.min_match_length]
        delContigs(rejectIndexes)
        if options.verbose:
            verbose('Complete\n')
            if len(rejectIndexes) > 0: verbose('option -l/--min-match-length: %d contig(s) removed from the analysis due to the lack of a match %d base pairs long.\n' % (len(rejectIndexes),options.min_match_length))
#############################################################################################################################

###Contained contig extraction (-k option implimentation)####################################################################
if isAlignment:
    verbose('qualtofa: Searching for contained contigs...')
    matchRanges = [[cntgMatchRanges[index][0][0] + alignmentIndexs[index][0],cntgMatchRanges[index][-1][1] + alignmentIndexs[index][0]] for index in range(0,len(cntgMatchRanges))]
    containedIndexes = [[bIndex for bIndex in range(0,len(matchRanges)) if matchRanges[aIndex][0] <= matchRanges[bIndex][0] and matchRanges[aIndex][1] >= matchRanges[bIndex][1] and aIndex != bIndex] for aIndex in range(0,len(matchRanges))]
    containedIndexes = [index for contained in containedIndexes for index in contained]
    containedContigs = [(headers[index], contigs[index]) for index in containedIndexes]
    verbose('Complete\n')
    if options.keep_contained == False:
        delContigs(containedIndexes)
        if len(containedIndexes) > 0: verbose('qualtofa: %d contained contig(s) removed from the analysis.\n' % len(containedIndexes))
    elif len(containedIndexes) > 0: verbose('qualtofa: option -k/--keep-contained: %d contained contig(s) included in the analysis.\n' % len(containedIndexes))
#############################################################################################################################

def getIUPAC(baseValues):
    '''Takes a list of base character values (ACTG-~N) and condenses them into a single IUPAC code. Assumes only input characters to be ACTG-~N'''
    def IUPAC(bases):
        conversion = {'CT':'Y','AG':'R','AT':'W','CG':'S','GT':'K','AC':'M','AGT':'D','ACG':'V','ACT':'H','CGT':'B','ACGT':'N'}
        bases.sort()  #['A', 'C', 'G', 'T']
        if bases[0].islower():
            bases = map(str.upper,bases)
            return str.lower(conversion[''.join(bases)])
        else: return conversion[''.join(bases)]
    chars = removeRepeats(baseValues)
    chars.sort()  #['-', 'A', 'C', 'G', 'N', 'T', 'a', 'c', 'g', 'n', 't', '~']
    if len(chars) == 0: return '-'
    if len(chars) == 1: return chars[0]
    if chars[-1] == '~':   #converts '~' to '-'
        del chars[-1]
        if chars[0] != '-': chars.insert(0,'-')
    priorityList = ('ACGT','N','-','acgt','n')
    for group in priorityList:
        matchs = [base for base in chars if group.find(base) != -1]
        if len(matchs) == 0: continue
        elif len(matchs) == 1: return matchs[0]
        else: return IUPAC(matchs)

###Overlap handleing (-i, -n options implimentation)############################################################################
if isAlignment:
    if options.no_match_overlap or options.no_overlap:
        if options.no_overlap: verbose('qualtofa: option -n/--no-overlap: Adding deletions to the reference to accomodate all overlapping sequence...')
        else: verbose('qualtofa: option -a/--no-match-overlap: Adding deletions to the reference to accomodate overlapping matches...')
        matchRanges = [[cntgMatchRanges[index][0][0] + alignmentIndexs[index][0],cntgMatchRanges[index][-1][1] + alignmentIndexs[index][0]] for index in range(0,len(cntgMatchRanges))]
        if options.no_overlap: interSeqGaps = [alignmentIndexs[index + 1][0] - alignmentIndexs[index][1] -1 for index in range(0,len(contigs) - 1)]
        else: interSeqGaps = [matchRanges[index + 1][0] - matchRanges[index][1] - 1 for index in range(0,len(contigs) - 1)]
        verbose('Complete\n')
        gapSum = sum([abs(gap) for gap in interSeqGaps if gap < 0])
        totalLen = sum(map(len,contigs))
        if options.no_overlap: verbose('qualtofa: option -n/--no-overlap: %d of %d total bases (%d percent) in the consensus is overlaping sequence.\n' % (gapSum,totalLen,gapSum*100/len(reference)))
        else: verbose('qualtofa: option -a/--no-match-overlap: %d of %d total bases (%.1f%%) in the consensus are overlapping matchs.\n' % (gapSum,totalLen,float(gapSum)*100/len(reference)))
        refOffset = 0
        for index in range(0,len(interSeqGaps)):
            if interSeqGaps[index] < 0:   #if matching sequence overlaps...
                for count in range(0,abs(interSeqGaps[index])): reference.insert(matchRanges[index][1] + refOffset + 1, '-')
                refOffset += abs(interSeqGaps[index])
            alignmentIndexs[index + 1][0] += refOffset
            alignmentIndexs[index + 1][1] += refOffset
##########################################################################################################################################

def getProp(call, qual):
    equivalancies = {'A':0,'C':1,'G':2,'T':3,'N':4}
    try: callIndex = equivalancies[call.upper()]
    except KeyError: errorExit('getProp: Non-neucleotide base %s encountered...' % call)
    if callIndex == 4:
        return float(max(qual[:4]))/float(qual[-1])
    return float(qual[callIndex])/float(qual[-1])    
    
###Consensous Creation (-M, -S options implimentation)############################################################################
if isAlignment:
    verbose('qualtofa: Creating consensus...')
    numMasked, numQualMasked, SNPmasked, SNPqualmasked, numPropMasked, SNPpropMasked, SNPcount = (0,0,0,0,0,0,0)
    if sum(options.mask_SNPs + options.mask_consensus) > 0: includeMaskedCon = True
    else: includeMaskedCon = False
    rawConsensous = [[] for index in range(0,len(reference))]
    if includeMaskedCon: maskedRawCon = [[] for index in range(0,len(reference))]
    for cIndex in range(0,len(contigs)):
        for bIndex in range(0,len(contigs[cIndex])):
            rIndex = alignmentIndexs[cIndex][0] + bIndex
            refChar = reference[rIndex]
            cntgChar = contigs[cIndex][bIndex]
            rawConsensous[rIndex].append(cntgChar)
            if includeMaskedCon:
                if cntgChar.isupper() and refChar.isalpha() and refChar != cntgChar:
                    SNPcount += 1
                    if qualities[cIndex][bIndex][-1] < options.mask_SNPs[0]:
                        cntgChar = 'N'
                        SNPqualmasked += 1
                    if getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_SNPs[1]:
                        cntgChar = 'N'
                        SNPpropMasked += 1
                    if qualities[cIndex][bIndex][-1] < options.mask_SNPs[0] or getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_SNPs[1]:
                        SNPmasked += 1
                if contigs[cIndex][bIndex].isalpha():
                    if qualities[cIndex][bIndex][-1] < options.mask_consensus[0]:
                        numQualMasked += 1
                        if contigs[cIndex][bIndex].isupper(): cntgChar = 'N'
                        else: cntgChar = 'n'
                    if getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_consensus[1]:
                        numPropMasked += 1
                        if contigs[cIndex][bIndex].isupper(): cntgChar = 'N'
                        else: cntgChar = 'n'
                    if qualities[cIndex][bIndex][-1] < options.mask_consensus[0] or getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_consensus[1]:
                        numMasked += 1
                maskedRawCon[rIndex].append(cntgChar)           
    conSequence = [getIUPAC(charList) for charList in rawConsensous]
    if includeMaskedCon: maskedConSeq = [getIUPAC(charList) for charList in maskedRawCon]
    verbose('Complete\n')
    totalLen = len(conSequence)
    if sum(options.mask_SNPs) > 0: verbose('qualtofa: %d of %d total consensus SNPs (%.1f%%) were masked.\n' % (SNPmasked,SNPcount, float(SNPmasked*100)/SNPcount))
    if options.mask_SNPs[0] > 0: verbose('qualtofa: option -S/--mask-SNPs: %d of %d total SNPs (%.1f%%) have less than %d coverage depth.\n' % (SNPqualmasked,SNPcount, float(SNPqualmasked*100)/SNPcount, options.mask_SNPs[0]))
    if options.mask_SNPs[1] > 0: verbose('qualtofa: option -S/--mask-SNPs: %d of %d total SNPs (%.1f%%) have less than a %.2f call proportion.\n' % (SNPpropMasked,SNPcount, float(SNPpropMasked*100)/SNPcount, options.mask_SNPs[1]))
    if options.mask_consensus[0] > 0: verbose('qualtofa: %d of %d total consensus bases (%.1f%% )were masked.\n' % (numMasked,totalLen, float(numMasked*100)/totalLen))        
    if options.mask_consensus[0] > 0: verbose('qualtofa: option -M/--mask-consensus: %d of %d total bases (%.1f%%) have less than %d coverage depth.\n' % (numQualMasked,totalLen, float(numQualMasked*100)/totalLen, options.mask_consensus[0]))        
    if options.mask_consensus[1] > 0: verbose('qualtofa: option -M/--mask-consensus: %d of %d total bases (%.1f%%) have less than a %.2f call proportion.\n' % (numPropMasked,totalLen, float(numPropMasked*100)/totalLen, options.mask_consensus[1]))        
#############################################################################################################################

        
###Contigs SNP Detection/Masking(-s option implimentation)###################################################################
if isAlignment:
    SNPcovNum, SNPpropNum, SNPmaskedNum, SNPcount = (0,0,0,0)
    SNPs = []
    for cIndex in range(0,len(contigs)):
        for bIndex in range(0,len(contigs[cIndex])):
            refPos = bIndex +  alignmentIndexs[cIndex][0]
            refChar = reference[refPos]
            cntgChar = copy.deepcopy(contigs[cIndex][bIndex])
            if cntgChar.isupper() and refChar.isalpha() and cntgChar.isalpha() and refChar != cntgChar:
                SNPcount += 1
                SNPs.append([refPos,refChar,headers[cIndex],bIndex,cntgChar] + qualities[cIndex][bIndex])
                if qualities[cIndex][bIndex][-1] < options.mask_contig_SNPs[0]:
                    contigs[cIndex][bIndex] = 'N'
                    SNPcovNum += 1
                if getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contig_SNPs[1]:
                    contigs[cIndex][bIndex] = 'N'
                    SNPpropNum += 1
                if qualities[cIndex][bIndex][-1] < options.mask_contig_SNPs[0] or getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contig_SNPs[1]:
                    SNPmaskedNum += 1
    totalLen = sum(map(len,contigs))
    if sum(options.mask_contig_SNPs) > 0: verbose('qualtofa: %d of %d total SNPs (%.1f%%) were masked\n' % (SNPmaskedNum,SNPcount,float(SNPmaskedNum*100)/SNPcount)) 
    if options.mask_contig_SNPs[0] > 0: verbose('qualtofa: option -s/--mask-contig-SNPs: %d of %d total SNPs (%.1f%%) have less than %d coverage depth.\n' % (SNPcovNum,SNPcount,float(SNPcovNum*100)/SNPcount,options.mask_contig_SNPs[0])) 
    if options.mask_contig_SNPs[1] > 0: verbose('qualtofa: option -s/--mask-contig-SNPs: %d of %d total SNPs (%.1f%%) have less than a %.2f call proportion.\n' % (SNPpropNum,SNPcount,float(SNPpropNum*100)/SNPcount,options.mask_contig_SNPs[1]))     
#############################################################################################################################

###Contigs Masking(-m option implimentation)#################################################################################
if sum(options.mask_contigs) > 0:
    numMasked, numQualMasked, numPropMasked = (0,0,0)
    for cIndex in range(0,len(contigs)):
        for bIndex in range(0,len(contigs[cIndex])):
            cntgChar = copy.deepcopy(contigs[cIndex][bIndex])
            if cntgChar.isalpha():
                if qualities[cIndex][bIndex][-1] < options.mask_contigs[0] or getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contigs[1]:
                    numMasked += 1
                if qualities[cIndex][bIndex][-1] < options.mask_contigs[0]:
                    if contigs[cIndex][bIndex].isupper(): contigs[cIndex][bIndex] = 'N'
                    else: contigs[cIndex][bIndex] = 'n'
                    numQualMasked += 1
                if getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contigs[1]:
                    if contigs[cIndex][bIndex].isupper(): contigs[cIndex][bIndex] = 'N'
                    else: contigs[cIndex][bIndex] = 'n'
                    numPropMasked += 1             
    totalLen = sum(map(len,contigs))
    verbose('qualtofa: %d bases of %d total bases (%.1f%%) were masked.\n' % (numMasked,totalLen, float(numMasked*100)/totalLen)) 
    verbose('qualtofa: option -m/--mask-contigs: %d of %d total bases (%.1f%%) have less than %d coverage depth.\n' % (numQualMasked,totalLen, float(numQualMasked*100)/totalLen, options.mask_contigs[0]))  
    verbose('qualtofa: option -m/--mask-contigs: %d of %d total bases (%.1f%%) have less than a %.2f call proportion.\n' % (numPropMasked,totalLen, float(numPropMasked*100)/totalLen, options.mask_contigs[1]))      
#############################################################################################################################

###Contianed Contigs file saving#############################################################################################
if len(containedContigs) > 0:
    outPath = qualPath + '_contained_contigs.fsa'
    verbose('qualtofa: Saving contained contigs to %s...' % outPath)
    saveFasta(outPath, *zip(*containedContigs))
    verbose('Complete\n')
#############################################################################################################################

###SNP file saving###########################################################################################################
if options.save_SNPs:
    outPath = qualPath + '_SNPs.qual'
    verbose('qualtofa: Saving %d SNPs to %s...' % (len(SNPs),outPath))
    outHandle = open(outPath, 'w')
    outHandle.write('Schema:\nPosition\tReference\tName\tIndex\tcall\tA\tC\tG\tT\tdash\tsum\n')
    for SNP in SNPs: outHandle.write('\t'.join(map(str,SNP)) + '\n')
    outHandle.close()
    verbose('Complete\n')
#############################################################################################################################

###Contig sequence alignment#################################################################################################
if isAlignment and options.dont_align_contigs == False:
    contigs = [['-'] * alignmentIndexs[index][0] + contigs[index] for index in range(0,len(contigs))]
#############################################################################################################################

###Output file Saving########################################################################################################
verbose('qualtofa: Saving fasta output to %s...' % (qualPath + '.fsa'))
conHeader = 'Consensus'
maskedConHeader = 'Masked Consensus'    
if isAlignment:
    outputHeads, outputSeqs = ([refHeader,conHeader],[reference,conSequence])
    if includeMaskedCon:
        outputHeads.append(maskedConHeader)
        outputSeqs.append(maskedConSeq)
else: outputHeads, outputSeqs = ([],[])
if options.include_contigs:
    outputHeads += headers
    outputSeqs += contigs
outPath = os.path.join(os.getcwd(), os.path.basename(qualPath) + '.fsa')
saveFasta(outPath, outputHeads, outputSeqs)
verbose('Complete\n')
if options.save_run_info:
    runtimeOutPath = os.path.join(os.getcwd(), 'qualtofa_runtime_output_%s.txt' % timeStamp)
    verbose('qualtofa: option -i/--save-run-info: saving run-time output to %s...' % runtimeOutPath)
    runHandle = open(runtimeOutPath, 'w')
    for line in printLog: runHandle.write(line)
    runHandle.close()
    verbose('Complete\n')
#############################################################################################################################
Qualtofa.py

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools