Sumqual.py

Copy and past the following into a text editor and save it as sumqual.py to use the script.
 * 1) !/usr/bin/env python

import os, string, sys, time, copy from datetime import * from optparse import * from subprocess import * commandLine = copy.deepcopy(sys.argv) cwd = os.getcwd timeStamp = datetime.now.ctime.replace(' ','-').replace(':','-').replace('--','-') #Ex: 'Mon-Jun-14-11-08-55-2010' cntgMatchRanges = None progName, progVersion = ('sumqual.py','2.12') progUsage = 'python %s [options] <.qual file>  ' % progName printLog = []
 * 1) Imports / Variable Initilization##########################################################################################

def errorExit(message=None,exitStatus=1): '''Version 1.0 Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)''' if message: print '%s: Error: %s' % (progName, message) else: print progName +': Unknown error, printing debug log...' for line in debugLog: print line  #print debug log sys.exit(exitStatus)

def mkRange(numList): Function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10) outList = [] if len(numList) > 0: start  = numList[0] current = numList[0] numList.append(numList[-1]) for num in numList[1:]: if (num - 1) != current: outList.append([start,current]) start = num current = num return outList

def saveQualStandard(filePath, contigs, qualValues, headers, schema=None): if schema is None: schema = ['Position in the contig(1 based)', 'Called base', "Number of reads suppoting 'A'", "Number of reads supporting 'C'", "Number of reads supporting 'G'", "Number of reads supporting 'T'", "Number of reads supporting '_'", 'Coverage'] if len(contigs) == 0: errorExit('saveQualStandard: empty list supplied') if len(contigs) != len(qualValues) or len(contigs) != len(headers): errorExit('saveQualStandard: Unequal number of entries is input list') outHandle = open(filePath, 'w') outHandle.write('Schema:'+ '\n' + '\t'.join(schema) + '\n') for cIndex in range(0,len(contigs)): outHandle.write(headers[cIndex] + '\n') for bIndex in range(0,len(contigs[cIndex])): outHandle.write('\t'.join([contigs[cIdnex][bIndex]] + qualValues[cIndex]) + '\n') outHandle.close def parseQualStandard(filePath): columbWidth = 8 fileHandle = open(filePath, 'r') fileData = [line.strip.split('\t') for line in fileHandle.readlines]  #fileData is a list of lists representing the tab-deliminated structure of the .qual file fileHandle.close if fileData[0] == ['Schema:']: del fileData[0]  #removes the unessesary first line of the file, if present schema = fileData[0]  #extract the tab-deliminated schema into a list headers, contigs, qualValues = ([], [], [])  #initilizes multiple lists at a time index = 0 for line in fileData[1:]: if len(line) == 1:  #if it is a header headers.append(line[0]) contigs.append([]) qualValues.append([]) elif len(line) == columbWidth:  #if it is  quality value contigs[-1].append(line[1]) qualValues[-1].append([int(num) for num in line[2:]])  #converts all of the quality values into integer objects and saves them in qualvalues else: errorExit('parseQualStandard: Quality file at %s deviated from the expected format on line %d.' % (filePath, index)) index += 1 for i in range(0,len(contigs)): if headers[i] == 'Contig128': print map(len,(contigs[i], qualValues[i])) return (contigs, qualValues, headers, schema)

def parseQualAlignment(filePath): columbWidth = 9 headColNum = 2 fileHandle = open(filePath, 'r') fileData = [line.strip.split('\t') for line in fileHandle.readlines]  #fileData is a list of lists representing the tab-deliminated structure of the .qual file fileHandle.close if fileData[0] == ['Schema:']: del fileData[0]  #removes the unessesary first line of the file, if present schema = fileData[0]  #extract the tab-deliminated schema into a list refHeader = fileData[1][0] reference = [line[1] for line in fileData[2:]]  #extracts the reference from the second columb qualData = [line[headColNum:] for line in fileData[2:]]  #removes the first lines and columbs of fileData, which dont contain unextracted data headers, contigs, cntgStarts, qualValues = ([], [], [], [])  #initilizes multiple lists at a time refPos = 0 for line in qualData: if len(line) > 0: for index in range(0,len(line),columbWidth): if headers.count(line[index]) == 0:  #if the contig header has not been encountered before headers.append(line[index]) cntgStarts.append(refPos) contigs.append([]) qualValues.append([]) headIndex = headers.index(line[index]) contigs[headIndex].append(line[index + 2])  #append the called base character to its contig qualValues[headIndex].append([int(num) for num in line[index + 3: index + columbWidth]])  #converts all of the quality values into integer objects and saves them in qualvalues refPos += 1 alignmentIndexs = [[cntgStarts[index], cntgStarts[index] + len(contigs[index]) - 1] for index in range(0,len(cntgStarts))]   return (contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema)

def parseQual(filePath): columbWidth = 9 headColNum = 2 fileHandle = open(filePath, 'r') fileHead = fileHandle.readline if fileHead[0].strip == 'Schema:': fileHandle.readline  #removes the unessesary first line of the file, if present schema = fileHandle.readline.strip.split('\t') fileHandle.close if schema[1] == 'Reference' and len(schema) % columbWidth == headColNum: return parseQualAlignment(filePath) elif len(schema) == 8: return parseQualStandard(filePath) else: errorExit('parseQual: The quality file at %s has an unreconized format based on the schema on line 2.' % filePath)

def parseFasta(filePath): headers = [] seqs = [] fileHandle = open(filePath, 'r') fileData = fileHandle.readlines + ['>']  #the additional header line is there make to following for loop process the last sequence fileHandle.close currSeq = '' for line in fileData: if line[0] == '>':  #if the line is a fasta header headers.append(line[1:].strip )  #the fisrt character is the '>' and the last is a newline '\n' seqs.append(list(currSeq)) currSeq = '' else: currSeq += line.strip return (headers[:-1], seqs[1:])

def removeRepeats(aList): for entry in aList: if aList.count(entry) > 1: aList.remove(entry) return removeRepeats(aList) return aList

cmndLineParser = OptionParser(usage=progUsage, version="Version %s" % progVersion) cmndLineParser.add_option(  "-v",   "--verbose",            action="store_true",    default=False,     help="Print progress updates and relevant statistics to the standard output. (Default: run silently)") cmndLineParser.add_option(  "-g",   "--save-gaps",          action="store_true",    default=False,     help="Save the reference sequence corresponding to empty gaps in the consensus in a fasta file. (Default: Do not save)") cmndLineParser.add_option(  "-n",   "--no-names",           action="store_true",    default=False,     help="Do not include the name of the original contig with each base. Output cannot be used with qualtofa. (Default: the name of the contig each base came from is included)") cmndLineParser.add_option(  "-r",   "--no-reference",       action="store_true",    default=False,     help="Do not include the reference in the ouput. Output cannot be used with qualtofa. (Default: include reference)") cmndLineParser.add_option(  "-i",   "--save-run-info",      action="store_true",    default=False,     help="Save a txt file of anything printed to the screen. Requires the use of -v/--verbose. (Default: Do not save)") (options, args) = cmndLineParser.parse_args(commandLine) if len(args) == 1: cmndLineParser.print_help sys.exit(0) argNum = len(args) - 1  #counts the amount of arguments, negating the 'qualtofa' at the start of the command line if argNum != 3: errorExit('qualtofa takes exactly 3 arguments; %d supplied' % argNum, 0) qualPath, nucPath, refPath = args[1:]  #The file path to the input file supplied by the user
 * 1) Command Line Parser#######################################################################################################

def verbose(toPrint): if options.verbose: print toPrint, printLog.append(toPrint)

def parseNucmer(filePath): try: fileHandle = open(filePath, 'r') except IOError: errorExit('parseNucmer: could not open file at %s' % filePath) fileData = fileHandle.read fileHandle.close nucInPaths = fileData[0] #a '>' appears before the name of every contig and each match is sepeated by a '0' on its own line (i.e. '\n0\n') #data is parsed by contig ('>'), match ('\n0\n'), and line '\n'. contigData = cntg.split('\n') for cntg in group.split('\n0\n')[:-1 for group in fileData[1:].split('>')][1:] for cIndex in range(0,len(contigData)): if len(contigData[cIndex]) > 1: for gIndex in range(1,len(contigData[cIndex])): cntgHeadParts = contigData[cIndex][0][0].split cntgHeadParts[1] += '(%d)' % (gIndex + 1) contigData[cIndex][gIndex].insert(0,' '.join(cntgHeadParts))  #adds contigs information to each sub contig contigData = [cntg for group in contigData for cntg in group]  #makes into one contig list headers = zip(*[cntg[0].split(' ') for cntg in contigData]) refHeads, cntgHeads, refLenghts, cntgLengths = (headers[0],list(headers[1]),map(int,headers[2]),map(int,headers[3])) alignStats = [map(int,cntg[1].split) for cntg in contigData] deltas = [map(int,cntg[2:]) for cntg in contigData] outList = (alignStats, cntgHeads, refHeads, refLenghts, cntgLengths, deltas) return outList verbose('sumqual: Loading quality information from file at %s...' % qualPath,) qualData = parseQual(qualPath) verbose('Complete\nsumqual: Loading Nucmer output from file at %s...' % nucPath,) nucData = parseNucmer(nucPath) verbose('Complete\nsumqual: Loading reference from file at %s...' % refPath,) refData = parseFasta(refPath) verbose('Complete\nsumqual: Consolidating input data...',) if len(qualData) == 7: errorExit('Sumqual requires a standard quality file (e.g one directly from YASRA); qualitiy alignment file, %s supplied.' % qualPath) nucInQualCounts = [list(nucData[1]).count(name) for name in qualData[2]] #the number of times each name from the qual file appears in the nucmer one if max(nucInQualCounts) > 1: errorExit('Contigs with duplicate names deteced in the nucmer file') delIndexes = [index for index in range(0,len(nucInQualCounts)) if nucInQualCounts[index] == 0] #the indexes of all contigs that dont appear in the nucmer output delIndexes.reverse for index in range(0,3): for dIndex in delIndexes: del qualData[index][dIndex] matchCount = 1 for nucIndex in range(0,len(nucData[1])): nucHeader = nucData[1][nucIndex] if nucIndex > 0 and nucHeader.startswith(nucData[1][nucIndex - matchCount]) and nucHeader.endswith('(%d)' % (matchCount + 1)): matchCount += 1 for listIndex in range(0,3): qualData[listIndex].insert(nucIndex,copy.deepcopy(qualData[listIndex][nucIndex - matchCount + 1])) continue else: matchCount = 1 if qualData[2].count(nucHeader) == 0: errorExit('Contig "%s" in file %s could not be found in %s' % (nucHeader,nucPath,qualPath)) inIndex = qualData[2].index(nucHeader) for listIndex in range(0,3): qualData[listIndex].insert(nucIndex,qualData[listIndex][inIndex]) del qualData[listIndex][inIndex + 1] contigs, qualValues, qualHeaders, inSchema = qualData matchStats, headers, refHeads, refLenghts, cntgLengths, deltas = nucData if sum([refHeads[0] != refHeads[index] for index in range(0,len(refHeads))]) > 1: errorExit('Multiple references detected in %s. sumqual does not support multiple references.' % nucPath) nucRefHeader = refHeads[0] refHeaders, refSeqs = refData if len(refHeaders) == 0: errorExit('The reference file at %s is empty.' % refPath) if len(refHeaders) > 1: print 'Caution: Multiple sequences decteced in reference file at %s. Only the first sequence, named %s, will be used.\n' % (refPath,refHeaders[0]) refHeader, refSeq = (refHeaders[0], refSeqs[0]) verbose('Complete\n')
 * 1) Input data file parsing and consolidation#################################################################################

def compliment(seq): conversionKey = {'A':'T','T':'A','G':'C','C':'G'} outSeq = [] for base in seq: outSeq.append(conversionKey[base]) return outSeq
 * 1) Revsersed sequence Handleing##############################################################################################

reversedIndexes = [] for cIndex in range(0,len(contigs)): if matchStats[cIndex][2] > matchStats[cIndex][3] and matchStats[cIndex][1] >= matchStats[cIndex][0]: revMatch = matchStats[cIndex] matchStats[cIndex][2] = len(contigs[cIndex]) - revMatch[2] + 1 matchStats[cIndex][3] = len(contigs[cIndex]) - revMatch[3] + 1 reversedIndexes.append(cIndex) elif matchStats[cIndex][0] > matchStats[cIndex][1] and matchStats[cIndex][3] >= matchStats[cIndex][2]: temp = matchStats[cIndex][0] matchStats[cIndex][0] = matchStats[cIndex][1] matchStats[cIndex][1] = temp reversedIndexes.append(cIndex) for index in reversedIndexes: headers[index] += '(reverse)' contigs[index].reverse contigs[index] = compliment(contigs[index])

if len(reversedIndexes) > 0: verbose('sumqual: %d reversed contig alignments found.\n' % len(reversedIndexes))

verbose('sumqual: Makeing all matching sequence uppercase and all unmatching lowercase...',) for cIndex in range(0,len(contigs)): contigs[cIndex] = [char.lower for char in contigs[cIndex][:matchStats[cIndex][2] - 1]] + \ [char.upper for char in contigs[cIndex][matchStats[cIndex][2] - 1:matchStats[cIndex][3]]] + \ [char.lower for char in contigs[cIndex][matchStats[cIndex][3]:]] verbose('Complete\n')
 * 1) Case Modification#########################################################################################################

def deltaCoords(deltas, refStart, cntgStart): if type(deltas) != list: errorExit('deltaCoords: list required...') if len(deltas) == 0: return ([],[]) refOut, cntgOut = ([],[]) cntgPos, refPos = cntgStart, refStart for delta in deltas: cntgPos += abs(delta) refPos += abs(delta) if delta > 0: cntgOut.append(cntgPos) elif delta < 0: refOut.append(refPos) return (refOut, cntgOut)
 * 1) Indel incorperation#######################################################################################################

def addCntgDel(delIndex, cntgIndex): contigs[cntgIndex].insert(delIndex,'-') qualValues[cntgIndex].insert(delIndex,[0]*6) matchStats[cIndex][3] += 1 def addRefDel(delIndex, cntgIndex): isAfterIndel = [index for index in range(0,len(contigs)) if delIndex <= matchStats[index][0] - 1 and index != cntgIndex] for index in isAfterIndel: matchStats[index][0] += 1 matchStats[index][1] += 1 refSeq.insert(delIndex,'-') delsToAdd = [(delIndex - matchStats[cIndex][0] + matchStats[cIndex][2], cIndex) for cIndex in range(0,len(contigs)) if delIndex >= matchStats[cIndex][0] - 1 \ and delIndex <= matchStats[cIndex][1] - 1 \ and (delIndex,cIndex) not in allRefDels \ and cntgIndex != cIndex] for cDel in delsToAdd: addCntgDel(*cDel) for index in range(0,len(allRefDels)): if allRefDels[index][1] != cntgIndex and allRefDels[index][0] > delIndex: allRefDels[index][0] += 1

verbose('sumqual: Incorperating Nucmer indel information...',) allCntgDels = cntgDel,index] for index in range(0,len(contigs)) for cntgDel in deltaCoords(deltas[index], matchStats[index][0] - 2, matchStats[index][2] - 2)[1 allRefDels = refDel,index] for index in range(0,len(contigs)) for refDel in deltaCoords(deltas[index], matchStats[index][0] - 2, matchStats[index][2] - 2)[0 for cntgDel in allCntgDels: addCntgDel(*cntgDel) for refDel in allRefDels: addRefDel(*refDel) verbose('Complete\nsumqual: %d reference deltions and %d contig deletions (i.e. "-") added.\n' % (len(allRefDels),len(allCntgDels)))

for i in range(0,len(contigs)): if headers[i] == 'Contig128': print map(len,(contigs[i], qualValues[i]))

def saveQualAlignment(savePath, contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema=None): firstBaseIndex = min([stats[0] - stats[2] for stats in alignmentIndexs]) lastBaseIndex = max([alignmentIndexs[index][0] - alignmentIndexs[index][2] + len(contigs[index]) - 1 for index in range(0,len(contigs))]) outList = str(index + 1),refSeq[index for index in range(0,len(refSeq))] outList = str(index),'-'] for index in range(firstBaseIndex,0)] + outList + [[str(index),'-'] for index in range(len(refSeq),lastBaseIndex + 1)]   for cIndex in range(0,len(headers)):        if headers[cIndex] == 'Contig128':            print map(len,(contigs[cIndex],qualValues[cIndex]))             + qualValues[cIndex][index] for index in range(0,len(contigs[cIndex]))]        quals = [map(str,line) for line in quals]        outPos = alignmentIndexs[cIndex][0] - alignmentIndexs[cIndex][2] #+ alignmentIndexs[0][2] - alignmentIndexs[0][0]        if firstBaseIndex < 0: outPos += abs(firstBaseIndex)        for qIndex in range(0,len(quals)): outList[outPos + qIndex] += quals[qIndex]    if schema == None: schema = ['Position', 'Reference', 'Name', 'Index', 'Call', 'A', 'C', 'G', 'T', 'Dash', 'Sum']    maxCols = (max(map(len,outList)) - 2) / 6    if maxCols > 1: schema += [part + '-alt' + str(count) for count in range(2,maxCols + 1) for part in schema[2:]]    outHandle = open(savePath, 'w')    outHandle.write('Schema:\n' + '\t'.join(schema) + '\n' + refHeader + '\n')    for line in map('\t'.join,outList): outHandle.write(line + '\n')    outHandle.close outPath = os.path.join(os.getcwd, os.path.basename(nucPath) + '.qual') verbose('sumqual: Saving quality consensus alignment to %s...' % outPath,) saveQualAlignment(outPath, contigs, qualValues, headers, matchStats, refSeq, refHeader) verbose('Complete\n') if options.save_run_info:    runtimeOutPath = os.path.join(os.getcwd, 'sumqual_runtime_output_%s.txt' % timeStamp)    verbose('sumqual: option -i/--save-run-info: saving run-time output to %s...' % runtimeOutPath)    runHandle = open(runtimeOutPath, 'w')    for line in printLog: runHandle.write(line)    runHandle.close    verbose('Complete\n')
 * 1) Output file saveing#######################################################################################################