Sumqual.py
From OpenWetWare
Jump to navigationJump to search
Copy and past the following into a text editor and save it as sumqual.py to use the script.
#!/usr/bin/env python ###Imports / Variable Initilization########################################################################################## import os, string, sys, time, copy from datetime import * from optparse import * from subprocess import * commandLine = copy.deepcopy(sys.argv) cwd = os.getcwd() timeStamp = datetime.now().ctime().replace(' ','-').replace(':','-').replace('--','-') #Ex: 'Mon-Jun-14-11-08-55-2010' cntgMatchRanges = None progName, progVersion = ('sumqual.py','2.12') progUsage = 'python %s [options] <.qual file> <NUCmer file path> <Reference Path>' % progName printLog = [] ############################################################################################################################# def errorExit(message=None,exitStatus=1): '''Version 1.0 Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)''' if message: print '%s: Error: %s' % (progName, message) else: print progName +': Unknown error, printing debug log...' for line in debugLog: print line #print debug log sys.exit(exitStatus) def mkRange(numList): '''Function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)''' outList = [] if len(numList) > 0: start = numList[0] current = numList[0] numList.append(numList[-1]) for num in numList[1:]: if (num - 1) != current: outList.append([start,current]) start = num current = num return outList def saveQualStandard(filePath, contigs, qualValues, headers, schema=None): if schema is None: schema = ['Position in the contig(1 based)', 'Called base', "Number of reads suppoting 'A'", "Number of reads supporting 'C'", "Number of reads supporting 'G'", "Number of reads supporting 'T'", "Number of reads supporting '_'", 'Coverage'] if len(contigs) == 0: errorExit('saveQualStandard: empty list supplied') if len(contigs) != len(qualValues) or len(contigs) != len(headers): errorExit('saveQualStandard: Unequal number of entries is input list') outHandle = open(filePath, 'w') outHandle.write('Schema:'+ '\n' + '\t'.join(schema) + '\n') for cIndex in range(0,len(contigs)): outHandle.write(headers[cIndex] + '\n') for bIndex in range(0,len(contigs[cIndex])): outHandle.write('\t'.join([contigs[cIdnex][bIndex]] + qualValues[cIndex]) + '\n') outHandle.close() def parseQualStandard(filePath): columbWidth = 8 fileHandle = open(filePath, 'r') fileData = [line.strip().split('\t') for line in fileHandle.readlines()] #fileData is a list of lists representing the tab-deliminated structure of the .qual file fileHandle.close() if fileData[0] == ['Schema:']: del fileData[0] #removes the unessesary first line of the file, if present schema = fileData[0] #extract the tab-deliminated schema into a list headers, contigs, qualValues = ([], [], []) #initilizes multiple lists at a time index = 0 for line in fileData[1:]: if len(line) == 1: #if it is a header headers.append(line[0]) contigs.append([]) qualValues.append([]) elif len(line) == columbWidth: #if it is quality value contigs[-1].append(line[1]) qualValues[-1].append([int(num) for num in line[2:]]) #converts all of the quality values into integer objects and saves them in qualvalues else: errorExit('parseQualStandard: Quality file at %s deviated from the expected format on line %d.' % (filePath, index)) index += 1 for i in range(0,len(contigs)): if headers[i] == 'Contig128': print map(len,(contigs[i], qualValues[i])) return (contigs, qualValues, headers, schema) def parseQualAlignment(filePath): columbWidth = 9 headColNum = 2 fileHandle = open(filePath, 'r') fileData = [line.strip().split('\t') for line in fileHandle.readlines()] #fileData is a list of lists representing the tab-deliminated structure of the .qual file fileHandle.close() if fileData[0] == ['Schema:']: del fileData[0] #removes the unessesary first line of the file, if present schema = fileData[0] #extract the tab-deliminated schema into a list refHeader = fileData[1][0] reference = [line[1] for line in fileData[2:]] #extracts the reference from the second columb qualData = [line[headColNum:] for line in fileData[2:]] #removes the first lines and columbs of fileData, which dont contain unextracted data headers, contigs, cntgStarts, qualValues = ([], [], [], []) #initilizes multiple lists at a time refPos = 0 for line in qualData: if len(line) > 0: for index in range(0,len(line),columbWidth): if headers.count(line[index]) == 0: #if the contig header has not been encountered before headers.append(line[index]) cntgStarts.append(refPos) contigs.append([]) qualValues.append([]) headIndex = headers.index(line[index]) contigs[headIndex].append(line[index + 2]) #append the called base character to its contig qualValues[headIndex].append([int(num) for num in line[index + 3: index + columbWidth]]) #converts all of the quality values into integer objects and saves them in qualvalues refPos += 1 alignmentIndexs = [[cntgStarts[index], cntgStarts[index] + len(contigs[index]) - 1] for index in range(0,len(cntgStarts))] return (contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema) def parseQual(filePath): columbWidth = 9 headColNum = 2 fileHandle = open(filePath, 'r') fileHead = fileHandle.readline() if fileHead[0].strip() == 'Schema:': fileHandle.readline() #removes the unessesary first line of the file, if present schema = fileHandle.readline().strip().split('\t') fileHandle.close() if schema[1] == 'Reference' and len(schema) % columbWidth == headColNum: return parseQualAlignment(filePath) elif len(schema) == 8: return parseQualStandard(filePath) else: errorExit('parseQual: The quality file at %s has an unreconized format based on the schema on line 2.' % filePath) def parseFasta(filePath): headers = [] seqs = [] fileHandle = open(filePath, 'r') fileData = fileHandle.readlines() + ['>'] #the additional header line is there make to following for loop process the last sequence fileHandle.close() currSeq = '' for line in fileData: if line[0] == '>': #if the line is a fasta header headers.append(line[1:].strip() ) #the fisrt character is the '>' and the last is a newline '\n' seqs.append(list(currSeq)) currSeq = '' else: currSeq += line.strip() return (headers[:-1], seqs[1:]) def removeRepeats(aList): for entry in aList: if aList.count(entry) > 1: aList.remove(entry) return removeRepeats(aList) return aList ###Command Line Parser####################################################################################################### cmndLineParser = OptionParser(usage=progUsage, version="Version %s" % progVersion) cmndLineParser.add_option( "-v", "--verbose", action="store_true", default=False, help="Print progress updates and relevant statistics to the standard output. (Default: run silently)") cmndLineParser.add_option( "-g", "--save-gaps", action="store_true", default=False, help="Save the reference sequence corresponding to empty gaps in the consensus in a fasta file. (Default: Do not save)") cmndLineParser.add_option( "-n", "--no-names", action="store_true", default=False, help="Do not include the name of the original contig with each base. Output cannot be used with qualtofa. (Default: the name of the contig each base came from is included)") cmndLineParser.add_option( "-r", "--no-reference", action="store_true", default=False, help="Do not include the reference in the ouput. Output cannot be used with qualtofa. (Default: include reference)") cmndLineParser.add_option( "-i", "--save-run-info", action="store_true", default=False, help="Save a txt file of anything printed to the screen. Requires the use of -v/--verbose. (Default: Do not save)") (options, args) = cmndLineParser.parse_args(commandLine) if len(args) == 1: cmndLineParser.print_help() sys.exit(0) argNum = len(args) - 1 #counts the amount of arguments, negating the 'qualtofa' at the start of the command line if argNum != 3: errorExit('qualtofa takes exactly 3 arguments; %d supplied' % argNum, 0) qualPath, nucPath, refPath = args[1:] #The file path to the input file supplied by the user ############################################################################################################################# def verbose(toPrint): if options.verbose: print toPrint, printLog.append(toPrint) def parseNucmer(filePath): try: fileHandle = open(filePath, 'r') except IOError: errorExit('parseNucmer: could not open file at %s' % filePath) fileData = fileHandle.read() fileHandle.close() nucInPaths = fileData[0] #a '>' appears before the name of every contig and each match is sepeated by a '0' on its own line (i.e. '\n0\n') #data is parsed by contig ('>'), match ('\n0\n'), and line '\n'. contigData = [[cntg.split('\n') for cntg in group.split('\n0\n')[:-1]] for group in fileData[1:].split('>')][1:] for cIndex in range(0,len(contigData)): if len(contigData[cIndex]) > 1: for gIndex in range(1,len(contigData[cIndex])): cntgHeadParts = contigData[cIndex][0][0].split() cntgHeadParts[1] += '(%d)' % (gIndex + 1) contigData[cIndex][gIndex].insert(0,' '.join(cntgHeadParts)) #adds contigs information to each sub contig contigData = [cntg for group in contigData for cntg in group] #makes into one contig list headers = zip(*[cntg[0].split(' ') for cntg in contigData]) refHeads, cntgHeads, refLenghts, cntgLengths = (headers[0],list(headers[1]),map(int,headers[2]),map(int,headers[3])) alignStats = [map(int,cntg[1].split()) for cntg in contigData] deltas = [map(int,cntg[2:]) for cntg in contigData] outList = (alignStats, cntgHeads, refHeads, refLenghts, cntgLengths, deltas) return outList ###Input data file parsing and consolidation################################################################################# verbose('sumqual: Loading quality information from file at %s...' % qualPath,) qualData = parseQual(qualPath) verbose('Complete\nsumqual: Loading Nucmer output from file at %s...' % nucPath,) nucData = parseNucmer(nucPath) verbose('Complete\nsumqual: Loading reference from file at %s...' % refPath,) refData = parseFasta(refPath) verbose('Complete\nsumqual: Consolidating input data...',) if len(qualData) == 7: errorExit('Sumqual requires a standard quality file (e.g one directly from YASRA); qualitiy alignment file, %s supplied.' % qualPath) nucInQualCounts = [list(nucData[1]).count(name) for name in qualData[2]] #the number of times each name from the qual file appears in the nucmer one if max(nucInQualCounts) > 1: errorExit('Contigs with duplicate names deteced in the nucmer file') delIndexes = [index for index in range(0,len(nucInQualCounts)) if nucInQualCounts[index] == 0] #the indexes of all contigs that dont appear in the nucmer output delIndexes.reverse() for index in range(0,3): for dIndex in delIndexes: del qualData[index][dIndex] matchCount = 1 for nucIndex in range(0,len(nucData[1])): nucHeader = nucData[1][nucIndex] if nucIndex > 0 and nucHeader.startswith(nucData[1][nucIndex - matchCount]) and nucHeader.endswith('(%d)' % (matchCount + 1)): matchCount += 1 for listIndex in range(0,3): qualData[listIndex].insert(nucIndex,copy.deepcopy(qualData[listIndex][nucIndex - matchCount + 1])) continue else: matchCount = 1 if qualData[2].count(nucHeader) == 0: errorExit('Contig "%s" in file %s could not be found in %s' % (nucHeader,nucPath,qualPath)) inIndex = qualData[2].index(nucHeader) for listIndex in range(0,3): qualData[listIndex].insert(nucIndex,qualData[listIndex][inIndex]) del qualData[listIndex][inIndex + 1] contigs, qualValues, qualHeaders, inSchema = qualData matchStats, headers, refHeads, refLenghts, cntgLengths, deltas = nucData if sum([refHeads[0] != refHeads[index] for index in range(0,len(refHeads))]) > 1: errorExit('Multiple references detected in %s. sumqual does not support multiple references.' % nucPath) nucRefHeader = refHeads[0] refHeaders, refSeqs = refData if len(refHeaders) == 0: errorExit('The reference file at %s is empty.' % refPath) if len(refHeaders) > 1: print 'Caution: Multiple sequences decteced in reference file at %s. Only the first sequence, named %s, will be used.\n' % (refPath,refHeaders[0]) refHeader, refSeq = (refHeaders[0], refSeqs[0]) verbose('Complete\n') ############################################################################################################################# ###Revsersed sequence Handleing############################################################################################## def compliment(seq): conversionKey = {'A':'T','T':'A','G':'C','C':'G'} outSeq = [] for base in seq: outSeq.append(conversionKey[base]) return outSeq reversedIndexes = [] for cIndex in range(0,len(contigs)): if matchStats[cIndex][2] > matchStats[cIndex][3] and matchStats[cIndex][1] >= matchStats[cIndex][0]: revMatch = matchStats[cIndex] matchStats[cIndex][2] = len(contigs[cIndex]) - revMatch[2] + 1 matchStats[cIndex][3] = len(contigs[cIndex]) - revMatch[3] + 1 reversedIndexes.append(cIndex) elif matchStats[cIndex][0] > matchStats[cIndex][1] and matchStats[cIndex][3] >= matchStats[cIndex][2]: temp = matchStats[cIndex][0] matchStats[cIndex][0] = matchStats[cIndex][1] matchStats[cIndex][1] = temp reversedIndexes.append(cIndex) for index in reversedIndexes: headers[index] += '(reverse)' contigs[index].reverse() contigs[index] = compliment(contigs[index]) if len(reversedIndexes) > 0: verbose('sumqual: %d reversed contig alignments found.\n' % len(reversedIndexes)) ############################################################################################################################# ###Case Modification######################################################################################################### verbose('sumqual: Makeing all matching sequence uppercase and all unmatching lowercase...',) for cIndex in range(0,len(contigs)): contigs[cIndex] = [char.lower() for char in contigs[cIndex][:matchStats[cIndex][2] - 1]] + \ [char.upper() for char in contigs[cIndex][matchStats[cIndex][2] - 1:matchStats[cIndex][3]]] + \ [char.lower() for char in contigs[cIndex][matchStats[cIndex][3]:]] verbose('Complete\n') ############################################################################################################################# ###Indel incorperation####################################################################################################### def deltaCoords(deltas, refStart, cntgStart): if type(deltas) != list: errorExit('deltaCoords: list required...') if len(deltas) == 0: return ([],[]) refOut, cntgOut = ([],[]) cntgPos, refPos = cntgStart, refStart for delta in deltas: cntgPos += abs(delta) refPos += abs(delta) if delta > 0: cntgOut.append(cntgPos) elif delta < 0: refOut.append(refPos) return (refOut, cntgOut) def addCntgDel(delIndex, cntgIndex): contigs[cntgIndex].insert(delIndex,'-') qualValues[cntgIndex].insert(delIndex,[0]*6) matchStats[cIndex][3] += 1 def addRefDel(delIndex, cntgIndex): isAfterIndel = [index for index in range(0,len(contigs)) if delIndex <= matchStats[index][0] - 1 and index != cntgIndex] for index in isAfterIndel: matchStats[index][0] += 1 matchStats[index][1] += 1 refSeq.insert(delIndex,'-') delsToAdd = [(delIndex - matchStats[cIndex][0] + matchStats[cIndex][2], cIndex) for cIndex in range(0,len(contigs)) if delIndex >= matchStats[cIndex][0] - 1 \ and delIndex <= matchStats[cIndex][1] - 1 \ and (delIndex,cIndex) not in allRefDels \ and cntgIndex != cIndex] for cDel in delsToAdd: addCntgDel(*cDel) for index in range(0,len(allRefDels)): if allRefDels[index][1] != cntgIndex and allRefDels[index][0] > delIndex: allRefDels[index][0] += 1 verbose('sumqual: Incorperating Nucmer indel information...',) allCntgDels = [[cntgDel,index] for index in range(0,len(contigs)) for cntgDel in deltaCoords(deltas[index], matchStats[index][0] - 2, matchStats[index][2] - 2)[1]] allRefDels = [[refDel,index] for index in range(0,len(contigs)) for refDel in deltaCoords(deltas[index], matchStats[index][0] - 2, matchStats[index][2] - 2)[0]] for cntgDel in allCntgDels: addCntgDel(*cntgDel) for refDel in allRefDels: addRefDel(*refDel) verbose('Complete\nsumqual: %d reference deltions and %d contig deletions (i.e. "-") added.\n' % (len(allRefDels),len(allCntgDels))) ############################################################################################################################# for i in range(0,len(contigs)): if headers[i] == 'Contig128': print map(len,(contigs[i], qualValues[i])) def saveQualAlignment(savePath, contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema=None): firstBaseIndex = min([stats[0] - stats[2] for stats in alignmentIndexs]) lastBaseIndex = max([alignmentIndexs[index][0] - alignmentIndexs[index][2] + len(contigs[index]) - 1 for index in range(0,len(contigs))]) outList = [[str(index + 1),refSeq[index]] for index in range(0,len(refSeq))] outList = [[str(index),'-'] for index in range(firstBaseIndex,0)] + outList + [[str(index),'-'] for index in range(len(refSeq),lastBaseIndex + 1)] for cIndex in range(0,len(headers)): if headers[cIndex] == 'Contig128': print map(len,(contigs[cIndex],qualValues[cIndex])) #print contigs[cIndex] quals = [[headers[cIndex], index + 1, contigs[cIndex][index]] + qualValues[cIndex][index] for index in range(0,len(contigs[cIndex]))] quals = [map(str,line) for line in quals] outPos = alignmentIndexs[cIndex][0] - alignmentIndexs[cIndex][2] #+ alignmentIndexs[0][2] - alignmentIndexs[0][0] if firstBaseIndex < 0: outPos += abs(firstBaseIndex) for qIndex in range(0,len(quals)): outList[outPos + qIndex] += quals[qIndex] if schema == None: schema = ['Position', 'Reference', 'Name', 'Index', 'Call', 'A', 'C', 'G', 'T', 'Dash', 'Sum'] maxCols = (max(map(len,outList)) - 2) / 6 if maxCols > 1: schema += [part + '-alt' + str(count) for count in range(2,maxCols + 1) for part in schema[2:]] outHandle = open(savePath, 'w') outHandle.write('Schema:\n' + '\t'.join(schema) + '\n' + refHeader + '\n') for line in map('\t'.join,outList): outHandle.write(line + '\n') outHandle.close() ###Output file saveing####################################################################################################### outPath = os.path.join(os.getcwd(), os.path.basename(nucPath) + '.qual') verbose('sumqual: Saving quality consensus alignment to %s...' % outPath,) saveQualAlignment(outPath, contigs, qualValues, headers, matchStats, refSeq, refHeader) verbose('Complete\n') if options.save_run_info: runtimeOutPath = os.path.join(os.getcwd(), 'sumqual_runtime_output_%s.txt' % timeStamp) verbose('sumqual: option -i/--save-run-info: saving run-time output to %s...' % runtimeOutPath) runHandle = open(runtimeOutPath, 'w') for line in printLog: runHandle.write(line) runHandle.close() verbose('Complete\n') #############################################################################################################################