Qualtofa.py
From OpenWetWare
Copy and past the following into a text editor and save it as qualtofa.py to use the script.
#!/usr/bin/env python ###Imports / Variable Initilization########################################################################################## import os, string, sys, time, copy from datetime import * from optparse import * from subprocess import * commandLine = copy.deepcopy(sys.argv) cwd = os.getcwd() timeStamp = datetime.now().ctime().replace(' ','-').replace(':','-').replace('--','-') #Ex: 'Mon-Jun-14-11-08-55-2010' cntgMatchRanges = None progName, progVersion = ('qualtofa.py','2.52') progUsage = 'python %s [options] <.qual file>' % progName printLog = [] ############################################################################################################################# def errorExit(message=None,exitStatus=1): '''Version 1.0 Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)''' if message: print '%s: Error: %s' % (progName, message) else: print progName +': Unknown error, printing debug log...' for line in debugLog: print line #print debug log sys.exit(exitStatus) def mkRange(numList): '''Function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)''' outList = [] if len(numList) > 0: start = numList[0] current = numList[0] numList.append(numList[-1]) for num in numList[1:]: if (num - 1) != current: outList.append([start,current]) start = num current = num return outList def readLFile(filePath, numLinesToRead = None): fileHandle = open(filePath, 'r') if numLinesToRead is None: fileData = fileHandle.readlines() else: fileData = [fileHandle.readline() for count in range(0,numLinesToRead)] fileHandle.close() return fileData def writeLFile(filePath, fileData): fileHandle = open(filePath, 'w') for line in fileData: fileHandle.write(fileData) fileHandle.close() def saveQualStandard(filePath, contigs, qualValues, headers, schema=None): if schema is None: schema = ['Position in the contig(1 based)', 'Called base', "Number of reads suppoting 'A'", "Number of reads supporting 'C'", "Number of reads supporting 'G'", "Number of reads supporting 'T'", "Number of reads supporting '_'", 'Coverage'] if len(contigs) == 0: errorExit('saveQualStandard: empty list supplied') if len(contigs) != len(qualValues) or len(contigs) != len(headers): errorExit('saveQualStandard: Unequal number of entries is input list') outHandle = open(filePath, 'w') outHandle.write('Schema:'+ '\n' + '\t'.join(schema) + '\n') for cIndex in range(0,len(contigs)): outHandle.write(headers[cIndex] + '\n') for bIndex in range(0,len(contigs[cIndex])): outHandle.write('\t'.join([contigs[cIdnex][bIndex]] + qualValues[cIndex]) + '\n') outHandle.close() def parseQualStandard(filePath): columbWidth = 8 fileData = [line.strip().split('\t') for line in readLFile(filePath)] #fileData is a list of lists representing the tab-deliminated structure of the .qual file if fileData[0] == ['Schema:']: del fileData[0] #removes the unessesary first line of the file, if present schema = fileData[0] #extract the tab-deliminated schema into a list headers, contigs, qualValues = ([], [], []) #initilizes multiple lists at a time index = 0 for line in fileData[1:]: if len(line) == 1: #if it is a header headers.append(line[0]) contigs.append([]) qualValues.append([]) elif len(line) == columbWidth: #if it is quality value contigs[-1].append(line[1]) qualValues[-1].append([int(num) for num in line[2:]]) #converts all of the quality values into integer objects and saves them in qualvalues else: errorExit('parseQualStandard: Quality file at %s deviated from the expected format on line %d.' % (filePath, index)) index += 1 return (contigs, qualValues, headers, schema) def parseQualAlignment(filePath): columbWidth = 9 headColNum = 2 fileData = [[part.strip() for part in line.split('\t')] for line in readLFile(filePath)] #fileData is a list of lists representing the tab-deliminated structure of the .qual file if fileData[0] == ['Schema:']: del fileData[0] #removes the unessesary first line of the file, if present schema = fileData[0] #extract the tab-deliminated schema into a list refHeader = fileData[1][0] reference = [line[1] for line in fileData[2:]] #extracts the reference from the second columb qualData = [line[headColNum:] for line in fileData[2:]] #removes the first lines and columbs of fileData, which dont contain unextracted data headers, contigs, cntgStarts, qualValues = ([], [], [], []) #initilizes multiple lists at a time refPos = 0 for line in qualData: if len(line) > 0: for index in range(0,len(line),columbWidth): if headers.count(line[index]) == 0: #if the contig header has not been encountered before headers.append(line[index]) cntgStarts.append(refPos) contigs.append([]) qualValues.append([]) headIndex = headers.index(line[index]) contigs[headIndex].append(line[index + 2]) #append the called base character to its contig qualValues[headIndex].append([int(num) for num in line[index + 3: index + columbWidth]]) #converts all of the quality values into integer objects and saves them in qualvalues refPos += 1 alignmentIndexs = [[cntgStarts[index], cntgStarts[index] + len(contigs[index]) - 1] for index in range(0,len(cntgStarts))] return (contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema) def parseQual(filePath): columbWidth = 9 headColNum = 2 fileHead = readLFile(filePath, 2) if fileHead[0].strip() == 'Schema:': del fileHead[0] #removes the unessesary first line of the file, if present schema = fileHead[0].strip().split('\t') if schema[1] == 'Reference' and len(schema) % columbWidth == headColNum: return parseQualAlignment(filePath) elif len(schema) == 8: return parseQualStandard(filePath) else: errorExit('parseQual: The quality file at %s has an unreconized format based on the schema on line 2.' % filePath) def listToStr(aList): out = '' for char in aList: out += char return out def saveFasta(filePath, headers, sequences): outHandle = open(filePath, 'w') if len(headers) != 0 and len(sequences) != 0: if len(headers) != len(sequences): errorExit('saveFastaFile: different number of headers and sequences') if type(headers) == str and type(sequences) == str: headers = [headers] sequences = [sequences] if type(sequences[0]) == list: sequences = [listToStr(seq) for seq in sequences] for index in range(0,len(headers)): outHandle.write('>%s\n' % headers[index]) if type(sequences[index]) == str: sequences[index] = [sequences[index]] #if there is only one line of sequence for a given header, make it a one-item list for the next loop for seq in sequences[index]: outHandle.write(seq + '\n') outHandle.close() def parseFasta(filePath): headers = [] seqs = [] fileHandle = open(filePath, 'r') fileData = fileHandle.readlines() + ['>'] #the additional header line is there make to following for loop process the last sequence fileHandle.close() currSeq = '' for line in fileData: if line[0] == '>': #if the line is a fasta header headers.append(line[1:].strip() ) #the fisrt character is the '>' and the last is a newline '\n' seqs.append(currSeq) currSeq = '' else: currSeq += line.strip() return (headers[:-1], seqs[1:]) def removeRepeats(aList): for entry in aList: if aList.count(entry) > 1: aList.remove(entry) return removeRepeats(aList) return aList def maskingCallback(option, opt_str, value, parser): value = [] def floatable(str): try: float(str) return True except ValueError: return False for arg in parser.rargs: if arg[:2] == "--" and len(arg) > 2: break # stop on --foo like options if arg[:1] == "-" and len(arg) > 1 and not floatable(arg): break # stop on -a, but not on -3 or -3.0 value.append(arg) if len(value) == 2: break if len(value) == 0: errorExit("option '%s' requires one or two arguments; none supplied." % option.dest) del parser.rargs[:len(value)] if len(value) == 1: value.append(0) value[0] = int(value[0]) value[1] = float(value[1]) if value[0] < 0: errorExit("The first argument of the option '%s' must be a positive integer; %d supplied." % (option.dest,value[0])) if value[1] < 0 or value[1] > 1: errorExit("The second argument of the option '%s' must be a decimal between 0 and 1; %f supplied." % (option.dest,value[1])) setattr(parser.values, option.dest, value) ###Command Line Parser####################################################################################################### cmndLineParser = OptionParser(usage=progUsage, version="Version %s" % progVersion) maksingGroup = OptionGroup(cmndLineParser, 'Coverage and Call Proportion Masking','The following options take one integer argument and one decimal argument between 0 and 1, if the second is not supplied it is assumed to be 0.') cmndLineParser.add_option( "-c", "--include-contigs", action="store_true", default=False, help="Include each contig on its own line. (Default: only output the consensus)") maksingGroup.add_option( "-m", "--mask-contigs", action="callback", default=[0,0], metavar='INT FLOAT', callback=maskingCallback, dest='mask_contigs', help="Set minimum coverage depth and call proportion for contig masking; independent of the consensus. Requires the use of the -c modifier.(Default: 0, 0)") maksingGroup.add_option( "-s", "--mask-contig-SNPs", action="callback", default=[0,0], metavar='INT FLOAT', callback=maskingCallback, dest='mask_contig_SNPs', help="Set minimum coverage depth and call proportion for contig SNP masking; independent of the consensus. Requires the use of the -c modifier.(Default: 0, 0)") maksingGroup.add_option( "-M", "--mask-consensus", action="callback", default=[0,0], metavar='INT FLOAT', callback=maskingCallback, dest='mask_consensus', help="Set minimum coverage depth and call proportion for the consensus; a new masked sequence will be added to the output file. (Default: 0, 0)") maksingGroup.add_option( "-S", "--mask-SNPs", action="callback", default=[0,0], metavar='INT FLOAT', callback=maskingCallback, dest='mask_SNPs', help="Set minimum coverage depth and call proportion for SNPs in the consensus; a new masked sequence will be added to the output file. (Default: 0, 0)") cmndLineParser.add_option_group(maksingGroup) cmndLineParser.add_option( "-t", "--end-trim", action="store", default=0, type="int", metavar="INT", help="Trim all the bases on either end of all contigs by a specified number of bases. (Default: 0)") cmndLineParser.add_option( "-q", "--end-trim-qual", action="store", default=0, type="int", metavar="INT", help="Trim all the bases on either end of all contigs that have a quality value less than the specified amount. This option is implemented after standard end trimming (see -t option). (Default: 0)") cmndLineParser.add_option( "-l", "--min-match-length", action="store", default=0, type="int", metavar="INT", help="Only include contigs that have a least one match longer or as long as the specified amount. This is implemented after any end trimming (see -q and -t options). (Default: 0)") cmndLineParser.add_option( "-n", "--no-overlap", action="store_true", default=False, help="Add deletions (i.e. -'s) to the reference to accommodate any overlapping sequence, including unmatched sequence. (Default: Condense all overlapping regions of the consensus into IUPAC ambiguity codes.)") cmndLineParser.add_option( "-a", "--no-match-overlap", action="store_true", default=False, help="Add deletions to the reference to accommodate overlapping matches. (Default: Condense all overlapping regions of the consensus into IUPAC ambiguity codes.)") cmndLineParser.add_option( "-k", "--keep-contained", action="store_true", default=False, help="Include contained contigs in the alignment and consensus. (Default: save sequences of contained contigs to a separate file)") cmndLineParser.add_option( "-p", "--save-SNPs", action="store_true", default=False, help="Save SNPs to a .qual file. (Default: Do not save SNP file)") cmndLineParser.add_option( "-d", "--dont-align-contigs", action="store_true", default=False, help="Do not align contigs to the reference. Requires the use of the -c modifier. (Default: align contigs to the reference)") cmndLineParser.add_option( "-r", "--external-ref", action="store", default=None, type="string", metavar="PATH", help="Specify a reference to use instead of the one contained in the quality file.") cmndLineParser.add_option( "-v", "--verbose", action="store_true", default=False, help="Print progress updates and relevant statistics to the standard output. (Default: run silently)") cmndLineParser.add_option( "-i", "--save-run-info", action="store_true", default=False, help="Save a txt file of anything printed to the screen. Requires the use of -v/--verbose. (Default: Do not save)") (options, args) = cmndLineParser.parse_args(commandLine) if options.include_contigs == False: if sum(options.mask_contigs) > 1: errorExit('The option -m/--mask-contigs requires the use of the option -c/--include-contigs.') if sum(options.mask_contig_SNPs) > 1: errorExit('The option -s/--mask-contig-SNPs requires the use of the option -c/--include-contigs.') if options.dont_align_contigs: errorExit('The option -d/--dont-align-contigs requires the use of the option -c/--include-contigs.') if len(args) == 1: cmndLineParser.print_help() sys.exit(0) argNum = len(args) - 1 #counts the amount of arguments, negating the 'qualtofa' at the start of the command line if argNum != 1: errorExit('qualtofa takes exactly 1 argument; %d supplied' % argNum, 0) qualPath = args[1] #The file path to the input file supplied by the user ############################################################################################################################# def verbose(toPrint): if options.verbose: print toPrint, printLog.append(toPrint) ###Quality file parsing###################################################################################################### verbose('qualtofa: Loading data from %s...' % qualPath) qualData = parseQual(qualPath) verbose('Complete\n') if len(qualData) == 4: #if the quality file contains a list of unaligned contigs (e.g. one directly from yasra) isAlignment = False if options.no_match_overlap: errorExit('The option -a/--no-match-overlap requires an aligned quality consensous file, like that given by sumqual.py.') if any(options.mask_consensus): errorExit('The option -M/--mask-consensus requires an aligned quality consensous file, like that given by sumqual.py.') if any(options.mask_SNPs): errorExit('The option -S/--mask-SNPs requires an aligned quality consensous file, like that given by sumqual.py.') if any(options.mask_contig_SNPs): errorExit('The option -s/--mask-contig-SNPs requires an aligned quality consensous file, like that given by sumqual.py.') if options.keep_contained: errorExit('The option -k/--keep-contained requires an aligned quality consensous file, like that given by sumqual.py.') if options.min_match_length: errorExit('The option -l/--min-match-length requires an aligned quality consensous file, like that given by sumqual.py.') if options.save_SNPs: errorExit('The option -p/--save-SNPs requires an aligned quality consensous file, like that given by sumqual.py.') if options.no_overlap: errorExit('The option -n/--no-overlap requires an aligned quality consensous file, like that given by sumqual.py.') if options.external_ref: errorExit('The option -r/--external-ref requires an aligned quality consensous file, like that given by sumqual.py.') contigs, qualities, headers, schema = qualData elif len(qualData) == 7: #if the quality file contains an alignment (i.e. from sumqual) isAlignment = True contigs, qualities, headers, alignmentIndexs, reference, refHeader, schema = qualData else: errorExit() ############################################################################################################################# ###External reference parsing(-r option implimintation)###################################################################### if options.external_ref is not None: verbose('qualtofa: option -r/--external-ref: loading reference at %s...' % options.external_ref) (seqs,headers) = parseFasta(options.external_ref) reference, refHeader = seqs[0], headers[0] verbose('Complete\n') ############################################################################################################################# def delContig(index): del contigs[index] del headers[index] del qualities[index] if isAlignment: del alignmentIndexs[index] if cntgMatchRanges is not None: del cntgMatchRanges[index] def delContigs(indexes): if len(indexes) > 0: indexes.sort() indexes.reverse() indexes = removeRepeats(indexes) for index in indexes: delContig(index) #deletes all contigs def delBase(cntgIndex, baseIndex): del qualities[cntgIndex][baseIndex] del contigs[cntgIndex][baseIndex] if isAlignment: if baseIndex == 0: alignmentIndexs[cntgIndex][0] += 1 else: alignmentIndexs[cntgIndex][1] -= 1 def delBases(cntgIndex, baseIndexes): baseIndexes.sort() baseIndexes.reverse() for index in baseIndexes: delBase(cntgIndex, index) ###Contig end Trimming (-q and -t option implimentation)##################################################################### if options.end_trim > 0: verbose('qualtofa: option -t/--end-trim: Trimming %d bases from both ends of each contig...' % options.end_trim) rejectIndexes = [index for index in range(0,len(contigs)) if len(contigs[index]) <= options.end_trim * 2] #makes a list of all the indexs of contigs that are too short to be trimmed. delContigs(rejectIndexes) #deletes all rejected contigs contigs = [cntg[options.end_trim : -options.end_trim] for cntg in contigs] #trims all remaining contigs qualities = [qual[options.end_trim : -options.end_trim] for qual in qualities] #trims all remaining contigs alignmentIndexs = [[indexRange[0] + options.end_trim,indexRange[1] - options.end_trim] for indexRange in alignmentIndexs] if options.verbose: verbose('Complete\n') if len(rejectIndexes) > 0: verbose('option -t/--end-trim: %d contig(s) were removed from the analysis because they were shorter than twice the trim length of %d.\n' % (len(rejectIndexes),options.end_trim)) if options.end_trim_qual > 0: verbose('qualtofa: option -q/--end-trim-qual: Trimming contig ends until a base with a coverage of %d or more is reached...' % options.end_trim_qual) for index in range(0, len(contigs)): while len(qualities[index]) > 0 and qualities[index][0][-1] < options.end_trim_qual: delBase(index,0) while len(qualities[index]) > 0 and qualities[index][-1][-1] < options.end_trim_qual: delBase(index,-1) rejectIndexes = [index for index in range(0,len(qualities)) if len(qualities[index]) == 0] #makes a list of all the indexs of contigs that dont contain sequence after trimming delContigs(rejectIndexes) #deletes all rejected contigs if options.verbose: verbose('Complete\n') if len(rejectIndexes) > 0: verbose('option -q/--end-trim-qual: %d contig(s) removed from the analysis because all of their sequence had quality values less than %d.\n' % (len(rejectIndexes),options.end_trim_qual)) ############################################################################################################################# ###Contigs Match Coordinate Extraction (-l option implimentation)############################################################ if isAlignment: cntgMatchRanges = [mkRange([index for index in range(0,len(cntg)) if cntg[index].isupper()]) for cntg in contigs] if options.min_match_length > 0: verbose('qualtofa: option -l/--min-match-length: filtering contigs without a match of at least %d base pairs...' % options.min_match_length) maxMatchLens = [max([match[1] - match[0] + 1 for match in matchList] + [0]) for matchList in cntgMatchRanges] rejectIndexes = [index for index in range(0,len(contigs)) if maxMatchLens[index] < options.min_match_length] delContigs(rejectIndexes) if options.verbose: verbose('Complete\n') if len(rejectIndexes) > 0: verbose('option -l/--min-match-length: %d contig(s) removed from the analysis due to the lack of a match %d base pairs long.\n' % (len(rejectIndexes),options.min_match_length)) ############################################################################################################################# ###Contained contig extraction (-k option implimentation)#################################################################### if isAlignment: verbose('qualtofa: Searching for contained contigs...') matchRanges = [[cntgMatchRanges[index][0][0] + alignmentIndexs[index][0],cntgMatchRanges[index][-1][1] + alignmentIndexs[index][0]] for index in range(0,len(cntgMatchRanges))] containedIndexes = [[bIndex for bIndex in range(0,len(matchRanges)) if matchRanges[aIndex][0] <= matchRanges[bIndex][0] and matchRanges[aIndex][1] >= matchRanges[bIndex][1] and aIndex != bIndex] for aIndex in range(0,len(matchRanges))] containedIndexes = [index for contained in containedIndexes for index in contained] containedContigs = [(headers[index], contigs[index]) for index in containedIndexes] verbose('Complete\n') if options.keep_contained == False: delContigs(containedIndexes) if len(containedIndexes) > 0: verbose('qualtofa: %d contained contig(s) removed from the analysis.\n' % len(containedIndexes)) elif len(containedIndexes) > 0: verbose('qualtofa: option -k/--keep-contained: %d contained contig(s) included in the analysis.\n' % len(containedIndexes)) ############################################################################################################################# def getIUPAC(baseValues): '''Takes a list of base character values (ACTG-~N) and condenses them into a single IUPAC code. Assumes only input characters to be ACTG-~N''' def IUPAC(bases): conversion = {'CT':'Y','AG':'R','AT':'W','CG':'S','GT':'K','AC':'M','AGT':'D','ACG':'V','ACT':'H','CGT':'B','ACGT':'N'} bases.sort() #['A', 'C', 'G', 'T'] if bases[0].islower(): bases = map(str.upper,bases) return str.lower(conversion[''.join(bases)]) else: return conversion[''.join(bases)] chars = removeRepeats(baseValues) chars.sort() #['-', 'A', 'C', 'G', 'N', 'T', 'a', 'c', 'g', 'n', 't', '~'] if len(chars) == 0: return '-' if len(chars) == 1: return chars[0] if chars[-1] == '~': #converts '~' to '-' del chars[-1] if chars[0] != '-': chars.insert(0,'-') priorityList = ('ACGT','N','-','acgt','n') for group in priorityList: matchs = [base for base in chars if group.find(base) != -1] if len(matchs) == 0: continue elif len(matchs) == 1: return matchs[0] else: return IUPAC(matchs) ###Overlap handleing (-i, -n options implimentation)############################################################################ if isAlignment: if options.no_match_overlap or options.no_overlap: if options.no_overlap: verbose('qualtofa: option -n/--no-overlap: Adding deletions to the reference to accomodate all overlapping sequence...') else: verbose('qualtofa: option -a/--no-match-overlap: Adding deletions to the reference to accomodate overlapping matches...') matchRanges = [[cntgMatchRanges[index][0][0] + alignmentIndexs[index][0],cntgMatchRanges[index][-1][1] + alignmentIndexs[index][0]] for index in range(0,len(cntgMatchRanges))] if options.no_overlap: interSeqGaps = [alignmentIndexs[index + 1][0] - alignmentIndexs[index][1] -1 for index in range(0,len(contigs) - 1)] else: interSeqGaps = [matchRanges[index + 1][0] - matchRanges[index][1] - 1 for index in range(0,len(contigs) - 1)] verbose('Complete\n') gapSum = sum([abs(gap) for gap in interSeqGaps if gap < 0]) totalLen = sum(map(len,contigs)) if options.no_overlap: verbose('qualtofa: option -n/--no-overlap: %d of %d total bases (%d percent) in the consensus is overlaping sequence.\n' % (gapSum,totalLen,gapSum*100/len(reference))) else: verbose('qualtofa: option -a/--no-match-overlap: %d of %d total bases (%.1f%%) in the consensus are overlapping matchs.\n' % (gapSum,totalLen,float(gapSum)*100/len(reference))) refOffset = 0 for index in range(0,len(interSeqGaps)): if interSeqGaps[index] < 0: #if matching sequence overlaps... for count in range(0,abs(interSeqGaps[index])): reference.insert(matchRanges[index][1] + refOffset + 1, '-') refOffset += abs(interSeqGaps[index]) alignmentIndexs[index + 1][0] += refOffset alignmentIndexs[index + 1][1] += refOffset ########################################################################################################################################## def getProp(call, qual): equivalancies = {'A':0,'C':1,'G':2,'T':3,'N':4} try: callIndex = equivalancies[call.upper()] except KeyError: errorExit('getProp: Non-neucleotide base %s encountered...' % call) if callIndex == 4: return float(max(qual[:4]))/float(qual[-1]) return float(qual[callIndex])/float(qual[-1]) ###Consensous Creation (-M, -S options implimentation)############################################################################ if isAlignment: verbose('qualtofa: Creating consensus...') numMasked, numQualMasked, SNPmasked, SNPqualmasked, numPropMasked, SNPpropMasked, SNPcount = (0,0,0,0,0,0,0) if sum(options.mask_SNPs + options.mask_consensus) > 0: includeMaskedCon = True else: includeMaskedCon = False rawConsensous = [[] for index in range(0,len(reference))] if includeMaskedCon: maskedRawCon = [[] for index in range(0,len(reference))] for cIndex in range(0,len(contigs)): for bIndex in range(0,len(contigs[cIndex])): rIndex = alignmentIndexs[cIndex][0] + bIndex refChar = reference[rIndex] cntgChar = contigs[cIndex][bIndex] rawConsensous[rIndex].append(cntgChar) if includeMaskedCon: if cntgChar.isupper() and refChar.isalpha() and refChar != cntgChar: SNPcount += 1 if qualities[cIndex][bIndex][-1] < options.mask_SNPs[0]: cntgChar = 'N' SNPqualmasked += 1 if getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_SNPs[1]: cntgChar = 'N' SNPpropMasked += 1 if qualities[cIndex][bIndex][-1] < options.mask_SNPs[0] or getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_SNPs[1]: SNPmasked += 1 if contigs[cIndex][bIndex].isalpha(): if qualities[cIndex][bIndex][-1] < options.mask_consensus[0]: numQualMasked += 1 if contigs[cIndex][bIndex].isupper(): cntgChar = 'N' else: cntgChar = 'n' if getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_consensus[1]: numPropMasked += 1 if contigs[cIndex][bIndex].isupper(): cntgChar = 'N' else: cntgChar = 'n' if qualities[cIndex][bIndex][-1] < options.mask_consensus[0] or getProp(contigs[cIndex][bIndex],qualities[cIndex][bIndex]) < options.mask_consensus[1]: numMasked += 1 maskedRawCon[rIndex].append(cntgChar) conSequence = [getIUPAC(charList) for charList in rawConsensous] if includeMaskedCon: maskedConSeq = [getIUPAC(charList) for charList in maskedRawCon] verbose('Complete\n') totalLen = len(conSequence) if sum(options.mask_SNPs) > 0: verbose('qualtofa: %d of %d total consensus SNPs (%.1f%%) were masked.\n' % (SNPmasked,SNPcount, float(SNPmasked*100)/SNPcount)) if options.mask_SNPs[0] > 0: verbose('qualtofa: option -S/--mask-SNPs: %d of %d total SNPs (%.1f%%) have less than %d coverage depth.\n' % (SNPqualmasked,SNPcount, float(SNPqualmasked*100)/SNPcount, options.mask_SNPs[0])) if options.mask_SNPs[1] > 0: verbose('qualtofa: option -S/--mask-SNPs: %d of %d total SNPs (%.1f%%) have less than a %.2f call proportion.\n' % (SNPpropMasked,SNPcount, float(SNPpropMasked*100)/SNPcount, options.mask_SNPs[1])) if options.mask_consensus[0] > 0: verbose('qualtofa: %d of %d total consensus bases (%.1f%% )were masked.\n' % (numMasked,totalLen, float(numMasked*100)/totalLen)) if options.mask_consensus[0] > 0: verbose('qualtofa: option -M/--mask-consensus: %d of %d total bases (%.1f%%) have less than %d coverage depth.\n' % (numQualMasked,totalLen, float(numQualMasked*100)/totalLen, options.mask_consensus[0])) if options.mask_consensus[1] > 0: verbose('qualtofa: option -M/--mask-consensus: %d of %d total bases (%.1f%%) have less than a %.2f call proportion.\n' % (numPropMasked,totalLen, float(numPropMasked*100)/totalLen, options.mask_consensus[1])) ############################################################################################################################# ###Contigs SNP Detection/Masking(-s option implimentation)################################################################### if isAlignment: SNPcovNum, SNPpropNum, SNPmaskedNum, SNPcount = (0,0,0,0) SNPs = [] for cIndex in range(0,len(contigs)): for bIndex in range(0,len(contigs[cIndex])): refPos = bIndex + alignmentIndexs[cIndex][0] refChar = reference[refPos] cntgChar = copy.deepcopy(contigs[cIndex][bIndex]) if cntgChar.isupper() and refChar.isalpha() and cntgChar.isalpha() and refChar != cntgChar: SNPcount += 1 SNPs.append([refPos,refChar,headers[cIndex],bIndex,cntgChar] + qualities[cIndex][bIndex]) if qualities[cIndex][bIndex][-1] < options.mask_contig_SNPs[0]: contigs[cIndex][bIndex] = 'N' SNPcovNum += 1 if getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contig_SNPs[1]: contigs[cIndex][bIndex] = 'N' SNPpropNum += 1 if qualities[cIndex][bIndex][-1] < options.mask_contig_SNPs[0] or getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contig_SNPs[1]: SNPmaskedNum += 1 totalLen = sum(map(len,contigs)) if sum(options.mask_contig_SNPs) > 0: verbose('qualtofa: %d of %d total SNPs (%.1f%%) were masked\n' % (SNPmaskedNum,SNPcount,float(SNPmaskedNum*100)/SNPcount)) if options.mask_contig_SNPs[0] > 0: verbose('qualtofa: option -s/--mask-contig-SNPs: %d of %d total SNPs (%.1f%%) have less than %d coverage depth.\n' % (SNPcovNum,SNPcount,float(SNPcovNum*100)/SNPcount,options.mask_contig_SNPs[0])) if options.mask_contig_SNPs[1] > 0: verbose('qualtofa: option -s/--mask-contig-SNPs: %d of %d total SNPs (%.1f%%) have less than a %.2f call proportion.\n' % (SNPpropNum,SNPcount,float(SNPpropNum*100)/SNPcount,options.mask_contig_SNPs[1])) ############################################################################################################################# ###Contigs Masking(-m option implimentation)################################################################################# if sum(options.mask_contigs) > 0: numMasked, numQualMasked, numPropMasked = (0,0,0) for cIndex in range(0,len(contigs)): for bIndex in range(0,len(contigs[cIndex])): cntgChar = copy.deepcopy(contigs[cIndex][bIndex]) if cntgChar.isalpha(): if qualities[cIndex][bIndex][-1] < options.mask_contigs[0] or getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contigs[1]: numMasked += 1 if qualities[cIndex][bIndex][-1] < options.mask_contigs[0]: if contigs[cIndex][bIndex].isupper(): contigs[cIndex][bIndex] = 'N' else: contigs[cIndex][bIndex] = 'n' numQualMasked += 1 if getProp(cntgChar,qualities[cIndex][bIndex]) < options.mask_contigs[1]: if contigs[cIndex][bIndex].isupper(): contigs[cIndex][bIndex] = 'N' else: contigs[cIndex][bIndex] = 'n' numPropMasked += 1 totalLen = sum(map(len,contigs)) verbose('qualtofa: %d bases of %d total bases (%.1f%%) were masked.\n' % (numMasked,totalLen, float(numMasked*100)/totalLen)) verbose('qualtofa: option -m/--mask-contigs: %d of %d total bases (%.1f%%) have less than %d coverage depth.\n' % (numQualMasked,totalLen, float(numQualMasked*100)/totalLen, options.mask_contigs[0])) verbose('qualtofa: option -m/--mask-contigs: %d of %d total bases (%.1f%%) have less than a %.2f call proportion.\n' % (numPropMasked,totalLen, float(numPropMasked*100)/totalLen, options.mask_contigs[1])) ############################################################################################################################# ###Contianed Contigs file saving############################################################################################# if len(containedContigs) > 0: outPath = qualPath + '_contained_contigs.fsa' verbose('qualtofa: Saving contained contigs to %s...' % outPath) saveFasta(outPath, *zip(*containedContigs)) verbose('Complete\n') ############################################################################################################################# ###SNP file saving########################################################################################################### if options.save_SNPs: outPath = qualPath + '_SNPs.qual' verbose('qualtofa: Saving %d SNPs to %s...' % (len(SNPs),outPath)) outHandle = open(outPath, 'w') outHandle.write('Schema:\nPosition\tReference\tName\tIndex\tcall\tA\tC\tG\tT\tdash\tsum\n') for SNP in SNPs: outHandle.write('\t'.join(map(str,SNP)) + '\n') outHandle.close() verbose('Complete\n') ############################################################################################################################# ###Contig sequence alignment################################################################################################# if isAlignment and options.dont_align_contigs == False: contigs = [['-'] * alignmentIndexs[index][0] + contigs[index] for index in range(0,len(contigs))] ############################################################################################################################# ###Output file Saving######################################################################################################## verbose('qualtofa: Saving fasta output to %s...' % (qualPath + '.fsa')) conHeader = 'Consensus' maskedConHeader = 'Masked Consensus' if isAlignment: outputHeads, outputSeqs = ([refHeader,conHeader],[reference,conSequence]) if includeMaskedCon: outputHeads.append(maskedConHeader) outputSeqs.append(maskedConSeq) else: outputHeads, outputSeqs = ([],[]) if options.include_contigs: outputHeads += headers outputSeqs += contigs outPath = os.path.join(os.getcwd(), os.path.basename(qualPath) + '.fsa') saveFasta(outPath, outputHeads, outputSeqs) verbose('Complete\n') if options.save_run_info: runtimeOutPath = os.path.join(os.getcwd(), 'qualtofa_runtime_output_%s.txt' % timeStamp) verbose('qualtofa: option -i/--save-run-info: saving run-time output to %s...' % runtimeOutPath) runHandle = open(runtimeOutPath, 'w') for line in printLog: runHandle.write(line) runHandle.close() verbose('Complete\n') #############################################################################################################################