Sumqual.py

From OpenWetWare
Jump to navigationJump to search

Copy and past the following into a text editor and save it as sumqual.py to use the script.

#!/usr/bin/env python

###Imports / Variable Initilization##########################################################################################
import os, string, sys, time, copy
from datetime import *
from optparse import *
from subprocess import *
commandLine = copy.deepcopy(sys.argv)
cwd = os.getcwd()
timeStamp = datetime.now().ctime().replace(' ','-').replace(':','-').replace('--','-')  #Ex: 'Mon-Jun-14-11-08-55-2010'
cntgMatchRanges = None
progName, progVersion = ('sumqual.py','2.12')
progUsage = 'python %s [options] <.qual file> <NUCmer file path> <Reference Path>' % progName
printLog = []
#############################################################################################################################

def errorExit(message=None,exitStatus=1):
    '''Version 1.0
    Is called when the script encounters a fatal error. Prints the debug log to standard out (usually the screen)'''
    
    if message:
        print '%s: Error: %s' % (progName, message)
    else:
        print progName +': Unknown error, printing debug log...'
        for line in debugLog: print line   #print debug log
    sys.exit(exitStatus)

def mkRange(numList):
    '''Function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)'''
    outList = []
    if len(numList) > 0:
        start   = numList[0]
        current = numList[0]
        numList.append(numList[-1])
        for num in numList[1:]:
            if (num - 1) != current:
                outList.append([start,current])
                start = num
            current = num
    return outList

def saveQualStandard(filePath, contigs, qualValues, headers, schema=None):
    if schema is None: schema = ['Position in the contig(1 based)', 'Called base', "Number of reads suppoting 'A'", "Number of reads supporting 'C'", "Number of reads supporting 'G'", "Number of reads supporting 'T'", "Number of reads supporting '_'", 'Coverage']
    if len(contigs) == 0: errorExit('saveQualStandard: empty list supplied')
    if len(contigs) != len(qualValues) or len(contigs) != len(headers): errorExit('saveQualStandard: Unequal number of entries is input list')
    outHandle = open(filePath, 'w')
    outHandle.write('Schema:'+ '\n' + '\t'.join(schema) + '\n')
    for cIndex in range(0,len(contigs)):
        outHandle.write(headers[cIndex] + '\n')
        for bIndex in range(0,len(contigs[cIndex])): outHandle.write('\t'.join([contigs[cIdnex][bIndex]] + qualValues[cIndex]) + '\n')
    outHandle.close()
                
def parseQualStandard(filePath):
    columbWidth = 8
    fileHandle = open(filePath, 'r')
    fileData = [line.strip().split('\t') for line in fileHandle.readlines()]   #fileData is a list of lists representing the tab-deliminated structure of the .qual file      
    fileHandle.close()
    if fileData[0] == ['Schema:']: del fileData[0]   #removes the unessesary first line of the file, if present
    schema = fileData[0]   #extract the tab-deliminated schema into a list
    headers, contigs, qualValues = ([], [], [])   #initilizes multiple lists at a time
    index = 0
    for line in fileData[1:]:
        if len(line) == 1:   #if it is a header
            headers.append(line[0])
            contigs.append([])
            qualValues.append([])
        elif len(line) == columbWidth:   #if it is  quality value
            contigs[-1].append(line[1])
            qualValues[-1].append([int(num) for num in line[2:]])   #converts all of the quality values into integer objects and saves them in qualvalues
        else: errorExit('parseQualStandard: Quality file at %s deviated from the expected format on line %d.' % (filePath, index))
        index += 1
    for i in range(0,len(contigs)):
        if headers[i] == 'Contig128': print map(len,(contigs[i], qualValues[i]))
    return (contigs, qualValues, headers, schema)

def parseQualAlignment(filePath):
    columbWidth = 9
    headColNum = 2
    fileHandle = open(filePath, 'r')
    fileData = [line.strip().split('\t') for line in fileHandle.readlines()]   #fileData is a list of lists representing the tab-deliminated structure of the .qual file  
    fileHandle.close()
    if fileData[0] == ['Schema:']: del fileData[0]   #removes the unessesary first line of the file, if present
    schema = fileData[0]   #extract the tab-deliminated schema into a list
    refHeader = fileData[1][0]
    reference = [line[1] for line in fileData[2:]]   #extracts the reference from the second columb
    qualData = [line[headColNum:] for line in fileData[2:]]   #removes the first lines and columbs of fileData, which dont contain unextracted data
    headers, contigs, cntgStarts, qualValues = ([], [], [], [])   #initilizes multiple lists at a time
    refPos = 0
    for line in qualData:
        if len(line) > 0:
            for index in range(0,len(line),columbWidth):
                if headers.count(line[index]) == 0:   #if the contig header has not been encountered before
                    headers.append(line[index])
                    cntgStarts.append(refPos)
                    contigs.append([])
                    qualValues.append([])
                headIndex = headers.index(line[index])
                contigs[headIndex].append(line[index + 2])   #append the called base character to its contig
                qualValues[headIndex].append([int(num) for num in line[index + 3: index + columbWidth]])   #converts all of the quality values into integer objects and saves them in qualvalues 
        refPos += 1
    alignmentIndexs = [[cntgStarts[index], cntgStarts[index] + len(contigs[index]) - 1] for index in range(0,len(cntgStarts))]
    return (contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema)

def parseQual(filePath):
    columbWidth = 9
    headColNum = 2
    fileHandle = open(filePath, 'r')    
    fileHead = fileHandle.readline()
    if fileHead[0].strip() == 'Schema:': fileHandle.readline()   #removes the unessesary first line of the file, if present
    schema = fileHandle.readline().strip().split('\t')
    fileHandle.close()
    if schema[1] == 'Reference' and len(schema) % columbWidth == headColNum:
        return parseQualAlignment(filePath)
    elif len(schema) == 8:
        return parseQualStandard(filePath)
    else: errorExit('parseQual: The quality file at %s has an unreconized format based on the schema on line 2.' % filePath)

def parseFasta(filePath):
    headers = []
    seqs = []
    fileHandle = open(filePath, 'r')
    fileData = fileHandle.readlines() + ['>']   #the additional header line is there make to following for loop process the last sequence 
    fileHandle.close()
    currSeq = ''
    for line in fileData:
        if line[0] == '>':   #if the line is a fasta header
            headers.append(line[1:].strip() )   #the fisrt character is the '>' and the last is a newline '\n'
            seqs.append(list(currSeq))
            currSeq = ''
        else:
            currSeq += line.strip()
    return (headers[:-1], seqs[1:])

def removeRepeats(aList):
    for entry in aList:
        if aList.count(entry) > 1:
            aList.remove(entry)
            return removeRepeats(aList)
    return aList

###Command Line Parser#######################################################################################################
cmndLineParser  = OptionParser(usage=progUsage, version="Version %s" % progVersion)
cmndLineParser.add_option(   "-v",   "--verbose",            action="store_true",    default=False,     help="Print progress updates and relevant statistics to the standard output. (Default: run silently)")
cmndLineParser.add_option(   "-g",   "--save-gaps",          action="store_true",    default=False,     help="Save the reference sequence corresponding to empty gaps in the consensus in a fasta file. (Default: Do not save)")
cmndLineParser.add_option(   "-n",   "--no-names",           action="store_true",    default=False,     help="Do not include the name of the original contig with each base. Output cannot be used with qualtofa. (Default: the name of the contig each base came from is included)")
cmndLineParser.add_option(   "-r",   "--no-reference",       action="store_true",    default=False,     help="Do not include the reference in the ouput. Output cannot be used with qualtofa. (Default: include reference)")
cmndLineParser.add_option(   "-i",   "--save-run-info",      action="store_true",    default=False,     help="Save a txt file of anything printed to the screen. Requires the use of -v/--verbose. (Default: Do not save)")
(options, args) = cmndLineParser.parse_args(commandLine)
if len(args) == 1:
    cmndLineParser.print_help()
    sys.exit(0)
argNum = len(args) - 1   #counts the amount of arguments, negating the 'qualtofa' at the start of the command line
if argNum != 3: errorExit('qualtofa takes exactly 3 arguments; %d supplied' % argNum, 0)
qualPath, nucPath, refPath = args[1:]   #The file path to the input file supplied by the user
#############################################################################################################################

def verbose(toPrint):
    if options.verbose:
        print toPrint,
        printLog.append(toPrint)

def parseNucmer(filePath):
    try: fileHandle = open(filePath, 'r')
    except IOError: errorExit('parseNucmer: could not open file at %s' % filePath)
    fileData = fileHandle.read()
    fileHandle.close()
    nucInPaths = fileData[0]
    #a '>' appears before the name of every contig and each match is sepeated by a '0' on its own line (i.e. '\n0\n')
    #data is parsed by contig ('>'), match ('\n0\n'), and line '\n'. 
    contigData = [[cntg.split('\n') for cntg in group.split('\n0\n')[:-1]] for group in fileData[1:].split('>')][1:] 
    for cIndex in range(0,len(contigData)):
        if len(contigData[cIndex]) > 1:
            for gIndex in range(1,len(contigData[cIndex])):
                cntgHeadParts = contigData[cIndex][0][0].split()
                cntgHeadParts[1] += '(%d)' % (gIndex + 1)
                contigData[cIndex][gIndex].insert(0,' '.join(cntgHeadParts))   #adds contigs information to each sub contig
    contigData = [cntg for group in contigData for cntg in group]   #makes into one contig list
    headers = zip(*[cntg[0].split(' ') for cntg in contigData])
    refHeads, cntgHeads, refLenghts, cntgLengths = (headers[0],list(headers[1]),map(int,headers[2]),map(int,headers[3]))
    alignStats = [map(int,cntg[1].split()) for cntg in contigData]
    deltas = [map(int,cntg[2:]) for cntg in contigData]
    outList = (alignStats, cntgHeads, refHeads, refLenghts, cntgLengths, deltas)
    return outList
    
###Input data file parsing and consolidation#################################################################################
verbose('sumqual: Loading quality information from file at %s...' % qualPath,)
qualData = parseQual(qualPath)
verbose('Complete\nsumqual: Loading Nucmer output from file at %s...' % nucPath,)
nucData = parseNucmer(nucPath)
verbose('Complete\nsumqual: Loading reference from file at %s...' % refPath,)
refData = parseFasta(refPath)
verbose('Complete\nsumqual: Consolidating input data...',)
if len(qualData) == 7: errorExit('Sumqual requires a standard quality file (e.g one directly from YASRA); qualitiy alignment file, %s supplied.' % qualPath)
nucInQualCounts = [list(nucData[1]).count(name) for name in qualData[2]] #the number of times each name from the qual file appears in the nucmer one
if max(nucInQualCounts) > 1: errorExit('Contigs with duplicate names deteced in the nucmer file')
delIndexes = [index for index in range(0,len(nucInQualCounts)) if nucInQualCounts[index] == 0] #the indexes of all contigs that dont appear in the nucmer output
delIndexes.reverse()
for index in range(0,3):
    for dIndex in delIndexes: del qualData[index][dIndex]
matchCount = 1
for nucIndex in range(0,len(nucData[1])):
    nucHeader = nucData[1][nucIndex]
    if nucIndex > 0 and nucHeader.startswith(nucData[1][nucIndex - matchCount]) and nucHeader.endswith('(%d)' % (matchCount + 1)):
        matchCount += 1
        for listIndex in range(0,3): qualData[listIndex].insert(nucIndex,copy.deepcopy(qualData[listIndex][nucIndex - matchCount + 1]))
        continue
    else: matchCount = 1
    if qualData[2].count(nucHeader) == 0: errorExit('Contig "%s" in file %s could not be found in %s' % (nucHeader,nucPath,qualPath))
    inIndex = qualData[2].index(nucHeader)
    for listIndex in range(0,3):
        qualData[listIndex].insert(nucIndex,qualData[listIndex][inIndex])
        del qualData[listIndex][inIndex + 1]
contigs, qualValues, qualHeaders, inSchema = qualData
matchStats, headers, refHeads, refLenghts, cntgLengths, deltas = nucData
if sum([refHeads[0] != refHeads[index] for index in range(0,len(refHeads))]) > 1: errorExit('Multiple references detected in %s. sumqual does not support multiple references.' % nucPath)
nucRefHeader = refHeads[0]
refHeaders, refSeqs = refData
if len(refHeaders) == 0: errorExit('The reference file at %s is empty.' % refPath)
if len(refHeaders) > 1: print 'Caution: Multiple sequences decteced in reference file at %s. Only the first sequence, named %s, will be used.\n' % (refPath,refHeaders[0])
refHeader, refSeq = (refHeaders[0], refSeqs[0])
verbose('Complete\n')
#############################################################################################################################

###Revsersed sequence Handleing##############################################################################################
def compliment(seq):
    conversionKey = {'A':'T','T':'A','G':'C','C':'G'}
    outSeq = []
    for base in seq: outSeq.append(conversionKey[base])
    return outSeq


reversedIndexes = []
for cIndex in range(0,len(contigs)):
    if matchStats[cIndex][2] > matchStats[cIndex][3] and matchStats[cIndex][1] >= matchStats[cIndex][0]:
        revMatch = matchStats[cIndex]
        matchStats[cIndex][2] = len(contigs[cIndex]) - revMatch[2] + 1
        matchStats[cIndex][3] = len(contigs[cIndex]) - revMatch[3] + 1
        reversedIndexes.append(cIndex)
    elif matchStats[cIndex][0] > matchStats[cIndex][1] and matchStats[cIndex][3] >= matchStats[cIndex][2]:
        temp = matchStats[cIndex][0]
        matchStats[cIndex][0] = matchStats[cIndex][1]
        matchStats[cIndex][1] = temp
        reversedIndexes.append(cIndex)
for index in reversedIndexes:
    headers[index] += '(reverse)'
    contigs[index].reverse()
    contigs[index] = compliment(contigs[index])

if len(reversedIndexes) > 0: verbose('sumqual: %d reversed contig alignments found.\n' % len(reversedIndexes))
#############################################################################################################################   

###Case Modification#########################################################################################################
verbose('sumqual: Makeing all matching sequence uppercase and all unmatching lowercase...',)
for cIndex in range(0,len(contigs)):
    contigs[cIndex] = [char.lower() for char in contigs[cIndex][:matchStats[cIndex][2] - 1]] + \
                      [char.upper() for char in contigs[cIndex][matchStats[cIndex][2] - 1:matchStats[cIndex][3]]] + \
                      [char.lower() for char in contigs[cIndex][matchStats[cIndex][3]:]]
verbose('Complete\n')
#############################################################################################################################

###Indel incorperation#######################################################################################################
def deltaCoords(deltas, refStart, cntgStart):
    if type(deltas) != list: errorExit('deltaCoords: list required...')
    if len(deltas) == 0: return ([],[])
    refOut, cntgOut = ([],[])
    cntgPos, refPos = cntgStart, refStart
    for delta in deltas:
        cntgPos += abs(delta)
        refPos += abs(delta)
        if delta > 0: cntgOut.append(cntgPos)
        elif delta < 0: refOut.append(refPos)
    return (refOut, cntgOut)

def addCntgDel(delIndex, cntgIndex):
    contigs[cntgIndex].insert(delIndex,'-')
    qualValues[cntgIndex].insert(delIndex,[0]*6)
    matchStats[cIndex][3] += 1
    
def addRefDel(delIndex, cntgIndex):
    isAfterIndel = [index for index in range(0,len(contigs)) if delIndex <= matchStats[index][0] - 1 and index != cntgIndex]
    for index in isAfterIndel:
        matchStats[index][0] += 1
        matchStats[index][1] += 1
    refSeq.insert(delIndex,'-')
    delsToAdd = [(delIndex - matchStats[cIndex][0] + matchStats[cIndex][2], cIndex) for cIndex in range(0,len(contigs)) if delIndex >= matchStats[cIndex][0] - 1 \
                                                                         and delIndex <= matchStats[cIndex][1] - 1 \
                                                                         and (delIndex,cIndex) not in allRefDels \
                                                                         and cntgIndex != cIndex]
    for cDel in delsToAdd: addCntgDel(*cDel)
    for index in range(0,len(allRefDels)):
        if allRefDels[index][1] != cntgIndex and allRefDels[index][0] > delIndex: allRefDels[index][0] += 1


verbose('sumqual: Incorperating Nucmer indel information...',)
allCntgDels = [[cntgDel,index] for index in range(0,len(contigs)) for cntgDel in deltaCoords(deltas[index], matchStats[index][0] - 2, matchStats[index][2] - 2)[1]]
allRefDels = [[refDel,index] for index in range(0,len(contigs)) for refDel in deltaCoords(deltas[index], matchStats[index][0] - 2, matchStats[index][2] - 2)[0]]
for cntgDel in allCntgDels: addCntgDel(*cntgDel)
for refDel in allRefDels: addRefDel(*refDel)
verbose('Complete\nsumqual: %d reference deltions and %d contig deletions (i.e. "-") added.\n' % (len(allRefDels),len(allCntgDels)))                      
#############################################################################################################################

for i in range(0,len(contigs)):
    if headers[i] == 'Contig128': print map(len,(contigs[i], qualValues[i]))


def saveQualAlignment(savePath, contigs, qualValues, headers, alignmentIndexs, reference, refHeader, schema=None):
    firstBaseIndex = min([stats[0] - stats[2] for stats in alignmentIndexs])
    lastBaseIndex = max([alignmentIndexs[index][0] - alignmentIndexs[index][2] + len(contigs[index]) - 1 for index in range(0,len(contigs))])
    outList = [[str(index + 1),refSeq[index]] for index in range(0,len(refSeq))]
    outList = [[str(index),'-'] for index in range(firstBaseIndex,0)] + outList + [[str(index),'-'] for index in range(len(refSeq),lastBaseIndex + 1)]
    for cIndex in range(0,len(headers)):
        if headers[cIndex] == 'Contig128':
            print map(len,(contigs[cIndex],qualValues[cIndex]))
            #print contigs[cIndex]
        quals = [[headers[cIndex], index + 1, contigs[cIndex][index]] + qualValues[cIndex][index] for index in range(0,len(contigs[cIndex]))]
        quals = [map(str,line) for line in quals]
        outPos = alignmentIndexs[cIndex][0] - alignmentIndexs[cIndex][2] #+ alignmentIndexs[0][2] - alignmentIndexs[0][0]
        if firstBaseIndex < 0: outPos += abs(firstBaseIndex)
        for qIndex in range(0,len(quals)): outList[outPos + qIndex] += quals[qIndex]
    if schema == None: schema = ['Position', 'Reference', 'Name', 'Index', 'Call', 'A', 'C', 'G', 'T', 'Dash', 'Sum']
    maxCols = (max(map(len,outList)) - 2) / 6
    if maxCols > 1: schema += [part + '-alt' + str(count) for count in range(2,maxCols + 1) for part in schema[2:]]
    outHandle = open(savePath, 'w')
    outHandle.write('Schema:\n' + '\t'.join(schema) + '\n' + refHeader + '\n')
    for line in map('\t'.join,outList): outHandle.write(line + '\n')
    outHandle.close()
    
###Output file saveing#######################################################################################################
outPath = os.path.join(os.getcwd(), os.path.basename(nucPath) + '.qual')
verbose('sumqual: Saving quality consensus alignment to %s...' % outPath,)
saveQualAlignment(outPath, contigs, qualValues, headers, matchStats, refSeq, refHeader)
verbose('Complete\n')
if options.save_run_info:
    runtimeOutPath = os.path.join(os.getcwd(), 'sumqual_runtime_output_%s.txt' % timeStamp)
    verbose('sumqual: option -i/--save-run-info: saving run-time output to %s...' % runtimeOutPath)
    runHandle = open(runtimeOutPath, 'w')
    for line in printLog: runHandle.write(line)
    runHandle.close()
    verbose('Complete\n')
#############################################################################################################################