Gapstrip.py
From OpenWetWare
Jump to navigationJump to search
Copy and past the following into a text editor and save it as gapstrip.py to use the script.
#!/usr/bin/env python
#############################################################################################################################
#gapstrip [options] <fasta file> #
#Example: python ../gapstrip -n 4 -p ../myfile.fsa #
#Version: 1.0 Last Modified: 12/30/09 Author: Zachary S. L. Foster #
#---------------------------------------------------------------------------------------------------------------------------#
#Argument: A fasta file containing sequences aligned to a reference or each other. The first sequence is considered to the #
# reference by default #
#---------------------------------------------------------------------------------------------------------------------------#
#>Searches for base indices at which all of the aligned sequences have a '-' #
#>Ignores differences caused by the reference (a.k.a. first sequence in the input file by default), or the number of lines #
# specified by the -n modifier, if used. #
#>Removes the data for every sequence at each indice where all bases were '-', effectively removing each gap. #
#---------------------------------------------------------------------------------------------------------------------------#
#Modifiers: #
# -n : specify number of top lines to ignore in gap strip (default = 1) #
# -d : save debug log to current working directory #
# -p : print output #
#############################################################################################################################
###Imports / Variable Initilization##########################################################################################
import os, string, sys
defArgList = ['gapstrip.py','-p','-n','2','C:/Python26/gapStripDef.fsa'] #argument list used during script testing the with Python GUI IDLE
argList = sys.argv #argument list supplied by user
argNum = len(argList) #the number of arguments supplied
minArgNum = 1 #the smallest amount of arguments with which it is possible to run the script
saveDebug = False #is True if the debug is to be saved
warning = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug law
printOut = False #is Ture if the -p modifier is supplied; the output will be printed to standard out (usually the screen)
helpOnly = False #is 'True' when no arguments are given; only help menu is printed
debugLog = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied
savePath = './' #the directory in which the output will be saved
defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE
modifiers = [] #eventually contains a list of all modifiers and their arguments supplied by the user
allMods = 'ndp' #all the modifiers recognized by the script
activeMods = '' #a string containing all modifiers specified by the user
modArgs = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers
faData = [] #will hold all data from faRaw in the following format [[name of contig, sequence],[..]..]
outData = []
editLog = [] #a list of modifications made to the data in order to align it to the reference
topNum = 1 #indicates the number of sequences to ignore; can be changed with the -n modifier
#############################################################################################################################
def printHelp():
print '/---------------------------------------------------------------------------------------\\'
print '| gapstrip [options] <fasta file> |'
print '|---------------------------------------------------------------------------------------|'
print '| Example: python ../gapstrip -n 4 -p ../myfile.fsa |'
print '|---------------------------------------------------------------------------------------|'
print '| Version: 1.0 Last Modified: 12/30/09 Author: Zachary S. L. Foster |'
print '| >Searches for base indices at which all of the aligned sequences have a - |'
print '| >Ignores differences caused by the reference (a.k.a. first sequence in the input file|'
print '| by default), or the number of lines specified by the -n modifier, if used. |'
print '| >Removes the data for every sequence at each index where all bases were -, |'
print '| effectively removing each gap. |'
print '|---------------------------------------------------------------------------------------|'
print '| Modifiers: |'
print '| -d : Save debug log |'
print '| -p : Print output to standard out (usually the shell being called from) |'
print '| -n : specify number of top lines to ignore in gap strip (default = 1) |'
print '\\---------------------------------------------------------------------------------------/'
#Error handling function#####################################################################################################
#>Is called when the script encounters a fatal error #
#>Prints the debug log to standard out (usually the screen)
def errorExit():
print 'The program was forced to exit prematurely, printing debug log...\n'
for line in debugLog:
print line
sys.exit()
#############################################################################################################################
#############################################################################################################################
#>function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10) #
def mkRange(numList):
outList = []
start = numList[0]
current = numList[0]
numList.append(numList[-1])
for num in numList[1:]:
if (num - 1) != current:
outList.append([start,current])
start = num
current = num
return outList
#############################################################################################################################
###Argument Interprtation####################################################################################################
if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...
if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied...
if os.path.exists(argList[-1]) == 0: #if the path dose not exist
debugLog.append('Error: Invalid file path to fasta file')
errorExit() #end the program
elif argNum == 1: #If no arguments are supplied...
helpOnly = True
printHelp() #prints help menu
else:
debugLog.append('Error: Too few arguments supplied\n')
errorExit() #end the program
else: #If the script is being imported on to IDLE
argList = defArgList #use default arguments
argNum = len(defArgList)
if argNum == 1: #If no arguments are supplied...
helpOnly = True
printHelp() #prints help menu
savePath = defSavePath #sets the save path to the default, specified in the variable initialization section
debugLog.append('Alert: default debugging input arguments are being used\n')
if helpOnly == False: #if arguments were supplied...
faPath = argList[-1] #the path to the fasta file containing the input data
if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments)
modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their argum
#############################################################################################################################
if helpOnly == False:
###Modifier Interpretation###############################################################################################
#>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation" #
#>Given arguments are compared against a list of known arguments #
#>Matches found change the appropriate variable for the desired effect of the modifier the script #
if len(modifiers) > 0: #if modifiers are supplied
for mod in modifiers: #loops through the list of modifiers and modifier arguments
if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters
activeMods += mod[1:] #sorts the modifiers into activeMods...
else:
modArgs.append(mod) #assumes everything else to be a modifier argument
for letter in activeMods: #checks if the modifiers are recognized
if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script
debugLog.append('Warning: Unexpected modifier: ' + letter + '\n')
warning = True #if the input modifier is not found
else:
if letter == 'd': #if -d is supplied...
saveDebug = True #The debug log will be saved to the current working directory
if letter == 'p': #if -p is supplied...
printOut = True #The results will be printed to the standard output (usually the screen)
if letter == 'n': #if -n is supplied...
if len(modArgs) > 0: #if there is at least one non-processed modifier argument
topNum = int(modArgs[0]) #The string specifying the desired lines to process from the input file
del modArgs[0] #the original argument is deleted from the list
else: #if the list of modifier arguments is empty...
print 'Error: Modifier argument not supplied\n'
sys.exit() #exit the script
#########################################################################################################################
###Query fasta input data parsing and file procedures####################################################################
#>extracts the input data from the files at the supplied path #
#>Parses the data into a list in the following format [[name of contig, sequence],[..]..] #
faHandle = open(faPath, 'r') #creates a file object
faRaw = faHandle.readlines() #all of the content of the input file in its raw form
faHandle.close() #closes the file object
faRaw.append('> ') #makes it so the last sequence can be processed in the following loop
cntgSeq = ''
cntgName = ''
for line in faRaw: #loops through the lines of the input file
if line[0] == '>': #if the first character of the line is a '>'
if len(cntgName) > 0 and len(cntgSeq) > 0:
faData.append([cntgName,cntgSeq])
cntgSeq = ''
cntgName = line[1:-1] #[1:-1] removes '>' and '\n'
else:
if line[0] == ' ': #if the line begins with a space...
line = line[1:] #remove the space
if line[-1] == '\n': #if the line ends with a newline...
line = line[:-1] #remove the newline
cntgSeq += line #add contents of the line to the total sequence for this contig
#########################################################################################################################
###Pointless spacer removal##############################################################################################
delList = [] #will contain the indices at which all of the sequences have '-' and must be deleted
refData = faData[0][1] #the reference is assumed to be the first sequence (even if the -n modifier is set to 0)
for refIndex in range(0,len(refData)): #loops through each index in terms of the reference
pointlessSpace = True #each index is assumed to be a space
for cntgIndex in range(topNum,len(faData)): #loops through all the sequences that are being checked for spaces
seqLen = len(faData[cntgIndex][1])
if refIndex < seqLen: #if the index being tested is present on this contig...
bp = faData[cntgIndex][1][refIndex]
if bp != '-': #if the base is not a space...
pointlessSpace = False
if pointlessSpace:
delList.append(refIndex)
if len(delList) > 0:
delList = mkRange(delList) #makes a list of indices into a list of index ranges
delList.reverse() #reverses the order of the list to preserve the accuracy of the index values in delList as they are deleted from faData
for theRange in delList: #for every range specified by delList...
for cntgIndex in range(0,len(faData)): #for every contig is faData...
faData[cntgIndex][1] = faData[cntgIndex][1][:theRange[0]] + faData[cntgIndex][1][theRange[1] + 1:] #remove the section determined to be spaces
else:
print 'No gaps found...'
#########################################################################################################################
###Out file writing and saveing procedures###############################################################################
fileSavePath = savePath + 'GapStrip.' + os.path.basename(faPath) #the path to where the output is saved
outHandle = open(fileSavePath, 'w') #opens the file object for saving the output
for line in faData: #prints every line of outData to the output file...
outHandle.write('>' + line[0] + '\n') #write contig name
outHandle.write(line[1] + '\n') #write contig sequence
if printOut: #if the -p modifier is supplied
for line in faData: #prints every line to the standard output
print '>' + line[0]
print line[1]
outHandle.close()
#########################################################################################################################
###Debug Saving procedures###############################################################################################
if saveDebug: #if the -d modifier is supplied
fileSavePath = savePath + 'GSDebug.' + os.path.basename(faPath) #the path to where the debug log is to be saved
debugHandle = open(fileSavePath, 'w') #opens the file object for saving the debug log
for line in debugLog:
debugHandle.write(line)
debugHandle.close()
#########################################################################################################################