Gapstrip.py

Copy and past the following into a text editor and save it as gapstrip.py to use the script.
 * 1) !/usr/bin/env python


 * 1) gapstrip [options]                                                                                           #
 * 2) Example: python ../gapstrip -n 4 -p ../myfile.fsa                                                                         #
 * 3) Version: 1.0  Last Modified: 12/30/09     Author: Zachary S. L. Foster                                                    #
 * 4) Argument: A fasta file containing sequences aligned to a reference or each other. The first sequence is considered to the #
 * 5)   reference by default                                                                                                    #
 * 6) >Searches for base indices at which all of the aligned sequences have a '-'                                               #
 * 7) >Ignores differences caused by the reference (a.k.a. first sequence in the input file by default), or the number of lines #
 * 8)   specified by the -n modifier, if used.                                                                                  #
 * 9) >Removes the data for every sequence at each indice where all bases were '-', effectively removing each gap.              #
 * 10) Modifiers:                                                                                                                #
 * 11)   -n      : specify number of top lines to ignore in gap strip        (default = 1)                                       #
 * 12)   -d      : save debug log to current working directory                                                                   #
 * 13)   -p      : print output                                                                                                  #
 * 1) Modifiers:                                                                                                                #
 * 2)   -n      : specify number of top lines to ignore in gap strip        (default = 1)                                       #
 * 3)   -d      : save debug log to current working directory                                                                   #
 * 4)   -p      : print output                                                                                                  #

import os, string, sys defArgList = ['gapstrip.py','-p','-n','2','C:/Python26/gapStripDef.fsa'] #argument list used during script testing the with Python GUI IDLE argList    = sys.argv #argument list supplied by user argNum     = len(argList) #the number of arguments supplied minArgNum  = 1 #the smallest amount of arguments with which it is possible to run the script saveDebug  = False #is True if the debug is to be saved warning    = False #is set to 'True' if the program encounters any minor errors during the analysis; recorded in debug law printOut   = False #is Ture if the -p modifier is supplied; the output will be printed to standard out (usually the screen) helpOnly   = False #is 'True' when no arguments are given; only help menu is printed debugLog   = ['***********DEBUG LOG***********\n'] #where all errors/anomalies are recorded; saved if the -d modifier is supplied savePath   = './' #the directory in which the output will be saved defSavePath = 'C:/Python26/' #save path used during script testing with the python GUI IDLE modifiers  = [] #eventually contains a list of all modifiers and their arguments supplied by the user allMods    = 'ndp' #all the modifiers recognized by the script activeMods = '' #a string containing all modifiers specified by the user modArgs    = [] #arguments supplied for the modifiers; assumed in the same order as the modifiers faData     = [] #will hold all data from faRaw in the following format [[name of contig, sequence],[..]..] outData     = [] editLog     = [] #a list of modifications made to the data in order to align it to the reference topNum      = 1 #indicates the number of sequences to ignore; can be changed with the -n modifier
 * 1) Imports / Variable Initilization##########################################################################################

def printHelp: print '/---\\' print '| gapstrip [options]                                                      |' print '|---|' print '| Example: python ../gapstrip -n 4 -p ../myfile.fsa                                    |' print '|---|' print '| Version: 1.0  Last Modified: 12/30/09     Author: Zachary S. L. Foster               |' print '| >Searches for base indices at which all of the aligned sequences have a -            |' print '| >Ignores differences caused by the reference (a.k.a. first sequence in the input file|'    print '|    by default), or the number of lines specified by the -n modifier, if used. |'   print '|  >Removes the data for every sequence at each index where all bases were -,           |' print '|   effectively removing each gap. |'   print '|---|' print '| Modifiers:                                                                           |' print '| -d  : Save debug log                                                                 |' print '| -p  : Print output to standard out (usually the shell being called from)             |' print '| -n  : specify number of top lines to ignore in gap strip        (default = 1)        |' print '\\---/'

def errorExit: print 'The program was forced to exit prematurely, printing debug log...\n' for line in debugLog: print line sys.exit
 * 1) Error handling function#####################################################################################################
 * 2) >Is called when the script encounters a fatal error                                                                       #
 * 3) >Prints the debug log to standard out (usually the screen)

def mkRange(numList): outList = [] start  = numList[0] current = numList[0] numList.append(numList[-1]) for num in numList[1:]: if (num - 1) != current: outList.append([start,current]) start = num current = num return outList
 * 1) >function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)              #
 * 1) >function designed to make a list of ranges (e.g. 2-5, 8-10) out of a list of integers (e.g. 2,3,4,5,8,9,10)              #

if __name__ == '__main__': #If the program is being called independent of the Python GUI, IDLE...   if argNum > minArgNum: #If at least the minimum number of arguments necessary is supplied... if os.path.exists(argList[-1]) == 0: #if the path dose not exist debugLog.append('Error: Invalid file path to fasta file') errorExit #end the program elif argNum == 1: #If no arguments are supplied... helpOnly = True printHelp #prints help menu else: debugLog.append('Error: Too few arguments supplied\n') errorExit #end the program else: #If the script is being imported on to IDLE argList = defArgList #use default arguments argNum  = len(defArgList) if argNum == 1: #If no arguments are supplied... helpOnly = True printHelp #prints help menu savePath = defSavePath #sets the save path to the default, specified in the variable initialization section debugLog.append('Alert: default debugging input arguments are being used\n') if helpOnly == False: #if arguments were supplied... faPath = argList[-1] #the path to the fasta file containing the input data if argNum > minArgNum + 1: #if modifiers are present (i.e. more the minimum number of arguments) modifiers = argList[1:-minArgNum] #everything before the required arguments are modifiers and their argum
 * 1) Argument Interprtation####################################################################################################

if helpOnly == False: ###Modifier Interpretation############################################################################################### #>Parses any modifiers and modifier arguments determined by the previous section of code, "Argument Interpretation"    # #>Given arguments are compared against a list of known arguments                                                       # #>Matches found change the appropriate variable for the desired effect of the modifier the script                      # if len(modifiers) > 0: #if modifiers are supplied for mod in modifiers: #loops through the list of modifiers and modifier arguments if mod[0] == '-' and len(mod) == 2: #list entry considered modifier if it starts with - and is only two characters activeMods += mod[1:] #sorts the modifiers into activeMods... else: modArgs.append(mod) #assumes everything else to be a modifier argument for letter in activeMods: #checks if the modifiers are recognized if string.find(allMods,letter) == -1: #checks if the modifier is recognized by this script debugLog.append('Warning: Unexpected modifier: ' + letter + '\n') warning = True #if the input modifier is not found else: if letter == 'd': #if -d is supplied... saveDebug = True #The debug log will be saved to the current working directory if letter == 'p': #if -p is supplied... printOut = True #The results will be printed to the standard output (usually the screen) if letter == 'n': #if -n is supplied... if len(modArgs) > 0: #if there is at least one non-processed modifier argument topNum = int(modArgs[0]) #The string specifying the desired lines to process from the input file del modArgs[0] #the original argument is deleted from the list else: #if the list of modifier arguments is empty... print 'Error: Modifier argument not supplied\n' sys.exit #exit the script #########################################################################################################################

###Query fasta input data parsing and file procedures#################################################################### #>extracts the input data from the files at the supplied path                                                          # #>Parses the data into a list in the following format [[name of contig, sequence],[..]..]                              #    faHandle  = open(faPath, 'r') #creates a file object    faRaw     = faHandle.readlines #all of the content of the input file in its raw form    faHandle.close #closes the file object    faRaw.append('> ') #makes it so the last sequence can be processed in the following loop    cntgSeq =     cntgName =     for line in faRaw: #loops through the lines of the input file        if line[0] == '>': #if the first character of the line is a '>'            if len(cntgName) > 0 and len(cntgSeq) > 0:                faData.append([cntgName,cntgSeq])            cntgSeq = ''            cntgName = line[1:-1]       #[1:-1] removes '>' and '\n'        else:            if line[0] == ' ': #if the line begins with a space...                line = line[1:] #remove the space            if line[-1] == '\n': #if the line ends with a newline...                line = line[:-1] #remove the newline            cntgSeq += line #add contents of the line to the total sequence for this contig    #########################################################################################################################

###Pointless spacer removal############################################################################################## delList = [] #will contain the indices at which all of the sequences have '-' and must be deleted refData = faData[0][1] #the reference is assumed to be the first sequence (even if the -n modifier is set to 0) for refIndex in range(0,len(refData)): #loops through each index in terms of the reference pointlessSpace = True #each index is assumed to be a space for cntgIndex in range(topNum,len(faData)): #loops through all the sequences that are being checked for spaces seqLen = len(faData[cntgIndex][1]) if refIndex < seqLen: #if the index being tested is present on this contig...                bp = faData[cntgIndex][1][refIndex] if bp != '-': #if the base is not a space... pointlessSpace = False if pointlessSpace: delList.append(refIndex) if len(delList) > 0: delList = mkRange(delList) #makes a list of indices into a list of index ranges delList.reverse #reverses the order of the list to preserve the accuracy of the index values in delList as they are deleted from faData for theRange in delList: #for every range specified by delList...           for cntgIndex in range(0,len(faData)): #for every contig is faData... faData[cntgIndex][1] = faData[cntgIndex][1][:theRange[0]] + faData[cntgIndex][1][theRange[1] + 1:] #remove the section determined to be spaces else: print 'No gaps found...' #########################################################################################################################

###Out file writing and saveing procedures############################################################################### fileSavePath = savePath + 'GapStrip.' + os.path.basename(faPath) #the path to where the output is saved outHandle = open(fileSavePath, 'w') #opens the file object for saving the output for line in faData: #prints every line of outData to the output file... outHandle.write('>' + line[0] + '\n') #write contig name outHandle.write(line[1] + '\n') #write contig sequence if printOut: #if the -p modifier is supplied for line in faData: #prints every line to the standard output print '>' + line[0] print line[1] outHandle.close #########################################################################################################################

###Debug Saving procedures############################################################################################### if saveDebug: #if the -d modifier is supplied fileSavePath = savePath + 'GSDebug.' + os.path.basename(faPath) #the path to where the debug log is to be saved debugHandle = open(fileSavePath, 'w') #opens the file object for saving the debug log for line in debugLog: debugHandle.write(line) debugHandle.close #########################################################################################################################