Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-5-3

From OpenWetWare
Jump to navigationJump to search

The Completed Script! (minus the interaction with the web interface) You can run this from a terminal and it works great but you need to download feedparser from Google and the CA dx05.txt file from http://www.oshpd.ca.gov/hqad/PatientLevel/ICD9_Codes/index.htm .

The basic sequence of steps is Sequence->blastSNP->rs#(and other info) (xiaodi and katie) which is checked to make sure it's actually valid (chris' code) ->OMIM search for info (xiaodi and kay). The rs# is then also used to generate mesh terms from PubMed (resmi and cynthia). Zach's code figures out which are most relevant and uses those to look at the potential disease and its prevalence (currently california state prevelance data). Those good mesh terms are also used with Deniz' code to get news about the disease. Then all this output is fed to xiaodi's web interface. Mike's code and Tiff's code didn't get incorporated but is still relevant and can be found on their respective pages. Mike's was not incorporated because it broke on too many rs#s and we haven't really dealt with robustness at this stage. Tiff's was not incorporated because it spit out URLs that we couldn't work with well at this stage. Hetmann worked on documentation and a summary. <syntax type='python'> from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys import pickle import os import string import urllib from Bio import PubMed from Bio import Medline import feedparser id = "1" outputlist = []

  1. All the relevant output will be to a file.

output_file = file("out" + id + ".txt", 'w')

  1. C-style struct to pass parameters

class AllelicVariant: pass

  1. basic function to open fasta file and get sequence

def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next() return record.seq.data

  1. lookup blast snp data

def blast_snp_lookup(seq): result_handle = NCBIWWW.qblast("blastn", "snp/human_9606/human_9606", seq) return result_handle.read()

  1. basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html

def get_text(node_list): rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc

  1. extracts snp data

def extract_snp_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed = [] for v in variants: # now populate the struct hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes) id_query = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes) id_hit = get_text(v.getElementsByTagName("Hsp_qseq")[0].childNodes) score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes) id = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes) # extract position of the SNP from Hit Definition lower_bound = hit_def.find("pos=")+4 upper_bound = hit_def.find("len=")-1 position = int(hit_def[lower_bound:upper_bound]) # only consider it a genuine snp if the hit score is above 100, # the query/hit sequences are longer than the position of the SNP # and the query sequence matches the hit sequence at the SNP position if int(score) > 100 and position < len(id_hit) and id_query[position] == id_hit[position]: parsed.append(id) return parsed

  1. queries the database and returns all info in an XML format

def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient() query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records

  1. extracts allelic variant data, as the name implies, using the struct above

def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant() # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed

  1. Kay's stuff - OMIM stuff

# queries the SNP database and returns geneID tag as a string

    1. Currently, DBIdsClient does not support snp parsing - so it's not used.
    2. A future update should correct this when possible, for ease of reading.

def parse_geneID_tag(snp_id): try: SNP_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp' url = SNP_URL + '&id=' + snp_id + '&mode=xml' dom = minidom.parse(urllib.urlopen(url)) symbol = dom.getElementsByTagName("FxnSet_symbol") return symbol[0].toxml().split('>')[1].split('<')[0] except: return ""

    1. Note: This code is a temporary solution to a dbSNP formatting issue.
    2. Older entries are best searched directly by ID #.
    3. - > this case is the first covered
    4. Newer entries are not indexed in this fashion, although SNP ID data is
    5. available on the individual entry. These contain Allelic Variant data
    6. which is located and extracted by the script
    7. - > this case is the second covered
  1. takes SNP ID and gets search results from OMIM in XML format

def snp_to_omim(snp_id): records = omim_snp_search(snp_id) if records == list(): tag_id = parse_geneID_tag(snp_id) if len(tag_id) == 0: return "" records = omim_tag_search(tag_id) return records

      1. cynthias section ###
  1. parses a mesh term to remove * and /

def parse_term(str, bool): parsed_term = str if(bool): parsed_term = parsed_term.replace('*', ) if str.find('/') != -1: parsed_term = parsed_term.replace('/', ' ') return parsed_term

  1. parses list of mesh terms
  2. returns embedded list, one with all terms and one major terms

def parse_mesh(list): all_mesh_terms = [] major_mesh_terms = [] mesh_term = for i in range(len(list)): major = False if list[i].find('*') == -1: mesh_term = parse_term(list[i], major) all_mesh_terms.append(mesh_term) else: major = True mesh_term = parse_term(list[i], major) major_mesh_terms.append(mesh_term) all_mesh_terms.append(mesh_term) all_mesh = [all_mesh_terms, major_mesh_terms] return all_mesh

  1. DENIZ's stuff
  2. empty class

class Feedoutput: pass

  1. I want a single string as input. This should be done as often as needed for multiple strings

def outputzilla(inputstring): #This will parse the blogs we want blogresults = inputstring.replace(' ', '+') medstory_results = inputstring.replace(' ', '%20') medstory_root = 'http://www.medstory.com/rss?q=' + medstory_results medstory_web = medstory_root + '&af=true&c=true&s=Web&i=' medstory_news = medstory_root + '&af=false&c=true&s=NewsFeed&i=' medstory_multimedia = medstory_root + '&af=true&c=true&s=AudioVideo&i=' medstory_clinical = medstory_root + '&af=true&c=true&s=ClinicalTrial&i=' medstory_research = medstory_root + '&af=false&c=true&s=Research&i=' technorati_blog = 'http://feeds.technorati.com/search/' + blogresults + '?authority=a7' #The feeds output = Feedoutput() output.web_feed = feedparser.parse(medstory_web) output.news_feed = feedparser.parse(medstory_news) output.multimedia_feed = feedparser.parse(medstory_multimedia) output.clinical_feed = feedparser.parse(medstory_clinical) output.research_feed = feedparser.parse(medstory_research) output.blog_feed = feedparser.parse(technorati_blog) return output

  1. LO AND BEHOLD Here are the variables that you have access to
  2. output.web_feed.entries[0].title
  3. output.web_feed.entries[0].description
  4. output.web_feed.entries[0].link
  5. output.news_feed.entries[0].title
  2. read sequence from file

sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait."

  1. look up data

b = blast_snp_lookup(sequence)

  1. extract data

e = extract_snp_data(b) print e outputlist.append(str(len(e)) + " single nucleotide polymorphism(s) found:\n") if len(e) > 0: for snp_id in e: outputlist.append(snp_id + "\n") else: outputlist.append("[none]\n") print "Didn't find any snp. Guess you're healthy :)" sys.exit() # nothing more to be done if no snp found print "got a snp... still thinking..." TEMP = ['rs11200638']

  1. do an omim search on each snp

for snp_id in e: o = snp_to_omim(snp_id) if len(o) == 0: # there was some problem with this way of doing the omim lookup for this rs number. so try xiaodi's way. o = omim_snp_search(snp_id) if len(o) == 0: outputlist.append("No information found for " + snp_id + "\n") # nothing more to be done if no records can be found # otherwise, find the allelic variant data outputlist.append(snp_id + " details:" + "\n") for i in o: v = extract_allelic_variant_data(i.read()) if v != None: for a in v: outputlist.append(a.name + "\n") outputlist.append(a.mutation + "\n") outputlist.append(a.description + "\n") # The mesh terms stuff - THIS NEEDS TO BE ADDED TO OUTPUTLIST at some point article_ids = PubMed.search_for(snp_id) rec_parser = Medline.RecordParser() medline_dict = PubMed.Dictionary(parser = rec_parser) all_mesh = [] all_mesh_terms = [] major_mesh_terms = [] for did in article_ids[0:5]: cur_record = medline_dict[did] # print '\n', cur_record.title, cur_record.authors, cur_record.source mesh_headings = cur_record.mesh_headings if len(mesh_headings) != 0: all_mesh = parse_mesh(mesh_headings) all_mesh_terms.extend(all_mesh[0]) major_mesh_terms.extend(all_mesh[1]) print "ALL MESH TERMS", '\n', all_mesh_terms, '\n', major_mesh_terms

  1. Problem to solve: Although Cynthia's output parses out the
  2. major mesh terms from the paper, each mesh term can give a hit
  3. ie: it does not figure out which is really the "major" major mesh term.
  4. I intend to solve it by hitting against the title to see if there are matches
  5. and by removing connecting words from the mesh terms.
    1. modifying cynthia's output

new_term = [] for term in major_mesh_terms: test = term.split(" ") #split by " " temp = "" for test_term in test: if len(test_term) <= 3: #if it is a small connecting word pass else: if temp != "": temp = temp + "+" temp = temp + test_term else: temp = test_term #print temp new_term.append(temp) titles = []

  1. create list of article id's

for did in article_ids: cur_record = medline_dict[did] titles.append(cur_record.title)

  1. titles = a list of all titles returned by PubMed

count = 0 #counter bestterm = [0,"ERROR"] for term in new_term: count = 0 test = term.split("+") #split by "+" for test_term in test: #hit each word in term against title for title in titles: if str.upper(title).find(str.upper(test_term)) != -1: #if found the term in title count += 1 if count > bestterm[0]: bestterm[0] = count bestterm[1] = term #print bestterm[1] #bestterm[1] is the main disease name if(bestterm[1] == "ERROR"): # if not hit print "Warning: There are no relevant mesh terms for the passed rs#!" sys.exit(0) print "BEST TERMS:", bestterm[1] fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file line = fh.readline() disease_name = bestterm[1] #INSERT DISEASE NAME HERE FOR TESTING ICD9code = [] ICD9dName = [] found = 0

  1. queueing http://icd9cm.chrisendres.com for code lookup

queue_name = 'http://icd9cm.chrisendres.com/index.php?action=search&srchtype=diseases&srchtext=' queue_name = queue_name + disease_name code_lookup = urllib.urlopen(queue_name).read() #send queue request to site, returns dirty html out = open("index.txt", "w") #write dirty html to file out.write(code_lookup) out.close() tempName = "" readCode = open("index.txt", "r") #read dirty html lookup_line = readCode.readline() print "***ICD9 and hits, arranged by importance***\n" while lookup_line: w= lookup_line.find("

") #the unique marker before the disease

if w != -1: #if it is found tempCode = lookup_line[32:40] #code in this section tempCode = string.split(tempCode, ' ') #split into number print tempCode ICD9code.append(tempCode[0])

w = lookup_line.find("

") if w != -1: #if there is a marker (junk html)

tempName = lookup_line[32:w] tempName = tempName[tempName.find(" ")+1:] print tempName ICD9dName.append(tempName) else: tempName = lookup_line[32:len(lookup_line)-7] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) lookup_line = readCode.readline()

  1. searching incidence data in CA data file
  2. note: returns class of hits

print "\n\n***Prevalance data per IDC9 above***" print "Note: returns incidence data/yr, including subclasses\n"

fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file

  1. new data structure: [0] = code, [1] = name, [2] = incidence

ICD9_prev_output= [[],[],[]]

for (code,name) in zip(ICD9code,ICD9dName): fh.seek(0) line = fh.readline() totalIncidence = 0 sumCount = 0 while line: #for every line in the data line = line[:-1] #remove /n lineVector = string.split(line, ',') #split to vector if lineVector[0].find(code) != -1: #if disease hit totalIncidence += int(lineVector[1]) sumCount += 1 else: if sumCount > 0: print "ID: ", code, "name: ", name, "incidence: ", totalIncidence ICD9_prev_output[0].append(code) ICD9_prev_output[1].append(name) ICD9_prev_output[2].append(totalIncidence) totalIncidence = 0 sumCount = 0 line = fh.readline() fh.close()

  1. returns the most relevant ICD9 information

mainCode = 0 mainName = 0 mainIncidence = 0

counter = 0 for c,n,incid in zip(ICD9_prev_output[0], ICD9_prev_output[1], ICD9_prev_output[2]): if incid > counter: counter = incid mainCode = c mainName = n mainIncidence = incid

print "THE MOST RELEVANT HIT:\n" print "CDC CODE: ", mainCode print "Disease Name: ", mainName print "Incidence/yr in CA: " , mainIncidence

# CALL DENIZ'S FUNCTION TO GET NEWS ABOUT DISEASE # use the best term that was figured out before temp_terms = str(bestterm[1]).split('+') forxiaodi = [] for term in temp_terms: forxiaodi.append(outputzilla(str(term))) print "did deniz' news stuff again"

  1. Some of the output stuff

for item in outputlist: print item pickle.dump(outputlist, output_file) print "wrote pickled version to file"

output_file.close() </syntax>