Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-5-3

The Completed Script! (minus the interaction with the web interface) You can run this from a terminal and it works great but you need to download feedparser from Google and the CA dx05.txt file from http://www.oshpd.ca.gov/hqad/PatientLevel/ICD9_Codes/index.htm.

The basic sequence of steps is Sequence->blastSNP->rs#(and other info) (xiaodi and katie) which is checked to make sure it's actually valid (chris' code) ->OMIM search for info (xiaodi and kay). The rs# is then also used to generate mesh terms from PubMed (resmi and cynthia). Zach's code figures out which are most relevant and uses those to look at the potential disease and its prevalence (currently california state prevelance data). Those good mesh terms are also used with Deniz' code to get news about the disease. Then all this output is fed to xiaodi's web interface. Mike's code and Tiff's code didn't get incorporated but is still relevant and can be found on their respective pages. Mike's was not incorporated because it broke on too many rs#s and we haven't really dealt with robustness at this stage. Tiff's was not incorporated because it spit out URLs that we couldn't work with well at this stage. Hetmann worked on documentation and a summary.



from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys import pickle import os import string import urllib from Bio import PubMed from Bio import Medline import feedparser

id = "1" outputlist = []

output_file = file("out" + id + ".txt", 'w')
 * 1) All the relevant output will be to a file.

class AllelicVariant: pass def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next return record.seq.data def blast_snp_lookup(seq): result_handle = NCBIWWW.qblast("blastn", "snp/human_9606/human_9606", seq) return result_handle.read def get_text(node_list): rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc def extract_snp_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed = [] for v in variants: # now populate the struct hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes) id_query = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes) id_hit = get_text(v.getElementsByTagName("Hsp_qseq")[0].childNodes) score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes) id = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes) # extract position of the SNP from Hit Definition lower_bound = hit_def.find("pos=")+4 upper_bound = hit_def.find("len=")-1 position = int(hit_def[lower_bound:upper_bound]) # only consider it a genuine snp if the hit score is above 100, # the query/hit sequences are longer than the position of the SNP # and the query sequence matches the hit sequence at the SNP position if int(score) > 100 and position < len(id_hit) and id_query[position] == id_hit[position]: parsed.append(id) return parsed
 * 1) C-style struct to pass parameters
 * 1) basic function to open fasta file and get sequence
 * 1) lookup blast snp data
 * 1) basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
 * 1) extracts snp data

def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed
 * 1) queries the database and returns all info in an XML format
 * 1) extracts allelic variant data, as the name implies, using the struct above

def parse_geneID_tag(snp_id): try: SNP_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp' url = SNP_URL + '&id=' + snp_id + '&mode=xml' dom = minidom.parse(urllib.urlopen(url)) symbol = dom.getElementsByTagName("FxnSet_symbol") return symbol[0].toxml.split('>')[1].split('<')[0] except: return ""
 * 1) Kay's stuff - OMIM stuff
 * 2) queries the SNP database and returns geneID tag as a string
 * 3) Currently, DBIdsClient does not support snp parsing - so it's not used.
 * 4) A future update should correct this when possible, for ease of reading.


 * 1) Note: This code is a temporary solution to a dbSNP formatting issue.
 * 2) Older entries are best searched directly by ID #.
 * 3)  - > this case is the first covered
 * 4) Newer entries are not indexed in this fashion, although SNP ID data is
 * 5)  available on the individual entry.  These contain Allelic Variant data
 * 6)  which is located and extracted by the script
 * 7)  - > this case is the second covered

def snp_to_omim(snp_id): records = omim_snp_search(snp_id)
 * 1) takes SNP ID and gets search results from OMIM in XML format

if records == list: tag_id = parse_geneID_tag(snp_id) if len(tag_id) == 0: return "" records = omim_tag_search(tag_id)

return records
 * 1) cynthias section ###

def parse_term(str, bool): parsed_term = str if(bool): parsed_term = parsed_term.replace('*', '') if str.find('/') != -1: parsed_term = parsed_term.replace('/', ' ') return parsed_term
 * 1) parses a mesh term to remove * and /

def parse_mesh(list): all_mesh_terms = [] major_mesh_terms = [] mesh_term = '' for i in range(len(list)): major = False if list[i].find('*') == -1: mesh_term = parse_term(list[i], major) all_mesh_terms.append(mesh_term) else: major = True mesh_term = parse_term(list[i], major) major_mesh_terms.append(mesh_term) all_mesh_terms.append(mesh_term) all_mesh = [all_mesh_terms, major_mesh_terms] return all_mesh
 * 1) parses list of mesh terms
 * 2) returns embedded list, one with all terms and one major  terms

class Feedoutput: pass
 * 1) DENIZ's stuff
 * 2) empty class

def outputzilla(inputstring): #This will parse the blogs we want blogresults = inputstring.replace(' ', '+')
 * 1) I want a single string as input. This should be done as often as needed for multiple strings

medstory_results = inputstring.replace(' ', '%20') medstory_root = 'http://www.medstory.com/rss?q=' + medstory_results

medstory_web = medstory_root + '&af=true&c=true&s=Web&i=' medstory_news = medstory_root + '&af=false&c=true&s=NewsFeed&i=' medstory_multimedia = medstory_root + '&af=true&c=true&s=AudioVideo&i=' medstory_clinical = medstory_root + '&af=true&c=true&s=ClinicalTrial&i=' medstory_research = medstory_root + '&af=false&c=true&s=Research&i=' technorati_blog = 'http://feeds.technorati.com/search/' + blogresults + '?authority=a7'

#The feeds output = Feedoutput output.web_feed = feedparser.parse(medstory_web) output.news_feed = feedparser.parse(medstory_news) output.multimedia_feed = feedparser.parse(medstory_multimedia) output.clinical_feed = feedparser.parse(medstory_clinical) output.research_feed = feedparser.parse(medstory_research) output.blog_feed = feedparser.parse(technorati_blog)

return output


 * 1) LO AND BEHOLD Here are the variables that you have access to
 * 2) output.web_feed.entries[0].title
 * 3) output.web_feed.entries[0].description
 * 4) output.web_feed.entries[0].link
 * 5) output.news_feed.entries[0].title

sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait." b = blast_snp_lookup(sequence) e = extract_snp_data(b) print e outputlist.append(str(len(e)) + " single nucleotide polymorphism(s) found:\n") if len(e) > 0: for snp_id in e:		outputlist.append(snp_id + "\n") else: outputlist.append("[none]\n") print "Didn't find any snp. Guess you're healthy :)"	sys.exit # nothing more to be done if no snp found print "got a snp... still thinking..." TEMP = ['rs11200638'] for snp_id in e:	o = snp_to_omim(snp_id)	if len(o) == 0:		# there was some problem with this way of doing the omim lookup for this rs number. so try xiaodi's way. 		o = omim_snp_search(snp_id)	if len(o) == 0:		outputlist.append("No information found for " + snp_id + "\n")      	# nothing more to be done if no records can be found	# otherwise, find the allelic variant data	outputlist.append(snp_id + " details:" + "\n")	for i in o:		v = extract_allelic_variant_data(i.read)		if v != None:			for a in v:				outputlist.append(a.name + "\n")				outputlist.append(a.mutation + "\n")				outputlist.append(a.description + "\n")
 * 1) BEGIN ACTUAL PROGRAM
 * 2) read sequence from file
 * 1) look up data
 * 1) extract data
 * 1) do an omim search on each snp

# The mesh terms stuff - THIS NEEDS TO BE ADDED TO OUTPUTLIST at some point article_ids = PubMed.search_for(snp_id) rec_parser = Medline.RecordParser medline_dict = PubMed.Dictionary(parser = rec_parser) all_mesh = [] all_mesh_terms = [] major_mesh_terms = [] for did in article_ids[0:5]: cur_record = medline_dict[did] # print '\n', cur_record.title, cur_record.authors, cur_record.source mesh_headings = cur_record.mesh_headings if len(mesh_headings) != 0: all_mesh = parse_mesh(mesh_headings) all_mesh_terms.extend(all_mesh[0]) major_mesh_terms.extend(all_mesh[1])

print "ALL MESH TERMS", '\n', all_mesh_terms, '\n', major_mesh_terms


 * 1) Problem to solve: Although Cynthia's output parses out the
 * 2) major mesh terms from the paper, each mesh term can give a hit
 * ie: it does not figure out which is really the "major" major mesh term.
 * 1) I intend to solve it by hitting against the title to see if there are matches
 * 2) and by removing connecting words from the mesh terms.
 * 1) and by removing connecting words from the mesh terms.


 * 1) modifying cynthia's output

new_term = [] for term in major_mesh_terms: test = term.split(" ") #split by " " temp = "" for test_term in test: if len(test_term) <= 3: #if it is a small connecting word pass else: if temp != "": temp = temp + "+" temp = temp + test_term else: temp = test_term #print temp new_term.append(temp) titles = [] for did in article_ids: cur_record = medline_dict[did] titles.append(cur_record.title)
 * 1) create list of article id's

count = 0 #counter bestterm = [0,"ERROR"] for term in new_term: count = 0 test = term.split("+") #split by "+" for test_term in test: #hit each word in term against title for title in titles: if str.upper(title).find(str.upper(test_term)) != -1: #if found the term in title count += 1 if count > bestterm[0]: bestterm[0] = count bestterm[1] = term
 * 1) titles = a list of all titles returned by PubMed

#print bestterm[1] #bestterm[1] is the main disease name

if(bestterm[1] == "ERROR"): # if not hit print "Warning: There are no relevant mesh terms for the passed rs#!" sys.exit(0) print "BEST TERMS:", bestterm[1] fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file line = fh.readline disease_name = bestterm[1] #INSERT DISEASE NAME HERE FOR TESTING ICD9code = [] ICD9dName = [] found = 0 queue_name = 'http://icd9cm.chrisendres.com/index.php?action=search&srchtype=diseases&srchtext=' queue_name = queue_name + disease_name code_lookup = urllib.urlopen(queue_name).read #send queue request to site, returns dirty html out = open("index.txt", "w") #write dirty html to file out.write(code_lookup) out.close tempName = "" readCode = open("index.txt", "r") #read dirty html lookup_line = readCode.readline print "***ICD9 and hits, arranged by importance***\n" while lookup_line: w= lookup_line.find(" ") #the unique marker before the disease if w != -1: #if it is found tempCode = lookup_line[32:40] #code in this section tempCode = string.split(tempCode, ' ') #split into number print tempCode ICD9code.append(tempCode[0]) w = lookup_line.find(" ") if w != -1: #if there is a marker (junk html) tempName = lookup_line[32:w] tempName = tempName[tempName.find(" ")+1:] print tempName ICD9dName.append(tempName) else: tempName = lookup_line[32:len(lookup_line)-7] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) lookup_line = readCode.readline
 * 1) queueing http://icd9cm.chrisendres.com for code lookup
 * 1) queueing http://icd9cm.chrisendres.com for code lookup
 * 1) queueing http://icd9cm.chrisendres.com for code lookup



print "\n\n***Prevalance data per IDC9 above***" print "Note: returns incidence data/yr, including subclasses\n"
 * 1) searching incidence data in CA data file
 * 2) note: returns class of hits
 * 1) note: returns class of hits

fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file

ICD9_prev_output= ],[],[
 * 1) new data structure: [0] = code, [1] = name, [2] = incidence

for (code,name) in zip(ICD9code,ICD9dName): fh.seek(0) line = fh.readline totalIncidence = 0 sumCount = 0 while line: #for every line in the data line = line[:-1] #remove /n lineVector = string.split(line, ',') #split to vector if lineVector[0].find(code) != -1: #if disease hit totalIncidence += int(lineVector[1]) sumCount += 1 else: if sumCount > 0: print "ID: ", code, "name: ", name, "incidence: ", totalIncidence ICD9_prev_output[0].append(code) ICD9_prev_output[1].append(name) ICD9_prev_output[2].append(totalIncidence) totalIncidence = 0 sumCount = 0 line = fh.readline fh.close

mainCode = 0 mainName = 0 mainIncidence = 0 counter = 0 for c,n,incid in zip(ICD9_prev_output[0], ICD9_prev_output[1], ICD9_prev_output[2]): if incid > counter: counter = incid mainCode = c					mainName = n					mainIncidence = incid
 * 1) returns the most relevant ICD9 information
 * 1) THIS IS THE DATA YOU PROBABLY WANT

print "THE MOST RELEVANT HIT:\n" print "CDC CODE: ", mainCode print "Disease Name: ", mainName print "Incidence/yr in CA: ", mainIncidence # CALL DENIZ'S FUNCTION TO GET NEWS ABOUT DISEASE # use the best term that was figured out before temp_terms = str(bestterm[1]).split('+') forxiaodi = [] for term in temp_terms: forxiaodi.append(outputzilla(str(term))) print "did deniz' news stuff again"

for item in outputlist: print item pickle.dump(outputlist, output_file) print "wrote pickled version to file"
 * 1) Some of the output stuff

output_file.close