Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-5-3: Difference between revisions
(New page: <syntax type='python'> from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from t...) |
(No difference)
|
Revision as of 01:05, 3 May 2007
<syntax type='python'>
from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys import pickle import os import string import urllib from Bio import PubMed from Bio import Medline import feedparser
id = "1" outputlist = []
- All the relevant output will be to a file.
output_file = file("out" + id + ".txt", 'w')
- C-style struct to pass parameters
class AllelicVariant: pass
- basic function to open fasta file and get sequence
def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next() return record.seq.data
- lookup blast snp data
def blast_snp_lookup(seq): result_handle = NCBIWWW.qblast("blastn", "snp/human_9606/human_9606", seq) return result_handle.read()
- basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc
- extracts snp data
def extract_snp_data(str):
dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed = [] for v in variants: # now populate the struct hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes) id_query = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes) id_hit = get_text(v.getElementsByTagName("Hsp_qseq")[0].childNodes) score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes) id = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes) # extract position of the SNP from Hit Definition lower_bound = hit_def.find("pos=")+4 upper_bound = hit_def.find("len=")-1 position = int(hit_def[lower_bound:upper_bound]) # only consider it a genuine snp if the hit score is above 100, # the query/hit sequences are longer than the position of the SNP # and the query sequence matches the hit sequence at the SNP position if int(score) > 100 and position < len(id_hit) and id_query[position] == id_hit[position]: parsed.append(id) return parsed
- queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient() query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records
- extracts allelic variant data, as the name implies, using the struct above
def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant() # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed
- Kay's stuff - OMIM stuff
# queries the SNP database and returns geneID tag as a string
- Currently, DBIdsClient does not support snp parsing - so it's not used.
- A future update should correct this when possible, for ease of reading.
def parse_geneID_tag(snp_id): try: SNP_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp' url = SNP_URL + '&id=' + snp_id + '&mode=xml' dom = minidom.parse(urllib.urlopen(url)) symbol = dom.getElementsByTagName("FxnSet_symbol") return symbol[0].toxml().split('>')[1].split('<')[0] except: return ""
- Note: This code is a temporary solution to a dbSNP formatting issue.
- Older entries are best searched directly by ID #.
- - > this case is the first covered
- Newer entries are not indexed in this fashion, although SNP ID data is
- available on the individual entry. These contain Allelic Variant data
- which is located and extracted by the script
- - > this case is the second covered
- takes SNP ID and gets search results from OMIM in XML format
def snp_to_omim(snp_id):
records = omim_snp_search(snp_id)
if records == list(): tag_id = parse_geneID_tag(snp_id)
if len(tag_id) == 0: return ""
records = omim_tag_search(tag_id)
return records
- cynthias section ###
- parses a mesh term to remove * and /
def parse_term(str, bool):
parsed_term = str if(bool): parsed_term = parsed_term.replace('*', ) if str.find('/') != -1: parsed_term = parsed_term.replace('/', ' ') return parsed_term
- parses list of mesh terms
- returns embedded list, one with all terms and one major terms
def parse_mesh(list):
all_mesh_terms = [] major_mesh_terms = [] mesh_term = for i in range(len(list)): major = False if list[i].find('*') == -1: mesh_term = parse_term(list[i], major) all_mesh_terms.append(mesh_term) else: major = True mesh_term = parse_term(list[i], major) major_mesh_terms.append(mesh_term) all_mesh_terms.append(mesh_term) all_mesh = [all_mesh_terms, major_mesh_terms] return all_mesh
- DENIZ's stuff
- empty class
class Feedoutput: pass
- I want a single string as input. This should be done as often as needed for multiple strings
def outputzilla(inputstring): #This will parse the blogs we want blogresults = inputstring.replace(' ', '+')
medstory_results = inputstring.replace(' ', '%20') medstory_root = 'http://www.medstory.com/rss?q=' + medstory_results
medstory_web = medstory_root + '&af=true&c=true&s=Web&i=' medstory_news = medstory_root + '&af=false&c=true&s=NewsFeed&i=' medstory_multimedia = medstory_root + '&af=true&c=true&s=AudioVideo&i=' medstory_clinical = medstory_root + '&af=true&c=true&s=ClinicalTrial&i=' medstory_research = medstory_root + '&af=false&c=true&s=Research&i=' technorati_blog = 'http://feeds.technorati.com/search/' + blogresults + '?authority=a7'
#The feeds output = Feedoutput() output.web_feed = feedparser.parse(medstory_web) output.news_feed = feedparser.parse(medstory_news) output.multimedia_feed = feedparser.parse(medstory_multimedia) output.clinical_feed = feedparser.parse(medstory_clinical) output.research_feed = feedparser.parse(medstory_research) output.blog_feed = feedparser.parse(technorati_blog)
return output
- LO AND BEHOLD Here are the variables that you have access to
- output.web_feed.entries[0].title
- output.web_feed.entries[0].description
- output.web_feed.entries[0].link
- output.news_feed.entries[0].title
- BEGIN ACTUAL PROGRAM
- read sequence from file
sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait."
- look up data
b = blast_snp_lookup(sequence)
- extract data
e = extract_snp_data(b) print e outputlist.append(str(len(e)) + " single nucleotide polymorphism(s) found:\n") if len(e) > 0: for snp_id in e: outputlist.append(snp_id + "\n") else: outputlist.append("[none]\n") print "Didn't find any snp. Guess you're healthy :)" sys.exit() # nothing more to be done if no snp found
print "got a snp... still thinking..." TEMP = ['rs11200638']
- do an omim search on each snp
for snp_id in e: o = snp_to_omim(snp_id) if len(o) == 0: # there was some problem with this way of doing the omim lookup for this rs number. so try xiaodi's way. o = omim_snp_search(snp_id) if len(o) == 0: outputlist.append("No information found for " + snp_id + "\n")
# nothing more to be done if no records can be found
# otherwise, find the allelic variant data outputlist.append(snp_id + " details:" + "\n") for i in o: v = extract_allelic_variant_data(i.read()) if v != None: for a in v: outputlist.append(a.name + "\n") outputlist.append(a.mutation + "\n") outputlist.append(a.description + "\n")
# The mesh terms stuff - THIS NEEDS TO BE ADDED TO OUTPUTLIST at some point
article_ids = PubMed.search_for(snp_id)
rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser = rec_parser)
all_mesh = [] all_mesh_terms = [] major_mesh_terms = [] for did in article_ids[0:5]: cur_record = medline_dict[did] # print '\n', cur_record.title, cur_record.authors, cur_record.source mesh_headings = cur_record.mesh_headings if len(mesh_headings) != 0: all_mesh = parse_mesh(mesh_headings) all_mesh_terms.extend(all_mesh[0]) major_mesh_terms.extend(all_mesh[1])
print "ALL MESH TERMS", '\n', all_mesh_terms, '\n', major_mesh_terms
- Problem to solve: Although Cynthia's output parses out the
- major mesh terms from the paper, each mesh term can give a hit
- ie: it does not figure out which is really the "major" major mesh term.
- I intend to solve it by hitting against the title to see if there are matches
- and by removing connecting words from the mesh terms.
- modifying cynthia's output
new_term = [] for term in major_mesh_terms: test = term.split(" ") #split by " " temp = "" for test_term in test: if len(test_term) <= 3: #if it is a small connecting word pass else: if temp != "": temp = temp + "+" temp = temp + test_term else: temp = test_term
#print temp
new_term.append(temp) titles = []
- create list of article id's
for did in article_ids: cur_record = medline_dict[did] titles.append(cur_record.title)
- titles = a list of all titles returned by PubMed
count = 0 #counter bestterm = [0,"ERROR"] for term in new_term: count = 0 test = term.split("+") #split by "+" for test_term in test: #hit each word in term against title for title in titles: if str.upper(title).find(str.upper(test_term)) != -1: #if found the term in title count += 1 if count > bestterm[0]: bestterm[0] = count bestterm[1] = term
#print bestterm[1] #bestterm[1] is the main disease name
if(bestterm[1] == "ERROR"): # if not hit print "Warning: There are no relevant mesh terms for the passed rs#!" sys.exit(0) print "BEST TERMS:", bestterm[1] fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file line = fh.readline() disease_name = bestterm[1] #INSERT DISEASE NAME HERE FOR TESTING ICD9code = [] ICD9dName = [] found = 0
- queueing http://icd9cm.chrisendres.com for code lookup
queue_name = 'http://icd9cm.chrisendres.com/index.php?action=search&srchtype=diseases&srchtext=' queue_name = queue_name + disease_name
code_lookup = urllib.urlopen(queue_name).read() #send queue request to site, returns dirty html out = open("index.txt", "w") #write dirty html to file out.write(code_lookup) out.close()
tempName = "" readCode = open("index.txt", "r") #read dirty html lookup_line = readCode.readline() print "***ICD9 and hits, arranged by importance***\n" while lookup_line:
w= lookup_line.find("
if w != -1: #if it is found tempCode = lookup_line[32:40] #code in this section tempCode = string.split(tempCode, ' ') #split into number print tempCode ICD9code.append(tempCode[0])
w = lookup_line.find("") if w != -1: #if there is a marker (junk html)
tempName = lookup_line[32:w] tempName = tempName[tempName.find(" ")+1:] print tempName ICD9dName.append(tempName) else: tempName = lookup_line[32:len(lookup_line)-7] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) lookup_line = readCode.readline()
- searching incidence data in CA data file
- note: returns class of hits
print "\n\n***Prevalance data per IDC9 above***" print "Note: returns incidence data/yr, including subclasses\n"
fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file
- new data structure: [0] = code, [1] = name, [2] = incidence
ICD9_prev_output= [[],[],[]]
for (code,name) in zip(ICD9code,ICD9dName): fh.seek(0) line = fh.readline() totalIncidence = 0 sumCount = 0 while line: #for every line in the data line = line[:-1] #remove /n lineVector = string.split(line, ',') #split to vector if lineVector[0].find(code) != -1: #if disease hit totalIncidence += int(lineVector[1]) sumCount += 1 else: if sumCount > 0: print "ID: ", code, "name: ", name, "incidence: ", totalIncidence ICD9_prev_output[0].append(code) ICD9_prev_output[1].append(name) ICD9_prev_output[2].append(totalIncidence) totalIncidence = 0 sumCount = 0 line = fh.readline() fh.close()
- returns the most relevant ICD9 information
- THIS IS THE DATA YOU PROBABLY WANT
mainCode = 0 mainName = 0 mainIncidence = 0
counter = 0 for c,n,incid in zip(ICD9_prev_output[0], ICD9_prev_output[1], ICD9_prev_output[2]): if incid > counter: counter = incid mainCode = c mainName = n mainIncidence = incid
print "THE MOST RELEVANT HIT:\n" print "CDC CODE: ", mainCode print "Disease Name: ", mainName print "Incidence/yr in CA: " , mainIncidence
# CALL DENIZ'S FUNCTION TO GET NEWS ABOUT DISEASE # use the best term that was figured out before temp_terms = str(bestterm[1]).split('+') forxiaodi = [] for term in temp_terms: forxiaodi.append(outputzilla(str(term))) print "did deniz' news stuff again"
- Some of the output stuff
for item in outputlist: print item pickle.dump(outputlist, output_file) print "wrote pickled version to file"
output_file.close() </syntax>