Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-5-3: Difference between revisions

Revision as of 01:05, 3 May 2007

from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys import pickle import os import string import urllib from Bio import PubMed from Bio import Medline import feedparser

id = "1" outputlist = []

All the relevant output will be to a file.

output_file = file("out" + id + ".txt", 'w')

C-style struct to pass parameters

class AllelicVariant: pass

basic function to open fasta file and get sequence

def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next() return record.seq.data

lookup blast snp data

def blast_snp_lookup(seq): result_handle = NCBIWWW.qblast("blastn", "snp/human_9606/human_9606", seq) return result_handle.read()

basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html

def get_text(node_list):

   rc = ""
   for node in node_list:
       if node.nodeType == node.TEXT_NODE:
           rc = rc + node.data
   return rc

extracts snp data

def extract_snp_data(str):

    dom = parseString(str)
    variants = dom.getElementsByTagName("Hit")
    if len(variants) == 0:
         return
    parsed = []
    for v in variants:
         # now populate the struct
         hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes)
         id_query = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes)
         id_hit = get_text(v.getElementsByTagName("Hsp_qseq")[0].childNodes)
         score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes)
         id = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes)
         # extract position of the SNP from Hit Definition      
         lower_bound = hit_def.find("pos=")+4
         upper_bound = hit_def.find("len=")-1
         position = int(hit_def[lower_bound:upper_bound])
         # only consider it a genuine snp if the hit score is above 100,
         # the query/hit sequences are longer than the position of the SNP
         # and the query sequence matches the hit sequence at the SNP position
         if int(score) > 100 and position < len(id_hit) and id_query[position] == id_hit[position]:
                 parsed.append(id)
    return parsed

queries the database and returns all info in an XML format

def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient() query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records

extracts allelic variant data, as the name implies, using the struct above

def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant() # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed

Kay's stuff - OMIM stuff

# queries the SNP database and returns geneID tag as a string

1. Currently, DBIdsClient does not support snp parsing - so it's not used.
2. A future update should correct this when possible, for ease of reading.

def parse_geneID_tag(snp_id): try: SNP_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=snp' url = SNP_URL + '&id=' + snp_id + '&mode=xml' dom = minidom.parse(urllib.urlopen(url)) symbol = dom.getElementsByTagName("FxnSet_symbol") return symbol[0].toxml().split('>')[1].split('<')[0] except: return ""

1. Note: This code is a temporary solution to a dbSNP formatting issue.
2. Older entries are best searched directly by ID #.
3. - > this case is the first covered
4. Newer entries are not indexed in this fashion, although SNP ID data is
5. available on the individual entry. These contain Allelic Variant data
6. which is located and extracted by the script
7. - > this case is the second covered

takes SNP ID and gets search results from OMIM in XML format

def snp_to_omim(snp_id):

   records = omim_snp_search(snp_id)

   if records == list():
       tag_id = parse_geneID_tag(snp_id)

if len(tag_id) == 0: return ""

       records = omim_tag_search(tag_id)

   return records

1. 1. cynthias section ###

parses a mesh term to remove * and /

def parse_term(str, bool):

   parsed_term = str
   if(bool):
       parsed_term = parsed_term.replace('*', )
   if str.find('/') != -1:
      parsed_term = parsed_term.replace('/', ' ')
   return parsed_term

parses list of mesh terms
returns embedded list, one with all terms and one major terms

def parse_mesh(list):

   all_mesh_terms = []
   major_mesh_terms = []
   mesh_term = 
   for i in range(len(list)):
       major = False
       if list[i].find('*') == -1:
           mesh_term = parse_term(list[i], major)
           all_mesh_terms.append(mesh_term)
       else:
           major = True
           mesh_term = parse_term(list[i], major)
           major_mesh_terms.append(mesh_term)
           all_mesh_terms.append(mesh_term)
   all_mesh = [all_mesh_terms, major_mesh_terms]
   return all_mesh

DENIZ's stuff
empty class

class Feedoutput: pass

I want a single string as input. This should be done as often as needed for multiple strings

def outputzilla(inputstring): #This will parse the blogs we want blogresults = inputstring.replace(' ', '+')

medstory_results = inputstring.replace(' ', '%20') medstory_root = 'http://www.medstory.com/rss?q=' + medstory_results

medstory_web = medstory_root + '&af=true&c=true&s=Web&i=' medstory_news = medstory_root + '&af=false&c=true&s=NewsFeed&i=' medstory_multimedia = medstory_root + '&af=true&c=true&s=AudioVideo&i=' medstory_clinical = medstory_root + '&af=true&c=true&s=ClinicalTrial&i=' medstory_research = medstory_root + '&af=false&c=true&s=Research&i=' technorati_blog = 'http://feeds.technorati.com/search/' + blogresults + '?authority=a7'

#The feeds output = Feedoutput() output.web_feed = feedparser.parse(medstory_web) output.news_feed = feedparser.parse(medstory_news) output.multimedia_feed = feedparser.parse(medstory_multimedia) output.clinical_feed = feedparser.parse(medstory_clinical) output.research_feed = feedparser.parse(medstory_research) output.blog_feed = feedparser.parse(technorati_blog)

return output

LO AND BEHOLD Here are the variables that you have access to
output.web_feed.entries[0].title
output.web_feed.entries[0].description
output.web_feed.entries[0].link
output.news_feed.entries[0].title

BEGIN ACTUAL PROGRAM
read sequence from file

sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait."

look up data

b = blast_snp_lookup(sequence)

extract data

e = extract_snp_data(b) print e outputlist.append(str(len(e)) + " single nucleotide polymorphism(s) found:\n") if len(e) > 0: for snp_id in e: outputlist.append(snp_id + "\n") else: outputlist.append("[none]\n") print "Didn't find any snp. Guess you're healthy :)" sys.exit() # nothing more to be done if no snp found

print "got a snp... still thinking..." TEMP = ['rs11200638']

do an omim search on each snp

for snp_id in e: o = snp_to_omim(snp_id) if len(o) == 0: # there was some problem with this way of doing the omim lookup for this rs number. so try xiaodi's way. o = omim_snp_search(snp_id) if len(o) == 0: outputlist.append("No information found for " + snp_id + "\n")

      	# nothing more to be done if no records can be found

# otherwise, find the allelic variant data outputlist.append(snp_id + " details:" + "\n") for i in o: v = extract_allelic_variant_data(i.read()) if v != None: for a in v: outputlist.append(a.name + "\n") outputlist.append(a.mutation + "\n") outputlist.append(a.description + "\n")

# The mesh terms stuff - THIS NEEDS TO BE ADDED TO OUTPUTLIST at some point article_ids = PubMed.search_for(snp_id) rec_parser = Medline.RecordParser() medline_dict = PubMed.Dictionary(parser = rec_parser)

all_mesh = [] all_mesh_terms = [] major_mesh_terms = [] for did in article_ids[0:5]: cur_record = medline_dict[did] # print '\n', cur_record.title, cur_record.authors, cur_record.source mesh_headings = cur_record.mesh_headings if len(mesh_headings) != 0: all_mesh = parse_mesh(mesh_headings) all_mesh_terms.extend(all_mesh[0]) major_mesh_terms.extend(all_mesh[1])

print "ALL MESH TERMS", '\n', all_mesh_terms, '\n', major_mesh_terms

Problem to solve: Although Cynthia's output parses out the
major mesh terms from the paper, each mesh term can give a hit
ie: it does not figure out which is really the "major" major mesh term.
I intend to solve it by hitting against the title to see if there are matches
and by removing connecting words from the mesh terms.

1. modifying cynthia's output

new_term = [] for term in major_mesh_terms: test = term.split(" ") #split by " " temp = "" for test_term in test: if len(test_term) <= 3: #if it is a small connecting word pass else: if temp != "": temp = temp + "+" temp = temp + test_term else: temp = test_term

   #print temp

new_term.append(temp) titles = []

create list of article id's

for did in article_ids: cur_record = medline_dict[did] titles.append(cur_record.title)

titles = a list of all titles returned by PubMed

count = 0 #counter bestterm = [0,"ERROR"] for term in new_term: count = 0 test = term.split("+") #split by "+" for test_term in test: #hit each word in term against title for title in titles: if str.upper(title).find(str.upper(test_term)) != -1: #if found the term in title count += 1 if count > bestterm[0]: bestterm[0] = count bestterm[1] = term

#print bestterm[1] #bestterm[1] is the main disease name

if(bestterm[1] == "ERROR"): # if not hit print "Warning: There are no relevant mesh terms for the passed rs#!" sys.exit(0) print "BEST TERMS:", bestterm[1] fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file line = fh.readline() disease_name = bestterm[1] #INSERT DISEASE NAME HERE FOR TESTING ICD9code = [] ICD9dName = [] found = 0

queueing http://icd9cm.chrisendres.com for code lookup

queue_name = 'http://icd9cm.chrisendres.com/index.php?action=search&srchtype=diseases&srchtext=' queue_name = queue_name + disease_name

code_lookup = urllib.urlopen(queue_name).read() #send queue request to site, returns dirty html out = open("index.txt", "w") #write dirty html to file out.write(code_lookup) out.close()

tempName = "" readCode = open("index.txt", "r") #read dirty html lookup_line = readCode.readline() print "***ICD9 and hits, arranged by importance***\n" while lookup_line:

w= lookup_line.find("

") #the unique marker before the disease

if w != -1: #if it is found tempCode = lookup_line[32:40] #code in this section tempCode = string.split(tempCode, ' ') #split into number print tempCode ICD9code.append(tempCode[0])

w = lookup_line.find("

") if w != -1: #if there is a marker (junk html)

tempName = lookup_line[32:w] tempName = tempName[tempName.find(" ")+1:] print tempName ICD9dName.append(tempName) else: tempName = lookup_line[32:len(lookup_line)-7] tempName = tempName[tempName.find(" ")+1:] #print tempName ICD9dName.append(tempName) lookup_line = readCode.readline()

searching incidence data in CA data file
note: returns class of hits

print "\n\n***Prevalance data per IDC9 above***" print "Note: returns incidence data/yr, including subclasses\n"

fh = open(os.path.join(os.curdir, "dx05.txt")) #the CA data file

new data structure: [0] = code, [1] = name, [2] = incidence

ICD9_prev_output= [[],[],[]]

for (code,name) in zip(ICD9code,ICD9dName): fh.seek(0) line = fh.readline() totalIncidence = 0 sumCount = 0 while line: #for every line in the data line = line[:-1] #remove /n lineVector = string.split(line, ',') #split to vector if lineVector[0].find(code) != -1: #if disease hit totalIncidence += int(lineVector[1]) sumCount += 1 else: if sumCount > 0: print "ID: ", code, "name: ", name, "incidence: ", totalIncidence ICD9_prev_output[0].append(code) ICD9_prev_output[1].append(name) ICD9_prev_output[2].append(totalIncidence) totalIncidence = 0 sumCount = 0 line = fh.readline() fh.close()

returns the most relevant ICD9 information

THIS IS THE DATA YOU PROBABLY WANT

mainCode = 0 mainName = 0 mainIncidence = 0

counter = 0 for c,n,incid in zip(ICD9_prev_output[0], ICD9_prev_output[1], ICD9_prev_output[2]): if incid > counter: counter = incid mainCode = c mainName = n mainIncidence = incid

print "THE MOST RELEVANT HIT:\n" print "CDC CODE: ", mainCode print "Disease Name: ", mainName print "Incidence/yr in CA: " , mainIncidence

# CALL DENIZ'S FUNCTION TO GET NEWS ABOUT DISEASE # use the best term that was figured out before temp_terms = str(bestterm[1]).split('+') forxiaodi = [] for term in temp_terms: forxiaodi.append(outputzilla(str(term))) print "did deniz' news stuff again"

Some of the output stuff

for item in outputlist: print item pickle.dump(outputlist, output_file) print "wrote pickled version to file"

output_file.close() </syntax>

Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-5-3: Difference between revisions

Revision as of 01:05, 3 May 2007

Navigation menu

Page actions

Page actions

Personal tools

Navigation

Search

research

Tools