Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-4-24
From OpenWetWare
				
				
				Jump to navigationJump to search
				
				
Tasks Accomplished
- Gained a much better understanding of XML parsing.
- Finished code to parse OMIM records and return all Pubmed ID numbers (without errors).
- Wrote code to look up the PubMed ID numbers in PubMed and then parse the results for keywords (or MeSH terms).
- Came up with some ideas about how to deal with non-OMIM records. (i.e. dbSNP, HapMap, make an entry into one of these databases?)
Tasks to Complete for Thursday
- Ask class how they would like key words to be determined. (i.e. search first Pubmed ID number and return meshterms; repeat for all Pubmed IDs?)
- Fix the implementation of the code that parses PubMed results, since it currently stalls.
- Think more about how to handle genotypes not found/annotated in OMIM and then implement this in code.
Code So Far
from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString
# C-style struct to pass parameters
class AllelicVariant:
    """Empty attribute container used as a C-style struct.

    Instances start with no attributes; extract_allelic_variant_data()
    attaches ``name``, ``mutation`` and ``description`` to each one.
    """
class PubmedID:
    """Empty attribute container used as a C-style struct.

    Instances start with no attributes; extract_allelic_variant_pmid()
    attaches a ``pmid`` attribute to each one.
    """
# queries the database and returns all info in an XML format
def omim_snp_search(dnsnp_id):
    """Search the "omim" database for *dnsnp_id* and fetch every hit as XML.

    Returns a list with one ``efetch(rettype="xml")`` result per search hit,
    in the order the search yields them.  Callers in this file call
    ``.read()`` on each element to obtain the XML text.
    """
    hits = DBIdsClient.DBIdsClient().search(dnsnp_id, "omim")
    fetched = []
    for hit in hits:
        fetched.append(hit.efetch(rettype="xml"))
    return fetched
# basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
def get_text(node_list):
    """Return the concatenated ``data`` of the text nodes in *node_list*.

    Only nodes whose ``nodeType`` is ``TEXT_NODE`` contribute; other nodes
    are skipped and their descendants are not visited.  Returns "" when
    *node_list* holds no text node.
    """
    pieces = [node.data for node in node_list
              if node.nodeType == node.TEXT_NODE]
    return "".join(pieces)
# extracts allelic variant data, as the name implies, using the struct above
def _variant_text(variant, tag_name, nested_tag=None):
    """Return the text of the first *tag_name* element below *variant*.

    When *nested_tag* is given, the text is taken from the first
    *nested_tag* element inside that first *tag_name* element instead.
    Returns "" if either element is missing, so a sparse record does not
    raise IndexError.
    """
    matches = variant.getElementsByTagName(tag_name)
    if len(matches) > 0 and nested_tag is not None:
        matches = matches[0].getElementsByTagName(nested_tag)
    if len(matches) == 0:
        return ""
    return get_text(matches[0].childNodes)


def extract_allelic_variant_data(str):
    """Extract the allelic variants described in one XML record.

    Args:
        str: XML text of the record.  (The name shadows the builtin; it is
            kept so existing keyword callers continue to work.)

    Returns:
        A list with one AllelicVariant per ``Mim-allelic-variant`` element,
        each carrying ``name``, ``mutation`` and ``description`` text
        attributes, or None when the record has no such element.  A field
        whose element is absent from the XML is set to "".

    Raises:
        Whatever ``parseString`` raises for malformed XML.
    """
    dom = parseString(str)
    variants = dom.getElementsByTagName("Mim-allelic-variant")
    if len(variants) == 0:
        return None
    parsed = []
    for v in variants:
        a = AllelicVariant()  # create empty instance of struct
        # now populate the struct; mutation and description keep their text
        # one level down, inside a Mim-text_text element
        a.name = _variant_text(v, "Mim-allelic-variant_name")
        a.mutation = _variant_text(
            v, "Mim-allelic-variant_mutation", "Mim-text_text")
        a.description = _variant_text(
            v, "Mim-allelic-variant_description", "Mim-text_text")
        parsed.append(a)
    return parsed
def extract_allelic_variant_pmid(str):
    """Collect the PubMed IDs of the references cited in one XML record.

    Args:
        str: XML text of the record.  (The name shadows the builtin; it is
            kept so existing keyword callers continue to work.)

    Returns:
        A non-empty list of PubmedID objects, each with a ``pmid`` text
        attribute taken from a ``Mim-reference_pubmedUID`` element, in
        document order; or None when no PubMed ID is found.

    Raises:
        Whatever ``parseString`` raises for malformed XML.
    """
    dom = parseString(str)
    ids = []
    for ref in dom.getElementsByTagName("Mim-reference"):
        uid_nodes = ref.getElementsByTagName("Mim-reference_pubmedUID")
        if len(uid_nodes) == 0:
            # A reference without a PubMed UID has nothing to contribute;
            # skip it instead of failing with IndexError on [0].
            continue
        entry = PubmedID()
        entry.pmid = get_text(uid_nodes[0].childNodes)
        ids.append(entry)
    if len(ids) == 0:
        # Keep the original contract: callers get None or a non-empty list.
        return None
    return ids
	
for i in omim_snp_search("rs11200638"):
    result = i.read()
    if result:
        v = extract_allelic_variant_data(result)
        p = extract_allelic_variant_pmid(result)
    if v != None:
        for a in v:
            print a.name
            print a.mutation
            print a.description
    #if p != None:
       # for i in p:
            #print i.pmid
from Bio import PubMed
from Bio import Medline
import string
disease = a.name
search_term = "Review[ptyp] "+disease
#print search_term
review_ids = PubMed.search_for(search_term)
rec_parser = Medline.RecordParser()
medline_dict = PubMed.Dictionary(parser = rec_parser)
count = 1
for did in review_ids[0:5]:
    cur_record = medline_dict[did]
    print '\n', count, ')  ', string.rstrip(cur_record.title), cur_record.authors, string.strip(cur_record.source)
    count=count+1
for i in omim_snp_search("rs11200638"):
    result = i.read()
    if result:
        p = extract_allelic_variant_pmid(result)
    if p != None:
        key_source = PubMed.search_for(p[0])
        key_rec = medline_dict[0]
        print key_rec
        keywords = get_text(key_rec.getElementsByTagName("MeshHeading")[0].childNodes)
        print keywords