Harvard:Biophysics 101/2007/Notebook:Resmi Charalel/2007-5-1

Update
Traceback (most recent call last):
  File "/Users/resmicharalel/Documents/biophysics101/pubmed.py", line 93, in <module>
    m = extract_meshterms(cur_record)
  File "/Users/resmicharalel/Documents/biophysics101/pubmed.py", line 60, in extract_meshterms
    dom = parseString(str)
  File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/xml/dom/minidom.py", line 1923, in parseString
    return expatbuilder.parseString(string)
  File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/xml/dom/expatbuilder.py", line 940, in parseString
    return builder.parseString(string)
  File "/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/xml/dom/expatbuilder.py", line 223, in parseString
    parser.Parse(string, True)
TypeError: Parse argument 1 must be string or read-only buffer, not instance

from Bio.EUtils import DBIdsClient
import xml.dom.minidom
from xml.dom.minidom import parse, parseString
 * I have been out of town since last Thursday's class for a revisit weekend and did not have email/computer access there. So, unfortunately, I did not get to work on this program as much as I had hoped.  But, I definitely plan to continue working on the program and contributing to the project further over the next couple of weeks, and I hope to be able to stay involved this summer.
 * So, I have written all of the code to appropriately:
 * 1) Parse through OMIM records and obtain all associated PMIDs
 * 2) Search all obtained PMIDs in PubMed to return XML output
 * 3) Parse the XML output of PubMed for each entry to return the MeshTerms that are labelled as major
 * 4) Take the entry title of OMIM and search this in PubMed
 * 5) Return the references of the top five PubMed review articles associated with this condition/name as further reading on the general allelic variant
 * However, there is still one error in the program, which is difficult to understand and which I am still working through. Here it is (see the traceback under "Update" at the top of this page):
 * The following is the code thus far:

class AllelicVariant:
    """Plain attribute bag used as a C-style struct.

    The class declares no fields of its own; code that builds a variant
    attaches ``name``, ``mutation`` and ``description`` attributes to it.
    """
 * 1) C-style struct to pass parameters

class PubmedID:
    """Plain attribute bag for a PubMed identifier.

    The class declares no fields of its own; the PMID extraction code
    attaches a ``pmid`` attribute to it.
    """

class MeshTerms:
    """Empty placeholder class for MeSH term data (declares no fields)."""

def omim_snp_search(dnsnp_id):
    """Search OMIM for a dbSNP id and fetch every hit as XML.

    dnsnp_id -- search term sent to the "omim" database
                (the script below uses "rs11200638").

    Returns a list holding, for each search hit, whatever
    ``efetch(rettype="xml")`` returns; the caller reads the XML text
    from each entry.
    """
    # The client class has to be instantiated: search() is an instance
    # method, so calling it on the bare class passes the search term in
    # the place of ``self``.
    client = DBIdsClient.DBIdsClient()
    query = client.search(dnsnp_id, "omim")
    return [hit.efetch(rettype="xml") for hit in query]
 * 1) queries the database and returns all info in an XML format

def get_text(node_list):
    """Concatenate the data of all text nodes found directly in node_list.

    node_list -- iterable of DOM nodes (typically an element's childNodes).

    Only nodes whose nodeType is TEXT_NODE contribute; element children are
    skipped rather than descended into. Returns "" when there are no text
    nodes.
    """
    # Join once instead of growing the string piece by piece in the loop.
    return "".join(
        node.data for node in node_list if node.nodeType == node.TEXT_NODE
    )
 * 1) basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html

def extract_allelic_variant_data(str):
    """Parse OMIM XML and collect one AllelicVariant per allelic variant.

    str -- OMIM record as XML text. (The parameter name shadows the
           builtin; it is kept so existing callers are unaffected.)

    Returns a list of AllelicVariant objects, each carrying ``name``,
    ``mutation`` and ``description``, or None when the document contains
    no "Mim-allelic-variant" element.

    Raises IndexError if a variant lacks one of the expected child
    elements.
    """
    dom = parseString(str)
    variants = dom.getElementsByTagName("Mim-allelic-variant")
    if not variants:
        return None

    parsed = []
    for v in variants:
        # A fresh instance per variant: assigning to the class itself would
        # make every list entry alias the same object, which would end up
        # holding only the last variant's values.
        a = AllelicVariant()
        a.name = get_text(
            v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes
        )
        # Mutation and description text sit one level deeper, inside a
        # "Mim-text_text" element.
        mutation = v.getElementsByTagName("Mim-allelic-variant_mutation")[0]
        a.mutation = get_text(
            mutation.getElementsByTagName("Mim-text_text")[0].childNodes
        )
        description = v.getElementsByTagName(
            "Mim-allelic-variant_description"
        )[0]
        a.description = get_text(
            description.getElementsByTagName("Mim-text_text")[0].childNodes
        )
        parsed.append(a)
    return parsed
 * 1) extracts allelic variant data, as the name implies, using the struct above
 * 1)    print "variant:", variants

def extract_allelic_variant_pmid(str):
    """Parse OMIM XML and collect the PubMed ids of its references.

    str -- OMIM record as XML text. (The parameter name shadows the
           builtin; it is kept so existing callers are unaffected.)

    Returns a list of PMID strings, one per "Mim-reference" element that
    carries a "Mim-reference_pubmedUID" child, or None when the document
    has no "Mim-reference" element at all.
    """
    dom = parseString(str)
    pmids = dom.getElementsByTagName("Mim-reference")
    if not pmids:
        return None

    ids = []
    for p in pmids:
        uid_nodes = p.getElementsByTagName("Mim-reference_pubmedUID")
        # Skip a reference with no PubMed UID instead of failing with
        # IndexError on the [0] lookup.
        if not uid_nodes:
            continue
        ids.append(get_text(uid_nodes[0].childNodes))
    return ids


def extract_meshterms(str):
    """Parse PubMed XML and collect the MeSH descriptors flagged as major.

    str -- PubMed record as XML text. (The parameter name shadows the
           builtin; it is kept so existing callers are unaffected.)

    Returns a list with the text of every "DescriptorName" whose
    MajorTopicYN attribute equals "Y", or None when the document has no
    "MeshHeading" element.
    """
    dom = parseString(str)
    meshheadings = dom.getElementsByTagName("MeshHeading")
    # Test the node list that was just fetched; the result list does not
    # exist yet at this point.
    if not meshheadings:
        return None

    meshterms = []
    for h in meshheadings:
        descriptors = h.getElementsByTagName("DescriptorName")
        if not descriptors:
            continue
        descriptor = descriptors[0]
        # MajorTopicYN is an attribute of DescriptorName, not part of the
        # tag name, so it is read with getAttribute.
        if descriptor.getAttribute("MajorTopicYN") == "Y":
            meshterms.append(get_text(descriptor.childNodes))
    return meshterms

from Bio import PubMed from Bio import Medline import string

# Parser object handed to the PubMed dictionary below. It must be an
# instance, not the class itself.
rec_parser = Medline.RecordParser()

# Dictionary-style PubMed accessor: medline_dict[pmid] looks the id up and
# passes the result through rec_parser.
medline_dict = PubMed.Dictionary(parser=rec_parser)

for i in omim_snp_search("rs11200638"): result = i.read if result: v = extract_allelic_variant_data(result) p = extract_allelic_variant_pmid(result) if v != None: for a in v:           print a.name print a.mutation print a.description if p != None: for s in p:           cur_record = medline_dict[s] m = extract_meshterms(cur_record) if m != None: for mh in m:                   print mh    #if p != None: # for i in p:           #print i.pmid

disease = a.name

search_term = "Review[ptyp] "+disease
 * 1) print search_term

review_ids = PubMed.search_for(search_term)

count = 1
 * 1) rec_parser = Medline.RecordParser
 * 2) medline_dict = PubMed.Dictionary(parser = rec_parser)

for did in review_ids[0:5]: cur_record = medline_dict[did] print '\n', count, ') ', string.rstrip(cur_record.title), cur_record.authors, string.strip(cur_record.source)    count=count+1


 * 1) for i in omim_snp_search("rs11200638"):
 * 2)    result = i.read
 * 3)    if result:
 * 4)        p = extract_allelic_variant_pmid(result)
 * 5)    if p != None:
 * 6)        key_source = PubMed.search_for(p[0])
 * 7)        key_rec = medline_dict[0]
 * 8)        print key_rec
 * 9)        keywords = get_text(key_rec.getElementsByTagName("MeshHeading")[0].childNodes)
 * 10)        print keywords
 * 1)        print keywords
 * Also, I am working on better annotating this code so that it is easier to read through and understand for anyone interested in working with it.