Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-4-19

This script does a regular BLAST lookup and parses out the top human hit and relevant info (hit to, hit from, hit id, hit def, hit seq, etc). The next step will be to get the relevant part of the contig file. Then we'll use that to get the gene and figure out the mutation and do relevant searching thereafter.

 from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys

class AllelicVariant: pass
 * 1) C-style struct to pass parameters

class Hit: # the fields are added on the fly but include hit_from, # hit_to, etc. (see extract_data) pass def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next return record.seq.data def blast_lookup(seq): result_handle = NCBIWWW.qblast("blastn", 'refseq_genomic', seq) return result_handle.read def get_text(node_list): rc = "" for node in node_list: if node.nodeType == node.TEXT_NODE: rc = rc + node.data return rc def extract_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed = [] for v in variants: # now populate the struct access = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes) score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes) # only consider it a genuine snp if the hit score is above 100 and the accession number starts with NC (meaning that it's human). if int(score) > 100 and access.startswith("NC"): hit_from = get_text(v.getElementsByTagName("Hsp_hit-from")[0].childNodes) hit_to = get_text(v.getElementsByTagName("Hsp_hit-to")[0].childNodes) hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes) hit_seq = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes) hit_id = get_text(v.getElementsByTagName("Hit_id")[0].childNodes) print access print score print "YAY!" new_hit = Hit new_hit.hit_access = access new_hit.hit_score = score new_hit.hit_from = hit_from new_hit.hit_to = hit_to new_hit.hit_def = hit_def new_hit.hit_seq = hit_seq new_hit.hit_id = hit_id parsed.append(new_hit) return parsed def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait." b = blast_lookup(sequence)
 * 1) basic function to open fasta file and get sequence
 * 1) lookup blast snp data
 * 1) basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html
 * 1) extracts snp data
 * 1) queries the database and returns all info in an XML format
 * 1) extracts allelic variant data, as the name implies, using the struct above
 * 1) BEGIN ACTUAL PROGRAM
 * 2) read sequence from file
 * 1) look up data - HERE WE ARE USING BLAST NOT BLAST_SNP

print b

e = extract_data(b) print e[0].hit_access print e[0].hit_to print e[0].hit_from print e[0].hit_id print e[0].hit_seq print e[0].hit_def
 * 1) extract data