Harvard:Biophysics 101/2007/Notebook:Katie Fifer/2007-4-19

From OpenWetWare
Jump to navigationJump to search

This script does a regular BLAST lookup and parses out the top human hit and relevant info (hit to, hit from, hit id, hit def, hit seq, etc). The next step will be to get the relevant part of the contig file. Then we'll use that to get the gene and figure out the mutation and do relevant searching thereafter.

<syntax type='python'> from Bio import SeqIO from Bio.Blast import NCBIWWW from Bio.EUtils import DBIdsClient import xml.dom.minidom from xml.dom.minidom import parse, parseString from threading import Thread import time, sys

  1. C-style struct to pass parameters

class AllelicVariant: pass

class Hit: # the fields are added on the fly but include hit_from, # hit_to, etc. (see extract_data()) pass

  1. basic function to open fasta file and get sequence

def get_sequence_from_fasta(file): file_handle = open(file) records = SeqIO.parse(file_handle, format="fasta") record = records.next() return record.seq.data

  1. lookup blast snp data

def blast_lookup(seq): result_handle = NCBIWWW.qblast("blastn", 'refseq_genomic', seq) return result_handle.read()

  1. basic text extraction from XML; based on http://docs.python.org/lib/dom-example.html

def get_text(node_list):

   rc = ""
   for node in node_list:
       if node.nodeType == node.TEXT_NODE:
           rc = rc + node.data
   return rc

  1. extracts snp data

def extract_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Hit") if len(variants) == 0: return parsed = [] for v in variants: # now populate the struct access = get_text(v.getElementsByTagName("Hit_accession")[0].childNodes) score = get_text(v.getElementsByTagName("Hsp_score")[0].childNodes) # only consider it a genuine snp if the hit score is above 100 and the accession number starts with NC (meaning that it's human). if int(score) > 100 and access.startswith("NC"): hit_from = get_text(v.getElementsByTagName("Hsp_hit-from")[0].childNodes) hit_to = get_text(v.getElementsByTagName("Hsp_hit-to")[0].childNodes) hit_def = get_text(v.getElementsByTagName("Hit_def")[0].childNodes) hit_seq = get_text(v.getElementsByTagName("Hsp_hseq")[0].childNodes) hit_id = get_text(v.getElementsByTagName("Hit_id")[0].childNodes) print access print score print "YAY!" new_hit = Hit() new_hit.hit_access = access new_hit.hit_score = score new_hit.hit_from = hit_from new_hit.hit_to = hit_to new_hit.hit_def = hit_def new_hit.hit_seq = hit_seq new_hit.hit_id = hit_id parsed.append(new_hit)

return parsed

  1. queries the database and returns all info in an XML format

def omim_snp_search(dnsnp_id): client = DBIdsClient.DBIdsClient() query = client.search(dnsnp_id, "omim") records = [i.efetch(rettype="xml") for i in query] return records

  1. extracts allelic variant data, as the name implies, using the struct above

def extract_allelic_variant_data(str): dom = parseString(str) variants = dom.getElementsByTagName("Mim-allelic-variant") if len(variants) == 0: return parsed = [] for v in variants: a = AllelicVariant() # create empty instance of struct # now populate the struct a.name = get_text(v.getElementsByTagName("Mim-allelic-variant_name")[0].childNodes) a.mutation = get_text(v.getElementsByTagName("Mim-allelic-variant_mutation")[0].getElementsByTagName("Mim-text_text")[0].childNodes) a.description = get_text(v.getElementsByTagName("Mim-allelic-variant_description")[0].getElementsByTagName("Mim-text_text")[0].childNodes) parsed.append(a) return parsed

  2. read sequence from file

sequence = get_sequence_from_fasta("example.fasta") print "Sequence received; please wait."


b = blast_lookup(sequence)

print b

  1. extract data

e = extract_data(b) print e[0].hit_access print e[0].hit_to print e[0].hit_from print e[0].hit_id print e[0].hit_seq print e[0].hit_def </syntax>