User:Lindenb/Notebook/UMR915/20100712

From OpenWetWare
Jump to navigationJump to search

20100709        Top        20100713       


SQL query to scan for candidate genes

 use user_anonymous;
 create temporary table T1
 (
 id int unsigned not null,
 chrom varchar(30) not null,
 position int not null,
 index(chrom),index(chrom,position),index(position)
 );
 
 insert into T1(id,chrom,position)
 select distinct
 V.id,V.chrom,V.position
 from
 umr915.variation as V
 
 left join umr915.polyphen as P on P.variation_id=V.id and P.prediction in ('PROBABLY_DAMAGING','POSSIBLY_DAMAGING')
 left join umr915.sift as F on F.variation_id=V.id and F.prediction in ('TOLERATED','DAMAGING_LOW','DAMAGING')
 where 
 not(P.id is null or F.id is null)
 ;
 /* delete T1 from T1,hg18.genomicSuperDups as D where T1.chrom=D.chrom and T1.position>=D.chromStart and T1.position<D.chromEnd and D.score>0;  */
 
 delete T1 from T1,hg18.snp129 as D where T1.chrom=D.chrom and T1.position>=D.chromStart and T1.position<D.chromEnd; 
 
 create temporary table T2
 (
 chrom varchar(30) not null,
 position int not null,
 g1 int, g2 int, g3 int, g4 int, g5 int,
 g6 int, g7 int, g8 int, g9 int, g10 int,
 total int,
 index(chrom),index(chrom,position),index(position)
 );
 
 
 insert into T2(chrom,position, g1,g2,g3,g4,g5,g6,g7,g8,g9,g10)
 select distinct
 T1.chrom,T1.position,
 IF(C1.sample_id is null,0,1) as G1,
 IF(C2.sample_id is null,0,1) as G2,
 IF(C3.sample_id is null,0,1) as G3,
 IF(C4.sample_id is null,0,1) as G4,
 IF(C5.sample_id is null,0,1) as G5,
 IF(C6.sample_id is null,0,1) as G6,
 IF(C7.sample_id is null,0,1) as G7,
 IF(C8.sample_id is null,0,1) as G8,
 IF(C9.sample_id is null,0,1) as G9,
 IF(C10.sample_id is null,0,1) as G10
 from
 T1
 left join umr915.vcf_call as C1 on C1.variation_id=T1.id and C1.sample_id=1 and (C1.depth is null or C1.depth>4) and C1.snp_quality >= 25
 left join umr915.vcf_call as C2 on C2.variation_id=T1.id and C2.sample_id=2 and (C2.depth is null or C2.depth>4) and C2.snp_quality >= 25
 left join umr915.vcf_call as C3 on C3.variation_id=T1.id and C3.sample_id=3 and (C3.depth is null or C3.depth>4) and C3.snp_quality >= 25
 left join umr915.vcf_call as C4 on C4.variation_id=T1.id and C4.sample_id=4 and (C4.depth is null or C4.depth>4) and C4.snp_quality >= 25
 left join umr915.vcf_call as C5 on C5.variation_id=T1.id and C5.sample_id=5 and (C5.depth is null or C5.depth>4) and C5.snp_quality >= 25
 left join umr915.vcf_call as C6 on C6.variation_id=T1.id and C6.sample_id=6 and (C6.depth is null or C6.depth>4) and C6.snp_quality >= 25
 left join umr915.vcf_call as C7 on C7.variation_id=T1.id and C7.sample_id=7 and (C7.depth is null or C7.depth>4) and C7.snp_quality >= 25
 left join umr915.vcf_call as C8 on C8.variation_id=T1.id and C8.sample_id=8 and (C8.depth is null or C8.depth>4) and C8.snp_quality >= 25
 left join umr915.vcf_call as C9 on C9.variation_id=T1.id and C9.sample_id=9 and (C9.depth is null or C9.depth>4) and C9.snp_quality >= 25
 left join umr915.vcf_call as C10 on C10.variation_id=T1.id and C10.sample_id=10 and (C10.depth is null or C10.depth>4)and C10.snp_quality >= 25
 ;
 
 update T2 set total= g1+g2+g3+g4+g5+g6+g7+g8+g9+g10;
 delete from T2 where total>5;
 
 
 create temporary table T3
 (
 name varchar(50) not null,
 chrom varchar(30) not null,
 txStart int not null,
 txEnd int not null,
 g1 int, g2 int, g3 int, g4 int, g5 int,
 g6 int, g7 int, g8 int, g9 int, g10 int,
 total int,
 index(chrom),index(chrom,txStart),index(txStart)
 );
 
 insert into T3(name,chrom,txStart,txEnd,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10)
 select
 K.name,K.chrom,K.txStart,K.txEnd,
 max(T2.g1), max(T2.g2), max(T2.g3), max(T2.g4), max(T2.g5),
 max(T2.g6), max(T2.g7), max(T2.g8), max(T2.g9), max(T2.g10)
 from
 hg18.knownGene as K ,T2
 where
 T2.chrom=K.chrom and
 T2.position>=K.txStart and T2.position<K.txEnd 
 group by 
 1,2,3,4
 ;
 
 
 update T3 set total= g1+g2+g3+g4+g5+g6+g7+g8+g9+g10;
 delete from T3 where total<9;
 
 select distinct T3.*,X.description from T3, hg18.kgXref as X where X.kgID=T3.name;
 
 select "Done.";

Running GATK for INdels

cf. http://www.broadinstitute.org/gsa/wiki/index.php/Indel_Genotyper_V2.0

 for I in Brs1 Brs2 Brs3 Brs4 Brs5 Brs6 Brs7 Brs8 Brs9 Brs10
 do
       echo "processing $I :"
       java -jar /usr/local/package/gatk/GenomeAnalysisTK-1.0.3471/GenomeAnalysisTK.jar \
               -T IndelGenotyperV2 \
               -R /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes/hg18.fa \
               -O indels_${I}.out.txt \
               -I ../20100607_mapping/recal_bwa_rmdup_${I}.bam | gzip --best > indels_${I}.tsv.gz
               gzip --best indels_${I}.out.txt
 done

=Output

Brief output file (specified with -O option) will look as follows:

 chr1    556817  556817  +G:3/7
 chr1    3535035 3535054 -TTCTGGGAGCTCCTCCCCC:9/21
 chr1    3778838 3778838 +A:15/48

This is a .bed track that can be loaded into UCSC browser.

The event locations on the chromosomes are 1-based

other

working on

  • JSPs
  • mutation prediction. Added support for intron/exons donor/acceptor sites