User:Lindenb/Notebook/UMR915/20100712

SQL query to scan for candidate genes
use user_anonymous; create temporary table T1 (  id int unsigned not null,  chrom varchar(30) not null,  position int not null,  index(chrom),index(chrom,position),index(position)  ); insert into T1(id,chrom,position) select distinct V.id,V.chrom,V.position from umr915.variation as V left join umr915.polyphen as P on P.variation_id=V.id and P.prediction in ('PROBABLY_DAMAGING','POSSIBLY_DAMAGING') left join umr915.sift as F on F.variation_id=V.id and F.prediction in ('TOLERATED','DAMAGING_LOW','DAMAGING') where not(P.id is null or F.id is null) ; /* delete T1 from T1,hg18.genomicSuperDups as D where T1.chrom=D.chrom and T1.position>=D.chromStart and T1.position0;  */ delete T1 from T1,hg18.snp129 as D where T1.chrom=D.chrom and T1.position>=D.chromStart and T1.position<D.chromEnd; create temporary table T2 (  chrom varchar(30) not null,  position int not null,  g1 int, g2 int, g3 int, g4 int, g5 int,  g6 int, g7 int, g8 int, g9 int, g10 int,  total int,  index(chrom),index(chrom,position),index(position)  ); insert into T2(chrom,position, g1,g2,g3,g4,g5,g6,g7,g8,g9,g10) select distinct T1.chrom,T1.position, IF(C1.sample_id is null,0,1) as G1, IF(C2.sample_id is null,0,1) as G2, IF(C3.sample_id is null,0,1) as G3, IF(C4.sample_id is null,0,1) as G4, IF(C5.sample_id is null,0,1) as G5, IF(C6.sample_id is null,0,1) as G6, IF(C7.sample_id is null,0,1) as G7, IF(C8.sample_id is null,0,1) as G8, IF(C9.sample_id is null,0,1) as G9, IF(C10.sample_id is null,0,1) as G10 from T1 left join umr915.vcf_call as C1 on C1.variation_id=T1.id and C1.sample_id=1 and (C1.depth is null or C1.depth>4) and C1.snp_quality >= 25 left join umr915.vcf_call as C2 on C2.variation_id=T1.id and C2.sample_id=2 and (C2.depth is null or C2.depth>4) and C2.snp_quality >= 25 left join umr915.vcf_call as C3 on C3.variation_id=T1.id and C3.sample_id=3 and (C3.depth is null or C3.depth>4) and C3.snp_quality >= 25 left join umr915.vcf_call as C4 on C4.variation_id=T1.id and C4.sample_id=4 and (C4.depth is null or C4.depth>4) and C4.snp_quality >= 25 left join umr915.vcf_call as C5 on C5.variation_id=T1.id and C5.sample_id=5 and (C5.depth is null or C5.depth>4) and C5.snp_quality >= 25 left join umr915.vcf_call as C6 on C6.variation_id=T1.id and C6.sample_id=6 and (C6.depth is null or C6.depth>4) and C6.snp_quality >= 25 left join umr915.vcf_call as C7 on C7.variation_id=T1.id and C7.sample_id=7 and (C7.depth is null or C7.depth>4) and C7.snp_quality >= 25 left join umr915.vcf_call as C8 on C8.variation_id=T1.id and C8.sample_id=8 and (C8.depth is null or C8.depth>4) and C8.snp_quality >= 25 left join umr915.vcf_call as C9 on C9.variation_id=T1.id and C9.sample_id=9 and (C9.depth is null or C9.depth>4) and C9.snp_quality >= 25 left join umr915.vcf_call as C10 on C10.variation_id=T1.id and C10.sample_id=10 and (C10.depth is null or C10.depth>4)and C10.snp_quality >= 25 ; update T2 set total= g1+g2+g3+g4+g5+g6+g7+g8+g9+g10; delete from T2 where total>5; create temporary table T3 (  name varchar(50) not null,  chrom varchar(30) not null,  txStart int not null,  txEnd int not null,  g1 int, g2 int, g3 int, g4 int, g5 int,  g6 int, g7 int, g8 int, g9 int, g10 int,  total int,  index(chrom),index(chrom,txStart),index(txStart)  ); insert into T3(name,chrom,txStart,txEnd,g1,g2,g3,g4,g5,g6,g7,g8,g9,g10) select K.name,K.chrom,K.txStart,K.txEnd, max(T2.g1), max(T2.g2), max(T2.g3), max(T2.g4), max(T2.g5), max(T2.g6), max(T2.g7), max(T2.g8), max(T2.g9), max(T2.g10) from hg18.knownGene as K ,T2 where T2.chrom=K.chrom and T2.position>=K.txStart and T2.position<K.txEnd group by  1,2,3,4 ; update T3 set total= g1+g2+g3+g4+g5+g6+g7+g8+g9+g10; delete from T3 where total<9; select distinct T3.*,X.description from T3, hg18.kgXref as X where X.kgID=T3.name; select "Done.";

Running GATK for INdels
cf. http://www.broadinstitute.org/gsa/wiki/index.php/Indel_Genotyper_V2.0

for I in Brs1 Brs2 Brs3 Brs4 Brs5 Brs6 Brs7 Brs8 Brs9 Brs10 do       echo "processing $I :" java -jar /usr/local/package/gatk/GenomeAnalysisTK-1.0.3471/GenomeAnalysisTK.jar \ -T IndelGenotyperV2 \ -R /GENOTYPAGE/data/pubdb/ucsc/hg18/chromosomes/hg18.fa \ -O indels_${I}.out.txt \ -I ../20100607_mapping/recal_bwa_rmdup_${I}.bam | gzip --best > indels_${I}.tsv.gz               gzip --best indels_${I}.out.txt done

Output
Brief output file (specified with -O option) will look as follows: chr1   556817  556817  +G:3/7 chr1   3535035 3535054 -TTCTGGGAGCTCCTCCCCC:9/21 chr1   3778838 3778838 +A:15/48

This is a .bed track that can be loaded into UCSC browser.

The event locations on the chromosomes are 1-based

other
working on
 * JSPs
 * mutation prediction. Added support for intron/exons donor/acceptor sites