User:Lindenb/Notebook/UMR915/20110511

=Integragen=

Analysis of SNP_het (compound heterozygous)
# Select candidate SNP_het rows from the pooled exome annotation and write
# them to _jeter1.txt, prefixed with a chrom_pos key and sorted on that key.
# Stages are chained with a trailing '|', so no backslash continuation is
# needed. awk receives '\t' and cut uses its default TAB delimiter, so no
# literal tab character has to survive copy/paste.
tab=$(printf '\t')   # sort(1) needs a real TAB character for -t
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
  awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    index($19, "douteux") == 0 &&
    index($19, "_het") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
  ' |
  cut -f 1-27 |
  awk -F '\t' '{ printf("%s_%s\t%s\n", $2, $1, $0); }' |
  sort -t "$tab" -k1,1 > _jeter1.txt
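 * 1) remove rs
 * 2) remove the low qualities ('douteux')
 * 3) only keep the 'SNP_het' (column 19 contains '_het')
 * 4) remove SNP_het* (NOTE: no stage of the command above does this; the '_het' test also keeps 'SNP_het*')
 * 5) only the non-synonymous or stop
 * 6) remove DNA & prot sequences
 * 7) add chrom_position flag
 * 8) sort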

# Distinct chrom_pos keys (first column of _jeter1.txt) -> _jeter2.txt.
# cut splits on TAB by default; its output is a single column, so a plain
# sort orders it exactly as a sort keyed on field 1 would.
cut -f 1 _jeter1.txt |
  sort |
  uniq > _jeter2.txt
 * 1) get all distinct chrom_pos in file

# Distinct chrom_pos keys found in the u2437 annotation file -> _jeter3.txt.
# awk reads columns 1 and 2 straight from the full row, so no separate cut
# stage is needed; the emitted key contains no TAB, so a plain sort suffices.
# Stages are chained with a trailing '|' (no backslash continuation needed).
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
  awk -F '\t' '{ printf("%s_%s\n", $2, $1); }' |
  sort |
  uniq > _jeter3.txt
 * 1) extract wild exome
 * 2) keep chrom,position
 * 3) add chrom_position flag
 * 4) sort

# Keep the lines unique to _jeter2.txt: -2 drops lines only in _jeter3.txt,
# -3 drops lines common to both files.
comm -23 _jeter2.txt _jeter3.txt > _jeter4.txt
 * 1) get [m] chrom_pos not in [+] chrom_pos set

# Keep the _jeter1.txt rows whose chrom_pos key is listed in _jeter4.txt,
# strip the key column again, and sort on column 20 -> _jeter5.txt.
# The sort key is bounded (-k20,20) because the next step joins on field 20:
# an unbounded '-k 20' compares from field 20 to the end of the line, which
# is not guaranteed to give the order join expects for that single field.
tab=$(printf '\t')   # join(1) and sort(1) need a real TAB character for -t
join -t "$tab" --check-order -1 1 -2 1 _jeter1.txt _jeter4.txt |
  cut -f 2- |
  sort -t "$tab" -k20,20 > _jeter5.txt
 * 1) join uniq [m] chrom_pos & mutated data
 * 2) remove chrom_pos
 * 3) order by gene

# Pair every row of _jeter5.txt with every other row sharing column 20.
# join prints the key first, then the remaining columns of the left row,
# then those of the right row. With 27-column rows, $2/$3 are columns 1/2
# of the left row and $28/$29 are columns 1/2 of the right row.
# Only pairs with equal $3/$29 and $2 < $28 are kept, so each pair appears
# once and a row is never paired with itself.
tab=$(printf '\t')   # join(1) needs a real TAB character for -t
join -t "$tab" -j 20 _jeter5.txt _jeter5.txt |
  awk -F '\t' '$3 == $29 && int($2) < int($28)' |
  cut -f 1,2,3,20,26,28,46,52 > _jeter6.txt
 * 1) join to self using key= "gene name"
 * 2) only keep if first mutation in same gene/chromosome and pos1< pos2
 * 3) keep some columns

# Print the distinct values of the first column of _jeter6.txt
# (cut splits on TAB by default).
cut -f 1 _jeter6.txt |
  sort |
  uniq
 * 1) extract gene names

# Delete the intermediate files _jeter1.txt to _jeter5.txt; _jeter6.txt is
# not matched by the pattern and is left in place.
rm -- _jeter[12345].txt

Slideshow
 Analyzing Exome Data with KNIME  View more presentations from Pierre Lindenbaum

SNP_diff
# Select candidate SNP_diff rows from the pooled exome annotation, keep the
# first 27 columns and sort on column 20 -> _jeter1.txt.
# Stages are chained with a trailing '|', so no backslash continuation is
# needed. awk receives '\t' and cut uses its default TAB delimiter, so no
# literal tab character has to survive copy/paste.
tab=$(printf '\t')   # sort(1) needs a real TAB character for -t
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
  awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    $20 != "" &&
    index($19, "douteux") == 0 &&
    index($19, "_diff") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
  ' |
  cut -f 1-27 |
  sort -t "$tab" -k20,20 > _jeter1.txt
 * 1) remove rs
 * 2) in gene
 * 3) remove the low qualities
 * 4) keep SNP_diff
 * 5) only the non-synonymous or stop
 * 6) remove DNA & prot sequences
 * 7) order by GENE

# From the u2437 annotation file keep the rows that are not "rs", not flagged
# "douteux", do not contain "_diff" in column 19 and have a non-empty
# column 20; keep the first 27 columns and sort on column 20 -> _jeter3.txt.
# Stages are chained with a trailing '|', so no backslash continuation is
# needed. awk receives '\t' and cut uses its default TAB delimiter, so no
# literal tab character has to survive copy/paste.
tab=$(printf '\t')   # sort(1) needs a real TAB character for -t
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
  awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    index($19, "douteux") == 0 &&
    index($19, "_diff") == 0 &&
    $20 != ""
  ' |
  cut -f 1-27 |
  sort -t "$tab" -k20,20 > _jeter3.txt
 * 1) extract wild exome
 * 2) remove rs
 * 3) remove the low qualities
 * 4) remove SNP_diff
 * 5) in gene
 * 6) remove DNA & prot sequences
 * 7) order by gene

# Join _jeter1.txt and _jeter3.txt on column 20. join prints the key first,
# then the remaining columns of each row; with 27-column rows, $2/$3 are
# columns 1/2 of the _jeter1.txt row and $28/$29 are columns 1/2 of the
# _jeter3.txt row. Pairs with equal $3/$29 and equal $2/$28 are kept, and
# the distinct join keys are printed.
tab=$(printf '\t')   # join(1) needs a real TAB character for -t
join -t "$tab" -1 20 -2 20 _jeter1.txt _jeter3.txt |
  awk -F '\t' '$3 == $29 && int($2) == int($28)' |
  cut -f 1 |
  sort |
  uniq
 * 1) join wild & mutated data by gene
 * 2) check wild sample has no mutation in the pair of mutated snps
 * 3) remove wild data

# Delete every _jeter*.txt file in the current directory.
rm -- _jeter*.txt