User:Lindenb/Notebook/UMR915/20110511
From OpenWetWare
Integragen
Analysis of SNP_het (compound heterozygotes)
# Compound-heterozygote screen on the pool exome.
#
# Lists the genes that carry at least two SNP_het calls (no "rs" identifier,
# not flagged "douteux", annotated missense or nonsense) at positions that
# do not appear in the u2437 ("wild") exome.
#
# Inputs : AllChrom.exome.snp.pool.new.annotation.gz
#          AllChrom.exome.snp.u2437.new.annotation.gz   (tab-separated)
# Output : distinct gene names on stdout.
#          _jeter6.txt (the retained SNP pairs) is left in the working directory;
#          _jeter1.txt .. _jeter5.txt are scratch files and are removed.
#
# Columns used: $1 and $2 build the chrom_position key, $4 is tested for an
# "rs" prefix, $19 holds the call tag ("_het", "douteux"), $20 is the gene
# name used as join key, $26 holds the effect ("missense"/"nonsense").
#
# The tab delimiter is spelled explicitly ('\t' / $TAB) instead of a literal
# tab so the script survives copy/paste, and sort/comm/join are all run with
# LC_ALL=C so that they agree on the collation order.
#
# NOTE(review): the original notes also list "remove SNP_het*", but no filter
# in this pipeline implements it - confirm whether one is needed.

TAB=$(printf '\t')

# Pool exome: apply all row filters in one pass, drop the DNA & protein
# sequence columns (keep 1-27), prefix each row with its chrom_position key
# and sort on that key.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    index($19, "douteux") == 0 &&
    index($19, "_het") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
' |
cut -f 1-27 |
awk -F '\t' '{ printf("%s_%s\t%s\n", $2, $1, $0); }' |
LC_ALL=C sort -t "$TAB" -k1,1 > _jeter1.txt

# All distinct chrom_position keys found in the pool.
cut -f 1 _jeter1.txt | LC_ALL=C sort -u > _jeter2.txt

# All distinct chrom_position keys found in the wild exome.
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
awk -F '\t' '{ printf("%s_%s\n", $2, $1); }' |
LC_ALL=C sort -u > _jeter3.txt

# Keys present in the pool but not in the wild exome.
LC_ALL=C comm -23 _jeter2.txt _jeter3.txt > _jeter4.txt

# Keep only the pool rows at those positions, drop the key column again and
# order by gene. The sort key is restricted to field 20 (-k20,20) because the
# self-join below requires its input to be ordered on exactly that field.
LC_ALL=C join -t "$TAB" --check-order -1 1 -2 1 _jeter1.txt _jeter4.txt |
cut -f 2- |
LC_ALL=C sort -t "$TAB" -k20,20 > _jeter5.txt

# Self-join on the gene name. In the joined row the gene is field 1, fields
# 2-27 come from the first SNP and fields 28-53 from the second. Keep each
# unordered pair once: same value in $3/$29 and first position < second.
LC_ALL=C join -t "$TAB" -j 20 _jeter5.txt _jeter5.txt |
awk -F '\t' '$3 == $29 && int($2) < int($28)' |
cut -f 1,2,3,20,26,28,46,52 > _jeter6.txt

# Distinct gene names having at least one such pair.
cut -f 1 _jeter6.txt | sort | uniq

rm _jeter[12345].txt
Knime equivalent
Slideshow
<html><!-- SlideShare embed of the KNIME version of the SNP_het workflow.
     All URLs use https so the iframe is not blocked as mixed content when
     this page itself is served over HTTPS. -->
<div style="width:425px" id="__ss_7923093">
  <strong style="display:block;margin:12px 0 4px"><a href="https://www.slideshare.net/lindenb/analyzing-exome-data-with-knime" title="Analyzing Exome Data with KNIME">Analyzing Exome Data with KNIME</a></strong>
  <iframe src="https://www.slideshare.net/slideshow/embed_code/7923093" title="Analyzing Exome Data with KNIME" width="425" height="355" frameborder="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>
  <div style="padding:5px 0 12px"> View more <a href="https://www.slideshare.net/">presentations</a> from <a href="https://www.slideshare.net/lindenb">Pierre Lindenbaum</a> </div>
</div></html>
SNP_diff
# SNP_diff screen.
#
# Prints the distinct gene names for which the pool exome has a SNP_diff call
# (no "rs" identifier, inside a gene, not flagged "douteux", annotated
# missense or nonsense) and the u2437 ("wild") exome has a non-SNP_diff call
# at the same chromosome/position in the same gene.
#
# Inputs : AllChrom.exome.snp.pool.new.annotation.gz
#          AllChrom.exome.snp.u2437.new.annotation.gz   (tab-separated)
# Output : gene names on stdout; scratch files _jeter1.txt and _jeter3.txt
#          are removed at the end.
#
# Columns used: $4 is tested for an "rs" prefix, $19 holds the call tag
# ("_diff", "douteux"), $20 is the gene name used as join key, $26 holds the
# effect ("missense"/"nonsense").
#
# The tab delimiter is spelled explicitly ('\t' / $TAB) instead of a literal
# tab so the script survives copy/paste, and sort/join are run with LC_ALL=C
# so that they agree on the collation order.

TAB=$(printf '\t')

# Pool exome: filter rows, drop the DNA & protein sequence columns (keep
# 1-27) and order by gene.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    $20 != "" &&
    index($19, "douteux") == 0 &&
    index($19, "_diff") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
' |
cut -f 1-27 |
LC_ALL=C sort -t "$TAB" -k20,20 > _jeter1.txt

# Wild exome: same "rs"/quality/gene filters, but keep only rows that are NOT
# SNP_diff; order by gene.
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    index($19, "douteux") == 0 &&
    index($19, "_diff") == 0 &&
    $20 != ""
' |
cut -f 1-27 |
LC_ALL=C sort -t "$TAB" -k20,20 > _jeter3.txt

# Join pool and wild rows by gene. In the joined row the gene is field 1,
# fields 2-27 come from the pool row and fields 28-53 from the wild row.
# Keep rows where both samples have the same $3/$29 and the same position,
# then keep only the gene name.
# NOTE(review): the original note here reads "check wild sample has no
# mutation in the pair of mutated snps"; the filter as written keeps genes
# where the wild sample HAS a (non-SNP_diff) row at the same position -
# confirm this is the intended test.
LC_ALL=C join -t "$TAB" -1 20 -2 20 _jeter1.txt _jeter3.txt |
awk -F '\t' '$3 == $29 && int($2) == int($28)' |
cut -f 1 |
sort | uniq

# Remove only the scratch files created above; a "_jeter*.txt" glob would
# also delete unrelated results such as the _jeter6.txt kept by the SNP_het
# analysis.
rm _jeter1.txt _jeter3.txt