User:Lindenb/Notebook/UMR915/20110511

From OpenWetWare
Jump to navigation · Jump to search

20110510        Top        20110517       


Integragen

Analysis of SNP_het (compound heterozygous variants)

# Build _jeter1.txt from the pool exome annotation.
# A row is dropped when any of these holds:
#   - column 4 starts with "rs"
#   - column 19 contains "douteux" (the low-quality marker)
#   - column 19 does not contain "_het"
#   - column 26 contains neither "nonsense" nor "missense"
# Every surviving row gets a leading chrom_position key ("<col 2>_<col 1>").
# The row is then truncated to that key plus its first 27 original columns
# (this removes the DNA & protein sequence columns) and sorted on the key.
# NOTE(review): the original notes also listed "remove SNP_het*"; no stage
# here does that -- confirm whether such a filter is still wanted.
tab=$(printf '\t')
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz \
  | awk -F '\t' '
      substr($4, 1, 2) == "rs"   { next }
      index($19, "douteux") != 0 { next }
      index($19, "_het") == 0    { next }
      index($26, "nonsense") == 0 && index($26, "missense") == 0 { next }
      { print $2 "_" $1 "\t" $0 }
    ' \
  | cut -d "$tab" -f 1-28 \
  | sort -t "$tab" -k1,1 > _jeter1.txt

# Write every distinct chrom_position key (column 1 of _jeter1.txt),
# sorted, to _jeter2.txt.
tab=$(printf '\t')
awk -F '\t' '{ print $1 }' _jeter1.txt \
  | sort -t "$tab" -k1,1 \
  | uniq > _jeter2.txt

# Build _jeter3.txt: the sorted, distinct chrom_position keys
# ("<col 2>_<col 1>") of every row in the wild exome (u2437) annotation.
# Only the first two columns of each row are used.
tab=$(printf '\t')
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz \
  | awk -F '\t' '{ print $2 "_" $1 }' \
  | sort -t "$tab" -k 1,1 \
  | uniq > _jeter3.txt

# Keep the keys found only in the mutated-pool set (_jeter2.txt):
# -2 suppresses lines unique to the wild set (_jeter3.txt),
# -3 suppresses lines common to both files.
comm -23 _jeter2.txt _jeter3.txt > _jeter4.txt


# Build _jeter5.txt: the pool rows whose chrom_position key is listed in
# _jeter4.txt (keys absent from the wild exome), ordered by gene.
#   1. join _jeter1.txt with _jeter4.txt on the key in column 1 of each;
#      --check-order makes join fail loudly if either input is unsorted.
#   2. drop the key column, leaving the original annotation columns.
#   3. sort on column 20 (the gene name) ONLY.
# The sort key is bounded as -k20,20 on purpose. An open-ended "-k 20" sorts
# on everything from column 20 to the end of the line. In non-C locales that
# order can differ from the column-20-only order that a later "join -j 20"
# on this file requires, and join then silently drops pairs.
tab=$(printf '\t')
join -t "$tab" --check-order -1 1 -2 1 _jeter1.txt _jeter4.txt \
  | cut -d "$tab" -f 2- \
  | sort -t "$tab" -k20,20 > _jeter5.txt


# Pair every row of _jeter5.txt with every other row sharing the same gene
# (column 20), writing the pairs to _jeter6.txt.
# join puts the shared gene in output column 1, then the remaining columns
# of the first row, then the remaining columns of the second row.
# The offsets below assume every row of _jeter5.txt has exactly 27 columns
# -- TODO confirm. Under that assumption:
#   $2 / $3   = columns 1 / 2 of the first row  (position / chromosome)
#   $28 / $29 = columns 1 / 2 of the second row
# A pair is kept when both rows have the same column 2 and the first row's
# column 1 is numerically smaller, so each unordered pair appears once and
# a row is never paired with itself.
# Columns written: gene; first row's columns 1, 2, 19, 26; second row's
# columns 1, 19, 26.
tab=$(printf '\t')
join -t "$tab" -j 20 _jeter5.txt _jeter5.txt \
  | awk -F '\t' '$3 == $29 && int($2) < int($28)' \
  | cut -d "$tab" -f 1,2,3,20,26,28,46,52 > _jeter6.txt

# Print the distinct gene names (column 1 of _jeter6.txt) to stdout.
awk -F '\t' '{ print $1 }' _jeter6.txt | sort | uniq


# Remove intermediate files 1-5; _jeter6.txt is not matched and stays.
rm _jeter[1-5].txt

Knime equivalent

Slideshow

<html><div style="width:425px" id="__ss_7923093"> <strong style="display:block;margin:12px 0 4px"><a href="http://www.slideshare.net/lindenb/analyzing-exome-data-with-knime" title="Analyzing Exome Data with KNIME">Analyzing Exome Data with KNIME</a></strong> <iframe src="http://www.slideshare.net/slideshow/embed_code/7923093" width="425" height="355" frameborder="0" marginwidth="0" marginheight="0" scrolling="no"></iframe> <div style="padding:5px 0 12px"> View more <a href="http://www.slideshare.net/">presentations</a> from <a href="http://www.slideshare.net/lindenb">Pierre Lindenbaum</a> </div> </div></html>

SNP_diff

# Build _jeter1.txt from the pool exome annotation for the SNP_diff analysis.
# A row is kept only when ALL of these hold:
#   - column 4 does not start with "rs"
#   - column 20 (gene) is not empty
#   - column 19 does not contain "douteux" (the low-quality marker)
#   - column 19 contains "_diff"
#   - column 26 contains "nonsense" or "missense"
# Kept rows are truncated to their first 27 columns (this removes the DNA &
# protein sequence columns) and sorted on column 20 (gene).
tab=$(printf '\t')
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz \
  | awk -F '\t' '
      substr($4, 1, 2) != "rs" &&
      $20 != "" &&
      index($19, "douteux") == 0 &&
      index($19, "_diff") != 0 &&
      (index($26, "nonsense") != 0 || index($26, "missense") != 0) { print }
    ' \
  | cut -d "$tab" -f 1-27 \
  | sort -t "$tab" -k20,20 > _jeter1.txt

# Build _jeter3.txt from the wild exome (u2437) annotation.
# A row is dropped when any of these holds:
#   - column 4 starts with "rs"
#   - column 19 contains "douteux" (the low-quality marker)
#   - column 19 contains "_diff"
#   - column 20 (gene) is empty
# Remaining rows are truncated to their first 27 columns and sorted on
# column 20 (gene).
tab=$(printf '\t')
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz \
  | awk -F '\t' '
      substr($4, 1, 2) == "rs"   { next }
      index($19, "douteux") != 0 { next }
      index($19, "_diff") != 0   { next }
      $20 == ""                  { next }
      { print }
    ' \
  | cut -d "$tab" -f 1-27 \
  | sort -t "$tab" -k20,20 > _jeter3.txt

# Join the pool rows (_jeter1.txt) with the wild rows (_jeter3.txt) on the
# gene name in column 20 of each file.
# join puts the gene in output column 1, then the remaining pool columns,
# then the remaining wild columns. Assuming every pool row has exactly
# 27 columns (TODO confirm), $2/$3 are the pool row's columns 1/2 and
# $28/$29 are the wild row's columns 1/2.
# A pair is kept when both rows have the same column 2 and numerically
# equal column 1, i.e. the same chromosome and position. Only the gene name
# of each kept pair is printed, sorted and de-duplicated, on stdout.
# NOTE(review): the original note read "check wild sample has no mutation in
# the pair of mutated snps", yet this prints genes where a wild row DOES sit
# at the same site as a pool row -- confirm the intended direction.
tab=$(printf '\t')
join -t "$tab" -1 20 -2 20 _jeter1.txt _jeter3.txt \
  | awk -F '\t' '$3 == $29 && int($2) == int($28) { print $1 }' \
  | sort \
  | uniq


# Remove every _jeter*.txt file in the current directory.
rm _jeter*.txt