User:Lindenb/Notebook/UMR915/20110510

Integragen
Columns:    1  Position.hg19 2 chrom 3 sample.ID     4  rs.name 5 hapmap_ref_other 6 X1000Genome.obs 7 X1000Genome.desc 8 Freq.HTZ.ExomesV1 9 Freq.Hom.ExomesV1 10 A    11  C    12  G    13  T    14  modified_call 15 total 16 used 17 score 18 reference 19 type 20 Gene.name 21 Gene.start 22 Gene.end 23 strand 24 nbre.exon 25 refseq 26 typeannot 27 type.pos 28 index.cdna 29 index.prot 30 Taille.cdna 31 Intron.start 32 Intron.end 33 codon.wild 34 aa.wild 35 codon.mut 36 aa.mut 37 cds.wild 38 cds.mut 39 prot.wild 40 prot.mut 41 mirna 42 region.splice

problem with sort/join
http://unix.stackexchange.com/questions/12942

script heterozygote 2x
# Pool exome -> _jeter1.txt: candidate heterozygous SNPs, sorted by gene name.
# One awk pass over the tab-separated annotation keeps a row only when:
#   $4  (rs.name)   does not start with "rs"
#   $19 (type)      does not contain "douteux" and does contain "_het"
#   $26 (typeannot) contains "nonsense" or "missense"
# cut keeps columns 1-27 (tab is cut's default delimiter).
# LC_ALL=C: byte-wise ordering, so `sort -f` agrees with the comparison done
# by `join -i` later on (locale collation is the sort/join problem noted in
# this notebook).
# The original `|\ awk` on one line escaped the space and made the shell look
# for a command named " awk"; real line continuations are used instead.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz \
  | awk -F '\t' 'substr($4, 1, 2) != "rs" && index($19, "douteux") == 0 && index($19, "_het") != 0 && (index($26, "nonsense") != 0 || index($26, "missense") != 0)' \
  | cut -f 1-27 \
  | LC_ALL=C sort -f -t "$(printf '\t')" -k20,20 > _jeter1.txt
 * 1) remove rs
 * 2) only keep the 'SNP_het'
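 * 3) remove the low-quality calls (type contains 'douteux')
 * 4) remove SNP_het* (NOTE: the command above has no such step, and it would contradict step 2)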
 * 5) only the non-synonymous or stop
 * 6) remove DNA & prot sequences
 * 7) order by GENE

# _jeter1.txt x _jeter1.txt -> _jeter2.txt: pairs of SNPs located in the same gene.
# Self-join on the gene name (column 20, case-insensitive). With 27-column
# input the joined row is: $1 gene, $2..$27 the other fields of the first SNP
# ($2 position, $3 chrom), $28..$53 the other fields of the second SNP
# ($28 position, $29 chrom).
# awk keeps a pair only when both SNPs are on the same chromosome and
# pos1 < pos2, so each unordered pair appears once and a SNP is never paired
# with itself.
# cut keeps: gene, pos1, chrom1, type1, typeannot1, pos2, type2, typeannot2.
# The original `|\ awk` / `|\ cut` on one line escaped the space and broke the
# pipeline; real line continuations are used instead.
LC_ALL=C join -i -t "$(printf '\t')" -j 20 _jeter1.txt _jeter1.txt \
  | awk -F '\t' '$3 == $29 && int($2) < int($28)' \
  | cut -f 1,2,3,20,26,28,46,52 > _jeter2.txt
 * 1) join to self using key= "gene name"
 * 2) only keep the pairs where both mutations are on the same chromosome and pos1 < pos2 (each pair is listed once)
 * 3) keep some columns

# Wild exome (u2437) -> _jeter3.txt: position, chrom, gene name
# (columns 1, 2 and 20), sorted by gene name (third output column).
# LC_ALL=C: byte-wise ordering, so `sort -f` agrees with the comparison done
# by `join -i` (locale collation is the sort/join problem noted in this notebook).
# The original `|\ cut` / `|\ sort` on one line escaped the space and broke
# the pipeline; real line continuations are used instead.
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz \
  | cut -f 1,2,20 \
  | LC_ALL=C sort -f -t "$(printf '\t')" -k 3,3 > _jeter3.txt
 * 1) extract wild exome
 * 2) keep position,chrom,gene
 * 3) order by gene

# _jeter2.txt x _jeter3.txt -> _jeter4.txt: SNP pairs compared with the wild exome.
# Join on gene name (column 1 of the pairs, column 3 of the wild file).
# Joined row: $1 gene, $2 pos1, $3 chrom, $4..$5 type/typeannot of SNP 1,
# $6 pos2, $7..$8 type/typeannot of SNP 2, $9 wild position, $10 wild chrom.
# awk keeps a row when the wild variant is on the same chromosome and at
# neither pos1 nor pos2; cut then drops the two wild columns.
# NOTE(review): this is an inner join, so a gene with no variant at all in the
# wild file yields no row and is lost, and a pair is kept as soon as ANY other
# wild variant exists in the gene, even if the wild sample also carries pos1
# or pos2. Confirm this is the intended "wild sample has no mutation" check.
# The original `|\ awk` / `|\ cut` on one line escaped the space and broke the
# pipeline; real line continuations are used instead.
LC_ALL=C join -i -t "$(printf '\t')" -1 1 -2 3 _jeter2.txt _jeter3.txt \
  | awk -F '\t' '$3 == $10 && int($9) != int($2) && int($9) != int($6)' \
  | cut -f 1-8 > _jeter4.txt
 * 1) join wild & mutated data by gene
 * 2) keep the rows where the wild-sample variant is on the same chromosome but at a position different from both mutated SNPs
 * 3) remove wild data

# Print the distinct gene names (first tab-separated column) of _jeter4.txt.
# Tab is cut's default delimiter.
cut -f 1 _jeter4.txt | sort -u
 * 1) extract gene names

# Delete the intermediate _jeter*.txt files written by the commands above.
rm -- _jeter*.txt

script one mutation diff
# Pool exome -> _jeter1.txt: candidate SNP_diff mutations, sorted by gene name.
# One awk pass over the tab-separated annotation keeps a row only when:
#   $4  (rs.name)   does not start with "rs"
#   $20 (Gene.name) is not empty
#   $19 (type)      does not contain "douteux" and does contain "_diff"
#   $26 (typeannot) contains "nonsense" or "missense"
# cut keeps columns 1-27 (tab is cut's default delimiter).
# LC_ALL=C: byte-wise ordering, so `sort -f` agrees with the comparison done
# by `join -i` later on (locale collation is the sort/join problem noted in
# this notebook).
# The original `|\ awk` on one line escaped the space and made the shell look
# for a command named " awk"; real line continuations are used instead.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz \
  | awk -F '\t' 'substr($4, 1, 2) != "rs" && $20 != "" && index($19, "douteux") == 0 && index($19, "_diff") != 0 && (index($26, "nonsense") != 0 || index($26, "missense") != 0)' \
  | cut -f 1-27 \
  | LC_ALL=C sort -f -t "$(printf '\t')" -k20,20 > _jeter1.txt
 * 1) remove rs
 * 2) in gene
 * 3) remove the low qualities
 * 4) keep SNP_diff
 * 5) only the non-synonymous or stop
 * 6) remove DNA & prot sequences
 * 7) order by GENE

# Wild exome (u2437) -> _jeter3.txt: variants that are not SNP_diff, sorted by gene name.
# One awk pass keeps a row only when:
#   $4  (rs.name)   does not start with "rs"
#   $19 (type)      does not contain "_diff"
#   $20 (Gene.name) is not empty
# cut keeps columns 1-27 (tab is cut's default delimiter).
# LC_ALL=C: byte-wise ordering, so `sort -f` agrees with the comparison done
# by `join -i` (locale collation is the sort/join problem noted in this notebook).
# The original `|\ awk` on one line escaped the space and made the shell look
# for a command named " awk"; real line continuations are used instead.
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz \
  | awk -F '\t' 'substr($4, 1, 2) != "rs" && index($19, "_diff") == 0 && $20 != ""' \
  | cut -f 1-27 \
  | LC_ALL=C sort -f -t "$(printf '\t')" -k20,20 > _jeter3.txt
 * 1) extract wild exome
 * 2) remove rs
 * 3) remove SNP_diff
 * 4) in gene
 * 5) order by gene

# _jeter1.txt x _jeter3.txt -> distinct gene names on stdout.
# Join pool and wild rows on the gene name (column 20 of both files,
# case-insensitive). With 27-column inputs the joined row is: $1 gene,
# $2 pool position, $3 pool chrom, ..., $28 wild position, $29 wild chrom.
# awk keeps the rows where the wild variant is on the same chromosome and at
# the same position as the pool variant; cut/sort then list each gene once.
# The original `|\ awk` / `|\ cut` / `|\ sort` on one line escaped the space
# and broke the pipeline; real line continuations are used instead.
LC_ALL=C join -i -t "$(printf '\t')" -1 20 -2 20 _jeter1.txt _jeter3.txt \
  | awk -F '\t' '$3 == $29 && int($2) == int($28)' \
  | cut -f 1 \
  | sort -u
 * 1) join wild & mutated data by gene
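 * 2) keep the rows where the wild sample has a (non SNP_diff) variant on the same chromosome and at the same position as the mutated SNP
 * 3) remove wild data: only keep the gene names, each listed once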

# Delete the intermediate _jeter*.txt files written by the commands above.
rm -- _jeter*.txt