User:Lindenb/Notebook/UMR915/20110510
From OpenWetWare

Integragen
Columns:
1 Position.hg19
2 chrom
3 sample.ID
4 rs.name
5 hapmap_ref_other
6 X1000Genome.obs
7 X1000Genome.desc
8 Freq.HTZ.ExomesV1
9 Freq.Hom.ExomesV1
10 A
11 C
12 G
13 T
14 modified_call
15 total
16 used
17 score
18 reference
19 type
20 Gene.name
21 Gene.start
22 Gene.end
23 strand
24 nbre.exon
25 refseq
26 typeannot
27 type.pos
28 index.cdna
29 index.prot
30 Taille.cdna
31 Intron.start
32 Intron.end
33 codon.wild
34 aa.wild
35 codon.mut
36 aa.mut
37 cds.wild
38 cds.mut
39 prot.wild
40 prot.mut
41 mirna
42 region.splice
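Problem with sort/join: join only gives correct results when both inputs are sorted on the join field with the same ordering rules that join itself applies (same locale, same case handling). See:
http://unix.stackexchange.com/questions/12942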
Script "heterozygote 2x": genes carrying two distinct heterozygous mutations in the pool
# Purpose: list the genes in which the pooled exome carries two distinct
# heterozygous, protein-altering SNPs without an rs identifier, compared
# against the u2437 ("wild") exome.
#
# Column numbers refer to the annotation file layout: 1=Position.hg19,
# 2=chrom, 4=rs.name, 19=type, 20=Gene.name, 26=typeannot.
#
# The input is handled as TAB-separated. With a blank as separator awk would
# split on runs of whitespace, empty columns would collapse and every column
# number below would shift.
# sort and join run under LC_ALL=C so that both use the same ordering rules
# (see the sort/join problem linked above this script).
TAB=$(printf '\t')

# Step 1: select the candidate SNPs of the pool, ordered by gene name.
#  - drop SNPs that already have an rs name (this also drops a header line,
#    if present, because the header of column 4 is "rs.name")
#  - drop rows without a gene name: they cannot be paired by gene
#  - drop low-quality calls (type contains "douteux", French for "doubtful")
#  - keep only heterozygous calls (type contains "_het")
#  - keep only nonsense or missense annotations
#  - keep columns 1-27 only, i.e. drop the DNA and protein sequences
# NOTE(review): the original notes also listed "remove SNP_het*", but no filter
# implements it; the "_het" test matches starred types as well. Confirm whether
# starred calls should be excluded.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
awk -F '\t' '
	substr($4, 1, 2) == "rs"   { next }
	$20 == ""                  { next }
	index($19, "douteux") != 0 { next }
	index($19, "_het") == 0    { next }
	index($26, "nonsense") == 0 && index($26, "missense") == 0 { next }
	{ print }
' |
cut -f 1-27 |
LC_ALL=C sort -f -t "$TAB" -k20,20 > _jeter1.txt

# Step 2: join the candidates to themselves on the gene name.
# join prints the key first, so in the joined rows:
#   1=gene, 2=position#1, 3=chrom#1, 20=type#1, 26=typeannot#1,
#   28=position#2, 29=chrom#2, 46=type#2, 52=typeannot#2
# Keep a pair only when both SNPs are on the same chromosome and
# position#1 < position#2 (each unordered pair once, no SNP paired with itself).
# Output columns: gene, pos1, chrom, type1, typeannot1, pos2, type2, typeannot2.
LC_ALL=C join -i -t "$TAB" -1 20 -2 20 _jeter1.txt _jeter1.txt |
awk -F '\t' '$3 == $29 && int($2) < int($28)' |
cut -f 1,2,3,20,26,28,46,52 > _jeter2.txt

# Step 3: position, chromosome and gene name of every record of the wild
# exome, ordered by gene name (column 3 after the cut).
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
cut -f 1,2,20 |
LC_ALL=C sort -f -t "$TAB" -k3,3 > _jeter3.txt

# Step 4: join the pairs with the wild records of the same gene.
# Joined rows: 1-8 = the pair (see step 2), 9=wild position, 10=wild chrom.
# Keep rows where the wild record is on the same chromosome and at neither
# position of the pair, then drop the wild columns.
# NOTE(review): a pair is kept as soon as ONE wild record of the gene lies at
# another position; this does not prove that the wild sample has no record at
# pos1/pos2. Genes without any wild record are dropped by the join. Confirm
# that this is the intended test.
LC_ALL=C join -i -t "$TAB" -1 1 -2 3 _jeter2.txt _jeter3.txt |
awk -F '\t' '$3 == $10 && int($9) != int($2) && int($9) != int($6)' |
cut -f 1-8 > _jeter4.txt

# Step 5: print the distinct gene names.
cut -f 1 _jeter4.txt | sort -u

# Remove only the temporary files created above.
rm _jeter1.txt _jeter2.txt _jeter3.txt _jeter4.txt
Script "one mutation diff": genes with one mutation of type SNP_diff in the pool
# Purpose: list the genes in which the pooled exome has a protein-altering
# "_diff" SNP without an rs identifier, at a position where the u2437 ("wild")
# exome also has a record that is not of type "_diff".
#
# Column numbers refer to the annotation file layout: 1=Position.hg19,
# 2=chrom, 4=rs.name, 19=type, 20=Gene.name, 26=typeannot.
#
# The input is handled as TAB-separated. With a blank as separator awk would
# split on runs of whitespace, empty columns would collapse and the test on an
# empty gene name ($20) could never work.
# sort and join run under LC_ALL=C so that both use the same ordering rules.
TAB=$(printf '\t')

# Step 1: select the candidate SNPs of the pool, ordered by gene name.
#  - drop SNPs that already have an rs name (this also drops a header line,
#    if present, because the header of column 4 is "rs.name")
#  - keep only SNPs located in a gene (non-empty gene name)
#  - drop low-quality calls (type contains "douteux", French for "doubtful")
#  - keep only calls whose type contains "_diff"
#  - keep only nonsense or missense annotations
#  - keep columns 1-27 only, i.e. drop the DNA and protein sequences
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
awk -F '\t' '
	substr($4, 1, 2) == "rs"   { next }
	$20 == ""                  { next }
	index($19, "douteux") != 0 { next }
	index($19, "_diff") == 0   { next }
	index($26, "nonsense") == 0 && index($26, "missense") == 0 { next }
	{ print }
' |
cut -f 1-27 |
LC_ALL=C sort -f -t "$TAB" -k20,20 > _jeter1.txt

# Step 2: select the records of the wild exome, ordered by gene name.
#  - drop SNPs that already have an rs name
#  - drop calls whose type contains "_diff"
#  - keep only records located in a gene (non-empty gene name)
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
awk -F '\t' '
	substr($4, 1, 2) == "rs" { next }
	index($19, "_diff") != 0 { next }
	$20 == ""                { next }
	{ print }
' |
cut -f 1-27 |
LC_ALL=C sort -f -t "$TAB" -k20,20 > _jeter3.txt

# Step 3: join pool and wild records on the gene name.
# join prints the key first, so in the joined rows:
#   1=gene, 2=pool position, 3=pool chrom, 28=wild position, 29=wild chrom
# Keep the rows where both records are on the same chromosome at the same
# position, then print the distinct gene names.
# NOTE(review): the original notes for this step said "check wild sample has no
# mutation in the pair of mutated snps"; the filter actually requires a wild
# record AT the same position. Confirm which of the two is intended.
LC_ALL=C join -i -t "$TAB" -1 20 -2 20 _jeter1.txt _jeter3.txt |
awk -F '\t' '$3 == $29 && int($2) == int($28)' |
cut -f 1 |
sort -u

# Remove only the temporary files created above.
rm _jeter1.txt _jeter3.txt