User:Lindenb/Notebook/UMR915/20110510

From OpenWetWare
Jump to: navigation, search
Owwnotebook icon.png

20110427‎        Top        20110511       


Integragen

Columns:
     1  Position.hg19
     2  chrom
     3  sample.ID
     4  rs.name
     5  hapmap_ref_other
     6  X1000Genome.obs
     7  X1000Genome.desc
     8  Freq.HTZ.ExomesV1
     9  Freq.Hom.ExomesV1
    10  A
    11  C
    12  G
    13  T
    14  modified_call
    15  total
    16  used
    17  score
    18  reference
    19  type
    20  Gene.name
    21  Gene.start
    22  Gene.end
    23  strand
    24  nbre.exon
    25  refseq
    26  typeannot
    27  type.pos
    28  index.cdna
    29  index.prot
    30  Taille.cdna
    31  Intron.start
    32  Intron.end
    33  codon.wild
    34  aa.wild
    35  codon.mut
    36  aa.mut
    37  cds.wild
    38  cds.mut
    39  prot.wild
    40  prot.mut
    41  mirna
    42  region.splice

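Problem with sort/join: join only works when both inputs are sorted on the join field with the same collation rules (locale, case folding) that join itself uses. See: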

http://unix.stackexchange.com/questions/12942

Script "heterozygote 2x": genes carrying two heterozygous SNPs in the pooled exome

# Script "heterozygote 2x"
#
# Prints the names of the genes in which the pooled exome
# (AllChrom.exome.snp.pool.new.annotation.gz) carries at least two distinct
# heterozygous missense/nonsense SNPs on the same chromosome, neither of which
# is called at the same chromosome/position in the wild exome
# (AllChrom.exome.snp.u2437.new.annotation.gz).
#
# Column numbers refer to the tab-delimited annotation layout:
#   1=Position.hg19  2=chrom  4=rs.name  19=type  20=Gene.name  26=typeannot
#
# Temporary files _jeter1.txt .. _jeter4.txt are created in the current
# directory and removed at the end.

# A literal tab inside quotes is easily destroyed by copy/paste, so it is built
# here once and reused by cut/sort/join (awk understands '\t' by itself).
TAB=$(printf '\t')

# Step 1: candidate SNPs of the pooled exome, ordered by gene name.
#   - rs.name must not start with "rs"
#   - Gene.name must not be empty (an empty key would cross-join every
#     gene-less row with every other one and yield an empty "gene")
#   - type must not contain "douteux" (low quality)
#   - type must contain "_het"
#   - typeannot must contain "nonsense" or "missense"
#   - only columns 1-27 are kept (drops the DNA & protein sequences)
# NOTE(review): the original notes also said "remove SNP_het*", but no such
# filter was ever applied; every type containing "_het" is kept - confirm.
# LC_ALL=C makes sort use the same byte-wise, case-folded order that
# "join -i" expects; with another locale the two tools can disagree.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    $20 != "" &&
    index($19, "douteux") == 0 &&
    index($19, "_het") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
' |
cut -d "$TAB" -f 1-27 |
LC_ALL=C sort -f -t "$TAB" -k 20,20 > _jeter1.txt

# Step 2: join the candidates to themselves on the gene name.
# join output layout: 1=gene, 2-27=left row without its gene column,
# 28-53=right row without its gene column; hence 2/28=position, 3/29=chrom,
# 20/46=type, 26/52=typeannot.
# Keep a pair only when both SNPs are on the same chromosome and pos1 < pos2
# (this removes self-pairs and mirrored duplicates).
# Result columns: gene, pos1, chrom, type1, typeannot1, pos2, type2, typeannot2
LC_ALL=C join -i -t "$TAB" -j 20 _jeter1.txt _jeter1.txt |
awk -F '\t' '$3 == $29 && int($2) < int($28)' |
cut -d "$TAB" -f 1,2,3,20,26,28,46,52 > _jeter2.txt

# Step 3: every position called in the wild exome (position, chrom).
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
cut -d "$TAB" -f 1,2 > _jeter3.txt

# Step 4: keep a pair only if the wild exome has NO call at either position.
# The previous gene-based inner join kept a pair as soon as the wild sample had
# any *other* variant in the gene, and lost the pair when the wild sample had
# no variant in that gene at all. Here the wild positions are loaded into a
# set and used as an anti-join instead.
# "pass=N" operands are awk variable assignments evaluated when awk reaches
# them in the argument list, so this also works when _jeter3.txt is empty.
awk -F '\t' '
    pass == 1 { wild[$2 FS int($1)] = 1; next }
    !(($3 FS int($2)) in wild) && !(($3 FS int($6)) in wild)
' pass=1 _jeter3.txt pass=2 _jeter2.txt > _jeter4.txt

# Step 5: distinct gene names.
cut -d "$TAB" -f 1 _jeter4.txt | sort -u

# Remove only the files created above (a glob could hit unrelated files).
rm -f _jeter1.txt _jeter2.txt _jeter3.txt _jeter4.txt

Script "one mutation diff": genes where a pooled 'SNP_diff' call coincides with a wild-exome call at the same position

# Script "one mutation diff"
#
# Prints the names of the genes in which a missense/nonsense call of type
# "*_diff" in the pooled exome (AllChrom.exome.snp.pool.new.annotation.gz)
# sits at the same chromosome and position as a call whose type is not
# "*_diff" in the wild exome (AllChrom.exome.snp.u2437.new.annotation.gz).
#
# Column numbers refer to the tab-delimited annotation layout:
#   1=Position.hg19  2=chrom  4=rs.name  19=type  20=Gene.name  26=typeannot
#
# Temporary files _jeter1.txt and _jeter3.txt are created in the current
# directory and removed at the end.

# A literal tab inside quotes is easily destroyed by copy/paste, so it is built
# here once and reused by cut/sort/join (awk understands '\t' by itself).
TAB=$(printf '\t')

# Step 1: candidate SNPs of the pooled exome, ordered by gene name.
#   - rs.name must not start with "rs"
#   - Gene.name must not be empty
#   - type must not contain "douteux" (low quality)
#   - type must contain "_diff"
#   - typeannot must contain "nonsense" or "missense"
#   - only columns 1-27 are kept (drops the DNA & protein sequences)
# LC_ALL=C makes sort use the same byte-wise, case-folded order that
# "join -i" expects; with another locale the two tools can disagree.
gunzip -c AllChrom.exome.snp.pool.new.annotation.gz |
awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    $20 != "" &&
    index($19, "douteux") == 0 &&
    index($19, "_diff") != 0 &&
    (index($26, "nonsense") != 0 || index($26, "missense") != 0)
' |
cut -d "$TAB" -f 1-27 |
LC_ALL=C sort -f -t "$TAB" -k 20,20 > _jeter1.txt

# Step 2: calls of the wild exome, ordered by gene name.
#   - rs.name must not start with "rs"
#   - type must NOT contain "_diff"
#   - Gene.name must not be empty
gunzip -c AllChrom.exome.snp.u2437.new.annotation.gz |
awk -F '\t' '
    substr($4, 1, 2) != "rs" &&
    index($19, "_diff") == 0 &&
    $20 != ""
' |
cut -d "$TAB" -f 1-27 |
LC_ALL=C sort -f -t "$TAB" -k 20,20 > _jeter3.txt

# Step 3: join pooled and wild rows on the gene name.
# join output layout: 1=gene, 2-27=pooled row without its gene column,
# 28-53=wild row without its gene column; hence 2/28=position, 3/29=chrom.
# Keep the rows where both calls are on the same chromosome at the same
# position, then print the distinct gene names.
LC_ALL=C join -i -t "$TAB" -1 20 -2 20 _jeter1.txt _jeter3.txt |
awk -F '\t' '$3 == $29 && int($2) == int($28)' |
cut -d "$TAB" -f 1 |
sort -u

# Remove only the files created above (a glob could hit unrelated files).
rm -f _jeter1.txt _jeter3.txt