Difference between revisions of "Researching SNPs TSD Journal"
m (→dbSNP) |
(→Plotting) |
||
(28 intermediate revisions by 2 users not shown) | |||
Line 1: | Line 1: | ||
+ | Back to [[Researching_SNPs_TSD | results]]. |
||
== HGMD == |
== HGMD == |
||
=== Sequence retrieval and comparison === |
=== Sequence retrieval and comparison === |
||
Line 30: | Line 31: | ||
Navigate to the subsite with the missense/nonsense mutations and manually save it to 'hgmd_hexa_misssense_nonsense.php'. Then apply the following command: |
Navigate to the subsite with the missense/nonsense mutations and manually save it to 'hgmd_hexa_misssense_nonsense.php'. Then apply the following command: |
||
<source lang="bash"> |
<source lang="bash"> |
||
− | parseHGMDMutations.pl hgmd_hexa_misssense_nonsense.php > |
+ | parseHGMDMutations.pl hgmd_hexa_misssense_nonsense.php > missenseSNPHGMD_NONPROF.txt |
+ | perl parseHGMDMutations.pl -h ./hgmd_hexa_misssense_nonsense_FULL.php -p > missenseSNPHGMD.txt #If given access to the professional version |
||
+ | #Afterwards manually remove non-unique entries, if present |
||
</source> |
</source> |
||
Where 'parseHGMDMutations.pl' is the following script: |
Where 'parseHGMDMutations.pl' is the following script: |
||
Line 42: | Line 45: | ||
use Bio::SeqUtils; |
use Bio::SeqUtils; |
||
− | my $ |
+ | my $professional = 0; |
+ | my $hgmd_page; |
||
+ | |||
+ | use Getopt::Long; |
||
+ | my $data = "file.dat"; |
||
+ | my $length = 24; |
||
+ | my $verbose; |
||
+ | my $result = GetOptions ( |
||
+ | "h=s" => \$hgmd_page, # string |
||
+ | "p" => \$professional); # flag |
||
+ | |||
+ | if(!defined $hgmd_page) |
||
+ | { |
||
+ | die "Define path to hgmd PHP file\n"; |
||
+ | } |
||
my $fh; |
my $fh; |
||
Line 48: | Line 65: | ||
while(my $line = <$fh>) |
while(my $line = <$fh>) |
||
{ |
{ |
||
+ | if($professional) |
||
− | #This pattern matching will not find nonsense mutations, therefore already implicitly excluding these |
||
+ | { |
||
− | if($line =~ m|<tr bgcolor="#\w{6}"><td align="center"><a name="\w{8}">(\w{8})</a></td><td align="left">\w+-\w+</td><td align="left">(\w{3})-(\w{3})</td><td align="right">(\d+)</td><td> |
||
+ | # <td align=left>cATG-GTG</td><td align=left>Met1Val</td><td align=left>c.1A>G</td><td>p.M1V</td><td align=left>Tay-Sachs disease</td><td><a href='http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=PubMed&list_uids=1532289&dopt=Abstract' target=ref>Mules (1992) Am J Hum Genet <b>50,</b> 834</a></td><td align=center> <img src='bar/dm.gif' title='Disease causing mutation' alt='Disease causing mutation'> <a href='http://www.ncbi.nlm.nih.gov/sites/entrez?db=snp&cmd=search&term=rs121907965' target='dbSNP-entry'><img src='bar/dbsnp.gif' border='0' title='dbSNP ID: rs121907965' alt='dbSNP ID: rs121907965'></a> <img src='bar/genomic.gif' title='Coordinate Chr 15: 72668313' alt='Coordinate Chr 15: 72668313'></td></tr> |
||
− | <a href="http://www.biobase-international.com/product/hgmd"><img src=".+" alt="BioBase" border="0"></a></td><td align="left">(.+)</td><td>.+</tr>|) |
||
+ | if($line =~ m|<td align=left>\w+-\w+</td><td align=left>(\w{3})(\d+)(\w{3})</td><td align=left>.+</td><td>p\..+</td><td align=left>(.+?)</td>.+|) |
||
− | { |
||
− | + | { |
|
− | my $ |
+ | my $id = ''; |
− | my $ |
+ | my $aa_wt = $1; |
− | my $ |
+ | my $aa_mt = $3; |
− | my $ |
+ | my $pos = $2; |
+ | my $pheno = $4; |
||
− | |||
+ | |||
− | print $Bio::SeqUtils::ONECODE{$aa_wt} . $pos . $Bio::SeqUtils::ONECODE{$aa_mt} . "\tHGMD\t$id\t$pheno\n"; |
||
+ | print $Bio::SeqUtils::ONECODE{$aa_wt} . $pos . $Bio::SeqUtils::ONECODE{$aa_mt} . "\tHGMD\t$id\t$pheno\tmissense\n"; |
||
− | } |
||
+ | } |
||
+ | } |
||
+ | else |
||
+ | { |
||
+ | #This pattern matching will not find nonsense mutations, therefore already implicitly excluding these |
||
+ | if($line =~ m|<tr bgcolor="#\w{6}"><td align="center"><a name="\w{8}">(\w{8})</a></td><td align="left">\w+-\w+</td><td align="left">(\w{3})-(\w{3})</td><td align="right">(\d+)</td><td><a href="http://www.biobase-international.com/product/hgmd"><img src=".+" alt="BioBase" border="0"></a></td><td align="left">(.+)</td><td>.+</tr>|) |
||
+ | { |
||
+ | my $id = $1; |
||
+ | my $aa_wt = $2; |
||
+ | my $aa_mt = $3; |
||
+ | my $pos = $4; |
||
+ | my $pheno = $5; |
||
+ | |||
+ | print $Bio::SeqUtils::ONECODE{$aa_wt} . $pos . $Bio::SeqUtils::ONECODE{$aa_mt} . "\tHGMD\t$id\t$pheno\tmissense\n"; |
||
+ | } |
||
+ | } |
||
} |
} |
||
close($fh); |
close($fh); |
||
</source> |
</source> |
||
+ | To find the intersection and unique elements between the free and professional version of HGMD use the general mechanism exemplified for [[Researching_SNPs_TSD_Journal#Overlap_to_dbSNP|SNPedia]]. |
||
== dbSNP == |
== dbSNP == |
||
Line 125: | Line 159: | ||
if (!$opt_missense){ |
if (!$opt_missense){ |
||
if ($reference[$pos-1] eq $aa){ |
if ($reference[$pos-1] eq $aa){ |
||
− | print "$aa$pos$aa\t$opt_d\t$rs\t\n"; |
+ | print "$aa$pos$aa\t$opt_d\t$rs\t\t". (($opt_missense) ? 'missense' : 'synonymous') ."\n"; |
} else { |
} else { |
||
print "! not matching reference $reference[$pos-1]: $aa$pos$aa\n"; |
print "! not matching reference $reference[$pos-1]: $aa$pos$aa\n"; |
||
Line 133: | Line 167: | ||
if ($key ne $aa){ |
if ($key ne $aa){ |
||
if ($reference[$pos-1] eq $aa){ |
if ($reference[$pos-1] eq $aa){ |
||
− | print "$aa$pos$key\t$opt_d\t$rs\t\n"; |
+ | print "$aa$pos$key\t$opt_d\t$rs\t\t". (($opt_missense) ? 'missense' : 'synonymous') . "\n"; |
} else { |
} else { |
||
print "! not matching reference $reference[$pos-1]: $aa$pos$key\n"; |
print "! not matching reference $reference[$pos-1]: $aa$pos$key\n"; |
||
Line 167: | Line 201: | ||
== SNPdbe == |
== SNPdbe == |
||
+ | The results are provided on the webpage in a tabular format, where a lot of information can be obtained very easily just by sorting the columns (e.g. conservation scores, exp. evidence). |
||
+ | A text file with the results snps.txt can also be downloaded directly from the [http://www.rostlab.org/services/snpdbe/search.php webpage]. Within the text file the fields are separated by "|". In the first field the mutations is displayed. To format the SNPs according to the previously used conventions SNP ORIGIN ID this script can be used. |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!usr/bin/perl |
||
+ | open(IN,"<$ARGV[0]") or die "no input file"; |
||
+ | while (<IN>){ |
||
+ | $sep=""; |
||
+ | $iddbsnp=""; |
||
+ | $idswissvar=""; |
||
+ | $Dsep=""; |
||
+ | @line = split('\|', $_); |
||
+ | print $line[0]."\tsnpdbe: "; |
||
+ | if ($line[1] ne "N/A"){ |
||
+ | print "dbSNP"; |
||
+ | $sep=","; |
||
+ | $Dsep=","; |
||
+ | $iddbsnp = $line[1]; |
||
+ | } |
||
+ | if ($line[2] ne "N/A"){ |
||
+ | print $sep."SwissProt"; |
||
+ | $sep=","; |
||
+ | } |
||
+ | if ($line[3] ne "N/A"){ |
||
+ | print $sep."SwissVar"; |
||
+ | $sep=","; |
||
+ | $idswissvar = $line[3]; |
||
+ | } else { $Dsep="";} |
||
+ | if ($line[4] ne "N/A"){ |
||
+ | print $sep."PMD"; |
||
+ | $sep=","; |
||
+ | } |
||
+ | if ($line[5] ne "N/A"){ |
||
+ | print $sep."1000genomes"; |
||
+ | } |
||
+ | print "\t $iddbsnp$Dsep$idswissvar \n"; |
||
+ | } |
||
+ | </source> |
||
+ | |||
+ | The conservation scores are stored in the field 21 and 24, perc_wt and perc_mt: |
||
+ | cut -d "|" -f 1,22,25 snps.txt | sed s/\|/" "/g |
||
+ | |||
== OMIM == |
== OMIM == |
||
+ | === Entries with rs-identifiers === |
||
+ | Retrieve all entries that have rs-Identifiers: |
||
+ | <source lang="bash"> |
||
+ | curl -s http://omim.org/allelicVariant/606869 | grep -P --only-matching "target=\"_blank\">rs\d+</a>" | sed 's/\s//' | grep -P --only-matching "rs\d+" | sort | uniq > rs_OMIM.txt |
||
+ | </source> |
||
+ | Get more information on the entries from dbSNP, using the same script as used for SNPedia: |
||
+ | <source lang="bash"> |
||
+ | ../snpedia/getSNPData.sh rs_OMIM.txt |
||
+ | </source> |
||
+ | and use the same script as used for dbSNP to format the final files with missense and synonymous SNPs: |
||
+ | <source lang="bash"> |
||
+ | perl ../dbSNP/formatSNPs.pl -i ./enriched_mis_rs_OMIM.txt -f ../dbSNP/P06865.fasta -missense -d OMIM > missenseSNPOMIM.txt |
||
+ | perl ../dbSNP/formatSNPs.pl -i ./enriched_syn_rs_OMIM.txt -f ../dbSNP/P06865.fasta -d OMIM > synonymousSNPOMIM.txt |
||
+ | </source> |
||
+ | Intersection to dbSNP can be found analogous to the description for SNPedia. Note that in this case only 29 rs-identifiers are supposed to be in the intersection. However since one identifer appears two times in OMIM (it describes two mutations), there is actually no entry unqiue to OMIM. This can also be verified with the following commandline (using the same naming scheme as for SNPedia): |
||
+ | <source lang="bash"> |
||
+ | grep -Fxv -f dbsnpc tempc #Does not return anything |
||
+ | </source> |
||
+ | |||
+ | Finally add the one entry that does not have an rs-Identifier but describes a valid missense variant (C58Y). |
||
+ | |||
== SNPedia == |
== SNPedia == |
||
Retrieve all rs-identifiers from SNPedia (and already exclude one entry without protein information), using the following commandline: |
Retrieve all rs-identifiers from SNPedia (and already exclude one entry without protein information), using the following commandline: |
||
Line 182: | Line 279: | ||
#!/usr/bin/bash |
#!/usr/bin/bash |
||
in=$1 |
in=$1 |
||
− | rm -f |
+ | rm -f enriched_syn_$in |
+ | rm -f enriched_mis_$in |
||
+ | |||
for rs in `cat $in`; do |
for rs in `cat $in`; do |
||
− | -s "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_gene.cgi?rs=$rs" > temp |
+ | curl -s "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_gene.cgi?rs=$rs" > temp |
− | if [ `egrep 'missense |
+ | if [ `egrep 'missense' ./temp |wc -l` -gt 0 ] |
then |
then |
||
− | echo $rs >> |
+ | echo $rs >> enriched_mis_$in |
cat temp | egrep "aaCode|proteinPos" | cut -d \" -f 4 > temp2 |
cat temp | egrep "aaCode|proteinPos" | cut -d \" -f 4 > temp2 |
||
#In theory, do special parsing of entries that consist of two or more mutations. Luckily for our case there are none ;) |
#In theory, do special parsing of entries that consist of two or more mutations. Luckily for our case there are none ;) |
||
− | cat temp2 >> |
+ | cat temp2 >> enriched_mis_$in |
− | echo "---" >> |
+ | echo "---" >> enriched_mis_$in |
else |
else |
||
+ | if [ `egrep 'synon' ./temp |wc -l` -gt 0 ] |
||
− | echo "Ignoring $rs because neither missense nor synon" |
||
+ | then |
||
− | fi |
||
+ | echo $rs >> enriched_syn_$in |
||
+ | cat temp | egrep "aaCode|proteinPos" | cut -d \" -f 4 > temp2 |
||
+ | #In theory, do special parsing of entries that consist of two or more mutations. Luckily for our case there are none ;) |
||
+ | cat temp2 >> enriched_syn_$in |
||
+ | echo "---" >> enriched_syn_$in |
||
+ | else |
||
+ | echo "Ignoring $rs because neither missense nor synon" |
||
+ | fi |
||
+ | fi |
||
done |
done |
||
Line 202: | Line 310: | ||
With the output of this, we can use the script TODO from above to get the final formatted SNPs: |
With the output of this, we can use the script TODO from above to get the final formatted SNPs: |
||
<source lang="bash"> |
<source lang="bash"> |
||
− | perl ../dbSNP/formatSNPs.pl -i ./ |
+ | perl ../dbSNP/formatSNPs.pl -i ./enriched_mis_rs_SNPedia.txt -f ../dbSNP/P06865.fasta -missense -d SNPedia > missenseSNPSNPedia.txt |
+ | perl ../dbSNP/formatSNPs.pl -i ./enriched_syn_rs_SNPedia.txt -f ../dbSNP/P06865.fasta -d SNPedia > > synonymousSNPSNPedia.txt |
||
+ | </source> |
||
+ | === Overlap to dbSNP === |
||
+ | <source lang="bash"> |
||
+ | cut -f 3 missenseSNPSNPedia.txt > temp1 |
||
+ | cut -f 3 synonymousSNPSNPedia.txt > temp2 |
||
+ | cat temp1 temp2 > tempc |
||
+ | cat ../dbSNP/synonymous_rsids.txt ../dbSNP/missense_rsids.txt > dbsnpc |
||
+ | grep -Fx -f tempc dbsnpc | wc -l #Yields: 30 |
||
</source> |
</source> |
||
== Mutation map == |
== Mutation map == |
||
+ | Create a complete map of mutations by combining all previous files and then call the master script that creates all source files for plotting: |
||
+ | <source lang="bash"> |
||
+ | cat ../dbSNP/missenseSNPdbSNP.txt ../dbSNP/synonymousSNPdbSNP.txt ../OMIM/missenseSNPOMIM.txt ../OMIM/synonymousSNPOMIM.txt ../HGMD/missenseSNPHGMD.txt ../snpdbe/mutSnpdbe.txt |
||
+ | ../snpedia/missenseSNPSNPedia.txt ../snpedia/synonymousSNPSNPedia.txt > combinedSNP.txt |
||
+ | perl createMap.pl combinedSNP.txt > mutationMap #More files are created in the background |
||
+ | </source> |
||
+ | [[TSD createMap.pl|createMap.pl]] |
||
+ | === Plotting === |
||
+ | ==== Wildtype and Mutant Amino Acids ==== |
||
+ | <source lang="bash"> |
||
+ | library(gplots) #Needs caTools, bitops and a few more. Gives us: barplot2 |
||
+ | |||
+ | clwdaxis <- 20 |
||
+ | ccaxis <- 2 |
||
+ | ccmain <- 1.7 |
||
+ | cbarnames <- 2 |
||
+ | cclab <- 2.7 |
||
+ | |||
+ | d <- read.table("aaPropensities") |
||
+ | |||
+ | colors <- c("DarkGreen", "red") |
||
+ | |||
+ | png(width=1000, height=900, "TSD SNP aaPropensities.png") |
||
+ | par(mar=par()$mar+c(0,1,-1,0)) |
||
+ | barplot2(as.matrix(d), beside=TRUE, col=colors, cex.axis=ccaxis, cex.main=3.0, cex.lab=cclab, cex.names=cbarnames, main="Frequency of wildtype and mutant amino acids", ylab="# of occurences",ylim=c(0,20)) |
||
+ | legend("topleft", c("Wildtype", "Mutant"),fill=colors, cex=2.5, inset=0.05) |
||
+ | dev.off() |
||
+ | </source> |
||
+ | ==== Databases per positon ==== |
||
+ | TODO |
||
+ | ==== Pymol ==== |
||
+ | Based on the script by Fabry-Disease. Execute, save output to script.pml and execute with python -cq script.pml |
||
+ | <source lang="bash"> |
||
+ | #!/bin/bash |
||
+ | |||
+ | cat <<EOT |
||
+ | reinitialize |
||
+ | load referenz_close_lessshallow.pse, referenz |
||
+ | bg_color white |
||
+ | set ray_opaque_background, off |
||
+ | |||
+ | |||
+ | ###non disease causing### |
||
+ | unset resi |
||
+ | for resi in `cat resi_nsynndisonly`;do |
||
+ | echo "select non-disease, resi $resi, merge = 1" |
||
+ | done |
||
+ | if [[ -n $resi ]]; then |
||
+ | cat <<EOT |
||
+ | color blue, non-disease |
||
+ | disable non-disease |
||
+ | |||
+ | EOT |
||
+ | fi |
||
+ | |||
+ | ###disease### |
||
+ | |||
+ | unset resi |
||
+ | for resi in `cat resi_nsyndisonly`;do |
||
+ | echo "select disease, resi $resi, merge = 1" |
||
+ | done |
||
+ | if [[ -n $resi ]]; then |
||
+ | cat <<EOT |
||
+ | color red, disease |
||
+ | disable disease |
||
+ | EOT |
||
+ | fi |
||
+ | |||
+ | EOT |
||
+ | ###silent### |
||
+ | unset resi |
||
+ | for resi in `cat resi_synonly`;do |
||
+ | echo "select silent, resi $resi, merge = 1" |
||
+ | done |
||
+ | |||
+ | if [[ -n $resi ]]; then |
||
+ | cat <<EOT |
||
+ | color slate, silent |
||
+ | disable non-disease |
||
+ | |||
+ | set ray_trace_mode, 0; # color raytrace, no outlines |
||
+ | set antialias, 2; #High anti-aliasing |
||
+ | #How do I crank-up the glossyness of rendered atoms? |
||
+ | set spec_power = 200; # in range 20 - 250 |
||
+ | set spec_refl=1.5; # in range 1.0 - 2.0 |
||
+ | |||
+ | set orthoscopic, off; #perspective |
||
+ | set ray_shadows,0; # turn off shadows |
||
+ | ray 1450,900; |
||
+ | save highStructure.png |
||
+ | EOT |
||
+ | fi |
||
+ | </source> |
Latest revision as of 01:21, 12 June 2012
Back to results.
Contents
HGMD
Sequence retrieval and comparison
Retrieve the protein sequence from HGMD by going to the sequence view and applying switch view, as described in the task description. Download the page manually and save to a file (cdna_new.php). Then execute the following command: <source lang="bash">
grep -P --only-matching "\w{3}
\w{3}" ./cdna_new.php | cut -c99-101 | sed -n '$!p' | tr -d '\n' | tr '[:lower:]' '[:upper:]' |
./ThreeToOneLetterCode.py | tr -d '\n' > hgmdSequence_aa </source> Where 'ThreeToOneLetterCode.py' is the following script <source lang="python">
- !/usr/bin/env python
from Bio.PDB import to_one_letter_code as one_letter import sys
seq = sys.stdin.readline()
for aaa in range(0, len(seq), 3) :
print one_letter[seq[aaa:aaa+3]]
</source> To check whether the protein sequence from HGMD is the same than the one in Uniprot perform the following operations: <source lang="bash"> curl -s http://www.uniprot.org/uniprot/P06865.fasta | sed '1d' | tr -d '\n' > HEXA_HUMAN #Get Uniprot sequence without header perl -p -e 's|(.)|$1\n|g' HEXA_HUMAN > temp1 #Insert linebreak after every character in both sequences, for easy diff'ing perl -p -e 's|(.)|$1\n|g' hgmdSequence_aa > temp2 diff temp1 temp2 </source> This yields that position 436 differs with an Ile in Uniprot and a Val in HGMD.
Mutation retrieval and parsing
Navigate to the subsite with the missense/nonsense mutations and manually save it to 'hgmd_hexa_misssense_nonsense.php'. Then apply the following command: <source lang="bash"> parseHGMDMutations.pl hgmd_hexa_misssense_nonsense.php > missenseSNPHGMD_NONPROF.txt perl parseHGMDMutations.pl -h ./hgmd_hexa_misssense_nonsense_FULL.php -p > missenseSNPHGMD.txt #If given access to the professional version
- Afterwards manually remove non-unique entries, if present
</source> Where 'parseHGMDMutations.pl' is the following script: <source lang="perl">
- !/usr/bin/perl
use strict; use warnings; use sigtrap; use autodie; use Bio::SeqUtils;
my $professional = 0; my $hgmd_page;
use Getopt::Long;
my $data = "file.dat"; my $length = 24; my $verbose; my $result = GetOptions ( "h=s" => \$hgmd_page, # string "p" => \$professional); # flag
if(!defined $hgmd_page) {
die "Define path to hgmd PHP file\n";
}
my $fh; open($fh, '<', $hgmd_page); while(my $line = <$fh>) {
if($professional) {
# cATG-GTGMet1Valc.1A>Gp.M1VTay-Sachs disease<a href='http://www.ncbi.nlm.nih.gov/sites/entrez?cmd=Retrieve&db=PubMed&list_uids=1532289&dopt=Abstract' target=ref>Mules (1992) Am J Hum Genet 50, 834</a> <img src='bar/dm.gif' title='Disease causing mutation' alt='Disease causing mutation'> <a href='http://www.ncbi.nlm.nih.gov/sites/entrez?db=snp&cmd=search&term=rs121907965' target='dbSNP-entry'><img src='bar/dbsnp.gif' border='0' title='dbSNP ID: rs121907965' alt='dbSNP ID: rs121907965'></a> <img src='bar/genomic.gif' title='Coordinate Chr 15: 72668313' alt='Coordinate Chr 15: 72668313'> if($line =~ m|\w+-\w+(\w{3})(\d+)(\w{3}).+p\..+(.+?).+|)
{ my $id = ; my $aa_wt = $1; my $aa_mt = $3; my $pos = $2; my $pheno = $4; print $Bio::SeqUtils::ONECODE{$aa_wt} . $pos . $Bio::SeqUtils::ONECODE{$aa_mt} . "\tHGMD\t$id\t$pheno\tmissense\n"; } } else { #This pattern matching will not find nonsense mutations, therefore already implicitly excluding these
if($line =~ m|<a name="\w{8}">(\w{8})</a>\w+-\w+(\w{3})-(\w{3})(\d+)<a href="http://www.biobase-international.com/product/hgmd"><img src=".+" alt="BioBase" border="0"></a>(.+).+|)
{ my $id = $1; my $aa_wt = $2; my $aa_mt = $3; my $pos = $4; my $pheno = $5; print $Bio::SeqUtils::ONECODE{$aa_wt} . $pos . $Bio::SeqUtils::ONECODE{$aa_mt} . "\tHGMD\t$id\t$pheno\tmissense\n"; } }
} close($fh); </source> To find the intersection and unique elements between the free and professional version of HGMD use the general mechanism exemplified for SNPedia.
dbSNP
At first the SNPs were queried with "synonymous-codon"[Function_Class] AND HEXA[GENE] AND "human"[ORGN] AND "snp"[SNP_CLASS] in dbSNP. Those results were saved and then the mutations extracted with
bash getdbSNP.sh entrez.html
It is important to note that in the dbSNP results the rs ids can be redundant. getdbSNP.sh: <source lang="bash">
- !/bin/bash
- ${1?"Usage: $0 textfile"}
website=$1
rm -f Mut_$website
for rs in `egrep -o ">rs[0-9]+<" $website |uniq | egrep -o "rs[0-9]+"`; do echo $rs >> Mut_$website curl -s "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_gene.cgi?rs=$rs" | egrep "aaCode|proteinPos" | cut -d \" -f 4 >> Mut_$website echo "---" >> Mut_$website done </source>
The mutations were then checked with the reference sequence (!) and brought into the right format:
perl formatSNPs.pl -i Mut_entrez.html -f P06865.fasta
<source lang="perl">
- !usr/bin/perl
use Getopt::Long; our $opt_f = (); our $opt_d = "dbSNP"; sub usage {
printf "Usage: $0 [flags] file...\n"; printf "flags:\n";
printf " -i Infile raw mutations from getdbSNP.sh \n"; printf " -f Referenzfile fastafile containing referenz sequenz "; printf " -missense Missesne mutations are considered, default = synonymous "; printf " -d name of database to write in output file, default = dbSNP "; exit 1;
} GetOptions( "i=s", "f=s", "missense", "d=s" ); usage() if !( $opt_i && $opt_f );
open( REF, "<$opt_f" ) or die "Referenzfile not found";
@reference = (); while (<REF>){ chomp; @line = split( , $_ ); if ($line[0] ne ">") { @reference = (@reference,@line); } }
open (IN, "<$opt_i") or die "Inputfile not found"; %muts = (); my $rs; while(<IN>){ chomp; if ($_ eq "---"){ if (!$opt_missense){ if ($reference[$pos-1] eq $aa){ print "$aa$pos$aa\t$opt_d\t$rs\t\t". (($opt_missense) ? 'missense' : 'synonymous') ."\n"; } else { print "! not matching reference $reference[$pos-1]: $aa$pos$aa\n"; } } else { foreach my $key ( keys %muts ){ if ($key ne $aa){ if ($reference[$pos-1] eq $aa){ print "$aa$pos$key\t$opt_d\t$rs\t\t". (($opt_missense) ? 'missense' : 'synonymous') . "\n"; } else { print "! not matching reference $reference[$pos-1]: $aa$pos$key\n"; } } } } $count=0; $aa=""; %muts = ();
$rs = ;
} elsif ($count==0){ $rs=$_; $count++; } elsif ($count == 1)
{ $pos = $_; $count++; }
elsif ($count>=2){ $muts{$_}=1; $aa=$_; $count++; } } </source>
The same procedure can be performed with the missense Mutations from dbSNP with the dbSNP search "missense"[Function_Class] AND HEXA[GENE] AND "human"[ORGN] AND "snp"[SNP_CLASS] followed by
bash getdbSNP.sh entrezMissense.html perl formatSNPs.pl -i Mut_entrezMissense.html -f P06865.fasta -missense
SNPdbe
The results are provided on the webpage in a tabular format, where a lot of information can be obtained very easily just by sorting the columns (e.g. conservation scores, exp. evidence). A text file with the results snps.txt can also be downloaded directly from the webpage. Within the text file the fields are separated by "|". In the first field the mutations is displayed. To format the SNPs according to the previously used conventions SNP ORIGIN ID this script can be used.
<source lang="perl">
- !usr/bin/perl
open(IN,"<$ARGV[0]") or die "no input file"; while (<IN>){
$sep=""; $iddbsnp=""; $idswissvar=""; $Dsep=""; @line = split('\|', $_); print $line[0]."\tsnpdbe: "; if ($line[1] ne "N/A"){ print "dbSNP"; $sep=","; $Dsep=","; $iddbsnp = $line[1]; } if ($line[2] ne "N/A"){ print $sep."SwissProt"; $sep=","; } if ($line[3] ne "N/A"){ print $sep."SwissVar"; $sep=","; $idswissvar = $line[3]; } else { $Dsep="";} if ($line[4] ne "N/A"){ print $sep."PMD"; $sep=","; } if ($line[5] ne "N/A"){ print $sep."1000genomes"; } print "\t $iddbsnp$Dsep$idswissvar \n";
} </source>
The conservation scores are stored in the field 21 and 24, perc_wt and perc_mt:
cut -d "|" -f 1,22,25 snps.txt | sed s/\|/" "/g
OMIM
Entries with rs-identifiers
Retrieve all entries that have rs-Identifiers: <source lang="bash"> curl -s http://omim.org/allelicVariant/606869 | grep -P --only-matching "target=\"_blank\">rs\d+</a>" | sed 's/\s//' | grep -P --only-matching "rs\d+" | sort | uniq > rs_OMIM.txt </source> Get more information on the entries from dbSNP, using the same script as used for SNPedia: <source lang="bash"> ../snpedia/getSNPData.sh rs_OMIM.txt </source> and use the same script as used for dbSNP to format the final files with missense and synonymous SNPs: <source lang="bash"> perl ../dbSNP/formatSNPs.pl -i ./enriched_mis_rs_OMIM.txt -f ../dbSNP/P06865.fasta -missense -d OMIM > missenseSNPOMIM.txt perl ../dbSNP/formatSNPs.pl -i ./enriched_syn_rs_OMIM.txt -f ../dbSNP/P06865.fasta -d OMIM > synonymousSNPOMIM.txt </source> Intersection to dbSNP can be found analogous to the description for SNPedia. Note that in this case only 29 rs-identifiers are supposed to be in the intersection. However since one identifer appears two times in OMIM (it describes two mutations), there is actually no entry unqiue to OMIM. This can also be verified with the following commandline (using the same naming scheme as for SNPedia): <source lang="bash"> grep -Fxv -f dbsnpc tempc #Does not return anything </source>
Finally add the one entry that does not have an rs-Identifier but describes a valid missense variant (C58Y).
SNPedia
Retrieve all rs-identifiers from SNPedia (and already exclude one entry without protein information), using the following commandline: <source lang="bash"> curl -s http://www.snpedia.com/index.php\?title\=Special:Search\&ns0\=1\&redirs\=0\&search\=Gene%3DHEXA\&limit\=500\&offset\=0 | grep -P --only-matching "title=\"Rs\d+\"" | sed 's|title=||' | tr -d '"' | tr '[:upper:]' '[:lower:]' | grep -v 'rs121907980' > rs_SNPedia.txt </source> Then get more information on the rs-identifiers from dbSNP with the following comamndline <source lang="bash"> ./getSNPData.sh rs_SNPedia.txt </source> where getSNPData.sh ist the following script; <source lang="bash">
- !/usr/bin/bash
in=$1 rm -f enriched_syn_$in rm -f enriched_mis_$in
for rs in `cat $in`; do
curl -s "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_gene.cgi?rs=$rs" > temp if [ `egrep 'missense' ./temp |wc -l` -gt 0 ] then echo $rs >> enriched_mis_$in cat temp | egrep "aaCode|proteinPos" | cut -d \" -f 4 > temp2 #In theory, do special parsing of entries that consist of two or more mutations. Luckily for our case there are none ;) cat temp2 >> enriched_mis_$in echo "---" >> enriched_mis_$in else if [ `egrep 'synon' ./temp |wc -l` -gt 0 ] then echo $rs >> enriched_syn_$in cat temp | egrep "aaCode|proteinPos" | cut -d \" -f 4 > temp2 #In theory, do special parsing of entries that consist of two or more mutations. Luckily for our case there are none ;) cat temp2 >> enriched_syn_$in echo "---" >> enriched_syn_$in else echo "Ignoring $rs because neither missense nor synon" fi fi
done
rm -f ./temp rm -f ./temp2 </source> With the output of this, we can use the script TODO from above to get the final formatted SNPs: <source lang="bash"> perl ../dbSNP/formatSNPs.pl -i ./enriched_mis_rs_SNPedia.txt -f ../dbSNP/P06865.fasta -missense -d SNPedia > missenseSNPSNPedia.txt perl ../dbSNP/formatSNPs.pl -i ./enriched_syn_rs_SNPedia.txt -f ../dbSNP/P06865.fasta -d SNPedia > > synonymousSNPSNPedia.txt </source>
Overlap to dbSNP
<source lang="bash"> cut -f 3 missenseSNPSNPedia.txt > temp1 cut -f 3 synonymousSNPSNPedia.txt > temp2 cat temp1 temp2 > tempc cat ../dbSNP/synonymous_rsids.txt ../dbSNP/missense_rsids.txt > dbsnpc grep -Fx -f tempc dbsnpc | wc -l #Yields: 30 </source>
Mutation map
Create a complete map of mutations by combining all previous files and then call the master script that creates all source files for plotting: <source lang="bash"> cat ../dbSNP/missenseSNPdbSNP.txt ../dbSNP/synonymousSNPdbSNP.txt ../OMIM/missenseSNPOMIM.txt ../OMIM/synonymousSNPOMIM.txt ../HGMD/missenseSNPHGMD.txt ../snpdbe/mutSnpdbe.txt ../snpedia/missenseSNPSNPedia.txt ../snpedia/synonymousSNPSNPedia.txt > combinedSNP.txt perl createMap.pl combinedSNP.txt > mutationMap #More files are created in the background </source> createMap.pl
Plotting
Wildtype and Mutant Amino Acids
<source lang="bash"> library(gplots) #Needs caTools, bitops and a few more. Gives us: barplot2
clwdaxis <- 20 ccaxis <- 2 ccmain <- 1.7 cbarnames <- 2 cclab <- 2.7
d <- read.table("aaPropensities")
colors <- c("DarkGreen", "red")
png(width=1000, height=900, "TSD SNP aaPropensities.png") par(mar=par()$mar+c(0,1,-1,0)) barplot2(as.matrix(d), beside=TRUE, col=colors, cex.axis=ccaxis, cex.main=3.0, cex.lab=cclab, cex.names=cbarnames, main="Frequency of wildtype and mutant amino acids", ylab="# of occurences",ylim=c(0,20)) legend("topleft", c("Wildtype", "Mutant"),fill=colors, cex=2.5, inset=0.05) dev.off() </source>
Databases per positon
TODO
Pymol
Based on the script by Fabry-Disease. Execute, save output to script.pml and execute with python -cq script.pml <source lang="bash">
- !/bin/bash
cat <<EOT reinitialize load referenz_close_lessshallow.pse, referenz bg_color white set ray_opaque_background, off
- non disease causing###
unset resi for resi in `cat resi_nsynndisonly`;do echo "select non-disease, resi $resi, merge = 1" done if -n $resi ; then cat <<EOT color blue, non-disease disable non-disease
EOT fi
- disease###
unset resi for resi in `cat resi_nsyndisonly`;do echo "select disease, resi $resi, merge = 1" done if -n $resi ; then cat <<EOT color red, disease disable disease EOT fi
EOT
- silent###
unset resi for resi in `cat resi_synonly`;do
echo "select silent, resi $resi, merge = 1"
done
if -n $resi ; then
cat <<EOT
color slate, silent disable non-disease
set ray_trace_mode, 0; # color raytrace, no outlines set antialias, 2; #High anti-aliasing
- How do I crank-up the glossyness of rendered atoms?
set spec_power = 200; # in range 20 - 250 set spec_refl=1.5; # in range 1.0 - 2.0
set orthoscopic, off; #perspective set ray_shadows,0; # turn off shadows ray 1450,900; save highStructure.png EOT fi </source>