Difference between revisions of "Task5 Hemochromatosis Protocol"
Bernhoferm (talk | contribs) (→OMIM) |
Bernhoferm (talk | contribs) (→SNPdbe) |
||
(2 intermediate revisions by the same user not shown) | |||
Line 17: | Line 17: | ||
We used the non-professional version of [http://www.hgmd.org/ HGMD] and looked for SNPs using the gene symbol HFE as query. |
We used the non-professional version of [http://www.hgmd.org/ HGMD] and looked for SNPs using the gene symbol HFE as query. |
||
+ | |||
+ | In order to get the SNP statistics the following script was used: |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | use strict; |
||
+ | use warnings; |
||
+ | use diagnostics; |
||
+ | use sigtrap; |
||
+ | use autodie; |
||
+ | |||
+ | my $inFile = "HFE.htm"; |
||
+ | my $outFile = "HGMD.txt"; |
||
+ | |||
+ | |||
+ | open (IN, "<", $inFile) or die ("Could not open input file!"); |
||
+ | open (OUT, ">", $outFile) or die ("Could not open output file!"); |
||
+ | |||
+ | #print header |
||
+ | print OUT "ID\tType\tCodon change\tAmino acid change\tPosition\tPhenotype\tReference\n"; |
||
+ | |||
+ | while (<IN>) |
||
+ | { |
||
+ | my $line = $_; |
||
+ | my $dummy = $line; |
||
+ | |||
+ | $dummy=~s/\s//gi; |
||
+ | |||
+ | if ($dummy ne "") |
||
+ | { |
||
+ | my @content = split(/<\/td>/, $line); |
||
+ | |||
+ | my $id = $content[0]; |
||
+ | my $type; |
||
+ | my $codon = $content[1]; |
||
+ | my $aa = $content[2]; |
||
+ | my $pos = $content[3]; |
||
+ | my $phenotype = $content[5]; |
||
+ | my $ref = $content[6]; |
||
+ | my $link; |
||
+ | |||
+ | $id =~m/>(\w+?)</gi; |
||
+ | $id = $1; |
||
+ | |||
+ | $codon =~m/>([\w-]+?)$/gi; |
||
+ | $codon = $1; |
||
+ | |||
+ | $aa =~m/>([\w-]+?)$/gi; |
||
+ | $aa = $1; |
||
+ | |||
+ | $pos =~m/>(\d+?)$/gi; |
||
+ | $pos = $1; |
||
+ | |||
+ | $phenotype =~m/>([\w\s,?]+?)$/gi; |
||
+ | $phenotype = $1; |
||
+ | |||
+ | $ref =~m/^<.*?href="(.+?)">(.*?)<\/a>/gi; |
||
+ | $link = $1; |
||
+ | $ref = $2; |
||
+ | $ref =~ s/<.*?>//gi; |
||
+ | |||
+ | if ($aa =~m/Term/gi) |
||
+ | { |
||
+ | $type = "Nonsense"; |
||
+ | } |
||
+ | elsif ($aa =~m/^(.+?)-(.+?)$/g && $1 eq $2) |
||
+ | { |
||
+ | $type = "Synonymous"; |
||
+ | } |
||
+ | else |
||
+ | { |
||
+ | $type = "Non-synonymous"; |
||
+ | } |
||
+ | |||
+ | print OUT "$id\t$type\t$codon\t$aa\t$pos\t$phenotype\t[$link $ref]\n"; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | close IN; |
||
+ | close OUT; |
||
+ | </source> |
||
<br style="clear:both;"> |
<br style="clear:both;"> |
||
Line 32: | Line 113: | ||
* Quick-Link: [http://www.rostlab.org/services/snpdbe/dosearch.php?id=name&val=Q30201&organism2=human&organism1= Q30201] |
* Quick-Link: [http://www.rostlab.org/services/snpdbe/dosearch.php?id=name&val=Q30201&organism2=human&organism1= Q30201] |
||
* Quick-Link: [http://www.rostlab.org/services/snpdbe/dosearch.php?id=name&val=HFE&organism2=human&organism1= HFE] |
* Quick-Link: [http://www.rostlab.org/services/snpdbe/dosearch.php?id=name&val=HFE&organism2=human&organism1= HFE] |
||
+ | |||
+ | The SNP data was extracted from the textfiles with a perl script: |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | use strict; |
||
+ | use warnings; |
||
+ | use diagnostics; |
||
+ | use sigtrap; |
||
+ | use autodie; |
||
+ | |||
+ | #my $inFile = "snpsQ30201.txt"; |
||
+ | #my $outFile = "snpsQ30201_out.txt"; |
||
+ | #my $seqFile = "snpsQ30201_seq.txt"; |
||
+ | #my $inFile = "snpsNP000401.txt"; |
||
+ | #my $outFile = "snpsNP000401_out.txt"; |
||
+ | #my $seqFile = "snpsNP000401_seq.txt"; |
||
+ | my $inFile = "snpsHFE.txt"; |
||
+ | my $outFile = "snpsHFE_out.txt"; |
||
+ | my $seqFile = "snpsHFE_seq.txt"; |
||
+ | |||
+ | my %mutationToPos; |
||
+ | my %mutationToLine; |
||
+ | my %uniqueSeq; |
||
+ | |||
+ | |||
+ | open (IN, "<", $inFile) or die; |
||
+ | open (OUT, ">", $outFile) or die; |
||
+ | open (SEQ, ">", $seqFile) or die; |
||
+ | |||
+ | my $line = <IN>; |
||
+ | |||
+ | print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n"; |
||
+ | |||
+ | |||
+ | while (<IN>) |
||
+ | { |
||
+ | my $line = $_; |
||
+ | my @content = split(/\|/, $line); |
||
+ | |||
+ | my $name = $content[0]; |
||
+ | my $dbSNP = $content[1]; |
||
+ | my $gene = $content[6]; |
||
+ | my $protein = $content[7]; |
||
+ | my $seq = $content[10]; |
||
+ | my $disease = $content[19]; |
||
+ | my $wt = $content[21]; |
||
+ | my $mt = $content[24]; |
||
+ | my $effect = "TODO"; |
||
+ | my $evidence = $content[31]; |
||
+ | |||
+ | if ($gene eq "HFE") |
||
+ | { |
||
+ | $name=~m/(\D)(\d+?)(\D)/gi; |
||
+ | |||
+ | my $pos = $2; |
||
+ | my $first = oneTo3($1); |
||
+ | my $last = oneTo3($3); |
||
+ | my $type; |
||
+ | |||
+ | if ($first eq $last) |
||
+ | { |
||
+ | $type = "Synonymous"; |
||
+ | } |
||
+ | else |
||
+ | { |
||
+ | $type = "Non-synonymous"; |
||
+ | } |
||
+ | |||
+ | if ($disease =~m/h(a)*emochromatosis/gi || $wt >= 40) |
||
+ | { |
||
+ | $effect = "Disease causing"; |
||
+ | } |
||
+ | else |
||
+ | { |
||
+ | $effect = "No effect"; |
||
+ | } |
||
+ | |||
+ | if (defined $uniqueSeq{$protein} && $uniqueSeq{$protein} ne $seq) |
||
+ | { |
||
+ | print "Warning: different sequences for same protein id: $protein"; |
||
+ | } |
||
+ | |||
+ | $uniqueSeq{$protein} = $seq; |
||
+ | |||
+ | print OUT "$dbSNP\t$protein\t$type\t$first-$last\t$pos\t$effect\t$evidence\n"; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | print SEQ ">sp|Q30201|HFE_HUMAN\n"; |
||
+ | print SEQ "...\n"; #sequence removed for protocol site (too long) |
||
+ | |||
+ | for my $key (keys %uniqueSeq) |
||
+ | { |
||
+ | print SEQ ">$key\n"; |
||
+ | print SEQ "$uniqueSeq{$key}\n"; |
||
+ | } |
||
+ | |||
+ | close IN; |
||
+ | close OUT; |
||
+ | close SEQ; |
||
+ | |||
+ | |||
+ | sub oneTo3 |
||
+ | { |
||
+ | my $single = $_[0]; |
||
+ | |||
+ | if ($single eq "A"){return "Ala";} |
||
+ | if ($single eq "R"){return "Arg";} |
||
+ | if ($single eq "N"){return "Asn";} |
||
+ | if ($single eq "D"){return "Asp";} |
||
+ | if ($single eq "C"){return "Cys";} |
||
+ | if ($single eq "E"){return "Glu";} |
||
+ | if ($single eq "Q"){return "Gln";} |
||
+ | if ($single eq "G"){return "Gly";} |
||
+ | if ($single eq "H"){return "His";} |
||
+ | if ($single eq "I"){return "Ile";} |
||
+ | if ($single eq "L"){return "Leu";} |
||
+ | if ($single eq "K"){return "Lys";} |
||
+ | if ($single eq "M"){return "Met";} |
||
+ | if ($single eq "F"){return "Phe";} |
||
+ | if ($single eq "P"){return "Pro";} |
||
+ | if ($single eq "S"){return "Ser";} |
||
+ | if ($single eq "T"){return "Thr";} |
||
+ | if ($single eq "W"){return "Trp";} |
||
+ | if ($single eq "Y"){return "Tyr";} |
||
+ | if ($single eq "V"){return "Val";} |
||
+ | |||
+ | return "XXX"; |
||
+ | } |
||
+ | </source> |
||
+ | |||
+ | |||
+ | For the HFE search the positions had to be corrected for the reference sequence: |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | use strict; |
||
+ | use warnings; |
||
+ | use diagnostics; |
||
+ | use sigtrap; |
||
+ | use autodie; |
||
+ | |||
+ | my $inFile = "snpsHFE_out.txt"; |
||
+ | my $alnFile = "snpsHFE.fasta_aln"; |
||
+ | my $outFile = "snpsHFE_cor.txt"; |
||
+ | |||
+ | my %lineToPos; |
||
+ | |||
+ | my @refSeq = split(//, "..."); #sequence removed for protocol site (too long) |
||
+ | |||
+ | open (IN, "<", $inFile) or die; |
||
+ | open (OUT, ">", $outFile) or die; |
||
+ | |||
+ | my $header = <IN>; |
||
+ | |||
+ | print OUT $header; |
||
+ | |||
+ | while (<IN>) |
||
+ | { |
||
+ | my $line = $_; |
||
+ | my @content = split(/\t/, $line); |
||
+ | |||
+ | my $mutation = $content[3]; |
||
+ | my $pos = $content[4]; |
||
+ | my $protein = $content[1]; |
||
+ | |||
+ | $pos = makePosMap($protein, $pos); |
||
+ | $content[4] = $pos; |
||
+ | $line = ""; |
||
+ | |||
+ | #rebuild line; |
||
+ | for my $item (@content) |
||
+ | { |
||
+ | $line = $line.$item; |
||
+ | |||
+ | if ($item ne $content[scalar(@content)-1]) |
||
+ | { |
||
+ | $line = $line."\t"; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | print OUT $line; |
||
+ | } |
||
+ | |||
+ | close IN; |
||
+ | close OUT; |
||
+ | |||
+ | |||
+ | |||
+ | sub makePosMap |
||
+ | { |
||
+ | my $protein = $_[0]; |
||
+ | my $oldPos = $_[1]; |
||
+ | my $seq = ""; |
||
+ | |||
+ | open (ALN, "<", $alnFile) or die; |
||
+ | |||
+ | while (<ALN>) |
||
+ | { |
||
+ | my $line = $_; |
||
+ | |||
+ | if ($line =~m/^>$protein/) |
||
+ | { |
||
+ | while ($line = <ALN>) |
||
+ | { |
||
+ | if ($line =~m/^>/) |
||
+ | { |
||
+ | last; |
||
+ | } |
||
+ | else |
||
+ | { |
||
+ | $line =~s/\s//gi; |
||
+ | $seq = $seq.$line; |
||
+ | } |
||
+ | } |
||
+ | last; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | close ALN; |
||
+ | |||
+ | my @chars = split(//, $seq); |
||
+ | my $relPos = 1; |
||
+ | my $newPos = 1; |
||
+ | |||
+ | #assuming no gaps in the MSA's reference sequence |
||
+ | |||
+ | for (my $pos = 0; $pos < scalar(@chars); $pos++) |
||
+ | { |
||
+ | if ($chars[$pos] ne "-") |
||
+ | { |
||
+ | if ($oldPos == $relPos) |
||
+ | { |
||
+ | $newPos = $pos+1; |
||
+ | |||
+ | last; |
||
+ | } |
||
+ | |||
+ | $relPos++; |
||
+ | } |
||
+ | elsif ($oldPos == $relPos) |
||
+ | { |
||
+ | print "Something went wrong! Given position is a gap!\n"; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | if ($chars[$newPos-1] ne $refSeq[$newPos-1]) |
||
+ | { |
||
+ | print "Residue at mutation position differs from reference sequence! $chars[$newPos]<>$refSeq[$newPos]\n"; |
||
+ | } |
||
+ | |||
+ | return $newPos; |
||
+ | } |
||
+ | </source> |
||
+ | |||
+ | |||
+ | In the last step all search results were combined, filtered for unique results, and sorted by their position: |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | use strict; |
||
+ | use warnings; |
||
+ | use diagnostics; |
||
+ | use sigtrap; |
||
+ | use autodie; |
||
+ | |||
+ | my @inFiles = ("snpsNP000401_out.txt", "snpsQ30201_out.txt", "snpsHFE_cor.txt"); |
||
+ | my $outFile = "SNPdbe.txt"; |
||
+ | |||
+ | my %mutationToID; |
||
+ | my %mutationToSeq; |
||
+ | my %mutationToType; |
||
+ | my %mutationToAA; |
||
+ | my %mutationToPos; |
||
+ | my %mutationToEffect; |
||
+ | my %mutationToEvidence; |
||
+ | |||
+ | |||
+ | for my $file (@inFiles) |
||
+ | { |
||
+ | open (IN, "<", $file) or die; |
||
+ | |||
+ | my $line = <IN>; |
||
+ | my @content; |
||
+ | |||
+ | while (<IN>) |
||
+ | { |
||
+ | $line = $_; |
||
+ | @content = split(/\t/, $line); |
||
+ | |||
+ | my $id = $content[0]; |
||
+ | my $seq = $content[1]; |
||
+ | my $type = $content[2]; |
||
+ | my $aa = $content[3]; |
||
+ | my $pos = $content[4]; |
||
+ | my $effect = $content[5]; |
||
+ | my $evidence = $content[6]; |
||
+ | my $unique = $aa.$pos; |
||
+ | |||
+ | $evidence =~s/by//gi; |
||
+ | $evidence =~s/\s//gi; |
||
+ | |||
+ | if (not defined $mutationToID{$unique} && (not($id=~m/N\/A/gi))) |
||
+ | { |
||
+ | $mutationToID{$unique} = $id; |
||
+ | } |
||
+ | |||
+ | if (defined $mutationToSeq{$unique}) |
||
+ | { |
||
+ | if ($seq eq "Q30201") |
||
+ | { |
||
+ | $mutationToSeq{$unique} = $seq; |
||
+ | } |
||
+ | elsif ($seq eq "NP_000401" && $mutationToSeq{$unique} ne "Q30201") |
||
+ | { |
||
+ | $mutationToSeq{$unique} = $seq; |
||
+ | } |
||
+ | } |
||
+ | else |
||
+ | { |
||
+ | $mutationToSeq{$unique} = $seq; |
||
+ | } |
||
+ | |||
+ | if (not defined $mutationToType{$unique}) |
||
+ | { |
||
+ | $mutationToType{$unique} = $type; |
||
+ | } |
||
+ | |||
+ | if (not defined $mutationToAA{$unique}) |
||
+ | { |
||
+ | $mutationToAA{$unique} = $aa; |
||
+ | } |
||
+ | |||
+ | if (not defined $mutationToPos{$unique}) |
||
+ | { |
||
+ | $mutationToPos{$unique} = $pos; |
||
+ | } |
||
+ | |||
+ | if ((not defined $mutationToEffect{$unique}) && $effect eq "Disease causing") |
||
+ | { |
||
+ | $mutationToEffect{$unique} = $effect; |
||
+ | } |
||
+ | |||
+ | if ((not defined $mutationToEvidence{$unique}) && ($evidence!~m/Notvalidated/gi)) |
||
+ | { |
||
+ | $mutationToEvidence{$unique} = $evidence; |
||
+ | } |
||
+ | elsif (defined $mutationToEvidence{$unique} && ($evidence!~m/Notvalidated/gi)) |
||
+ | { |
||
+ | my @tmp1 = split(/,/, $mutationToEvidence{$unique}); |
||
+ | my @tmp2 = split(/,/, $evidence); |
||
+ | my %tmp3; |
||
+ | |||
+ | for my $ev1 (@tmp1) |
||
+ | { |
||
+ | $tmp3{$ev1} = 1; |
||
+ | } |
||
+ | |||
+ | for my $ev2 (@tmp2) |
||
+ | { |
||
+ | $tmp3{$ev2} = 1; |
||
+ | } |
||
+ | |||
+ | my $new = ""; |
||
+ | |||
+ | for my $key (keys %tmp3) |
||
+ | { |
||
+ | $new = $new.$key.","; |
||
+ | } |
||
+ | |||
+ | chop($new); |
||
+ | |||
+ | $mutationToEvidence{$unique}= $new; |
||
+ | } |
||
+ | } |
||
+ | } |
||
+ | |||
+ | my @keys = sort{$mutationToPos{$a} <=> $mutationToPos{$b}} keys %mutationToPos; |
||
+ | |||
+ | |||
+ | open (OUT, ">", $outFile) or die; |
||
+ | |||
+ | print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n"; |
||
+ | |||
+ | for my $key (@keys) |
||
+ | { |
||
+ | if (not defined $mutationToID{$key}) |
||
+ | { |
||
+ | $mutationToID{$key} = "N/A"; |
||
+ | } |
||
+ | |||
+ | if (not defined $mutationToEffect{$key}) |
||
+ | { |
||
+ | $mutationToEffect{$key} = "No effect"; |
||
+ | } |
||
+ | |||
+ | if (not defined $mutationToEvidence{$key}) |
||
+ | { |
||
+ | $mutationToEvidence{$key} = "Not validated"; |
||
+ | } |
||
+ | |||
+ | print OUT $mutationToID{$key}."\t"; |
||
+ | print OUT $mutationToSeq{$key}."\t"; |
||
+ | print OUT $mutationToType{$key}."\t"; |
||
+ | print OUT $mutationToAA{$key}."\t"; |
||
+ | print OUT $mutationToPos{$key}."\t"; |
||
+ | print OUT $mutationToEffect{$key}."\t"; |
||
+ | print OUT $mutationToEvidence{$key}."\n"; |
||
+ | } |
||
+ | |||
+ | close OUT; |
||
+ | </source> |
||
<br style="clear:both;"> |
<br style="clear:both;"> |
||
Line 44: | Line 538: | ||
=== SNPedia === |
=== SNPedia === |
||
+ | |||
+ | For [http://www.snpedia.com/index.php/SNPedia SNPedia] we looked at the SNPs listed under [http://www.snpedia.com/index.php/HFE HFE]. |
||
<br style="clear:both;"> |
<br style="clear:both;"> |
Latest revision as of 14:10, 10 June 2012
Contents
Translation and alignment tools
SNP search
HGMD
We used the non-professional version of HGMD and looked for SNPs using the gene symbol HFE as query.
In order to get the SNP statistics the following script was used:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $inFile = "HFE.htm"; my $outFile = "HGMD.txt";
open (IN, "<", $inFile) or die ("Could not open input file!");
open (OUT, ">", $outFile) or die ("Could not open output file!");
- print header
print OUT "ID\tType\tCodon change\tAmino acid change\tPosition\tPhenotype\tReference\n";
while (<IN>) { my $line = $_; my $dummy = $line;
$dummy=~s/\s//gi;
if ($dummy ne "") { my @content = split(/<\/td>/, $line);
my $id = $content[0]; my $type; my $codon = $content[1]; my $aa = $content[2]; my $pos = $content[3]; my $phenotype = $content[5]; my $ref = $content[6]; my $link;
$id =~m/>(\w+?)</gi; $id = $1;
$codon =~m/>([\w-]+?)$/gi; $codon = $1;
$aa =~m/>([\w-]+?)$/gi; $aa = $1;
$pos =~m/>(\d+?)$/gi; $pos = $1;
$phenotype =~m/>([\w\s,?]+?)$/gi; $phenotype = $1;
$ref =~m/^<.*?href="(.+?)">(.*?)<\/a>/gi; $link = $1; $ref = $2; $ref =~ s/<.*?>//gi;
if ($aa =~m/Term/gi) { $type = "Nonsense"; } elsif ($aa =~m/^(.+?)-(.+?)$/g && $1 eq $2) { $type = "Synonymous"; } else { $type = "Non-synonymous"; }
print OUT "$id\t$type\t$codon\t$aa\t$pos\t$phenotype\t[$link $ref]\n"; } }
close IN; close OUT; </source>
dbSNP
SNPdbe
We've searched SNPdbe for SNPs using NP_000401, Q30201, and HFE as query IDs.
The SNP data was extracted from the textfiles with a perl script:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
- my $inFile = "snpsQ30201.txt";
- my $outFile = "snpsQ30201_out.txt";
- my $seqFile = "snpsQ30201_seq.txt";
- my $inFile = "snpsNP000401.txt";
- my $outFile = "snpsNP000401_out.txt";
- my $seqFile = "snpsNP000401_seq.txt";
my $inFile = "snpsHFE.txt"; my $outFile = "snpsHFE_out.txt"; my $seqFile = "snpsHFE_seq.txt";
my %mutationToPos; my %mutationToLine; my %uniqueSeq;
open (IN, "<", $inFile) or die;
open (OUT, ">", $outFile) or die;
open (SEQ, ">", $seqFile) or die;
my $line = <IN>;
print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n";
while (<IN>)
{
my $line = $_;
my @content = split(/\|/, $line);
my $name = $content[0]; my $dbSNP = $content[1]; my $gene = $content[6]; my $protein = $content[7]; my $seq = $content[10]; my $disease = $content[19]; my $wt = $content[21]; my $mt = $content[24]; my $effect = "TODO"; my $evidence = $content[31];
if ($gene eq "HFE") { $name=~m/(\D)(\d+?)(\D)/gi;
my $pos = $2; my $first = oneTo3($1); my $last = oneTo3($3); my $type;
if ($first eq $last) { $type = "Synonymous"; } else { $type = "Non-synonymous"; }
if ($disease =~m/h(a)*emochromatosis/gi || $wt >= 40) { $effect = "Disease causing"; } else { $effect = "No effect"; }
if (defined $uniqueSeq{$protein} && $uniqueSeq{$protein} ne $seq) { print "Warning: different sequences for same protein id: $protein"; }
$uniqueSeq{$protein} = $seq;
print OUT "$dbSNP\t$protein\t$type\t$first-$last\t$pos\t$effect\t$evidence\n"; } }
print SEQ ">sp|Q30201|HFE_HUMAN\n"; print SEQ "...\n"; #sequence removed for protocol site (too long)
for my $key (keys %uniqueSeq) { print SEQ ">$key\n"; print SEQ "$uniqueSeq{$key}\n"; }
close IN; close OUT; close SEQ;
sub oneTo3
{
my $single = $_[0];
if ($single eq "A"){return "Ala";} if ($single eq "R"){return "Arg";} if ($single eq "N"){return "Asn";} if ($single eq "D"){return "Asp";} if ($single eq "C"){return "Cys";} if ($single eq "E"){return "Glu";} if ($single eq "Q"){return "Gln";} if ($single eq "G"){return "Gly";} if ($single eq "H"){return "His";} if ($single eq "I"){return "Ile";} if ($single eq "L"){return "Leu";} if ($single eq "K"){return "Lys";} if ($single eq "M"){return "Met";} if ($single eq "F"){return "Phe";} if ($single eq "P"){return "Pro";} if ($single eq "S"){return "Ser";} if ($single eq "T"){return "Thr";} if ($single eq "W"){return "Trp";} if ($single eq "Y"){return "Tyr";} if ($single eq "V"){return "Val";}
return "XXX"; } </source>
For the HFE search the positions had to be corrected for the reference sequence:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $inFile = "snpsHFE_out.txt"; my $alnFile = "snpsHFE.fasta_aln"; my $outFile = "snpsHFE_cor.txt";
my %lineToPos;
my @refSeq = split(//, "..."); #sequence removed for protocol site (too long)
open (IN, "<", $inFile) or die; open (OUT, ">", $outFile) or die;
my $header = <IN>;
print OUT $header;
while (<IN>) { my $line = $_; my @content = split(/\t/, $line);
my $mutation = $content[3]; my $pos = $content[4]; my $protein = $content[1];
$pos = makePosMap($protein, $pos); $content[4] = $pos; $line = "";
#rebuild line; for my $item (@content) { $line = $line.$item;
if ($item ne $content[scalar(@content)-1]) { $line = $line."\t"; } }
print OUT $line; }
close IN; close OUT;
sub makePosMap { my $protein = $_[0]; my $oldPos = $_[1]; my $seq = "";
open (ALN, "<", $alnFile) or die;
while (<ALN>) { my $line = $_;
if ($line =~m/^>$protein/) { while ($line = <ALN>) { if ($line =~m/^>/) { last; } else { $line =~s/\s//gi; $seq = $seq.$line; } } last; } }
close ALN;
my @chars = split(//, $seq); my $relPos = 1; my $newPos = 1;
#assuming no gaps in the MSA's reference sequence
for (my $pos = 0; $pos < scalar(@chars); $pos++) { if ($chars[$pos] ne "-") { if ($oldPos == $relPos) { $newPos = $pos+1;
last; }
$relPos++; } elsif ($oldPos == $relPos) { print "Something went wrong! Given position is a gap!\n"; } }
if ($chars[$newPos-1] ne $refSeq[$newPos-1]) { print "Residue at mutation position differs from reference sequence! $chars[$newPos]<>$refSeq[$newPos]\n"; }
return $newPos; } </source>
In the last step all search results were combined, filtered for unique results, and sorted by their position:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my @inFiles = ("snpsNP000401_out.txt", "snpsQ30201_out.txt", "snpsHFE_cor.txt"); my $outFile = "SNPdbe.txt";
my %mutationToID; my %mutationToSeq; my %mutationToType; my %mutationToAA; my %mutationToPos; my %mutationToEffect; my %mutationToEvidence;
for my $file (@inFiles)
{
open (IN, "<", $file) or die;
my $line = <IN>; my @content;
while (<IN>) { $line = $_; @content = split(/\t/, $line);
my $id = $content[0]; my $seq = $content[1]; my $type = $content[2]; my $aa = $content[3]; my $pos = $content[4]; my $effect = $content[5]; my $evidence = $content[6]; my $unique = $aa.$pos;
$evidence =~s/by//gi; $evidence =~s/\s//gi;
if (not defined $mutationToID{$unique} && (not($id=~m/N\/A/gi))) { $mutationToID{$unique} = $id; }
if (defined $mutationToSeq{$unique}) { if ($seq eq "Q30201") { $mutationToSeq{$unique} = $seq; } elsif ($seq eq "NP_000401" && $mutationToSeq{$unique} ne "Q30201") { $mutationToSeq{$unique} = $seq; } } else { $mutationToSeq{$unique} = $seq; }
if (not defined $mutationToType{$unique}) { $mutationToType{$unique} = $type; }
if (not defined $mutationToAA{$unique}) { $mutationToAA{$unique} = $aa; }
if (not defined $mutationToPos{$unique}) { $mutationToPos{$unique} = $pos; }
if ((not defined $mutationToEffect{$unique}) && $effect eq "Disease causing") { $mutationToEffect{$unique} = $effect; }
if ((not defined $mutationToEvidence{$unique}) && ($evidence!~m/Notvalidated/gi)) { $mutationToEvidence{$unique} = $evidence; } elsif (defined $mutationToEvidence{$unique} && ($evidence!~m/Notvalidated/gi)) { my @tmp1 = split(/,/, $mutationToEvidence{$unique}); my @tmp2 = split(/,/, $evidence); my %tmp3;
for my $ev1 (@tmp1) { $tmp3{$ev1} = 1; }
for my $ev2 (@tmp2) { $tmp3{$ev2} = 1; }
my $new = "";
for my $key (keys %tmp3) { $new = $new.$key.","; }
chop($new);
$mutationToEvidence{$unique}= $new; } } }
my @keys = sort{$mutationToPos{$a} <=> $mutationToPos{$b}} keys %mutationToPos;
open (OUT, ">", $outFile) or die;
print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n";
for my $key (@keys) { if (not defined $mutationToID{$key}) { $mutationToID{$key} = "N/A"; }
if (not defined $mutationToEffect{$key}) { $mutationToEffect{$key} = "No effect"; }
if (not defined $mutationToEvidence{$key}) { $mutationToEvidence{$key} = "Not validated"; }
print OUT $mutationToID{$key}."\t"; print OUT $mutationToSeq{$key}."\t"; print OUT $mutationToType{$key}."\t"; print OUT $mutationToAA{$key}."\t"; print OUT $mutationToPos{$key}."\t"; print OUT $mutationToEffect{$key}."\t"; print OUT $mutationToEvidence{$key}."\n"; }
close OUT; </source>
OMIM
We searched OMIM for the HFE Gene and looked at the listed allelic variants.
SNPedia
For SNPedia we looked at the SNPs listed under HFE.
Mapping