Task5 Hemochromatosis Protocol
Contents
Translation and alignment tools
SNP search
HGMD
We used the non-professional version of HGMD and looked for SNPs using the gene symbol HFE as query.
In order to get the SNP statistics the following script was used:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $inFile = "HFE.htm"; my $outFile = "HGMD.txt";
open (IN, "<", $inFile) or die ("Could not open input file!");
open (OUT, ">", $outFile) or die ("Could not open output file!");
- print header
print OUT "ID\tType\tCodon change\tAmino acid change\tPosition\tPhenotype\tReference\n";
while (<IN>) { my $line = $_; my $dummy = $line;
$dummy=~s/\s//gi;
if ($dummy ne "") { my @content = split(/<\/td>/, $line);
my $id = $content[0]; my $type; my $codon = $content[1]; my $aa = $content[2]; my $pos = $content[3]; my $phenotype = $content[5]; my $ref = $content[6]; my $link;
$id =~m/>(\w+?)</gi; $id = $1;
$codon =~m/>([\w-]+?)$/gi; $codon = $1;
$aa =~m/>([\w-]+?)$/gi; $aa = $1;
$pos =~m/>(\d+?)$/gi; $pos = $1;
$phenotype =~m/>([\w\s,?]+?)$/gi; $phenotype = $1;
$ref =~m/^<.*?href="(.+?)">(.*?)<\/a>/gi; $link = $1; $ref = $2; $ref =~ s/<.*?>//gi;
if ($aa =~m/Term/gi) { $type = "Nonsense"; } elsif ($aa =~m/^(.+?)-(.+?)$/g && $1 eq $2) { $type = "Synonymous"; } else { $type = "Non-synonymous"; }
print OUT "$id\t$type\t$codon\t$aa\t$pos\t$phenotype\t[$link $ref]\n"; } }
close IN; close OUT; </source>
dbSNP
SNPdbe
We've searched SNPdbe for SNPs using NP_000401, Q30201, and HFE as query IDs.
The SNP data was extracted from the textfiles with a perl script:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
- my $inFile = "snpsQ30201.txt";
- my $outFile = "snpsQ30201_out.txt";
- my $seqFile = "snpsQ30201_seq.txt";
- my $inFile = "snpsNP000401.txt";
- my $outFile = "snpsNP000401_out.txt";
- my $seqFile = "snpsNP000401_seq.txt";
my $inFile = "snpsHFE.txt"; my $outFile = "snpsHFE_out.txt"; my $seqFile = "snpsHFE_seq.txt";
my %mutationToPos; my %mutationToLine; my %uniqueSeq;
open (IN, "<", $inFile) or die;
open (OUT, ">", $outFile) or die;
open (SEQ, ">", $seqFile) or die;
my $line = <IN>;
print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n";
while (<IN>)
{
my $line = $_;
my @content = split(/\|/, $line);
my $name = $content[0]; my $dbSNP = $content[1]; my $gene = $content[6]; my $protein = $content[7]; my $seq = $content[10]; my $disease = $content[19]; my $wt = $content[21]; my $mt = $content[24]; my $effect = "TODO"; my $evidence = $content[31];
if ($gene eq "HFE") { $name=~m/(\D)(\d+?)(\D)/gi;
my $pos = $2; my $first = oneTo3($1); my $last = oneTo3($3); my $type;
if ($first eq $last) { $type = "Synonymous"; } else { $type = "Non-synonymous"; }
if ($disease =~m/h(a)*emochromatosis/gi || $wt >= 40) { $effect = "Disease causing"; } else { $effect = "No effect"; }
if (defined $uniqueSeq{$protein} && $uniqueSeq{$protein} ne $seq) { print "Warning: different sequences for same protein id: $protein"; }
$uniqueSeq{$protein} = $seq;
print OUT "$dbSNP\t$protein\t$type\t$first-$last\t$pos\t$effect\t$evidence\n"; } }
print SEQ ">sp|Q30201|HFE_HUMAN\n"; print SEQ "...\n"; #sequence removed for protocol site (too long)
for my $key (keys %uniqueSeq) { print SEQ ">$key\n"; print SEQ "$uniqueSeq{$key}\n"; }
close IN; close OUT; close SEQ;
sub oneTo3
{
my $single = $_[0];
if ($single eq "A"){return "Ala";} if ($single eq "R"){return "Arg";} if ($single eq "N"){return "Asn";} if ($single eq "D"){return "Asp";} if ($single eq "C"){return "Cys";} if ($single eq "E"){return "Glu";} if ($single eq "Q"){return "Gln";} if ($single eq "G"){return "Gly";} if ($single eq "H"){return "His";} if ($single eq "I"){return "Ile";} if ($single eq "L"){return "Leu";} if ($single eq "K"){return "Lys";} if ($single eq "M"){return "Met";} if ($single eq "F"){return "Phe";} if ($single eq "P"){return "Pro";} if ($single eq "S"){return "Ser";} if ($single eq "T"){return "Thr";} if ($single eq "W"){return "Trp";} if ($single eq "Y"){return "Tyr";} if ($single eq "V"){return "Val";}
return "XXX"; } </source>
For the HFE search the positions had to be corrected for the reference sequence:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $inFile = "snpsHFE_out.txt"; my $alnFile = "snpsHFE.fasta_aln"; my $outFile = "snpsHFE_cor.txt";
my %lineToPos;
my @refSeq = split(//, "..."); #sequence removed for protocol site (too long)
open (IN, "<", $inFile) or die; open (OUT, ">", $outFile) or die;
my $header = <IN>;
print OUT $header;
while (<IN>) { my $line = $_; my @content = split(/\t/, $line);
my $mutation = $content[3]; my $pos = $content[4]; my $protein = $content[1];
$pos = makePosMap($protein, $pos); $content[4] = $pos; $line = "";
#rebuild line; for my $item (@content) { $line = $line.$item;
if ($item ne $content[scalar(@content)-1]) { $line = $line."\t"; } }
print OUT $line; }
close IN; close OUT;
sub makePosMap { my $protein = $_[0]; my $oldPos = $_[1]; my $seq = "";
open (ALN, "<", $alnFile) or die;
while (<ALN>) { my $line = $_;
if ($line =~m/^>$protein/) { while ($line = <ALN>) { if ($line =~m/^>/) { last; } else { $line =~s/\s//gi; $seq = $seq.$line; } } last; } }
close ALN;
my @chars = split(//, $seq); my $relPos = 1; my $newPos = 1;
#assuming no gaps in the MSA's reference sequence
for (my $pos = 0; $pos < scalar(@chars); $pos++) { if ($chars[$pos] ne "-") { if ($oldPos == $relPos) { $newPos = $pos+1;
last; }
$relPos++; } elsif ($oldPos == $relPos) { print "Something went wrong! Given position is a gap!\n"; } }
if ($chars[$newPos-1] ne $refSeq[$newPos-1]) { print "Residue at mutation position differs from reference sequence! $chars[$newPos]<>$refSeq[$newPos]\n"; }
return $newPos; } </source>
In the last step all search results were combined, filtered for unique results, and sorted by their position:
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my @inFiles = ("snpsNP000401_out.txt", "snpsQ30201_out.txt", "snpsHFE_cor.txt"); my $outFile = "SNPdbe.txt";
my %mutationToID; my %mutationToSeq; my %mutationToType; my %mutationToAA; my %mutationToPos; my %mutationToEffect; my %mutationToEvidence;
for my $file (@inFiles)
{
open (IN, "<", $file) or die;
my $line = <IN>; my @content;
while (<IN>) { $line = $_; @content = split(/\t/, $line);
my $id = $content[0]; my $seq = $content[1]; my $type = $content[2]; my $aa = $content[3]; my $pos = $content[4]; my $effect = $content[5]; my $evidence = $content[6]; my $unique = $aa.$pos;
$evidence =~s/by//gi; $evidence =~s/\s//gi;
if (not defined $mutationToID{$unique} && (not($id=~m/N\/A/gi))) { $mutationToID{$unique} = $id; }
if (defined $mutationToSeq{$unique}) { if ($seq eq "Q30201") { $mutationToSeq{$unique} = $seq; } elsif ($seq eq "NP_000401" && $mutationToSeq{$unique} ne "Q30201") { $mutationToSeq{$unique} = $seq; } } else { $mutationToSeq{$unique} = $seq; }
if (not defined $mutationToType{$unique}) { $mutationToType{$unique} = $type; }
if (not defined $mutationToAA{$unique}) { $mutationToAA{$unique} = $aa; }
if (not defined $mutationToPos{$unique}) { $mutationToPos{$unique} = $pos; }
if ((not defined $mutationToEffect{$unique}) && $effect eq "Disease causing") { $mutationToEffect{$unique} = $effect; }
if ((not defined $mutationToEvidence{$unique}) && ($evidence!~m/Notvalidated/gi)) { $mutationToEvidence{$unique} = $evidence; } elsif (defined $mutationToEvidence{$unique} && ($evidence!~m/Notvalidated/gi)) { my @tmp1 = split(/,/, $mutationToEvidence{$unique}); my @tmp2 = split(/,/, $evidence); my %tmp3;
for my $ev1 (@tmp1) { $tmp3{$ev1} = 1; }
for my $ev2 (@tmp2) { $tmp3{$ev2} = 1; }
my $new = "";
for my $key (keys %tmp3) { $new = $new.$key.","; }
chop($new);
$mutationToEvidence{$unique}= $new; } } }
my @keys = sort{$mutationToPos{$a} <=> $mutationToPos{$b}} keys %mutationToPos;
open (OUT, ">", $outFile) or die;
print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n";
for my $key (@keys) { if (not defined $mutationToID{$key}) { $mutationToID{$key} = "N/A"; }
if (not defined $mutationToEffect{$key}) { $mutationToEffect{$key} = "No effect"; }
if (not defined $mutationToEvidence{$key}) { $mutationToEvidence{$key} = "Not validated"; }
print OUT $mutationToID{$key}."\t"; print OUT $mutationToSeq{$key}."\t"; print OUT $mutationToType{$key}."\t"; print OUT $mutationToAA{$key}."\t"; print OUT $mutationToPos{$key}."\t"; print OUT $mutationToEffect{$key}."\t"; print OUT $mutationToEvidence{$key}."\n"; }
close OUT; </source>
OMIM
We searched OMIM for the HFE Gene and looked at the listed allelic variants.
SNPedia
For SNPedia we looked at the SNPs listed under HFE.
Mapping