Task5 Hemochromatosis Protocol

From Bioinformatikpedia
Revision as of 14:10, 10 June 2012 by Bernhoferm (talk | contribs) (SNPdbe)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

Translation and alignment tools

  • Translation (RNA/DNA->AA): Link
  • Online alignment: Link
  • Reference protein sequence: Q30201


SNP search


HGMD

We used the non-professional version of HGMD and looked for SNPs using the gene symbol HFE as query.

In order to get the SNP statistics the following script was used:

<source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $inFile = "HFE.htm"; my $outFile = "HGMD.txt";


open (IN, "<", $inFile) or die ("Could not open input file!"); open (OUT, ">", $outFile) or die ("Could not open output file!");

  1. print header

print OUT "ID\tType\tCodon change\tAmino acid change\tPosition\tPhenotype\tReference\n";

while (<IN>) { my $line = $_; my $dummy = $line;

$dummy=~s/\s//gi;

if ($dummy ne "") { my @content = split(/<\/td>/, $line);

my $id = $content[0]; my $type; my $codon = $content[1]; my $aa = $content[2]; my $pos = $content[3]; my $phenotype = $content[5]; my $ref = $content[6]; my $link;

$id =~m/>(\w+?)</gi; $id = $1;

$codon =~m/>([\w-]+?)$/gi; $codon = $1;

$aa =~m/>([\w-]+?)$/gi; $aa = $1;

$pos =~m/>(\d+?)$/gi; $pos = $1;

$phenotype =~m/>([\w\s,?]+?)$/gi; $phenotype = $1;

$ref =~m/^<.*?href="(.+?)">(.*?)<\/a>/gi; $link = $1; $ref = $2; $ref =~ s/<.*?>//gi;

if ($aa =~m/Term/gi) { $type = "Nonsense"; } elsif ($aa =~m/^(.+?)-(.+?)$/g && $1 eq $2) { $type = "Synonymous"; } else { $type = "Non-synonymous"; }

print OUT "$id\t$type\t$codon\t$aa\t$pos\t$phenotype\t[$link $ref]\n"; } }

close IN; close OUT; </source>


dbSNP


SNPdbe

We've searched SNPdbe for SNPs using NP_000401, Q30201, and HFE as query IDs.

The SNP data was extracted from the textfiles with a perl script:

<source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

  1. my $inFile = "snpsQ30201.txt";
  2. my $outFile = "snpsQ30201_out.txt";
  3. my $seqFile = "snpsQ30201_seq.txt";
  4. my $inFile = "snpsNP000401.txt";
  5. my $outFile = "snpsNP000401_out.txt";
  6. my $seqFile = "snpsNP000401_seq.txt";

my $inFile = "snpsHFE.txt"; my $outFile = "snpsHFE_out.txt"; my $seqFile = "snpsHFE_seq.txt";

my %mutationToPos; my %mutationToLine; my %uniqueSeq;


open (IN, "<", $inFile) or die; open (OUT, ">", $outFile) or die; open (SEQ, ">", $seqFile) or die;

my $line = <IN>;

print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n";


while (<IN>) { my $line = $_; my @content = split(/\|/, $line);

my $name = $content[0]; my $dbSNP = $content[1]; my $gene = $content[6]; my $protein = $content[7]; my $seq = $content[10]; my $disease = $content[19]; my $wt = $content[21]; my $mt = $content[24]; my $effect = "TODO"; my $evidence = $content[31];

if ($gene eq "HFE") { $name=~m/(\D)(\d+?)(\D)/gi;

my $pos = $2; my $first = oneTo3($1); my $last = oneTo3($3); my $type;

if ($first eq $last) { $type = "Synonymous"; } else { $type = "Non-synonymous"; }

if ($disease =~m/h(a)*emochromatosis/gi || $wt >= 40) { $effect = "Disease causing"; } else { $effect = "No effect"; }

if (defined $uniqueSeq{$protein} && $uniqueSeq{$protein} ne $seq) { print "Warning: different sequences for same protein id: $protein"; }

$uniqueSeq{$protein} = $seq;

print OUT "$dbSNP\t$protein\t$type\t$first-$last\t$pos\t$effect\t$evidence\n"; } }

print SEQ ">sp|Q30201|HFE_HUMAN\n"; print SEQ "...\n"; #sequence removed for protocol site (too long)

for my $key (keys %uniqueSeq) { print SEQ ">$key\n"; print SEQ "$uniqueSeq{$key}\n"; }

close IN; close OUT; close SEQ;


sub oneTo3 { my $single = $_[0];

if ($single eq "A"){return "Ala";} if ($single eq "R"){return "Arg";} if ($single eq "N"){return "Asn";} if ($single eq "D"){return "Asp";} if ($single eq "C"){return "Cys";} if ($single eq "E"){return "Glu";} if ($single eq "Q"){return "Gln";} if ($single eq "G"){return "Gly";} if ($single eq "H"){return "His";} if ($single eq "I"){return "Ile";} if ($single eq "L"){return "Leu";} if ($single eq "K"){return "Lys";} if ($single eq "M"){return "Met";} if ($single eq "F"){return "Phe";} if ($single eq "P"){return "Pro";} if ($single eq "S"){return "Ser";} if ($single eq "T"){return "Thr";} if ($single eq "W"){return "Trp";} if ($single eq "Y"){return "Tyr";} if ($single eq "V"){return "Val";}

return "XXX"; } </source>


For the HFE search the positions had to be corrected for the reference sequence:

<source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $inFile = "snpsHFE_out.txt"; my $alnFile = "snpsHFE.fasta_aln"; my $outFile = "snpsHFE_cor.txt";

my %lineToPos;

my @refSeq = split(//, "..."); #sequence removed for protocol site (too long)

open (IN, "<", $inFile) or die; open (OUT, ">", $outFile) or die;

my $header = <IN>;

print OUT $header;

while (<IN>) { my $line = $_; my @content = split(/\t/, $line);

my $mutation = $content[3]; my $pos = $content[4]; my $protein = $content[1];

$pos = makePosMap($protein, $pos); $content[4] = $pos; $line = "";

#rebuild line; for my $item (@content) { $line = $line.$item;

if ($item ne $content[scalar(@content)-1]) { $line = $line."\t"; } }

print OUT $line; }

close IN; close OUT;


sub makePosMap { my $protein = $_[0]; my $oldPos = $_[1]; my $seq = "";

open (ALN, "<", $alnFile) or die;

while (<ALN>) { my $line = $_;

if ($line =~m/^>$protein/) { while ($line = <ALN>) { if ($line =~m/^>/) { last; } else { $line =~s/\s//gi; $seq = $seq.$line; } } last; } }

close ALN;

my @chars = split(//, $seq); my $relPos = 1; my $newPos = 1;

#assuming no gaps in the MSA's reference sequence

for (my $pos = 0; $pos < scalar(@chars); $pos++) { if ($chars[$pos] ne "-") { if ($oldPos == $relPos) { $newPos = $pos+1;

last; }

$relPos++; } elsif ($oldPos == $relPos) { print "Something went wrong! Given position is a gap!\n"; } }

if ($chars[$newPos-1] ne $refSeq[$newPos-1]) { print "Residue at mutation position differs from reference sequence! $chars[$newPos]<>$refSeq[$newPos]\n"; }

return $newPos; } </source>


In the last step all search results were combined, filtered for unique results, and sorted by their position:

<source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my @inFiles = ("snpsNP000401_out.txt", "snpsQ30201_out.txt", "snpsHFE_cor.txt"); my $outFile = "SNPdbe.txt";

my %mutationToID; my %mutationToSeq; my %mutationToType; my %mutationToAA; my %mutationToPos; my %mutationToEffect; my %mutationToEvidence;


for my $file (@inFiles) { open (IN, "<", $file) or die;

my $line = <IN>; my @content;

while (<IN>) { $line = $_; @content = split(/\t/, $line);

my $id = $content[0]; my $seq = $content[1]; my $type = $content[2]; my $aa = $content[3]; my $pos = $content[4]; my $effect = $content[5]; my $evidence = $content[6]; my $unique = $aa.$pos;

$evidence =~s/by//gi; $evidence =~s/\s//gi;

if (not defined $mutationToID{$unique} && (not($id=~m/N\/A/gi))) { $mutationToID{$unique} = $id; }

if (defined $mutationToSeq{$unique}) { if ($seq eq "Q30201") { $mutationToSeq{$unique} = $seq; } elsif ($seq eq "NP_000401" && $mutationToSeq{$unique} ne "Q30201") { $mutationToSeq{$unique} = $seq; } } else { $mutationToSeq{$unique} = $seq; }

if (not defined $mutationToType{$unique}) { $mutationToType{$unique} = $type; }

if (not defined $mutationToAA{$unique}) { $mutationToAA{$unique} = $aa; }

if (not defined $mutationToPos{$unique}) { $mutationToPos{$unique} = $pos; }

if ((not defined $mutationToEffect{$unique}) && $effect eq "Disease causing") { $mutationToEffect{$unique} = $effect; }

if ((not defined $mutationToEvidence{$unique}) && ($evidence!~m/Notvalidated/gi)) { $mutationToEvidence{$unique} = $evidence; } elsif (defined $mutationToEvidence{$unique} && ($evidence!~m/Notvalidated/gi)) { my @tmp1 = split(/,/, $mutationToEvidence{$unique}); my @tmp2 = split(/,/, $evidence); my %tmp3;

for my $ev1 (@tmp1) { $tmp3{$ev1} = 1; }

for my $ev2 (@tmp2) { $tmp3{$ev2} = 1; }

my $new = "";

for my $key (keys %tmp3) { $new = $new.$key.","; }

chop($new);

$mutationToEvidence{$unique}= $new; } } }

my @keys = sort{$mutationToPos{$a} <=> $mutationToPos{$b}} keys %mutationToPos;


open (OUT, ">", $outFile) or die;

print OUT "dbSNP ID\tSequence\tType\tAmino acid change\tPosition\tEffect\tExperimental evidence\n";

for my $key (@keys) { if (not defined $mutationToID{$key}) { $mutationToID{$key} = "N/A"; }

if (not defined $mutationToEffect{$key}) { $mutationToEffect{$key} = "No effect"; }

if (not defined $mutationToEvidence{$key}) { $mutationToEvidence{$key} = "Not validated"; }

print OUT $mutationToID{$key}."\t"; print OUT $mutationToSeq{$key}."\t"; print OUT $mutationToType{$key}."\t"; print OUT $mutationToAA{$key}."\t"; print OUT $mutationToPos{$key}."\t"; print OUT $mutationToEffect{$key}."\t"; print OUT $mutationToEvidence{$key}."\n"; }

close OUT; </source>


OMIM

We searched OMIM for the HFE Gene and looked at the listed allelic variants.


SNPedia

For SNPedia we looked at the SNPs listed under HFE.


Mapping