Difference between revisions of "Task2 Hemochromatosis Protocol"

From Bioinformatikpedia
(Blast)
(MSA)
 
(22 intermediate revisions by 2 users not shown)
Line 4: Line 4:
 
TSK="/mnt/home/student/bernhoferm/task2"
 
TSK="/mnt/home/student/bernhoferm/task2"
 
BIG="/mnt/project/pracstrucfunc12/data/big"
 
BIG="/mnt/project/pracstrucfunc12/data/big"
  +
time blastall -p blastp -d $BIG/big_80 -i $TSK/Q30201.fasta -b 1250 -v 1250 -o $TSK/align/blast_default_1250.out
 
  +
time blastall -p blastp -d $BIG/big_80 -i $TSK/Q30201.fasta -b 1500 -m 8 -o $TSK/align2/blast_default_1500.out
 
  +
real 1m8.589s
 
  +
real 1m24.100s
user 1m6.810s
 
sys 0m1.110s
+
user 1m5.950s
  +
sys 0m3.500s
   
 
=== PSI-Blast ===
 
=== PSI-Blast ===
  +
TSK="/mnt/home/student/bernhoferm/task2"
  +
BIG="/mnt/project/pracstrucfunc12/data/big"
  +
  +
  +
time blastpgp -C $TSK/h002j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j2_10000.out
  +
  +
real 3m21.031s
  +
user 2m44.750s
  +
sys 0m10.270s
  +
  +
  +
time blastpgp -C $TSK/h002j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j10_10000.out
  +
  +
real 16m39.699s
  +
user 14m22.150s
  +
sys 0m41.310s
  +
  +
  +
time blastpgp -C $TSK/h1e-10j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j2_10000.out
  +
  +
real 3m6.093s
  +
user 2m48.260s
  +
sys 0m6.020s
  +
  +
  +
time blastpgp -C $TSK/h1e-10j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j10_10000.out
  +
  +
real 16m41.797s
  +
user 15m18.350s
  +
sys 0m28.890s
  +
  +
  +
time blastpgp -R $TSK/h002j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j2_100000.out
  +
  +
real 28m17.459s
  +
user 18m26.930s
  +
sys 2m46.200s
  +
  +
  +
time blastpgp -R $TSK/h002j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j10_100000.out
  +
  +
real 367m15.021s (srsly...)
  +
user 310m31.210s
  +
sys 14m39.460s
  +
...and some thousand "[blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048" messages
  +
  +
  +
time blastpgp -R $TSK/h1e-10j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j2_100000.out
  +
  +
real 26m43.350s
  +
user 19m54.380s
  +
sys 2m3.900s
  +
  +
  +
time blastpgp -R $TSK/h1e-10j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j10_100000.out
  +
  +
real 64m4.575s
  +
user 47m16.970s
  +
sys 4m54.890s
  +
...and some thousand "[blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048" messages
   
 
=== HHblits ===
 
=== HHblits ===
  +
TSK="/mnt/home/student/bernhoferm/task2"
  +
HHB="/mnt/project/pracstrucfunc12/data/hhblits"
  +
  +
  +
time hhblits -d $HHB/uniprot20_current -i $TSK/Q30201.fasta -Z 600 -B 600 -o $TSK/align2/hhblits_default_600.out
  +
  +
real 13m13.813s
  +
user 5m4.400s
  +
sys 2m45.520s
   
 
== MSA ==
 
== MSA ==
  +
#!/bin/bash
  +
time t_coffee /mnt/home/student/joerdensv/newGroups/group0-40.fasta -output=fasta_aln
  +
time t_coffee /mnt/home/student/joerdensv/newGroups/group0-99.fasta -output=fasta_aln
  +
time t_coffee /mnt/home/student/joerdensv/newGroups/group60-99.fasta -output=fasta_aln
  +
time muscle -in /mnt/home/student/joerdensv/newGroups/group60-99.fasta -out group60-99muscle.fasta -quiet
  +
time muscle -in /mnt/home/student/joerdensv/newGroups/group0-99.fasta -out group0-99muscle.fasta -quiet
  +
time muscle -in /mnt/home/student/joerdensv/newGroups/group0-40.fasta -out group0-40muscle.fasta -quiet
  +
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group60-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group60-99clustalw.fasta -output=fasta
  +
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-99clustalw.fasta -output=fasta
  +
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-40.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-40clustalw.fasta -output=fasta
  +
  +
GRP="/mnt/home/student/bernhoferm/task2/groups"
  +
time t_coffee $GRP/group0-40.fasta -method pdb_pair,proba_pair -template_file $GRP/group0-40.template -output=fasta_aln
  +
  +
real 3m42.931s
  +
user 1m50.940s
  +
sys 0m23.180s
  +
  +
  +
>group0-40.template
  +
>sp|Q30201|HFE_HUMAN Hereditary hemochromatosis protein OS=Homo sapiens GN=HFE PE=1 SV=1 _P_ 1a6zA
  +
>sp|P16391|HA12_RAT RT1 class I histocompatibility antigen, AA alpha chain OS=Rattus norvegicus PE=1 SV=2 _P_ 1ed3A
  +
>sp|P05534|1A24_HUMAN HLA class I histocompatibility antigen, A-24 alpha chain OS=Homo sapiens GN=HLA-A PE=1 SV=2 _P_ 2bckA
  +
>tr|Q30597|Q30597_MACMU MHC class I Mamu-A*02 (Fragment) OS=Macaca mulatta PE=1 SV=1 _P_ 3jtsA
  +
>sp|P01900|HA12_MOUSE H-2 class I histocompatibility antigen, D-D alpha chain OS=Mus musculus GN=H2-D1 PE=1 SV=1 _P_ 1biiA
  +
>tr|Q31093|Q31093_MOUSE Histocompatibility 2, M region locus 3 OS=Mus musculus GN=H2-M3 PE=1 SV=1 _P_ 1mhcA
  +
>sp|P14432|HA1T_MOUSE H-2 class I histocompatibility antigen, TLA(B) alpha chain OS=Mus musculus GN=H2-T3 PE=1 SV=2 _P_ 1nezA
  +
>tr|Q860W6|Q860W6_MOUSE Major histocompatibility complex class Ib M10.5 (Fragment) OS=Mus musculus GN=H2-M10.4 PE=1 SV=1 _P_ 1zs8A
  +
>tr|Q31615|Q31615_MOUSE MHC class I H2-TL-27-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T22 PE=1 SV=1 _P_ 1c16A
  +
>tr|Q31206|Q31206_MOUSE MHC class I H2-TL-T10-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T10 PE=1 SV=1 _P_ 1r3hA
  +
>sp|P01921|HB2D_MOUSE H-2 class II histocompatibility antigen, A-D beta chain OS=Mus musculus GN=H2-Ab1 PE=1 SV=1 _P_ 1iaoB
  +
  +
== Scripts ==
  +
  +
=== (PSI-)BLAST ===
  +
  +
The output files from BLAST and PSI-BLAST were parsed and filtered with the following perl script. The PSI-BLAST output "filtered" for the last iteration step by hand (before using the script). If mutliple hits with the same ID were found, only the first one was considered (the one with the best e-Value).
  +
  +
  +
>getM8StatsAndIDs.pl
  +
<source lang="perl">
  +
#!/usr/local/bin/perl
  +
use strict;
  +
use warnings;
  +
use diagnostics;
  +
use sigtrap;
  +
use autodie;
  +
  +
my $filename = shift;
  +
my $outputFile = "$filename.stats";
  +
my $outputFile2 = "$filename.ids";
  +
  +
my %IDtoEVal;
  +
my %IDtoIdent;
  +
  +
my $id;
  +
my $eVal;
  +
my $ident;
  +
  +
open (FILE, "<", $filename) or die;
  +
  +
while (<FILE>)
  +
{
  +
my $line = $_;
  +
  +
my @content = split(/\t/, $line);
  +
  +
$id = $content[1];
  +
$eVal = $content[10];
  +
$ident = $content[2];
  +
  +
$id =~s/\s//gi;
  +
$eVal =~s/\s//gi;
  +
$ident =~s/\s//gi;
  +
  +
$id =~m/.*?\|(.*?)\|(.*?)$/gi;
  +
  +
my $uniID = $1;
  +
my $pdbID = $2;
  +
  +
if ($uniID=~m/^pdb$/gi)
  +
{
  +
$id = "PDB:".$pdbID;
  +
}
  +
else
  +
{
  +
$id = $uniID;
  +
}
  +
  +
if ($eVal=~m/^e/)
  +
{
  +
$eVal="1".$eVal;
  +
}
  +
  +
if (not defined $IDtoEVal{$id})
  +
{
  +
$IDtoEVal{$id} = $eVal;
  +
$IDtoIdent{$id} = $ident;
  +
}
  +
}
  +
  +
close FILE;
  +
  +
  +
  +
#Sort Data
  +
#my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;
  +
my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;
  +
  +
  +
  +
#Thresholds
  +
my $EValThreshold = 0.002;
  +
my $IdentThreshold = 0;
  +
  +
  +
open (OUTFILE, ">", $outputFile) or die;
  +
open (OUTFILE2, ">", $outputFile2) or die;
  +
  +
foreach my $key (@keys)
  +
{
  +
if ($IDtoEVal{$key} <= $EValThreshold && $IDtoIdent{$key} >= $IdentThreshold)
  +
{
  +
print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n";
  +
print OUTFILE2 "$key\n";
  +
}
  +
}
  +
  +
close OUTFILE;
  +
close OUTFILE2;
  +
</source>
  +
  +
=== HHblits ===
  +
  +
The perl script bellow was used to parse and filter the HHblits output. For multiple hits again only the first one was considered.
  +
  +
  +
<source lang="perl">
  +
#!/usr/local/bin/perl
  +
use strict;
  +
use warnings;
  +
use diagnostics;
  +
use sigtrap;
  +
use autodie;
  +
  +
my $filename = shift;
  +
my $outputFile = "$filename.stats";
  +
my $outputFile2 = "$filename.ids";
  +
  +
my %IDtoEVal;
  +
my %IDtoIdent;
  +
my %UniprotIDs;
  +
  +
my $id;
  +
my $eVal;
  +
my $ident;
  +
my $count = 1;
  +
  +
  +
open (FILE, "<", $filename) or die;
  +
  +
while (<FILE>)
  +
{
  +
my $line = $_;
  +
  +
if ($line=~m/>UP20\|(.*?)\|.+/gi)
  +
{
  +
$id = $1;
  +
  +
my @tmp = split(/\|/, $line);
  +
my $uid;
  +
  +
for (my $i=4; $i < scalar(@tmp); $i++)
  +
{
  +
$uid = $tmp[$i];
  +
$uid =~ s/\s*\[.*?\]//;
  +
$uid =~ s/\.//;
  +
$uid =~ s/\s//;
  +
$UniprotIDs{$uid}="alive";
  +
}
  +
}
  +
  +
if ($line=~m/.*?E-value\s*=\s*(\d+?\D*?\d*?\D*?\d*?)\s+/gi)
  +
{
  +
$eVal = $1;
  +
  +
if ($eVal=~m/^e/)
  +
{
  +
$eVal="1".$eVal;
  +
}
  +
  +
if (not defined $IDtoEVal{$id})
  +
{
  +
$IDtoEVal{$id} = $eVal;
  +
}
  +
}
  +
  +
if ($line=~m/.*?Identities\s*=\s*(\d*?)\D/gi)
  +
{
  +
$ident = $1;
  +
  +
if (not defined $IDtoIdent{$id})
  +
{
  +
$IDtoIdent{$id} = $ident/100;
  +
}
  +
}
  +
}
  +
  +
close FILE;
  +
  +
  +
  +
#Sort Data
  +
#my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;
  +
my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;
  +
  +
  +
  +
#Thresholds
  +
my $EValThreshold = 0.003; #To include the only cluster with a worse e-Value than 0.001 which has 0.002!
  +
  +
  +
open (OUTFILE, ">", $outputFile) or die;
  +
  +
foreach my $key (@keys)
  +
{
  +
if ($IDtoEVal{$key} <= $EValThreshold)
  +
{
  +
print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n";
  +
}
  +
}
  +
  +
close OUTFILE;
  +
  +
  +
open (OUTFILE2, ">", $outputFile2) or die;
  +
  +
foreach my $key (keys %UniprotIDs)
  +
{
  +
print OUTFILE2 "$key\n";
  +
}
  +
  +
close OUTFILE2;
  +
</source>
  +
  +
  +
===Statistics===
  +
  +
COPS-Cluster extraction:
  +
  +
<source lang="perl">
  +
#!/usr/local/bin/perl
  +
  +
  +
use strict;
  +
#use warnings;
  +
  +
my $fileWithUniprotIDs=shift;
  +
my $pdbData=shift;
  +
my $COPSData=shift;
  +
my $outputFile=shift;
  +
  +
my $line;
  +
my %nameHash;
  +
my @arrayOfPDBIDs;
  +
  +
my @usedIDs;
  +
open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n");
  +
while (<MYFILE>) {
  +
$line=$_;
  +
chomp $line;
  +
if ($line=~m/PDB:(.{4})/gi){
  +
$line=uc $1;
  +
push (@arrayOfPDBIDs, $line);
  +
} else {
  +
push(@usedIDs, $line);
  +
}
  +
#print "$line\n";
  +
  +
#print "$_\n";
  +
}
  +
close (MYFILE);
  +
my $schweinehundcounter;
  +
my %uniprotToPDBHash;
  +
my @tempArray;
  +
  +
  +
open (UNIPROTTOPDB, "<$pdbData") or die ("PDB conversion file missing!");
  +
while (<UNIPROTTOPDB>) {
  +
$line=$_;
  +
chomp $line;
  +
@tempArray=split('\s',$line);
  +
  +
if (exists $uniprotToPDBHash{$tempArray[0]}) {
  +
push($uniprotToPDBHash{$tempArray[0]},$tempArray[1]);
  +
$schweinehundcounter++;
  +
#print "Schweinehunde fliegen weit!";
  +
  +
} else {
  +
my @newPDBArray=($tempArray[1]);
  +
$uniprotToPDBHash{$tempArray[0]}=\@newPDBArray;
  +
}
  +
  +
#$uniprotToPDB{$tempArray[0]}=$tempArray[1];
  +
# print "!!!!!$tempArray[0]!!!!!! !!!!!!!!$tempArray[1]";
  +
# print "$uniprotToPDB{$tempArray[0]} is da Shit!";
  +
  +
}
  +
  +
  +
  +
open (COPSDATA, "<$COPSData") or die "COPS file, where are you????WHERE :*(";
  +
  +
my %refChainHash;
  +
my %L99Hash;
  +
my %L80Hash;
  +
my %L60Hash;
  +
my %L40Hash;
  +
my %L30Hash;
  +
my %S90Hash;
  +
my %S30Hash;
  +
  +
  +
  +
my %L99HashReverse;
  +
my %L80HashReverse;
  +
my %L60HashReverse;
  +
my %L40HashReverse;
  +
my %L30HashReverse;
  +
my %S90HashReverse;
  +
my %S30HashReverse;
  +
  +
  +
while (<COPSDATA>) {
  +
$line=$_;
  +
chomp $line;
  +
@tempArray=split('\s',$line);
  +
my $refChain =uc substr $tempArray[0],0,4;
  +
my $L99 =uc substr $tempArray[1],0,4;
  +
my $L80 =uc substr $tempArray[2],0,4;
  +
my $L60 =uc substr $tempArray[3],0,4;
  +
my $L40 =uc substr $tempArray[4],0,4;
  +
my $L30 =uc substr $tempArray[5],0,4;
  +
my $S90 =uc substr $tempArray[6],0,4;
  +
my $S30 =uc substr $tempArray[7],0,4;
  +
  +
#print "$refChain $L99 $L80 $L60 $L40 $L30 $S90 $S30\n";
  +
  +
$L99Hash{$L99}{$refChain}=1;
  +
  +
$L80Hash{$L80}{$refChain}=1;
  +
  +
$L60Hash{$L60}{$refChain}=1;
  +
  +
$L40Hash{$L40}{$refChain}=1;
  +
  +
$L30Hash{$L30}{$refChain}=1;
  +
  +
$S90Hash{$S90}{$refChain}=1;
  +
  +
$S30Hash{$S30}{$refChain}=1;
  +
  +
  +
$L99HashReverse{$refChain}{$L99}=1;
  +
  +
$L80HashReverse{$refChain}{$L80}=1;
  +
  +
$L60HashReverse{$refChain}{$L60}=1;
  +
  +
$L40HashReverse{$refChain}{$L40}=1;
  +
  +
$L30HashReverse{$refChain}{$L30}=1;
  +
  +
$S90HashReverse{$refChain}{$S90}=1;
  +
  +
$S30HashReverse{$refChain}{$S30}=1;
  +
  +
}
  +
  +
  +
#print "HOOOHOOOOO: ".$L99HashReverse{"1DE4"};
  +
  +
  +
  +
  +
  +
  +
  +
  +
my $testRechner=0;
  +
  +
#open (FILOR, ">test/hhblitsCompleteUniprot600.txt") or die ("du penner!");
  +
  +
  +
foreach my $entry (@usedIDs){
  +
  +
#print FILOR "$entry\n";
  +
  +
  +
if (exists $uniprotToPDBHash{$entry}) {
  +
# print "Hallo ich bin $entry, mein PDB-pendant ist/sind: @{$uniprotToPDBHash{$entry}}!\n";
  +
push (@arrayOfPDBIDs, @{$uniprotToPDBHash{$entry}});
  +
# print scalar(@{$uniprotToPDBHash{$entry}})."\n";
  +
# $testRechner=$testRechner+scalar(@{$uniprotToPDBHash{$entry}});
  +
# print $testRechner."\n";
  +
}
  +
}
  +
#print "While there were ". scalar(@sshole)." uniprot IDs.\n";
  +
  +
  +
  +
#close (FILOR);
  +
  +
my $hardCodedPDB1="1DE4";
  +
my $hardCodedPDB2="1A6Z";
  +
  +
my @hardCodedPDBs=($hardCodedPDB1,$hardCodedPDB2);
  +
  +
my %alreadyLFound;
  +
my %alreadySFound;
  +
  +
#print "@arrayofPDBIDs";
  +
  +
my %L99EndHash;
  +
my %L80EndHash;
  +
my %L60EndHash;
  +
my %L40EndHash;
  +
my %L30EndHash;
  +
my %S90EndHash;
  +
my %S30EndHash;
  +
  +
foreach my $foundPDBID (@arrayOfPDBIDs) {
  +
foreach my $hardCoded (@hardCodedPDBs){
  +
foreach my $representativeL99OfFoundPDBID (keys $L99HashReverse{$hardCoded}){
  +
#print "$representativeL99OfFoundPDBID TROLOLOLOLO!";
  +
if (exists $L99Hash{$representativeL99OfFoundPDBID}{$foundPDBID}){
  +
$L99EndHash{$foundPDBID}=1;
  +
#print "IT WORKED!!!!!\n"; # funktioniert hier weiterschreiben...
  +
}
  +
}
  +
  +
  +
foreach my $representativeL80OfFoundPDBID (keys $L80HashReverse{$hardCoded}){
  +
  +
if (exists $L80Hash{$representativeL80OfFoundPDBID}{$foundPDBID}){
  +
$L80EndHash{$foundPDBID}=1;
  +
}
  +
}
  +
  +
  +
foreach my $representativeL60OfFoundPDBID (keys $L60HashReverse{$hardCoded}){
  +
if (exists $L60Hash{$representativeL60OfFoundPDBID}{$foundPDBID}){
  +
$L60EndHash{$foundPDBID}=1;
  +
}
  +
}
  +
  +
foreach my $representativeL40OfFoundPDBID (keys $L40HashReverse{$hardCoded}){
  +
  +
if (exists $L40Hash{$representativeL40OfFoundPDBID}{$foundPDBID}){
  +
$L40EndHash{$foundPDBID}=1;
  +
}
  +
}
  +
  +
foreach my $representativeL30OfFoundPDBID (keys $L30HashReverse{$hardCoded}){
  +
  +
if (exists $L30Hash{$representativeL30OfFoundPDBID}{$foundPDBID}){
  +
$L30EndHash{$foundPDBID}=1;
  +
}
  +
}
  +
  +
foreach my $representativeS90OfFoundPDBID (keys $S90HashReverse{$hardCoded}){
  +
  +
if (exists $S90Hash{$representativeS90OfFoundPDBID}{$foundPDBID}){
  +
$S90EndHash{$foundPDBID}=1;
  +
}
  +
}
  +
  +
foreach my $representativeS30OfFoundPDBID (keys $S30HashReverse{$hardCoded}){
  +
  +
if (exists $S30Hash{$representativeS30OfFoundPDBID}{$foundPDBID}){
  +
$S30EndHash{$foundPDBID}=1;
  +
}
  +
}
  +
  +
}
  +
  +
  +
  +
}
  +
  +
open(OUTPUTFILE, ">$outputFile");
  +
open(OUTPUTFILE2, ">$outputFile.pcs");
  +
print scalar(@arrayOfPDBIDs);
  +
print scalar (keys %L99EndHash)."\tL99er\n";
  +
print scalar (keys %L80EndHash)."\tL80er\n";
  +
print scalar (keys %L60EndHash)."\tL60er\n";
  +
print scalar (keys %L40EndHash)."\tL40er\n";
  +
print scalar (keys %L30EndHash)."\tL30er\n";
  +
print scalar (keys %S90EndHash)."\tS90er\n";
  +
print scalar (keys %S30EndHash)."\tS30er\n";
  +
print "\n";
  +
print OUTPUTFILE scalar (keys %L99EndHash)."\tL99er\n";
  +
print OUTPUTFILE scalar (keys %L80EndHash)."\tL80er\n";
  +
print OUTPUTFILE scalar (keys %L60EndHash)."\tL60er\n";
  +
print OUTPUTFILE scalar (keys %L40EndHash)."\tL40er\n";
  +
print OUTPUTFILE scalar (keys %L30EndHash)."\tL30er\n";
  +
print OUTPUTFILE scalar (keys %S90EndHash)."\tS90er\n";
  +
print OUTPUTFILE scalar (keys %S30EndHash)."\tS30er\n";
  +
  +
  +
print OUTPUTFILE2 scalar (keys %L99EndHash)/scalar(@arrayOfPDBIDs)."\tL99er\n";
  +
print OUTPUTFILE2 scalar (keys %L80EndHash)/scalar(@arrayOfPDBIDs)."\tL80er\n";
  +
print OUTPUTFILE2 scalar (keys %L60EndHash)/scalar(@arrayOfPDBIDs)."\tL60er\n";
  +
print OUTPUTFILE2 scalar (keys %L40EndHash)/scalar(@arrayOfPDBIDs)."\tL40er\n";
  +
print OUTPUTFILE2 scalar (keys %L30EndHash)/scalar(@arrayOfPDBIDs)."\tL30er\n";
  +
print OUTPUTFILE2 scalar (keys %S90EndHash)/scalar(@arrayOfPDBIDs)."\tS90er\n";
  +
print OUTPUTFILE2 scalar (keys %S30EndHash)/scalar(@arrayOfPDBIDs)."\tS30er\n";
  +
  +
  +
</source>
  +
  +
  +
  +
  +
Get how many of the found proteins have a common GO term with our HFE protein
  +
  +
<source lang="perl">
  +
#!/usr/local/bin/perl
  +
  +
use strict;
  +
use warnings;
  +
use diagnostics;
  +
use sigtrap;
  +
use autodie;
  +
  +
  +
  +
my $idFile=shift;
  +
my $GOData=shift;
  +
my $outputData=shift;
  +
  +
my $line;
  +
  +
my $hardcodedUniprotID="Q30201";
  +
  +
my %hashOfAnnihilation;
  +
$hashOfAnnihilation{$hardcodedUniprotID}=1;
  +
  +
open (MYFILE, $idFile) or die ("du penner!");
  +
while (<MYFILE>) {
  +
$line=$_;
  +
chomp $line;
  +
if ($line=~m/PDB:(.{4})/gi){
  +
  +
} else {
  +
my $currentID=$line;
  +
$hashOfAnnihilation{$currentID}=1;
  +
  +
}
  +
  +
  +
  +
}
  +
close (MYFILE);
  +
  +
my %uniprotToPDBHash;
  +
my @tempArray;
  +
  +
  +
  +
my %gOAssocs;
  +
open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*(";
  +
  +
while (<GODATA>) {
  +
$line=$_;
  +
chomp $line;
  +
  +
  +
  +
my @tempGOArray=split('\s',$line);
  +
my $tempID=$tempGOArray[0];
  +
shift @tempGOArray;
  +
foreach my $tempGOTerm (@tempGOArray){
  +
if (exists $hashOfAnnihilation{$tempID}){
  +
  +
$gOAssocs{$tempID}{$tempGOTerm}=1;
  +
  +
  +
}
  +
}
  +
  +
  +
}
  +
close (GODATA);
  +
  +
my $numberOfSimilar=0;
  +
my $totalFound=0;
  +
  +
my %wantedHash;
  +
  +
open (PERCENTAGEOUTPUTOFSHAREDCLUSTER, ">$outputData") or die "WHY, CRUEL WORLD? WHY? meh whatever...";
  +
  +
foreach my $uniprotIDofCluster (keys %hashOfAnnihilation) {
  +
my $numberOfCurrentHits=0;
  +
  +
if (exists $gOAssocs{$uniprotIDofCluster}){
  +
  +
foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){
  +
  +
  +
  +
if (exists $gOAssocs{$hardcodedUniprotID}{$currentGO}){
  +
$numberOfCurrentHits++;
  +
$totalFound++;
  +
if (exists $wantedHash{$currentGO}) {
  +
$wantedHash{$currentGO}=$wantedHash{$currentGO}+1;
  +
} else {
  +
$wantedHash{$currentGO}=1;
  +
}
  +
  +
  +
}
  +
  +
}
  +
my $roundedPercentageOfHardCodedProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$hardcodedUniprotID}))) + 0.5) / 10000;
  +
my $roundedPercentageOfCurrentProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$uniprotIDofCluster}))) + 0.5) / 10000;
  +
  +
  +
print PERCENTAGEOUTPUTOFSHAREDCLUSTER "$uniprotIDofCluster\t$roundedPercentageOfHardCodedProteinsGOTerms\t$roundedPercentageOfCurrentProteinsGOTerms\t$numberOfCurrentHits\n";
  +
}
  +
  +
if ($numberOfCurrentHits>0){
  +
$numberOfSimilar++;
  +
}
  +
  +
  +
}
  +
print "$totalFound wurden insgesamt gefunden, mit insgesamt $numberOfSimilar ähnlichen.\n";
  +
close (PERCENTAGEOUTPUTOFSHAREDCLUSTER);
  +
foreach my $key (keys %wantedHash){
  +
print "$key: $wantedHash{$key}\n";
  +
}
  +
  +
</source>
  +
  +
  +
  +
  +
  +
=== MSA ===
  +
  +
GapConserve
  +
<source lang="perl">
  +
#!/usr/local/bin/perl
  +
  +
use strict;
  +
use warnings;
  +
use diagnostics;
  +
use sigtrap;
  +
use autodie;
  +
  +
my $aliFile=shift;
  +
  +
my $lastID="Trötofant";
  +
my $currentSequence;
  +
my $currentNumberOfGaps=0;
  +
my %sequenceHash;
  +
my %gapCounter;
  +
open (MYFILE, $aliFile) or die ("weak!");
  +
while (<MYFILE>) {
  +
  +
my $line=$_;
  +
chomp $line;
  +
if ($line=~m/^>.*?\|(.*?)\|/gi){
  +
if ($lastID ne "Trötofant"){
  +
$sequenceHash{$lastID}=$currentSequence;
  +
$currentSequence="";
  +
}
  +
$lastID=$1;
  +
  +
} else {
  +
$currentSequence=$currentSequence . $line;
  +
  +
}
  +
  +
}
  +
close MYFILE;
  +
$sequenceHash{$lastID}=$currentSequence;
  +
  +
foreach my $key (keys %sequenceHash){
  +
my @gapCount=$sequenceHash{$key}=~ m/-/gi;
  +
$gapCounter{$key}= scalar @gapCount;
  +
print "$gapCounter{$key} Gaps gefunden! Bei Uniprot $key\n";
  +
}
  +
  +
my $sequenceLength=scalar (split ("",$currentSequence));
  +
  +
my $consensusCounter=0;
  +
for (my $i=0;$i<$sequenceLength;$i++){
  +
my $bool=1;
  +
my $lastChar="";
  +
foreach my $key (keys %sequenceHash){
  +
if ( $bool==1 && ($lastChar eq (split ("",$sequenceHash{$key}))[$i] || $lastChar eq "") && (split ("",$sequenceHash{$key}))[$i] ne "-"){
  +
#print "alter Char: ----> $lastChar <----";
  +
$lastChar=(split ("",$sequenceHash{$key}))[$i];
  +
#print "selber Char gefunden, Baby! ----> $lastChar <---- an Pos $i\n";
  +
} else {
  +
$bool=0;
  +
$lastChar="-";
  +
}
  +
}
  +
if ($bool==1){
  +
$consensusCounter++
  +
}
  +
}
  +
  +
print "$consensusCounter komplett übereinstimmende Positionen (Conserved)\n";
  +
</source>
  +
  +
  +
=== Other ===
  +
  +
Script to get only GO annotations of proteins one is interested in out of a GO file:
  +
  +
<source lang="perl">
  +
#!/usr/local/bin/perl
  +
  +
  +
use strict;
  +
use warnings;
  +
use diagnostics;
  +
use sigtrap;
  +
use autodie;
  +
  +
my $fileWithUniprotIDs=shift;
  +
my $GOData=shift;
  +
my $outputFile=shift;
  +
my $line;
  +
my $hardcodedUniprotID="Q30201";
  +
my %hashOfAnnihilation;
  +
my @arrayOfPDBIDs;
  +
my @usedIDs;
  +
push(@usedIDs, $hardcodedUniprotID);
  +
  +
open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n");
  +
while (<MYFILE>) {
  +
$line=$_;
  +
chomp $line;
  +
if ($line=~m/PDB:(.{4})/gi){
  +
$line=uc $1;
  +
push (@arrayOfPDBIDs, $line);
  +
} else {
  +
push(@usedIDs, $line);
  +
$hashOfAnnihilation{$line}=1;
  +
}
  +
  +
}
  +
close (MYFILE);
  +
  +
  +
  +
  +
  +
  +
  +
my %uniprotToPDBHash;
  +
my @tempArray;
  +
  +
  +
  +
my %gOAssocs;
  +
open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*(";
  +
  +
while (<GODATA>) {
  +
$line=$_;
  +
chomp $line;
  +
  +
if ($line=~m/^UniprotKB.*/gi) {
  +
  +
my @tempGOArray=split('\s',$line);
  +
  +
if (exists $hashOfAnnihilation{$tempGOArray[1]}){
  +
  +
$gOAssocs{$tempGOArray[1]}{$tempGOArray[4]}=1;
  +
  +
  +
}
  +
# print "$tempGOArray[1]!wurd hinzugefuegt mit $tempGOArray[4]!.\n";
  +
}
  +
}
  +
close (GODATA);
  +
  +
  +
  +
open (OUTPUT, ">$outputFile") or die "Cannot write to $outputFile...\n";
  +
foreach my $uniprotIDofCluster (@usedIDs) {
  +
my $numberOfCurrentHits=0;
  +
  +
if (exists $gOAssocs{$uniprotIDofCluster}){
  +
print OUTPUT "$uniprotIDofCluster\t";
  +
print "$uniprotIDofCluster\t";
  +
foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){
  +
print "$currentGO\t";
  +
print OUTPUT "$currentGO\t";
  +
  +
}
  +
print "\n";
  +
print OUTPUT "\n";
  +
  +
}
  +
  +
}
  +
close (OUTPUT);
  +
  +
  +
</source>
  +
  +
  +
R script for the e-Value and identity distributions (yep, everything hardcoded^^).
  +
  +
  +
<source lang="bash">
  +
data1<-read.table("blastpgp_h0.002_j2_10000.last.stats");
  +
data2<-read.table("blastpgp_h0.002_j10_10000.last.stats");
  +
data3<-read.table("blastpgp_h1e-10_j2_10000.last.stats");
  +
data4<-read.table("blastpgp_h1e-10_j10_10000.last.stats");
  +
data5<-read.table("blast_default_1500.out.stats");
  +
data6<-read.table("hhblits_default_600.out.stats");
  +
  +
data7<-read.table("blastpgp_big_h0.002_j2_100000.last.stats");
  +
data8<-read.table("blastpgp_big_h0.002_j10_100000.last.stats");
  +
data9<-read.table("blastpgp_big_h1e-10_j2_100000.last.stats");
  +
data10<-read.table("blastpgp_big_h1e-10_j10_100000.last.stats");
  +
  +
  +
png(filename="eval_psi_80.png", height=400, width=800, bg="white");
  +
cols = rainbow(4, alpha=0.6);
  +
br = seq(-150,0,1);
  +
hist(log10(data1$V2), col=cols[1], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", main="PSI-Blast (big80)");
  +
hist(log10(data2$V2), col=cols[2], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE);
  +
hist(log10(data3$V2), col=cols[3], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE);
  +
hist(log10(data4$V2), col=cols[4], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE);
  +
legend(-150,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols);
  +
dev.off();
  +
  +
  +
png(filename="eval_blast_80.png", height=400, width=800, bg="white");
  +
hist(log10(data5$V2), col="blue", breaks=100, xlab="log10(E-Value)", main="Blast (big80)");
  +
dev.off();
  +
  +
  +
png(filename="eval_hhblits.png", height=400, width=800, bg="white");
  +
hist(log10(data6$V2), col="blue", breaks=50, xlab="log10(E-Value)", main="HHblits (Uniprot20)");
  +
dev.off();
  +
  +
  +
png(filename="ident_psi_80.png", height=400, width=800, bg="white");
  +
cols2 = rainbow(4, alpha=0.6);
  +
br2 = seq(0,100,1);
  +
hist(data1$V3, col=cols[1], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", main="PSI-Blast (big80)");
  +
hist(data2$V3, col=cols[2], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE);
  +
hist(data3$V3, col=cols[3], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE);
  +
hist(data4$V3, col=cols[4], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE);
  +
legend(60,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols);
  +
dev.off();
  +
  +
  +
png(filename="ident_blast_80.png", height=400, width=800, bg="white");
  +
hist(data5$V3, col="blue", breaks=100, xlim=range(0,100), xlab="Identity (%)", main="Blast (big80)");
  +
dev.off();
  +
  +
  +
png(filename="ident_hhblits.png", height=400, width=800, bg="white");
  +
hist(100*(data6$V3), col="blue", breaks=50, xlim=range(0,100), xlab="Identity (%)", main="HHblits (Uniprot20)");
  +
dev.off();
  +
</source>

Latest revision as of 19:41, 7 May 2012

Sequence Searches

Blast

TSK="/mnt/home/student/bernhoferm/task2"
BIG="/mnt/project/pracstrucfunc12/data/big"

time blastall -p blastp -d $BIG/big_80 -i $TSK/Q30201.fasta -b 1500 -m 8 -o $TSK/align2/blast_default_1500.out

real	1m24.100s
user	1m5.950s
sys	0m3.500s

PSI-Blast

TSK="/mnt/home/student/bernhoferm/task2"
BIG="/mnt/project/pracstrucfunc12/data/big"


time blastpgp -C $TSK/h002j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j2_10000.out

real	3m21.031s
user	2m44.750s
sys	0m10.270s


time blastpgp -C $TSK/h002j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j10_10000.out

real	16m39.699s
user	14m22.150s
sys	0m41.310s


time blastpgp -C $TSK/h1e-10j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j2_10000.out

real	3m6.093s
user	2m48.260s
sys	0m6.020s


time blastpgp -C $TSK/h1e-10j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j10_10000.out

real	16m41.797s
user	15m18.350s
sys	0m28.890s


time blastpgp -R $TSK/h002j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j2_100000.out

real	28m17.459s
user	18m26.930s
sys	2m46.200s


time blastpgp -R $TSK/h002j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j10_100000.out

real	367m15.021s (srsly...)
user	310m31.210s
sys	14m39.460s
...and some thousand "[blastpgp] ERROR: ncbiapi [000.000]  ObjMgrNextAvailEntityID failed with idx 2048" messages


time blastpgp -R $TSK/h1e-10j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j2_100000.out

real	26m43.350s
user	19m54.380s
sys	2m3.900s


time blastpgp -R $TSK/h1e-10j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j10_100000.out

real	64m4.575s
user	47m16.970s
sys	4m54.890s
...and some thousand "[blastpgp] ERROR: ncbiapi [000.000]  ObjMgrNextAvailEntityID failed with idx 2048" messages

HHblits

TSK="/mnt/home/student/bernhoferm/task2"
HHB="/mnt/project/pracstrucfunc12/data/hhblits"


time hhblits -d $HHB/uniprot20_current -i $TSK/Q30201.fasta -Z 600 -B 600 -o $TSK/align2/hhblits_default_600.out

real	13m13.813s
user	5m4.400s
sys	2m45.520s

MSA

#!/bin/bash
time t_coffee /mnt/home/student/joerdensv/newGroups/group0-40.fasta -output=fasta_aln
time t_coffee /mnt/home/student/joerdensv/newGroups/group0-99.fasta -output=fasta_aln
time t_coffee /mnt/home/student/joerdensv/newGroups/group60-99.fasta -output=fasta_aln
time muscle -in /mnt/home/student/joerdensv/newGroups/group60-99.fasta -out group60-99muscle.fasta -quiet
time muscle -in /mnt/home/student/joerdensv/newGroups/group0-99.fasta -out group0-99muscle.fasta -quiet
time muscle -in /mnt/home/student/joerdensv/newGroups/group0-40.fasta -out group0-40muscle.fasta -quiet
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group60-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group60-99clustalw.fasta -output=fasta
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-99clustalw.fasta -output=fasta
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-40.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-40clustalw.fasta -output=fasta
GRP="/mnt/home/student/bernhoferm/task2/groups"
time t_coffee $GRP/group0-40.fasta -method pdb_pair,proba_pair -template_file $GRP/group0-40.template -output=fasta_aln

real    3m42.931s
user    1m50.940s
sys     0m23.180s


>group0-40.template

>sp|Q30201|HFE_HUMAN Hereditary hemochromatosis protein OS=Homo sapiens GN=HFE PE=1 SV=1 _P_ 1a6zA
>sp|P16391|HA12_RAT RT1 class I histocompatibility antigen, AA alpha chain OS=Rattus norvegicus PE=1 SV=2 _P_ 1ed3A
>sp|P05534|1A24_HUMAN HLA class I histocompatibility antigen, A-24 alpha chain OS=Homo sapiens GN=HLA-A PE=1 SV=2 _P_ 2bckA
>tr|Q30597|Q30597_MACMU MHC class I Mamu-A*02 (Fragment) OS=Macaca mulatta PE=1 SV=1 _P_ 3jtsA
>sp|P01900|HA12_MOUSE H-2 class I histocompatibility antigen, D-D alpha chain OS=Mus musculus GN=H2-D1 PE=1 SV=1 _P_ 1biiA
>tr|Q31093|Q31093_MOUSE Histocompatibility 2, M region locus 3 OS=Mus musculus GN=H2-M3 PE=1 SV=1 _P_ 1mhcA
>sp|P14432|HA1T_MOUSE H-2 class I histocompatibility antigen, TLA(B) alpha chain OS=Mus musculus GN=H2-T3 PE=1 SV=2 _P_ 1nezA
>tr|Q860W6|Q860W6_MOUSE Major histocompatibility complex class Ib M10.5 (Fragment) OS=Mus musculus GN=H2-M10.4 PE=1 SV=1 _P_ 1zs8A
>tr|Q31615|Q31615_MOUSE MHC class I H2-TL-27-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T22 PE=1 SV=1 _P_ 1c16A
>tr|Q31206|Q31206_MOUSE MHC class I H2-TL-T10-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T10 PE=1 SV=1 _P_ 1r3hA
>sp|P01921|HB2D_MOUSE H-2 class II histocompatibility antigen, A-D beta chain OS=Mus musculus GN=H2-Ab1 PE=1 SV=1 _P_ 1iaoB

Scripts

(PSI-)BLAST

The output files from BLAST and PSI-BLAST were parsed and filtered with the following perl script. The PSI-BLAST output "filtered" for the last iteration step by hand (before using the script). If mutliple hits with the same ID were found, only the first one was considered (the one with the best e-Value).


>getM8StatsAndIDs.pl <source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $filename = shift; my $outputFile = "$filename.stats"; my $outputFile2 = "$filename.ids";

my %IDtoEVal; my %IDtoIdent;

my $id; my $eVal; my $ident;

open (FILE, "<", $filename) or die;

while (<FILE>) { my $line = $_;

my @content = split(/\t/, $line);

$id = $content[1]; $eVal = $content[10]; $ident = $content[2];

$id =~s/\s//gi; $eVal =~s/\s//gi; $ident =~s/\s//gi;

$id =~m/.*?\|(.*?)\|(.*?)$/gi;

my $uniID = $1; my $pdbID = $2;

if ($uniID=~m/^pdb$/gi) { $id = "PDB:".$pdbID; } else { $id = $uniID; }

if ($eVal=~m/^e/) { $eVal="1".$eVal; }

if (not defined $IDtoEVal{$id}) { $IDtoEVal{$id} = $eVal; $IDtoIdent{$id} = $ident; } }

close FILE;


  1. Sort Data
  2. my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;

my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;


  1. Thresholds

my $EValThreshold = 0.002; my $IdentThreshold = 0;


open (OUTFILE, ">", $outputFile) or die; open (OUTFILE2, ">", $outputFile2) or die;

foreach my $key (@keys) { if ($IDtoEVal{$key} <= $EValThreshold && $IDtoIdent{$key} >= $IdentThreshold) { print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n"; print OUTFILE2 "$key\n"; } }

close OUTFILE; close OUTFILE2; </source>

HHblits

The perl script bellow was used to parse and filter the HHblits output. For multiple hits again only the first one was considered.


<source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $filename = shift; my $outputFile = "$filename.stats"; my $outputFile2 = "$filename.ids";

my %IDtoEVal; my %IDtoIdent; my %UniprotIDs;

my $id; my $eVal; my $ident; my $count = 1;


open (FILE, "<", $filename) or die;

while (<FILE>) { my $line = $_;

if ($line=~m/>UP20\|(.*?)\|.+/gi) { $id = $1;

my @tmp = split(/\|/, $line); my $uid;

for (my $i=4; $i < scalar(@tmp); $i++) { $uid = $tmp[$i]; $uid =~ s/\s*\[.*?\]//; $uid =~ s/\.//; $uid =~ s/\s//; $UniprotIDs{$uid}="alive"; } }

if ($line=~m/.*?E-value\s*=\s*(\d+?\D*?\d*?\D*?\d*?)\s+/gi) { $eVal = $1;

if ($eVal=~m/^e/) { $eVal="1".$eVal; }

if (not defined $IDtoEVal{$id}) { $IDtoEVal{$id} = $eVal; } }

if ($line=~m/.*?Identities\s*=\s*(\d*?)\D/gi) { $ident = $1;

if (not defined $IDtoIdent{$id}) { $IDtoIdent{$id} = $ident/100; } } }

close FILE;


  1. Sort Data
  2. my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;

my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;


  1. Thresholds

my $EValThreshold = 0.003; #To include the only cluster with a worse e-Value than 0.001 which has 0.002!


open (OUTFILE, ">", $outputFile) or die;

foreach my $key (@keys) { if ($IDtoEVal{$key} <= $EValThreshold) { print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n"; } }

close OUTFILE;


open (OUTFILE2, ">", $outputFile2) or die;

foreach my $key (keys %UniprotIDs) { print OUTFILE2 "$key\n"; }

close OUTFILE2; </source>


Statistics

COPS-Cluster extraction:

<source lang="perl">

  1. !/usr/local/bin/perl


use strict;

  1. use warnings;

my $fileWithUniprotIDs=shift; my $pdbData=shift; my $COPSData=shift; my $outputFile=shift;

my $line; my %nameHash; my @arrayOfPDBIDs;

my @usedIDs; open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n");

while (<MYFILE>) {

$line=$_; chomp $line; if ($line=~m/PDB:(.{4})/gi){ $line=uc $1; push (@arrayOfPDBIDs, $line); } else { push(@usedIDs, $line); }

  1. print "$line\n";
  1. print "$_\n";
}
close (MYFILE); 

my $schweinehundcounter; my %uniprotToPDBHash; my @tempArray;


open (UNIPROTTOPDB, "<$pdbData") or die ("PDB conversion file missing!"); while (<UNIPROTTOPDB>) { $line=$_; chomp $line; @tempArray=split('\s',$line);

if (exists $uniprotToPDBHash{$tempArray[0]}) { push($uniprotToPDBHash{$tempArray[0]},$tempArray[1]); $schweinehundcounter++; #print "Schweinehunde fliegen weit!";

} else { my @newPDBArray=($tempArray[1]); $uniprotToPDBHash{$tempArray[0]}=\@newPDBArray; }

#$uniprotToPDB{$tempArray[0]}=$tempArray[1];

  1. print "!!!!!$tempArray[0]!!!!!! !!!!!!!!$tempArray[1]";
  2. print "$uniprotToPDB{$tempArray[0]} is da Shit!";

}


open (COPSDATA, "<$COPSData") or die "COPS file, where are you????WHERE :*(";

my %refChainHash; my %L99Hash; my %L80Hash; my %L60Hash; my %L40Hash; my %L30Hash; my %S90Hash; my %S30Hash;


my %L99HashReverse; my %L80HashReverse; my %L60HashReverse; my %L40HashReverse; my %L30HashReverse; my %S90HashReverse; my %S30HashReverse;


while (<COPSDATA>) { $line=$_; chomp $line; @tempArray=split('\s',$line); my $refChain =uc substr $tempArray[0],0,4; my $L99 =uc substr $tempArray[1],0,4; my $L80 =uc substr $tempArray[2],0,4; my $L60 =uc substr $tempArray[3],0,4; my $L40 =uc substr $tempArray[4],0,4; my $L30 =uc substr $tempArray[5],0,4; my $S90 =uc substr $tempArray[6],0,4; my $S30 =uc substr $tempArray[7],0,4;

#print "$refChain $L99 $L80 $L60 $L40 $L30 $S90 $S30\n";

$L99Hash{$L99}{$refChain}=1;

$L80Hash{$L80}{$refChain}=1;

$L60Hash{$L60}{$refChain}=1;

$L40Hash{$L40}{$refChain}=1;

$L30Hash{$L30}{$refChain}=1;

$S90Hash{$S90}{$refChain}=1;

$S30Hash{$S30}{$refChain}=1;


$L99HashReverse{$refChain}{$L99}=1;

$L80HashReverse{$refChain}{$L80}=1;

$L60HashReverse{$refChain}{$L60}=1;

$L40HashReverse{$refChain}{$L40}=1;

$L30HashReverse{$refChain}{$L30}=1;

$S90HashReverse{$refChain}{$S90}=1;

$S30HashReverse{$refChain}{$S30}=1;

}


  1. print "HOOOHOOOOO: ".$L99HashReverse{"1DE4"};





my $testRechner=0;

  1. open (FILOR, ">test/hhblitsCompleteUniprot600.txt") or die ("du penner!");


foreach my $entry (@usedIDs){

#print FILOR "$entry\n";


if (exists $uniprotToPDBHash{$entry}) { # print "Hallo ich bin $entry, mein PDB-pendant ist/sind: @{$uniprotToPDBHash{$entry}}!\n"; push (@arrayOfPDBIDs, @{$uniprotToPDBHash{$entry}}); # print scalar(@{$uniprotToPDBHash{$entry}})."\n"; # $testRechner=$testRechner+scalar(@{$uniprotToPDBHash{$entry}}); # print $testRechner."\n"; } }

  1. print "While there were ". scalar(@sshole)." uniprot IDs.\n";


  1. close (FILOR);

my $hardCodedPDB1="1DE4"; my $hardCodedPDB2="1A6Z";

my @hardCodedPDBs=($hardCodedPDB1,$hardCodedPDB2);

my %alreadyLFound; my %alreadySFound;

  1. print "@arrayofPDBIDs";

my %L99EndHash; my %L80EndHash; my %L60EndHash; my %L40EndHash; my %L30EndHash; my %S90EndHash; my %S30EndHash;

foreach my $foundPDBID (@arrayOfPDBIDs) { foreach my $hardCoded (@hardCodedPDBs){ foreach my $representativeL99OfFoundPDBID (keys $L99HashReverse{$hardCoded}){ #print "$representativeL99OfFoundPDBID TROLOLOLOLO!"; if (exists $L99Hash{$representativeL99OfFoundPDBID}{$foundPDBID}){ $L99EndHash{$foundPDBID}=1; #print "IT WORKED!!!!!\n"; # funktioniert hier weiterschreiben... } }


foreach my $representativeL80OfFoundPDBID (keys $L80HashReverse{$hardCoded}){

if (exists $L80Hash{$representativeL80OfFoundPDBID}{$foundPDBID}){ $L80EndHash{$foundPDBID}=1; } }


foreach my $representativeL60OfFoundPDBID (keys $L60HashReverse{$hardCoded}){ if (exists $L60Hash{$representativeL60OfFoundPDBID}{$foundPDBID}){ $L60EndHash{$foundPDBID}=1; } }

foreach my $representativeL40OfFoundPDBID (keys $L40HashReverse{$hardCoded}){

if (exists $L40Hash{$representativeL40OfFoundPDBID}{$foundPDBID}){ $L40EndHash{$foundPDBID}=1; } }

foreach my $representativeL30OfFoundPDBID (keys $L30HashReverse{$hardCoded}){

if (exists $L30Hash{$representativeL30OfFoundPDBID}{$foundPDBID}){ $L30EndHash{$foundPDBID}=1; } }

foreach my $representativeS90OfFoundPDBID (keys $S90HashReverse{$hardCoded}){

if (exists $S90Hash{$representativeS90OfFoundPDBID}{$foundPDBID}){ $S90EndHash{$foundPDBID}=1; } }

foreach my $representativeS30OfFoundPDBID (keys $S30HashReverse{$hardCoded}){

if (exists $S30Hash{$representativeS30OfFoundPDBID}{$foundPDBID}){ $S30EndHash{$foundPDBID}=1; } }

}


}

open(OUTPUTFILE, ">$outputFile"); open(OUTPUTFILE2, ">$outputFile.pcs"); print scalar(@arrayOfPDBIDs); print scalar (keys %L99EndHash)."\tL99er\n"; print scalar (keys %L80EndHash)."\tL80er\n"; print scalar (keys %L60EndHash)."\tL60er\n"; print scalar (keys %L40EndHash)."\tL40er\n"; print scalar (keys %L30EndHash)."\tL30er\n"; print scalar (keys %S90EndHash)."\tS90er\n"; print scalar (keys %S30EndHash)."\tS30er\n"; print "\n"; print OUTPUTFILE scalar (keys %L99EndHash)."\tL99er\n"; print OUTPUTFILE scalar (keys %L80EndHash)."\tL80er\n"; print OUTPUTFILE scalar (keys %L60EndHash)."\tL60er\n"; print OUTPUTFILE scalar (keys %L40EndHash)."\tL40er\n"; print OUTPUTFILE scalar (keys %L30EndHash)."\tL30er\n"; print OUTPUTFILE scalar (keys %S90EndHash)."\tS90er\n"; print OUTPUTFILE scalar (keys %S30EndHash)."\tS30er\n";


print OUTPUTFILE2 scalar (keys %L99EndHash)/scalar(@arrayOfPDBIDs)."\tL99er\n"; print OUTPUTFILE2 scalar (keys %L80EndHash)/scalar(@arrayOfPDBIDs)."\tL80er\n"; print OUTPUTFILE2 scalar (keys %L60EndHash)/scalar(@arrayOfPDBIDs)."\tL60er\n"; print OUTPUTFILE2 scalar (keys %L40EndHash)/scalar(@arrayOfPDBIDs)."\tL40er\n"; print OUTPUTFILE2 scalar (keys %L30EndHash)/scalar(@arrayOfPDBIDs)."\tL30er\n"; print OUTPUTFILE2 scalar (keys %S90EndHash)/scalar(@arrayOfPDBIDs)."\tS90er\n"; print OUTPUTFILE2 scalar (keys %S30EndHash)/scalar(@arrayOfPDBIDs)."\tS30er\n";


</source>



Get how many of the found proteins have a common GO term with our HFE protein

<source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;


my $idFile=shift; my $GOData=shift; my $outputData=shift;

my $line;

my $hardcodedUniprotID="Q30201";

my %hashOfAnnihilation; $hashOfAnnihilation{$hardcodedUniprotID}=1;

open (MYFILE, $idFile) or die ("du penner!");

while (<MYFILE>) {

$line=$_; chomp $line;

	if ($line=~m/PDB:(.{4})/gi){

} else { my $currentID=$line; $hashOfAnnihilation{$currentID}=1;

}


}
close (MYFILE); 

my %uniprotToPDBHash; my @tempArray;


my %gOAssocs; open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*(";

while (<GODATA>) { $line=$_; chomp $line;


my @tempGOArray=split('\s',$line); my $tempID=$tempGOArray[0]; shift @tempGOArray; foreach my $tempGOTerm (@tempGOArray){ if (exists $hashOfAnnihilation{$tempID}){

$gOAssocs{$tempID}{$tempGOTerm}=1;


} }


} close (GODATA);

my $numberOfSimilar=0; my $totalFound=0;

my %wantedHash;

open (PERCENTAGEOUTPUTOFSHAREDCLUSTER, ">$outputData") or die "WHY, CRUEL WORLD? WHY? meh whatever...";

foreach my $uniprotIDofCluster (keys %hashOfAnnihilation) { my $numberOfCurrentHits=0;

if (exists $gOAssocs{$uniprotIDofCluster}){

foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){


if (exists $gOAssocs{$hardcodedUniprotID}{$currentGO}){ $numberOfCurrentHits++; $totalFound++; if (exists $wantedHash{$currentGO}) { $wantedHash{$currentGO}=$wantedHash{$currentGO}+1; } else { $wantedHash{$currentGO}=1; }


}

} my $roundedPercentageOfHardCodedProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$hardcodedUniprotID}))) + 0.5) / 10000; my $roundedPercentageOfCurrentProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$uniprotIDofCluster}))) + 0.5) / 10000;


print PERCENTAGEOUTPUTOFSHAREDCLUSTER "$uniprotIDofCluster\t$roundedPercentageOfHardCodedProteinsGOTerms\t$roundedPercentageOfCurrentProteinsGOTerms\t$numberOfCurrentHits\n"; }

if ($numberOfCurrentHits>0){ $numberOfSimilar++; }


} print "$totalFound wurden insgesamt gefunden, mit insgesamt $numberOfSimilar ähnlichen.\n"; close (PERCENTAGEOUTPUTOFSHAREDCLUSTER); foreach my $key (keys %wantedHash){ print "$key: $wantedHash{$key}\n"; }

</source>



MSA

GapConserve <source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $aliFile=shift;

my $lastID="Trötofant"; my $currentSequence; my $currentNumberOfGaps=0; my %sequenceHash; my %gapCounter; open (MYFILE, $aliFile) or die ("weak!");

while (<MYFILE>) {

my $line=$_; chomp $line; if ($line=~m/^>.*?\|(.*?)\|/gi){ if ($lastID ne "Trötofant"){ $sequenceHash{$lastID}=$currentSequence; $currentSequence=""; } $lastID=$1;

} else { $currentSequence=$currentSequence . $line;

}

} close MYFILE; $sequenceHash{$lastID}=$currentSequence;

foreach my $key (keys %sequenceHash){ my @gapCount=$sequenceHash{$key}=~ m/-/gi; $gapCounter{$key}= scalar @gapCount; print "$gapCounter{$key} Gaps gefunden! Bei Uniprot $key\n"; }

my $sequenceLength=scalar (split ("",$currentSequence));

my $consensusCounter=0; for (my $i=0;$i<$sequenceLength;$i++){ my $bool=1; my $lastChar=""; foreach my $key (keys %sequenceHash){ if ( $bool==1 && ($lastChar eq (split ("",$sequenceHash{$key}))[$i] || $lastChar eq "") && (split ("",$sequenceHash{$key}))[$i] ne "-"){ #print "alter Char: ----> $lastChar <----"; $lastChar=(split ("",$sequenceHash{$key}))[$i]; #print "selber Char gefunden, Baby! ----> $lastChar <---- an Pos $i\n"; } else { $bool=0; $lastChar="-"; } } if ($bool==1){ $consensusCounter++ } }

print "$consensusCounter komplett übereinstimmende Positionen (Conserved)\n"; </source>


Other

Script to get only GO annotations of proteins one is interested in out of a GO file:

<source lang="perl">

  1. !/usr/local/bin/perl


use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $fileWithUniprotIDs=shift; my $GOData=shift; my $outputFile=shift; my $line; my $hardcodedUniprotID="Q30201"; my %hashOfAnnihilation; my @arrayOfPDBIDs; my @usedIDs; push(@usedIDs, $hardcodedUniprotID);

open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n");

while (<MYFILE>) {

$line=$_; chomp $line; if ($line=~m/PDB:(.{4})/gi){ $line=uc $1; push (@arrayOfPDBIDs, $line); } else { push(@usedIDs, $line); $hashOfAnnihilation{$line}=1; }

}
close (MYFILE);




my %uniprotToPDBHash; my @tempArray;


my %gOAssocs; open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*(";

while (<GODATA>) { $line=$_; chomp $line;

if ($line=~m/^UniprotKB.*/gi) {

my @tempGOArray=split('\s',$line);

if (exists $hashOfAnnihilation{$tempGOArray[1]}){

$gOAssocs{$tempGOArray[1]}{$tempGOArray[4]}=1;


} # print "$tempGOArray[1]!wurd hinzugefuegt mit $tempGOArray[4]!.\n"; } } close (GODATA);


open (OUTPUT, ">$outputFile") or die "Cannot write to $outputFile...\n"; foreach my $uniprotIDofCluster (@usedIDs) { my $numberOfCurrentHits=0;

if (exists $gOAssocs{$uniprotIDofCluster}){ print OUTPUT "$uniprotIDofCluster\t"; print "$uniprotIDofCluster\t"; foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){ print "$currentGO\t"; print OUTPUT "$currentGO\t";

} print "\n"; print OUTPUT "\n";

}

} close (OUTPUT);


</source>


R script for the e-Value and identity distributions (yep, everything hardcoded^^).


<source lang="bash"> data1<-read.table("blastpgp_h0.002_j2_10000.last.stats"); data2<-read.table("blastpgp_h0.002_j10_10000.last.stats"); data3<-read.table("blastpgp_h1e-10_j2_10000.last.stats"); data4<-read.table("blastpgp_h1e-10_j10_10000.last.stats"); data5<-read.table("blast_default_1500.out.stats"); data6<-read.table("hhblits_default_600.out.stats");

data7<-read.table("blastpgp_big_h0.002_j2_100000.last.stats"); data8<-read.table("blastpgp_big_h0.002_j10_100000.last.stats"); data9<-read.table("blastpgp_big_h1e-10_j2_100000.last.stats"); data10<-read.table("blastpgp_big_h1e-10_j10_100000.last.stats");


png(filename="eval_psi_80.png", height=400, width=800, bg="white"); cols = rainbow(4, alpha=0.6); br = seq(-150,0,1); hist(log10(data1$V2), col=cols[1], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", main="PSI-Blast (big80)"); hist(log10(data2$V2), col=cols[2], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE); hist(log10(data3$V2), col=cols[3], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE); hist(log10(data4$V2), col=cols[4], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE); legend(-150,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols); dev.off();


png(filename="eval_blast_80.png", height=400, width=800, bg="white"); hist(log10(data5$V2), col="blue", breaks=100, xlab="log10(E-Value)", main="Blast (big80)"); dev.off();


png(filename="eval_hhblits.png", height=400, width=800, bg="white"); hist(log10(data6$V2), col="blue", breaks=50, xlab="log10(E-Value)", main="HHblits (Uniprot20)"); dev.off();


png(filename="ident_psi_80.png", height=400, width=800, bg="white"); cols2 = rainbow(4, alpha=0.6); br2 = seq(0,100,1); hist(data1$V3, col=cols[1], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", main="PSI-Blast (big80)"); hist(data2$V3, col=cols[2], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE); hist(data3$V3, col=cols[3], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE); hist(data4$V3, col=cols[4], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE); legend(60,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols); dev.off();


png(filename="ident_blast_80.png", height=400, width=800, bg="white"); hist(data5$V3, col="blue", breaks=100, xlim=range(0,100), xlab="Identity (%)", main="Blast (big80)"); dev.off();


png(filename="ident_hhblits.png", height=400, width=800, bg="white"); hist(100*(data6$V3), col="blue", breaks=50, xlim=range(0,100), xlab="Identity (%)", main="HHblits (Uniprot20)"); dev.off(); </source>