Difference between revisions of "Task2 Hemochromatosis Protocol"
Bernhoferm (talk | contribs) (→HHblits) |
Bernhoferm (talk | contribs) (→MSA) |
||
(6 intermediate revisions by 2 users not shown) | |||
Line 95: | Line 95: | ||
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-99clustalw.fasta -output=fasta |
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-99clustalw.fasta -output=fasta |
||
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-40.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-40clustalw.fasta -output=fasta |
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-40.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-40clustalw.fasta -output=fasta |
||
+ | |||
+ | GRP="/mnt/home/student/bernhoferm/task2/groups" |
||
+ | time t_coffee $GRP/group0-40.fasta -method pdb_pair,proba_pair -template_file $GRP/group0-40.template -output=fasta_aln |
||
+ | |||
+ | real 3m42.931s |
||
+ | user 1m50.940s |
||
+ | sys 0m23.180s |
||
+ | |||
+ | |||
+ | >group0-40.template |
||
+ | >sp|Q30201|HFE_HUMAN Hereditary hemochromatosis protein OS=Homo sapiens GN=HFE PE=1 SV=1 _P_ 1a6zA |
||
+ | >sp|P16391|HA12_RAT RT1 class I histocompatibility antigen, AA alpha chain OS=Rattus norvegicus PE=1 SV=2 _P_ 1ed3A |
||
+ | >sp|P05534|1A24_HUMAN HLA class I histocompatibility antigen, A-24 alpha chain OS=Homo sapiens GN=HLA-A PE=1 SV=2 _P_ 2bckA |
||
+ | >tr|Q30597|Q30597_MACMU MHC class I Mamu-A*02 (Fragment) OS=Macaca mulatta PE=1 SV=1 _P_ 3jtsA |
||
+ | >sp|P01900|HA12_MOUSE H-2 class I histocompatibility antigen, D-D alpha chain OS=Mus musculus GN=H2-D1 PE=1 SV=1 _P_ 1biiA |
||
+ | >tr|Q31093|Q31093_MOUSE Histocompatibility 2, M region locus 3 OS=Mus musculus GN=H2-M3 PE=1 SV=1 _P_ 1mhcA |
||
+ | >sp|P14432|HA1T_MOUSE H-2 class I histocompatibility antigen, TLA(B) alpha chain OS=Mus musculus GN=H2-T3 PE=1 SV=2 _P_ 1nezA |
||
+ | >tr|Q860W6|Q860W6_MOUSE Major histocompatibility complex class Ib M10.5 (Fragment) OS=Mus musculus GN=H2-M10.4 PE=1 SV=1 _P_ 1zs8A |
||
+ | >tr|Q31615|Q31615_MOUSE MHC class I H2-TL-27-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T22 PE=1 SV=1 _P_ 1c16A |
||
+ | >tr|Q31206|Q31206_MOUSE MHC class I H2-TL-T10-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T10 PE=1 SV=1 _P_ 1r3hA |
||
+ | >sp|P01921|HB2D_MOUSE H-2 class II histocompatibility antigen, A-D beta chain OS=Mus musculus GN=H2-Ab1 PE=1 SV=1 _P_ 1iaoB |
||
== Scripts == |
== Scripts == |
||
Line 308: | Line 329: | ||
close OUTFILE2; |
close OUTFILE2; |
||
</source> |
</source> |
||
+ | |||
+ | |||
+ | ===Statistics=== |
||
+ | |||
+ | COPS-Cluster extraction: |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | |||
+ | |||
+ | use strict; |
||
+ | #use warnings; |
||
+ | |||
+ | my $fileWithUniprotIDs=shift; |
||
+ | my $pdbData=shift; |
||
+ | my $COPSData=shift; |
||
+ | my $outputFile=shift; |
||
+ | |||
+ | my $line; |
||
+ | my %nameHash; |
||
+ | my @arrayOfPDBIDs; |
||
+ | |||
+ | my @usedIDs; |
||
+ | open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n"); |
||
+ | while (<MYFILE>) { |
||
+ | $line=$_; |
||
+ | chomp $line; |
||
+ | if ($line=~m/PDB:(.{4})/gi){ |
||
+ | $line=uc $1; |
||
+ | push (@arrayOfPDBIDs, $line); |
||
+ | } else { |
||
+ | push(@usedIDs, $line); |
||
+ | } |
||
+ | #print "$line\n"; |
||
+ | |||
+ | #print "$_\n"; |
||
+ | } |
||
+ | close (MYFILE); |
||
+ | my $schweinehundcounter; |
||
+ | my %uniprotToPDBHash; |
||
+ | my @tempArray; |
||
+ | |||
+ | |||
+ | open (UNIPROTTOPDB, "<$pdbData") or die ("PDB conversion file missing!"); |
||
+ | while (<UNIPROTTOPDB>) { |
||
+ | $line=$_; |
||
+ | chomp $line; |
||
+ | @tempArray=split('\s',$line); |
||
+ | |||
+ | if (exists $uniprotToPDBHash{$tempArray[0]}) { |
||
+ | push($uniprotToPDBHash{$tempArray[0]},$tempArray[1]); |
||
+ | $schweinehundcounter++; |
||
+ | #print "Schweinehunde fliegen weit!"; |
||
+ | |||
+ | } else { |
||
+ | my @newPDBArray=($tempArray[1]); |
||
+ | $uniprotToPDBHash{$tempArray[0]}=\@newPDBArray; |
||
+ | } |
||
+ | |||
+ | #$uniprotToPDB{$tempArray[0]}=$tempArray[1]; |
||
+ | # print "!!!!!$tempArray[0]!!!!!! !!!!!!!!$tempArray[1]"; |
||
+ | # print "$uniprotToPDB{$tempArray[0]} is da Shit!"; |
||
+ | |||
+ | } |
||
+ | |||
+ | |||
+ | |||
+ | open (COPSDATA, "<$COPSData") or die "COPS file, where are you????WHERE :*("; |
||
+ | |||
+ | my %refChainHash; |
||
+ | my %L99Hash; |
||
+ | my %L80Hash; |
||
+ | my %L60Hash; |
||
+ | my %L40Hash; |
||
+ | my %L30Hash; |
||
+ | my %S90Hash; |
||
+ | my %S30Hash; |
||
+ | |||
+ | |||
+ | |||
+ | my %L99HashReverse; |
||
+ | my %L80HashReverse; |
||
+ | my %L60HashReverse; |
||
+ | my %L40HashReverse; |
||
+ | my %L30HashReverse; |
||
+ | my %S90HashReverse; |
||
+ | my %S30HashReverse; |
||
+ | |||
+ | |||
+ | while (<COPSDATA>) { |
||
+ | $line=$_; |
||
+ | chomp $line; |
||
+ | @tempArray=split('\s',$line); |
||
+ | my $refChain =uc substr $tempArray[0],0,4; |
||
+ | my $L99 =uc substr $tempArray[1],0,4; |
||
+ | my $L80 =uc substr $tempArray[2],0,4; |
||
+ | my $L60 =uc substr $tempArray[3],0,4; |
||
+ | my $L40 =uc substr $tempArray[4],0,4; |
||
+ | my $L30 =uc substr $tempArray[5],0,4; |
||
+ | my $S90 =uc substr $tempArray[6],0,4; |
||
+ | my $S30 =uc substr $tempArray[7],0,4; |
||
+ | |||
+ | #print "$refChain $L99 $L80 $L60 $L40 $L30 $S90 $S30\n"; |
||
+ | |||
+ | $L99Hash{$L99}{$refChain}=1; |
||
+ | |||
+ | $L80Hash{$L80}{$refChain}=1; |
||
+ | |||
+ | $L60Hash{$L60}{$refChain}=1; |
||
+ | |||
+ | $L40Hash{$L40}{$refChain}=1; |
||
+ | |||
+ | $L30Hash{$L30}{$refChain}=1; |
||
+ | |||
+ | $S90Hash{$S90}{$refChain}=1; |
||
+ | |||
+ | $S30Hash{$S30}{$refChain}=1; |
||
+ | |||
+ | |||
+ | $L99HashReverse{$refChain}{$L99}=1; |
||
+ | |||
+ | $L80HashReverse{$refChain}{$L80}=1; |
||
+ | |||
+ | $L60HashReverse{$refChain}{$L60}=1; |
||
+ | |||
+ | $L40HashReverse{$refChain}{$L40}=1; |
||
+ | |||
+ | $L30HashReverse{$refChain}{$L30}=1; |
||
+ | |||
+ | $S90HashReverse{$refChain}{$S90}=1; |
||
+ | |||
+ | $S30HashReverse{$refChain}{$S30}=1; |
||
+ | |||
+ | } |
||
+ | |||
+ | |||
+ | #print "HOOOHOOOOO: ".$L99HashReverse{"1DE4"}; |
||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | my $testRechner=0; |
||
+ | |||
+ | #open (FILOR, ">test/hhblitsCompleteUniprot600.txt") or die ("du penner!"); |
||
+ | |||
+ | |||
+ | foreach my $entry (@usedIDs){ |
||
+ | |||
+ | #print FILOR "$entry\n"; |
||
+ | |||
+ | |||
+ | if (exists $uniprotToPDBHash{$entry}) { |
||
+ | # print "Hallo ich bin $entry, mein PDB-pendant ist/sind: @{$uniprotToPDBHash{$entry}}!\n"; |
||
+ | push (@arrayOfPDBIDs, @{$uniprotToPDBHash{$entry}}); |
||
+ | # print scalar(@{$uniprotToPDBHash{$entry}})."\n"; |
||
+ | # $testRechner=$testRechner+scalar(@{$uniprotToPDBHash{$entry}}); |
||
+ | # print $testRechner."\n"; |
||
+ | } |
||
+ | } |
||
+ | #print "While there were ". scalar(@sshole)." uniprot IDs.\n"; |
||
+ | |||
+ | |||
+ | |||
+ | #close (FILOR); |
||
+ | |||
+ | my $hardCodedPDB1="1DE4"; |
||
+ | my $hardCodedPDB2="1A6Z"; |
||
+ | |||
+ | my @hardCodedPDBs=($hardCodedPDB1,$hardCodedPDB2); |
||
+ | |||
+ | my %alreadyLFound; |
||
+ | my %alreadySFound; |
||
+ | |||
+ | #print "@arrayofPDBIDs"; |
||
+ | |||
+ | my %L99EndHash; |
||
+ | my %L80EndHash; |
||
+ | my %L60EndHash; |
||
+ | my %L40EndHash; |
||
+ | my %L30EndHash; |
||
+ | my %S90EndHash; |
||
+ | my %S30EndHash; |
||
+ | |||
+ | foreach my $foundPDBID (@arrayOfPDBIDs) { |
||
+ | foreach my $hardCoded (@hardCodedPDBs){ |
||
+ | foreach my $representativeL99OfFoundPDBID (keys $L99HashReverse{$hardCoded}){ |
||
+ | #print "$representativeL99OfFoundPDBID TROLOLOLOLO!"; |
||
+ | if (exists $L99Hash{$representativeL99OfFoundPDBID}{$foundPDBID}){ |
||
+ | $L99EndHash{$foundPDBID}=1; |
||
+ | #print "IT WORKED!!!!!\n"; # funktioniert hier weiterschreiben... |
||
+ | } |
||
+ | } |
||
+ | |||
+ | |||
+ | foreach my $representativeL80OfFoundPDBID (keys $L80HashReverse{$hardCoded}){ |
||
+ | |||
+ | if (exists $L80Hash{$representativeL80OfFoundPDBID}{$foundPDBID}){ |
||
+ | $L80EndHash{$foundPDBID}=1; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | |||
+ | foreach my $representativeL60OfFoundPDBID (keys $L60HashReverse{$hardCoded}){ |
||
+ | if (exists $L60Hash{$representativeL60OfFoundPDBID}{$foundPDBID}){ |
||
+ | $L60EndHash{$foundPDBID}=1; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | foreach my $representativeL40OfFoundPDBID (keys $L40HashReverse{$hardCoded}){ |
||
+ | |||
+ | if (exists $L40Hash{$representativeL40OfFoundPDBID}{$foundPDBID}){ |
||
+ | $L40EndHash{$foundPDBID}=1; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | foreach my $representativeL30OfFoundPDBID (keys $L30HashReverse{$hardCoded}){ |
||
+ | |||
+ | if (exists $L30Hash{$representativeL30OfFoundPDBID}{$foundPDBID}){ |
||
+ | $L30EndHash{$foundPDBID}=1; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | foreach my $representativeS90OfFoundPDBID (keys $S90HashReverse{$hardCoded}){ |
||
+ | |||
+ | if (exists $S90Hash{$representativeS90OfFoundPDBID}{$foundPDBID}){ |
||
+ | $S90EndHash{$foundPDBID}=1; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | foreach my $representativeS30OfFoundPDBID (keys $S30HashReverse{$hardCoded}){ |
||
+ | |||
+ | if (exists $S30Hash{$representativeS30OfFoundPDBID}{$foundPDBID}){ |
||
+ | $S30EndHash{$foundPDBID}=1; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | } |
||
+ | |||
+ | |||
+ | |||
+ | } |
||
+ | |||
+ | open(OUTPUTFILE, ">$outputFile"); |
||
+ | open(OUTPUTFILE2, ">$outputFile.pcs"); |
||
+ | print scalar(@arrayOfPDBIDs); |
||
+ | print scalar (keys %L99EndHash)."\tL99er\n"; |
||
+ | print scalar (keys %L80EndHash)."\tL80er\n"; |
||
+ | print scalar (keys %L60EndHash)."\tL60er\n"; |
||
+ | print scalar (keys %L40EndHash)."\tL40er\n"; |
||
+ | print scalar (keys %L30EndHash)."\tL30er\n"; |
||
+ | print scalar (keys %S90EndHash)."\tS90er\n"; |
||
+ | print scalar (keys %S30EndHash)."\tS30er\n"; |
||
+ | print "\n"; |
||
+ | print OUTPUTFILE scalar (keys %L99EndHash)."\tL99er\n"; |
||
+ | print OUTPUTFILE scalar (keys %L80EndHash)."\tL80er\n"; |
||
+ | print OUTPUTFILE scalar (keys %L60EndHash)."\tL60er\n"; |
||
+ | print OUTPUTFILE scalar (keys %L40EndHash)."\tL40er\n"; |
||
+ | print OUTPUTFILE scalar (keys %L30EndHash)."\tL30er\n"; |
||
+ | print OUTPUTFILE scalar (keys %S90EndHash)."\tS90er\n"; |
||
+ | print OUTPUTFILE scalar (keys %S30EndHash)."\tS30er\n"; |
||
+ | |||
+ | |||
+ | print OUTPUTFILE2 scalar (keys %L99EndHash)/scalar(@arrayOfPDBIDs)."\tL99er\n"; |
||
+ | print OUTPUTFILE2 scalar (keys %L80EndHash)/scalar(@arrayOfPDBIDs)."\tL80er\n"; |
||
+ | print OUTPUTFILE2 scalar (keys %L60EndHash)/scalar(@arrayOfPDBIDs)."\tL60er\n"; |
||
+ | print OUTPUTFILE2 scalar (keys %L40EndHash)/scalar(@arrayOfPDBIDs)."\tL40er\n"; |
||
+ | print OUTPUTFILE2 scalar (keys %L30EndHash)/scalar(@arrayOfPDBIDs)."\tL30er\n"; |
||
+ | print OUTPUTFILE2 scalar (keys %S90EndHash)/scalar(@arrayOfPDBIDs)."\tS90er\n"; |
||
+ | print OUTPUTFILE2 scalar (keys %S30EndHash)/scalar(@arrayOfPDBIDs)."\tS30er\n"; |
||
+ | |||
+ | |||
+ | </source> |
||
+ | |||
+ | |||
+ | |||
+ | |||
+ | Get how many of the found proteins have a common GO term with our HFE protein |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | |||
+ | use strict; |
||
+ | use warnings; |
||
+ | use diagnostics; |
||
+ | use sigtrap; |
||
+ | use autodie; |
||
+ | |||
+ | |||
+ | |||
+ | my $idFile=shift; |
||
+ | my $GOData=shift; |
||
+ | my $outputData=shift; |
||
+ | |||
+ | my $line; |
||
+ | |||
+ | my $hardcodedUniprotID="Q30201"; |
||
+ | |||
+ | my %hashOfAnnihilation; |
||
+ | $hashOfAnnihilation{$hardcodedUniprotID}=1; |
||
+ | |||
+ | open (MYFILE, $idFile) or die ("du penner!"); |
||
+ | while (<MYFILE>) { |
||
+ | $line=$_; |
||
+ | chomp $line; |
||
+ | if ($line=~m/PDB:(.{4})/gi){ |
||
+ | |||
+ | } else { |
||
+ | my $currentID=$line; |
||
+ | $hashOfAnnihilation{$currentID}=1; |
||
+ | |||
+ | } |
||
+ | |||
+ | |||
+ | |||
+ | } |
||
+ | close (MYFILE); |
||
+ | |||
+ | my %uniprotToPDBHash; |
||
+ | my @tempArray; |
||
+ | |||
+ | |||
+ | |||
+ | my %gOAssocs; |
||
+ | open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*("; |
||
+ | |||
+ | while (<GODATA>) { |
||
+ | $line=$_; |
||
+ | chomp $line; |
||
+ | |||
+ | |||
+ | |||
+ | my @tempGOArray=split('\s',$line); |
||
+ | my $tempID=$tempGOArray[0]; |
||
+ | shift @tempGOArray; |
||
+ | foreach my $tempGOTerm (@tempGOArray){ |
||
+ | if (exists $hashOfAnnihilation{$tempID}){ |
||
+ | |||
+ | $gOAssocs{$tempID}{$tempGOTerm}=1; |
||
+ | |||
+ | |||
+ | } |
||
+ | } |
||
+ | |||
+ | |||
+ | } |
||
+ | close (GODATA); |
||
+ | |||
+ | my $numberOfSimilar=0; |
||
+ | my $totalFound=0; |
||
+ | |||
+ | my %wantedHash; |
||
+ | |||
+ | open (PERCENTAGEOUTPUTOFSHAREDCLUSTER, ">$outputData") or die "WHY, CRUEL WORLD? WHY? meh whatever..."; |
||
+ | |||
+ | foreach my $uniprotIDofCluster (keys %hashOfAnnihilation) { |
||
+ | my $numberOfCurrentHits=0; |
||
+ | |||
+ | if (exists $gOAssocs{$uniprotIDofCluster}){ |
||
+ | |||
+ | foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){ |
||
+ | |||
+ | |||
+ | |||
+ | if (exists $gOAssocs{$hardcodedUniprotID}{$currentGO}){ |
||
+ | $numberOfCurrentHits++; |
||
+ | $totalFound++; |
||
+ | if (exists $wantedHash{$currentGO}) { |
||
+ | $wantedHash{$currentGO}=$wantedHash{$currentGO}+1; |
||
+ | } else { |
||
+ | $wantedHash{$currentGO}=1; |
||
+ | } |
||
+ | |||
+ | |||
+ | } |
||
+ | |||
+ | } |
||
+ | my $roundedPercentageOfHardCodedProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$hardcodedUniprotID}))) + 0.5) / 10000; |
||
+ | my $roundedPercentageOfCurrentProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$uniprotIDofCluster}))) + 0.5) / 10000; |
||
+ | |||
+ | |||
+ | print PERCENTAGEOUTPUTOFSHAREDCLUSTER "$uniprotIDofCluster\t$roundedPercentageOfHardCodedProteinsGOTerms\t$roundedPercentageOfCurrentProteinsGOTerms\t$numberOfCurrentHits\n"; |
||
+ | } |
||
+ | |||
+ | if ($numberOfCurrentHits>0){ |
||
+ | $numberOfSimilar++; |
||
+ | } |
||
+ | |||
+ | |||
+ | } |
||
+ | print "$totalFound wurden insgesamt gefunden, mit insgesamt $numberOfSimilar ähnlichen.\n"; |
||
+ | close (PERCENTAGEOUTPUTOFSHAREDCLUSTER); |
||
+ | foreach my $key (keys %wantedHash){ |
||
+ | print "$key: $wantedHash{$key}\n"; |
||
+ | } |
||
+ | |||
+ | </source> |
||
+ | |||
+ | |||
+ | |||
+ | |||
=== MSA === |
=== MSA === |
||
+ | |||
+ | GapConserve |
||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | |||
+ | use strict; |
||
+ | use warnings; |
||
+ | use diagnostics; |
||
+ | use sigtrap; |
||
+ | use autodie; |
||
+ | |||
+ | my $aliFile=shift; |
||
+ | |||
+ | my $lastID="Trötofant"; |
||
+ | my $currentSequence; |
||
+ | my $currentNumberOfGaps=0; |
||
+ | my %sequenceHash; |
||
+ | my %gapCounter; |
||
+ | open (MYFILE, $aliFile) or die ("weak!"); |
||
+ | while (<MYFILE>) { |
||
+ | |||
+ | my $line=$_; |
||
+ | chomp $line; |
||
+ | if ($line=~m/^>.*?\|(.*?)\|/gi){ |
||
+ | if ($lastID ne "Trötofant"){ |
||
+ | $sequenceHash{$lastID}=$currentSequence; |
||
+ | $currentSequence=""; |
||
+ | } |
||
+ | $lastID=$1; |
||
+ | |||
+ | } else { |
||
+ | $currentSequence=$currentSequence . $line; |
||
+ | |||
+ | } |
||
+ | |||
+ | } |
||
+ | close MYFILE; |
||
+ | $sequenceHash{$lastID}=$currentSequence; |
||
+ | |||
+ | foreach my $key (keys %sequenceHash){ |
||
+ | my @gapCount=$sequenceHash{$key}=~ m/-/gi; |
||
+ | $gapCounter{$key}= scalar @gapCount; |
||
+ | print "$gapCounter{$key} Gaps gefunden! Bei Uniprot $key\n"; |
||
+ | } |
||
+ | |||
+ | my $sequenceLength=scalar (split ("",$currentSequence)); |
||
+ | |||
+ | my $consensusCounter=0; |
||
+ | for (my $i=0;$i<$sequenceLength;$i++){ |
||
+ | my $bool=1; |
||
+ | my $lastChar=""; |
||
+ | foreach my $key (keys %sequenceHash){ |
||
+ | if ( $bool==1 && ($lastChar eq (split ("",$sequenceHash{$key}))[$i] || $lastChar eq "") && (split ("",$sequenceHash{$key}))[$i] ne "-"){ |
||
+ | #print "alter Char: ----> $lastChar <----"; |
||
+ | $lastChar=(split ("",$sequenceHash{$key}))[$i]; |
||
+ | #print "selber Char gefunden, Baby! ----> $lastChar <---- an Pos $i\n"; |
||
+ | } else { |
||
+ | $bool=0; |
||
+ | $lastChar="-"; |
||
+ | } |
||
+ | } |
||
+ | if ($bool==1){ |
||
+ | $consensusCounter++ |
||
+ | } |
||
+ | } |
||
+ | |||
+ | print "$consensusCounter komplett übereinstimmende Positionen (Conserved)\n"; |
||
+ | </source> |
||
+ | |||
+ | |||
+ | === Other === |
||
+ | |||
+ | Script to get only GO annotations of proteins one is interested in out of a GO file: |
||
+ | |||
+ | <source lang="perl"> |
||
+ | #!/usr/local/bin/perl |
||
+ | |||
+ | |||
+ | use strict; |
||
+ | use warnings; |
||
+ | use diagnostics; |
||
+ | use sigtrap; |
||
+ | use autodie; |
||
+ | |||
+ | my $fileWithUniprotIDs=shift; |
||
+ | my $GOData=shift; |
||
+ | my $outputFile=shift; |
||
+ | my $line; |
||
+ | my $hardcodedUniprotID="Q30201"; |
||
+ | my %hashOfAnnihilation; |
||
+ | my @arrayOfPDBIDs; |
||
+ | my @usedIDs; |
||
+ | push(@usedIDs, $hardcodedUniprotID); |
||
+ | |||
+ | open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n"); |
||
+ | while (<MYFILE>) { |
||
+ | $line=$_; |
||
+ | chomp $line; |
||
+ | if ($line=~m/PDB:(.{4})/gi){ |
||
+ | $line=uc $1; |
||
+ | push (@arrayOfPDBIDs, $line); |
||
+ | } else { |
||
+ | push(@usedIDs, $line); |
||
+ | $hashOfAnnihilation{$line}=1; |
||
+ | } |
||
+ | |||
+ | } |
||
+ | close (MYFILE); |
||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | my %uniprotToPDBHash; |
||
+ | my @tempArray; |
||
+ | |||
+ | |||
+ | |||
+ | my %gOAssocs; |
||
+ | open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*("; |
||
+ | |||
+ | while (<GODATA>) { |
||
+ | $line=$_; |
||
+ | chomp $line; |
||
+ | |||
+ | if ($line=~m/^UniprotKB.*/gi) { |
||
+ | |||
+ | my @tempGOArray=split('\s',$line); |
||
+ | |||
+ | if (exists $hashOfAnnihilation{$tempGOArray[1]}){ |
||
+ | |||
+ | $gOAssocs{$tempGOArray[1]}{$tempGOArray[4]}=1; |
||
+ | |||
+ | |||
+ | } |
||
+ | # print "$tempGOArray[1]!wurd hinzugefuegt mit $tempGOArray[4]!.\n"; |
||
+ | } |
||
+ | } |
||
+ | close (GODATA); |
||
+ | |||
+ | |||
+ | |||
+ | open (OUTPUT, ">$outputFile") or die "Cannot write to $outputFile...\n"; |
||
+ | foreach my $uniprotIDofCluster (@usedIDs) { |
||
+ | my $numberOfCurrentHits=0; |
||
+ | |||
+ | if (exists $gOAssocs{$uniprotIDofCluster}){ |
||
+ | print OUTPUT "$uniprotIDofCluster\t"; |
||
+ | print "$uniprotIDofCluster\t"; |
||
+ | foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){ |
||
+ | print "$currentGO\t"; |
||
+ | print OUTPUT "$currentGO\t"; |
||
+ | |||
+ | } |
||
+ | print "\n"; |
||
+ | print OUTPUT "\n"; |
||
+ | |||
+ | } |
||
+ | |||
+ | } |
||
+ | close (OUTPUT); |
||
+ | |||
+ | |||
+ | </source> |
||
+ | |||
+ | |||
+ | R script for the e-Value and identity distributions (yep, everything hardcoded^^). |
||
+ | |||
+ | |||
+ | <source lang="bash"> |
||
+ | data1<-read.table("blastpgp_h0.002_j2_10000.last.stats"); |
||
+ | data2<-read.table("blastpgp_h0.002_j10_10000.last.stats"); |
||
+ | data3<-read.table("blastpgp_h1e-10_j2_10000.last.stats"); |
||
+ | data4<-read.table("blastpgp_h1e-10_j10_10000.last.stats"); |
||
+ | data5<-read.table("blast_default_1500.out.stats"); |
||
+ | data6<-read.table("hhblits_default_600.out.stats"); |
||
+ | |||
+ | data7<-read.table("blastpgp_big_h0.002_j2_100000.last.stats"); |
||
+ | data8<-read.table("blastpgp_big_h0.002_j10_100000.last.stats"); |
||
+ | data9<-read.table("blastpgp_big_h1e-10_j2_100000.last.stats"); |
||
+ | data10<-read.table("blastpgp_big_h1e-10_j10_100000.last.stats"); |
||
+ | |||
+ | |||
+ | png(filename="eval_psi_80.png", height=400, width=800, bg="white"); |
||
+ | cols = rainbow(4, alpha=0.6); |
||
+ | br = seq(-150,0,1); |
||
+ | hist(log10(data1$V2), col=cols[1], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", main="PSI-Blast (big80)"); |
||
+ | hist(log10(data2$V2), col=cols[2], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE); |
||
+ | hist(log10(data3$V2), col=cols[3], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE); |
||
+ | hist(log10(data4$V2), col=cols[4], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE); |
||
+ | legend(-150,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols); |
||
+ | dev.off(); |
||
+ | |||
+ | |||
+ | png(filename="eval_blast_80.png", height=400, width=800, bg="white"); |
||
+ | hist(log10(data5$V2), col="blue", breaks=100, xlab="log10(E-Value)", main="Blast (big80)"); |
||
+ | dev.off(); |
||
+ | |||
+ | |||
+ | png(filename="eval_hhblits.png", height=400, width=800, bg="white"); |
||
+ | hist(log10(data6$V2), col="blue", breaks=50, xlab="log10(E-Value)", main="HHblits (Uniprot20)"); |
||
+ | dev.off(); |
||
+ | |||
+ | |||
+ | png(filename="ident_psi_80.png", height=400, width=800, bg="white"); |
||
+ | cols2 = rainbow(4, alpha=0.6); |
||
+ | br2 = seq(0,100,1); |
||
+ | hist(data1$V3, col=cols[1], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", main="PSI-Blast (big80)"); |
||
+ | hist(data2$V3, col=cols[2], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE); |
||
+ | hist(data3$V3, col=cols[3], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE); |
||
+ | hist(data4$V3, col=cols[4], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE); |
||
+ | legend(60,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols); |
||
+ | dev.off(); |
||
+ | |||
+ | |||
+ | png(filename="ident_blast_80.png", height=400, width=800, bg="white"); |
||
+ | hist(data5$V3, col="blue", breaks=100, xlim=range(0,100), xlab="Identity (%)", main="Blast (big80)"); |
||
+ | dev.off(); |
||
+ | |||
+ | |||
+ | png(filename="ident_hhblits.png", height=400, width=800, bg="white"); |
||
+ | hist(100*(data6$V3), col="blue", breaks=50, xlim=range(0,100), xlab="Identity (%)", main="HHblits (Uniprot20)"); |
||
+ | dev.off(); |
||
+ | </source> |
Latest revision as of 19:41, 7 May 2012
Contents
Sequence Searches
Blast
TSK="/mnt/home/student/bernhoferm/task2" BIG="/mnt/project/pracstrucfunc12/data/big" time blastall -p blastp -d $BIG/big_80 -i $TSK/Q30201.fasta -b 1500 -m 8 -o $TSK/align2/blast_default_1500.out real 1m24.100s user 1m5.950s sys 0m3.500s
PSI-Blast
TSK="/mnt/home/student/bernhoferm/task2" BIG="/mnt/project/pracstrucfunc12/data/big" time blastpgp -C $TSK/h002j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j2_10000.out real 3m21.031s user 2m44.750s sys 0m10.270s time blastpgp -C $TSK/h002j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j10_10000.out real 16m39.699s user 14m22.150s sys 0m41.310s time blastpgp -C $TSK/h1e-10j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j2_10000.out real 3m6.093s user 2m48.260s sys 0m6.020s time blastpgp -C $TSK/h1e-10j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j10_10000.out real 16m41.797s user 15m18.350s sys 0m28.890s time blastpgp -R $TSK/h002j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j2_100000.out real 28m17.459s user 18m26.930s sys 2m46.200s time blastpgp -R $TSK/h002j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j10_100000.out real 367m15.021s (srsly...) user 310m31.210s sys 14m39.460s ...and some thousand "[blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048" messages time blastpgp -R $TSK/h1e-10j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j2_100000.out real 26m43.350s user 19m54.380s sys 2m3.900s time blastpgp -R $TSK/h1e-10j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j10_100000.out real 64m4.575s user 47m16.970s sys 4m54.890s ...and some thousand "[blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048" messages
HHblits
TSK="/mnt/home/student/bernhoferm/task2" HHB="/mnt/project/pracstrucfunc12/data/hhblits" time hhblits -d $HHB/uniprot20_current -i $TSK/Q30201.fasta -Z 600 -B 600 -o $TSK/align2/hhblits_default_600.out real 13m13.813s user 5m4.400s sys 2m45.520s
MSA
#!/bin/bash time t_coffee /mnt/home/student/joerdensv/newGroups/group0-40.fasta -output=fasta_aln time t_coffee /mnt/home/student/joerdensv/newGroups/group0-99.fasta -output=fasta_aln time t_coffee /mnt/home/student/joerdensv/newGroups/group60-99.fasta -output=fasta_aln time muscle -in /mnt/home/student/joerdensv/newGroups/group60-99.fasta -out group60-99muscle.fasta -quiet time muscle -in /mnt/home/student/joerdensv/newGroups/group0-99.fasta -out group0-99muscle.fasta -quiet time muscle -in /mnt/home/student/joerdensv/newGroups/group0-40.fasta -out group0-40muscle.fasta -quiet time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group60-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group60-99clustalw.fasta -output=fasta time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-99clustalw.fasta -output=fasta time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-40.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-40clustalw.fasta -output=fasta
GRP="/mnt/home/student/bernhoferm/task2/groups" time t_coffee $GRP/group0-40.fasta -method pdb_pair,proba_pair -template_file $GRP/group0-40.template -output=fasta_aln real 3m42.931s user 1m50.940s sys 0m23.180s
>group0-40.template
>sp|Q30201|HFE_HUMAN Hereditary hemochromatosis protein OS=Homo sapiens GN=HFE PE=1 SV=1 _P_ 1a6zA >sp|P16391|HA12_RAT RT1 class I histocompatibility antigen, AA alpha chain OS=Rattus norvegicus PE=1 SV=2 _P_ 1ed3A >sp|P05534|1A24_HUMAN HLA class I histocompatibility antigen, A-24 alpha chain OS=Homo sapiens GN=HLA-A PE=1 SV=2 _P_ 2bckA >tr|Q30597|Q30597_MACMU MHC class I Mamu-A*02 (Fragment) OS=Macaca mulatta PE=1 SV=1 _P_ 3jtsA >sp|P01900|HA12_MOUSE H-2 class I histocompatibility antigen, D-D alpha chain OS=Mus musculus GN=H2-D1 PE=1 SV=1 _P_ 1biiA >tr|Q31093|Q31093_MOUSE Histocompatibility 2, M region locus 3 OS=Mus musculus GN=H2-M3 PE=1 SV=1 _P_ 1mhcA >sp|P14432|HA1T_MOUSE H-2 class I histocompatibility antigen, TLA(B) alpha chain OS=Mus musculus GN=H2-T3 PE=1 SV=2 _P_ 1nezA >tr|Q860W6|Q860W6_MOUSE Major histocompatibility complex class Ib M10.5 (Fragment) OS=Mus musculus GN=H2-M10.4 PE=1 SV=1 _P_ 1zs8A >tr|Q31615|Q31615_MOUSE MHC class I H2-TL-27-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T22 PE=1 SV=1 _P_ 1c16A >tr|Q31206|Q31206_MOUSE MHC class I H2-TL-T10-129 mRNA (b haplotype), complete cds OS=Mus musculus GN=H2-T10 PE=1 SV=1 _P_ 1r3hA >sp|P01921|HB2D_MOUSE H-2 class II histocompatibility antigen, A-D beta chain OS=Mus musculus GN=H2-Ab1 PE=1 SV=1 _P_ 1iaoB
Scripts
(PSI-)BLAST
The output files from BLAST and PSI-BLAST were parsed and filtered with the following perl script. The PSI-BLAST output "filtered" for the last iteration step by hand (before using the script). If mutliple hits with the same ID were found, only the first one was considered (the one with the best e-Value).
>getM8StatsAndIDs.pl
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $filename = shift; my $outputFile = "$filename.stats"; my $outputFile2 = "$filename.ids";
my %IDtoEVal; my %IDtoIdent;
my $id; my $eVal; my $ident;
open (FILE, "<", $filename) or die;
while (<FILE>) { my $line = $_;
my @content = split(/\t/, $line);
$id = $content[1]; $eVal = $content[10]; $ident = $content[2];
$id =~s/\s//gi; $eVal =~s/\s//gi; $ident =~s/\s//gi;
$id =~m/.*?\|(.*?)\|(.*?)$/gi;
my $uniID = $1; my $pdbID = $2;
if ($uniID=~m/^pdb$/gi) { $id = "PDB:".$pdbID; } else { $id = $uniID; }
if ($eVal=~m/^e/) { $eVal="1".$eVal; }
if (not defined $IDtoEVal{$id}) { $IDtoEVal{$id} = $eVal; $IDtoIdent{$id} = $ident; } }
close FILE;
- Sort Data
- my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;
my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;
- Thresholds
my $EValThreshold = 0.002; my $IdentThreshold = 0;
open (OUTFILE, ">", $outputFile) or die;
open (OUTFILE2, ">", $outputFile2) or die;
foreach my $key (@keys) { if ($IDtoEVal{$key} <= $EValThreshold && $IDtoIdent{$key} >= $IdentThreshold) { print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n"; print OUTFILE2 "$key\n"; } }
close OUTFILE; close OUTFILE2; </source>
HHblits
The perl script bellow was used to parse and filter the HHblits output. For multiple hits again only the first one was considered.
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $filename = shift; my $outputFile = "$filename.stats"; my $outputFile2 = "$filename.ids";
my %IDtoEVal; my %IDtoIdent; my %UniprotIDs;
my $id; my $eVal; my $ident; my $count = 1;
open (FILE, "<", $filename) or die;
while (<FILE>) { my $line = $_;
if ($line=~m/>UP20\|(.*?)\|.+/gi) { $id = $1;
my @tmp = split(/\|/, $line); my $uid;
for (my $i=4; $i < scalar(@tmp); $i++) { $uid = $tmp[$i]; $uid =~ s/\s*\[.*?\]//; $uid =~ s/\.//; $uid =~ s/\s//; $UniprotIDs{$uid}="alive"; } }
if ($line=~m/.*?E-value\s*=\s*(\d+?\D*?\d*?\D*?\d*?)\s+/gi) { $eVal = $1;
if ($eVal=~m/^e/) { $eVal="1".$eVal; }
if (not defined $IDtoEVal{$id}) { $IDtoEVal{$id} = $eVal; } }
if ($line=~m/.*?Identities\s*=\s*(\d*?)\D/gi) { $ident = $1;
if (not defined $IDtoIdent{$id}) { $IDtoIdent{$id} = $ident/100; } } }
close FILE;
- Sort Data
- my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;
my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;
- Thresholds
my $EValThreshold = 0.003; #To include the only cluster with a worse e-Value than 0.001 which has 0.002!
open (OUTFILE, ">", $outputFile) or die;
foreach my $key (@keys) { if ($IDtoEVal{$key} <= $EValThreshold) { print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n"; } }
close OUTFILE;
open (OUTFILE2, ">", $outputFile2) or die;
foreach my $key (keys %UniprotIDs) { print OUTFILE2 "$key\n"; }
close OUTFILE2; </source>
Statistics
COPS-Cluster extraction:
<source lang="perl">
- !/usr/local/bin/perl
use strict;
- use warnings;
my $fileWithUniprotIDs=shift; my $pdbData=shift; my $COPSData=shift; my $outputFile=shift;
my $line; my %nameHash; my @arrayOfPDBIDs;
my @usedIDs; open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n");
while (<MYFILE>) {
$line=$_; chomp $line; if ($line=~m/PDB:(.{4})/gi){ $line=uc $1; push (@arrayOfPDBIDs, $line); } else { push(@usedIDs, $line); }
- print "$line\n";
- print "$_\n";
} close (MYFILE);
my $schweinehundcounter; my %uniprotToPDBHash; my @tempArray;
open (UNIPROTTOPDB, "<$pdbData") or die ("PDB conversion file missing!");
while (<UNIPROTTOPDB>) {
$line=$_;
chomp $line;
@tempArray=split('\s',$line);
if (exists $uniprotToPDBHash{$tempArray[0]}) { push($uniprotToPDBHash{$tempArray[0]},$tempArray[1]); $schweinehundcounter++; #print "Schweinehunde fliegen weit!";
} else { my @newPDBArray=($tempArray[1]); $uniprotToPDBHash{$tempArray[0]}=\@newPDBArray; }
#$uniprotToPDB{$tempArray[0]}=$tempArray[1];
- print "!!!!!$tempArray[0]!!!!!! !!!!!!!!$tempArray[1]";
- print "$uniprotToPDB{$tempArray[0]} is da Shit!";
}
open (COPSDATA, "<$COPSData") or die "COPS file, where are you????WHERE :*(";
my %refChainHash; my %L99Hash; my %L80Hash; my %L60Hash; my %L40Hash; my %L30Hash; my %S90Hash; my %S30Hash;
my %L99HashReverse; my %L80HashReverse; my %L60HashReverse; my %L40HashReverse; my %L30HashReverse; my %S90HashReverse; my %S30HashReverse;
while (<COPSDATA>) {
$line=$_;
chomp $line;
@tempArray=split('\s',$line);
my $refChain =uc substr $tempArray[0],0,4;
my $L99 =uc substr $tempArray[1],0,4;
my $L80 =uc substr $tempArray[2],0,4;
my $L60 =uc substr $tempArray[3],0,4;
my $L40 =uc substr $tempArray[4],0,4;
my $L30 =uc substr $tempArray[5],0,4;
my $S90 =uc substr $tempArray[6],0,4;
my $S30 =uc substr $tempArray[7],0,4;
#print "$refChain $L99 $L80 $L60 $L40 $L30 $S90 $S30\n";
$L99Hash{$L99}{$refChain}=1;
$L80Hash{$L80}{$refChain}=1;
$L60Hash{$L60}{$refChain}=1;
$L40Hash{$L40}{$refChain}=1;
$L30Hash{$L30}{$refChain}=1;
$S90Hash{$S90}{$refChain}=1;
$S30Hash{$S30}{$refChain}=1;
$L99HashReverse{$refChain}{$L99}=1;
$L80HashReverse{$refChain}{$L80}=1;
$L60HashReverse{$refChain}{$L60}=1;
$L40HashReverse{$refChain}{$L40}=1;
$L30HashReverse{$refChain}{$L30}=1;
$S90HashReverse{$refChain}{$S90}=1;
$S30HashReverse{$refChain}{$S30}=1;
}
- print "HOOOHOOOOO: ".$L99HashReverse{"1DE4"};
my $testRechner=0;
- open (FILOR, ">test/hhblitsCompleteUniprot600.txt") or die ("du penner!");
foreach my $entry (@usedIDs){
#print FILOR "$entry\n";
if (exists $uniprotToPDBHash{$entry}) {
# print "Hallo ich bin $entry, mein PDB-pendant ist/sind: @{$uniprotToPDBHash{$entry}}!\n";
push (@arrayOfPDBIDs, @{$uniprotToPDBHash{$entry}});
# print scalar(@{$uniprotToPDBHash{$entry}})."\n";
# $testRechner=$testRechner+scalar(@{$uniprotToPDBHash{$entry}});
# print $testRechner."\n";
}
}
- print "While there were ". scalar(@sshole)." uniprot IDs.\n";
- close (FILOR);
my $hardCodedPDB1="1DE4"; my $hardCodedPDB2="1A6Z";
my @hardCodedPDBs=($hardCodedPDB1,$hardCodedPDB2);
my %alreadyLFound; my %alreadySFound;
- print "@arrayofPDBIDs";
my %L99EndHash; my %L80EndHash; my %L60EndHash; my %L40EndHash; my %L30EndHash; my %S90EndHash; my %S30EndHash;
foreach my $foundPDBID (@arrayOfPDBIDs) { foreach my $hardCoded (@hardCodedPDBs){ foreach my $representativeL99OfFoundPDBID (keys $L99HashReverse{$hardCoded}){ #print "$representativeL99OfFoundPDBID TROLOLOLOLO!"; if (exists $L99Hash{$representativeL99OfFoundPDBID}{$foundPDBID}){ $L99EndHash{$foundPDBID}=1; #print "IT WORKED!!!!!\n"; # funktioniert hier weiterschreiben... } }
foreach my $representativeL80OfFoundPDBID (keys $L80HashReverse{$hardCoded}){
if (exists $L80Hash{$representativeL80OfFoundPDBID}{$foundPDBID}){ $L80EndHash{$foundPDBID}=1; } }
foreach my $representativeL60OfFoundPDBID (keys $L60HashReverse{$hardCoded}){
if (exists $L60Hash{$representativeL60OfFoundPDBID}{$foundPDBID}){
$L60EndHash{$foundPDBID}=1;
}
}
foreach my $representativeL40OfFoundPDBID (keys $L40HashReverse{$hardCoded}){
if (exists $L40Hash{$representativeL40OfFoundPDBID}{$foundPDBID}){ $L40EndHash{$foundPDBID}=1; } }
foreach my $representativeL30OfFoundPDBID (keys $L30HashReverse{$hardCoded}){
if (exists $L30Hash{$representativeL30OfFoundPDBID}{$foundPDBID}){ $L30EndHash{$foundPDBID}=1; } }
foreach my $representativeS90OfFoundPDBID (keys $S90HashReverse{$hardCoded}){
if (exists $S90Hash{$representativeS90OfFoundPDBID}{$foundPDBID}){ $S90EndHash{$foundPDBID}=1; } }
foreach my $representativeS30OfFoundPDBID (keys $S30HashReverse{$hardCoded}){
if (exists $S30Hash{$representativeS30OfFoundPDBID}{$foundPDBID}){ $S30EndHash{$foundPDBID}=1; } }
}
}
open(OUTPUTFILE, ">$outputFile"); open(OUTPUTFILE2, ">$outputFile.pcs"); print scalar(@arrayOfPDBIDs); print scalar (keys %L99EndHash)."\tL99er\n"; print scalar (keys %L80EndHash)."\tL80er\n"; print scalar (keys %L60EndHash)."\tL60er\n"; print scalar (keys %L40EndHash)."\tL40er\n"; print scalar (keys %L30EndHash)."\tL30er\n"; print scalar (keys %S90EndHash)."\tS90er\n"; print scalar (keys %S30EndHash)."\tS30er\n"; print "\n"; print OUTPUTFILE scalar (keys %L99EndHash)."\tL99er\n"; print OUTPUTFILE scalar (keys %L80EndHash)."\tL80er\n"; print OUTPUTFILE scalar (keys %L60EndHash)."\tL60er\n"; print OUTPUTFILE scalar (keys %L40EndHash)."\tL40er\n"; print OUTPUTFILE scalar (keys %L30EndHash)."\tL30er\n"; print OUTPUTFILE scalar (keys %S90EndHash)."\tS90er\n"; print OUTPUTFILE scalar (keys %S30EndHash)."\tS30er\n";
print OUTPUTFILE2 scalar (keys %L99EndHash)/scalar(@arrayOfPDBIDs)."\tL99er\n";
print OUTPUTFILE2 scalar (keys %L80EndHash)/scalar(@arrayOfPDBIDs)."\tL80er\n";
print OUTPUTFILE2 scalar (keys %L60EndHash)/scalar(@arrayOfPDBIDs)."\tL60er\n";
print OUTPUTFILE2 scalar (keys %L40EndHash)/scalar(@arrayOfPDBIDs)."\tL40er\n";
print OUTPUTFILE2 scalar (keys %L30EndHash)/scalar(@arrayOfPDBIDs)."\tL30er\n";
print OUTPUTFILE2 scalar (keys %S90EndHash)/scalar(@arrayOfPDBIDs)."\tS90er\n";
print OUTPUTFILE2 scalar (keys %S30EndHash)/scalar(@arrayOfPDBIDs)."\tS30er\n";
</source>
Get how many of the found proteins have a common GO term with our HFE protein
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $idFile=shift; my $GOData=shift; my $outputData=shift;
my $line;
my $hardcodedUniprotID="Q30201";
my %hashOfAnnihilation; $hashOfAnnihilation{$hardcodedUniprotID}=1;
open (MYFILE, $idFile) or die ("du penner!");
while (<MYFILE>) {
$line=$_; chomp $line;
if ($line=~m/PDB:(.{4})/gi){
} else { my $currentID=$line; $hashOfAnnihilation{$currentID}=1;
}
} close (MYFILE);
my %uniprotToPDBHash; my @tempArray;
my %gOAssocs; open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*(";
while (<GODATA>) { $line=$_; chomp $line;
my @tempGOArray=split('\s',$line); my $tempID=$tempGOArray[0]; shift @tempGOArray; foreach my $tempGOTerm (@tempGOArray){ if (exists $hashOfAnnihilation{$tempID}){
$gOAssocs{$tempID}{$tempGOTerm}=1;
}
}
}
close (GODATA);
my $numberOfSimilar=0; my $totalFound=0;
my %wantedHash;
open (PERCENTAGEOUTPUTOFSHAREDCLUSTER, ">$outputData") or die "WHY, CRUEL WORLD? WHY? meh whatever...";
foreach my $uniprotIDofCluster (keys %hashOfAnnihilation) { my $numberOfCurrentHits=0;
if (exists $gOAssocs{$uniprotIDofCluster}){
foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){
if (exists $gOAssocs{$hardcodedUniprotID}{$currentGO}){ $numberOfCurrentHits++; $totalFound++; if (exists $wantedHash{$currentGO}) { $wantedHash{$currentGO}=$wantedHash{$currentGO}+1; } else { $wantedHash{$currentGO}=1; }
}
} my $roundedPercentageOfHardCodedProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$hardcodedUniprotID}))) + 0.5) / 10000; my $roundedPercentageOfCurrentProteinsGOTerms=int(10000 * ($numberOfCurrentHits/(scalar (keys $gOAssocs{$uniprotIDofCluster}))) + 0.5) / 10000;
print PERCENTAGEOUTPUTOFSHAREDCLUSTER "$uniprotIDofCluster\t$roundedPercentageOfHardCodedProteinsGOTerms\t$roundedPercentageOfCurrentProteinsGOTerms\t$numberOfCurrentHits\n";
}
if ($numberOfCurrentHits>0){ $numberOfSimilar++; }
}
print "$totalFound wurden insgesamt gefunden, mit insgesamt $numberOfSimilar ähnlichen.\n";
close (PERCENTAGEOUTPUTOFSHAREDCLUSTER);
foreach my $key (keys %wantedHash){
print "$key: $wantedHash{$key}\n";
}
</source>
MSA
GapConserve <source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $aliFile=shift;
my $lastID="Trötofant"; my $currentSequence; my $currentNumberOfGaps=0; my %sequenceHash; my %gapCounter; open (MYFILE, $aliFile) or die ("weak!");
while (<MYFILE>) {
my $line=$_; chomp $line; if ($line=~m/^>.*?\|(.*?)\|/gi){ if ($lastID ne "Trötofant"){ $sequenceHash{$lastID}=$currentSequence; $currentSequence=""; } $lastID=$1;
} else { $currentSequence=$currentSequence . $line;
}
} close MYFILE; $sequenceHash{$lastID}=$currentSequence;
foreach my $key (keys %sequenceHash){ my @gapCount=$sequenceHash{$key}=~ m/-/gi; $gapCounter{$key}= scalar @gapCount; print "$gapCounter{$key} Gaps gefunden! Bei Uniprot $key\n"; }
my $sequenceLength=scalar (split ("",$currentSequence));
my $consensusCounter=0; for (my $i=0;$i<$sequenceLength;$i++){ my $bool=1; my $lastChar=""; foreach my $key (keys %sequenceHash){ if ( $bool==1 && ($lastChar eq (split ("",$sequenceHash{$key}))[$i] || $lastChar eq "") && (split ("",$sequenceHash{$key}))[$i] ne "-"){ #print "alter Char: ----> $lastChar <----"; $lastChar=(split ("",$sequenceHash{$key}))[$i]; #print "selber Char gefunden, Baby! ----> $lastChar <---- an Pos $i\n"; } else { $bool=0; $lastChar="-"; } } if ($bool==1){ $consensusCounter++ } }
print "$consensusCounter komplett übereinstimmende Positionen (Conserved)\n"; </source>
Other
Script to get only GO annotations of proteins one is interested in out of a GO file:
<source lang="perl">
- !/usr/local/bin/perl
use strict;
use warnings;
use diagnostics;
use sigtrap;
use autodie;
my $fileWithUniprotIDs=shift; my $GOData=shift; my $outputFile=shift; my $line; my $hardcodedUniprotID="Q30201"; my %hashOfAnnihilation; my @arrayOfPDBIDs; my @usedIDs; push(@usedIDs, $hardcodedUniprotID);
open (MYFILE, $fileWithUniprotIDs) or die ("Uniprot ID Data missing... :*( \n");
while (<MYFILE>) {
$line=$_; chomp $line; if ($line=~m/PDB:(.{4})/gi){ $line=uc $1; push (@arrayOfPDBIDs, $line); } else { push(@usedIDs, $line); $hashOfAnnihilation{$line}=1; }
} close (MYFILE);
my %uniprotToPDBHash; my @tempArray;
my %gOAssocs; open (GODATA, "<$GOData") or die "GO Data, GO Data GO? GO???? :*(";
while (<GODATA>) { $line=$_; chomp $line;
if ($line=~m/^UniprotKB.*/gi) {
my @tempGOArray=split('\s',$line);
if (exists $hashOfAnnihilation{$tempGOArray[1]}){
$gOAssocs{$tempGOArray[1]}{$tempGOArray[4]}=1;
}
# print "$tempGOArray[1]!wurd hinzugefuegt mit $tempGOArray[4]!.\n";
}
}
close (GODATA);
open (OUTPUT, ">$outputFile") or die "Cannot write to $outputFile...\n"; foreach my $uniprotIDofCluster (@usedIDs) { my $numberOfCurrentHits=0;
if (exists $gOAssocs{$uniprotIDofCluster}){ print OUTPUT "$uniprotIDofCluster\t"; print "$uniprotIDofCluster\t"; foreach my $currentGO (keys $gOAssocs{$uniprotIDofCluster}){ print "$currentGO\t"; print OUTPUT "$currentGO\t";
} print "\n"; print OUTPUT "\n";
}
} close (OUTPUT);
</source>
R script for the e-Value and identity distributions (yep, everything hardcoded^^).
<source lang="bash">
data1<-read.table("blastpgp_h0.002_j2_10000.last.stats");
data2<-read.table("blastpgp_h0.002_j10_10000.last.stats");
data3<-read.table("blastpgp_h1e-10_j2_10000.last.stats");
data4<-read.table("blastpgp_h1e-10_j10_10000.last.stats");
data5<-read.table("blast_default_1500.out.stats");
data6<-read.table("hhblits_default_600.out.stats");
data7<-read.table("blastpgp_big_h0.002_j2_100000.last.stats"); data8<-read.table("blastpgp_big_h0.002_j10_100000.last.stats"); data9<-read.table("blastpgp_big_h1e-10_j2_100000.last.stats"); data10<-read.table("blastpgp_big_h1e-10_j10_100000.last.stats");
png(filename="eval_psi_80.png", height=400, width=800, bg="white");
cols = rainbow(4, alpha=0.6);
br = seq(-150,0,1);
hist(log10(data1$V2), col=cols[1], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", main="PSI-Blast (big80)");
hist(log10(data2$V2), col=cols[2], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE);
hist(log10(data3$V2), col=cols[3], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE);
hist(log10(data4$V2), col=cols[4], breaks=br, ylim=range(0,150), xlab="log10(E-Value)", add=TRUE);
legend(-150,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols);
dev.off();
png(filename="eval_blast_80.png", height=400, width=800, bg="white");
hist(log10(data5$V2), col="blue", breaks=100, xlab="log10(E-Value)", main="Blast (big80)");
dev.off();
png(filename="eval_hhblits.png", height=400, width=800, bg="white");
hist(log10(data6$V2), col="blue", breaks=50, xlab="log10(E-Value)", main="HHblits (Uniprot20)");
dev.off();
png(filename="ident_psi_80.png", height=400, width=800, bg="white");
cols2 = rainbow(4, alpha=0.6);
br2 = seq(0,100,1);
hist(data1$V3, col=cols[1], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", main="PSI-Blast (big80)");
hist(data2$V3, col=cols[2], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE);
hist(data3$V3, col=cols[3], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE);
hist(data4$V3, col=cols[4], breaks=br2, xlim=range(0,100), ylim=range(0,150), xlab="Identity (%)", add=TRUE);
legend(60,100,c('h=0.002, j=2','h=0.002, j=10','h=1e-10, j=2','h=1e-10, j=10'),col=cols,fill=cols,border=cols);
dev.off();
png(filename="ident_blast_80.png", height=400, width=800, bg="white");
hist(data5$V3, col="blue", breaks=100, xlim=range(0,100), xlab="Identity (%)", main="Blast (big80)");
dev.off();
png(filename="ident_hhblits.png", height=400, width=800, bg="white");
hist(100*(data6$V3), col="blue", breaks=50, xlim=range(0,100), xlab="Identity (%)", main="HHblits (Uniprot20)");
dev.off();
</source>