Sequence Alignments Protocol TSD

From Bioinformatikpedia
Revision as of 11:07, 7 May 2012 by Meiera (talk | contribs) (Find unique matches: example)

Blast

blastall -p blastp -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -b 1200 -v 1200 > /mnt/home/student/meiera /1_SeqAli/1_SeqSearch/blastall_1200alis.out

PSI-Blast

Big80

time blastpgp -C blastpgp1200_pssmdefault -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_default.out
time blastpgp -C blastpgp1200_pssmit2e002 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 2 -h "0.002" -v 1200 -b 1200 >  /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it2e002.out
time blastpgp -C blastpgp1200_pssmit10e002 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 10 -h "0.002" -v 1200 -b 1200 >  /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it10e002.out
time blastpgp -C blastpgp1200_pssmit10e-10 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 10 -h "10E-10" -v 1200 -b 1200 >  /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it10e-10.out
time blastpgp -C blastpgp1200_pssmit2e-10 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 2 -h "10E-10" -v 1200 -b 1200 >  /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it2e-10.out

Big

PAT="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch"
time blastpgp -R $PAT/blastpgp1200_pssmit2e002 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e002.out
time blastpgp -R $PAT/blastpgp1200_pssmit10e002 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it10e002.out
time blastpgp -R $PAT/blastpgp1200_pssmit2e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e-10.out
time blastpgp -R $PAT/blastpgp1200_pssmit10e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it10e-10.out

Count unique matches: example

head -1 m8blastpgp1200_it2e002.out
grep -n G1RUL9 m8blastpgp1200_it2e002.out
tail -n +1233 m8blastpgp1200_it2e002.out | cut -f 2 | wc -l
tail -n +1233 m8blastpgp1200_it2e002.out | uniq -w 44 | wc -l

HHblits

WD="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch"
time hhblits -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_stdout.log
time hhblits -i $WD/P06865.fasta -n 10 -o $WD/hhblits_460_P06865_10it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_10_stdout.log
time hhblits -i $WD/P06865.fasta -n 10 -o $WD/hhblits_2500_P06865_10it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 2500 -B 2500 > $WD/hhblits_2500_P06865_10_stdout.log
time hhblits -i $WD/P06865.fasta -n 2 -o $WD/hhblits_460_P06865_2it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_2_stdout.log
time hhblits -i $WD/P06865.fasta -n 2 -o $WD/hhblits_2500_P06865_2it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 2500 -B 2500 > $WD/hhblits_2500_P06865_2_stdout.log
time hhsearch -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/pdb70_current_hhm_db -Z 14 -B 14 > $WD/hhblits_pdb_1500_P06865_stdout.log


Preparation of outputs

Sequence identity and e-values

#for both Blast and HHblits output
/mnt/home/student/meiera/bin/1_parseidentity.pl
Usage: 1_parseidentity.pl [flags] file...
flags:
   -i Infile   reads the input from fileIN 
   -o OutFile  writes output to fileOUT 
   -h	Input is hhblits, default is blast

Uniprot sets for Venn diagrams

#for both Blast and HHblits output, also computation of the unique ids for HHblits
 /mnt/home/student/meiera/bin/1_getUniprotIDS.pl
Usage: 1_getUniprotIDS.pl [flags] file...
flags:
   -i Infile   reads the input from fileIN 
   -o OutFile  writes uniprot identifiers to fileOUT 
   -h = hhblits input default is set to blastoutput
   -m mappingfile   uniprot mappingfile

Build PDB sets

Unique identifiers for Blast and HHblits

/mnt/home/student/meiera/bin/1_forPdbMapping.pl
Usage: 1_forPdbMapping.pl [flags] file...
flags:
   -i Infile   reads the input from fileIN 
   -o OutFile  writes uniprot identifiers to fileOUT 
   -h = hhblits input, default is set to blastoutput


#Use uniprot.org's online mapping service on the .pdbmapping files and save the results to .mapped files
cut -f 2 psiblastBIGit10e002.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" >  psiblastBIGit10e002.finpdb1
grep -P "\w{4}_" psiblastBIGit10e002.pdbmapping --only-matching | sed "s/_//" >  psiblastBIGit10e002.finpdb2
cat psiblastBIGit10e002.finpdb2 psiblastBIGit10e002.finpdb1 | sort | uniq > psiblastBIGit10e002.finpdb
cut -f 2  psiblastBIGit10e-10.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" >   psiblastBIGit10e-10.finpdb1
grep -P "\w{4}_" psiblastBIGit10e-10.pdbmapping --only-matching | sed "s/_//" >  psiblastBIGit10e-10.finpdb2
cat psiblastBIGit10e-10.finpdb1 psiblastBIGit10e-10.finpdb2 | sort | uniq > psiblastBIGit10e-10.finpdb
cut -f 2 psiblastBIGit2e002.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" >  psiblastBIGit2e002.finpdb1
grep -P "\w{4}_" psiblastBIGit2e002.pdbmapping --only-matching | sed "s/_//" >  psiblastBIGit2e002.finpdb2
cat psiblastBIGit2e002.finpdb1 psiblastBIGit2e002.finpdb2 | sort | uniq > psiblastBIGit2e002.finpdb
cut -f 2 psiblastBIGit2e-10.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" >  psiblastBIGit2e-10.finpdb1
grep -P "\w{4}_" psiblastBIGit2e-10.pdbmapping --only-matching | sed "s/_//" >  psiblastBIGit2e-10.finpdb2
cat psiblastBIGit2e-10.finpdb1 psiblastBIGit2e-10.finpdb2 | sort | uniq > psiblastBIGit2e-10.finpdb


cut -f 2 hhblits__460.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" > hhbtemp
sed "s/_\w//g" hhblits_pdb_1500.pdbmapping > hhbtemp2
cat hhbtemp hhbtemp2 | sed "s/\s//g" |  sort | uniq > hhblits.finpdb
rm -f hhbtemp hhbtemp2
# dont' have to do the special stuff here, the normal one doesn't find any because it's only on uniprot

COPS Parsing

The following script will read the COPS clusters (-c) and print out for every classification the members of the cluster that one or several IDs of interest (-i) are contained in. Additionally, if given a list of hits (-hi), it will count the number of hits that fall in the according cluster and the ones that don't and print out that information to a file for plotting e.g. with R. Note that you can ignore chains with the '-nc' option.

Also be aware that ignoring chains might lead to a merge of clusters. That is if given xxxx als protein of interest and xxxx,A as well was xxxx,B are contained in COPS, the final cluster which is used to calculate how many hits fall into it, will be a combination of the cluster from xxxx,A and the one from xxxx,B. That of course might be overestimating the number of hits that fall into a cluster.

Sample call: ./parseCOPSClusters.pl -c ~/Desktop/COPS-ChainHierarchy.txt -i ./interestIds -nc -hi ./union.finpdb

#!/usr/bin/env perl

use strict;
use warnings;
use sigtrap;
use autodie;
use diagnostics;
use Pod::Usage;
use Getopt::Long qw(:config require_order);
use 5.010;    #For smart match, which makes this script unbelievably slow, but I'm lazy :)

##Program variables
my %clusters;
my @clusterOrder;
my @interestIDs = ();
my %interestIDLocations;
my @hitsIDs = ();

##Argument
my $copsPath     = '/mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt';
my $nochain      = 0;
my $interestPath = './interestIds';
my $hitsPath     = './union.finpdb';

########################
###Argument Handling####
########################
my $result = GetOptions(
    'help|h|?'     => sub { pod2usage( -verbose => 1 ); },
    'man|m'        => sub { pod2usage( -verbose => 2 ); },
    'c|cops=s'     => \$copsPath,
    'nc|nocain'    => \$nochain,
    'i|interest=s' => \$interestPath,
    'hi|hits=s'    => \$hitsPath
) or pod2usage( -verbose => 1 ) && exit;

#Read IDs to look out for
my $fhi;
open( $fhi, '<', $interestPath );
while ( my $line = <$fhi> )
{
    if ( $line =~ m/(\w{4},\w)/ || $line =~ m/(\w{4})/ )
    {
        my $iid = lc($1);
        push( @interestIDs, $iid );
        $interestIDLocations{$iid} = {};
    }
}

close($fhi);

#Read ids that were found by prediction method(s)
my $fhh;
open($fhh, '<', $hitsPath);
while(my $line =<$fhh>)
{
    if($line =~ m/^(\w{4})$/)
    {
        push(@hitsIDs, lc($1));
    }
}
close($fhh);

print "Found # hits: " . scalar(@hitsIDs) . "\n";

#Parse the COPS structure
my $fh;
open( $fh, '<', $copsPath );
while ( my $line = <$fh> )
{
    if ( $line =~ m/^Chain (.+)$/ )
    {
        #Header line found. Inititialize the cluster hash
        my @matches = $1 =~ m/\w+/g;
        for my $clusterName (@matches)
        {
            $clusters{$clusterName} = {};
        }

        @clusterOrder = @matches;
    }
    elsif ( $line =~ m/\w{4},/ )
    {
        #Cluster info line
        my @matches = $line =~ m/(\w{4},\w)/g;
        my $currentId = shift(@matches);

        #         print "current id is $currentId\n";
        $currentId = sanitizePdbIDs($currentId);

        my $count = 0;

        for my $representative (@matches)
        {
            $representative = sanitizePdbIDs($representative);

            my $currentCluster = $clusterOrder[ $count++ ];

            #             print "currentCluster is $currentCluster\n";

            if ( !exists $clusters{$currentCluster} || !defined $clusters{$currentCluster} )
            {
                print "this should not have happened :)\n";
                next;
            }
            if ( !exists $clusters{$currentCluster}{$representative} )
            {

                #Inititialize the cluster with the representative
                $clusters{$currentCluster}{$representative} = [$representative];
            }

            #Add the current pdb id

            if ( !( $currentId ~~ @{ $clusters{$currentCluster}{$representative} } ) )
            {

                #This case can only occur when we do not use the chains
                push( @{ $clusters{$currentCluster}{$representative} }, $currentId );
            }

            if ( $currentId ~~ @interestIDs )
            {
                if(!defined $interestIDLocations{$currentId}{$currentCluster})
                {
                    $interestIDLocations{$currentId}{$currentCluster}= ();
                }
                push(@{$interestIDLocations{$currentId}{$currentCluster} }, $representative);
            }
        }
    }
}
close($fh);


#Do some maybe interesting output
for my $intId (@interestIDs)
{
    printClustersForId($intId);
}

sub printClustersForId
{
    my $id = shift;
    
    my @allIn = ();
    my @allOut = ();
    
    for my $cluster ( sort(keys( $interestIDLocations{$id} ) ) )
    {        
        my %clusterMembersMerged = ();
        
        print "MEMBERS: $id -- $cluster : ";
        my @currentRepresentatives = @{$interestIDLocations{$id}{$cluster}};
        
        for my $currentRepresentative (@currentRepresentatives)
        {
            my @clusterMembers        = @{ $clusters{$cluster}{$currentRepresentative} };

            for my $member (@clusterMembers)
            {
#                 print "$member ";
                $clusterMembersMerged{$member} = ;
            }
#             print "\n"; 
        }
        
        my @clusterMembersMergedArr= keys(%clusterMembersMerged);
        
        for my $e (@clusterMembersMergedArr)
        {
            print $e." ";
        }
        print "\n";
        
        
        my $hitsIn = 0;
        my $hitsOut = 0;
        
        print "OVERLAPS: $id -- $cluster : ";
        for my $hit (@hitsIDs)
        {
            if($hit ~~ @clusterMembersMergedArr)
            {
                $hitsIn++;
            }
            else
            {
                $hitsOut++;
            }
        }
        
        print "In: $hitsIn, Out: $hitsOut\n";     
        push(@allIn, $hitsIn);
        push(@allOut, $hitsOut);
    }
    
    #Write Out In/Out over Cluster stats for plotting in R
    my $fho;
    open($fho, '>', "stats$id.plotme");
    for my $cluster (sort (keys( $interestIDLocations{$id})))
    {
        print $fho "\t$cluster";
    }
    print $fho "\n";
    
    print $fho "IN";
    for my $elem (@allIn)
    {
        print $fho "\t$elem";
    }
    print $fho "\n";
    
    print $fho "OUT";
    for my $elem (@allOut)
    {
        print $fho "\t$elem";
    }
    print $fho "\n";
    
    close($fho);    
}



sub sanitizePdbIDs
{
    my $id = shift;
    if ($nochain)
    {
        $id = substr( $id, 0, 4, );
    }
    return $id;
}

Plotting

Distribution of hits that full in and out of clusters

library(gplots)     #Needs caTools, bitops and a few more. Gives us: barplot2

cbarnames <- 2.5
clwd <- 5
clwdaxis <- 20
ccaxis <- 2

maindat <- read.table('stats2gjx.plotme')
mainmat <- as.matrix(maindat, header=TRUE)

lab <- c("IN", "OUT")

 lab <- row.names(mainmat)
        len <- length(lab)

        method_col <- c("red", "blue")

png("./tsd_inoutdistri.png", height=800, width=800)
# par(xpd=T, mar=par()$mar+c(0,0,0,13))
barplot2(mainmat, beside=TRUE, col=method_col, cex.axis=ccaxis, cex.names=cbarnames, xpd=F, plot.ci=FALSE,cex.main=cbarnames, main="Overlap of union PDB hits with COPS Clusters")        #names.arg=c("IN","OUT","IN","OUT","IN","OUT","IN","OUT","IN","OUT")
legend(4,70, lab, fill=method_col, cex=2, ncol=1)
par(mar=c(5, 4, 4, 2) + 0.1)    # Restore default clipping rect
dev.off()

Overlap of PDB hits found by the different methods

library("VennDiagram")
main1<-"Overlap of PDB hits"
sub1<-"BIG database"

hhb <- read.table("data/hhblits.finpdb")
psiit10e002 <- read.table("data/psiblastBIGit10e002.finpdb")
psiit10e10<-read.table("data/psiblastBIGit10e-10.finpdb")
psiit2e002<-read.table("data/psiblastBIGit2e002.finpdb")    # psiit2e10<-read.table("data/psiblastBIGit2e-10.finpdb")

psi=list("HHblits"=hhb[,1], "It10, Ev10e-10"=psiit10e10[,1],"It10, Ev002"=psiit10e002[,1],"It2, Ev10e-10/Ev002"=psiit2e002[,1])


venn.diagram(psi,"pdbHitsOverlap.tif", scaled = TRUE,col=c("darkblue", "darkgreen", "orange","darkorchid4"), fill=c("cornflowerblue", "green", "yellow","darkorchid1"), 
             fontfamily = "serif", fontface = "bold", cat.col = c("darkblue", "darkgreen", "orange","darkorchid4"),cat.cex = 1.5, cat.pos = 0, cat.fontfamily = "serif", 
             rotation.degree = 200, main=main1, sub=sub1,main.cex=2,sub.cex=1.3,lwd=1,euler.d=2,lty=1, cat.dist=c(0.1,0.1,0.13,0.1)) 


e-Value and identity distributions

ccmain <- 1.7
ccaxis <- 2
cclab <- 2

col1 <- "#0000ff"
col2 <- "#ff0000"
col3 <- "#00ff00"
col4 <- "#808080"
col1t <- "#0000FF80"
col2t <- "#FF000080"
col3t <- "#00ff0080"
col4t <- "#80808080"

###Blast
d <- read.table("./temp/blastall1200_IdEvfixed.plot")
d_id <- d$V1
d_ev <- d$V2

#Plot identities
png(width=600, height=600, "tsd_blastallIdent.png")
par(mar=par()$mar+c(0,1,-1,0))
hist(d_id, breaks=100,xlim=c(0,100), main="Blast - Histogram of identities",xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1)
dev.off()

#Plot evalues
png(width=600, height=600, "tsd_blastallEval.png")
par(mar=par()$mar+c(0,1,-1,0))
hist(log(d_ev), breaks=100, main="Blast - Histogram of eValue distribution",xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1)
dev.off()

###Psi-Blast normal
# #Iterations e0002
# d1 <- read.table("./temp/blastpgp1200_it10e002_IdEvfixed.plot")
# d2 <- read.table("./temp/blastpgp1200_it2e002_IdEvfixed.plot")
# 
# d1_id <- d1$V1
# d1_ev <- d1$V2
# d2_id <- d2$V1
# d2_ev <- d2$V2
# 
# 
# lab = c("2 iterations", "10 iterations")
# 
# png(width=600, height=600, "tsd_psiblastEvals_Iterations_e002.png")
# par(mar=par()$mar+c(0,1,0,0))
# hist(log(d1_ev), breaks=100,xlim=c(-430,0), ylim=c(0,70), main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 0.002, BIG80 database", cex=1.4)
# hist(log(d2_ev), breaks=100,xlim=c(-430,0) ,ylim=c(0,70), col=col2t, add=T)
# legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()
# 
# png(width=600, height=600, "tsd_psiblastIdent_Iterations_e002.png")
# par(mar=par()$mar+c(0,1,0,0))
# hist(d1_id, breaks=100, xlim=c(0,160),ylim=c(0,200), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 0.002, BIG80 database", cex=1.4)
# hist(d2_id, breaks=100,ylim=c(0,160), col=col2t, add=T)
# legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()
# 
# #Iterations 10e-10
# d1 <- read.table("./temp/blastpgp1200_it10e-10_IdEvfixed.plot")
# d2 <- read.table("./temp/blastpgp1200_it2e-10_IdEvfixed.plot")
# 
# d1_id <- d1$V1
# d1_ev <- d1$V2
# d2_id <- d2$V1
# d2_ev <- d2$V2
# 
# lab = c("2 iterations", "10 iterations")
# 
# png("tsd_psiblastEvals_Iterations_10e-10.png", width=600, height=600)
# par(mar=par()$mar+c(0,1,0,0))
# hist(log(d1_ev), breaks=100, ylim=c(0,70), xlim=c(-430,0) ,main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 10e-10, BIG80 database", cex=1.4)
# hist(log(d2_ev), breaks=100, ylim=c(0,70),xlim=c(-430,0) ,col=col2t, add=T)
# legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()
# 
# png(width=600, height=600, "tsd_psiblastIdent_Iterations_10e-10.png")
# par(mar=par()$mar+c(0,1,0,0))
# hist(d1_id, breaks=100, xlim=c(0,100), ylim=c(0,180), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 10e-10, BIG80 database", cex=1.4)
# hist(d2_id, breaks=100, col=col2t, ylim=c(0,180),add=T)
# legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()
d1 <- read.table("./temp/blastpgp1200_it2e002_IdEvfixed.plot")
d2 <- read.table("./temp/blastpgp1200_it10e002_IdEvfixed.plot")
d3 <- read.table("./temp/blastpgp1200_it2e-10_IdEvfixed.plot")
d4 <- read.table("./temp/blastpgp1200_it10e-10_IdEvfixed.plot")

d1_id <- d1$V1
d1_ev <- d1$V2
d2_id <- d2$V1
d2_ev <- d2$V2

d3_id <- d3$V1
d3_ev <- d3$V2
d4_id <- d4$V1
d4_ev <- d4$V2

lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10")
br <- seq(-450,0,5)

png("tsd_psiblastEvals_allfour.png", width=600, height=600)
par(mar=par()$mar+c(0,1,0,0))
hist(log(d1_ev), breaks=br, ylim=c(0,70), xlim=c(-450,0) ,main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
mtext("BIG80 database", cex=1.4)
hist(log(d2_ev), breaks=br, ylim=c(0,70),xlim=c(-450,0) ,col=col2t, add=T)
hist(log(d3_ev), breaks=br, ylim=c(0,70),xlim=c(-450,0) ,col=col3t, add=T)
hist(log(d4_ev), breaks=br, ylim=c(0,70),xlim=c(-450,0) ,col=col4t, add=T)
legend("topright", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1)
dev.off()


lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10")
br <- seq(0,100,2.5)

png("tsd_psiblastIdents_allfour.png", width=600, height=600)
par(mar=par()$mar+c(0,1,0,0))
hist(d1_id, breaks=br , ylim=c(0,400),main="PSI-Blast - Histogram of identities", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
mtext("BIG80 database", cex=1.4)
hist(d2_id, breaks=br,col=col2t, add=T)
hist(d3_id, breaks=br, col=col3t, add=T)
hist(d4_id, breaks=br, col=col4t, add=T)
legend("topright", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1)
dev.off()



###PSI-Blast BIG
d1 <- read.table("./temp/blastpgpBIG_3800_it2e002_IdEvfixed.plot")
d2 <- read.table("./temp/blastpgpBIG_3800_it10e002_IdEvfixed.plot")
d3 <- read.table("./temp/blastpgpBIG_3800_it2e-10_IdEvfixed.plot")
d4 <- read.table("./temp/blastpgpBIG_3800_it10e-10_IdEvfixed.plot")


d1_id <- d1$V1
d1_ev <- d1$V2
d2_id <- d2$V1
d2_ev <- d2$V2

d3_id <- d3$V1
d3_ev <- d3$V2
d4_id <- d4$V1
d4_ev <- d4$V2

lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10")
br <- seq(-450,0,5)

png("tsd_psiblastBIGEvals_allfour.png", width=600, height=600)
par(mar=par()$mar+c(0,1,0,0))
hist(log(d1_ev), breaks=br, ylim=c(0,260), xlim=c(-450,0) ,main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
mtext("BIG database", cex=1.4)
hist(log(d2_ev), breaks=br, ylim=c(0,260),xlim=c(-450,0) ,col=col2t, add=T)
hist(log(d3_ev), breaks=br, ylim=c(0,260),xlim=c(-450,0) ,col=col3t, add=T)
hist(log(d4_ev), breaks=br, ylim=c(0,260),xlim=c(-450,0) ,col=col4t, add=T)
legend("topleft", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1)
dev.off()

lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10")
br <- seq(0,100,2.5)

png("tsd_psiblastBIGIdents_allfour.png", width=600, height=600)
par(mar=par()$mar+c(0,1,0,0))
hist(d1_id, breaks=br , ylim=c(0,1100),main="PSI-Blast - Histogram of identities", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
mtext("BIG database", cex=1.4)
hist(d2_id, breaks=br,col=col2t, add=T)
hist(d3_id, breaks=br, col=col3t, add=T)
hist(d4_id, breaks=br, col=col4t, add=T)
legend("topright", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1)
dev.off()


# #Iterations e0002
# d1 <- read.table("./temp/blastpgpBIG_3800_it10e002_IdEvfixed.plot")
# d2 <- read.table("./temp/blastpgpBIG_3800_it2e002_IdEvfixed.plot")
# 
# d1_id <- d1$V1
# d1_ev <- d1$V2
# d2_id <- d2$V1
# d2_ev <- d2$V2
# 
# 
# lab = c("2 iterations", "10 iterations")
# 
# png(width=600, height=600, "tsd_psiblastBIGEvals_Iterations_e002.png")
# par(mar=par()$mar+c(0,1,0,0))
# hist(log(d1_ev), breaks=100, xlim=c(-430,0) ,ylim=c(0,200),main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 0.002, BIG database", cex=1.4)
# hist(log(d2_ev), breaks=100, xlim=c(-430,0) ,ylim=c(0,200),col=col2t, add=T)
# legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()
# 
# png(width=600, height=600, "tsd_psiblastBIGIdent_Iterations_e002.png")
# par(mar=par()$mar+c(0,1,0,0))
# hist(d1_id, breaks=100, xlim=c(0,100), ylim=c(0,430), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 0.002, BIG database", cex=1.4)
# hist(d2_id, breaks=100, col=col2t,ylim=c(0,430), add=T)
# legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()
# 
# #Iterations 10e-10
# d1 <- read.table("./temp/blastpgpBIG_3800_it10e-10_IdEvfixed.plot")
# d2 <- read.table("./temp/blastpgpBIG_3800_it2e-10_IdEvfixed.plot")
# 
# d1_id <- d1$V1
# d1_ev <- d1$V2
# d2_id <- d2$V1
# d2_ev <- d2$V2
# 
# lab = c("2 iterations", "10 iterations")
# 
# png(width=600, height=600, "tsd_psiblastBIGEvals_Iterations_10e-10.png")
# par(mar=par()$mar+c(0,1,0,0))
# hist(log(d1_ev), breaks=100, xlim=c(-430,0) ,ylim=c(0,280),main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 10e-10, BIG database", cex=1.4)
# hist(log(d2_ev), breaks=100, xlim=c(-430,0), ylim=c(0,280),col=col2t, add=T)
# legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()
# 
# png(width=600, height=600, "tsd_psiblastBIGIdent_Iterations_10e-10.png")
# par(mar=par()$mar+c(0,1,0,0))
# hist(d1_id, breaks=100, xlim=c(0,100),ylim=c(0,400), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t)     
# mtext("Profile inclusion threshold 10e-10, BIG database", cex=1.4)
# hist(d2_id, breaks=100, col=col2t, ylim=c(0,400),add=T)
# legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1)
# dev.off()

###Hblits
d <- read.table("./temp/hhblits_460_P06865_IdEvfixed.plot")
d_id <- d$V1
d_ev <- d$V2

#Plot identities
png(width=600, height=600, "tsd_hhblitsIdent.png")
par(mar=par()$mar+c(0,1,-1,0))
hist(d_id, breaks=100, xlim=c(0,100), main="HHblits - Histogram of identities",xlab="% sequence identity in HHblits alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1)
dev.off()

#Plot evalues
png(width=600, height=600, "tsd_hhblitsEval.png")
par(mar=par()$mar+c(0,1,-1,0))
hist(log(d_ev), breaks=100, main="HHblits - Histogram of eValue distribution",xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1)
dev.off()

MSA

Dataset creation

# Get details for the pdb proteins union
grep -f  pdbSets/union.finpdb  m8blastpgpBIG_3800*.out > pdbunionBlast

Runs

p="/mnt/home/student/meiera/1_SeqAli/2_MSA"
export PATH=$PATH:/mnt/opt/T-Coffee/bin/
clustalw -align -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_msaset_40.aln
clustalw -align -infile=$p/datasets/msaset_60.fasta -outfile=$p/clustalw_msaset_60.aln
clustalw -align -infile=$p/datasets/msaset_all.fasta -outfile=$p/clustalw_msaset_all.aln
clustalw -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_TESTmsaset_40.aln
muscle -in $p/datasets/msaset_40.fasta -out $p/muscle_msaset_40.msa -quiet
muscle -in $p/datasets/msaset_60.fasta -out $p/muscle_msaset_60.msa -quiet
muscle -in $p/datasets/msaset_all.fasta -out $p/muscle_msaset_all.msa -quiet
t_coffee $p/datasets/msaset_40.fasta -outfile tcdef_40.aln
t_coffee $p/datasets/msaset_60.fasta -outfile tcdef_60.aln
t_coffee $p/datasets/msaset_all.fasta -outfile tcdef_all.aln
t_coffee $p/datasets/msaset_40.fasta -method sap_pair -template_file 3nsn_A -outfile tc3d_40.aln
t_coffee $p/datasets/msaset_60.fasta -method sap_pair -template_file 2gjx_A -outfile tc3d_60.aln
t_coffee $p/datasets/msaset_all.fasta -method sap_pair -template_file 3lmy_A -outfile tc3d_all.aln