Difference between revisions of "Fabry:Sequence alignments (sequence searches and multiple alignments)/Scripts"

From Bioinformatikpedia
(Results)
(Results)
Line 527: Line 527:
 
}
 
}
 
 
print "\n";
+
print "$infile\n";
 
print "\n";
 
print "\n";
 
 

Revision as of 12:06, 6 May 2012

Please see Task 2 for our line of action on this topic. The results are located at Task 2 results


Sequence searches

>compare_GO_terms.pl
#!/usr/bin/perl

use LWP::UserAgent;


#reference..."OUR" protein
our %reference_hash = ();

$ua = LWP::UserAgent->new;
$req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $ARGV[0] . "&format=tsv");
$res = $ua->request($req,'annotation.tsv');

open (FILE, '<annotation.tsv');
$head=<FILE>;
while (<FILE>) {
	chomp;
	($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/);
	#make GO terms unique by writing them into hash
	$reference_hash{$goID} = "";
}

	#print "@{[ %reference_hash ]}\t" . "\n\n";
my $size_ref_hash = scalar(keys %reference_hash);
	
#comparing to...
$filename = $ARGV[1];
$filename =~ /(.*)\..*/;
$outname = $1 . '_comparison.txt';
open(IDS,"<".$filename);
open(OUT,">".$outname);

while(<IDS>) {

	our %compare_hash = ();

	@array = split(/\s/);
	pop(@array);
	for ($i=1;$i<@array;$i++){
		$compare_hash{$array[$i]} = "";
	}
	
	#print "@{[ %compare_hash ]}\t". "\n";
	my @common = grep  exists $reference_hash{ $_ }, keys %compare_hash;
	
	my $size_common = scalar(@common);
	my $size_comp_hash = scalar(keys %compare_hash);

	print OUT $size_common / $size_ref_hash  ."\t";
	if ( $size_comp_hash > 0)  	{ print OUT $size_common / $size_comp_hash ."\n"; }
	else 			{ print OUT "0\n"; }
}
>download-annotation.pl
#!/usr/bin/perl


use LWP::UserAgent;
	
$filename = $ARGV[0];
$filename =~ /(.*)\..*/;
$outname = $1 . '_GOterms.tsv';
open IDS, $filename or die $!;
open (OUT, '>' . $outname);

$result = "";

while(<IDS>){
	chomp;
	$_ =~ s/\.//gi;
	$ua = LWP::UserAgent->new;
	$req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $_ . "&format=tsv");
	$res = $ua->request($req,'annotation.tsv');
	 
	%hash = ();
	open (FILE, '<annotation.tsv');
	print OUT "$_\t";
	$head=<FILE>;
	while (<FILE>) {
		chomp;
		($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/);

		#make GO terms unique by writing them into hash
		$hash{$goID} = "";
	}
	print OUT "@{[ %hash ]}\t";
	print OUT "\n";
	close (FILE);
}

close (OUT);
close IDS;

Blast

> extract_ids_blast.pl
#!/bin/perl

my $Z = 0.003;
my $filename	= "blastsearch_default_v700.out";
my $outname	= "blastsearch_default_v700_ids.txt";
my $outEval	= "blastsearch_default_v700_Evalues.txt";
open IN, $filename or die $!;
#open(OUT, ">" . $outname);
open(OUT2, ">" . $outEval);

while(<IN>) {
	if($_ =~ /^(tr|sp)/){
		#print $_;
		chomp;
		$_ =~ /.+\|(.+)\|.+(.{5})/;
		$store_1 = $1;
		$store_2 = $2;
		$Evalue = $2;		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;

		if( $Evalue > 0.003 ) {}
		else {
#			print OUT  $store_1 . "\n";
			if($Evalue eq  "0.0"){
				print OUT2 "-Inf\n";
			}
			elsif($Evalue =~ /(.*)e(.+)/){
				$Evalue =~ /(.*)e(.+)/;
				print OUT2 $2 ."\n";
			}
			else {
				print OUT2 log($Evalue)/log(10) . "\n";
			}
		}
	}
}


> parse_blast.pl
#!/usr/bin/perl

use strict;
use warnings;

my $bool = 0;
my $ID;
my $Length;
my $Evalue;
my $Identities;
my $Positives;

open(IN, "< " .$ARGV[0]) or die $!;
open(OUT, "> " .$ARGV[0] . "_ident_cov.tsv");
print OUT "ID\tLength\tEvalue\tIdentities\tPositives\n";
while (<IN>) {
	chomp;
	if($_ =~ /^>/) {
		$_ =~ />.+\|(.+)\|.+/;
		$ID = $1;
		$bool = 1;
	}
	elsif($_ =~ /^\s+Length/ && $bool == 1){
		$_ =~ /\s+Length\ =\ (\d+)/;
		$Length = $1;
	}
	elsif ($_ =~ /^\s*Score\ =/ && $bool == 1){
		$_ =~ /\s*Score\s+=\s+.+,\s+Expect\s+=\s+(.+?),\s+Method: .+/;
		$Evalue = $1;		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;
		if($Evalue eq  "0.0"){
			$Evalue = "-Inf";
		}
		elsif($Evalue =~ /(.*)e(.+)/){
			$Evalue =~ /(.*)e(.+)/;
			$Evalue = $2;
		}
		else {
			$Evalue = log($Evalue)/log(10);
		}
	}
	elsif ($_ =~ /^\s*Identities/ && $bool == 1){
		if($_ =~ /\s+Identities.+\((.+)\%\).+Positives.+\((.+)\%\).+Gaps.+\((.+)\%\)/){
		$Identities = $1;
		$Positives = $2;
		}
		elsif($_ =~ /\s*Identities.+\((\d+)\%\).+Positives.+\((\d+)\%\).*/){
		$Identities = $1;
		$Positives = $2;	
		}
		if( $Evalue < -2.39794000867204 ) {
			print OUT "$ID\t$Length\t$Evalue\t$Identities\t$Positives\n";
			$bool = 0;
		}	
	}
}
close(IN);


>hist_blast.r

table <- read.table(file="blastsearch_default_v700_ids_GOterms_comparison.txt")

png(file="blastsearch_default_v700_ids_GOterms_comparison.png", width = 480, height = 700)
layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3))

hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280", xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4")
text(0.1,500,expression("mean:"))
text(0.23,500,round(mean(table[,1]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms blast search",  xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4")
text(0.1,500,expression("mean:"))
text(0.23,500,round(mean(table[,2]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

dev.off()


ip <- read.table(file="blastsearch_default_v700.out_ident_cov.tsv", header = T)

png(file="blastsearch_default_v700_Evalues.png", width = 480, height = 400)
hist(ip$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, blastsearch", xlim=c(-200,0) )
text(-130,80,round(mean(ip$Evalue[7:664]),4))
text(-160,80,expression("mean:"))

dev.off()

png(file="blastsearch_default_v700_Identities.png", width = 480, height = 400)
hist(ip$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of Identities, blastsearch", xlim=c(0,100) )
text(75,80,expression("mean:"))
text(90,80,round(mean(ip$Identities),4))
dev.off()

png(file="blastsearch_default_v700_Positives.png", width = 480, height = 400)
hist(ip$Positives, xlab="Positives in %", breaks=30, col = "dodgerblue",  main = "Histogramm of Positives, blastsearch", xlim=c(0,100), ylim = c(0,80) )
text(75,80,expression("mean:"))
text(90,80,round(mean(ip$Positives),4))

dev.off()


png(file="blastsearch_default_v700_Lenghts.png", width = 480, height = 400)
hist(ip$Length, xlab="Sequence length", breaks=30, col = "dodgerblue", ylim = c(0,250), xlim = c(0,2000), main = "Histogramm of sequence length, blastsearch" )

text(1000,180,expression("mean length:"))
text(1500,180,round(mean(ip$Length),4))
text(1000,160,expression("P06280 length:"))
text(1500,160,expression("429"))

dev.off()


Psi-Blast

HHblits

>extract_evalues_hhblits.pl
#!/bin/perl

my $Z = 0.003;
my $filename	= $ARGV[0];
my $outEval	= $ARGV[0] . "_Evalues.txt";
open IN, $filename or die $!;
open(OUT2, ">" . $outEval);

while(<IN>) {
	if($_ =~ /^\s*\d+\sUP20/){
# 		print $_;
		chomp;
		
			$_ =~ /.{41}(.{7}).+/;
			$Evalue = $1;
		
		#print $Evalue ."\t" . $_ ."\n";
		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;
			
		if( $Evalue > 0.003 ) {}
		else {
			if($Evalue eq  "0.0"){
				print OUT2 "-Inf\n";
			}
			elsif($Evalue =~ /(.*)E(.+)/){
				$Evalue =~ /(.*)E(.+)/;
				print OUT2 $2 ."\n";
			}
			else {
				print OUT2 log($Evalue)/log(10) . "\n";
			}
		}
	}
}
>parse_hhblits.pl
#!/usr/bin/perl

use strict;
use warnings;

my $bool = 0;
my $ID;
my $Evalue;
my $Identities;
my $Similarity;

open(IN, "< " .$ARGV[0]) or die $!;
open(OUT, "> " .$ARGV[0] . "_ident_cov.tsv");
print OUT "ID\tEvalue\tIdentities\tSimilarity\n";
while (<IN>) {
	chomp;
	if($_ =~ /^>/) {
		my @array = split('\|', $_);
		$array[4]  =~ s/\.$//;
		my @array2 = split('\ ', $array[4]);
		$ID = $array2[0];
		$bool = 1;
	}
	elsif ($_ =~ /^\s*Probab/ && $bool == 1){
		$_ =~ /Probab=.+E-value=(.+)\s+Score=.+Aligned_cols=.+Identities=(\d+).+Similarity=(.+)\s+Sum_probs=.+/;
		$Evalue = $1;
		
		$Identities = $2;
		$Similarity = $3;
		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;
		if($Evalue eq  "0.0"){
			$Evalue = "-Inf";
		}
		elsif($Evalue =~ /(.*)e(.+)/){
			$Evalue =~ /(.*)e(.+)/;
			$Evalue = $2;
		}
		else {
			$Evalue = log($Evalue)/log(10);
		}
		if( $Evalue < -2.39794000867204 ) {
			print OUT "$ID\t$Evalue\t$Identities\t$Similarity\n";
			$bool = 0;
		}	
	}
}
close(IN);


> hist_hhblits.R
table <- read.table(file="hhblits_default_ids_GOterms_comparison.txt")

png(file="hhblits_default_ids_GOterms_comparison.png", width = 480, height = 700)
layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3))

hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280\n(2 iterations - default)", xlab ="Percentage in common", xaxt = "n", ylim=c(0,200), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4")
text(0.1,200,expression("mean:"))
text(0.23,200,round(mean(table[,1]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms HHblits search\n(2 iterations - default)",  xlab ="Percentage in common", xaxt = "n", ylim=c(0,200), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4")
text(0.1,200,expression("mean:"))
text(0.23,200,round(mean(table[,2]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

dev.off()



a <- read.table(file="hhblits_default.out_Evalues.txt")

png(file="hhblits_default.out_Evalues.png", width = 480, height = 400)
hist(a[,1], xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, HHblits search\n(2 iterations - default)", xlim=c(-200,0) )
text(-100,80,round(mean(a[,1])),4)
text(-160,80,expression("mean:"))

dev.off()


ip_def <- read.table(file="hhblits_default.out_ident_cov.tsv", header = T)

png(file="hhblits_default_Evalues.png", width = 480, height = 400)
hist(ip_def$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, HHblits search\n(2 iterations - default)", xlim=c(-200,0) )
text(-130,80,round(mean(ip_def$Evalue),4))
text(-160,80,expression("mean:"))

dev.off()

png(file="hhblits_default_Identities.png", width = 480, height = 400)
hist(ip_def$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,40), main = "Histogramm of Identities, HHblits search\n(2 iterations - default)", xlim=c(0,100) )
text(75,40,expression("mean:"))
text(90,40,round(mean(ip_def$Identities),4))
dev.off()

png(file="hhblits_default_Similarity.png", width = 480, height = 400)
hist(ip_def$Similarity, xlab="Similarity", breaks=30, col = "dodgerblue",  main = "Histogramm of Similarity, HHblits search\n(2 iterations - default)", ylim = c(0,40), xlim=c(0,2) )
text(1.5,40,expression("mean:"))
text(1.8,40,round(mean(ip_def$Similarity),4))

dev.off()

# ####################################n8##############################################
table <- read.table(file="hhblits_n8_neu_ids_GOterms_comparison.txt")

png(file="hhblits_n8_neu_ids_GOterms_comparison.png", width = 480, height = 700)
layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3))

hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280\n(8 iterations)", xlab ="Percentage in common", xaxt = "n", ylim=c(0,400), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4")
text(0.8,380,expression("mean:"))
text(0.93,380,round(mean(table[,1]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms HHblits search\n(8 iterations)",  xlab ="Percentage in common", xaxt = "n", ylim=c(0,400), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4")
text(0.8,380,expression("mean:"))
text(0.93,380,round(mean(table[,2]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

dev.off()




ip_n8_neu <- read.table(file="hhblits_n8_neu.out_ident_cov.tsv", header = T)

png(file="hhblits_n8_neu_Evalues.png", width = 480, height = 400)
hist(ip_n8_neu$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,250), main = "Histogramm of E-values, HHblits search\n(8 iterations)", xlim=c(-200,0) )
text(-130,250,round(mean(ip_n8_neu$Evalue),4))
text(-160,250,expression("mean:"))

dev.off()

png(file="hhblits_n8_neu_Identities.png", width = 480, height = 400)
hist(ip_n8_neu$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,300), main = "Histogramm of Identities, HHblits search\n(8 iterations)", xlim=c(0,100) )
text(75,300,expression("mean:"))
text(90,300,round(mean(ip_n8_neu$Identities),4))
dev.off()

png(file="hhblits_n8_neu_Similarity.png", width = 480, height = 400)
hist(ip_n8_neu$Similarity, xlab="Similarity", breaks=30, col = "dodgerblue",  main = "Histogramm of Similarity, HHblits search\n(8 iterations)", ylim = c(0,150), xlim=c(0,2) )
text(1.5,150,expression("mean:"))
text(1.8,150,round(mean(ip_n8_neu$Similarity),4))

dev.off()

Comparison

Comparing the hits

Comparing the Evalues

  >R script all_Evalues.R
  #This R Script is based on Andrea's Rscript  psiBlast.evalueHist.Rscript
  # Thank you Andrea :) ... Julia
  
  
  library("animation")
  
  
  
  cols3 <- hcl(h = seq(30, by=300 / 3, length = 3), l = 65, alpha = 0.5)
  br <- seq(-180,15,1.25)
  Blast <- read.table("Blast/blastsearch_default_v700.out_ident_cov.tsv", header = T)
  Psi <- read.table("Blast/psi_results_10its_eVal_0.000000001.txt_ident_cov.tsv", header = T)
  HHBlits <- read.table("HHBlits/hhblits_n8_neu.out_ident_cov.tsv", header = T)
  
  
  saveMovie({
  for (i in 1:4) {
  	if(i == 1){
  		data <- Blast$Evalue
  	}
  	if(i == 2){
  		data <- Psi$Evalue
  	}
  	if(i == 3){
  		data <- HHBlits$Evalue
  	}
  	if(i < 4) {
  		hist(data ,border=cols3[i],col=cols3[i],breaks=br,main='E-values',panel.first = grid(),xlab='log_10(E- 
               value)',axes=TRUE,xlim=range(-200,1),ylim=range(0,200))
  		legend(-200,200,c("BLAST","Psi-BLAST","HHBlits"),col=cols3, pch = c("_","_","_"), lwd = 3)
  	}
  	if(i == 4) {
  		hist(Blast$Evalue,border=cols3[1],col=cols3[1],breaks=br,main='E-values',panel.first = grid(),xlab='log_10(E-
               value)',axes=TRUE,xlim=range(-200,1),ylim=range(0,200))
  	        hist(Psi$Evalue,border=cols3[2],col=cols3[2],breaks=br,xlim=range(-200,1),ylim=range(0,200),add=TRUE)	   
               hist(HHBlits$Evalue,border=cols3[3],col=cols3[3],breaks=br,xlim=range(-200,1),ylim=range(0,200),add=TRUE)
  		legend(-200,200,c("BLAST","Psi-BLAST","HHBlits"),col=cols3, pch = c("_","_","_"), lwd = 3)
  		Sys.sleep(5)   
  	}
  } 
  }, interval = 5, outdir = getwd())

Multiple sequence alignments

Dataset

Results

> countGaps.pl
#!usr/bin/perl

$infile = $ARGV[0];
open(IN, "<" . $infile);


$conserved = 0;
%hash = ();
	
if($infile =~ /^(msa\/clu|msa\/tco|msa\/3Dtco)/){
	while(<IN>) {
		chomp;
		if($_ =~ /^(tr|sp)/ ){
			@array = split(/\s+/, $_);		
			$count = ($array[1] =~ tr/-//);
			$hash{$array[0]} += $count;
		}
		elsif(scalar keys %hash && $_ =~ /^(\s)/){
			$count = ($_ =~ tr/*//);
			$conserved += $count;
		}
	}

	print "$infile\n";
	print "\n";
	
	print "{| style=\"border-collapse: separate; border-spacing: 0; border-width: 1px; border-style: solid; border-color: #000; padding: 4\"\n! style=\"border-style: solid; border-width: 0 3px 3px 0\" | Sequence ID\n!  style=\"border-style: solid; border-width: 0 0 3px 0; \" align=\"center\"| Number of gaps\n|-\n";
	
	while ( ($k,$v) = each %hash ) {
		print "\| style=\"border-style: solid; border-width: 0 3px 3px 0\" \|$k\n\|  style=\"border-style: solid; border-width: 0 0 3px 0; \" align=\"center\" \| $v\n\|-\n";
	}

	print "\| style=\"border-style: solid; border-width: 0 3px 3px 0\" \|\n\| style=\"border-style: solid; border-width: 0 0 3px 0; align:center\"\| \n\|-\n";
	print "! style=\"border-style: solid; border-width: 0 3px 3px 0\" \|conserved\n! style=\"border-style: solid; border-width: 0 0 3px 0; align:center\"\| $conserved\n\|-\n\|}";
}
else {
	$names = -1;
	@name_array = ();
	while(<IN>) {
		chomp;
		if($_ =~ /^>/){
			$names ++;
			@a = split(/\s+/,$_);
			$a[0] =~ />(.+)/;
			$name_array[$names][0] = $1;
		}
		else{
			$name_array[$names][1] =  $name_array[$names][1] . $_;
		}
	}
	
	print "{| style=\"border-collapse: separate; border-spacing: 0; border-width: 1px; border-style: solid; border-color: #000; padding: 4\"\n! style=\"border-style: solid; border-width: 0 3px 3px 0\" | Sequence ID\n!  style=\"border-style: solid; border-width: 0 0 3px 0; \" align=\"center\"| Number of gaps\n|-\n";
	
	for($i = 0; $i <= $names; $i++) {
		$count = ($name_array[$i][1] =~ tr/-//);
		
		print "\| style=\"border-style: solid; border-width: 0 3px 3px 0\" \|$name_array[$i][0]\n\|  style=\"border-style: solid; border-width: 0 0 3px 0; \" align=\"center\" \| $count\n\|-\n";
	
	}
	
	$length = length($name_array[0][1]);
	
	for($i = 0; $i < $length; $i++) {
		if(substr($name_array[0][1],$i,1) eq substr($name_array[1][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[2][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[3][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[4][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[5][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[6][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[7][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[8][1],$i,1) && substr($name_array[0][1],$i,1) eq substr($name_array[9][1],$i,1) )
		{
			$conserved++;
		}
	}
	
	print "\| style=\"border-style: solid; border-width: 0 3px 3px 0\" \|\n\| style=\"border-style: solid; border-width: 0 0 3px 0; align:center\"\| \n\|-\n";
	print "! style=\"border-style: solid; border-width: 0 3px 3px 0\" \|conserved\n! style=\"border-style: solid; border-width: 0 0 3px 0; align:center\"\| $conserved\n\|-\n\|}";
}