Difference between revisions of "Fabry:Sequence alignments (sequence searches and multiple alignments)/Scripts"

Revision as of 08:39, 5 May 2012

Please see Task 2 for our line of action on this topic. The results are located at Task 2 results

Sequence searches

>compare_GO_terms.pl
#!/usr/bin/perl

use LWP::UserAgent;


#reference..."OUR" protein
our %reference_hash = ();

$ua = LWP::UserAgent->new;
$req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $ARGV[0] . "&format=tsv");
$res = $ua->request($req,'annotation.tsv');

open (FILE, '<annotation.tsv');
$head=<FILE>;
while (<FILE>) {
	chomp;
	($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/);
	#make GO terms unique by writing them into hash
	$reference_hash{$goID} = "";
}

	#print "@{[ %reference_hash ]}\t" . "\n\n";
my $size_ref_hash = scalar(keys %reference_hash);
	
#comparing to...
$filename = $ARGV[1];
$filename =~ /(.*)\..*/;
$outname = $1 . '_comparison.txt';
open(IDS,"<".$filename);
open(OUT,">".$outname);

while(<IDS>) {

	our %compare_hash = ();

	@array = split(/\s/);
	pop(@array);
	for ($i=1;$i<@array;$i++){
		$compare_hash{$array[$i]} = "";
	}
	
	#print "@{[ %compare_hash ]}\t". "\n";
	my @common = grep  exists $reference_hash{ $_ }, keys %compare_hash;
	
	my $size_common = scalar(@common);
	my $size_comp_hash = scalar(keys %compare_hash);

	print OUT $size_common / $size_ref_hash  ."\t";
	if ( $size_comp_hash > 0)  	{ print OUT $size_common / $size_comp_hash ."\n"; }
	else 			{ print OUT "0\n"; }
}

>download-annotation.pl
#!/usr/bin/perl


use LWP::UserAgent;
	
$filename = $ARGV[0];
$filename =~ /(.*)\..*/;
$outname = $1 . '_GOterms.tsv';
open IDS, $filename or die $!;
open (OUT, '>' . $outname);

$result = "";

while(<IDS>){
	chomp;
	$_ =~ s/\.//gi;
	$ua = LWP::UserAgent->new;
	$req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $_ . "&format=tsv");
	$res = $ua->request($req,'annotation.tsv');
	 
	%hash = ();
	open (FILE, '<annotation.tsv');
	print OUT "$_\t";
	$head=<FILE>;
	while (<FILE>) {
		chomp;
		($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/);

		#make GO terms unique by writing them into hash
		$hash{$goID} = "";
	}
	print OUT "@{[ %hash ]}\t";
	print OUT "\n";
	close (FILE);
}

close (OUT);
close IDS;

Blast

> extract_ids_blast.pl
#!/bin/perl

my $Z = 0.003;
my $filename	= "blastsearch_default_v700.out";
my $outname	= "blastsearch_default_v700_ids.txt";
my $outEval	= "blastsearch_default_v700_Evalues.txt";
open IN, $filename or die $!;
#open(OUT, ">" . $outname);
open(OUT2, ">" . $outEval);

while(<IN>) {
	if($_ =~ /^(tr|sp)/){
		#print $_;
		chomp;
		$_ =~ /.+\|(.+)\|.+(.{5})/;
		$store_1 = $1;
		$store_2 = $2;
		$Evalue = $2;		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;

		if( $Evalue > 0.003 ) {}
		else {
#			print OUT  $store_1 . "\n";
			if($Evalue eq  "0.0"){
				print OUT2 "-Inf\n";
			}
			elsif($Evalue =~ /(.*)e(.+)/){
				$Evalue =~ /(.*)e(.+)/;
				print OUT2 $2 ."\n";
			}
			else {
				print OUT2 log($Evalue)/log(10) . "\n";
			}
		}
	}
}

> parse_blast.pl
#!/usr/bin/perl

use strict;
use warnings;

my $bool = 0;
my $ID;
my $Length;
my $Evalue;
my $Identities;
my $Positives;

open(IN, "< " .$ARGV[0]) or die $!;
open(OUT, "> " .$ARGV[0] . "_ident_cov.tsv");
print OUT "ID\tLength\tEvalue\tIdentities\tPositives\n";
while (<IN>) {
	chomp;
	if($_ =~ /^>/) {
		$_ =~ />.+\|(.+)\|.+/;
		$ID = $1;
		$bool = 1;
	}
	elsif($_ =~ /^\s+Length/ && $bool == 1){
		$_ =~ /\s+Length\ =\ (\d+)/;
		$Length = $1;
	}
	elsif ($_ =~ /^\s*Score\ =/ && $bool == 1){
		$_ =~ /\s*Score\s+=\s+.+,\s+Expect\s+=\s+(.+?),\s+Method: .+/;
		$Evalue = $1;		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;
		if($Evalue eq  "0.0"){
			$Evalue = "-Inf";
		}
		elsif($Evalue =~ /(.*)e(.+)/){
			$Evalue =~ /(.*)e(.+)/;
			$Evalue = $2;
		}
		else {
			$Evalue = log($Evalue)/log(10);
		}
	}
	elsif ($_ =~ /^\s*Identities/ && $bool == 1){
		if($_ =~ /\s+Identities.+\((.+)\%\).+Positives.+\((.+)\%\).+Gaps.+\((.+)\%\)/){
		$Identities = $1;
		$Positives = $2;
		}
		elsif($_ =~ /\s*Identities.+\((\d+)\%\).+Positives.+\((\d+)\%\).*/){
		$Identities = $1;
		$Positives = $2;	
		}
		if( $Evalue < -2.39794000867204 ) {
			print OUT "$ID\t$Length\t$Evalue\t$Identities\t$Positives\n";
			$bool = 0;
		}	
	}
}
close(IN);

>hist_blast.r

table <- read.table(file="blastsearch_default_v700_ids_GOterms_comparison.txt")

png(file="blastsearch_default_v700_ids_GOterms_comparison.png", width = 480, height = 700)
layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3))

hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280", xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4")
text(0.1,500,expression("mean:"))
text(0.23,500,round(mean(table[,1]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms blast search",  xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4")
text(0.1,500,expression("mean:"))
text(0.23,500,round(mean(table[,2]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

dev.off()


ip <- read.table(file="blastsearch_default_v700.out_ident_cov.tsv", header = T)

png(file="blastsearch_default_v700_Evalues.png", width = 480, height = 400)
hist(ip$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, blastsearch", xlim=c(-200,0) )
text(-130,80,round(mean(ip$Evalue[7:664]),4))
text(-160,80,expression("mean:"))

dev.off()

png(file="blastsearch_default_v700_Identities.png", width = 480, height = 400)
hist(ip$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of Identities, blastsearch", xlim=c(0,100) )
text(75,80,expression("mean:"))
text(90,80,round(mean(ip$Identities),4))
dev.off()

png(file="blastsearch_default_v700_Positives.png", width = 480, height = 400)
hist(ip$Positives, xlab="Positives in %", breaks=30, col = "dodgerblue",  main = "Histogramm of Positives, blastsearch", xlim=c(0,100), ylim = c(0,80) )
text(75,80,expression("mean:"))
text(90,80,round(mean(ip$Positives),4))

dev.off()


png(file="blastsearch_default_v700_Lenghts.png", width = 480, height = 400)
hist(ip$Length, xlab="Sequence length", breaks=30, col = "dodgerblue", ylim = c(0,250), xlim = c(0,2000), main = "Histogramm of sequence length, blastsearch" )

text(1000,180,expression("mean length:"))
text(1500,180,round(mean(ip$Length),4))
text(1000,160,expression("P06280 length:"))
text(1500,160,expression("429"))

dev.off()

Psi-Blast

HHblits

>extract_evalues_hhblits.pl
#!/bin/perl

my $Z = 0.003;
my $filename	= $ARGV[0];
my $outEval	= $ARGV[0] . "_Evalues.txt";
open IN, $filename or die $!;
open(OUT2, ">" . $outEval);

while(<IN>) {
	if($_ =~ /^\s*\d+\sUP20/){
# 		print $_;
		chomp;
		
			$_ =~ /.{41}(.{7}).+/;
			$Evalue = $1;
		
		#print $Evalue ."\t" . $_ ."\n";
		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;
			
		if( $Evalue > 0.003 ) {}
		else {
			if($Evalue eq  "0.0"){
				print OUT2 "-Inf\n";
			}
			elsif($Evalue =~ /(.*)E(.+)/){
				$Evalue =~ /(.*)E(.+)/;
				print OUT2 $2 ."\n";
			}
			else {
				print OUT2 log($Evalue)/log(10) . "\n";
			}
		}
	}
}

>parse_hhblits.pl
#!/usr/bin/perl

use strict;
use warnings;

my $bool = 0;
my $ID;
my $Evalue;
my $Identities;
my $Similarity;

open(IN, "< " .$ARGV[0]) or die $!;
open(OUT, "> " .$ARGV[0] . "_ident_cov.tsv");
print OUT "ID\tEvalue\tIdentities\tSimilarity\n";
while (<IN>) {
	chomp;
	if($_ =~ /^>/) {
		my @array = split('\|', $_);
		$array[4]  =~ s/\.$//;
		my @array2 = split('\ ', $array[4]);
		$ID = $array2[0];
		$bool = 1;
	}
	elsif ($_ =~ /^\s*Probab/ && $bool == 1){
		$_ =~ /Probab=.+E-value=(.+)\s+Score=.+Aligned_cols=.+Identities=(\d+).+Similarity=(.+)\s+Sum_probs=.+/;
		$Evalue = $1;
		
		$Identities = $2;
		$Similarity = $3;
		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;
		if($Evalue eq  "0.0"){
			$Evalue = "-Inf";
		}
		elsif($Evalue =~ /(.*)e(.+)/){
			$Evalue =~ /(.*)e(.+)/;
			$Evalue = $2;
		}
		else {
			$Evalue = log($Evalue)/log(10);
		}
		if( $Evalue < -2.39794000867204 ) {
			print OUT "$ID\t$Evalue\t$Identities\t$Similarity\n";
			$bool = 0;
		}	
	}
}
close(IN);

> hist_hhblits.R
table <- read.table(file="hhblits_default_ids_GOterms_comparison.txt")

png(file="hhblits_default_ids_GOterms_comparison.png", width = 480, height = 700)
layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3))

hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280\n(2 iterations - default)", xlab ="Percentage in common", xaxt = "n", ylim=c(0,200), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4")
text(0.1,200,expression("mean:"))
text(0.23,200,round(mean(table[,1]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms HHblits search\n(2 iterations - default)",  xlab ="Percentage in common", xaxt = "n", ylim=c(0,200), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4")
text(0.1,200,expression("mean:"))
text(0.23,200,round(mean(table[,2]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

dev.off()



a <- read.table(file="hhblits_default.out_Evalues.txt")

png(file="hhblits_default.out_Evalues.png", width = 480, height = 400)
hist(a[,1], xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, HHblits search\n(2 iterations - default)", xlim=c(-200,0) )
text(-100,80,round(mean(a[,1])),4)
text(-160,80,expression("mean:"))

dev.off()


ip_def <- read.table(file="hhblits_default.out_ident_cov.tsv", header = T)

png(file="hhblits_default_Evalues.png", width = 480, height = 400)
hist(ip_def$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, HHblits search\n(2 iterations - default)", xlim=c(-200,0) )
text(-130,80,round(mean(ip_def$Evalue),4))
text(-160,80,expression("mean:"))

dev.off()

png(file="hhblits_default_Identities.png", width = 480, height = 400)
hist(ip_def$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,40), main = "Histogramm of Identities, HHblits search\n(2 iterations - default)", xlim=c(0,100) )
text(75,40,expression("mean:"))
text(90,40,round(mean(ip_def$Identities),4))
dev.off()

png(file="hhblits_default_Similarity.png", width = 480, height = 400)
hist(ip_def$Similarity, xlab="Similarity", breaks=30, col = "dodgerblue",  main = "Histogramm of Similarity, HHblits search\n(2 iterations - default)", ylim = c(0,40), xlim=c(0,2) )
text(1.5,40,expression("mean:"))
text(1.8,40,round(mean(ip_def$Similarity),4))

dev.off()

# ####################################n8##############################################
table <- read.table(file="hhblits_n8_neu_ids_GOterms_comparison.txt")

png(file="hhblits_n8_neu_ids_GOterms_comparison.png", width = 480, height = 700)
layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3))

hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280\n(8 iterations)", xlab ="Percentage in common", xaxt = "n", ylim=c(0,400), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4")
text(0.8,380,expression("mean:"))
text(0.93,380,round(mean(table[,1]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms HHblits search\n(8 iterations)",  xlab ="Percentage in common", xaxt = "n", ylim=c(0,400), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4")
text(0.8,380,expression("mean:"))
text(0.93,380,round(mean(table[,2]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

dev.off()




ip_n8_neu <- read.table(file="hhblits_n8_neu.out_ident_cov.tsv", header = T)

png(file="hhblits_n8_neu_Evalues.png", width = 480, height = 400)
hist(ip_n8_neu$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,250), main = "Histogramm of E-values, HHblits search\n(8 iterations)", xlim=c(-200,0) )
text(-130,250,round(mean(ip_n8_neu$Evalue),4))
text(-160,250,expression("mean:"))

dev.off()

png(file="hhblits_n8_neu_Identities.png", width = 480, height = 400)
hist(ip_n8_neu$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,300), main = "Histogramm of Identities, HHblits search\n(8 iterations)", xlim=c(0,100) )
text(75,300,expression("mean:"))
text(90,300,round(mean(ip_n8_neu$Identities),4))
dev.off()

png(file="hhblits_n8_neu_Similarity.png", width = 480, height = 400)
hist(ip_n8_neu$Similarity, xlab="Similarity", breaks=30, col = "dodgerblue",  main = "Histogramm of Similarity, HHblits search\n(8 iterations)", ylim = c(0,150), xlim=c(0,2) )
text(1.5,150,expression("mean:"))
text(1.8,150,round(mean(ip_n8_neu$Similarity),4))

dev.off()

Difference between revisions of "Fabry:Sequence alignments (sequence searches and multiple alignments)/Scripts"

Revision as of 08:39, 5 May 2012

Contents

Sequence searches

Blast

Psi-Blast

HHblits

Comparison

Comparing the hits

Comparing the Evalues

Navigation menu

Views

Personal tools

Bioinformatik navigation

MediaWiki navigation

Search

Tools