Fabry:Sequence alignments (sequence searches and multiple alignments)/Scripts

From Bioinformatikpedia
Revision as of 08:34, 5 May 2012 by Rackersederj (talk | contribs) (Sequence searches)

Please see Task 2 for our line of action on this topic. The results are located at Task 2 results


Sequence searches

Blast

> extract_ids_blast.pl
#!/bin/perl

my $Z = 0.003;
my $filename	= "blastsearch_default_v700.out";
my $outname	= "blastsearch_default_v700_ids.txt";
my $outEval	= "blastsearch_default_v700_Evalues.txt";
open IN, $filename or die $!;
#open(OUT, ">" . $outname);
open(OUT2, ">" . $outEval);

while(<IN>) {
	if($_ =~ /^(tr|sp)/){
		#print $_;
		chomp;
		$_ =~ /.+\|(.+)\|.+(.{5})/;
		$store_1 = $1;
		$store_2 = $2;
		$Evalue = $2;		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;

		if( $Evalue > 0.003 ) {}
		else {
#			print OUT  $store_1 . "\n";
			if($Evalue eq  "0.0"){
				print OUT2 "-Inf\n";
			}
			elsif($Evalue =~ /(.*)e(.+)/){
				$Evalue =~ /(.*)e(.+)/;
				print OUT2 $2 ."\n";
			}
			else {
				print OUT2 log($Evalue)/log(10) . "\n";
			}
		}
	}
}


> parse_blast.pl
#!/usr/bin/perl

use strict;
use warnings;

my $bool = 0;
my $ID;
my $Length;
my $Evalue;
my $Identities;
my $Positives;

open(IN, "< " .$ARGV[0]) or die $!;
open(OUT, "> " .$ARGV[0] . "_ident_cov.tsv");
print OUT "ID\tLength\tEvalue\tIdentities\tPositives\n";
while (<IN>) {
	chomp;
	if($_ =~ /^>/) {
		$_ =~ />.+\|(.+)\|.+/;
		$ID = $1;
		$bool = 1;
	}
	elsif($_ =~ /^\s+Length/ && $bool == 1){
		$_ =~ /\s+Length\ =\ (\d+)/;
		$Length = $1;
	}
	elsif ($_ =~ /^\s*Score\ =/ && $bool == 1){
		$_ =~ /\s*Score\s+=\s+.+,\s+Expect\s+=\s+(.+?),\s+Method: .+/;
		$Evalue = $1;		
		$Evalue =~ s/^\s+//;
		$Evalue =~ s/\s+$//;
		if($Evalue eq  "0.0"){
			$Evalue = "-Inf";
		}
		elsif($Evalue =~ /(.*)e(.+)/){
			$Evalue =~ /(.*)e(.+)/;
			$Evalue = $2;
		}
		else {
			$Evalue = log($Evalue)/log(10);
		}
	}
	elsif ($_ =~ /^\s*Identities/ && $bool == 1){
		if($_ =~ /\s+Identities.+\((.+)\%\).+Positives.+\((.+)\%\).+Gaps.+\((.+)\%\)/){
		$Identities = $1;
		$Positives = $2;
		}
		elsif($_ =~ /\s*Identities.+\((\d+)\%\).+Positives.+\((\d+)\%\).*/){
		$Identities = $1;
		$Positives = $2;	
		}
		if( $Evalue < -2.39794000867204 ) {
			print OUT "$ID\t$Length\t$Evalue\t$Identities\t$Positives\n";
			$bool = 0;
		}	
	}
}
close(IN);
 
>hist_blast.r

table <- read.table(file="blastsearch_default_v700_ids_GOterms_comparison.txt")

png(file="blastsearch_default_v700_ids_GOterms_comparison.png", width = 480, height = 700)
layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3))

hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280", xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4")
text(0.1,500,expression("mean:"))
text(0.23,500,round(mean(table[,1]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms blast search",  xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10)
text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4")
text(0.1,500,expression("mean:"))
text(0.23,500,round(mean(table[,2]),4))
axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10))

dev.off()


ip <- read.table(file="blastsearch_default_v700.out_ident_cov.tsv", header = T)

png(file="blastsearch_default_v700_Evalues.png", width = 480, height = 400)
hist(ip$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, blastsearch", xlim=c(-200,0) )
text(-130,80,round(mean(ip$Evalue[7:664]),4))
text(-160,80,expression("mean:"))

dev.off()

png(file="blastsearch_default_v700_Identities.png", width = 480, height = 400)
hist(ip$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of Identities, blastsearch", xlim=c(0,100) )
text(75,80,expression("mean:"))
text(90,80,round(mean(ip$Identities),4))
dev.off()

png(file="blastsearch_default_v700_Positives.png", width = 480, height = 400)
hist(ip$Positives, xlab="Positives in %", breaks=30, col = "dodgerblue",  main = "Histogramm of Positives, blastsearch", xlim=c(0,100), ylim = c(0,80) )
text(75,80,expression("mean:"))
text(90,80,round(mean(ip$Positives),4))

dev.off()


png(file="blastsearch_default_v700_Lenghts.png", width = 480, height = 400)
hist(ip$Length, xlab="Sequence length", breaks=30, col = "dodgerblue", ylim = c(0,250), xlim = c(0,2000), main = "Histogramm of sequence length, blastsearch" )

text(1000,180,expression("mean length:"))
text(1500,180,round(mean(ip$Length),4))
text(1000,160,expression("P06280 length:"))
text(1500,160,expression("429"))

dev.off()


Psi-Blast

HHblits

Comparison

Comparing the hits

Comparing the Evalues