Difference between revisions of "Fabry:Sequence alignments (sequence searches and multiple alignments)/Scripts"
From Bioinformatikpedia
Rackersederj (talk | contribs) (→Sequence searches) |
Rackersederj (talk | contribs) (→Sequence searches) |
||
Line 5: | Line 5: | ||
== Sequence searches == |
== Sequence searches == |
||
+ | >compare_GO_terms.pl |
||
+ | #!/usr/bin/perl |
||
+ | |||
+ | use LWP::UserAgent; |
||
+ | |||
+ | |||
+ | #reference..."OUR" protein |
||
+ | our %reference_hash = (); |
||
+ | |||
+ | $ua = LWP::UserAgent->new; |
||
+ | $req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $ARGV[0] . "&format=tsv"); |
||
+ | $res = $ua->request($req,'annotation.tsv'); |
||
+ | |||
+ | open (FILE, '<annotation.tsv'); |
||
+ | $head=<FILE>; |
||
+ | while (<FILE>) { |
||
+ | chomp; |
||
+ | ($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/); |
||
+ | #make GO terms unique by writing them into hash |
||
+ | $reference_hash{$goID} = ""; |
||
+ | } |
||
+ | |||
+ | #print "@{[ %reference_hash ]}\t" . "\n\n"; |
||
+ | my $size_ref_hash = scalar(keys %reference_hash); |
||
+ | |||
+ | #comparing to... |
||
+ | $filename = $ARGV[1]; |
||
+ | $filename =~ /(.*)\..*/; |
||
+ | $outname = $1 . '_comparison.txt'; |
||
+ | open(IDS,"<".$filename); |
||
+ | open(OUT,">".$outname); |
||
+ | |||
+ | while(<IDS>) { |
||
+ | |||
+ | our %compare_hash = (); |
||
+ | |||
+ | @array = split(/\s/); |
||
+ | pop(@array); |
||
+ | for ($i=1;$i<@array;$i++){ |
||
+ | $compare_hash{$array[$i]} = ""; |
||
+ | } |
||
+ | |||
+ | #print "@{[ %compare_hash ]}\t". "\n"; |
||
+ | my @common = grep exists $reference_hash{ $_ }, keys %compare_hash; |
||
+ | |||
+ | my $size_common = scalar(@common); |
||
+ | my $size_comp_hash = scalar(keys %compare_hash); |
||
+ | |||
+ | print OUT $size_common / $size_ref_hash ."\t"; |
||
+ | if ( $size_comp_hash > 0) { print OUT $size_common / $size_comp_hash ."\n"; } |
||
+ | else { print OUT "0\n"; } |
||
+ | } |
||
+ | |||
+ | >download-annotation.pl |
||
+ | #!/usr/bin/perl |
||
+ | |||
+ | |||
+ | use LWP::UserAgent; |
||
+ | |||
+ | $filename = $ARGV[0]; |
||
+ | $filename =~ /(.*)\..*/; |
||
+ | $outname = $1 . '_GOterms.tsv'; |
||
+ | open IDS, $filename or die $!; |
||
+ | open (OUT, '>' . $outname); |
||
+ | |||
+ | $result = ""; |
||
+ | |||
+ | while(<IDS>){ |
||
+ | chomp; |
||
+ | $_ =~ s/\.//gi; |
||
+ | $ua = LWP::UserAgent->new; |
||
+ | $req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $_ . "&format=tsv"); |
||
+ | $res = $ua->request($req,'annotation.tsv'); |
||
+ | |||
+ | %hash = (); |
||
+ | open (FILE, '<annotation.tsv'); |
||
+ | print OUT "$_\t"; |
||
+ | $head=<FILE>; |
||
+ | while (<FILE>) { |
||
+ | chomp; |
||
+ | ($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/); |
||
+ | |||
+ | #make GO terms unique by writing them into hash |
||
+ | $hash{$goID} = ""; |
||
+ | } |
||
+ | print OUT "@{[ %hash ]}\t"; |
||
+ | print OUT "\n"; |
||
+ | close (FILE); |
||
+ | } |
||
+ | |||
+ | close (OUT); |
||
+ | close IDS; |
||
+ | |||
=== Blast === |
=== Blast === |
||
Line 106: | Line 199: | ||
} |
} |
||
close(IN); |
close(IN); |
||
+ | |||
− | |||
+ | |||
>hist_blast.r |
>hist_blast.r |
||
Revision as of 08:36, 5 May 2012
Please see Task 2 for our line of action on this topic. The results are located at Task 2 results
Contents
Sequence searches
>compare_GO_terms.pl #!/usr/bin/perl use LWP::UserAgent; #reference..."OUR" protein our %reference_hash = (); $ua = LWP::UserAgent->new; $req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $ARGV[0] . "&format=tsv"); $res = $ua->request($req,'annotation.tsv'); open (FILE, '<annotation.tsv'); $head=<FILE>; while (<FILE>) { chomp; ($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/); #make GO terms unique by writing them into hash $reference_hash{$goID} = ""; } #print "@{[ %reference_hash ]}\t" . "\n\n"; my $size_ref_hash = scalar(keys %reference_hash); #comparing to... $filename = $ARGV[1]; $filename =~ /(.*)\..*/; $outname = $1 . '_comparison.txt'; open(IDS,"<".$filename); open(OUT,">".$outname); while(<IDS>) { our %compare_hash = (); @array = split(/\s/); pop(@array); for ($i=1;$i<@array;$i++){ $compare_hash{$array[$i]} = ""; } #print "@{[ %compare_hash ]}\t". "\n"; my @common = grep exists $reference_hash{ $_ }, keys %compare_hash; my $size_common = scalar(@common); my $size_comp_hash = scalar(keys %compare_hash); print OUT $size_common / $size_ref_hash ."\t"; if ( $size_comp_hash > 0) { print OUT $size_common / $size_comp_hash ."\n"; } else { print OUT "0\n"; } }
>download-annotation.pl #!/usr/bin/perl use LWP::UserAgent; $filename = $ARGV[0]; $filename =~ /(.*)\..*/; $outname = $1 . '_GOterms.tsv'; open IDS, $filename or die $!; open (OUT, '>' . $outname); $result = ""; while(<IDS>){ chomp; $_ =~ s/\.//gi; $ua = LWP::UserAgent->new; $req = HTTP::Request->new(GET => "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" . $_ . "&format=tsv"); $res = $ua->request($req,'annotation.tsv'); %hash = (); open (FILE, '<annotation.tsv'); print OUT "$_\t"; $head=<FILE>; while (<FILE>) { chomp; ($db,$id,$splice,$symbol,$taxon,$qualifier,$goID,$goName,$reference,$evidence,$with,$aspect,$date,$source) = split(/\t/); #make GO terms unique by writing them into hash $hash{$goID} = ""; } print OUT "@{[ %hash ]}\t"; print OUT "\n"; close (FILE); } close (OUT); close IDS;
Blast
> extract_ids_blast.pl #!/bin/perl my $Z = 0.003; my $filename = "blastsearch_default_v700.out"; my $outname = "blastsearch_default_v700_ids.txt"; my $outEval = "blastsearch_default_v700_Evalues.txt"; open IN, $filename or die $!; #open(OUT, ">" . $outname); open(OUT2, ">" . $outEval); while(<IN>) { if($_ =~ /^(tr|sp)/){ #print $_; chomp; $_ =~ /.+\|(.+)\|.+(.{5})/; $store_1 = $1; $store_2 = $2; $Evalue = $2; $Evalue =~ s/^\s+//; $Evalue =~ s/\s+$//; if( $Evalue > 0.003 ) {} else { # print OUT $store_1 . "\n"; if($Evalue eq "0.0"){ print OUT2 "-Inf\n"; } elsif($Evalue =~ /(.*)e(.+)/){ $Evalue =~ /(.*)e(.+)/; print OUT2 $2 ."\n"; } else { print OUT2 log($Evalue)/log(10) . "\n"; } } } }
> parse_blast.pl #!/usr/bin/perl use strict; use warnings; my $bool = 0; my $ID; my $Length; my $Evalue; my $Identities; my $Positives; open(IN, "< " .$ARGV[0]) or die $!; open(OUT, "> " .$ARGV[0] . "_ident_cov.tsv"); print OUT "ID\tLength\tEvalue\tIdentities\tPositives\n"; while (<IN>) { chomp; if($_ =~ /^>/) { $_ =~ />.+\|(.+)\|.+/; $ID = $1; $bool = 1; } elsif($_ =~ /^\s+Length/ && $bool == 1){ $_ =~ /\s+Length\ =\ (\d+)/; $Length = $1; } elsif ($_ =~ /^\s*Score\ =/ && $bool == 1){ $_ =~ /\s*Score\s+=\s+.+,\s+Expect\s+=\s+(.+?),\s+Method: .+/; $Evalue = $1; $Evalue =~ s/^\s+//; $Evalue =~ s/\s+$//; if($Evalue eq "0.0"){ $Evalue = "-Inf"; } elsif($Evalue =~ /(.*)e(.+)/){ $Evalue =~ /(.*)e(.+)/; $Evalue = $2; } else { $Evalue = log($Evalue)/log(10); } } elsif ($_ =~ /^\s*Identities/ && $bool == 1){ if($_ =~ /\s+Identities.+\((.+)\%\).+Positives.+\((.+)\%\).+Gaps.+\((.+)\%\)/){ $Identities = $1; $Positives = $2; } elsif($_ =~ /\s*Identities.+\((\d+)\%\).+Positives.+\((\d+)\%\).*/){ $Identities = $1; $Positives = $2; } if( $Evalue < -2.39794000867204 ) { print OUT "$ID\t$Length\t$Evalue\t$Identities\t$Positives\n"; $bool = 0; } } } close(IN);
>hist_blast.r table <- read.table(file="blastsearch_default_v700_ids_GOterms_comparison.txt") png(file="blastsearch_default_v700_ids_GOterms_comparison.png", width = 480, height = 700) layout(matrix(data=c(1,2), nrow=2, ncol=1), widths=c(1,1), heights=c(3,3)) hx_1 <- hist(table[,1], main = "Common GO classifications vs. # GO terms P06280", xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10) text(hx_1$mids, hx_1$counts+2, label=c(hx_1$counts), pos = 3, col = "royalblue4") text(0.1,500,expression("mean:")) text(0.23,500,round(mean(table[,1]),4)) axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10)) hx_2 <- hist(table[,2], main = "Common GO classifications vs. # GO terms blast search", xlab ="Percentage in common", xaxt = "n", ylim=c(0,500), xlim = c(0,1), col = "dodgerblue", breaks = 10) text(hx_2$mids, hx_2$counts+2, label=c(hx_2$counts), pos = 3, col = "royalblue4") text(0.1,500,expression("mean:")) text(0.23,500,round(mean(table[,2]),4)) axis(side=1, at=seq(0,1, 0.1) ,labels = seq(0,100, 10)) dev.off() ip <- read.table(file="blastsearch_default_v700.out_ident_cov.tsv", header = T) png(file="blastsearch_default_v700_Evalues.png", width = 480, height = 400) hist(ip$Evalue, xlab="log_10(E-value)", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of E-values, blastsearch", xlim=c(-200,0) ) text(-130,80,round(mean(ip$Evalue[7:664]),4)) text(-160,80,expression("mean:")) dev.off() png(file="blastsearch_default_v700_Identities.png", width = 480, height = 400) hist(ip$Identities, xlab="Identity in %", breaks=30, col = "dodgerblue", ylim = c(0,80), main = "Histogramm of Identities, blastsearch", xlim=c(0,100) ) text(75,80,expression("mean:")) text(90,80,round(mean(ip$Identities),4)) dev.off() png(file="blastsearch_default_v700_Positives.png", width = 480, height = 400) hist(ip$Positives, xlab="Positives in %", breaks=30, col = "dodgerblue", main = "Histogramm of Positives, blastsearch", xlim=c(0,100), ylim = c(0,80) ) text(75,80,expression("mean:")) text(90,80,round(mean(ip$Positives),4)) dev.off() png(file="blastsearch_default_v700_Lenghts.png", width = 480, height = 400) hist(ip$Length, xlab="Sequence length", breaks=30, col = "dodgerblue", ylim = c(0,250), xlim = c(0,2000), main = "Histogramm of sequence length, blastsearch" ) text(1000,180,expression("mean length:")) text(1500,180,round(mean(ip$Length),4)) text(1000,160,expression("P06280 length:")) text(1500,160,expression("429")) dev.off()