Difference between revisions of "Collection of scripts"
From Bioinformatikpedia
Line 160: | Line 160: | ||
infile.close() |
infile.close() |
||
outfile.close() |
outfile.close() |
||
+ | |||
+ | ==getEvalues.pl== |
||
+ | #!/usr/bin/perl |
||
+ | use strict; |
||
+ | use DirHandle; |
||
+ | |||
+ | my $dir = $ARGV[0]; |
||
+ | my $blast = $ARGV[1]; |
||
+ | |||
+ | sub plainfiles { |
||
+ | my $dir = shift; |
||
+ | my $dh = DirHandle->new($dir) or die "can't opendir $dir: $!"; |
||
+ | return sort # sort pathnames |
||
+ | grep { -f } # choose only "plain" files |
||
+ | map { "$dir/$_" } # create full paths |
||
+ | grep { !/^\./ } # filter out dot files |
||
+ | grep { !/\.dat$/} |
||
+ | grep { !/\.pl$/} |
||
+ | grep { !/\.py$/} |
||
+ | grep { !/\.png$/} |
||
+ | $dh-> |
||
+ | read() |
||
+ | ; # read all entries |
||
+ | } |
||
+ | |||
+ | my @files = plainfiles($dir); |
||
+ | |||
+ | if($blast eq "blast"){ |
||
+ | for(@files){ |
||
+ | open(READER ,"<",$_); |
||
+ | open(WRITER ,">",$_.".dat"); |
||
+ | my $eval; |
||
+ | while(<READER>){ |
||
+ | if(/Expect = (.*?),/){ |
||
+ | #print "buuuuu"; |
||
+ | if($1 == 0){ |
||
+ | $eval = "-INF"; |
||
+ | }else{ |
||
+ | $eval = log($1)/log(10); |
||
+ | } |
||
+ | #Identities = 314/440 (71%) |
||
+ | }elsif(/Identities = .*?\/.*? \((.*?)\%\),/){ |
||
+ | #print "tralaöa"; |
||
+ | print WRITER "$eval 0\.$1\n"; |
||
+ | }elsif(/Results of round/){ |
||
+ | close WRITER; |
||
+ | open(WRITER ,">",$_.".dat"); |
||
+ | } |
||
+ | } |
||
+ | close READER; |
||
+ | close WRITER; |
||
+ | } |
||
+ | }else{ |
||
+ | for(@files){ |
||
+ | open(READER ,"<",$_); |
||
+ | open(WRITER ,">",$_.".dat"); |
||
+ | while(<READER>){ |
||
+ | |||
+ | my @fields = split(" ",$_); |
||
+ | my $eval; |
||
+ | if($fields[2] == 0){ |
||
+ | $eval = "-INF"; |
||
+ | }else{ |
||
+ | $eval = log($fields[2])/log(10); |
||
+ | } |
||
+ | my $ident =$fields[3]; |
||
+ | $ident =~ s/\%//; |
||
+ | $ident = "0."+$ident; |
||
+ | print WRITER "$eval $ident\n"; |
||
+ | |||
+ | } |
||
+ | close READER; |
||
+ | close WRITER; |
||
+ | } |
||
+ | } |
Revision as of 15:26, 7 May 2012
A completely unordered, yet to become truly usefull collection of scripts we used.
PSIBlasttrainer.pl
#! /usr/bin/perl use strict; use Getopt::Long; my $input; my $output; my $database; my $maxIt; my $before; my $after; my $result = GetOptions ("i=s" => \$input,"o=s" => \$output,"d=s" => \$database,"j=i" => \$maxIt); open(TIME ,">","time.txt"); print TIME "\# time used for psiblast\n"; #run psiblast each time and save the time; for(my $i =1;$i<=$maxIt;$i++){ print "iteration: $i\n"; $before = time(); open STATUS, "-|","time blastpgp -j $i -d $database -i $input -o $output"."_".$i."_it" or die "can't fork: $!"; while(<STATUS>){ print TIME $_; } $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">"; for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_eVal_" . "$eval -e $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">\n" for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_integr_" . "$eval -h $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } close TIME;
Sequence Alignments
#!/usr/bin/env python #simple sript to count gaps and conserved columns in multiple alignment files import sys from collections import defaultdict def default_string(): return "" def aln_stats(filename): fh = open(filename,'r') sequences = defaultdict(default_string) for line in fh: if line[:2].strip() and not line.startswith("CLUSTAL"): seq = line.split() sequences[seq[0]]+="".join(seq[1:]) fh.close() for key in sequences: print key, sequences[key].count('-') print "conserved:",conserved(sequences) def fasta_stats(filename): fh = open(filename,'r') gap_count = 0 seq_id = "" sequences = defaultdict(default_string) for line in fh: if line.startswith(">"): if gap_count: print seq_id, gap_count gap_count = 0 seq_id = line.split()[0][1:] else: gap_count+=line.count('-') sequences[seq_id]+=line.strip() print seq_id, gap_count,"\nconserved:",conserved(sequences) fh.close() def conserved(sequence_dict): conserved_count = 0 for i in range(len(sequence_dict.values()[0])): col = set([]) for seq in sequence_dict.values(): col.add(seq[i]) if len(col)== 1: conserved_count+=1 return conserved_count if "--help" in sys.argv or len(sys.argv)<2: print "usage: python %s <filename>"%sys.argv[0] elif sys.argv[1].endswith(".aln"): aln_stats(sys.argv[1]) elif sys.argv[1].endswith(".fasta"): fasta_stats(sys.argv[1]) else: print "Only .aln or .fasta files, sorry!"
HHextract.py
#!/usr/bin/env python #this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier #output: #ID1 CL1 E-Value Identities #ID2 CL1 E-Value Identities #ID3 CL1 E-Value Identities #ID4 CL2 E-Value Identities #ID4 CL2 E-Value Identities #.. # #usage: python hhextract.py <infile> <outfile> import sys, re try: infile = open(sys.argv[1]) except IOError: print "Couldn't open input file" exit(-1) try: outfile= open(sys.argv[2],'w') except Exception: print "Couldn't open output file" exit(-1) for line in infile: if line.startswith('>'): splits = line.split('|') to_find = int(splits[2]) #check number of clustermembers, skip and print clusterID if confusion #if len(splits)!= (4+to_find): # print line[:15] # print to_find, splits # exit(-1) line = infile.next() splits2 = line.split() for member in splits[-to_find:]: memberID = member.split()[0].strip() if memberID.endswith('.'): memberID = memberID[:-1] outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1])) infile.close() outfile.close()
getEvalues.pl
#!/usr/bin/perl use strict; use DirHandle; my $dir = $ARGV[0]; my $blast = $ARGV[1]; sub plainfiles { my $dir = shift; my $dh = DirHandle->new($dir) or die "can't opendir $dir: $!"; return sort # sort pathnames grep { -f } # choose only "plain" files map { "$dir/$_" } # create full paths grep { !/^\./ } # filter out dot files grep { !/\.dat$/} grep { !/\.pl$/} grep { !/\.py$/} grep { !/\.png$/} $dh-> read() ; # read all entries } my @files = plainfiles($dir); if($blast eq "blast"){ for(@files){ open(READER ,"<",$_); open(WRITER ,">",$_.".dat"); my $eval; while(<READER>){ if(/Expect = (.*?),/){ #print "buuuuu"; if($1 == 0){ $eval = "-INF"; }else{ $eval = log($1)/log(10); } #Identities = 314/440 (71%) }elsif(/Identities = .*?\/.*? \((.*?)\%\),/){ #print "tralaöa"; print WRITER "$eval 0\.$1\n"; }elsif(/Results of round/){ close WRITER; open(WRITER ,">",$_.".dat"); } } close READER; close WRITER; } }else{ for(@files){ open(READER ,"<",$_); open(WRITER ,">",$_.".dat"); while(<READER>){ my @fields = split(" ",$_); my $eval; if($fields[2] == 0){ $eval = "-INF"; }else{ $eval = log($fields[2])/log(10); } my $ident =$fields[3]; $ident =~ s/\%//; $ident = "0."+$ident; print WRITER "$eval $ident\n"; } close READER; close WRITER; } }