Collection of scripts
From Bioinformatikpedia
A completely unordered, yet to become truly usefull collection of scripts we used.
PSIBlasttrainer.pl
#! /usr/bin/perl use strict; use Getopt::Long; my $input; my $output; my $database; my $maxIt; my $before; my $after; my $result = GetOptions ("i=s" => \$input,"o=s" => \$output,"d=s" => \$database,"j=i" => \$maxIt); open(TIME ,">","time.txt"); print TIME "\# time used for psiblast\n"; #run psiblast each time and save the time; for(my $i =1;$i<=$maxIt;$i++){ print "iteration: $i\n"; $before = time(); open STATUS, "-|","time blastpgp -j $i -d $database -i $input -o $output"."_".$i."_it" or die "can't fork: $!"; while(<STATUS>){ print TIME $_; } $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">"; for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_eVal_" . "$eval -e $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">\n" for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_integr_" . "$eval -h $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } close TIME;
Sequence Alignments
#!/usr/bin/env python #simple sript to count gaps and conserved columns in multiple alignment files import sys from collections import defaultdict def default_string(): return "" def aln_stats(filename): fh = open(filename,'r') sequences = defaultdict(default_string) for line in fh: if line[:2].strip() and not line.startswith("CLUSTAL"): seq = line.split() sequences[seq[0]]+="".join(seq[1:]) fh.close() for key in sequences: print key, sequences[key].count('-') print "conserved:",conserved(sequences) def fasta_stats(filename): fh = open(filename,'r') gap_count = 0 seq_id = "" sequences = defaultdict(default_string) for line in fh: if line.startswith(">"): if gap_count: print seq_id, gap_count gap_count = 0 seq_id = line.split()[0][1:] else: gap_count+=line.count('-') sequences[seq_id]+=line.strip() print seq_id, gap_count,"\nconserved:",conserved(sequences) fh.close() def conserved(sequence_dict): conserved_count = 0 for i in range(len(sequence_dict.values()[0])): col = set([]) for seq in sequence_dict.values(): col.add(seq[i]) if len(col)== 1: conserved_count+=1 return conserved_count if "--help" in sys.argv or len(sys.argv)<2: print "usage: python %s <filename>"%sys.argv[0] elif sys.argv[1].endswith(".aln"): aln_stats(sys.argv[1]) elif sys.argv[1].endswith(".fasta"): fasta_stats(sys.argv[1]) else: print "Only .aln or .fasta files, sorry!"
HHextract.py
#!/usr/bin/env python #this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier #output: #ID1 CL1 E-Value Identities #ID2 CL1 E-Value Identities #ID3 CL1 E-Value Identities #ID4 CL2 E-Value Identities #ID4 CL2 E-Value Identities #.. # #usage: python hhextract.py <infile> <outfile> import sys, re try: infile = open(sys.argv[1]) except IOError: print "Couldn't open input file" exit(-1) try: outfile= open(sys.argv[2],'w') except Exception: print "Couldn't open output file" exit(-1) for line in infile: if line.startswith('>'): splits = line.split('|') to_find = int(splits[2]) #check number of clustermembers, skip and print clusterID if confusion #if len(splits)!= (4+to_find): # print line[:15] # print to_find, splits # exit(-1) line = infile.next() splits2 = line.split() for member in splits[-to_find:]: memberID = member.split()[0].strip() if memberID.endswith('.'): memberID = memberID[:-1] outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1])) infile.close() outfile.close()