Difference between revisions of "Collection of scripts"
From Bioinformatikpedia
Line 114: | Line 114: | ||
else: |
else: |
||
print "Only .aln or .fasta files, sorry!" |
print "Only .aln or .fasta files, sorry!" |
||
+ | |||
+ | |||
+ | ==HHextract.py== |
||
+ | |||
+ | #!/usr/bin/env python |
||
+ | #this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier |
||
+ | #output: |
||
+ | #ID1 CL1 E-Value Identities |
||
+ | #ID2 CL1 E-Value Identities |
||
+ | #ID3 CL1 E-Value Identities |
||
+ | #ID4 CL2 E-Value Identities |
||
+ | #ID4 CL2 E-Value Identities |
||
+ | #.. |
||
+ | # |
||
+ | #usage: python hhextract.py <infile> <outfile> |
||
+ | |||
+ | import sys, re |
||
+ | try: |
||
+ | infile = open(sys.argv[1]) |
||
+ | except IOError: |
||
+ | print "Couldn't open input file" |
||
+ | exit(-1) |
||
+ | try: |
||
+ | outfile= open(sys.argv[2],'w') |
||
+ | except Exception: |
||
+ | print "Couldn't open output file" |
||
+ | exit(-1) |
||
+ | |||
+ | for line in infile: |
||
+ | if line.startswith('>'): |
||
+ | splits = line.split('|') |
||
+ | to_find = int(splits[2]) |
||
+ | #check number of clustermembers, skip and print clusterID if confusion |
||
+ | #if len(splits)!= (4+to_find): |
||
+ | # print line[:15] |
||
+ | # print to_find, splits |
||
+ | # exit(-1) |
||
+ | line = infile.next() |
||
+ | splits2 = line.split() |
||
+ | for member in splits[-to_find:]: |
||
+ | memberID = member.split()[0].strip() |
||
+ | if memberID.endswith('.'): |
||
+ | memberID = memberID[:-1] |
||
+ | outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1])) |
||
+ | infile.close() |
||
+ | outfile.close() |
Revision as of 15:20, 7 May 2012
A completely unordered, yet to become truly usefull collection of scripts we used.
PSIBlasttrainer.pl
#! /usr/bin/perl use strict; use Getopt::Long; my $input; my $output; my $database; my $maxIt; my $before; my $after; my $result = GetOptions ("i=s" => \$input,"o=s" => \$output,"d=s" => \$database,"j=i" => \$maxIt); open(TIME ,">","time.txt"); print TIME "\# time used for psiblast\n"; #run psiblast each time and save the time; for(my $i =1;$i<=$maxIt;$i++){ print "iteration: $i\n"; $before = time(); open STATUS, "-|","time blastpgp -j $i -d $database -i $input -o $output"."_".$i."_it" or die "can't fork: $!"; while(<STATUS>){ print TIME $_; } $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">"; for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_eVal_" . "$eval -e $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">\n" for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_integr_" . "$eval -h $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } close TIME;
Sequence Alignments
#!/usr/bin/env python #simple sript to count gaps and conserved columns in multiple alignment files import sys from collections import defaultdict def default_string(): return "" def aln_stats(filename): fh = open(filename,'r') sequences = defaultdict(default_string) for line in fh: if line[:2].strip() and not line.startswith("CLUSTAL"): seq = line.split() sequences[seq[0]]+="".join(seq[1:]) fh.close() for key in sequences: print key, sequences[key].count('-') print "conserved:",conserved(sequences) def fasta_stats(filename): fh = open(filename,'r') gap_count = 0 seq_id = "" sequences = defaultdict(default_string) for line in fh: if line.startswith(">"): if gap_count: print seq_id, gap_count gap_count = 0 seq_id = line.split()[0][1:] else: gap_count+=line.count('-') sequences[seq_id]+=line.strip() print seq_id, gap_count,"\nconserved:",conserved(sequences) fh.close() def conserved(sequence_dict): conserved_count = 0 for i in range(len(sequence_dict.values()[0])): col = set([]) for seq in sequence_dict.values(): col.add(seq[i]) if len(col)== 1: conserved_count+=1 return conserved_count if "--help" in sys.argv or len(sys.argv)<2: print "usage: python %s <filename>"%sys.argv[0] elif sys.argv[1].endswith(".aln"): aln_stats(sys.argv[1]) elif sys.argv[1].endswith(".fasta"): fasta_stats(sys.argv[1]) else: print "Only .aln or .fasta files, sorry!"
HHextract.py
#!/usr/bin/env python #this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier #output: #ID1 CL1 E-Value Identities #ID2 CL1 E-Value Identities #ID3 CL1 E-Value Identities #ID4 CL2 E-Value Identities #ID4 CL2 E-Value Identities #.. # #usage: python hhextract.py <infile> <outfile> import sys, re try: infile = open(sys.argv[1]) except IOError: print "Couldn't open input file" exit(-1) try: outfile= open(sys.argv[2],'w') except Exception: print "Couldn't open output file" exit(-1) for line in infile: if line.startswith('>'): splits = line.split('|') to_find = int(splits[2]) #check number of clustermembers, skip and print clusterID if confusion #if len(splits)!= (4+to_find): # print line[:15] # print to_find, splits # exit(-1) line = infile.next() splits2 = line.split() for member in splits[-to_find:]: memberID = member.split()[0].strip() if memberID.endswith('.'): memberID = memberID[:-1] outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1])) infile.close() outfile.close()