Difference between revisions of "Collection of scripts"

From Bioinformatikpedia
Line 114: Line 114:
 
else:
 
else:
 
print "Only .aln or .fasta files, sorry!"
 
print "Only .aln or .fasta files, sorry!"
  +
  +
  +
==HHextract.py==
  +
  +
#!/usr/bin/env python
  +
#this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier
  +
#output:
  +
#ID1 CL1 E-Value Identities
  +
#ID2 CL1 E-Value Identities
  +
#ID3 CL1 E-Value Identities
  +
#ID4 CL2 E-Value Identities
  +
#ID4 CL2 E-Value Identities
  +
#..
  +
#
  +
#usage: python hhextract.py <infile> <outfile>
  +
  +
import sys, re
  +
try:
  +
infile = open(sys.argv[1])
  +
except IOError:
  +
print "Couldn't open input file"
  +
exit(-1)
  +
try:
  +
outfile= open(sys.argv[2],'w')
  +
except Exception:
  +
print "Couldn't open output file"
  +
exit(-1)
  +
  +
for line in infile:
  +
if line.startswith('>'):
  +
splits = line.split('|')
  +
to_find = int(splits[2])
  +
#check number of clustermembers, skip and print clusterID if confusion
  +
#if len(splits)!= (4+to_find):
  +
# print line[:15]
  +
# print to_find, splits
  +
# exit(-1)
  +
line = infile.next()
  +
splits2 = line.split()
  +
for member in splits[-to_find:]:
  +
memberID = member.split()[0].strip()
  +
if memberID.endswith('.'):
  +
memberID = memberID[:-1]
  +
outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1]))
  +
infile.close()
  +
outfile.close()

Revision as of 15:20, 7 May 2012

A completely unordered, yet to become truly usefull collection of scripts we used.

PSIBlasttrainer.pl

#! /usr/bin/perl
use strict;
use Getopt::Long;

my $input;
my $output;
my $database;
my $maxIt;
my $before;
my $after;

my $result = GetOptions ("i=s" => \$input,"o=s" => \$output,"d=s" => \$database,"j=i" => \$maxIt);
open(TIME ,">","time.txt");
print TIME "\# time used for psiblast\n";

#run psiblast each time and save the time;
for(my $i =1;$i<=$maxIt;$i++){
	print "iteration: $i\n";
	$before = time();
	open STATUS, "-|","time blastpgp -j $i -d $database -i $input -o $output"."_".$i."_it" 
		or die "can't fork: $!";
	while(<STATUS>){
		print TIME $_;
	}	
	$after = time();
	print TIME ($after-$before)."\n";
	close STATUS;
}
print TIME ">";

for(my $i=1;$i<$maxIt;$i++){
	my $eval = 1/(10**$i);
	print "iteration: $i\n";
	$before = time();
	system("time blastpgp -j 1 -d $database -i $input -o $output" . "_eVal_" . "$eval -e $eval");
	$after = time();
	print TIME ($after-$before)."\n";
	close STATUS;
} 

print TIME ">\n"
for(my $i=1;$i<$maxIt;$i++){
	my $eval = 1/(10**$i);
	print "iteration: $i\n";
	$before = time();
	system("time blastpgp -j 1 -d $database -i $input -o $output" . "_integr_" . "$eval -h $eval");
	$after = time();
	print TIME ($after-$before)."\n";
	close STATUS;
}
 
close TIME;


Sequence Alignments

#!/usr/bin/env python
#simple sript to count gaps and conserved columns in multiple alignment files
import sys
from collections import defaultdict

def default_string():
    return ""

def aln_stats(filename):
    fh = open(filename,'r')
    sequences = defaultdict(default_string)
    for line in fh:
        if line[:2].strip() and not line.startswith("CLUSTAL"):
            seq = line.split()
            sequences[seq[0]]+="".join(seq[1:])
    fh.close()

    for key in sequences:
        print key, sequences[key].count('-')
    print "conserved:",conserved(sequences)
    
def fasta_stats(filename):
    fh = open(filename,'r')
    gap_count = 0
    seq_id = ""
    sequences = defaultdict(default_string)
    for line in fh:
        if line.startswith(">"):
            if gap_count:
                print seq_id, gap_count
            gap_count = 0
            seq_id = line.split()[0][1:]
        else:
            gap_count+=line.count('-')
            sequences[seq_id]+=line.strip()

    print seq_id, gap_count,"\nconserved:",conserved(sequences)
    fh.close() 

def conserved(sequence_dict):
    conserved_count = 0
    for i in range(len(sequence_dict.values()[0])):
        col = set([])
        for seq in sequence_dict.values():
            col.add(seq[i])
        if len(col)== 1:
            conserved_count+=1
    return conserved_count 

if "--help" in sys.argv or len(sys.argv)<2:
    print "usage: python %s <filename>"%sys.argv[0]
elif sys.argv[1].endswith(".aln"):
    aln_stats(sys.argv[1])
elif sys.argv[1].endswith(".fasta"):
    fasta_stats(sys.argv[1])
else:
   print "Only .aln or .fasta files, sorry!"


HHextract.py

#!/usr/bin/env python
#this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier
#output:
#ID1 CL1 E-Value Identities
#ID2 CL1 E-Value Identities
#ID3 CL1 E-Value Identities
#ID4 CL2 E-Value Identities
#ID4 CL2 E-Value Identities
#..
#
#usage: python hhextract.py <infile> <outfile>
 
import sys, re
try:
    infile = open(sys.argv[1])
except IOError:
    print "Couldn't open input file"
    exit(-1)
try:
    outfile= open(sys.argv[2],'w')
except Exception:
    print "Couldn't open output file"
    exit(-1)
 
for line in infile:
    if line.startswith('>'):
        splits = line.split('|')
        to_find = int(splits[2])
        #check number of clustermembers, skip and print clusterID if confusion
        #if len(splits)!= (4+to_find):
        #   print line[:15]
        #   print to_find, splits
        #   exit(-1)
        line = infile.next()
        splits2 = line.split()
        for member in splits[-to_find:]:
            memberID = member.split()[0].strip()
            if memberID.endswith('.'):
                memberID = memberID[:-1]
            outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1]))
infile.close()
outfile.close()