Difference between revisions of "Collection of scripts"

Latest revision as of 12:01, 29 August 2012

A completely unordered, yet to become truly useful collection of scripts we used.

Sequence Alignments

PSIBlasttrainer.pl

#! /usr/bin/perl
use strict;
use Getopt::Long;

my $input;
my $output;
my $database;
my $maxIt;
my $before;
my $after;

my $result = GetOptions ("i=s" => \$input,"o=s" => \$output,"d=s" => \$database,"j=i" => \$maxIt);
open(TIME ,">","time.txt");
print TIME "\# time used for psiblast\n";

#run psiblast each time and save the time;
for(my $i =1;$i<=$maxIt;$i++){
	print "iteration: $i\n";
	$before = time();
	open STATUS, "-|","time blastpgp -j $i -d $database -i $input -o $output"."_".$i."_it" 
		or die "can't fork: $!";
	while(<STATUS>){
		print TIME $_;
	}	
	$after = time();
	print TIME ($after-$before)."\n";
	close STATUS;
}
print TIME ">";

for(my $i=1;$i<$maxIt;$i++){
	my $eval = 1/(10**$i);
	print "iteration: $i\n";
	$before = time();
	system("time blastpgp -j 1 -d $database -i $input -o $output" . "_eVal_" . "$eval -e $eval");
	$after = time();
	print TIME ($after-$before)."\n";
	close STATUS;
} 

print TIME ">\n"
for(my $i=1;$i<$maxIt;$i++){
	my $eval = 1/(10**$i);
	print "iteration: $i\n";
	$before = time();
	system("time blastpgp -j 1 -d $database -i $input -o $output" . "_integr_" . "$eval -h $eval");
	$after = time();
	print TIME ($after-$before)."\n";
	close STATUS;
}
 
close TIME;

</source>

Sequence Alignment Statistics

#!/usr/bin/env python
#simple sript to count gaps and conserved columns in multiple alignment files
import sys
from collections import defaultdict

def default_string():
    return ""

def aln_stats(filename):
    fh = open(filename,'r')
    sequences = defaultdict(default_string)
    for line in fh:
        if line[:2].strip() and not line.startswith("CLUSTAL"):
            seq = line.split()
            sequences[seq[0]]+="".join(seq[1:])
    fh.close()

    for key in sequences:
        print key, sequences[key].count('-')
    print "conserved:",conserved(sequences)
    
def fasta_stats(filename):
    fh = open(filename,'r')
    gap_count = 0
    seq_id = ""
    sequences = defaultdict(default_string)
    for line in fh:
        if line.startswith(">"):
            if gap_count:
                print seq_id, gap_count
            gap_count = 0
            seq_id = line.split()[0][1:]
        else:
            gap_count+=line.count('-')
            sequences[seq_id]+=line.strip()

    print seq_id, gap_count,"\nconserved:",conserved(sequences)
    fh.close() 

def conserved(sequence_dict):
    conserved_count = 0
    for i in range(len(sequence_dict.values()[0])):
        col = set([])
        for seq in sequence_dict.values():
            col.add(seq[i])
        if len(col)== 1:
            conserved_count+=1
    return conserved_count 

if "--help" in sys.argv or len(sys.argv)<2:
    print "usage: python %s <filename>"%sys.argv[0]
elif sys.argv[1].endswith(".aln"):
    aln_stats(sys.argv[1])
elif sys.argv[1].endswith(".fasta"):
    fasta_stats(sys.argv[1])
else:
   print "Only .aln or .fasta files, sorry!"

</source>

HHextract.py

#!/usr/bin/env python
#this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier
#output:
#ID1 CL1 E-Value Identities
#ID2 CL1 E-Value Identities
#ID3 CL1 E-Value Identities
#ID4 CL2 E-Value Identities
#ID4 CL2 E-Value Identities
#..
#
#usage: python hhextract.py <infile> <outfile>
 
import sys, re
try:
    infile = open(sys.argv[1])
except IOError:
    print "Couldn't open input file"
    exit(-1)
try:
    outfile= open(sys.argv[2],'w')
except Exception:
    print "Couldn't open output file"
    exit(-1)
 
for line in infile:
    if line.startswith('>'):
        splits = line.split('|')
        to_find = int(splits[2])
        #check number of clustermembers, skip and print clusterID if confusion
        #if len(splits)!= (4+to_find):
        #   print line[:15]
        #   print to_find, splits
        #   exit(-1)
        line = infile.next()
        splits2 = line.split()
        for member in splits[-to_find:]:
            memberID = member.split()[0].strip()
            if memberID.endswith('.'):
                memberID = memberID[:-1]
            outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1]))
infile.close()
outfile.close()

</source>

getEvalues.pl

#!/usr/bin/perl
use strict;
use DirHandle;

my $dir = $ARGV[0];
my $blast = $ARGV[1];

sub plainfiles {
   my $dir = shift;
   my $dh = DirHandle->new($dir)   or die "can't opendir $dir: $!";
   return sort                     # sort pathnames
         grep {    -f     }       # choose only "plain" files
         map  { "$dir/$_" }       # create full paths
         grep {  !/^\./   }       # filter out dot files
	  grep {  !/\.dat$/}
	  grep {  !/\.pl$/}
	  grep {  !/\.py$/}
	  grep {  !/\.png$/}
          $dh->
read()
;             # read all entries
}

my @files =  plainfiles($dir); 

if($blast eq "blast"){
  for(@files){
    open(READER ,"<",$_);
    open(WRITER ,">",$_.".dat");
    my $eval;
    while(<READER>){
      if(/Expect = (.*?),/){
	 #print "buuuuu";
	 if($1 == 0){
		$eval = "-INF";
	}else{
	 $eval = log($1)/log(10);
	}
	#Identities = 314/440 (71%)
      }elsif(/Identities = .*?\/.*? \((.*?)\%\),/){
	#print "tralaöa";
	print WRITER "$eval 0\.$1\n";
      }elsif(/Results of round/){
	close WRITER; 
	open(WRITER ,">",$_.".dat");
      }
    }
    close READER;
    close WRITER;
  }
}else{
  for(@files){
    open(READER ,"<",$_);
    open(WRITER ,">",$_.".dat");
    while(<READER>){
     
	my @fields = split(" ",$_);
	my $eval;
 	if($fields[2] == 0){
		$eval = "-INF";
	}else{
	 $eval = log($fields[2])/log(10);
	}
	my $ident =$fields[3]; 
	$ident =~ s/\%//;
	$ident = "0."+$ident;  
	print WRITER "$eval $ident\n";
       
    }
    close READER;
    close WRITER;
  }
}

</source>

Sequence based predictions

ss_score.py

#!/usr/bin/env python
#calculates Q3 and SOV for two (already aligned) assignments of structure
#in files containing only the structure (no comment line, no header, mayyybe newlines..)
#usage: python ss_score.py <dssp_file> <prediction_file>

import sys

def main():
    #read files, map DSSP characters to EHL and C to L
    #strip of newlines..
    dssp = open(sys.argv[1]).read().strip().replace("S","L").replace("B","L").replace("I","L").replace("G","H").replace("T","L").replace("\n","")
    pred_tmp = open(sys.argv[2]).read().strip().replace("C","L").replace("\n","")

    #if DSSP has only '-', ignore in prediction as well
    pred = ""
    for i in range(len(dssp)):
        if dssp[i] == "-":
            pred+= "-"
        else:
            pred+=pred_tmp[i]

    q_values = q3(dssp,pred)
    sov_values = sov(dssp,pred)
    print "Q3 =", q_values[0],"\nQ_E =", q_values[1],"\nQ_H =", q_values[2],"\nQ_L =", q_values[3]
    print "SOV =", sov_values[0],"\nSOV_E =", sov_values[1],"\nSOV_H =", sov_values[2],"\nSOV_L =", sov_values[3]

def q3(dssp,pred):
    residues_E = 0
    residues_H = 0
    residues_L = 0
    match_E = 0
    match_H = 0
    match_L = 0
    residues = {'E':residues_E,'H':residues_H,'L':residues_L}
    match = {'E':match_E,'H':match_H,'L':match_L}
    for i in range(len(dssp)):
        if dssp[i] != "-":
            residues[dssp[i]]+=1
            if dssp[i] == pred[i]:
                match[dssp[i]]+=1
    matches = sum(match.values())
    resid = sum(residues.values())
    return (matches*100.0/resid,match['E']*100.0/residues['E'],match['H']*100.0/residues['H'],match['L']*100.0/residues['L'])

def sov(dssp,pred):
    sov = 0
    sov_E = 0
    sov_H = 0
    sov_L = 0
    segment_sum = 0
    for typ in ["E","H","L"]:
        stretch_A = []
        stretch_B = []

        started = False
        start = 0
        
        sov_x = 0
        segment_x = 0
        
        for pairs in [(stretch_A, dssp),(stretch_B, pred)]:
            seq = pairs[1]
            sets = pairs[0]
            for i in range(len(seq)):
                if not started and seq[i] == typ:
                    started = True
                    start = i

                if started and seq[i] != typ:
                    started = False
                    #first position and first position after
                    sets.append((start,i))
            start = 0
        for segment in stretch_A:
            segment_x+=segment[1]-segment[0]
        overlapping_pairs = []
        a = 0
        b = 0
        while a < len(stretch_A) and b < len(stretch_B):
            a_start = stretch_A[a][0]
            a_end = stretch_A[a][1]

            b_start = stretch_B[b][0]
            b_end = stretch_B[b][1]
            if a_start <= b_start:
                if b_start <= a_end:
                    overlapping_pairs.append((stretch_A[a],stretch_B[b]))
                    b+=1
                else:
                    a+=1 
            else:
                if a_start <= b_end:
                    overlapping_pairs.append((stretch_A[a],stretch_B[b]))
                    a+=1
                else:
                    b+=1

        segment_sum+=segment_x

        for pair in overlapping_pairs:
            sov_x+= (pair[0][1]-pair[0][0])*(minov(pair)+delta(pair))/maxov(pair)
        sov+=sov_x
        if typ == 'E':
            sov_E = sov_x*100/segment_x
        elif typ == 'H':
            sov_H = sov_x*100/segment_x
        elif typ =='L':
            sov_L = sov_x*100/segment_x
    return (sov*100/segment_sum,sov_E,sov_H, sov_L)

def minov(pair):
    a = pair[0]
    b = pair[1]
    return float(min(a[1],b[1]) - max(a[0],b[0]))

def maxov(pair):
    a = pair[0]
    b = pair[1]
    return float(max([a[1],b[1]]) - min([a[0],b[0]]))

def delta(pair):
    return float(min([maxov(pair)-minov(pair),minov(pair),(pair[0][1]-pair[0][0])/2,(pair[1][1]-pair[1][0])/2])) 

main()

</source>

Difference between revisions of "Collection of scripts"

Latest revision as of 12:01, 29 August 2012

Contents

Sequence Alignments

PSIBlasttrainer.pl

Sequence Alignment Statistics

HHextract.py

getEvalues.pl

Sequence based predictions

ss_score.py

Navigation menu

Views

Personal tools

Bioinformatik navigation

MediaWiki navigation

Search

Tools

@@ Line 1: / Line 1: @@
 A completely unordered, yet to become truly useful collection of scripts [[Phenylketonuria|we]] used.
+==Sequence Alignments==
-==PSIBlasttrainer.pl==
+===PSIBlasttrainer.pl===
+<source lang="perl">
  #! /usr/bin/perl
  use strict;
@@ Line 54: / Line 56: @@
  close TIME;
+</source>
+===Sequence Alignment Statistics===
+<source lang="python">
-==Sequence Alignments==
  #!/usr/bin/env python
  #simple sript to count gaps and conserved columns in multiple alignment files
@@ Line 115: / Line 118: @@
     print "Only .aln or .fasta files, sorry!"
+</source>
-==HHextract.py==
+===HHextract.py===
+<source lang="python">
  #!/usr/bin/env python
  #this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier
@@ Line 161: / Line 165: @@
  outfile.close()
+</source>
-==getEvalues.pl==
+===getEvalues.pl===
+<source lang="perl">
  #!/usr/bin/perl
  use strict;
@@ Line 235: / Line 242: @@
    }
  }
-==ss_score.py==
+</source>
-    #!/usr/bin/env python
-    #calculates Q3 and SOV for two (already aligned) assignments of structure
+==Sequence based predictions==
-    #in files containing only the structure (no comment line, no header, mayyybe newlines..)
+===ss_score.py===
-    #usage: python ss_score.py <dssp_file> <prediction_file>
+<source lang="python">
+ #!/usr/bin/env python
+ #calculates Q3 and SOV for two (already aligned) assignments of structure
+ #in files containing only the structure (no comment line, no header, mayyybe newlines..)
+ #usage: python ss_score.py <dssp_file> <prediction_file>
-    import sys
+ import sys
-    def main():
+ def main():
-        #read files, map DSSP characters to EHL and C to L
+     #read files, map DSSP characters to EHL and C to L
-        #strip of newlines..
+     #strip of newlines..
-        dssp = open(sys.argv[1]).read().strip().replace("S","L").replace("B","L").replace("I","L").replace("G","H").replace("T","L").replace("\n","")
+     dssp = open(sys.argv[1]).read().strip().replace("S","L").replace("B","L").replace("I","L").replace("G","H").replace("T","L").replace("\n","")
-        pred_tmp = open(sys.argv[2]).read().strip().replace("C","L").replace("\n","")
+     pred_tmp = open(sys.argv[2]).read().strip().replace("C","L").replace("\n","")
-        #if DSSP has only '-', ignore in prediction as well
+     #if DSSP has only '-', ignore in prediction as well
-        pred = ""
+     pred = ""
-        for i in range(len(dssp)):
+     for i in range(len(dssp)):
-            if dssp[i] == "-":
+         if dssp[i] == "-":
-                pred+= "-"
+             pred+= "-"
-            else:
+         else:
-                pred+=pred_tmp[i]
+             pred+=pred_tmp[i]
-        #print pred_tmp
-        #print pred
-        #print dssp
-        print "Q3 =" ,q3(dssp,pred)
+     q_values = q3(dssp,pred)
-        print "SOV =",sov(dssp,pred)
+     sov_values = sov(dssp,pred)
+     print "Q3 =", q_values[0],"\nQ_E =", q_values[1],"\nQ_H =", q_values[2],"\nQ_L =", q_values[3]
+     print "SOV =", sov_values[0],"\nSOV_E =", sov_values[1],"\nSOV_H =", sov_values[2],"\nSOV_L =", sov_values[3]
-    def q3(dssp,pred):
+ def q3(dssp,pred):
-        residues = 0
+     residues_E = 0
-        match = 0
+     residues_H = 0
+     residues_L = 0
-        for i in range(len(dssp)):
-            if dssp[i] != "-":
+     match_E = 0
+     match_H = 0
-                residues+=1
+     match_L = 0
-                if dssp[i] == pred[i]:
+     residues = {'E':residues_E,'H':residues_H,'L':residues_L}
-                    match+=1
+     match = {'E':match_E,'H':match_H,'L':match_L}
-        return match*100.0/residues
+     for i in range(len(dssp)):
+         if dssp[i] != "-":
+             residues[dssp[i]]+=1
+             if dssp[i] == pred[i]:
+                 match[dssp[i]]+=1
+     matches = sum(match.values())
+     resid = sum(residues.values())
+     return (matches*100.0/resid,match['E']*100.0/residues['E'],match['H']*100.0/residues['H'],match['L']*100.0/residues['L'])
-    def sov(dssp,pred):
+ def sov(dssp,pred):
-        sov = 0
+     sov = 0
-        segment_sum = 0
+     sov_E = 0
+     sov_H = 0
-        for typ in ["E","H","L"]:
-            stretch_A = []
+     sov_L = 0
-            stretch_B = []
+     segment_sum = 0
+     for typ in ["E","H","L"]:
+         stretch_A = []
+         stretch_B = []
-            started = False
+         started = False
-            start = 0
+         start = 0
-            sov_x = 0
+         sov_x = 0
+         segment_x = 0
-            for pairs in [(stretch_A, dssp),(stretch_B, pred)]:
-                seq = pairs[1]
+         for pairs in [(stretch_A, dssp),(stretch_B, pred)]:
-                sets = pairs[0]
+             seq = pairs[1]
-                #print seq, sets
+             sets = pairs[0]
-                for i in range(len(seq)):
+             for i in range(len(seq)):
-                    if not started and seq[i] == typ:
+                 if not started and seq[i] == typ:
-                        started = True
+                     started = True
-                        start = i
+                     start = i
-                    if started and seq[i] != typ:
+                 if started and seq[i] != typ:
-                        started = False
+                     started = False
-                        #first position and first position after
+                     #first position and first position after
-                        sets.append((start,i))
+                     sets.append((start,i))
-                start = 0
+             start = 0
-            #print stretch_A
+         for segment in stretch_A:
-            #print stretch_B
+             segment_x+=segment[1]-segment[0]
-            overlapping_pairs = []
+         overlapping_pairs = []
-            a = 0
+         a = 0
-            b = 0
+         b = 0
-            while a < len(stretch_A) and b < len(stretch_B):
+         while a < len(stretch_A) and b < len(stretch_B):
-                a_start = stretch_A[a][0]
+             a_start = stretch_A[a][0]
-                a_end = stretch_A[a][1]
+             a_end = stretch_A[a][1]
-                b_start = stretch_B[b][0]
+             b_start = stretch_B[b][0]
-                b_end = stretch_B[b][1]
+             b_end = stretch_B[b][1]
-                #print "examining",stretch_A[a],stretch_B[b]
+             if a_start <= b_start:
-                if a_start <= b_start:
+                 if b_start <= a_end:
-                    if b_start <= a_end:
+                     overlapping_pairs.append((stretch_A[a],stretch_B[b]))
-                        overlapping_pairs.append((stretch_A[a],stretch_B[b]))
+                     b+=1
-                        segment_sum+=stretch_A[a][1]-stretch_A[a][0]
+                 else:
-                        b+=1
+                     a+=1
-                    else:
+             else:
-                        a+=1
+                 if a_start <= b_end:
-                else:
+                     overlapping_pairs.append((stretch_A[a],stretch_B[b]))
-                    if a_start <= b_end:
+                     a+=1
-                        overlapping_pairs.append((stretch_A[a],stretch_B[b]))
+                 else:
-                        segment_sum+=stretch_A[a][1]-stretch_A[a][0]
+                     b+=1
-                        a+=1
-                    else:
-                        b+=1
-            for pair in overlapping_pairs:
-                sov_x+= (pair[0][1]-pair[0][0])*(minov(pair)+delta(pair))/maxov(pair)
-            sov+=sov_x
-        return sov*100/segment_sum
+         segment_sum+=segment_x
-    def minov(pair):
-        a = pair[0]
-        b = pair[1]
-        return float(min(a[1],b[1]) - max(a[0],b[0]))
-    def maxov(pair):
+         for pair in overlapping_pairs:
-        a = pair[0]
+             sov_x+= (pair[0][1]-pair[0][0])*(minov(pair)+delta(pair))/maxov(pair)
-        b = pair[1]
+         sov+=sov_x
+         if typ == 'E':
-        return float(max([a[1],b[1]]) - min([a[0],b[0]]))
+             sov_E = sov_x*100/segment_x
+         elif typ == 'H':
+             sov_H = sov_x*100/segment_x
+         elif typ =='L':
+             sov_L = sov_x*100/segment_x
+     return (sov*100/segment_sum,sov_E,sov_H, sov_L)
-    def delta(pair):
+ def minov(pair):
+     a = pair[0]
-        return float(min([maxov(pair)-minov(pair),minov(pair),(pair[0][1]-pair[0][0])/2,(pair[1][1]-pair[1][0])/2]))
-    main()
+     b = pair[1]
+     return float(min(a[1],b[1]) - max(a[0],b[0]))
+ def maxov(pair):
+     a = pair[0]
+     b = pair[1]
+     return float(max([a[1],b[1]]) - min([a[0],b[0]]))
+ def delta(pair):
+     return float(min([maxov(pair)-minov(pair),minov(pair),(pair[0][1]-pair[0][0])/2,(pair[1][1]-pair[1][0])/2]))
+ main()
+</source>
+[[Category: Phenylketonuria 2012]]