Difference between revisions of "Collection of scripts"
From Bioinformatikpedia
(→ss_score.py) |
|||
(12 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | A completely unordered, yet to become truly |
+ | A completely unordered, yet to become truly useful collection of scripts [[Phenylketonuria|we]] used. |
+ | ==Sequence Alignments== |
||
− | ==PSIBlasttrainer.pl== |
||
+ | ===PSIBlasttrainer.pl=== |
||
+ | <source lang="perl"> |
||
#! /usr/bin/perl |
#! /usr/bin/perl |
||
use strict; |
use strict; |
||
Line 54: | Line 56: | ||
close TIME; |
close TIME; |
||
+ | </source> |
||
+ | ===Sequence Alignment Statistics=== |
||
− | |||
+ | <source lang="python"> |
||
− | ==Sequence Alignments== |
||
#!/usr/bin/env python |
#!/usr/bin/env python |
||
#simple sript to count gaps and conserved columns in multiple alignment files |
#simple sript to count gaps and conserved columns in multiple alignment files |
||
Line 114: | Line 117: | ||
else: |
else: |
||
print "Only .aln or .fasta files, sorry!" |
print "Only .aln or .fasta files, sorry!" |
||
+ | |||
+ | </source> |
||
+ | |||
+ | ===HHextract.py=== |
||
+ | <source lang="python"> |
||
+ | #!/usr/bin/env python |
||
+ | #this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier |
||
+ | #output: |
||
+ | #ID1 CL1 E-Value Identities |
||
+ | #ID2 CL1 E-Value Identities |
||
+ | #ID3 CL1 E-Value Identities |
||
+ | #ID4 CL2 E-Value Identities |
||
+ | #ID4 CL2 E-Value Identities |
||
+ | #.. |
||
+ | # |
||
+ | #usage: python hhextract.py <infile> <outfile> |
||
+ | |||
+ | import sys, re |
||
+ | try: |
||
+ | infile = open(sys.argv[1]) |
||
+ | except IOError: |
||
+ | print "Couldn't open input file" |
||
+ | exit(-1) |
||
+ | try: |
||
+ | outfile= open(sys.argv[2],'w') |
||
+ | except Exception: |
||
+ | print "Couldn't open output file" |
||
+ | exit(-1) |
||
+ | |||
+ | for line in infile: |
||
+ | if line.startswith('>'): |
||
+ | splits = line.split('|') |
||
+ | to_find = int(splits[2]) |
||
+ | #check number of clustermembers, skip and print clusterID if confusion |
||
+ | #if len(splits)!= (4+to_find): |
||
+ | # print line[:15] |
||
+ | # print to_find, splits |
||
+ | # exit(-1) |
||
+ | line = infile.next() |
||
+ | splits2 = line.split() |
||
+ | for member in splits[-to_find:]: |
||
+ | memberID = member.split()[0].strip() |
||
+ | if memberID.endswith('.'): |
||
+ | memberID = memberID[:-1] |
||
+ | outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1])) |
||
+ | infile.close() |
||
+ | outfile.close() |
||
+ | |||
+ | </source> |
||
+ | |||
+ | ===getEvalues.pl=== |
||
+ | <source lang="perl"> |
||
+ | #!/usr/bin/perl |
||
+ | use strict; |
||
+ | use DirHandle; |
||
+ | |||
+ | my $dir = $ARGV[0]; |
||
+ | my $blast = $ARGV[1]; |
||
+ | |||
+ | sub plainfiles { |
||
+ | my $dir = shift; |
||
+ | my $dh = DirHandle->new($dir) or die "can't opendir $dir: $!"; |
||
+ | return sort # sort pathnames |
||
+ | grep { -f } # choose only "plain" files |
||
+ | map { "$dir/$_" } # create full paths |
||
+ | grep { !/^\./ } # filter out dot files |
||
+ | grep { !/\.dat$/} |
||
+ | grep { !/\.pl$/} |
||
+ | grep { !/\.py$/} |
||
+ | grep { !/\.png$/} |
||
+ | $dh-> |
||
+ | read() |
||
+ | ; # read all entries |
||
+ | } |
||
+ | |||
+ | my @files = plainfiles($dir); |
||
+ | |||
+ | if($blast eq "blast"){ |
||
+ | for(@files){ |
||
+ | open(READER ,"<",$_); |
||
+ | open(WRITER ,">",$_.".dat"); |
||
+ | my $eval; |
||
+ | while(<READER>){ |
||
+ | if(/Expect = (.*?),/){ |
||
+ | #print "buuuuu"; |
||
+ | if($1 == 0){ |
||
+ | $eval = "-INF"; |
||
+ | }else{ |
||
+ | $eval = log($1)/log(10); |
||
+ | } |
||
+ | #Identities = 314/440 (71%) |
||
+ | }elsif(/Identities = .*?\/.*? \((.*?)\%\),/){ |
||
+ | #print "tralaöa"; |
||
+ | print WRITER "$eval 0\.$1\n"; |
||
+ | }elsif(/Results of round/){ |
||
+ | close WRITER; |
||
+ | open(WRITER ,">",$_.".dat"); |
||
+ | } |
||
+ | } |
||
+ | close READER; |
||
+ | close WRITER; |
||
+ | } |
||
+ | }else{ |
||
+ | for(@files){ |
||
+ | open(READER ,"<",$_); |
||
+ | open(WRITER ,">",$_.".dat"); |
||
+ | while(<READER>){ |
||
+ | |||
+ | my @fields = split(" ",$_); |
||
+ | my $eval; |
||
+ | if($fields[2] == 0){ |
||
+ | $eval = "-INF"; |
||
+ | }else{ |
||
+ | $eval = log($fields[2])/log(10); |
||
+ | } |
||
+ | my $ident =$fields[3]; |
||
+ | $ident =~ s/\%//; |
||
+ | $ident = "0."+$ident; |
||
+ | print WRITER "$eval $ident\n"; |
||
+ | |||
+ | } |
||
+ | close READER; |
||
+ | close WRITER; |
||
+ | } |
||
+ | } |
||
+ | |||
+ | </source> |
||
+ | |||
+ | ==Sequence based predictions== |
||
+ | ===ss_score.py=== |
||
+ | <source lang="python"> |
||
+ | #!/usr/bin/env python |
||
+ | #calculates Q3 and SOV for two (already aligned) assignments of structure |
||
+ | #in files containing only the structure (no comment line, no header, mayyybe newlines..) |
||
+ | #usage: python ss_score.py <dssp_file> <prediction_file> |
||
+ | |||
+ | import sys |
||
+ | |||
+ | def main(): |
||
+ | #read files, map DSSP characters to EHL and C to L |
||
+ | #strip of newlines.. |
||
+ | dssp = open(sys.argv[1]).read().strip().replace("S","L").replace("B","L").replace("I","L").replace("G","H").replace("T","L").replace("\n","") |
||
+ | pred_tmp = open(sys.argv[2]).read().strip().replace("C","L").replace("\n","") |
||
+ | |||
+ | #if DSSP has only '-', ignore in prediction as well |
||
+ | pred = "" |
||
+ | for i in range(len(dssp)): |
||
+ | if dssp[i] == "-": |
||
+ | pred+= "-" |
||
+ | else: |
||
+ | pred+=pred_tmp[i] |
||
+ | |||
+ | q_values = q3(dssp,pred) |
||
+ | sov_values = sov(dssp,pred) |
||
+ | print "Q3 =", q_values[0],"\nQ_E =", q_values[1],"\nQ_H =", q_values[2],"\nQ_L =", q_values[3] |
||
+ | print "SOV =", sov_values[0],"\nSOV_E =", sov_values[1],"\nSOV_H =", sov_values[2],"\nSOV_L =", sov_values[3] |
||
+ | |||
+ | def q3(dssp,pred): |
||
+ | residues_E = 0 |
||
+ | residues_H = 0 |
||
+ | residues_L = 0 |
||
+ | match_E = 0 |
||
+ | match_H = 0 |
||
+ | match_L = 0 |
||
+ | residues = {'E':residues_E,'H':residues_H,'L':residues_L} |
||
+ | match = {'E':match_E,'H':match_H,'L':match_L} |
||
+ | for i in range(len(dssp)): |
||
+ | if dssp[i] != "-": |
||
+ | residues[dssp[i]]+=1 |
||
+ | if dssp[i] == pred[i]: |
||
+ | match[dssp[i]]+=1 |
||
+ | matches = sum(match.values()) |
||
+ | resid = sum(residues.values()) |
||
+ | return (matches*100.0/resid,match['E']*100.0/residues['E'],match['H']*100.0/residues['H'],match['L']*100.0/residues['L']) |
||
+ | |||
+ | def sov(dssp,pred): |
||
+ | sov = 0 |
||
+ | sov_E = 0 |
||
+ | sov_H = 0 |
||
+ | sov_L = 0 |
||
+ | segment_sum = 0 |
||
+ | for typ in ["E","H","L"]: |
||
+ | stretch_A = [] |
||
+ | stretch_B = [] |
||
+ | |||
+ | started = False |
||
+ | start = 0 |
||
+ | |||
+ | sov_x = 0 |
||
+ | segment_x = 0 |
||
+ | |||
+ | for pairs in [(stretch_A, dssp),(stretch_B, pred)]: |
||
+ | seq = pairs[1] |
||
+ | sets = pairs[0] |
||
+ | for i in range(len(seq)): |
||
+ | if not started and seq[i] == typ: |
||
+ | started = True |
||
+ | start = i |
||
+ | |||
+ | if started and seq[i] != typ: |
||
+ | started = False |
||
+ | #first position and first position after |
||
+ | sets.append((start,i)) |
||
+ | start = 0 |
||
+ | for segment in stretch_A: |
||
+ | segment_x+=segment[1]-segment[0] |
||
+ | overlapping_pairs = [] |
||
+ | a = 0 |
||
+ | b = 0 |
||
+ | while a < len(stretch_A) and b < len(stretch_B): |
||
+ | a_start = stretch_A[a][0] |
||
+ | a_end = stretch_A[a][1] |
||
+ | |||
+ | b_start = stretch_B[b][0] |
||
+ | b_end = stretch_B[b][1] |
||
+ | if a_start <= b_start: |
||
+ | if b_start <= a_end: |
||
+ | overlapping_pairs.append((stretch_A[a],stretch_B[b])) |
||
+ | b+=1 |
||
+ | else: |
||
+ | a+=1 |
||
+ | else: |
||
+ | if a_start <= b_end: |
||
+ | overlapping_pairs.append((stretch_A[a],stretch_B[b])) |
||
+ | a+=1 |
||
+ | else: |
||
+ | b+=1 |
||
+ | |||
+ | segment_sum+=segment_x |
||
+ | |||
+ | for pair in overlapping_pairs: |
||
+ | sov_x+= (pair[0][1]-pair[0][0])*(minov(pair)+delta(pair))/maxov(pair) |
||
+ | sov+=sov_x |
||
+ | if typ == 'E': |
||
+ | sov_E = sov_x*100/segment_x |
||
+ | elif typ == 'H': |
||
+ | sov_H = sov_x*100/segment_x |
||
+ | elif typ =='L': |
||
+ | sov_L = sov_x*100/segment_x |
||
+ | return (sov*100/segment_sum,sov_E,sov_H, sov_L) |
||
+ | |||
+ | def minov(pair): |
||
+ | a = pair[0] |
||
+ | b = pair[1] |
||
+ | return float(min(a[1],b[1]) - max(a[0],b[0])) |
||
+ | |||
+ | def maxov(pair): |
||
+ | a = pair[0] |
||
+ | b = pair[1] |
||
+ | return float(max([a[1],b[1]]) - min([a[0],b[0]])) |
||
+ | |||
+ | def delta(pair): |
||
+ | return float(min([maxov(pair)-minov(pair),minov(pair),(pair[0][1]-pair[0][0])/2,(pair[1][1]-pair[1][0])/2])) |
||
+ | |||
+ | main() |
||
+ | </source> |
||
+ | |||
+ | [[Category: Phenylketonuria 2012]] |
Latest revision as of 12:01, 29 August 2012
A completely unordered, yet to become truly useful collection of scripts we used.
Contents
Sequence Alignments
PSIBlasttrainer.pl
<source lang="perl">
#! /usr/bin/perl use strict; use Getopt::Long; my $input; my $output; my $database; my $maxIt; my $before; my $after; my $result = GetOptions ("i=s" => \$input,"o=s" => \$output,"d=s" => \$database,"j=i" => \$maxIt); open(TIME ,">","time.txt"); print TIME "\# time used for psiblast\n"; #run psiblast each time and save the time; for(my $i =1;$i<=$maxIt;$i++){ print "iteration: $i\n"; $before = time(); open STATUS, "-|","time blastpgp -j $i -d $database -i $input -o $output"."_".$i."_it" or die "can't fork: $!"; while(<STATUS>){ print TIME $_; } $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">"; for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_eVal_" . "$eval -e $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } print TIME ">\n" for(my $i=1;$i<$maxIt;$i++){ my $eval = 1/(10**$i); print "iteration: $i\n"; $before = time(); system("time blastpgp -j 1 -d $database -i $input -o $output" . "_integr_" . "$eval -h $eval"); $after = time(); print TIME ($after-$before)."\n"; close STATUS; } close TIME;
</source>
Sequence Alignment Statistics
<source lang="python">
#!/usr/bin/env python #simple sript to count gaps and conserved columns in multiple alignment files import sys from collections import defaultdict def default_string(): return "" def aln_stats(filename): fh = open(filename,'r') sequences = defaultdict(default_string) for line in fh: if line[:2].strip() and not line.startswith("CLUSTAL"): seq = line.split() sequences[seq[0]]+="".join(seq[1:]) fh.close() for key in sequences: print key, sequences[key].count('-') print "conserved:",conserved(sequences) def fasta_stats(filename): fh = open(filename,'r') gap_count = 0 seq_id = "" sequences = defaultdict(default_string) for line in fh: if line.startswith(">"): if gap_count: print seq_id, gap_count gap_count = 0 seq_id = line.split()[0][1:] else: gap_count+=line.count('-') sequences[seq_id]+=line.strip() print seq_id, gap_count,"\nconserved:",conserved(sequences) fh.close() def conserved(sequence_dict): conserved_count = 0 for i in range(len(sequence_dict.values()[0])): col = set([]) for seq in sequence_dict.values(): col.add(seq[i]) if len(col)== 1: conserved_count+=1 return conserved_count if "--help" in sys.argv or len(sys.argv)<2: print "usage: python %s <filename>"%sys.argv[0] elif sys.argv[1].endswith(".aln"): aln_stats(sys.argv[1]) elif sys.argv[1].endswith(".fasta"): fasta_stats(sys.argv[1]) else: print "Only .aln or .fasta files, sorry!"
</source>
HHextract.py
<source lang="python">
#!/usr/bin/env python #this simply takes output of HHblits, extracts the IDs of cluster members and adds a clusteridentifier #output: #ID1 CL1 E-Value Identities #ID2 CL1 E-Value Identities #ID3 CL1 E-Value Identities #ID4 CL2 E-Value Identities #ID4 CL2 E-Value Identities #.. # #usage: python hhextract.py <infile> <outfile> import sys, re try: infile = open(sys.argv[1]) except IOError: print "Couldn't open input file" exit(-1) try: outfile= open(sys.argv[2],'w') except Exception: print "Couldn't open output file" exit(-1) for line in infile: if line.startswith('>'): splits = line.split('|') to_find = int(splits[2]) #check number of clustermembers, skip and print clusterID if confusion #if len(splits)!= (4+to_find): # print line[:15] # print to_find, splits # exit(-1) line = infile.next() splits2 = line.split() for member in splits[-to_find:]: memberID = member.split()[0].strip() if memberID.endswith('.'): memberID = memberID[:-1] outfile.write("%s %s %s %s\n"%(memberID, splits[1], splits2[1].split('=')[1], splits2[4].split('=')[1])) infile.close() outfile.close()
</source>
getEvalues.pl
<source lang="perl">
#!/usr/bin/perl use strict; use DirHandle; my $dir = $ARGV[0]; my $blast = $ARGV[1]; sub plainfiles { my $dir = shift; my $dh = DirHandle->new($dir) or die "can't opendir $dir: $!"; return sort # sort pathnames grep { -f } # choose only "plain" files map { "$dir/$_" } # create full paths grep { !/^\./ } # filter out dot files grep { !/\.dat$/} grep { !/\.pl$/} grep { !/\.py$/} grep { !/\.png$/} $dh-> read() ; # read all entries } my @files = plainfiles($dir); if($blast eq "blast"){ for(@files){ open(READER ,"<",$_); open(WRITER ,">",$_.".dat"); my $eval; while(<READER>){ if(/Expect = (.*?),/){ #print "buuuuu"; if($1 == 0){ $eval = "-INF"; }else{ $eval = log($1)/log(10); } #Identities = 314/440 (71%) }elsif(/Identities = .*?\/.*? \((.*?)\%\),/){ #print "tralaöa"; print WRITER "$eval 0\.$1\n"; }elsif(/Results of round/){ close WRITER; open(WRITER ,">",$_.".dat"); } } close READER; close WRITER; } }else{ for(@files){ open(READER ,"<",$_); open(WRITER ,">",$_.".dat"); while(<READER>){ my @fields = split(" ",$_); my $eval; if($fields[2] == 0){ $eval = "-INF"; }else{ $eval = log($fields[2])/log(10); } my $ident =$fields[3]; $ident =~ s/\%//; $ident = "0."+$ident; print WRITER "$eval $ident\n"; } close READER; close WRITER; } }
</source>
Sequence based predictions
ss_score.py
<source lang="python">
#!/usr/bin/env python #calculates Q3 and SOV for two (already aligned) assignments of structure #in files containing only the structure (no comment line, no header, mayyybe newlines..) #usage: python ss_score.py <dssp_file> <prediction_file> import sys def main(): #read files, map DSSP characters to EHL and C to L #strip of newlines.. dssp = open(sys.argv[1]).read().strip().replace("S","L").replace("B","L").replace("I","L").replace("G","H").replace("T","L").replace("\n","") pred_tmp = open(sys.argv[2]).read().strip().replace("C","L").replace("\n","") #if DSSP has only '-', ignore in prediction as well pred = "" for i in range(len(dssp)): if dssp[i] == "-": pred+= "-" else: pred+=pred_tmp[i] q_values = q3(dssp,pred) sov_values = sov(dssp,pred) print "Q3 =", q_values[0],"\nQ_E =", q_values[1],"\nQ_H =", q_values[2],"\nQ_L =", q_values[3] print "SOV =", sov_values[0],"\nSOV_E =", sov_values[1],"\nSOV_H =", sov_values[2],"\nSOV_L =", sov_values[3] def q3(dssp,pred): residues_E = 0 residues_H = 0 residues_L = 0 match_E = 0 match_H = 0 match_L = 0 residues = {'E':residues_E,'H':residues_H,'L':residues_L} match = {'E':match_E,'H':match_H,'L':match_L} for i in range(len(dssp)): if dssp[i] != "-": residues[dssp[i]]+=1 if dssp[i] == pred[i]: match[dssp[i]]+=1 matches = sum(match.values()) resid = sum(residues.values()) return (matches*100.0/resid,match['E']*100.0/residues['E'],match['H']*100.0/residues['H'],match['L']*100.0/residues['L']) def sov(dssp,pred): sov = 0 sov_E = 0 sov_H = 0 sov_L = 0 segment_sum = 0 for typ in ["E","H","L"]: stretch_A = [] stretch_B = [] started = False start = 0 sov_x = 0 segment_x = 0 for pairs in [(stretch_A, dssp),(stretch_B, pred)]: seq = pairs[1] sets = pairs[0] for i in range(len(seq)): if not started and seq[i] == typ: started = True start = i if started and seq[i] != typ: started = False #first position and first position after sets.append((start,i)) start = 0 for segment in stretch_A: segment_x+=segment[1]-segment[0] overlapping_pairs = [] a = 0 b = 0 while a < len(stretch_A) and b < len(stretch_B): a_start = stretch_A[a][0] a_end = stretch_A[a][1] b_start = stretch_B[b][0] b_end = stretch_B[b][1] if a_start <= b_start: if b_start <= a_end: overlapping_pairs.append((stretch_A[a],stretch_B[b])) b+=1 else: a+=1 else: if a_start <= b_end: overlapping_pairs.append((stretch_A[a],stretch_B[b])) a+=1 else: b+=1 segment_sum+=segment_x for pair in overlapping_pairs: sov_x+= (pair[0][1]-pair[0][0])*(minov(pair)+delta(pair))/maxov(pair) sov+=sov_x if typ == 'E': sov_E = sov_x*100/segment_x elif typ == 'H': sov_H = sov_x*100/segment_x elif typ =='L': sov_L = sov_x*100/segment_x return (sov*100/segment_sum,sov_E,sov_H, sov_L) def minov(pair): a = pair[0] b = pair[1] return float(min(a[1],b[1]) - max(a[0],b[0])) def maxov(pair): a = pair[0] b = pair[1] return float(max([a[1],b[1]]) - min([a[0],b[0]])) def delta(pair): return float(min([maxov(pair)-minov(pair),minov(pair),(pair[0][1]-pair[0][0])/2,(pair[1][1]-pair[1][0])/2])) main()
</source>