Collection of scripts

From Bioinformatikpedia
Revision as of 17:09, 6 May 2012 by Boidolj (talk | contribs) (Created page with "A completely unordered, yet to become truly usefull collection of scripts we used. ==Sequence Alignments== #!/usr/bin/env python #simple sript to count gap…")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

A completely unordered, yet to become truly usefull collection of scripts we used.

Sequence Alignments

#!/usr/bin/env python
#simple sript to count gaps and conserved columns in multiple alignment files
import sys
from collections import defaultdict

def default_string():
    return ""

def aln_stats(filename):
    fh = open(filename,'r')
    sequences = defaultdict(default_string)
    for line in fh:
        if line[:2].strip() and not line.startswith("CLUSTAL"):
            seq = line.split()
            sequences[seq[0]]+="".join(seq[1:])
    fh.close()

    for key in sequences:
        print key, sequences[key].count('-')
    print "conserved:",conserved(sequences)
    
def fasta_stats(filename):
    fh = open(filename,'r')
    gap_count = 0
    seq_id = ""
    sequences = defaultdict(default_string)
    for line in fh:
        if line.startswith(">"):
            if gap_count:
                print seq_id, gap_count
            gap_count = 0
            seq_id = line.split()[0][1:]
        else:
            gap_count+=line.count('-')
            sequences[seq_id]+=line.strip()

    print seq_id, gap_count,"\nconserved:",conserved(sequences)
    fh.close() 

def conserved(sequence_dict):
    conserved_count = 0
    for i in range(len(sequence_dict.values()[0])):
        col = set([])
        for seq in sequence_dict.values():
            col.add(seq[i])
        if len(col)== 1:
            conserved_count+=1
    return conserved_count 

if "--help" in sys.argv or len(sys.argv)<2:
    print "usage: python %s <filename>"%sys.argv[0]
elif sys.argv[1].endswith(".aln"):
    aln_stats(sys.argv[1])
elif sys.argv[1].endswith(".fasta"):
    fasta_stats(sys.argv[1])
else:
   print "Only .aln or .fasta files, sorry!"