Difference between revisions of "Task 2 lab journal (MSUD)"

From Bioinformatikpedia
(Created page with "== Sequence searches == == Multiple sequence alignments ==")
 
(Multiple sequence alignments)
Line 2: Line 2:
   
 
== Multiple sequence alignments ==
 
== Multiple sequence alignments ==
  +
  +
=== Dataset creation ===
  +
For creating datasets with low, high and whole range sequence identity, the following Python script was used:
  +
  +
  +
  +
<source lang="python">
  +
'''
  +
Use blast output to create datasets with different sequence identities.
  +
  +
Call with:
  +
python create_dataset.py <query fasta file> <blast xml output> <database fasta file>
  +
  +
@author: Laura Schiller
  +
'''
  +
  +
import sys
  +
from Bio import SeqIO, pairwise2
  +
from Bio.Blast import NCBIXML
  +
from Bio.SubsMat import MatrixInfo
  +
  +
def get_sequences(query, blast_xml, db_path, out_file):
  +
'''
  +
Fetch full length sequences for a BLAST result and write them in a FASTA file.
  +
  +
@param query: fasta file with query sequence.
  +
@param blast_xml: xml file with BLAST search result.
  +
@param db_path: path of db fasta file.
  +
@param out_file: file to store the sequences.
  +
@return: name of the file where the sequences are stored.
  +
'''
  +
  +
print("get all sequences for blast result in %s" % blast_xml)
  +
query_sequence = SeqIO.read(query, "fasta")
  +
hit_seqs = [query_sequence]
  +
  +
blast_output = open(blast_xml)
  +
blast_result = NCBIXML.read(blast_output)
  +
blast_output.close()
  +
  +
hit_list = []
  +
for alignment in blast_result.alignments:
  +
hit_list.append(alignment.title.split(" ")[1]) # the id of the sequence
  +
  +
# get all blast hits
  +
seqs_db = SeqIO.parse(db_path, "fasta")
  +
counter = 0
  +
number = len(blast_result.alignments)
  +
for seq in seqs_db:
  +
if seq.id in hit_list:
  +
hit_seqs.append(seq)
  +
counter = counter + 1
  +
if (counter % 100) == 0:
  +
print("%d of %d sequences found" % (counter, number))
  +
if counter == number:
  +
break
  +
print("%d of %d sequences found" % (counter, len(blast_result.alignments)))
  +
  +
SeqIO.write(hit_seqs, out_file, "fasta")
  +
  +
print("sequences saved in %s" % out_file)
  +
return out_file
  +
  +
def filter_seqs(seq_file, name):
  +
'''
  +
Filter sequences according to sequence identity limits.
  +
  +
@param seq_file: fasta file with sequences.
  +
@param name: string used for output file names.
  +
'''
  +
  +
sequences = SeqIO.parse(seq_file, "fasta")
  +
seqs = []
  +
for seq in sequences:
  +
seqs.append(seq)
  +
query = seqs.pop(0) # always keep query (first sequence)
  +
# lists for low / high / whole range sequence identity
  +
filtered = [[query], [query], [query]]
  +
  +
# identify hits with pdb structures -> these are preferentially taken
  +
hits_pdb = [hit for hit in seqs if (hit.id.split("|")[0] == "pdb")]
  +
seqs = [seq for seq in seqs if not (seq.id in [pdb_seq.id for pdb_seq in hits_pdb])]
  +
hits_pdb.extend(seqs) # now pdb hits are at the beginning
  +
  +
print("filter sequences")
  +
for seq in hits_pdb:
  +
try:
  +
ident = identity(query, seq)
  +
except KeyError: # raises if there is a non amino acid letter
  +
continue
  +
if (len(filtered[0]) < 10):
  +
keep = True
  +
for seq2 in filtered[0]:
  +
try:
  +
ident2 = identity(seq, seq2)
  +
except KeyError:
  +
keep = False
  +
continue
  +
if ident2 >= 0.3:
  +
keep = False
  +
break
  +
if keep:
  +
filtered[0].append(seq)
  +
if (len(filtered[1]) < 10) and (ident > 0.6):
  +
filtered[1].append(seq)
  +
if (len(filtered[2]) < 8) and (ident >= 0.3) and (ident <= 0.6):
  +
filtered[2].append(seq)
  +
if (len(filtered[0]) == 10) and (len(filtered[1]) == 10) and (len(filtered[2]) == 8):
  +
break
  +
  +
# for whole range take a part of low and a part of high sequence identity
  +
# plus the rest middle sequence identity
  +
filtered[2].extend(filtered[0][1:min(len(filtered[0]), 7)])
  +
filtered[2].extend(filtered[1][1:min(len(filtered[1]), 7)])
  +
  +
SeqIO.write(filtered[0], name + "_low_seq_ident.fasta", "fasta")
  +
SeqIO.write(filtered[1], name + "_high_seq_ident.fasta", "fasta")
  +
SeqIO.write(filtered[2], name + "_whole_range_seq_ident.fasta", "fasta")
  +
  +
print("sequences with low / high / whole range sequence identity saved in:")
  +
print(name + "_low_seq_ident.fasta")
  +
print(name + "_high_seq_ident.fasta")
  +
print(name + "_whole_range_seq_ident.fasta")
  +
  +
def identity(seq1, seq2):
  +
#pairwise alignment
  +
matrix = MatrixInfo.blosum62
  +
gap_open = -10
  +
gap_extend = -0.5
  +
alignment = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
  +
seq1_aligned = alignment[0][0]
  +
seq2_aligned = alignment[0][1]
  +
  +
#sequence identity
  +
ident = sum(c1 == c2 for c1, c2 in zip(seq1_aligned, seq2_aligned))
  +
ref_length = (len(seq1) + len(seq2)) / 2 # mean length
  +
return float(ident) / ref_length
  +
  +
  +
if __name__ == '__main__':
  +
namestring = sys.argv[1].split("/")[-1].split(".")[0] # used as beginning of the output files
  +
print("-----------------------------------------------------------")
  +
all_seqs = get_sequences(sys.argv[1], sys.argv[2], sys.argv[3], namestring + "_all_sequences.fasta")
  +
filter_seqs(all_seqs, namestring)
  +
</source>
  +
  +
  +
  +
=== Calling MSA programs ===
  +
  +
Call of T-Coffee:
  +
  +
<pre>
  +
#!/bin/bash
  +
  +
proteins=( BCKDHA BCKDHB DBT DLD )
  +
identities=( low high whole_range )
  +
  +
for protein in ${proteins[*]}
  +
do
  +
for identity in ${identities[*]}
  +
do
  +
t_coffee -output fasta -infile ${protein}_${identity}_seq_ident.fasta -outfile ${protein}_${identity}_seq_ident_tcoffee.fasta
  +
done
  +
done
  +
</pre>
  +
  +
Muscle:
  +
  +
<pre>
  +
#!/bin/bash
  +
  +
proteins=( BCKDHA BCKDHB DBT DLD )
  +
identities=( low high whole_range )
  +
  +
for protein in ${proteins[*]}
  +
do
  +
for identity in ${identities[*]}
  +
do
  +
muscle -in ${protein}_${identity}_seq_ident.fasta -out ${protein}_${identity}_seq_ident_muscle.fasta
  +
done
  +
done
  +
</pre>
  +
  +
Mafft:
  +
  +
<pre>
  +
#!/bin/bash
  +
  +
proteins=( BCKDHA BCKDHB DBT DLD )
  +
identities=( low high whole_range )
  +
  +
for protein in ${proteins[*]}
  +
do
  +
for identity in ${identities[*]}
  +
do
  +
mafft ${protein}_${identity}_seq_ident.fasta > ${protein}_${identity}_seq_ident_mafft.fasta
  +
done
  +
done
  +
</pre>

Revision as of 10:44, 4 May 2013

Sequence searches

Multiple sequence alignments

Dataset creation

For creating datasets with low, high and whole range sequence identity, the following Python script was used:


<source lang="python"> Use blast output to create datasets with different sequence identities.

Call with: python create_dataset.py <query fasta file> <blast xml output> <database fasta file>

@author: Laura Schiller

import sys from Bio import SeqIO, pairwise2 from Bio.Blast import NCBIXML from Bio.SubsMat import MatrixInfo

def get_sequences(query, blast_xml, db_path, out_file):

   
   Fetch full length sequences for a BLAST result and write them in a FASTA file.
   
   @param query:     fasta file with query sequence.
   @param blast_xml: xml file with BLAST search result.
   @param db_path:   path of db fasta file.
   @param out_file:  file to store the sequences.
   @return:          name of the file where the sequences are stored.
   
   
   print("get all sequences for blast result in %s" % blast_xml)
   query_sequence = SeqIO.read(query, "fasta")
   hit_seqs = [query_sequence]
   
   blast_output = open(blast_xml)
   blast_result = NCBIXML.read(blast_output)
   blast_output.close()
   
   hit_list = []
   for alignment in blast_result.alignments:
       hit_list.append(alignment.title.split(" ")[1]) # the id of the sequence    
   
   # get all blast hits
   seqs_db = SeqIO.parse(db_path, "fasta")
   counter = 0
   number = len(blast_result.alignments)
   for seq in seqs_db:
       if seq.id in hit_list:
           hit_seqs.append(seq)
           counter = counter + 1
           if (counter % 100) == 0:
               print("%d of %d sequences found" % (counter, number))
           if counter == number:
               break
   print("%d of %d sequences found" % (counter, len(blast_result.alignments)))        
   
   SeqIO.write(hit_seqs, out_file, "fasta")
   
   print("sequences saved in %s" % out_file)
   return out_file

def filter_seqs(seq_file, name):

   
   Filter sequences according to sequence identity limits.
   
   @param seq_file: fasta file with sequences.
   @param name:     string used for output file names.
   
   
   sequences = SeqIO.parse(seq_file, "fasta")
   seqs = []
   for seq in sequences:
       seqs.append(seq)
   query = seqs.pop(0) # always keep query (first sequence)
   # lists for low / high / whole range sequence identity
   filtered = [[query], [query], [query]]
   
   # identify hits with pdb structures -> these are preferentially taken
   hits_pdb = [hit for hit in seqs if (hit.id.split("|")[0] == "pdb")]
   seqs = [seq for seq in seqs if not (seq.id in [pdb_seq.id for pdb_seq in hits_pdb])]
   hits_pdb.extend(seqs) # now pdb hits are at the beginning
   
   print("filter sequences")
   for seq in hits_pdb:           
       try: 
           ident = identity(query, seq) 
       except KeyError: # raises if there is a non amino acid letter
           continue
       if (len(filtered[0]) < 10):
           keep = True            
           for seq2 in filtered[0]:
               try: 
                   ident2 = identity(seq, seq2) 
               except KeyError:
                   keep = False
                   continue
               if ident2 >= 0.3:
                   keep = False
                   break
           if keep:
               filtered[0].append(seq)
       if (len(filtered[1]) < 10) and (ident > 0.6):
           filtered[1].append(seq)
       if (len(filtered[2]) < 8) and (ident >= 0.3) and (ident <= 0.6):
           filtered[2].append(seq)
       if (len(filtered[0]) == 10) and (len(filtered[1]) == 10) and (len(filtered[2]) == 8):
           break
   
   # for whole range take a part of low and a part of high sequence identity
   # plus the rest middle sequence identity
   filtered[2].extend(filtered[0][1:min(len(filtered[0]), 7)])
   filtered[2].extend(filtered[1][1:min(len(filtered[1]), 7)])
   
   SeqIO.write(filtered[0], name + "_low_seq_ident.fasta", "fasta")
   SeqIO.write(filtered[1], name + "_high_seq_ident.fasta", "fasta")
   SeqIO.write(filtered[2], name + "_whole_range_seq_ident.fasta", "fasta")
   
   print("sequences with low / high / whole range sequence identity saved in:")
   print(name + "_low_seq_ident.fasta")
   print(name + "_high_seq_ident.fasta")
   print(name + "_whole_range_seq_ident.fasta") 

def identity(seq1, seq2):

   #pairwise alignment
   matrix = MatrixInfo.blosum62
   gap_open = -10
   gap_extend = -0.5     
   alignment = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend)
   seq1_aligned = alignment[0][0]
   seq2_aligned = alignment[0][1]
   #sequence identity
   ident = sum(c1 == c2 for c1, c2 in zip(seq1_aligned, seq2_aligned))
   ref_length = (len(seq1) + len(seq2)) / 2 # mean length
   return float(ident) / ref_length


if __name__ == '__main__':

   namestring = sys.argv[1].split("/")[-1].split(".")[0] # used as beginning of the output files
   print("-----------------------------------------------------------")
   all_seqs = get_sequences(sys.argv[1], sys.argv[2], sys.argv[3], namestring + "_all_sequences.fasta")
   filter_seqs(all_seqs, namestring)

</source>


Calling MSA programs

Call of T-Coffee:

#!/bin/bash

proteins=( BCKDHA BCKDHB DBT DLD )
identities=( low high whole_range )

for protein in ${proteins[*]}
do
	for identity in ${identities[*]}
	do
		t_coffee -output fasta -infile ${protein}_${identity}_seq_ident.fasta -outfile ${protein}_${identity}_seq_ident_tcoffee.fasta 
	done
done

Muscle:

#!/bin/bash

proteins=( BCKDHA BCKDHB DBT DLD )
identities=( low high whole_range )

for protein in ${proteins[*]}
do
	for identity in ${identities[*]}
	do
		muscle -in ${protein}_${identity}_seq_ident.fasta -out ${protein}_${identity}_seq_ident_muscle.fasta 
	done
done

Mafft:

#!/bin/bash

proteins=( BCKDHA BCKDHB DBT DLD )
identities=( low high whole_range )

for protein in ${proteins[*]}
do
	for identity in ${identities[*]}
	do
		mafft ${protein}_${identity}_seq_ident.fasta > ${protein}_${identity}_seq_ident_mafft.fasta  
	done
done