Difference between revisions of "Task 2 lab journal (MSUD)"
(→Dataset creation) |
(→Dataset creation) |
||
Line 155: | Line 155: | ||
filter_seqs(all_seqs, namestring) |
filter_seqs(all_seqs, namestring) |
||
</source> |
</source> |
||
+ | |||
+ | |||
+ | |||
+ | Note: for DBT there were some problems aligning very long sequences to calculate mutual sequence identities. So there, for the low sequence identity group, only sequences that have less than 6 times the length of the query sequence were taken. |
||
=== Calling MSA programs === |
=== Calling MSA programs === |
Revision as of 10:23, 5 May 2013
Contents
Sequence searches
Multiple sequence alignments
Dataset creation
The datasets were created from the Blast output.
For creating datasets with low, high and whole range sequence identity, the following Python script was used:
<source lang="python"> Use blast output to create datasets with different sequence identities.
Call with: python create_dataset.py <query fasta file> <blast xml output> <database fasta file>
@author: Laura Schiller
import sys from Bio import SeqIO, pairwise2 from Bio.Blast import NCBIXML from Bio.SubsMat import MatrixInfo
def get_sequences(query, blast_xml, db_path, out_file):
Fetch full length sequences for a BLAST result and write them in a FASTA file. @param query: fasta file with query sequence. @param blast_xml: xml file with BLAST search result. @param db_path: path of db fasta file. @param out_file: file to store the sequences. @return: name of the file where the sequences are stored. print("get all sequences for blast result in %s" % blast_xml) query_sequence = SeqIO.read(query, "fasta") hit_seqs = [query_sequence] blast_output = open(blast_xml) blast_result = NCBIXML.read(blast_output) blast_output.close() hit_list = [] for alignment in blast_result.alignments: hit_list.append(alignment.title.split(" ")[1]) # the id of the sequence # get all blast hits seqs_db = SeqIO.parse(db_path, "fasta") counter = 0 number = len(blast_result.alignments) for seq in seqs_db: if seq.id in hit_list: hit_seqs.append(seq) counter = counter + 1 if (counter % 100) == 0: print("%d of %d sequences found" % (counter, number)) if counter == number: break print("%d of %d sequences found" % (counter, len(blast_result.alignments))) SeqIO.write(hit_seqs, out_file, "fasta") print("sequences saved in %s" % out_file) return out_file
def filter_seqs(seq_file, name):
Filter sequences according to sequence identity limits. @param seq_file: fasta file with sequences. @param name: string used for output file names. sequences = SeqIO.parse(seq_file, "fasta") seqs = [] for seq in sequences: seqs.append(seq) query = seqs.pop(0) # always keep query (first sequence) # lists for low / high / whole range sequence identity filtered = [[query], [query], [query]] # identify hits with pdb structures -> these are preferentially taken hits_pdb = [hit for hit in seqs if (hit.id.split("|")[0] == "pdb")] seqs = [seq for seq in seqs if not (seq.id in [pdb_seq.id for pdb_seq in hits_pdb])] hits_pdb.extend(seqs) # now pdb hits are at the beginning print("filter sequences") for seq in hits_pdb: try: ident = identity(query, seq) except KeyError: # raises if there is a non amino acid letter continue if (len(filtered[0]) < 10): keep = True for seq2 in filtered[0]: try: ident2 = identity(seq, seq2) except KeyError: keep = False continue if ident2 >= 0.3: keep = False break if keep: filtered[0].append(seq) if (len(filtered[1]) < 10) and (ident > 0.6): filtered[1].append(seq) if (len(filtered[2]) < 8) and (ident >= 0.3) and (ident <= 0.6): filtered[2].append(seq) if (len(filtered[0]) == 10) and (len(filtered[1]) == 10) and (len(filtered[2]) == 8): break # for whole range take a part of low and a part of high sequence identity # plus the rest middle sequence identity filtered[2].extend(filtered[0][1:min(len(filtered[0]), 7)]) filtered[2].extend(filtered[1][1:min(len(filtered[1]), 7)]) SeqIO.write(filtered[0], name + "_low_seq_ident.fasta", "fasta") SeqIO.write(filtered[1], name + "_high_seq_ident.fasta", "fasta") SeqIO.write(filtered[2], name + "_whole_range_seq_ident.fasta", "fasta") print("sequences with low / high / whole range sequence identity saved in:") print(name + "_low_seq_ident.fasta") print(name + "_high_seq_ident.fasta") print(name + "_whole_range_seq_ident.fasta")
def identity(seq1, seq2):
Calculate relative sequence identity of two sequences. @return: number of identical residues divided by mean length.
#pairwise alignment matrix = MatrixInfo.blosum62 gap_open = -10 gap_extend = -0.5 alignment = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend) seq1_aligned = alignment[0][0] seq2_aligned = alignment[0][1]
#sequence identity ident = sum(c1 == c2 for c1, c2 in zip(seq1_aligned, seq2_aligned)) ref_length = (len(seq1) + len(seq2)) / 2 # mean length return float(ident) / ref_length
if __name__ == '__main__':
namestring = sys.argv[1].split("/")[-1].split(".")[0] # used as beginning of the output files print("-----------------------------------------------------------") all_seqs = get_sequences(sys.argv[1], sys.argv[2], sys.argv[3], namestring + "_all_sequences.fasta") filter_seqs(all_seqs, namestring)
</source>
Note: for DBT there were some problems aligning very long sequences to calculate mutual sequence identities. So there, for the low sequence identity group, only sequences that have less than 6 times the length of the query sequence were taken.
Calling MSA programs
Call of T-Coffee:
#!/bin/bash proteins=( BCKDHA BCKDHB DBT DLD ) identities=( low high whole_range ) for protein in ${proteins[*]} do for identity in ${identities[*]} do t_coffee -output fasta -infile ${protein}_${identity}_seq_ident.fasta -outfile ${protein}_${identity}_seq_ident_tcoffee.fasta done done
Muscle:
#!/bin/bash proteins=( BCKDHA BCKDHB DBT DLD ) identities=( low high whole_range ) for protein in ${proteins[*]} do for identity in ${identities[*]} do muscle -in ${protein}_${identity}_seq_ident.fasta -out ${protein}_${identity}_seq_ident_muscle.fasta done done
Mafft:
#!/bin/bash proteins=( BCKDHA BCKDHB DBT DLD ) identities=( low high whole_range ) for protein in ${proteins[*]} do for identity in ${identities[*]} do mafft ${protein}_${identity}_seq_ident.fasta > ${protein}_${identity}_seq_ident_mafft.fasta done done