Difference between revisions of "Task 2 lab journal (MSUD)"
From Bioinformatikpedia
(Created page with "== Sequence searches == == Multiple sequence alignments ==") |
(→Multiple sequence alignments) |
||
Line 2: | Line 2: | ||
== Multiple sequence alignments == |
== Multiple sequence alignments == |
||
+ | |||
+ | === Dataset creation === |
||
+ | For creating datasets with low, high and whole range sequence identity, the following Python script was used: |
||
+ | |||
+ | |||
+ | |||
+ | <source lang="python"> |
||
+ | ''' |
||
+ | Use blast output to create datasets with different sequence identities. |
||
+ | |||
+ | Call with: |
||
+ | python create_dataset.py <query fasta file> <blast xml output> <database fasta file> |
||
+ | |||
+ | @author: Laura Schiller |
||
+ | ''' |
||
+ | |||
+ | import sys |
||
+ | from Bio import SeqIO, pairwise2 |
||
+ | from Bio.Blast import NCBIXML |
||
+ | from Bio.SubsMat import MatrixInfo |
||
+ | |||
+ | def get_sequences(query, blast_xml, db_path, out_file): |
||
+ | ''' |
||
+ | Fetch full length sequences for a BLAST result and write them in a FASTA file. |
||
+ | |||
+ | @param query: fasta file with query sequence. |
||
+ | @param blast_xml: xml file with BLAST search result. |
||
+ | @param db_path: path of db fasta file. |
||
+ | @param out_file: file to store the sequences. |
||
+ | @return: name of the file where the sequences are stored. |
||
+ | ''' |
||
+ | |||
+ | print("get all sequences for blast result in %s" % blast_xml) |
||
+ | query_sequence = SeqIO.read(query, "fasta") |
||
+ | hit_seqs = [query_sequence] |
||
+ | |||
+ | blast_output = open(blast_xml) |
||
+ | blast_result = NCBIXML.read(blast_output) |
||
+ | blast_output.close() |
||
+ | |||
+ | hit_list = [] |
||
+ | for alignment in blast_result.alignments: |
||
+ | hit_list.append(alignment.title.split(" ")[1]) # the id of the sequence |
||
+ | |||
+ | # get all blast hits |
||
+ | seqs_db = SeqIO.parse(db_path, "fasta") |
||
+ | counter = 0 |
||
+ | number = len(blast_result.alignments) |
||
+ | for seq in seqs_db: |
||
+ | if seq.id in hit_list: |
||
+ | hit_seqs.append(seq) |
||
+ | counter = counter + 1 |
||
+ | if (counter % 100) == 0: |
||
+ | print("%d of %d sequences found" % (counter, number)) |
||
+ | if counter == number: |
||
+ | break |
||
+ | print("%d of %d sequences found" % (counter, len(blast_result.alignments))) |
||
+ | |||
+ | SeqIO.write(hit_seqs, out_file, "fasta") |
||
+ | |||
+ | print("sequences saved in %s" % out_file) |
||
+ | return out_file |
||
+ | |||
+ | def filter_seqs(seq_file, name): |
||
+ | ''' |
||
+ | Filter sequences according to sequence identity limits. |
||
+ | |||
+ | @param seq_file: fasta file with sequences. |
||
+ | @param name: string used for output file names. |
||
+ | ''' |
||
+ | |||
+ | sequences = SeqIO.parse(seq_file, "fasta") |
||
+ | seqs = [] |
||
+ | for seq in sequences: |
||
+ | seqs.append(seq) |
||
+ | query = seqs.pop(0) # always keep query (first sequence) |
||
+ | # lists for low / high / whole range sequence identity |
||
+ | filtered = [[query], [query], [query]] |
||
+ | |||
+ | # identify hits with pdb structures -> these are preferentially taken |
||
+ | hits_pdb = [hit for hit in seqs if (hit.id.split("|")[0] == "pdb")] |
||
+ | seqs = [seq for seq in seqs if not (seq.id in [pdb_seq.id for pdb_seq in hits_pdb])] |
||
+ | hits_pdb.extend(seqs) # now pdb hits are at the beginning |
||
+ | |||
+ | print("filter sequences") |
||
+ | for seq in hits_pdb: |
||
+ | try: |
||
+ | ident = identity(query, seq) |
||
+ | except KeyError: # raises if there is a non amino acid letter |
||
+ | continue |
||
+ | if (len(filtered[0]) < 10): |
||
+ | keep = True |
||
+ | for seq2 in filtered[0]: |
||
+ | try: |
||
+ | ident2 = identity(seq, seq2) |
||
+ | except KeyError: |
||
+ | keep = False |
||
+ | continue |
||
+ | if ident2 >= 0.3: |
||
+ | keep = False |
||
+ | break |
||
+ | if keep: |
||
+ | filtered[0].append(seq) |
||
+ | if (len(filtered[1]) < 10) and (ident > 0.6): |
||
+ | filtered[1].append(seq) |
||
+ | if (len(filtered[2]) < 8) and (ident >= 0.3) and (ident <= 0.6): |
||
+ | filtered[2].append(seq) |
||
+ | if (len(filtered[0]) == 10) and (len(filtered[1]) == 10) and (len(filtered[2]) == 8): |
||
+ | break |
||
+ | |||
+ | # for whole range take a part of low and a part of high sequence identity |
||
+ | # plus the rest middle sequence identity |
||
+ | filtered[2].extend(filtered[0][1:min(len(filtered[0]), 7)]) |
||
+ | filtered[2].extend(filtered[1][1:min(len(filtered[1]), 7)]) |
||
+ | |||
+ | SeqIO.write(filtered[0], name + "_low_seq_ident.fasta", "fasta") |
||
+ | SeqIO.write(filtered[1], name + "_high_seq_ident.fasta", "fasta") |
||
+ | SeqIO.write(filtered[2], name + "_whole_range_seq_ident.fasta", "fasta") |
||
+ | |||
+ | print("sequences with low / high / whole range sequence identity saved in:") |
||
+ | print(name + "_low_seq_ident.fasta") |
||
+ | print(name + "_high_seq_ident.fasta") |
||
+ | print(name + "_whole_range_seq_ident.fasta") |
||
+ | |||
+ | def identity(seq1, seq2): |
||
+ | #pairwise alignment |
||
+ | matrix = MatrixInfo.blosum62 |
||
+ | gap_open = -10 |
||
+ | gap_extend = -0.5 |
||
+ | alignment = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend) |
||
+ | seq1_aligned = alignment[0][0] |
||
+ | seq2_aligned = alignment[0][1] |
||
+ | |||
+ | #sequence identity |
||
+ | ident = sum(c1 == c2 for c1, c2 in zip(seq1_aligned, seq2_aligned)) |
||
+ | ref_length = (len(seq1) + len(seq2)) / 2 # mean length |
||
+ | return float(ident) / ref_length |
||
+ | |||
+ | |||
+ | if __name__ == '__main__': |
||
+ | namestring = sys.argv[1].split("/")[-1].split(".")[0] # used as beginning of the output files |
||
+ | print("-----------------------------------------------------------") |
||
+ | all_seqs = get_sequences(sys.argv[1], sys.argv[2], sys.argv[3], namestring + "_all_sequences.fasta") |
||
+ | filter_seqs(all_seqs, namestring) |
||
+ | </source> |
||
+ | |||
+ | |||
+ | |||
+ | === Calling MSA programs === |
||
+ | |||
+ | Call of T-Coffee: |
||
+ | |||
+ | <pre> |
||
+ | #!/bin/bash |
||
+ | |||
+ | proteins=( BCKDHA BCKDHB DBT DLD ) |
||
+ | identities=( low high whole_range ) |
||
+ | |||
+ | for protein in ${proteins[*]} |
||
+ | do |
||
+ | for identity in ${identities[*]} |
||
+ | do |
||
+ | t_coffee -output fasta -infile ${protein}_${identity}_seq_ident.fasta -outfile ${protein}_${identity}_seq_ident_tcoffee.fasta |
||
+ | done |
||
+ | done |
||
+ | </pre> |
||
+ | |||
+ | Muscle: |
||
+ | |||
+ | <pre> |
||
+ | #!/bin/bash |
||
+ | |||
+ | proteins=( BCKDHA BCKDHB DBT DLD ) |
||
+ | identities=( low high whole_range ) |
||
+ | |||
+ | for protein in ${proteins[*]} |
||
+ | do |
||
+ | for identity in ${identities[*]} |
||
+ | do |
||
+ | muscle -in ${protein}_${identity}_seq_ident.fasta -out ${protein}_${identity}_seq_ident_muscle.fasta |
||
+ | done |
||
+ | done |
||
+ | </pre> |
||
+ | |||
+ | Mafft: |
||
+ | |||
+ | <pre> |
||
+ | #!/bin/bash |
||
+ | |||
+ | proteins=( BCKDHA BCKDHB DBT DLD ) |
||
+ | identities=( low high whole_range ) |
||
+ | |||
+ | for protein in ${proteins[*]} |
||
+ | do |
||
+ | for identity in ${identities[*]} |
||
+ | do |
||
+ | mafft ${protein}_${identity}_seq_ident.fasta > ${protein}_${identity}_seq_ident_mafft.fasta |
||
+ | done |
||
+ | done |
||
+ | </pre> |
Revision as of 10:44, 4 May 2013
Contents
Sequence searches
Multiple sequence alignments
Dataset creation
For creating datasets with low, high and whole range sequence identity, the following Python script was used:
<source lang="python"> Use blast output to create datasets with different sequence identities.
Call with: python create_dataset.py <query fasta file> <blast xml output> <database fasta file>
@author: Laura Schiller
import sys from Bio import SeqIO, pairwise2 from Bio.Blast import NCBIXML from Bio.SubsMat import MatrixInfo
def get_sequences(query, blast_xml, db_path, out_file):
Fetch full length sequences for a BLAST result and write them in a FASTA file. @param query: fasta file with query sequence. @param blast_xml: xml file with BLAST search result. @param db_path: path of db fasta file. @param out_file: file to store the sequences. @return: name of the file where the sequences are stored. print("get all sequences for blast result in %s" % blast_xml) query_sequence = SeqIO.read(query, "fasta") hit_seqs = [query_sequence] blast_output = open(blast_xml) blast_result = NCBIXML.read(blast_output) blast_output.close() hit_list = [] for alignment in blast_result.alignments: hit_list.append(alignment.title.split(" ")[1]) # the id of the sequence # get all blast hits seqs_db = SeqIO.parse(db_path, "fasta") counter = 0 number = len(blast_result.alignments) for seq in seqs_db: if seq.id in hit_list: hit_seqs.append(seq) counter = counter + 1 if (counter % 100) == 0: print("%d of %d sequences found" % (counter, number)) if counter == number: break print("%d of %d sequences found" % (counter, len(blast_result.alignments))) SeqIO.write(hit_seqs, out_file, "fasta") print("sequences saved in %s" % out_file) return out_file
def filter_seqs(seq_file, name):
Filter sequences according to sequence identity limits. @param seq_file: fasta file with sequences. @param name: string used for output file names. sequences = SeqIO.parse(seq_file, "fasta") seqs = [] for seq in sequences: seqs.append(seq) query = seqs.pop(0) # always keep query (first sequence) # lists for low / high / whole range sequence identity filtered = [[query], [query], [query]] # identify hits with pdb structures -> these are preferentially taken hits_pdb = [hit for hit in seqs if (hit.id.split("|")[0] == "pdb")] seqs = [seq for seq in seqs if not (seq.id in [pdb_seq.id for pdb_seq in hits_pdb])] hits_pdb.extend(seqs) # now pdb hits are at the beginning print("filter sequences") for seq in hits_pdb: try: ident = identity(query, seq) except KeyError: # raises if there is a non amino acid letter continue if (len(filtered[0]) < 10): keep = True for seq2 in filtered[0]: try: ident2 = identity(seq, seq2) except KeyError: keep = False continue if ident2 >= 0.3: keep = False break if keep: filtered[0].append(seq) if (len(filtered[1]) < 10) and (ident > 0.6): filtered[1].append(seq) if (len(filtered[2]) < 8) and (ident >= 0.3) and (ident <= 0.6): filtered[2].append(seq) if (len(filtered[0]) == 10) and (len(filtered[1]) == 10) and (len(filtered[2]) == 8): break # for whole range take a part of low and a part of high sequence identity # plus the rest middle sequence identity filtered[2].extend(filtered[0][1:min(len(filtered[0]), 7)]) filtered[2].extend(filtered[1][1:min(len(filtered[1]), 7)]) SeqIO.write(filtered[0], name + "_low_seq_ident.fasta", "fasta") SeqIO.write(filtered[1], name + "_high_seq_ident.fasta", "fasta") SeqIO.write(filtered[2], name + "_whole_range_seq_ident.fasta", "fasta") print("sequences with low / high / whole range sequence identity saved in:") print(name + "_low_seq_ident.fasta") print(name + "_high_seq_ident.fasta") print(name + "_whole_range_seq_ident.fasta")
def identity(seq1, seq2):
#pairwise alignment matrix = MatrixInfo.blosum62 gap_open = -10 gap_extend = -0.5 alignment = pairwise2.align.globalds(seq1, seq2, matrix, gap_open, gap_extend) seq1_aligned = alignment[0][0] seq2_aligned = alignment[0][1]
#sequence identity ident = sum(c1 == c2 for c1, c2 in zip(seq1_aligned, seq2_aligned)) ref_length = (len(seq1) + len(seq2)) / 2 # mean length return float(ident) / ref_length
if __name__ == '__main__':
namestring = sys.argv[1].split("/")[-1].split(".")[0] # used as beginning of the output files print("-----------------------------------------------------------") all_seqs = get_sequences(sys.argv[1], sys.argv[2], sys.argv[3], namestring + "_all_sequences.fasta") filter_seqs(all_seqs, namestring)
</source>
Calling MSA programs
Call of T-Coffee:
#!/bin/bash proteins=( BCKDHA BCKDHB DBT DLD ) identities=( low high whole_range ) for protein in ${proteins[*]} do for identity in ${identities[*]} do t_coffee -output fasta -infile ${protein}_${identity}_seq_ident.fasta -outfile ${protein}_${identity}_seq_ident_tcoffee.fasta done done
Muscle:
#!/bin/bash proteins=( BCKDHA BCKDHB DBT DLD ) identities=( low high whole_range ) for protein in ${proteins[*]} do for identity in ${identities[*]} do muscle -in ${protein}_${identity}_seq_ident.fasta -out ${protein}_${identity}_seq_ident_muscle.fasta done done
Mafft:
#!/bin/bash proteins=( BCKDHA BCKDHB DBT DLD ) identities=( low high whole_range ) for protein in ${proteins[*]} do for identity in ${identities[*]} do mafft ${protein}_${identity}_seq_ident.fasta > ${protein}_${identity}_seq_ident_mafft.fasta done done