Phenylketonuria/Task2/Scripts

From Bioinformatikpedia
Revision as of 17:46, 6 May 2013 by Worfk (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

The python script is not perfect at all. Sometimes some sequences are doubled but we will improve it the next days. In addition, the output files are always diverse, because we use a random function for the choice of sequences.

One can find the script in the following path:

/mnt/home/student/worfk/Masterpractical/Task2/MSA/MSA_grouping.py

If you want to use the script, you have to change the paths for the input and output files ;)


<source lang="python">

  1. !/usr/bin/python
  2. Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
  3. The script can be invoked as followed: python MSA_grouping.py low (or med, high, all -> depends on the group/file you want)

import sys # package for commandline parameters import re # package for regular expressions import random # package for generating random numbers

  1. Open the file and find the sequences with < 30%, 30% < x < 60%, > 60% and all and save them into a list
  2. -------------------------------------------------------------------------------------------------------

fobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_Blast_big_80.out") low_big80 = [] # all sequences with an identity value under 30 per cent are stored here middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_big80 = [] # all sequences with an identity value over 60 per cent are stored here all_big80 = [] # all sequences are stored protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in fobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

all = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] all_big80.append(all)

if identity < 30 : under_30 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_30 in low_big80: low_big80.append(under_30) elif identity > 30 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_60 in middle_big80: middle_big80.append(under_60) elif identity > 60 : over_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not over_60 in high_big80: high_big80.append(over_60) fobj.close()

  1. Afterwards, the pdb Sequences from Blast get the same procedure
  2. ----------------------------------------------------------------

pobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_Blast_pdb.out") low_pdb = [] # all sequences with an identity value under 30 per cent are stored here middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_pdb = [] # all sequences with an identity value over 60 per cent are stored here all_pdb = [] # all pdb sequences are stored protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in pobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

all = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] all_pdb.append(all)

if identity < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32% under_30 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_30 in low_pdb: low_pdb.append(under_30) elif identity > 32 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_60 in middle_pdb: middle_pdb.append(under_60) elif identity > 60 : over_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not over_60 in high_pdb: high_pdb.append(over_60) pobj.close()


  1. Now only ten sequences from < 30%, over 60% and between 30% and 60% (two from pdb sequences) and 20 Sequences
  2. from all identities (four from pdb sequences) have to be saved randomly
  3. --------------------------------------------------------------------------------------------------------------

if "low" in sys.argv: # print 10 random chosen sequences with lower identity than 30% into a new file low = random.sample(low_big80, 8) lowp = random.sample(low_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/low.fasta", "w") for i in range(len(low)): out.write(low[i][0] + " identity: " + str(low[i][1]) + "\n" + low[i][1] + "\n") for j in range(len(lowp)): out.write(lowp[j][0] + " identity: " + str(lowp[j][1]) + "\n" + lowp[j][1] + "\n") out.close() elif "med" in sys.argv: # print 20 random chosen sequences between 30% and 60% sequence identity into a new file mid = random.sample(middle_big80, 8) midp = random.sample(middle_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/medium.fasta", "w") for i in range(len(mid)): out.write(mid[i][0] + " identity: " + str(mid[i][1]) + "\n" + mid[i][2] + "\n") for j in range(len(midp)): out.write(midp[j][0] + " identity: " + str(midp[j][1]) + "\n" + midp[j][2] + "\n") out.close() elif "high" in sys.argv: # print 10 random chosen sequences with higher identity than 60% into a new file high = random.sample(high_big80, 8) highp = random.sample(high_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/high.fasta", "w") for i in range(len(high)): out.write(high[i][0] + " identity: " + str(high[i][1]) + "\n" + high[i][2] + "\n") for j in range(len(highp)): out.write(highp[j][0] + " identity: " + str(highp[j][1]) + "\n" + highp[j][2] + "\n") out.close() elif "all" in sys.argv: # print 20 random chosen sequences with different (whatever) identity into a new file all = random.sample(all_big80, 16) allp = random.sample(all_pdb, 4) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/all.fasta", "w") for i in range(len(all)): out.write(all[i][0] + " identity: " + str(all[i][1]) + "\n" + all[i][2] + "\n") for j in range(len(allp)): out.write(allp[j][0] + " identity: " + str(allp[j][1]) + "\n" + allp[j][2] + "\n") out.close()

</source>