Difference between revisions of "Phenylketonuria/Task2/Scripts"

From Bioinformatikpedia
Line 15: Line 15:
 
# Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
 
# Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
 
# ---------------------------------------------------------------------------------------------------
 
# ---------------------------------------------------------------------------------------------------
fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out")
+
fobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_BLAST_big_80.out")
 
low_big80 = [] # all sequences with an identity value under 30 per cent are stored here
 
low_big80 = [] # all sequences with an identity value under 30 per cent are stored here
 
middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here
 
middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here
Line 64: Line 64:
 
# Afterwards, the pdb Sequences from Blast get the same procedure
 
# Afterwards, the pdb Sequences from Blast get the same procedure
 
# ----------------------------------------------------------------
 
# ----------------------------------------------------------------
pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out")
+
pobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_BLAST_pdb.out")
 
low_pdb = [] # all sequences with an identity value under 30 per cent are stored here
 
low_pdb = [] # all sequences with an identity value under 30 per cent are stored here
 
middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here
 
middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here
Line 118: Line 118:
 
low = random.sample(low_big80, 8)
 
low = random.sample(low_big80, 8)
 
lowp = random.sample(low_pdb, 2)
 
lowp = random.sample(low_pdb, 2)
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w")
+
out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/low.fasta", "w")
 
for i in range(len(low)):
 
for i in range(len(low)):
 
out.write(low[i][0] + "\n" + low[i][1] + "\n")
 
out.write(low[i][0] + "\n" + low[i][1] + "\n")
Line 128: Line 128:
 
mid = random.sample(middle_big80, 17)
 
mid = random.sample(middle_big80, 17)
 
midp = random.sample(middle_pdb, 3)
 
midp = random.sample(middle_pdb, 3)
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w")
+
out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/medium.fasta", "w")
 
for i in range(len(mid)):
 
for i in range(len(mid)):
 
out.write(mid[i][0] + "\n" + mid[i][1] + "\n")
 
out.write(mid[i][0] + "\n" + mid[i][1] + "\n")
Line 138: Line 138:
 
high = random.sample(high_big80, 8)
 
high = random.sample(high_big80, 8)
 
highp = random.sample(high_pdb, 2)
 
highp = random.sample(high_pdb, 2)
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w")
+
out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/high.fasta", "w")
 
for i in range(len(high)):
 
for i in range(len(high)):
 
out.write(high[i][0] + "\n" + high[i][1] + "\n")
 
out.write(high[i][0] + "\n" + high[i][1] + "\n")

Revision as of 16:36, 6 May 2013

The python script is not perfect at all. Sometimes some sequences are doubled but we will improve it the next days.

One can find the script in the following path:

/mnt/home/student/worfk/Masterpractical/Task2/MSA/MSA_grouping.py

<source lang="python">

  1. !/usr/bin/python
  2. Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
  3. The script can be invoked as followed: python MSA_grouping.py low|med|high

import sys # package for commandline parameters import re # package for regular expressions import random # package for generating random numbers

  1. Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
  2. ---------------------------------------------------------------------------------------------------

fobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_BLAST_big_80.out") low_big80 = [] # all sequences with an identity value under 30 per cent are stored here middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_big80 = [] # all sequences with an identity value over 60 per cent are stored here protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in fobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

if int(m.group(0)) < 30 : under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_30 in low_big80: low_big80.append(under_30) elif int(m.group(0)) > 30 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_60 in middle_big80: middle_big80.append(under_60) elif int(m.group(0)) > 60 : over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not over_60 in high_big80: high_big80.append(over_60) fobj.close()

  1. print(low[1])
  1. Afterwards, the pdb Sequences from Blast get the same procedure
  2. ----------------------------------------------------------------

pobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_BLAST_pdb.out") low_pdb = [] # all sequences with an identity value under 30 per cent are stored here middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_pdb = [] # all sequences with an identity value over 60 per cent are stored here protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in pobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32% under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_30 in low_pdb: low_pdb.append(under_30) elif int(m.group(0)) > 32 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_60 in middle_pdb: middle_pdb.append(under_60) elif int(m.group(0)) > 60 : over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not over_60 in high_pdb: high_pdb.append(over_60) pobj.close()


  1. Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly
  2. ---------------------------------------------------------------------------------------------------------------

if "low" in sys.argv: # print 10 random chosen sequences with lower identity than 30% into a new file low = random.sample(low_big80, 8) lowp = random.sample(low_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/low.fasta", "w") for i in range(len(low)): out.write(low[i][0] + "\n" + low[i][1] + "\n") for j in range(len(lowp)): out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n") out.close() elif "med" in sys.argv: # print 20 random chosen sequences between 30% and 60% sequence identity into a new file mid = random.sample(middle_big80, 17) midp = random.sample(middle_pdb, 3) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/medium.fasta", "w") for i in range(len(mid)): out.write(mid[i][0] + "\n" + mid[i][1] + "\n") for j in range(len(midp)): out.write(midp[j][0] + "\n" + midp[j][1] + "\n") out.close() elif "high" in sys.argv: # print 10 random chosen sequences with higher identity than 60% into a new file high = random.sample(high_big80, 8) highp = random.sample(high_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/high.fasta", "w") for i in range(len(high)): out.write(high[i][0] + "\n" + high[i][1] + "\n") for j in range(len(highp)): out.write(highp[j][0] + "\n" + highp[j][1] + "\n") out.close()

</source>