Difference between revisions of "Phenylketonuria/Task2/Scripts"

From Bioinformatikpedia
(Replaced content with "script follows tomorrow")
Line 1: Line 1:
 
script follows tomorrow
 
script follows tomorrow
  +
  +
<source lang="python">
  +
#!/usr/bin/python
  +
# Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
  +
##########################################################################################
  +
# The script can be invoked as followed: python MSA_grouping.py low|med|high
  +
import sys # package for commandline parameters
  +
import re # package for regular expressions
  +
import random # package for generating random numbers
  +
  +
# Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
  +
# ---------------------------------------------------------------------------------------------------
  +
fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out")
  +
low_big80 = [] # all sequences with an identity value under 30 per cent are stored here
  +
middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here
  +
high_big80 = [] # all sequences with an identity value over 60 per cent are stored here
  +
protein = ""
  +
identity = 0
  +
sequence = ""
  +
prot = False
  +
ident = False
  +
seq = False
  +
for line in fobj:
  +
if line.startswith(">"): # find all lines starting with ">"
  +
prot = True
  +
seq = False # seq has to be False, because a new protein is starting
  +
sequence = ""
  +
protein = line
  +
elif prot == True and not "Length" in line:
  +
protein = protein + line # addd all informtions to the protein name line
  +
elif "Length" in line:
  +
prot = False # now the name of the protein is ending
  +
ident = True # after the protein name the identity value is following
  +
elif ident == True and "Identities" in line: # save the corresponding sequence identity value
  +
seq = True # now the sequences are following
  +
l = line.split(", ")
  +
m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp
  +
identity = int(m.group(0))
  +
elif seq == True and line.startswith("Sbjct"):
  +
ident = False
  +
mm = re.search('(?<=[0-9])\D+', line)
  +
s = mm.group(0).replace(" ", "")
  +
sequence = sequence + s.replace("-", "")
  +
  +
if int(m.group(0)) < 30 :
  +
under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
  +
if not under_30 in low_big80:
  +
low_big80.append(under_30)
  +
elif int(m.group(0)) > 30 and int(m.group(0)) < 60 :
  +
under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
  +
if not under_60 in middle_big80:
  +
middle_big80.append(under_60)
  +
elif int(m.group(0)) > 60 :
  +
over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
  +
if not over_60 in high_big80:
  +
high_big80.append(over_60)
  +
fobj.close()
  +
#print(low[1])
  +
  +
# Afterwards, the pdb Sequences from Blast get the same procedure
  +
# ----------------------------------------------------------------
  +
pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out")
  +
low_pdb = [] # all sequences with an identity value under 30 per cent are stored here
  +
middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here
  +
high_pdb = [] # all sequences with an identity value over 60 per cent are stored here
  +
protein = ""
  +
identity = 0
  +
sequence = ""
  +
prot = False
  +
ident = False
  +
seq = False
  +
for line in pobj:
  +
if line.startswith(">"): # find all lines starting with ">"
  +
prot = True
  +
seq = False # seq has to be False, because a new protein is starting
  +
sequence = ""
  +
protein = line
  +
elif prot == True and not "Length" in line:
  +
protein = protein + line # addd all informtions to the protein name line
  +
elif "Length" in line:
  +
prot = False # now the name of the protein is ending
  +
ident = True # after the protein name the identity value is following
  +
elif ident == True and "Identities" in line: # save the corresponding sequence identity value
  +
seq = True # now the sequences are following
  +
l = line.split(", ")
  +
m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp
  +
identity = int(m.group(0))
  +
elif seq == True and line.startswith("Sbjct"):
  +
ident = False
  +
mm = re.search('(?<=[0-9])\D+', line)
  +
s = mm.group(0).replace(" ", "")
  +
sequence = sequence + s.replace("-", "")
  +
  +
if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32%
  +
under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
  +
if not under_30 in low_pdb:
  +
low_pdb.append(under_30)
  +
elif int(m.group(0)) > 32 and int(m.group(0)) < 60 :
  +
under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
  +
if not under_60 in middle_pdb:
  +
middle_pdb.append(under_60)
  +
elif int(m.group(0)) > 60 :
  +
over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
  +
if not over_60 in high_pdb:
  +
high_pdb.append(over_60)
  +
pobj.close()
  +
  +
  +
# Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly
  +
# ---------------------------------------------------------------------------------------------------------------
  +
  +
if "low" in sys.argv:
  +
# print 10 random chosen sequences with lower identity than 30% into a new file
  +
low = random.sample(low_big80, 8)
  +
lowp = random.sample(low_pdb, 2)
  +
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w")
  +
for i in range(len(low)):
  +
out.write(low[i][0] + "\n" + low[i][1] + "\n")
  +
for j in range(len(lowp)):
  +
out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n")
  +
out.close()
  +
elif "med" in sys.argv:
  +
# print 20 random chosen sequences between 30% and 60% sequence identity into a new file
  +
mid = random.sample(middle_big80, 17)
  +
midp = random.sample(middle_pdb, 3)
  +
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w")
  +
for i in range(len(mid)):
  +
out.write(mid[i][0] + "\n" + mid[i][1] + "\n")
  +
for j in range(len(midp)):
  +
out.write(midp[j][0] + "\n" + midp[j][1] + "\n")
  +
out.close()
  +
elif "high" in sys.argv:
  +
# print 10 random chosen sequences with higher identity than 60% into a new file
  +
high = random.sample(high_big80, 8)
  +
highp = random.sample(high_pdb, 2)
  +
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w")
  +
for i in range(len(high)):
  +
out.write(high[i][0] + "\n" + high[i][1] + "\n")
  +
for j in range(len(highp)):
  +
out.write(highp[j][0] + "\n" + highp[j][1] + "\n")
  +
out.close()
  +
  +
</source>

Revision as of 16:29, 6 May 2013

script follows tomorrow

<source lang="python">

  1. !/usr/bin/python
  2. Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
  3. The script can be invoked as followed: python MSA_grouping.py low|med|high

import sys # package for commandline parameters import re # package for regular expressions import random # package for generating random numbers

  1. Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
  2. ---------------------------------------------------------------------------------------------------

fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out") low_big80 = [] # all sequences with an identity value under 30 per cent are stored here middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_big80 = [] # all sequences with an identity value over 60 per cent are stored here protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in fobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

if int(m.group(0)) < 30 : under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_30 in low_big80: low_big80.append(under_30) elif int(m.group(0)) > 30 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_60 in middle_big80: middle_big80.append(under_60) elif int(m.group(0)) > 60 : over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not over_60 in high_big80: high_big80.append(over_60) fobj.close()

  1. print(low[1])
  1. Afterwards, the pdb Sequences from Blast get the same procedure
  2. ----------------------------------------------------------------

pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out") low_pdb = [] # all sequences with an identity value under 30 per cent are stored here middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_pdb = [] # all sequences with an identity value over 60 per cent are stored here protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in pobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32% under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_30 in low_pdb: low_pdb.append(under_30) elif int(m.group(0)) > 32 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_60 in middle_pdb: middle_pdb.append(under_60) elif int(m.group(0)) > 60 : over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not over_60 in high_pdb: high_pdb.append(over_60) pobj.close()


  1. Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly
  2. ---------------------------------------------------------------------------------------------------------------

if "low" in sys.argv: # print 10 random chosen sequences with lower identity than 30% into a new file low = random.sample(low_big80, 8) lowp = random.sample(low_pdb, 2) out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w") for i in range(len(low)): out.write(low[i][0] + "\n" + low[i][1] + "\n") for j in range(len(lowp)): out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n") out.close() elif "med" in sys.argv: # print 20 random chosen sequences between 30% and 60% sequence identity into a new file mid = random.sample(middle_big80, 17) midp = random.sample(middle_pdb, 3) out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w") for i in range(len(mid)): out.write(mid[i][0] + "\n" + mid[i][1] + "\n") for j in range(len(midp)): out.write(midp[j][0] + "\n" + midp[j][1] + "\n") out.close() elif "high" in sys.argv: # print 10 random chosen sequences with higher identity than 60% into a new file high = random.sample(high_big80, 8) highp = random.sample(high_pdb, 2) out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w") for i in range(len(high)): out.write(high[i][0] + "\n" + high[i][1] + "\n") for j in range(len(highp)): out.write(highp[j][0] + "\n" + highp[j][1] + "\n") out.close()

</source>