Difference between revisions of "Phenylketonuria/Task2/Scripts"

From Bioinformatikpedia
(Created page with "#!/usr/bin/python # Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments #####################################################################…")
 
(Replaced content with "script follows tomorrow")
Line 1: Line 1:
  +
script follows tomorrow
#!/usr/bin/python
 
# Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
 
##########################################################################################
 
# The script can be invoked as followed: python MSA_grouping.py low|med|high
 
import sys # package for commandline parameters
 
import re # package for regular expressions
 
import random # package for generating random numbers
 
 
# Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
 
# ---------------------------------------------------------------------------------------------------
 
fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out")
 
low_big80 = [] # all sequences with an identity value under 30 per cent are stored here
 
middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here
 
high_big80 = [] # all sequences with an identity value over 60 per cent are stored here
 
protein = ""
 
identity = 0
 
sequence = ""
 
prot = False
 
ident = False
 
seq = False
 
for line in fobj:
 
if line.startswith(">"): # find all lines starting with ">"
 
prot = True
 
seq = False # seq has to be False, because a new protein is starting
 
sequence = ""
 
protein = line
 
elif prot == True and not "Length" in line:
 
protein = protein + line # addd all informtions to the protein name line
 
elif "Length" in line:
 
prot = False # now the name of the protein is ending
 
ident = True # after the protein name the identity value is following
 
elif ident == True and "Identities" in line: # save the corresponding sequence identity value
 
seq = True # now the sequences are following
 
l = line.split(", ")
 
m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp
 
identity = int(m.group(0))
 
elif seq == True and line.startswith("Sbjct"):
 
ident = False
 
mm = re.search('(?<=[0-9])\D+', line)
 
s = mm.group(0).replace(" ", "")
 
sequence = sequence + s.replace("-", "")
 
 
if int(m.group(0)) < 30 :
 
under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
 
if not under_30 in low_big80:
 
low_big80.append(under_30)
 
elif int(m.group(0)) > 30 and int(m.group(0)) < 60 :
 
under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
 
if not under_60 in middle_big80:
 
middle_big80.append(under_60)
 
elif int(m.group(0)) > 60 :
 
over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
 
if not over_60 in high_big80:
 
high_big80.append(over_60)
 
fobj.close()
 
#print(low[1])
 
 
# Afterwards, the pdb Sequences from Blast get the same procedure
 
# ----------------------------------------------------------------
 
pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out")
 
low_pdb = [] # all sequences with an identity value under 30 per cent are stored here
 
middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here
 
high_pdb = [] # all sequences with an identity value over 60 per cent are stored here
 
protein = ""
 
identity = 0
 
sequence = ""
 
prot = False
 
ident = False
 
seq = False
 
for line in pobj:
 
if line.startswith(">"): # find all lines starting with ">"
 
prot = True
 
seq = False # seq has to be False, because a new protein is starting
 
sequence = ""
 
protein = line
 
elif prot == True and not "Length" in line:
 
protein = protein + line # addd all informtions to the protein name line
 
elif "Length" in line:
 
prot = False # now the name of the protein is ending
 
ident = True # after the protein name the identity value is following
 
elif ident == True and "Identities" in line: # save the corresponding sequence identity value
 
seq = True # now the sequences are following
 
l = line.split(", ")
 
m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp
 
identity = int(m.group(0))
 
elif seq == True and line.startswith("Sbjct"):
 
ident = False
 
mm = re.search('(?<=[0-9])\D+', line)
 
s = mm.group(0).replace(" ", "")
 
sequence = sequence + s.replace("-", "")
 
 
if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32%
 
under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
 
if not under_30 in low_pdb:
 
low_pdb.append(under_30)
 
elif int(m.group(0)) > 30 and int(m.group(0)) < 60 :
 
under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
 
if not under_60 in middle_pdb:
 
middle_pdb.append(under_60)
 
elif int(m.group(0)) > 60 :
 
over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
 
if not over_60 in high_pdb:
 
high_pdb.append(over_60)
 
pobj.close()
 
 
 
# Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly
 
# ---------------------------------------------------------------------------------------------------------------
 
 
if "low" in sys.argv:
 
# print 10 random chosen sequences with lower identity than 30% into a new file
 
low = random.sample(low_big80, 8)
 
lowp = random.sample(low_pdb, 2)
 
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w")
 
for i in range(len(low)):
 
out.write(low[i][0] + "\n" + low[i][1] + "\n")
 
for j in range(len(lowp)):
 
out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n")
 
out.close()
 
elif "med" in sys.argv:
 
# print 20 random chosen sequences between 30% and 60% sequence identity into a new file
 
mid = random.sample(middle_big80, 16)
 
midp = random.sample(middle_pdb, 4)
 
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w")
 
for i in range(len(mid)):
 
out.write(mid[i][0] + "\n" + mid[i][1] + "\n")
 
for j in range(len(midp)):
 
out.write(midp[j][0] + "\n" + midp[j][1] + "\n")
 
out.close()
 
elif "high" in sys.argv:
 
# print 10 random chosen sequences with higher identity than 60% into a new file
 
high = random.sample(high_big80, 8)
 
highp = random.sample(high_pdb, 2)
 
out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w")
 
for i in range(len(high)):
 
out.write(high[i][0] + "\n" + high[i][1] + "\n")
 
for j in range(len(highp)):
 
out.write(highp[j][0] + "\n" + highp[j][1] + "\n")
 
out.close()
 

Revision as of 23:06, 5 May 2013

script follows tomorrow