Difference between revisions of "Phenylketonuria/Task2/Scripts"

Latest revision as of 17:46, 6 May 2013

The python script is not perfect at all. Sometimes some sequences are doubled but we will improve it the next days. In addition, the output files are always diverse, because we use a random function for the choice of sequences.

One can find the script in the following path:

/mnt/home/student/worfk/Masterpractical/Task2/MSA/MSA_grouping.py

If you want to use the script, you have to change the paths for the input and output files ;)

!/usr/bin/python
Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
The script can be invoked as followed: python MSA_grouping.py low (or med, high, all -> depends on the group/file you want)

import sys # package for commandline parameters import re # package for regular expressions import random # package for generating random numbers

Open the file and find the sequences with < 30%, 30% < x < 60%, > 60% and all and save them into a list
-------------------------------------------------------------------------------------------------------

fobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_Blast_big_80.out") low_big80 = [] # all sequences with an identity value under 30 per cent are stored here middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_big80 = [] # all sequences with an identity value over 60 per cent are stored here all_big80 = [] # all sequences are stored protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in fobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

all = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] all_big80.append(all)

if identity < 30 : under_30 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_30 in low_big80: low_big80.append(under_30) elif identity > 30 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_60 in middle_big80: middle_big80.append(under_60) elif identity > 60 : over_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not over_60 in high_big80: high_big80.append(over_60) fobj.close()

Afterwards, the pdb Sequences from Blast get the same procedure
----------------------------------------------------------------

pobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_Blast_pdb.out") low_pdb = [] # all sequences with an identity value under 30 per cent are stored here middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_pdb = [] # all sequences with an identity value over 60 per cent are stored here all_pdb = [] # all pdb sequences are stored protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in pobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")

all = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] all_pdb.append(all)

if identity < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32% under_30 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_30 in low_pdb: low_pdb.append(under_30) elif identity > 32 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not under_60 in middle_pdb: middle_pdb.append(under_60) elif identity > 60 : over_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")] if not over_60 in high_pdb: high_pdb.append(over_60) pobj.close()

Now only ten sequences from < 30%, over 60% and between 30% and 60% (two from pdb sequences) and 20 Sequences
from all identities (four from pdb sequences) have to be saved randomly
--------------------------------------------------------------------------------------------------------------

if "low" in sys.argv: # print 10 random chosen sequences with lower identity than 30% into a new file low = random.sample(low_big80, 8) lowp = random.sample(low_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/low.fasta", "w") for i in range(len(low)): out.write(low[i][0] + " identity: " + str(low[i][1]) + "\n" + low[i][1] + "\n") for j in range(len(lowp)): out.write(lowp[j][0] + " identity: " + str(lowp[j][1]) + "\n" + lowp[j][1] + "\n") out.close() elif "med" in sys.argv: # print 20 random chosen sequences between 30% and 60% sequence identity into a new file mid = random.sample(middle_big80, 8) midp = random.sample(middle_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/medium.fasta", "w") for i in range(len(mid)): out.write(mid[i][0] + " identity: " + str(mid[i][1]) + "\n" + mid[i][2] + "\n") for j in range(len(midp)): out.write(midp[j][0] + " identity: " + str(midp[j][1]) + "\n" + midp[j][2] + "\n") out.close() elif "high" in sys.argv: # print 10 random chosen sequences with higher identity than 60% into a new file high = random.sample(high_big80, 8) highp = random.sample(high_pdb, 2) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/high.fasta", "w") for i in range(len(high)): out.write(high[i][0] + " identity: " + str(high[i][1]) + "\n" + high[i][2] + "\n") for j in range(len(highp)): out.write(highp[j][0] + " identity: " + str(highp[j][1]) + "\n" + highp[j][2] + "\n") out.close() elif "all" in sys.argv: # print 20 random chosen sequences with different (whatever) identity into a new file all = random.sample(all_big80, 16) allp = random.sample(all_pdb, 4) out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/all.fasta", "w") for i in range(len(all)): out.write(all[i][0] + " identity: " + str(all[i][1]) + "\n" + all[i][2] + "\n") for j in range(len(allp)): out.write(allp[j][0] + " identity: " + str(allp[j][1]) + "\n" + allp[j][2] + "\n") out.close()

</source>

Difference between revisions of "Phenylketonuria/Task2/Scripts"

Latest revision as of 17:46, 6 May 2013

Navigation menu

Views

Personal tools

Bioinformatik navigation

MediaWiki navigation

Search

Tools

@@ Line 1: / Line 1: @@
+The python script is not perfect at all. Sometimes some sequences are doubled but we will improve it the next days. In addition, the output files are always diverse, because we use a random function for the choice of sequences.
+One can find the script in the following path:
+ /mnt/home/student/worfk/Masterpractical/Task2/MSA/MSA_grouping.py
+If you want to use the script, you have to change the paths for the input and output files ;)
+<source lang="python">
 #!/usr/bin/python
 # Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
 ##########################################################################################
-# The script can be invoked as followed: python MSA_grouping.py low|med|high
+# The script can be invoked as followed: python MSA_grouping.py low (or med, high, all -> depends on the group/file you want)
 import sys # package for commandline parameters
 import re # package for regular expressions
 import random # package for generating random numbers
-# Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
+# Open the file and find the sequences with < 30%, 30% < x < 60%, > 60% and all and save them into a list
-# ---------------------------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------------------------------
+fobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_Blast_big_80.out")
-fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out")
 low_big80 = [] # all sequences with an identity value under 30 per cent are stored here
 middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here
 high_big80 = [] # all sequences with an identity value over 60 per cent are stored here
+all_big80 = [] # all sequences are stored
 protein = ""
 identity = 0
@@ Line 41: / Line 51: @@
 		sequence = sequence + s.replace("-", "")
+		all = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
-		if int(m.group(0)) < 30 :
+		all_big80.append(all)
-			under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
+		if identity < 30 :
+			under_30 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
 			if not under_30 in low_big80:
 				low_big80.append(under_30)
-		elif int(m.group(0)) > 30 and int(m.group(0)) < 60 :
+		elif identity > 30 and int(m.group(0)) < 60 :
-			under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
+			under_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
 			if not under_60 in middle_big80:
 				middle_big80.append(under_60)
-		elif int(m.group(0)) > 60 :
+		elif identity > 60 :
-			over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
+			over_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
 			if not over_60 in high_big80:
 				high_big80.append(over_60)
 fobj.close()
-#print(low[1])
 # Afterwards, the pdb Sequences from Blast get the same procedure
 # ----------------------------------------------------------------
+pobj = open("/mnt/home/student/worfk/Masterpractical/Task2/Blast/PAH_Blast_pdb.out")
-pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out")
 low_pdb = [] # all sequences with an identity value under 30 per cent are stored here
 middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here
 high_pdb = [] # all sequences with an identity value over 60 per cent are stored here
+all_pdb = [] # all pdb sequences are stored
 protein = ""
 identity = 0
@@ Line 90: / Line 103: @@
 		sequence = sequence + s.replace("-", "")
+		all = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
-		if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32%
+		all_pdb.append(all)
-			under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
+		if identity < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32%
+			under_30 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
 			if not under_30 in low_pdb:
 				low_pdb.append(under_30)
-		elif int(m.group(0)) > 30 and int(m.group(0)) < 60 :
+		elif identity > 32 and int(m.group(0)) < 60 :
-			under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
+			under_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
 			if not under_60 in middle_pdb:
 				middle_pdb.append(under_60)
-		elif int(m.group(0)) > 60 :
+		elif identity > 60 :
-			over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
+			over_60 = [protein.replace("\n", ""), identity, sequence.replace("\n", "")]
 			if not over_60 in high_pdb:
 				high_pdb.append(over_60)
@@ Line 105: / Line 121: @@
-# Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly
+# Now only ten sequences from < 30%, over 60% and between 30% and 60% (two from pdb sequences) and 20 Sequences
+# from all identities (four from pdb sequences) have to be saved randomly
-# ---------------------------------------------------------------------------------------------------------------
+# --------------------------------------------------------------------------------------------------------------
 if "low" in sys.argv:
@@ Line 112: / Line 129: @@
 	low = random.sample(low_big80, 8)
 	lowp = random.sample(low_pdb, 2)
-	out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w")
+	out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/low.fasta", "w")
 	for i in range(len(low)):
-		out.write(low[i][0] + "\n" + low[i][1] + "\n")
+		out.write(low[i][0] + " identity: " + str(low[i][1]) + "\n" + low[i][1] + "\n")
 	for j in range(len(lowp)):
-		out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n")
+		out.write(lowp[j][0] + " identity: " + str(lowp[j][1]) + "\n" + lowp[j][1] + "\n")
 	out.close()
 elif "med" in sys.argv:
 	# print 20 random chosen sequences between 30% and 60% sequence identity into a new file
-	mid = random.sample(middle_big80, 16)
+	mid = random.sample(middle_big80, 8)
-	midp = random.sample(middle_pdb, 4)
+	midp = random.sample(middle_pdb, 2)
-	out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w")
+	out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/medium.fasta", "w")
 	for i in range(len(mid)):
-		out.write(mid[i][0] + "\n" + mid[i][1] + "\n")
+		out.write(mid[i][0] + " identity: " + str(mid[i][1]) + "\n" + mid[i][2] + "\n")
 	for j in range(len(midp)):
-		out.write(midp[j][0] + "\n" + midp[j][1] + "\n")
+		out.write(midp[j][0] + " identity: " + str(midp[j][1]) + "\n" + midp[j][2] + "\n")
 	out.close()
 elif "high" in sys.argv:
@@ Line 132: / Line 149: @@
 	high = random.sample(high_big80, 8)
 	highp = random.sample(high_pdb, 2)
-	out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w")
+	out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/high.fasta", "w")
 	for i in range(len(high)):
-		out.write(high[i][0] + "\n" + high[i][1] + "\n")
+		out.write(high[i][0] + " identity: " + str(high[i][1]) + "\n" + high[i][2] + "\n")
 	for j in range(len(highp)):
-		out.write(highp[j][0] + "\n" + highp[j][1] + "\n")
+		out.write(highp[j][0] + " identity: " + str(highp[j][1]) + "\n" + highp[j][2] + "\n")
 	out.close()
+elif "all" in sys.argv:
+	# print 20 random chosen sequences with different (whatever) identity into a new file
+	all = random.sample(all_big80, 16)
+	allp = random.sample(all_pdb, 4)
+	out = open("/mnt/home/student/worfk/Masterpractical/Task2/MSA/all.fasta", "w")
+	for i in range(len(all)):
+		out.write(all[i][0] + " identity: " + str(all[i][1]) + "\n" + all[i][2] + "\n")
+	for j in range(len(allp)):
+		out.write(allp[j][0] + " identity: " + str(allp[j][1]) + "\n" + allp[j][2] + "\n")
+	out.close()
+</source>