Difference between revisions of "Phenylketonuria/Task2/Scripts"

Revision as of 22:06, 5 May 2013

script follows tomorrow

@@ Line 1: / Line 1: @@
+script follows tomorrow
-#!/usr/bin/python
-# Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
-##########################################################################################
-# The script can be invoked as followed: python MSA_grouping.py low|med|high
-import sys # package for commandline parameters
-import re # package for regular expressions
-import random # package for generating random numbers
-# Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
-# ---------------------------------------------------------------------------------------------------
-fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out")
-low_big80 = [] # all sequences with an identity value under 30 per cent are stored here
-middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here
-high_big80 = [] # all sequences with an identity value over 60 per cent are stored here
-protein = ""
-identity = 0
-sequence = ""
-prot = False
-ident = False
-seq = False
-for line in fobj:
-	if line.startswith(">"): # find all lines starting with ">"
-		prot = True
-		seq = False # seq has to be False, because a new protein is starting
-		sequence = ""
-		protein = line
-	elif prot == True and not "Length" in line:
-		protein = protein + line # addd all informtions to the protein name line
-	elif "Length" in line:
-		prot = False # now the name of the protein is ending
-		ident = True # after the protein name the identity value is following
-	elif ident == True and "Identities" in line: # save the corresponding sequence identity value
-		seq = True # now the sequences are following
-		l = line.split(", ")
-		m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp
-		identity = int(m.group(0))
-	elif seq == True and line.startswith("Sbjct"):
-		ident = False
-		mm = re.search('(?<=[0-9])\D+', line)
-		s = mm.group(0).replace(" ", "")
-		sequence = sequence + s.replace("-", "")
-		if int(m.group(0)) < 30 :
-			under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
-			if not under_30 in low_big80:
-				low_big80.append(under_30)
-		elif int(m.group(0)) > 30 and int(m.group(0)) < 60 :
-			under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
-			if not under_60 in middle_big80:
-				middle_big80.append(under_60)
-		elif int(m.group(0)) > 60 :
-			over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
-			if not over_60 in high_big80:
-				high_big80.append(over_60)
-fobj.close()
-#print(low[1])
-# Afterwards, the pdb Sequences from Blast get the same procedure
-# ----------------------------------------------------------------
-pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out")
-low_pdb = [] # all sequences with an identity value under 30 per cent are stored here
-middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here
-high_pdb = [] # all sequences with an identity value over 60 per cent are stored here
-protein = ""
-identity = 0
-sequence = ""
-prot = False
-ident = False
-seq = False
-for line in pobj:
-	if line.startswith(">"): # find all lines starting with ">"
-		prot = True
-		seq = False # seq has to be False, because a new protein is starting
-		sequence = ""
-		protein = line
-	elif prot == True and not "Length" in line:
-		protein = protein + line # addd all informtions to the protein name line
-	elif "Length" in line:
-		prot = False # now the name of the protein is ending
-		ident = True # after the protein name the identity value is following
-	elif ident == True and "Identities" in line: # save the corresponding sequence identity value
-		seq = True # now the sequences are following
-		l = line.split(", ")
-		m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp
-		identity = int(m.group(0))
-	elif seq == True and line.startswith("Sbjct"):
-		ident = False
-		mm = re.search('(?<=[0-9])\D+', line)
-		s = mm.group(0).replace(" ", "")
-		sequence = sequence + s.replace("-", "")
-		if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32%
-			under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")]
-			if not under_30 in low_pdb:
-				low_pdb.append(under_30)
-		elif int(m.group(0)) > 30 and int(m.group(0)) < 60 :
-			under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
-			if not under_60 in middle_pdb:
-				middle_pdb.append(under_60)
-		elif int(m.group(0)) > 60 :
-			over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")]
-			if not over_60 in high_pdb:
-				high_pdb.append(over_60)
-pobj.close()
-# Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly
-# ---------------------------------------------------------------------------------------------------------------
-if "low" in sys.argv:
-	# print 10 random chosen sequences with lower identity than 30% into a new file
-	low = random.sample(low_big80, 8)
-	lowp = random.sample(low_pdb, 2)
-	out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w")
-	for i in range(len(low)):
-		out.write(low[i][0] + "\n" + low[i][1] + "\n")
-	for j in range(len(lowp)):
-		out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n")
-	out.close()
-elif "med" in sys.argv:
-	# print 20 random chosen sequences between 30% and 60% sequence identity into a new file
-	mid = random.sample(middle_big80, 16)
-	midp = random.sample(middle_pdb, 4)
-	out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w")
-	for i in range(len(mid)):
-		out.write(mid[i][0] + "\n" + mid[i][1] + "\n")
-	for j in range(len(midp)):
-		out.write(midp[j][0] + "\n" + midp[j][1] + "\n")
-	out.close()
-elif "high" in sys.argv:
-	# print 10 random chosen sequences with higher identity than 60% into a new file
-	high = random.sample(high_big80, 8)
-	highp = random.sample(high_pdb, 2)
-	out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w")
-	for i in range(len(high)):
-		out.write(high[i][0] + "\n" + high[i][1] + "\n")
-	for j in range(len(highp)):
-		out.write(highp[j][0] + "\n" + highp[j][1] + "\n")
-	out.close()

Difference between revisions of "Phenylketonuria/Task2/Scripts"

Revision as of 22:06, 5 May 2013

Navigation menu

Views

Personal tools

Bioinformatik navigation

MediaWiki navigation

Search

Tools