Phenylketonuria/Task2/Scripts
The python script is not perfect at all. Sometimes some sequences are doubled, but I will perform it the next days.
One can find the script in the following path:
/mnt/home/student/worfk/Masterpractical/Task2/MSA/MSA_grouping.py
<source lang="python">
- !/usr/bin/python
- Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments
- The script can be invoked as followed: python MSA_grouping.py low|med|high
import sys # package for commandline parameters import re # package for regular expressions import random # package for generating random numbers
- Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list
- ---------------------------------------------------------------------------------------------------
fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out") low_big80 = [] # all sequences with an identity value under 30 per cent are stored here middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_big80 = [] # all sequences with an identity value over 60 per cent are stored here protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in fobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")
if int(m.group(0)) < 30 : under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_30 in low_big80: low_big80.append(under_30) elif int(m.group(0)) > 30 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_60 in middle_big80: middle_big80.append(under_60) elif int(m.group(0)) > 60 : over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not over_60 in high_big80: high_big80.append(over_60) fobj.close()
- print(low[1])
- Afterwards, the pdb Sequences from Blast get the same procedure
- ----------------------------------------------------------------
pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out") low_pdb = [] # all sequences with an identity value under 30 per cent are stored here middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here high_pdb = [] # all sequences with an identity value over 60 per cent are stored here protein = "" identity = 0 sequence = "" prot = False ident = False seq = False for line in pobj: if line.startswith(">"): # find all lines starting with ">" prot = True seq = False # seq has to be False, because a new protein is starting sequence = "" protein = line elif prot == True and not "Length" in line: protein = protein + line # addd all informtions to the protein name line elif "Length" in line: prot = False # now the name of the protein is ending ident = True # after the protein name the identity value is following elif ident == True and "Identities" in line: # save the corresponding sequence identity value seq = True # now the sequences are following l = line.split(", ") m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp identity = int(m.group(0)) elif seq == True and line.startswith("Sbjct"): ident = False mm = re.search('(?<=[0-9])\D+', line) s = mm.group(0).replace(" ", "") sequence = sequence + s.replace("-", "")
if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32% under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_30 in low_pdb: low_pdb.append(under_30) elif int(m.group(0)) > 32 and int(m.group(0)) < 60 : under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not under_60 in middle_pdb: middle_pdb.append(under_60) elif int(m.group(0)) > 60 : over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] if not over_60 in high_pdb: high_pdb.append(over_60) pobj.close()
- Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly
- ---------------------------------------------------------------------------------------------------------------
if "low" in sys.argv: # print 10 random chosen sequences with lower identity than 30% into a new file low = random.sample(low_big80, 8) lowp = random.sample(low_pdb, 2) out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w") for i in range(len(low)): out.write(low[i][0] + "\n" + low[i][1] + "\n") for j in range(len(lowp)): out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n") out.close() elif "med" in sys.argv: # print 20 random chosen sequences between 30% and 60% sequence identity into a new file mid = random.sample(middle_big80, 17) midp = random.sample(middle_pdb, 3) out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w") for i in range(len(mid)): out.write(mid[i][0] + "\n" + mid[i][1] + "\n") for j in range(len(midp)): out.write(midp[j][0] + "\n" + midp[j][1] + "\n") out.close() elif "high" in sys.argv: # print 10 random chosen sequences with higher identity than 60% into a new file high = random.sample(high_big80, 8) highp = random.sample(high_pdb, 2) out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w") for i in range(len(high)): out.write(high[i][0] + "\n" + high[i][1] + "\n") for j in range(len(highp)): out.write(highp[j][0] + "\n" + highp[j][1] + "\n") out.close()
</source>