Difference between revisions of "Phenylketonuria/Task2/Scripts"
From Bioinformatikpedia
(Created page with "#!/usr/bin/python # Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments #####################################################################…") |
(Replaced content with "script follows tomorrow") |
||
Line 1: | Line 1: | ||
+ | script follows tomorrow |
||
− | #!/usr/bin/python |
||
− | # Python Script for grouping the sequences from BLAST for the Multiple Sequence Alignments |
||
− | ########################################################################################## |
||
− | # The script can be invoked as followed: python MSA_grouping.py low|med|high |
||
− | import sys # package for commandline parameters |
||
− | import re # package for regular expressions |
||
− | import random # package for generating random numbers |
||
− | |||
− | # Open the file and find the sequences with < 30%, 30% < x < 60% and > 60% and save them into a list |
||
− | # --------------------------------------------------------------------------------------------------- |
||
− | fobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_big_80.out") |
||
− | low_big80 = [] # all sequences with an identity value under 30 per cent are stored here |
||
− | middle_big80 = [] # all sequences with an identity value between 30 and 60 per cent are stored here |
||
− | high_big80 = [] # all sequences with an identity value over 60 per cent are stored here |
||
− | protein = "" |
||
− | identity = 0 |
||
− | sequence = "" |
||
− | prot = False |
||
− | ident = False |
||
− | seq = False |
||
− | for line in fobj: |
||
− | if line.startswith(">"): # find all lines starting with ">" |
||
− | prot = True |
||
− | seq = False # seq has to be False, because a new protein is starting |
||
− | sequence = "" |
||
− | protein = line |
||
− | elif prot == True and not "Length" in line: |
||
− | protein = protein + line # addd all informtions to the protein name line |
||
− | elif "Length" in line: |
||
− | prot = False # now the name of the protein is ending |
||
− | ident = True # after the protein name the identity value is following |
||
− | elif ident == True and "Identities" in line: # save the corresponding sequence identity value |
||
− | seq = True # now the sequences are following |
||
− | l = line.split(", ") |
||
− | m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp |
||
− | identity = int(m.group(0)) |
||
− | elif seq == True and line.startswith("Sbjct"): |
||
− | ident = False |
||
− | mm = re.search('(?<=[0-9])\D+', line) |
||
− | s = mm.group(0).replace(" ", "") |
||
− | sequence = sequence + s.replace("-", "") |
||
− | |||
− | if int(m.group(0)) < 30 : |
||
− | under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] |
||
− | if not under_30 in low_big80: |
||
− | low_big80.append(under_30) |
||
− | elif int(m.group(0)) > 30 and int(m.group(0)) < 60 : |
||
− | under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] |
||
− | if not under_60 in middle_big80: |
||
− | middle_big80.append(under_60) |
||
− | elif int(m.group(0)) > 60 : |
||
− | over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] |
||
− | if not over_60 in high_big80: |
||
− | high_big80.append(over_60) |
||
− | fobj.close() |
||
− | #print(low[1]) |
||
− | |||
− | # Afterwards, the pdb Sequences from Blast get the same procedure |
||
− | # ---------------------------------------------------------------- |
||
− | pobj = open("C:\Dokumente und Einstellungen\Karo\Eigene Dateien\Dropbox\Masterpraktikum\Karo\BLAST\PAH_BLAST_pdb.out") |
||
− | low_pdb = [] # all sequences with an identity value under 30 per cent are stored here |
||
− | middle_pdb = [] # all sequences with an identity value between 30 and 60 per cent are stored here |
||
− | high_pdb = [] # all sequences with an identity value over 60 per cent are stored here |
||
− | protein = "" |
||
− | identity = 0 |
||
− | sequence = "" |
||
− | prot = False |
||
− | ident = False |
||
− | seq = False |
||
− | for line in pobj: |
||
− | if line.startswith(">"): # find all lines starting with ">" |
||
− | prot = True |
||
− | seq = False # seq has to be False, because a new protein is starting |
||
− | sequence = "" |
||
− | protein = line |
||
− | elif prot == True and not "Length" in line: |
||
− | protein = protein + line # addd all informtions to the protein name line |
||
− | elif "Length" in line: |
||
− | prot = False # now the name of the protein is ending |
||
− | ident = True # after the protein name the identity value is following |
||
− | elif ident == True and "Identities" in line: # save the corresponding sequence identity value |
||
− | seq = True # now the sequences are following |
||
− | l = line.split(", ") |
||
− | m = re.search('(?<=\()\w+', l[0]) # save only the identity value between the brackets with regexp |
||
− | identity = int(m.group(0)) |
||
− | elif seq == True and line.startswith("Sbjct"): |
||
− | ident = False |
||
− | mm = re.search('(?<=[0-9])\D+', line) |
||
− | s = mm.group(0).replace(" ", "") |
||
− | sequence = sequence + s.replace("-", "") |
||
− | |||
− | if int(m.group(0)) < 33 : # There are no Sequences under 30% sequence identity, the lowest ones are 32% |
||
− | under_30 = [protein.replace("\n", ""), sequence.replace("\n", "")] |
||
− | if not under_30 in low_pdb: |
||
− | low_pdb.append(under_30) |
||
− | elif int(m.group(0)) > 30 and int(m.group(0)) < 60 : |
||
− | under_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] |
||
− | if not under_60 in middle_pdb: |
||
− | middle_pdb.append(under_60) |
||
− | elif int(m.group(0)) > 60 : |
||
− | over_60 = [protein.replace("\n", ""), sequence.replace("\n", "")] |
||
− | if not over_60 in high_pdb: |
||
− | high_pdb.append(over_60) |
||
− | pobj.close() |
||
− | |||
− | |||
− | # Now only eight sequences from < 30% and over 60% and 18 Sequences from 30% < x < 60% have to be saved randomly |
||
− | # --------------------------------------------------------------------------------------------------------------- |
||
− | |||
− | if "low" in sys.argv: |
||
− | # print 10 random chosen sequences with lower identity than 30% into a new file |
||
− | low = random.sample(low_big80, 8) |
||
− | lowp = random.sample(low_pdb, 2) |
||
− | out = open("C:\Dokumente und Einstellungen\Karo\Desktop\low.fasta", "w") |
||
− | for i in range(len(low)): |
||
− | out.write(low[i][0] + "\n" + low[i][1] + "\n") |
||
− | for j in range(len(lowp)): |
||
− | out.write(lowp[j][0] + "\n" + lowp[j][1] + "\n") |
||
− | out.close() |
||
− | elif "med" in sys.argv: |
||
− | # print 20 random chosen sequences between 30% and 60% sequence identity into a new file |
||
− | mid = random.sample(middle_big80, 16) |
||
− | midp = random.sample(middle_pdb, 4) |
||
− | out = open("C:\Dokumente und Einstellungen\Karo\Desktop\medium.fasta", "w") |
||
− | for i in range(len(mid)): |
||
− | out.write(mid[i][0] + "\n" + mid[i][1] + "\n") |
||
− | for j in range(len(midp)): |
||
− | out.write(midp[j][0] + "\n" + midp[j][1] + "\n") |
||
− | out.close() |
||
− | elif "high" in sys.argv: |
||
− | # print 10 random chosen sequences with higher identity than 60% into a new file |
||
− | high = random.sample(high_big80, 8) |
||
− | highp = random.sample(high_pdb, 2) |
||
− | out = open("C:\Dokumente und Einstellungen\Karo\Desktop\high.fasta", "w") |
||
− | for i in range(len(high)): |
||
− | out.write(high[i][0] + "\n" + high[i][1] + "\n") |
||
− | for j in range(len(highp)): |
||
− | out.write(highp[j][0] + "\n" + highp[j][1] + "\n") |
||
− | out.close() |
Revision as of 22:06, 5 May 2013
script follows tomorrow