Difference between revisions of "Phenylketonuria/Task3/Scripts"
(→Secondary Structure) |
|||
(19 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | ==Secondary Structure== |
||
− | This script reads an input file in ReProf format or PsiPred and filters the file for the secondary structur presented with E,H and L/C.<br> |
||
+ | Back to [https://i12r-studfilesrv.informatik.tu-muenchen.de/wiki/index.php/Lab_Journal_-_Task_3_%28PAH%29#Secondary_structure Lab Journal] |
||
− | You can find the script in <code>/mnt/home/student/waldraffs/Masterpraktikum/Task3/filter_secStruc.pl</code> |
||
+ | ===filter_secStruc.pl=== |
||
+ | This script reads an input file in ReProf format or PsiPred and filters the file for the secondary structur presented with E,H and L/C. Also it can read a DSSP file. If the structure forms a loop or irregular a "-" is inserted.<br> |
||
+ | You can find the script in <code>/mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/filter_secStruc.pl</code> |
||
− | Usage: perl filter_secStrucpl -inp <input-file in ReProf format> -out <output-file> -reprof|-psipred |
+ | Usage: perl filter_secStrucpl -inp <input-file in ReProf format> -out <output-file> -reprof|-psipred|-dssp |
− | -reprof: if the input file is result of a ReProf run |
+ | -reprof: if the input file is result of a ReProf run. |
− | -psipred: if the input file is result of a PsiPred run |
+ | -psipred: if the input file is result of a PsiPred run. |
+ | -dssp: if the input file is result of a DSSP run. |
||
− | The output is the secondary structure shown with the letters E, H and L |
+ | The output is the secondary structure shown with the letters E, H and L. Therefore at PsiPred the letter C is converted to L and at DSSP the letters G and I are replaced by H, B by E and S and T by L. |
Code: |
Code: |
||
Line 18: | Line 22: | ||
my $reprof = 0; |
my $reprof = 0; |
||
my $psipred = 0; |
my $psipred = 0; |
||
+ | my $dssp = 0; |
||
# Reads the command line parameters |
# Reads the command line parameters |
||
Line 32: | Line 37: | ||
if($ARGV[$i] eq "-psipred"){ |
if($ARGV[$i] eq "-psipred"){ |
||
$psipred = 1; |
$psipred = 1; |
||
+ | } |
||
+ | if($ARGV[$i] eq "-dssp"){ |
||
+ | $dssp = 1; |
||
} |
} |
||
} |
} |
||
if(!$inp && !$out){ |
if(!$inp && !$out){ |
||
− | print "Usage: filter_secStruc.pl -inp <input-file(.reprof)> -out <output-file> -reprof|-psipred" |
+ | print "Usage: filter_secStruc.pl -inp <input-file(.reprof)> -out <output-file> -reprof|-psipred|-dssp" |
} |
} |
||
Line 49: | Line 57: | ||
if($no == 1){ |
if($no == 1){ |
||
@splitLine = split(/\t/,$line); |
@splitLine = split(/\t/,$line); |
||
− | $secstructure= |
+ | $secstructure=$secstructure.$splitLine[2]; |
} |
} |
||
Line 62: | Line 70: | ||
@splitLine = split(/ /,$line); |
@splitLine = split(/ /,$line); |
||
chomp($splitLine[1]); |
chomp($splitLine[1]); |
||
− | $secstructure = |
+ | $secstructure = $secstructure.$splitLine[1]; |
} |
} |
||
} |
} |
||
+ | $secstructure =~ s/C/L/g; |
||
} |
} |
||
+ | if($dssp ==1){ |
||
+ | $number = 0; |
||
+ | while (my $line = <FILE>){ |
||
+ | if($no ==1){ |
||
+ | $line =~ s/^\s+//g; |
||
+ | @splitLine = split(/ /,$line); |
||
+ | my $letter = ""; |
||
+ | if($splitLine[1] eq "" && $splitLine[2] eq "" && $splitLine[3] eq ""){ |
||
+ | $number = 1; |
||
+ | } |
||
+ | if($splitLine[1] eq "" && $splitLine[2] eq "" && $splitLine[3] ne ""){ |
||
+ | $number = 2; |
||
+ | } |
||
+ | if($splitLine[1] eq "" && $splitLine[2] ne ""){ |
||
+ | $number = 3; |
||
+ | } |
||
+ | if($splitLine[0] < 10){ |
||
+ | $letter = $splitLine[9-$number]; |
||
+ | } |
||
+ | elsif($splitLine[0] >9 && $splitLine[0] < 100){ |
||
+ | $letter = $splitLine[9-$number]; |
||
+ | } |
||
+ | else{ |
||
+ | $letter = $splitLine[9-$number]; |
||
+ | } |
||
+ | if($letter eq ""){ |
||
+ | $letter="-"; |
||
+ | } |
||
+ | $secstructure = $secstructure.$letter; |
||
+ | |||
+ | } |
||
+ | if($line =~ /^ #/){ |
||
+ | $no = 1; |
||
+ | } |
||
+ | } |
||
+ | $secstructure =~ s/[G,I]/H/g; |
||
+ | $secstructure =~ s/B/E/g; |
||
+ | $secstructure =~ s/[S,T]/L/g; |
||
+ | } |
||
close FILE; |
close FILE; |
||
Line 72: | Line 120: | ||
print OUTFILE $secstructure; |
print OUTFILE $secstructure; |
||
close OUTFILE; |
close OUTFILE; |
||
+ | </source> |
||
+ | ===SecStrucComparison.jar=== |
||
+ | The script takes two sequences as input (presented with E, H, L and -) and calculates the precision between those. |
||
+ | Usage: |
||
+ | java -jar SecStrucComparison.jar |
||
+ | -i_seq1 <input-file 1>: first structure sequence (used as reference for precision) |
||
+ | -i_seq2 <input-file 2>: second structure sequence |
||
+ | -o <output-file>: returns a list with the precision for E, H and L of the first sequence as well as the overall precision. |
||
+ | The structure sequences must be in the format of the filter_secStruc.pl output. Furthermore the two sequences must have the same length. <br> |
||
+ | You can find the script in <code>/mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/SecStrucComparison.jar</code> |
||
+ | |||
+ | It must be considered which sequence should be used as first or second, as they can have a different number of E, H or L. Those letters are counted for the first sequence only and used for calculation of the precision. For the total precision there is no difference. |
||
+ | |||
+ | Java-code: |
||
+ | <source lang="java"> |
||
+ | package structureComparison; |
||
+ | |||
+ | import java.io.BufferedReader; |
||
+ | import java.io.File; |
||
+ | import java.io.FileReader; |
||
+ | import java.io.FileWriter; |
||
+ | import java.io.PrintWriter; |
||
+ | import java.util.HashMap; |
||
+ | |||
+ | /** |
||
+ | * |
||
+ | * This class takes two secondary structure sequences (represented with E, H, L and -) and calculates the precision between those. |
||
+ | * |
||
+ | * @author Sonja Waldraff |
||
+ | * |
||
+ | */ |
||
+ | public class SecStrucComparison { |
||
+ | private static String i_seq1; |
||
+ | private static String i_seq2; |
||
+ | private static String o; |
||
+ | private static HashMap<String, String> secStruc = new HashMap<String, String>(); |
||
+ | private static double prec_E = 0.0; |
||
+ | private static double prec_H = 0.0; |
||
+ | private static double prec_L = 0.0; |
||
+ | private static double prec_total = 0.0; |
||
+ | |||
+ | /** |
||
+ | * read Input-Parameters |
||
+ | * -i_seq1: Input-file of the secondary structure sequence (#letters are counted) |
||
+ | * -i_seq2: Input-file of the secondary structure sequence (to be compared with seq1) |
||
+ | * -o: Output-file returns a list with precision for the letters E, H and L as well as for all residues. |
||
+ | * |
||
+ | * @param args |
||
+ | */ |
||
+ | public static void getCMDargs(String[] args) { |
||
+ | int j = 0; |
||
+ | if (args.length == 6) { |
||
+ | while (j < args.length) { |
||
+ | if (args[j].equals("-i_seq1")) { |
||
+ | j++; |
||
+ | i_seq1 = new String(args[j]); |
||
+ | } else if (args[j].equals("-i_seq2")) { |
||
+ | j++; |
||
+ | i_seq2 = new String(args[j]); |
||
+ | } else if (args[j].equals("-o")) { |
||
+ | j++; |
||
+ | o = new String(args[j]); |
||
+ | } |
||
+ | j++; |
||
+ | } |
||
+ | } else { |
||
+ | System.err.println("You can call the program like: java -jar SecStrucComparison.jar -i_seq1 <input-file with first structure sequence (used for precision)> -i_seq2 <input-file with second structure sequence> -o <output-file>"); |
||
+ | } |
||
+ | } |
||
+ | |||
+ | /** |
||
+ | * Methods that reads the input file |
||
+ | * |
||
+ | * @param inputfile |
||
+ | * @throws Exception |
||
+ | */ |
||
+ | private static void read_file(String seq, String inputfile) throws Exception { |
||
+ | String line; |
||
+ | try { |
||
+ | FileReader sfreader = new FileReader(inputfile); |
||
+ | BufferedReader bfreader = new BufferedReader(sfreader); |
||
+ | while ((line = bfreader.readLine()) != null) { |
||
+ | secStruc.put(seq, line); |
||
+ | } |
||
+ | sfreader.close(); |
||
+ | bfreader.close(); |
||
+ | } catch (IndexOutOfBoundsException e) { |
||
+ | e.printStackTrace(); |
||
+ | } |
||
+ | |||
+ | } |
||
+ | |||
+ | /** |
||
+ | * In the method compare the two sequences are compared on similarities. Thereby positions with '-' are ignored. |
||
+ | * The sequences have to be represented with E, H and L. |
||
+ | * @param seq1 |
||
+ | * @param seq2 |
||
+ | */ |
||
+ | private static void compare(String seq1, String seq2) { |
||
+ | HashMap<Character, Integer> match = new HashMap<Character, Integer>(); |
||
+ | HashMap<Character, Integer> residues = new HashMap<Character, Integer>(); |
||
+ | |||
+ | int total_match = 0; |
||
+ | int total_res = 0; |
||
+ | |||
+ | match.put('E', 0); match.put('H', 0); match.put('L', 0); |
||
+ | residues.put('E', 0); residues.put('H', 0); residues.put('L', 0); |
||
+ | |||
+ | for (int i = 0; i < seq1.length(); i++) { |
||
+ | char comp = seq1.charAt(i); |
||
+ | //the first sequence is used as reference. Only its letters are count not from the second sequence. |
||
+ | if (comp != '-' && seq2.charAt(i) != '-') { |
||
+ | total_res++; |
||
+ | if (comp == seq2.charAt(i)) { |
||
+ | total_match++; |
||
+ | match.put(comp, match.get(comp) + 1); |
||
+ | } |
||
+ | residues.put(comp, residues.get(comp) + 1); |
||
+ | } |
||
+ | } |
||
+ | |||
+ | // As only the letters of the first are counted, there is a difference by choosing a sequence as first or second. |
||
+ | // There is no difference for the total number. |
||
+ | prec_E = precision(match.get('E'), residues.get('E')); |
||
+ | prec_H = precision(match.get('H'),residues.get('H')); |
||
+ | prec_L = precision(match.get('L'), residues.get('L')); |
||
+ | prec_total = precision(total_match, total_res); |
||
+ | |||
+ | } |
||
+ | |||
+ | /** |
||
+ | * method for calculating the precision: number of matches / total number of residues. |
||
+ | */ |
||
+ | public static double precision(int match, int total) { |
||
+ | double prec = 0.0; |
||
+ | prec = Math.round((Double.valueOf(match)/(Double.valueOf(total))*100)*100)/100.0; |
||
+ | return prec; |
||
+ | } |
||
+ | |||
+ | /** |
||
+ | * Method that writes the outputfile: returns a list with precision for E, H, L and all three. |
||
+ | * |
||
+ | * @throws Exception |
||
+ | */ |
||
+ | private static void write_file() throws Exception { |
||
+ | PrintWriter OUT = new PrintWriter(new FileWriter(new File(o))); |
||
+ | OUT.println("E:\t" + prec_E); |
||
+ | OUT.println("H:\t" + prec_H); |
||
+ | OUT.println("L:\t" + prec_L); |
||
+ | OUT.println("total:\t" + prec_total); |
||
+ | OUT.close(); |
||
+ | |||
+ | } |
||
+ | |||
+ | /** |
||
+ | * Main method that let read parameters, files and starts the comparison. |
||
+ | * @param args |
||
+ | * @throws Exception |
||
+ | */ |
||
+ | public static void main(String[] args) throws Exception { |
||
+ | getCMDargs(args); |
||
+ | String sequence1 = "seq1"; |
||
+ | String sequence2 = "seq2"; |
||
+ | read_file(sequence1,i_seq1); |
||
+ | read_file(sequence2, i_seq2); |
||
+ | compare(secStruc.get(sequence1),secStruc.get(sequence2)); |
||
+ | write_file(); |
||
+ | } |
||
+ | } |
||
</source> |
</source> |
Latest revision as of 15:09, 17 August 2013
Secondary Structure
Back to Lab Journal
filter_secStruc.pl
This script reads an input file in ReProf or PsiPred format and extracts the secondary structure, represented with the letters E, H and L/C. It can also read a DSSP file; where DSSP assigns no secondary structure (loop or irregular region), a "-" is inserted.
You can find the script in /mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/filter_secStruc.pl
Usage: perl filter_secStruc.pl -inp <input-file in ReProf, PsiPred or DSSP format> -out <output-file> -reprof|-psipred|-dssp -reprof: if the input file is the result of a ReProf run. -psipred: if the input file is the result of a PsiPred run. -dssp: if the input file is the result of a DSSP run.
The output is the secondary structure written with the letters E, H and L. To achieve this, the letter C in PsiPred output is converted to L, and in DSSP output the letters G and I are replaced by H, B by E, and S and T by L.
Code:
<source lang="perl">
#!/usr/bin/perl
#
# filter_secStruc.pl - extracts the secondary structure string from a ReProf,
# PsiPred or DSSP result file and reduces it to the letters E, H and L
# (plus "-" for DSSP residues without an assignment).
#
# Usage: filter_secStruc.pl -inp <input-file> -out <output-file> -reprof|-psipred|-dssp
use strict;
use warnings;

my $inp;
my $out;
my $reprof  = 0;
my $psipred = 0;
my $dssp    = 0;

# Reads the command line parameters.
# Only valid indices are visited; a value following -inp/-out that is missing
# stays undefined and is reported by the usage check below.
for my $i (0 .. $#ARGV) {
    my $arg = $ARGV[$i];
    if ($arg eq "-inp") {
        $inp = $ARGV[$i + 1];
    }
    elsif ($arg eq "-out") {
        $out = $ARGV[$i + 1];
    }
    elsif ($arg eq "-reprof") {
        $reprof = 1;
    }
    elsif ($arg eq "-psipred") {
        $psipred = 1;
    }
    elsif ($arg eq "-dssp") {
        $dssp = 1;
    }
}

# Both file names and exactly one format switch are required. Without this
# check the script would either die on open() with an unhelpful message or
# silently write an empty output file.
if (!defined $inp || !defined $out || ($reprof + $psipred + $dssp) != 1) {
    print STDERR "Usage: filter_secStruc.pl -inp <input-file(.reprof)> -out <output-file> -reprof|-psipred|-dssp\n";
    exit 1;
}

# Reads the input file and filters for the column with the secondary structure.
open(my $in_fh, '<', $inp) or die "Cannot open input file '$inp': $!\n";

my $secstructure = "";
if ($reprof) {
    $secstructure = read_reprof($in_fh);
}
elsif ($psipred) {
    $secstructure = read_psipred($in_fh);
}
else {
    $secstructure = read_dssp($in_fh);
}

close($in_fh);

# ReProf: every line after the header line starting with "No" is a tab
# separated residue record; the third column holds the structure letter.
sub read_reprof {
    my ($fh) = @_;
    my $structure = "";
    my $in_table  = 0;
    while (my $line = <$fh>) {
        chomp $line;
        if ($in_table) {
            my @columns = split(/\t/, $line);
            # Lines with fewer than three columns (e.g. blank lines) carry no letter.
            $structure .= $columns[2] if defined $columns[2];
        }
        $in_table = 1 if $line =~ /^No/;
    }
    return $structure;
}

# PsiPred: the prediction is the second whitespace separated field of every
# line starting with "Pred". Coil (C) is reported as L.
sub read_psipred {
    my ($fh) = @_;
    my $structure = "";
    while (my $line = <$fh>) {
        next unless $line =~ /^Pred/;
        # split ' ' ignores leading whitespace and drops the trailing newline.
        my (undef, $prediction) = split(' ', $line);
        $structure .= $prediction if defined $prediction;
    }
    $structure =~ tr/C/L/;
    return $structure;
}

# DSSP: residue records follow the "  #  RESIDUE ..." header line and use
# fixed columns, so the letter is taken by position instead of by counting
# space separated fields (which depends on the width of the residue numbers).
# Column 14 is the amino acid ("!" marks a chain break, which is not a
# residue and is skipped), column 17 is the secondary structure summary.
# NOTE(review): column positions follow the classic DSSP output format -
# confirm against the DSSP version in use.
sub read_dssp {
    my ($fh) = @_;
    my $structure   = "";
    my $in_residues = 0;
    while (my $line = <$fh>) {
        chomp $line;
        if (!$in_residues) {
            $in_residues = 1 if $line =~ /^\s*#\s+RESIDUE/;
            next;
        }
        next if length($line) < 14;
        next if substr($line, 13, 1) eq '!';
        my $letter = length($line) > 16 ? substr($line, 16, 1) : ' ';
        # A blank assignment means loop/irregular structure.
        $letter = '-' if $letter eq ' ';
        $structure .= $letter;
    }
    # G and I become H, B becomes E, S and T become L.
    $structure =~ tr/GIBST/HHELL/;
    return $structure;
}
# writes output file
# Opens $out for writing (an existing file is truncated), prints the collected
# structure string without appending a newline, and closes the handle.
# NOTE(review): this is a two-argument open on a bareword handle; a
# three-argument open with a lexical handle would be the safer form.
open(OUTFILE,">$out") || die $!; print OUTFILE $secstructure; close OUTFILE; </source>
SecStrucComparison.jar
The program takes two secondary structure sequences as input (represented with E, H, L and -) and calculates the precision between them.
Usage: java -jar SecStrucComparison.jar -i_seq1 <input-file 1>: first structure sequence (used as reference for precision) -i_seq2 <input-file 2>: second structure sequence -o <output-file>: returns a list with the precision for E, H and L of the first sequence as well as the overall precision.
The structure sequences must be in the format of the filter_secStruc.pl output. Furthermore the two sequences must have the same length.
You can find the script in /mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/SecStrucComparison.jar
The order of the two sequences matters, because they can contain different numbers of E, H and L. Only the letters of the first sequence are counted, and these counts are the denominators of the per-letter precision. The total precision does not depend on the order.
Java-code: <source lang="java"> package structureComparison;
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.PrintWriter; import java.util.HashMap;
/**
* * This class takes two secondary structure sequences (represented with E, H, L and -) and calculates the precision between those. * * @author Sonja Waldraff * */
public class SecStrucComparison { private static String i_seq1; private static String i_seq2; private static String o; private static HashMap<String, String> secStruc = new HashMap<String, String>(); private static double prec_E = 0.0; private static double prec_H = 0.0; private static double prec_L = 0.0; private static double prec_total = 0.0;
/**
 * Reads the command line parameters and stores them in the static fields.
 * <ul>
 * <li>-i_seq1: input file of the first secondary structure sequence (its letters are counted)</li>
 * <li>-i_seq2: input file of the second secondary structure sequence (compared with the first)</li>
 * <li>-o: output file that receives the precision for E, H and L as well as for all residues</li>
 * </ul>
 * The usage text is printed to standard error before any exception is thrown.
 *
 * @param args command line arguments; exactly three option/value pairs are expected
 * @throws IllegalArgumentException if the argument count is not six, an option
 *         is the last argument and so has no value, or one of the three options is missing
 */
public static void getCMDargs(String[] args) {
    final String usage = "You can call the program like: java -jar SecStrucComparison.jar -i_seq1 <input-file with first structure sequence (used for precision)> -i_seq2 <input-file with second structure sequence> -o <output-file>";

    if (args == null || args.length != 6) {
        System.err.println(usage);
        // Stop here: continuing would only fail later with a NullPointerException.
        throw new IllegalArgumentException("Expected exactly three option/value pairs.");
    }

    int j = 0;
    while (j < args.length) {
        String option = args[j];
        boolean known = option.equals("-i_seq1") || option.equals("-i_seq2") || option.equals("-o");
        if (known) {
            if (j + 1 >= args.length) {
                System.err.println(usage);
                throw new IllegalArgumentException("Option " + option + " is missing its value.");
            }
            j++;
            String value = args[j];
            if (option.equals("-i_seq1")) {
                i_seq1 = value;
            } else if (option.equals("-i_seq2")) {
                i_seq2 = value;
            } else {
                o = value;
            }
        }
        j++;
    }

    if (i_seq1 == null || i_seq2 == null || o == null) {
        System.err.println(usage);
        throw new IllegalArgumentException("The options -i_seq1, -i_seq2 and -o are all required.");
    }
}
/**
 * Reads a structure sequence from a file and stores it in {@code secStruc}
 * under the given key.
 * <p>
 * Blank lines are ignored, so a trailing empty line does not replace the
 * sequence. If the file contains several non-blank lines, the last one is
 * stored (as before). Surrounding whitespace is removed.
 *
 * @param seq key under which the sequence is stored
 * @param inputfile path of the file to read
 * @throws Exception if the file cannot be read or contains no non-blank line
 */
private static void read_file(String seq, String inputfile) throws Exception {
    BufferedReader bfreader = new BufferedReader(new FileReader(inputfile));
    String sequence = null;
    try {
        String line;
        while ((line = bfreader.readLine()) != null) {
            String trimmed = line.trim();
            if (trimmed.length() > 0) {
                sequence = trimmed;
            }
        }
    } finally {
        // Closing the wrapper also closes the underlying FileReader.
        bfreader.close();
    }

    if (sequence == null) {
        // Fail here with a clear message instead of passing null on to the comparison.
        throw new java.io.IOException("No structure sequence found in file: " + inputfile);
    }
    secStruc.put(seq, sequence);
}
/** * In the method compare the two sequences are compared on similarities. Thereby positions with '-' are ignored. * The sequences have to be represented with E, H and L. * @param seq1 * @param seq2 */ private static void compare(String seq1, String seq2) { HashMap<Character, Integer> match = new HashMap<Character, Integer>(); HashMap<Character, Integer> residues = new HashMap<Character, Integer>();
int total_match = 0; int total_res = 0;
match.put('E', 0); match.put('H', 0); match.put('L', 0); residues.put('E', 0); residues.put('H', 0); residues.put('L', 0);
for (int i = 0; i < seq1.length(); i++) { char comp = seq1.charAt(i); //the first sequence is used as reference. Only its letters are count not from the second sequence. if (comp != '-' && seq2.charAt(i) != '-') { total_res++; if (comp == seq2.charAt(i)) { total_match++; match.put(comp, match.get(comp) + 1); } residues.put(comp, residues.get(comp) + 1); } }
// As only the letters of the first are counted, there is a difference by choosing a sequence as first or second. // There is no difference for the total number. prec_E = precision(match.get('E'), residues.get('E')); prec_H = precision(match.get('H'),residues.get('H')); prec_L = precision(match.get('L'), residues.get('L')); prec_total = precision(total_match, total_res);
}
/**
 * Calculates the precision as a percentage: number of matches divided by the
 * total number of residues, times 100, rounded to two decimal places.
 *
 * @param match number of matching positions
 * @param total number of positions considered
 * @return the percentage rounded to two decimals
 */
public static double precision(int match, int total) {
    double percent = ((double) match / (double) total) * 100;
    // Scale up, round to the nearest whole number, then scale back to keep two decimals.
    long hundredths = Math.round(percent * 100);
    return hundredths / 100.0;
}
/**
 * Writes the output file named by {@code o}: one line each with the
 * precision for E, H, L and the total, as "label:&lt;TAB&gt;value".
 *
 * @throws Exception if the file cannot be opened or a write error occurred
 */
private static void write_file() throws Exception {
    PrintWriter out = new PrintWriter(new FileWriter(new File(o)));
    try {
        out.println("E:\t" + prec_E);
        out.println("H:\t" + prec_H);
        out.println("L:\t" + prec_L);
        out.println("total:\t" + prec_total);
        // PrintWriter never throws on write failures; checkError() flushes and reports them.
        if (out.checkError()) {
            throw new java.io.IOException("Could not write output file: " + o);
        }
    } finally {
        out.close();
    }
}
/**
 * Program entry point. Parses the command line with getCMDargs, reads the two
 * input files into the shared map under the keys "seq1" and "seq2", compares
 * the two stored sequences (the "seq1" entry is passed as first argument) and
 * finally writes the output file.
 *
 * @param args command line arguments, passed on to getCMDargs
 * @throws Exception propagated from the called methods
 */
public static void main(String[] args) throws Exception { getCMDargs(args); String sequence1 = "seq1"; String sequence2 = "seq2"; read_file(sequence1,i_seq1); read_file(sequence2, i_seq2); compare(secStruc.get(sequence1),secStruc.get(sequence2)); write_file(); } } </source>