Difference between revisions of "Phenylketonuria/Task3/Scripts"

Latest revision as of 15:09, 17 August 2013

Secondary Structure

filter_secStruc.pl

This script reads an input file in ReProf, PsiPred or DSSP format and extracts the secondary structure as a string of the letters E, H and L. For DSSP input, residues without an assigned secondary structure (loops or irregular regions) are written as "-".
You can find the script in /mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/filter_secStruc.pl

Usage: perl filter_secStruc.pl -inp <input-file> -out <output-file> -reprof|-psipred|-dssp 
-reprof: if the input file is result of a ReProf run.
-psipred: if the input file is result of a PsiPred run.
-dssp: if the input file is result of a DSSP run.

The output is the secondary structure written with the letters E, H and L. For PsiPred input the letter C is converted to L; for DSSP input the letters G and I are replaced by H, B by E, and S and T by L.

Code:

#!/usr/bin/perl
#
# filter_secStruc.pl
#
# Extracts the secondary structure string from a ReProf, PsiPred or DSSP
# result file and writes it to the output file as a single line (without a
# trailing newline) using the letters E, H and L. For DSSP input, residues
# without an assigned state are written as "-".

use strict;
use warnings;
use Getopt::Long;

my $USAGE = "Usage: filter_secStruc.pl -inp <input-file(.reprof)> -out <output-file> -reprof|-psipred|-dssp\n";

# Reads the command line parameters
my $inp;
my $out;
my $reprof  = 0;
my $psipred = 0;
my $dssp    = 0;

GetOptions(
    'inp=s'   => \$inp,
    'out=s'   => \$out,
    'reprof'  => \$reprof,
    'psipred' => \$psipred,
    'dssp'    => \$dssp,
) or die $USAGE;

# Both file names are mandatory, and exactly one input format must be chosen:
# the input handle can only be consumed by a single parser.
die $USAGE
    unless defined $inp && length $inp && defined $out && length $out;
die $USAGE
    unless $reprof + $psipred + $dssp == 1;

# Reads the input file and filters for the column with the secondary structure
open(my $in_fh, '<', $inp) or die "Cannot open input file '$inp': $!\n";

my $secstructure
    = $reprof  ? parse_reprof($in_fh)
    : $psipred ? parse_psipred($in_fh)
    :            parse_dssp($in_fh);

close($in_fh);

# Writes the output file
open(my $out_fh, '>', $out) or die "Cannot open output file '$out': $!\n";
print {$out_fh} $secstructure;
close($out_fh) or die "Cannot write output file '$out': $!\n";

# Collects the third tab-separated column of every line that follows the
# table header (the line starting with "No").
sub parse_reprof {
    my ($fh) = @_;

    my $structure = '';
    my $in_table  = 0;

    while (my $line = <$fh>) {
        chomp $line;
        if ($in_table) {
            my @columns = split /\t/, $line;
            $structure .= $columns[2] if defined $columns[2];
        }
        elsif ($line =~ /^No/) {
            $in_table = 1;
        }
    }

    return $structure;
}

# Concatenates the prediction strings of all "Pred:" lines and converts the
# PsiPred coil letter C to L.
sub parse_psipred {
    my ($fh) = @_;

    my $structure = '';

    while (my $line = <$fh>) {
        if ($line =~ /^Pred:\s+(\S+)/) {
            $structure .= $1;
        }
    }

    $structure =~ tr/C/L/;

    return $structure;
}

# Reads the secondary structure state of every residue line that follows the
# "  #  RESIDUE ..." header. DSSP output is fixed-width, so the state is taken
# from column 17 directly instead of being located by splitting on blanks.
# A blank state becomes "-"; afterwards G and I are mapped to H, B to E, and
# S and T to L.
sub parse_dssp {
    my ($fh) = @_;

    my $structure = '';
    my $in_table  = 0;

    while (my $line = <$fh>) {
        if ($in_table) {
            next if $line !~ /\S/;    # ignore empty lines, e.g. at the end of the file

            my $state = length($line) > 16 ? substr($line, 16, 1) : ' ';
            $state = '-' if $state !~ /\S/;
            $structure .= $state;
        }
        elsif ($line =~ /^\s*#\s+RESIDUE/) {
            $in_table = 1;
        }
    }

    $structure =~ tr/GIBST/HHELL/;

    return $structure;
}

SecStrucComparison.jar

The program takes two secondary structure sequences as input (written with E, H, L and -) and calculates the precision between them.

Usage:
java -jar SecStrucComparison.jar 
-i_seq1 <input-file 1>: first structure sequence (used as reference for precision) 
-i_seq2 <input-file 2>: second structure sequence
-o <output-file>: returns a list with the precision for E, H and L of the first sequence as well as the overall precision.

The structure sequences must be in the format of the filter_secStruc.pl output. Furthermore the two sequences must have the same length.
You can find the script in /mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/SecStrucComparison.jar

The order of the two sequences matters, because they can contain different numbers of E, H and L. These letters are counted in the first sequence only, and the counts are used to calculate the per-letter precision. The total precision does not depend on the order.

Java-code: <source lang="java"> package structureComparison;

import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.PrintWriter; import java.util.HashMap;

/**

* 
* This class takes two secondary structure sequences (represented with E, H, L and -) and calculates the precision between those.
* 
* @author Sonja Waldraff
*
*/

public class SecStrucComparison {
	/** Usage message printed when the command line is incomplete. */
	private static final String USAGE = "You can call the program like: java -jar SecStrucComparison.jar -i_seq1 <input-file with first structure sequence (used for precision)> -i_seq2 <input-file with second structure sequence> -o <output-file>";

	/** Secondary structure letters for which a separate precision is reported. */
	private static final String STATES = "EHL";

	private static String i_seq1;
	private static String i_seq2;
	private static String o;
	private static HashMap<String, String> secStruc = new HashMap<String, String>();
	private static double prec_E = 0.0;
	private static double prec_H = 0.0;
	private static double prec_L = 0.0;
	private static double prec_total = 0.0;

	/**
	 * Reads the input parameters.
	 * -i_seq1: input file of the secondary structure sequence (its letters are counted)
	 * -i_seq2: input file of the secondary structure sequence (to be compared with seq1)
	 * -o: output file that receives the precision for the letters E, H and L as well as for all residues.
	 *
	 * The usage message is printed to standard error when the number of arguments is not six
	 * or when one of the three parameters is still unset after parsing.
	 *
	 * @param args command line arguments
	 */
	public static void getCMDargs(String[] args) {
		if (args.length == 6) {
			int j = 0;
			while (j < args.length) {
				// A flag in the last position has no value; leave that parameter unset.
				boolean hasValue = j + 1 < args.length;
				if (args[j].equals("-i_seq1") && hasValue) {
					j++;
					i_seq1 = args[j];
				} else if (args[j].equals("-i_seq2") && hasValue) {
					j++;
					i_seq2 = args[j];
				} else if (args[j].equals("-o") && hasValue) {
					j++;
					o = args[j];
				}
				j++;
			}
		}
		if (i_seq1 == null || i_seq2 == null || o == null) {
			System.err.println(USAGE);
		}
	}

	/**
	 * Reads a structure sequence from a file and stores it under the given key.
	 * All lines are trimmed and concatenated, so a trailing empty line or a
	 * sequence wrapped over several lines does not truncate the sequence.
	 * An empty file is stored as an empty sequence.
	 *
	 * @param seq key under which the sequence is stored
	 * @param inputfile path of the file to read
	 * @throws Exception if the file cannot be opened or read
	 */
	private static void read_file(String seq, String inputfile) throws Exception {
		StringBuilder sequence = new StringBuilder();
		BufferedReader reader = new BufferedReader(new FileReader(inputfile));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				sequence.append(line.trim());
			}
		} finally {
			// Closing the buffered reader also closes the wrapped file reader.
			reader.close();
		}
		secStruc.put(seq, sequence.toString());
	}

	/**
	 * Compares the two sequences position by position. Positions where either
	 * sequence has '-' are ignored. The sequences have to be represented with
	 * E, H and L.
	 *
	 * The first sequence is the reference: the per-letter counts are taken from
	 * it, so swapping the sequences changes the per-letter precision but not the
	 * total precision. The results are stored in the prec_* fields.
	 *
	 * @param seq1 reference sequence
	 * @param seq2 sequence compared against the reference
	 * @throws IllegalArgumentException if a sequence is missing, the lengths differ,
	 *         or the reference contains a letter other than E, H, L or '-' at a compared position
	 */
	private static void compare(String seq1, String seq2) {
		if (seq1 == null || seq2 == null) {
			throw new IllegalArgumentException("Both structure sequences are required.");
		}
		if (seq1.length() != seq2.length()) {
			throw new IllegalArgumentException("The two sequences must have the same length (" + seq1.length() + " vs. " + seq2.length() + ").");
		}

		int[] match = new int[STATES.length()];
		int[] residues = new int[STATES.length()];
		int total_match = 0;
		int total_res = 0;

		for (int i = 0; i < seq1.length(); i++) {
			char comp = seq1.charAt(i);
			char other = seq2.charAt(i);
			if (comp == '-' || other == '-') {
				continue;
			}
			// The first sequence is used as reference. Only its letters are counted, not those of the second sequence.
			int state = STATES.indexOf(comp);
			if (state < 0) {
				throw new IllegalArgumentException("Unexpected character '" + comp + "' at position " + (i + 1) + " of the first sequence; expected E, H, L or -.");
			}
			total_res++;
			residues[state]++;
			if (comp == other) {
				total_match++;
				match[state]++;
			}
		}

		prec_E = precision(match[STATES.indexOf('E')], residues[STATES.indexOf('E')]);
		prec_H = precision(match[STATES.indexOf('H')], residues[STATES.indexOf('H')]);
		prec_L = precision(match[STATES.indexOf('L')], residues[STATES.indexOf('L')]);
		prec_total = precision(total_match, total_res);
	}

	/**
	 * Calculates the precision in percent, rounded to two decimal places:
	 * number of matches / total number of residues * 100.
	 *
	 * @param match number of matching residues
	 * @param total total number of residues
	 * @return the precision in percent, or 0.0 if total is not positive
	 */
	public static double precision(int match, int total) {
		if (total <= 0) {
			// Nothing was counted for this letter; avoid dividing by zero.
			return 0.0;
		}
		return Math.round(((double) match / total * 100) * 100) / 100.0;
	}

	/**
	 * Writes the output file: one line each with the precision for E, H, L and
	 * the total, as "label:" followed by a tab and the value.
	 *
	 * @throws Exception if the output file cannot be opened or written
	 */
	private static void write_file() throws Exception {
		PrintWriter out = new PrintWriter(new FileWriter(new File(o)));
		try {
			out.println("E:\t" + prec_E);
			out.println("H:\t" + prec_H);
			out.println("L:\t" + prec_L);
			out.println("total:\t" + prec_total);
			// PrintWriter does not throw on write errors, so ask for them explicitly.
			if (out.checkError()) {
				throw new java.io.IOException("Could not write output file: " + o);
			}
		} finally {
			out.close();
		}
	}

	/**
	 * Main method: reads the parameters and the two input files, runs the
	 * comparison and writes the result. Exits with status 1 when the command
	 * line is incomplete (getCMDargs has already printed the usage message).
	 *
	 * @param args command line arguments
	 * @throws Exception if reading, comparing or writing fails
	 */
	public static void main(String[] args) throws Exception {
		getCMDargs(args);
		if (i_seq1 == null || i_seq2 == null || o == null) {
			System.exit(1);
		}
		String sequence1 = "seq1";
		String sequence2 = "seq2";
		read_file(sequence1, i_seq1);
		read_file(sequence2, i_seq2);
		compare(secStruc.get(sequence1), secStruc.get(sequence2));
		write_file();
	}
}

Difference between revisions of "Phenylketonuria/Task3/Scripts"

Latest revision as of 15:09, 17 August 2013

Secondary Structure

filter_secStruc.pl

SecStrucComparison.jar

Navigation menu

Views

Personal tools

Bioinformatik navigation

MediaWiki navigation

Search

Tools

@@ Line 1: / Line 1: @@
+==Secondary Structure==
-This script reads an input file in ReProf format or PsiPred and filters the file for the secondary structur presented with E,H and L/C.<br>
+Back to [https://i12r-studfilesrv.informatik.tu-muenchen.de/wiki/index.php/Lab_Journal_-_Task_3_%28PAH%29#Secondary_structure Lab Journal]
-You can find the script in <code>/mnt/home/student/waldraffs/Masterpraktikum/Task3/filter_secStruc.pl</code>
+===filter_secStruc.pl===
+This script reads an input file in ReProf format or PsiPred and filters the file for the secondary structur presented with E,H and L/C. Also it can read a DSSP file. If the structure forms a loop or irregular a "-" is inserted.<br>
+You can find the script in <code>/mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/filter_secStruc.pl</code>
- Usage: perl filter_secStrucpl -inp <input-file in ReProf format> -out <output-file> -reprof|-psipred
+ Usage: perl filter_secStrucpl -inp <input-file in ReProf format> -out <output-file> -reprof|-psipred|-dssp
- -reprof: if the input file is result of a ReProf run
+ -reprof: if the input file is result of a ReProf run.
- -psipred: if the input file is result of a PsiPred run
+ -psipred: if the input file is result of a PsiPred run.
+ -dssp: if the input file is result of a DSSP run.
-The output is the secondary structure shown with the letters E, H and L/C.
+The output is the secondary structure shown with the letters E, H and L. Therefore at PsiPred the letter C is converted to L and at DSSP the letters G and I are replaced by H, B by E and S and T by L.
 Code:
@@ Line 18: / Line 22: @@
 my $reprof = 0;
 my $psipred = 0;
+my $dssp = 0;
 # Reads the command line parameters
@@ Line 32: / Line 37: @@
 	if($ARGV[$i] eq "-psipred"){
 		$psipred = 1;
+	}
+	if($ARGV[$i] eq "-dssp"){
+		$dssp = 1;
 	}
 }
 if(!$inp && !$out){
-	print "Usage: filter_secStruc.pl -inp <input-file(.reprof)> -out <output-file> -reprof|-psipred"
+	print "Usage: filter_secStruc.pl -inp <input-file(.reprof)> -out <output-file> -reprof|-psipred|-dssp"
 }
@@ Line 49: / Line 57: @@
 		if($no == 1){
 			@splitLine = split(/\t/,$line);
-			$secstructure="$secstructure$splitLine[2]";
+			$secstructure=$secstructure.$splitLine[2];
 		}
@@ Line 62: / Line 70: @@
 			@splitLine = split(/ /,$line);
 			chomp($splitLine[1]);
-			$secstructure = "$secstructure$splitLine[1]";
+			$secstructure = $secstructure.$splitLine[1];
 		}
 	}
+	$secstructure =~ s/C/L/g;
 }
+if($dssp ==1){
+	$number = 0;
+	while (my $line = <FILE>){
+		if($no ==1){
+			$line =~ s/^\s+//g;
+			@splitLine = split(/ /,$line);
+			my $letter = "";
+			if($splitLine[1] eq "" && $splitLine[2] eq "" && $splitLine[3] eq ""){
+					$number = 1;
+			}
+			if($splitLine[1] eq "" && $splitLine[2] eq "" && $splitLine[3] ne ""){
+					$number = 2;
+			}
+			if($splitLine[1] eq "" && $splitLine[2] ne ""){
+					$number = 3;
+			}
+			if($splitLine[0] < 10){
+				$letter = $splitLine[9-$number];
+			}
+			elsif($splitLine[0] >9 && $splitLine[0] < 100){
+				$letter = $splitLine[9-$number];
+			}
+			else{
+				$letter = $splitLine[9-$number];
+			}
+			if($letter eq ""){
+				$letter="-";
+			}
+			$secstructure = $secstructure.$letter;
+		}
+		if($line =~ /^  #/){
+			$no = 1;
+		}
+	}
+	$secstructure =~ s/[G,I]/H/g;
+	$secstructure =~ s/B/E/g;
+	$secstructure =~ s/[S,T]/L/g;
+	}
 close FILE;
@@ Line 72: / Line 120: @@
 	print OUTFILE $secstructure;
 close OUTFILE;
+</source>
+===SecStrucComparison.jar===
+The script takes two sequences as input (presented with E, H, L and -) and calculates the precision between those.
+ Usage:
+ java -jar SecStrucComparison.jar
+ -i_seq1 <input-file 1>: first structure sequence (used as reference for precision)
+ -i_seq2 <input-file 2>: second structure sequence
+ -o <output-file>: returns a list with the precision for E, H and L of the first sequence as well as the overall precision.
+The structure sequences must be in the format of the filter_secStruc.pl output. Furthermore the two sequences must have the same length. <br>
+You can find the script in <code>/mnt/home/student/waldraffs/masterpractical/Task03/secondary_structure/SecStrucComparison.jar</code>
+It must be considered which sequence should be used as first or second, as they can have a different number of E, H or L. Those letters are counted for the first sequence only and used for calculation of the precision. For the total precision there is no difference.
+Java-code:
+<source lang="java">
+package structureComparison;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+import java.util.HashMap;
+/**
+ *
+ * This class takes two secondary structure sequences (represented with E, H, L and -) and calculates the precision between those.
+ *
+ * @author Sonja Waldraff
+ *
+ */
+public class SecStrucComparison {
+	private static String i_seq1;
+	private static String i_seq2;
+	private static String o;
+	private static HashMap<String, String> secStruc = new HashMap<String, String>();
+	private static double prec_E = 0.0;
+	private static double prec_H = 0.0;
+	private static double prec_L = 0.0;
+	private static double prec_total = 0.0;
+	/**
+	 * read Input-Parameters
+	 * -i_seq1: Input-file of the secondary structure sequence (#letters are counted)
+	 * -i_seq2: Input-file of the secondary structure sequence (to be compared with seq1)
+	 * -o: Output-file returns a list with precision for the letters E, H and L as well as for all residues.
+	 *
+	 * @param args
+	 */
+	public static void getCMDargs(String[] args) {
+		int j = 0;
+		if (args.length == 6) {
+			while (j < args.length) {
+				if (args[j].equals("-i_seq1")) {
+					j++;
+					i_seq1 = new String(args[j]);
+				} else if (args[j].equals("-i_seq2")) {
+					j++;
+					i_seq2 = new String(args[j]);
+				} else if (args[j].equals("-o")) {
+					j++;
+					o = new String(args[j]);
+				}
+				j++;
+			}
+		} else {
+			System.err.println("You can call the program like: java -jar SecStrucComparison.jar -i_seq1 <input-file with first structure sequence (used for precision)> -i_seq2 <input-file with second structure sequence> -o <output-file>");
+		}
+	}
+	/**
+	 * Methods that reads the input file
+	 *
+	 * @param inputfile
+	 * @throws Exception
+	 */
+	private static void read_file(String seq, String inputfile) throws Exception {
+		String line;
+		try {
+			FileReader sfreader = new FileReader(inputfile);
+			BufferedReader bfreader = new BufferedReader(sfreader);
+			while ((line = bfreader.readLine()) != null) {
+				secStruc.put(seq, line);
+			}
+			sfreader.close();
+			bfreader.close();
+		} catch (IndexOutOfBoundsException e) {
+			e.printStackTrace();
+		}
+	}
+	/**
+	 * In the method compare the two sequences are compared on similarities. Thereby positions with '-' are ignored.
+	 * The sequences have to be represented with E, H and L.
+	 * @param seq1
+	 * @param seq2
+	 */
+	private static void compare(String seq1, String seq2) {
+		HashMap<Character, Integer> match = new HashMap<Character, Integer>();
+		HashMap<Character, Integer> residues = new HashMap<Character, Integer>();
+		int total_match = 0;
+		int total_res = 0;
+		match.put('E', 0);	match.put('H', 0);	match.put('L', 0);
+		residues.put('E', 0);	residues.put('H', 0);	residues.put('L', 0);
+		for (int i = 0; i < seq1.length(); i++) {
+			char comp = seq1.charAt(i);
+			//the first sequence is used as reference. Only its letters are count not from the second sequence.
+			if (comp != '-' && seq2.charAt(i) != '-') {
+				total_res++;
+				if (comp == seq2.charAt(i)) {
+					total_match++;
+					match.put(comp, match.get(comp) + 1);
+				}
+				residues.put(comp, residues.get(comp) + 1);
+			}
+		}
+		// As only the letters of the first are counted, there is a difference by choosing a sequence as first or second.
+		// There is no difference for the total number.
+		prec_E = precision(match.get('E'), residues.get('E'));
+		prec_H = precision(match.get('H'),residues.get('H'));
+		prec_L = precision(match.get('L'), residues.get('L'));
+		prec_total = precision(total_match, total_res);
+	}
+	/**
+	 * method for calculating the precision: number of matches / total number of residues.
+	 */
+	public static double precision(int match, int total) {
+		double prec = 0.0;
+		prec = Math.round((Double.valueOf(match)/(Double.valueOf(total))*100)*100)/100.0;
+		return prec;
+	}
+	/**
+	 * Method that writes the outputfile: returns a list with precision for E, H, L and all three.
+	 *
+	 * @throws Exception
+	 */
+	private static void write_file() throws Exception {
+		PrintWriter OUT = new PrintWriter(new FileWriter(new File(o)));
+		OUT.println("E:\t" + prec_E);
+		OUT.println("H:\t" + prec_H);
+		OUT.println("L:\t" + prec_L);
+		OUT.println("total:\t" + prec_total);
+		OUT.close();
+	}
+	/**
+	 * Main method that let read parameters, files and starts the comparison.
+	 * @param args
+	 * @throws Exception
+	 */
+	public static void main(String[] args) throws Exception {
+		getCMDargs(args);
+		String sequence1 = "seq1";
+		String sequence2 = "seq2";
+		read_file(sequence1,i_seq1);
+		read_file(sequence2, i_seq2);
+		compare(secStruc.get(sequence1),secStruc.get(sequence2));
+		write_file();
+	}
+}
 </source>