Difference between revisions of "Phenylketonuria/Task2 Scripts"

Latest revision as of 09:46, 27 August 2013

Identity_Evalue.jar

Identity_Evalue.jar is a Java program that filters BLAST, PSIBLAST and HHblits outputs for the e-values and identities.
It can be found here: /mnt/home/student/waldraffs/masterpractical/Task02/Identity_Evalue.jar
Usage: java -jar Identity_Evalue.jar -i <input-file> -o_identity <output-file> -o_evalue <output-file>

-i:          Input must be the output of a BLAST, PSIBLAST or HHblits run
-o_identity: Outputfile containing the identities
-o_evalue:   Outputfile containing e-values

Java-code: <source lang="java"> package identity;

import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collections;

/**

* 
* Class that filters HHblits, BLAST and PSIBLAST output files for e-value and identity in percent.
* @author Sonja Waldraff
*
*/

public class Identity_Evalue{

private static String i; private static String o_identity; private static boolean peId = false; private static String o_evalue; private static boolean ev = false;

private static ArrayList<Double> percentageIdentity = new ArrayList<Double>(); private static ArrayList<Double> eValue = new ArrayList<Double>();

/** * read Input-Parameters -i: Input-file, -o_evalue: Output-file returns a * List of the e-values, -o_identity: Output-file returns a List of identity * in percent -hhblits|blast: information if the input-file is a blast(/psiblast) or a * hhblits output * * @param args */ public static void getCMDargs(String[] args) { int j = 0; if (args.length >= 4) { while (j < args.length) { if (args[j].equals("-i")) { j++; i = new String(args[j]); } else if (args[j].equals("-o_identity")) { j++; o_identity = new String(args[j]); peId = true; } else if (args[j].equals("-o_evalue")) { j++; o_evalue = new String(args[j]); ev = true; } j++;

} } else { System.out .println("You can call the program like: -i <input-file (result of blast/psiblast/hhblits)> -o_identity <output-file for identity percentage> -o_evalue <output-file for evalue>"); } }

/** * Methods that reads the input file and filters for the e-value and identity in % * @param inputfile * @throws Exception */ private static void read_file(String inputfile) throws Exception { String line; try { FileReader sfreader = new FileReader(inputfile); BufferedReader bfreader = new BufferedReader(sfreader); while ((line = bfreader.readLine()) != null){ if(!line.startsWith("#")){ String[] x = line.split("\t"); eValue.add(Double.valueOf(x[1])); percentageIdentity.add(Double.valueOf(x[2])); } } Collections.sort(percentageIdentity); Collections.sort(eValue);

sfreader.close(); bfreader.close(); } catch (IndexOutOfBoundsException e) { e.printStackTrace(); }

}

/** * Method that writes e-values and identities in their outputfiles * @throws Exception */ private static void write_file() throws Exception { //if an output file for identities is claimed if (peId) { PrintWriter OUT = new PrintWriter(new FileWriter(new File( o_identity))); for (double i : percentageIdentity) { OUT.println(i); } OUT.close(); } //if an output file for e-values is claimed if (ev) { PrintWriter OUT2 = new PrintWriter( new FileWriter(new File(o_evalue))); for (double d : eValue) { OUT2.println(d); } OUT2.close(); } }

/** * Main-method * @param args * @throws Exception */ public static void main(String[] args) throws Exception { getCMDargs(args); read_file(i); write_file(); } } </source>

id_eval_hist.r

After that following R script was used to create the histograms based on those outputfiles: <source lang="gnuplot">

pdf(".../Identity_Evalue_Hist.pdf") par(mfrow=c(2,2))

input<-read.table(".../BLAST/identity_PAH_Blast_big_80.txt",sep="\t") hist(input[,], col="blue", xlab="identity",ylab="frequency", main = "Sequence identity - BLAST")

input1<-read.table(".../BLAST/evalue_PAH_Blast_big_80.txt",sep="\t") hist(log(input1[,]), col="blue", xlab="log(e-value)",ylab="frequency", main = "E-Value - BLAST")

input11<-read.table(".../HHblits/identity_PAH_2000.txt",sep="\t") hist(input11[,], col="forestgreen",xlab="identity",ylab="frequency",main = "Sequence identity - HHblits")

input12<-read.table(".../HHblits/evalue_PAH_2000.txt",sep="\t") hist(log(input12[,]), col="forestgreen",xlab="log(e-value)",ylab="frequency",main = "E-Value - HHblits")

library(plotrix)

par(mfrow=c(1,1))

input3<-read.table(".../PSIBLAST/identity_psi_2_2.txt",sep="\t") input4<-read.table(".../PSIBLAST/identity_psi_2_10.txt",sep="\t") input5<-read.table(".../PSIBLAST/identity_psi_10_2.txt",sep="\t") input6<-read.table(".../PSIBLAST/identity_psi_10_10.txt",sep="\t") l<-list(input3[,],input4[,],input5[,],input6[,]) multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=30,xlab="identity",ylab="frequency", main = "Sequence identity - PSIBLAST") legend("topright",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3"))

input3<-read.table(".../PSIBLAST/evalue_psi_2_2.txt",sep="\t") input4<-read.table(".../PSIBLAST/evalue_psi_2_10.txt",sep="\t") input5<-read.table(".../PSIBLAST/evalue_psi_10_2.txt",sep="\t") input6<-read.table(".../PSIBLAST/evalue_psi_10_10.txt",sep="\t") l<-list(log(input3[,]),log(input4[,]),log(input5[,]),log(input6[,])) multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=20,xlab="log(e-value)",ylab="frequency", main = "E-Value - PSIBLAST") legend("topleft",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3"))

dev.off() </source>

GO_comparison.jar

GO_comparison.jar is a Java program that compares the GO-terms of a reference protein with the GO-terms of other protein sequences.
The file is stored here: /mnt/home/student/waldraffs/masterpractical/Task02/GO_comparison.jar.
Usage: java -jar GO_comparison.jar -i <input-file> -o <output-file> -reference_id <uniprot-id> -hhblits|-blast

-i:            Path to the special parsed (psi-)blast/hhblits outputs using Parse_output.pl
-o:            Path, where the output should be stored
-reference_id: UniProt identifier (i.e. P00439) for the reference sequence with which the sequences of the input file should be compared
-hhblits:      if the input file was the result of a HHblits run or
 |blast:       if the input file was the result of a BLAST or PSIBLAST run.

Java-code: <source lang="java"> package identity;

import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet;

/**

* 
* Class that compares the GO-terms of P00439 to the GO-terms of sequences found
* in BLAST, PSIBLAST or HHblits runs.
* 
* @author Sonja Waldraff
* 
*/

public class GO_comparison {

private static String i; private static String o; private static String reference_id; private static boolean hhblits = false; private static boolean blast = false;

private static ArrayList<String> id = new ArrayList<String>(); private static HashMap<String, Integer> numberGO = new HashMap<>();

/** * read Input-Parameters -i: Input-file, -o: Output-file, -reference_id: * UniProt-ID of the reference sequence, -hhblits|-blast if input was result * of a hhblits or a (psi-)blast run * * @param args */ public static void getCMDargs(String[] args) { int j = 0; if (args.length == 7) { while (j < args.length) { if (args[j].equals("-i")) { j++; i = new String(args[j]); } else if (args[j].equals("-o")) { j++; o = new String(args[j]); } else if (args[j].equals("-reference_id")) { j++; reference_id = new String(args[j]); } else if (args[j].equals("-hhblits")) { hhblits = true; } else if (args[j].equals("-blast")) { blast = true; } j++; } } else { System.out.println("You can call the program like: -i <input-file special parsed (psi-)blast/hhblits outputs> -o <output-file> -reference_id <uniprot-id of the reference sequence> -hhblits|-blast"); } }

/** * Method that reads the input file per line and filters for the IDs of the * found sequences. * * @param inputfile * @throws Exception */ private static void read_file(String inputfile) throws Exception { String line; try { FileReader sfreader = new FileReader(inputfile); BufferedReader bfreader = new BufferedReader(sfreader);

if (hhblits) { while ((line = bfreader.readLine()) != null) { if (!line.startsWith("#")) { String[] x = line.split("\t"); id.add(x[0]); }

} } if (blast) { while ((line = bfreader.readLine()) != null) { if (line.startsWith("tr") || line.startsWith("sp")) { String[] x = line.split("\\|"); id.add(x[1]); }

} } sfreader.close(); bfreader.close(); } catch (IndexOutOfBoundsException e) { e.printStackTrace(); }

}

private static void compareGOs(HashSet<String> go_PAH) throws Exception { // countKeys and countNotKeys to check, if all IDs can be found in // QuickGO int countKeys = 0; int countNotKeys = 0; System.out.println(id.size()); int s_i=0; // comparison of the reference GOs with the GOs of the other sequences for (String s : id) { HashSet<String> newTerms = ReadQuickGO.readHTMLPage(s); s_i++; System.out.println(s_i); if (newTerms == null) { countNotKeys++; } else { countKeys++; for (String go_i : go_PAH) { if (newTerms.contains(go_i)) { numberGO.put(go_i, numberGO.get(go_i) + 1); } } } }

System.out.println("found Keys: " + countKeys + "\nnot found:" + countNotKeys); }

/** * Main-method which compares the GO terms of the reference sequence to the * GO terms of the other found sequences downloaded from QuickGo with the * class ReadQuickGO * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { getCMDargs(args); read_file(i); // GO-terms of the reference sequence HashSet<String> go_PAH = ReadQuickGO.readHTMLPage(reference_id); for (String i : go_PAH) { numberGO.put(i, 0); }

// comparison of the GO-Terms compareGOs(go_PAH);

// output-file is written with the GO-terms and their number of // occurrence in the other sequences. PrintWriter OUT = new PrintWriter(new FileWriter(new File(o))); OUT.println("GO\t#occurrence");

for (String s : go_PAH) { OUT.println(s + "\t" + numberGO.get(s)); } OUT.close();

} }

package identity;

import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.util.HashSet;

public class ReadQuickGO {

/** * reads the HTML Page of QuickGO to download the GO annotations of the given UniProt ID. * @param uniProtID * @return set of GOterms annotated for this UniProt ID * @throws Exception */ public static HashSet<String> readHTMLPage(String uniProtID) throws Exception { HashSet<String> goTerms = new HashSet<String>(); BufferedReader reader = null; //HTML-Link String urlString = "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" + uniProtID + "&format=tsv"; URL url = new URL(urlString); try { reader = new BufferedReader(new InputStreamReader(url.openStream())); //finding the column with GO-ID int index=-1; String[] go_id = reader.readLine().split("\t");

for(int i=0;i<go_id.length;i++){

               if(go_id[i].equals("GO ID")){
                   index=i;
               }
           } 
           if(index==-1){
               throw new Exception("No GO ID found for " + uniProtID);
           }
       	//saving the GO-Terms

String line;

           while ((line = reader.readLine()) != null) {

String[] lineSplit = line.split("\t"); goTerms.add(lineSplit[index]); }

reader.close();

} catch (Exception e) { throw new Exception("Could not find any GO-term for " + uniProtID); } //Returning HashSet with GO-Terms return goTerms; } }

</source>

Difference between revisions of "Phenylketonuria/Task2 Scripts"

Latest revision as of 09:46, 27 August 2013

Identity_Evalue.jar

id_eval_hist.r

GO_comparison.jar

Navigation menu

Views

Personal tools

Bioinformatik navigation

MediaWiki navigation

Search

Tools

@@ Line 1: / Line 1: @@
+Back to [https://i12r-studfilesrv.informatik.tu-muenchen.de/wiki/index.php/Lab_Journal_-_Task_2_%28PAH%29#Sequence_searches Lab journal]
-===Identity_Evalue.jar===
+==Identity_Evalue.jar==
 Identity_Evalue.jar is a Java program that filters BLAST, PSIBLAST and HHblits outputs for the e-values and identities. <br>It can be found here: <code>/mnt/home/student/waldraffs/masterpractical/Task02/Identity_Evalue.jar</code><br>Usage:
 <code>
@@ Line 134: / Line 135: @@
 }
 </source>
+===id_eval_hist.r===
+After that following R script was used to create the histograms based on those outputfiles:
+<source lang="gnuplot">
+pdf(".../Identity_Evalue_Hist.pdf")
-===GO_comparison.jar===
+par(mfrow=c(2,2))
-GO_comparison.jar is a Java program that compares the GO-terms of a reference protein with the GO-terms of other protein sequences. It can be found here: <code>/mnt/home/student/waldraffs/masterpractical/Task02/GO_comparison.jar</code>.<br>Usage:
+input<-read.table(".../BLAST/identity_PAH_Blast_big_80.txt",sep="\t")
+hist(input[,], col="blue", xlab="identity",ylab="frequency", main = "Sequence identity - BLAST")
+input1<-read.table(".../BLAST/evalue_PAH_Blast_big_80.txt",sep="\t")
+hist(log(input1[,]), col="blue", xlab="log(e-value)",ylab="frequency", main = "E-Value - BLAST")
+input11<-read.table(".../HHblits/identity_PAH_2000.txt",sep="\t")
+hist(input11[,], col="forestgreen",xlab="identity",ylab="frequency",main = "Sequence identity - HHblits")
+input12<-read.table(".../HHblits/evalue_PAH_2000.txt",sep="\t")
+hist(log(input12[,]), col="forestgreen",xlab="log(e-value)",ylab="frequency",main = "E-Value - HHblits")
+library(plotrix)
+par(mfrow=c(1,1))
+input3<-read.table(".../PSIBLAST/identity_psi_2_2.txt",sep="\t")
+input4<-read.table(".../PSIBLAST/identity_psi_2_10.txt",sep="\t")
+input5<-read.table(".../PSIBLAST/identity_psi_10_2.txt",sep="\t")
+input6<-read.table(".../PSIBLAST/identity_psi_10_10.txt",sep="\t")
+l<-list(input3[,],input4[,],input5[,],input6[,])
+multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=30,xlab="identity",ylab="frequency", main = "Sequence identity - PSIBLAST")
+legend("topright",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3"))
+input3<-read.table(".../PSIBLAST/evalue_psi_2_2.txt",sep="\t")
+input4<-read.table(".../PSIBLAST/evalue_psi_2_10.txt",sep="\t")
+input5<-read.table(".../PSIBLAST/evalue_psi_10_2.txt",sep="\t")
+input6<-read.table(".../PSIBLAST/evalue_psi_10_10.txt",sep="\t")
+l<-list(log(input3[,]),log(input4[,]),log(input5[,]),log(input6[,]))
+multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=20,xlab="log(e-value)",ylab="frequency", main = "E-Value - PSIBLAST")
+legend("topleft",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3"))
+dev.off()
+</source>
+==GO_comparison.jar==
+GO_comparison.jar is a Java program that compares the GO-terms of a reference protein with the GO-terms of other protein sequences. <br>The file is stored here: <code>/mnt/home/student/waldraffs/masterpractical/Task02/GO_comparison.jar</code>.<br>Usage:
 <code>java -jar GO_comparison.jar -i <input-file> -o <output-file> -reference_id <uniprot-id> -hhblits|-blast</code>
  -i:            Path to the special parsed (psi-)blast/hhblits outputs using [https://i12r-studfilesrv.informatik.tu-muenchen.de/wiki/index.php/Parse_output.pl Parse_output.pl]