Difference between revisions of "Phenylketonuria/Task2 Scripts"
(→Identity_Evalue.jar) |
(→id_eval_hist.r) |
||
(6 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | Back to [https://i12r-studfilesrv.informatik.tu-muenchen.de/wiki/index.php/Lab_Journal_-_Task_2_%28PAH%29#Sequence_searches Lab journal] |
||
− | ===Identity_Evalue.jar=== |
||
+ | ==Identity_Evalue.jar== |
||
Identity_Evalue.jar is a Java program that filters BLAST, PSIBLAST and HHblits outputs for the e-values and identities. <br>It can be found here: <code>/mnt/home/student/waldraffs/masterpractical/Task02/Identity_Evalue.jar</code><br>Usage: |
Identity_Evalue.jar is a Java program that filters BLAST, PSIBLAST and HHblits outputs for the e-values and identities. <br>It can be found here: <code>/mnt/home/student/waldraffs/masterpractical/Task02/Identity_Evalue.jar</code><br>Usage: |
||
<code> |
<code> |
||
Line 134: | Line 135: | ||
} |
} |
||
</source> |
</source> |
||
+ | ===id_eval_hist.r=== |
||
+ | After that following R script was used to create the histograms based on those outputfiles: |
||
+ | <source lang="gnuplot"> |
||
+ | pdf(".../Identity_Evalue_Hist.pdf") |
||
− | ===GO_comparison.jar=== |
||
+ | par(mfrow=c(2,2)) |
||
− | GO_comparison.jar is a Java program that compares the GO-terms of a reference protein with the GO-terms of other protein sequences. It can be found here: <code>/mnt/home/student/waldraffs/masterpractical/Task02/GO_comparison.jar</code>.<br>Usage: |
||
+ | |||
+ | input<-read.table(".../BLAST/identity_PAH_Blast_big_80.txt",sep="\t") |
||
+ | hist(input[,], col="blue", xlab="identity",ylab="frequency", main = "Sequence identity - BLAST") |
||
+ | |||
+ | input1<-read.table(".../BLAST/evalue_PAH_Blast_big_80.txt",sep="\t") |
||
+ | hist(log(input1[,]), col="blue", xlab="log(e-value)",ylab="frequency", main = "E-Value - BLAST") |
||
+ | |||
+ | input11<-read.table(".../HHblits/identity_PAH_2000.txt",sep="\t") |
||
+ | hist(input11[,], col="forestgreen",xlab="identity",ylab="frequency",main = "Sequence identity - HHblits") |
||
+ | |||
+ | input12<-read.table(".../HHblits/evalue_PAH_2000.txt",sep="\t") |
||
+ | hist(log(input12[,]), col="forestgreen",xlab="log(e-value)",ylab="frequency",main = "E-Value - HHblits") |
||
+ | |||
+ | library(plotrix) |
||
+ | |||
+ | par(mfrow=c(1,1)) |
||
+ | |||
+ | input3<-read.table(".../PSIBLAST/identity_psi_2_2.txt",sep="\t") |
||
+ | input4<-read.table(".../PSIBLAST/identity_psi_2_10.txt",sep="\t") |
||
+ | input5<-read.table(".../PSIBLAST/identity_psi_10_2.txt",sep="\t") |
||
+ | input6<-read.table(".../PSIBLAST/identity_psi_10_10.txt",sep="\t") |
||
+ | l<-list(input3[,],input4[,],input5[,],input6[,]) |
||
+ | multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=30,xlab="identity",ylab="frequency", main = "Sequence identity - PSIBLAST") |
||
+ | legend("topright",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3")) |
||
+ | |||
+ | input3<-read.table(".../PSIBLAST/evalue_psi_2_2.txt",sep="\t") |
||
+ | input4<-read.table(".../PSIBLAST/evalue_psi_2_10.txt",sep="\t") |
||
+ | input5<-read.table(".../PSIBLAST/evalue_psi_10_2.txt",sep="\t") |
||
+ | input6<-read.table(".../PSIBLAST/evalue_psi_10_10.txt",sep="\t") |
||
+ | l<-list(log(input3[,]),log(input4[,]),log(input5[,]),log(input6[,])) |
||
+ | multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=20,xlab="log(e-value)",ylab="frequency", main = "E-Value - PSIBLAST") |
||
+ | legend("topleft",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3")) |
||
+ | |||
+ | dev.off() |
||
+ | </source> |
||
+ | |||
+ | ==GO_comparison.jar== |
||
+ | GO_comparison.jar is a Java program that compares the GO-terms of a reference protein with the GO-terms of other protein sequences. <br>The file is stored here: <code>/mnt/home/student/waldraffs/masterpractical/Task02/GO_comparison.jar</code>.<br>Usage: |
||
<code>java -jar GO_comparison.jar -i <input-file> -o <output-file> -reference_id <uniprot-id> -hhblits|-blast</code> |
<code>java -jar GO_comparison.jar -i <input-file> -o <output-file> -reference_id <uniprot-id> -hhblits|-blast</code> |
||
-i: Path to the special parsed (psi-)blast/hhblits outputs using [https://i12r-studfilesrv.informatik.tu-muenchen.de/wiki/index.php/Parse_output.pl Parse_output.pl] |
-i: Path to the special parsed (psi-)blast/hhblits outputs using [https://i12r-studfilesrv.informatik.tu-muenchen.de/wiki/index.php/Parse_output.pl Parse_output.pl] |
Latest revision as of 09:46, 27 August 2013
Back to Lab journal
Identity_Evalue.jar
Identity_Evalue.jar is a Java program that filters BLAST, PSIBLAST and HHblits outputs for the e-values and identities.
It can be found here: /mnt/home/student/waldraffs/masterpractical/Task02/Identity_Evalue.jar
Usage:
java -jar Identity_Evalue.jar -i <input-file> -o_identity <output-file> -o_evalue <output-file>
-i: Input must be the output of a BLAST, PSIBLAST or HHblits run -o_identity: Outputfile containing the identities -o_evalue: Outputfile containing e-values
Java-code: <source lang="java"> package identity;
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collections;
/**
* * Class that filters HHblits, BLAST and PSIBLAST output files for e-value and identity in percent. * @author Sonja Waldraff * */
public class Identity_Evalue{
private static String i; private static String o_identity; private static boolean peId = false; private static String o_evalue; private static boolean ev = false;
private static ArrayList<Double> percentageIdentity = new ArrayList<Double>();
private static ArrayList<Double> eValue = new ArrayList<Double>();
/** * read Input-Parameters -i: Input-file, -o_evalue: Output-file returns a * List of the e-values, -o_identity: Output-file returns a List of identity * in percent -hhblits|blast: information if the input-file is a blast(/psiblast) or a * hhblits output * * @param args */ public static void getCMDargs(String[] args) { int j = 0; if (args.length >= 4) { while (j < args.length) { if (args[j].equals("-i")) { j++; i = new String(args[j]); } else if (args[j].equals("-o_identity")) { j++; o_identity = new String(args[j]); peId = true; } else if (args[j].equals("-o_evalue")) { j++; o_evalue = new String(args[j]); ev = true; } j++;
} } else { System.out .println("You can call the program like: -i <input-file (result of blast/psiblast/hhblits)> -o_identity <output-file for identity percentage> -o_evalue <output-file for evalue>"); } }
/** * Methods that reads the input file and filters for the e-value and identity in % * @param inputfile * @throws Exception */ private static void read_file(String inputfile) throws Exception { String line; try { FileReader sfreader = new FileReader(inputfile); BufferedReader bfreader = new BufferedReader(sfreader); while ((line = bfreader.readLine()) != null){ if(!line.startsWith("#")){ String[] x = line.split("\t"); eValue.add(Double.valueOf(x[1])); percentageIdentity.add(Double.valueOf(x[2])); } } Collections.sort(percentageIdentity); Collections.sort(eValue);
sfreader.close(); bfreader.close(); } catch (IndexOutOfBoundsException e) { e.printStackTrace(); }
}
/** * Method that writes e-values and identities in their outputfiles * @throws Exception */ private static void write_file() throws Exception { //if an output file for identities is claimed if (peId) { PrintWriter OUT = new PrintWriter(new FileWriter(new File( o_identity))); for (double i : percentageIdentity) { OUT.println(i); } OUT.close(); } //if an output file for e-values is claimed if (ev) { PrintWriter OUT2 = new PrintWriter( new FileWriter(new File(o_evalue))); for (double d : eValue) { OUT2.println(d); } OUT2.close(); } }
/** * Main-method * @param args * @throws Exception */ public static void main(String[] args) throws Exception { getCMDargs(args); read_file(i); write_file(); } } </source>
id_eval_hist.r
After that following R script was used to create the histograms based on those outputfiles: <source lang="gnuplot">
pdf(".../Identity_Evalue_Hist.pdf") par(mfrow=c(2,2))
input<-read.table(".../BLAST/identity_PAH_Blast_big_80.txt",sep="\t") hist(input[,], col="blue", xlab="identity",ylab="frequency", main = "Sequence identity - BLAST")
input1<-read.table(".../BLAST/evalue_PAH_Blast_big_80.txt",sep="\t") hist(log(input1[,]), col="blue", xlab="log(e-value)",ylab="frequency", main = "E-Value - BLAST")
input11<-read.table(".../HHblits/identity_PAH_2000.txt",sep="\t") hist(input11[,], col="forestgreen",xlab="identity",ylab="frequency",main = "Sequence identity - HHblits")
input12<-read.table(".../HHblits/evalue_PAH_2000.txt",sep="\t") hist(log(input12[,]), col="forestgreen",xlab="log(e-value)",ylab="frequency",main = "E-Value - HHblits")
library(plotrix)
par(mfrow=c(1,1))
input3<-read.table(".../PSIBLAST/identity_psi_2_2.txt",sep="\t") input4<-read.table(".../PSIBLAST/identity_psi_2_10.txt",sep="\t") input5<-read.table(".../PSIBLAST/identity_psi_10_2.txt",sep="\t") input6<-read.table(".../PSIBLAST/identity_psi_10_10.txt",sep="\t") l<-list(input3[,],input4[,],input5[,],input6[,]) multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=30,xlab="identity",ylab="frequency", main = "Sequence identity - PSIBLAST") legend("topright",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3"))
input3<-read.table(".../PSIBLAST/evalue_psi_2_2.txt",sep="\t") input4<-read.table(".../PSIBLAST/evalue_psi_2_10.txt",sep="\t") input5<-read.table(".../PSIBLAST/evalue_psi_10_2.txt",sep="\t") input6<-read.table(".../PSIBLAST/evalue_psi_10_10.txt",sep="\t") l<-list(log(input3[,]),log(input4[,]),log(input5[,]),log(input6[,])) multhist(l,col=c("blue","yellow","forestgreen","red3"),breaks=20,xlab="log(e-value)",ylab="frequency", main = "E-Value - PSIBLAST") legend("topleft",legend=c("2 x 0.002","2 x 10e-10","10 x 0.002","10 x 10e-10"), fill=c("blue","yellow","forestgreen","red3"))
dev.off() </source>
GO_comparison.jar
GO_comparison.jar is a Java program that compares the GO-terms of a reference protein with the GO-terms of other protein sequences.
The file is stored here: /mnt/home/student/waldraffs/masterpractical/Task02/GO_comparison.jar
.
Usage:
java -jar GO_comparison.jar -i <input-file> -o <output-file> -reference_id <uniprot-id> -hhblits|-blast
-i: Path to the special parsed (psi-)blast/hhblits outputs using Parse_output.pl -o: Path, where the output should be stored -reference_id: UniProt identifier (i.e. P00439) for the reference sequence with which the sequences of the input file should be compared -hhblits: if the input file was the result of a HHblits run or |blast: if the input file was the result of a BLAST or PSIBLAST run.
Java-code: <source lang="java"> package identity;
import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet;
/**
* * Class that compares the GO-terms of P00439 to the GO-terms of sequences found * in BLAST, PSIBLAST or HHblits runs. * * @author Sonja Waldraff * */
public class GO_comparison {
private static String i; private static String o; private static String reference_id; private static boolean hhblits = false; private static boolean blast = false;
private static ArrayList<String> id = new ArrayList<String>(); private static HashMap<String, Integer> numberGO = new HashMap<>();
/** * read Input-Parameters -i: Input-file, -o: Output-file, -reference_id: * UniProt-ID of the reference sequence, -hhblits|-blast if input was result * of a hhblits or a (psi-)blast run * * @param args */ public static void getCMDargs(String[] args) { int j = 0; if (args.length == 7) { while (j < args.length) { if (args[j].equals("-i")) { j++; i = new String(args[j]); } else if (args[j].equals("-o")) { j++; o = new String(args[j]); } else if (args[j].equals("-reference_id")) { j++; reference_id = new String(args[j]); } else if (args[j].equals("-hhblits")) { hhblits = true; } else if (args[j].equals("-blast")) { blast = true; } j++; } } else { System.out.println("You can call the program like: -i <input-file special parsed (psi-)blast/hhblits outputs> -o <output-file> -reference_id <uniprot-id of the reference sequence> -hhblits|-blast"); } }
/** * Method that reads the input file per line and filters for the IDs of the * found sequences. * * @param inputfile * @throws Exception */ private static void read_file(String inputfile) throws Exception { String line; try { FileReader sfreader = new FileReader(inputfile); BufferedReader bfreader = new BufferedReader(sfreader);
if (hhblits) { while ((line = bfreader.readLine()) != null) { if (!line.startsWith("#")) { String[] x = line.split("\t"); id.add(x[0]); }
} } if (blast) { while ((line = bfreader.readLine()) != null) { if (line.startsWith("tr") || line.startsWith("sp")) { String[] x = line.split("\\|"); id.add(x[1]); }
} } sfreader.close(); bfreader.close(); } catch (IndexOutOfBoundsException e) { e.printStackTrace(); }
}
private static void compareGOs(HashSet<String> go_PAH) throws Exception { // countKeys and countNotKeys to check, if all IDs can be found in // QuickGO int countKeys = 0; int countNotKeys = 0; System.out.println(id.size()); int s_i=0; // comparison of the reference GOs with the GOs of the other sequences for (String s : id) { HashSet<String> newTerms = ReadQuickGO.readHTMLPage(s); s_i++; System.out.println(s_i); if (newTerms == null) { countNotKeys++; } else { countKeys++; for (String go_i : go_PAH) { if (newTerms.contains(go_i)) { numberGO.put(go_i, numberGO.get(go_i) + 1); } } } }
System.out.println("found Keys: " + countKeys + "\nnot found:" + countNotKeys); }
/** * Main-method which compares the GO terms of the reference sequence to the * GO terms of the other found sequences downloaded from QuickGo with the * class ReadQuickGO * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { getCMDargs(args); read_file(i); // GO-terms of the reference sequence HashSet<String> go_PAH = ReadQuickGO.readHTMLPage(reference_id); for (String i : go_PAH) { numberGO.put(i, 0); }
// comparison of the GO-Terms compareGOs(go_PAH);
// output-file is written with the GO-terms and their number of // occurrence in the other sequences. PrintWriter OUT = new PrintWriter(new FileWriter(new File(o))); OUT.println("GO\t#occurrence");
for (String s : go_PAH) { OUT.println(s + "\t" + numberGO.get(s)); } OUT.close();
} }
package identity;
import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.util.HashSet;
public class ReadQuickGO {
/** * reads the HTML Page of QuickGO to download the GO annotations of the given UniProt ID. * @param uniProtID * @return set of GOterms annotated for this UniProt ID * @throws Exception */ public static HashSet<String> readHTMLPage(String uniProtID) throws Exception { HashSet<String> goTerms = new HashSet<String>(); BufferedReader reader = null; //HTML-Link String urlString = "http://www.ebi.ac.uk/QuickGO/GAnnotation?protein=" + uniProtID + "&format=tsv"; URL url = new URL(urlString); try { reader = new BufferedReader(new InputStreamReader(url.openStream())); //finding the column with GO-ID int index=-1; String[] go_id = reader.readLine().split("\t");
for(int i=0;i<go_id.length;i++){
if(go_id[i].equals("GO ID")){ index=i; } } if(index==-1){ throw new Exception("No GO ID found for " + uniProtID); } //saving the GO-Terms
String line;
while ((line = reader.readLine()) != null) {
String[] lineSplit = line.split("\t"); goTerms.add(lineSplit[index]); }
reader.close();
} catch (Exception e) { throw new Exception("Could not find any GO-term for " + uniProtID); } //Returning HashSet with GO-Terms return goTerms; } }
</source>