Difference between revisions of "Sequence Alignments Protocol TSD"
(→Find Unique matches: example) |
m (→Runs) |
||
(10 intermediate revisions by 2 users not shown) | |||
Line 1: | Line 1: | ||
+ | The results are shown in [[Sequence Alignments TSD|Sequence Alignments TSD]]. |
||
+ | |||
== Blast == |
== Blast == |
||
− | + | <source lang="bash"> |
|
blastall -p blastp -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -b 1200 -v 1200 > /mnt/home/student/meiera /1_SeqAli/1_SeqSearch/blastall_1200alis.out |
blastall -p blastp -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -b 1200 -v 1200 > /mnt/home/student/meiera /1_SeqAli/1_SeqSearch/blastall_1200alis.out |
||
− | + | </source> |
|
== PSI-Blast == |
== PSI-Blast == |
||
=== Big80 === |
=== Big80 === |
||
+ | <source lang="bash"> |
||
− | time blastpgp -C blastpgp1200_pssmdefault -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_default.out |
||
− | + | time blastpgp -C blastpgp1200_pssmdefault -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_default.out |
|
− | + | time blastpgp -C blastpgp1200_pssmit2e002 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 2 -h "0.002" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it2e002.out |
|
− | + | time blastpgp -C blastpgp1200_pssmit10e002 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 10 -h "0.002" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it10e002.out |
|
− | + | time blastpgp -C blastpgp1200_pssmit10e-10 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 10 -h "10E-10" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it10e-10.out |
|
+ | time blastpgp -C blastpgp1200_pssmit2e-10 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 2 -h "10E-10" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it2e-10.out |
||
+ | </source> |
||
=== Big === |
=== Big === |
||
+ | <source lang="bash"> |
||
PAT="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch" |
PAT="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch" |
||
time blastpgp -R $PAT/blastpgp1200_pssmit2e002 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e002.out |
time blastpgp -R $PAT/blastpgp1200_pssmit2e002 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e002.out |
||
Line 17: | Line 22: | ||
time blastpgp -R $PAT/blastpgp1200_pssmit2e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e-10.out |
time blastpgp -R $PAT/blastpgp1200_pssmit2e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e-10.out |
||
time blastpgp -R $PAT/blastpgp1200_pssmit10e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it10e-10.out |
time blastpgp -R $PAT/blastpgp1200_pssmit10e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it10e-10.out |
||
+ | </source> |
||
− | |||
− | === |
+ | === Count unique matches: example === |
+ | <source lang="bash"> |
||
head -1 m8blastpgp1200_it2e002.out |
head -1 m8blastpgp1200_it2e002.out |
||
grep -n G1RUL9 m8blastpgp1200_it2e002.out |
grep -n G1RUL9 m8blastpgp1200_it2e002.out |
||
tail -n +1233 m8blastpgp1200_it2e002.out | cut -f 2 | wc -l |
tail -n +1233 m8blastpgp1200_it2e002.out | cut -f 2 | wc -l |
||
tail -n +1233 m8blastpgp1200_it2e002.out | uniq -w 44 | wc -l |
tail -n +1233 m8blastpgp1200_it2e002.out | uniq -w 44 | wc -l |
||
+ | </source> |
||
− | |||
== HHblits == |
== HHblits == |
||
+ | <source lang="bash"> |
||
WD="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch" |
WD="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch" |
||
time hhblits -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_stdout.log |
time hhblits -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_stdout.log |
||
Line 32: | Line 39: | ||
time hhblits -i $WD/P06865.fasta -n 2 -o $WD/hhblits_2500_P06865_2it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 2500 -B 2500 > $WD/hhblits_2500_P06865_2_stdout.log |
time hhblits -i $WD/P06865.fasta -n 2 -o $WD/hhblits_2500_P06865_2it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 2500 -B 2500 > $WD/hhblits_2500_P06865_2_stdout.log |
||
time hhsearch -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/pdb70_current_hhm_db -Z 14 -B 14 > $WD/hhblits_pdb_1500_P06865_stdout.log |
time hhsearch -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/pdb70_current_hhm_db -Z 14 -B 14 > $WD/hhblits_pdb_1500_P06865_stdout.log |
||
+ | </source> |
||
− | |||
==Preparation of outputs== |
==Preparation of outputs== |
||
==== Sequence identity and e-values ==== |
==== Sequence identity and e-values ==== |
||
+ | <source lang="bash"> |
||
#for both Blast and HHblits output |
#for both Blast and HHblits output |
||
/mnt/home/student/meiera/bin/1_parseidentity.pl |
/mnt/home/student/meiera/bin/1_parseidentity.pl |
||
Line 44: | Line 52: | ||
-o OutFile writes output to fileOUT |
-o OutFile writes output to fileOUT |
||
-h Input is hhblits, default is blast |
-h Input is hhblits, default is blast |
||
+ | </source> |
||
− | |||
==== Uniprot sets for Venn diagrams ==== |
==== Uniprot sets for Venn diagrams ==== |
||
+ | <source lang="bash"> |
||
#for both Blast and HHblits output, also computation of the unique ids for HHblits |
#for both Blast and HHblits output, also computation of the unique ids for HHblits |
||
/mnt/home/student/meiera/bin/1_getUniprotIDS.pl |
/mnt/home/student/meiera/bin/1_getUniprotIDS.pl |
||
Line 54: | Line 63: | ||
-h = hhblits input default is set to blastoutput |
-h = hhblits input default is set to blastoutput |
||
-m mappingfile uniprot mappingfile |
-m mappingfile uniprot mappingfile |
||
+ | </source> |
||
− | |||
== Build PDB sets == |
== Build PDB sets == |
||
=== Unique identifiers for Blast and HHblits === |
=== Unique identifiers for Blast and HHblits === |
||
+ | <source lang="bash"> |
||
/mnt/home/student/meiera/bin/1_forPdbMapping.pl |
/mnt/home/student/meiera/bin/1_forPdbMapping.pl |
||
Usage: 1_forPdbMapping.pl [flags] file... |
Usage: 1_forPdbMapping.pl [flags] file... |
||
Line 90: | Line 100: | ||
# dont' have to do the special stuff here, the normal one doesn't find any because it's only on uniprot |
# dont' have to do the special stuff here, the normal one doesn't find any because it's only on uniprot |
||
+ | </source> |
||
== COPS Parsing == |
== COPS Parsing == |
||
The following script will read the COPS clusters (-c) and print out for every classification the members of the cluster that one or several IDs of interest (-i) are contained in. Additionally, if given a list of hits (-hi), it will count the number of hits that fall in the according cluster and the ones that don't and print out that information to a file for plotting e.g. with R. |
The following script will read the COPS clusters (-c) and print out for every classification the members of the cluster that one or several IDs of interest (-i) are contained in. Additionally, if given a list of hits (-hi), it will count the number of hits that fall in the according cluster and the ones that don't and print out that information to a file for plotting e.g. with R. |
||
Line 98: | Line 109: | ||
Sample call: ./parseCOPSClusters.pl -c ~/Desktop/COPS-ChainHierarchy.txt -i ./interestIds -nc -hi ./union.finpdb |
Sample call: ./parseCOPSClusters.pl -c ~/Desktop/COPS-ChainHierarchy.txt -i ./interestIds -nc -hi ./union.finpdb |
||
+ | <source lang="perl"> |
||
#!/usr/bin/env perl |
#!/usr/bin/env perl |
||
Line 332: | Line 344: | ||
return $id; |
return $id; |
||
} |
} |
||
+ | |||
+ | </source> |
||
+ | |||
+ | |||
+ | ==MSA== |
||
+ | |||
+ | ===Dataset creation === |
||
+ | |||
+ | <source lang="bash"> |
||
+ | # Get details for the pdb proteins union |
||
+ | grep -f pdbSets/union.finpdb m8blastpgpBIG_3800*.out > pdbunionBlast |
||
+ | |||
+ | </source> |
||
+ | ===Runs=== |
||
+ | |||
+ | <source lang="bash"> |
||
+ | p="/mnt/home/student/meiera/1_SeqAli/2_MSA" |
||
+ | export PATH=$PATH:/mnt/opt/T-Coffee/bin/ |
||
+ | |||
+ | clustalw -align -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_msaset_40.aln |
||
+ | clustalw -align -infile=$p/datasets/msaset_60.fasta -outfile=$p/clustalw_msaset_60.aln |
||
+ | clustalw -align -infile=$p/datasets/msaset_all.fasta -outfile=$p/clustalw_msaset_all.aln |
||
+ | clustalw -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_TESTmsaset_40.aln |
||
+ | |||
+ | muscle -in $p/datasets/msaset_40.fasta -out $p/muscle_msaset_40.msa -quiet |
||
+ | muscle -in $p/datasets/msaset_60.fasta -out $p/muscle_msaset_60.msa -quiet |
||
+ | muscle -in $p/datasets/msaset_all.fasta -out $p/muscle_msaset_all.msa -quiet |
||
+ | |||
+ | t_coffee $p/datasets/msaset_40.fasta -outfile $p/tcdef_40.aln |
||
+ | t_coffee $p/datasets/msaset_60.fasta -outfile $p/tcdef_60.aln |
||
+ | t_coffee $p/datasets/msaset_all.fasta -outfile $p/tcdef_all.aln |
||
+ | t_coffee $p/datasets/msaset_40_struct.fasta -outfile $p/tcdef_40_struct.aln |
||
+ | |||
+ | t_coffee $p/datasets/msaset_40_struct.fasta -method pdb_pair -template_file $p/datasets/msa40struct.temp -outfile $p/tc3d_msa40struct.aln &> $p/tc3d_msa40struct.log |
||
+ | </source> |
||
+ | |||
+ | The template file looks like this: |
||
+ | |||
+ | <source lang="bash"> |
||
+ | >sp|P06865|HEXA_HUMAN Beta-hexosaminidase subunit alpha OS=Homo sapiens GN=HEXA PE=1 SV=2 _P_ 2gjxA |
||
+ | >tr|Q06GJ0|Q06GJ0_OSTFU N-acetylglucosaminidase OS=Ostrinia furnacalis PE=1 SV=1 _P_ 3nsnA |
||
+ | >tr|D0VX21|D0VX21_PAESP Beta-hexosaminidase OS=Paenibacillus sp. GN=Hex1 PE=1 SV=1 _P_ 3gh5A |
||
+ | </source> |
||
== Plotting == |
== Plotting == |
||
Distribution of hits that full in and out of clusters |
Distribution of hits that full in and out of clusters |
||
+ | |||
+ | <source lang="bash"> |
||
library(gplots) #Needs caTools, bitops and a few more. Gives us: barplot2 |
library(gplots) #Needs caTools, bitops and a few more. Gives us: barplot2 |
||
Line 360: | Line 417: | ||
dev.off() |
dev.off() |
||
+ | </source> |
||
Overlap of PDB hits found by the different methods |
Overlap of PDB hits found by the different methods |
||
+ | |||
+ | |||
+ | <source lang="bash"> |
||
library("VennDiagram") |
library("VennDiagram") |
||
main1<-"Overlap of PDB hits" |
main1<-"Overlap of PDB hits" |
||
Line 380: | Line 441: | ||
+ | </source> |
||
e-Value and identity distributions |
e-Value and identity distributions |
||
+ | |||
+ | |||
+ | |||
+ | <source lang="bash"> |
||
ccmain <- 1.7 |
ccmain <- 1.7 |
||
Line 413: | Line 479: | ||
###Psi-Blast normal |
###Psi-Blast normal |
||
+ | |||
− | # #Iterations e0002 |
||
− | # d1 <- read.table("./temp/blastpgp1200_it10e002_IdEvfixed.plot") |
||
− | # d2 <- read.table("./temp/blastpgp1200_it2e002_IdEvfixed.plot") |
||
− | # |
||
− | # d1_id <- d1$V1 |
||
− | # d1_ev <- d1$V2 |
||
− | # d2_id <- d2$V1 |
||
− | # d2_ev <- d2$V2 |
||
− | # |
||
− | # |
||
− | # lab = c("2 iterations", "10 iterations") |
||
− | # |
||
− | # png(width=600, height=600, "tsd_psiblastEvals_Iterations_e002.png") |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(log(d1_ev), breaks=100,xlim=c(-430,0), ylim=c(0,70), main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 0.002, BIG80 database", cex=1.4) |
||
− | # hist(log(d2_ev), breaks=100,xlim=c(-430,0) ,ylim=c(0,70), col=col2t, add=T) |
||
− | # legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
− | # |
||
− | # png(width=600, height=600, "tsd_psiblastIdent_Iterations_e002.png") |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(d1_id, breaks=100, xlim=c(0,160),ylim=c(0,200), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 0.002, BIG80 database", cex=1.4) |
||
− | # hist(d2_id, breaks=100,ylim=c(0,160), col=col2t, add=T) |
||
− | # legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
− | # |
||
− | # #Iterations 10e-10 |
||
− | # d1 <- read.table("./temp/blastpgp1200_it10e-10_IdEvfixed.plot") |
||
− | # d2 <- read.table("./temp/blastpgp1200_it2e-10_IdEvfixed.plot") |
||
− | # |
||
− | # d1_id <- d1$V1 |
||
− | # d1_ev <- d1$V2 |
||
− | # d2_id <- d2$V1 |
||
− | # d2_ev <- d2$V2 |
||
− | # |
||
− | # lab = c("2 iterations", "10 iterations") |
||
− | # |
||
− | # png("tsd_psiblastEvals_Iterations_10e-10.png", width=600, height=600) |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(log(d1_ev), breaks=100, ylim=c(0,70), xlim=c(-430,0) ,main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 10e-10, BIG80 database", cex=1.4) |
||
− | # hist(log(d2_ev), breaks=100, ylim=c(0,70),xlim=c(-430,0) ,col=col2t, add=T) |
||
− | # legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
− | # |
||
− | # png(width=600, height=600, "tsd_psiblastIdent_Iterations_10e-10.png") |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(d1_id, breaks=100, xlim=c(0,100), ylim=c(0,180), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 10e-10, BIG80 database", cex=1.4) |
||
− | # hist(d2_id, breaks=100, col=col2t, ylim=c(0,180),add=T) |
||
− | # legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
d1 <- read.table("./temp/blastpgp1200_it2e002_IdEvfixed.plot") |
d1 <- read.table("./temp/blastpgp1200_it2e002_IdEvfixed.plot") |
||
d2 <- read.table("./temp/blastpgp1200_it10e002_IdEvfixed.plot") |
d2 <- read.table("./temp/blastpgp1200_it10e002_IdEvfixed.plot") |
||
Line 554: | Line 567: | ||
dev.off() |
dev.off() |
||
− | |||
− | # #Iterations e0002 |
||
− | # d1 <- read.table("./temp/blastpgpBIG_3800_it10e002_IdEvfixed.plot") |
||
− | # d2 <- read.table("./temp/blastpgpBIG_3800_it2e002_IdEvfixed.plot") |
||
− | # |
||
− | # d1_id <- d1$V1 |
||
− | # d1_ev <- d1$V2 |
||
− | # d2_id <- d2$V1 |
||
− | # d2_ev <- d2$V2 |
||
− | # |
||
− | # |
||
− | # lab = c("2 iterations", "10 iterations") |
||
− | # |
||
− | # png(width=600, height=600, "tsd_psiblastBIGEvals_Iterations_e002.png") |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(log(d1_ev), breaks=100, xlim=c(-430,0) ,ylim=c(0,200),main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 0.002, BIG database", cex=1.4) |
||
− | # hist(log(d2_ev), breaks=100, xlim=c(-430,0) ,ylim=c(0,200),col=col2t, add=T) |
||
− | # legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
− | # |
||
− | # png(width=600, height=600, "tsd_psiblastBIGIdent_Iterations_e002.png") |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(d1_id, breaks=100, xlim=c(0,100), ylim=c(0,430), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 0.002, BIG database", cex=1.4) |
||
− | # hist(d2_id, breaks=100, col=col2t,ylim=c(0,430), add=T) |
||
− | # legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
− | # |
||
− | # #Iterations 10e-10 |
||
− | # d1 <- read.table("./temp/blastpgpBIG_3800_it10e-10_IdEvfixed.plot") |
||
− | # d2 <- read.table("./temp/blastpgpBIG_3800_it2e-10_IdEvfixed.plot") |
||
− | # |
||
− | # d1_id <- d1$V1 |
||
− | # d1_ev <- d1$V2 |
||
− | # d2_id <- d2$V1 |
||
− | # d2_ev <- d2$V2 |
||
− | # |
||
− | # lab = c("2 iterations", "10 iterations") |
||
− | # |
||
− | # png(width=600, height=600, "tsd_psiblastBIGEvals_Iterations_10e-10.png") |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(log(d1_ev), breaks=100, xlim=c(-430,0) ,ylim=c(0,280),main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 10e-10, BIG database", cex=1.4) |
||
− | # hist(log(d2_ev), breaks=100, xlim=c(-430,0), ylim=c(0,280),col=col2t, add=T) |
||
− | # legend("topleft", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
− | # |
||
− | # png(width=600, height=600, "tsd_psiblastBIGIdent_Iterations_10e-10.png") |
||
− | # par(mar=par()$mar+c(0,1,0,0)) |
||
− | # hist(d1_id, breaks=100, xlim=c(0,100),ylim=c(0,400), main="PSI-Blast - Histogram of identities ", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) |
||
− | # mtext("Profile inclusion threshold 10e-10, BIG database", cex=1.4) |
||
− | # hist(d2_id, breaks=100, col=col2t, ylim=c(0,400),add=T) |
||
− | # legend("topright", lab, fill=c(col1t, col2t), cex=1.6, ncol=1) |
||
− | # dev.off() |
||
###Hblits |
###Hblits |
||
Line 627: | Line 585: | ||
dev.off() |
dev.off() |
||
+ | </source> |
||
− | ==MSA== |
||
+ | Boxplot for MSA gaps |
||
− | ===Dataset creation === |
||
− | # Get details for the pdb proteins union |
||
− | grep -f pdbSets/union.finpdb m8blastpgpBIG_3800*.out > pdbunionBlast |
||
+ | <source lang="bash"> |
||
− | ===Runs=== |
||
+ | main1<-"Gaps in MSA" |
||
− | |||
+ | png("testall.png") |
||
− | p="/mnt/home/student/meiera/1_SeqAli/2_MSA" |
||
+ | boxplot(main=main1,col=c("blue","blue","blue","purple4","purple4","purple4","darkgreen","darkgreen","darkgreen"),names=c("<40%",">60%","Range", "<40%",">60%","Range", "<40%",">60%","Range"), clustal.m40,clustal.m60,clustal.mall,muscle.m40,muscle.m60,muscle.mall,tcoffee.m40,tcoffee.m60,tcoffee.mall,las=2) |
||
− | export PATH=$PATH:/mnt/opt/T-Coffee/bin/ |
||
+ | legend("topleft",legend=c("ClustalW","Muscle","T-Coffee"),fill=c("blue","purple4","darkgreen")) |
||
− | |||
+ | dev.off() |
||
− | clustalw -align -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_msaset_40.aln |
||
+ | </source> |
||
− | clustalw -align -infile=$p/datasets/msaset_60.fasta -outfile=$p/clustalw_msaset_60.aln |
||
− | clustalw -align -infile=$p/datasets/msaset_all.fasta -outfile=$p/clustalw_msaset_all.aln |
||
− | clustalw -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_TESTmsaset_40.aln |
||
− | |||
− | muscle -in $p/datasets/msaset_40.fasta -out $p/muscle_msaset_40.msa -quiet |
||
− | muscle -in $p/datasets/msaset_60.fasta -out $p/muscle_msaset_60.msa -quiet |
||
− | muscle -in $p/datasets/msaset_all.fasta -out $p/muscle_msaset_all.msa -quiet |
||
− | |||
− | t_coffee $p/datasets/msaset_40.fasta -outfile tcdef_40.aln |
||
− | t_coffee $p/datasets/msaset_60.fasta -outfile tcdef_60.aln |
||
− | t_coffee $p/datasets/msaset_all.fasta -outfile tcdef_all.aln |
||
− | |||
− | t_coffee $p/datasets/msaset_40.fasta -method sap_pair -template_file 3nsn_A -outfile tc3d_40.aln |
||
− | t_coffee $p/datasets/msaset_60.fasta -method sap_pair -template_file 2gjx_A -outfile tc3d_60.aln |
||
− | t_coffee $p/datasets/msaset_all.fasta -method sap_pair -template_file 3lmy_A -outfile tc3d_all.aln |
Latest revision as of 16:37, 7 May 2012
The results are shown in Sequence Alignments TSD.
Contents
Blast
<source lang="bash">
blastall -p blastp -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -b 1200 -v 1200 > /mnt/home/student/meiera /1_SeqAli/1_SeqSearch/blastall_1200alis.out
</source>
PSI-Blast
Big80
<source lang="bash"> time blastpgp -C blastpgp1200_pssmdefault -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_default.out time blastpgp -C blastpgp1200_pssmit2e002 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 2 -h "0.002" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it2e002.out time blastpgp -C blastpgp1200_pssmit10e002 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 10 -h "0.002" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it10e002.out time blastpgp -C blastpgp1200_pssmit10e-10 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 10 -h "10E-10" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it10e-10.out time blastpgp -C blastpgp1200_pssmit2e-10 -d /mnt/project/pracstrucfunc12/data/big/big_80 -i /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/P06865.fasta -j 2 -h "10E-10" -v 1200 -b 1200 > /mnt/home/student/meiera/1_SeqAli/1_SeqSearch/blastpgp1200_it2e-10.out </source>
Big
<source lang="bash">
PAT="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch" time blastpgp -R $PAT/blastpgp1200_pssmit2e002 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e002.out time blastpgp -R $PAT/blastpgp1200_pssmit10e002 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it10e002.out time blastpgp -R $PAT/blastpgp1200_pssmit2e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it2e-10.out time blastpgp -R $PAT/blastpgp1200_pssmit10e-10 -d /mnt/project/pracstrucfunc12/data/big/big -i $PAT/P06865.fasta -v 3800 -b 3800 > $PAT/blastpgpBIG_3800_it10e-10.out
</source>
Count unique matches: example
<source lang="bash">
head -1 m8blastpgp1200_it2e002.out grep -n G1RUL9 m8blastpgp1200_it2e002.out tail -n +1233 m8blastpgp1200_it2e002.out | cut -f 2 | wc -l tail -n +1233 m8blastpgp1200_it2e002.out | uniq -w 44 | wc -l
</source>
HHblits
<source lang="bash">
WD="/mnt/home/student/meiera/1_SeqAli/1_SeqSearch" time hhblits -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_stdout.log time hhblits -i $WD/P06865.fasta -n 10 -o $WD/hhblits_460_P06865_10it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_10_stdout.log time hhblits -i $WD/P06865.fasta -n 10 -o $WD/hhblits_2500_P06865_10it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 2500 -B 2500 > $WD/hhblits_2500_P06865_10_stdout.log time hhblits -i $WD/P06865.fasta -n 2 -o $WD/hhblits_460_P06865_2it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 460 -B 460 > $WD/hhblits_460_P06865_2_stdout.log time hhblits -i $WD/P06865.fasta -n 2 -o $WD/hhblits_2500_P06865_2it.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 2500 -B 2500 > $WD/hhblits_2500_P06865_2_stdout.log time hhsearch -i $WD/P06865.fasta -o $WD/hhblits_460_P06865.hhr -d /mnt/project/pracstrucfunc12/data/hhblits/pdb70_current_hhm_db -Z 14 -B 14 > $WD/hhblits_pdb_1500_P06865_stdout.log
</source>
Preparation of outputs
Sequence identity and e-values
<source lang="bash">
#for both Blast and HHblits output /mnt/home/student/meiera/bin/1_parseidentity.pl Usage: 1_parseidentity.pl [flags] file... flags: -i Infile reads the input from fileIN -o OutFile writes output to fileOUT -h Input is hhblits, default is blast
</source>
Uniprot sets for Venn diagrams
<source lang="bash">
#for both Blast and HHblits output, also computation of the unique ids for HHblits /mnt/home/student/meiera/bin/1_getUniprotIDS.pl Usage: 1_getUniprotIDS.pl [flags] file... flags: -i Infile reads the input from fileIN -o OutFile writes uniprot identifiers to fileOUT -h = hhblits input default is set to blastoutput -m mappingfile uniprot mappingfile
</source>
Build PDB sets
Unique identifiers for Blast and HHblits
<source lang="bash">
/mnt/home/student/meiera/bin/1_forPdbMapping.pl Usage: 1_forPdbMapping.pl [flags] file... flags: -i Infile reads the input from fileIN -o OutFile writes uniprot identifiers to fileOUT -h = hhblits input, default is set to blastoutput
#Use uniprot.org's online mapping service on the .pdbmapping files and save the results to .mapped files
cut -f 2 psiblastBIGit10e002.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" > psiblastBIGit10e002.finpdb1 grep -P "\w{4}_" psiblastBIGit10e002.pdbmapping --only-matching | sed "s/_//" > psiblastBIGit10e002.finpdb2 cat psiblastBIGit10e002.finpdb2 psiblastBIGit10e002.finpdb1 | sort | uniq > psiblastBIGit10e002.finpdb
cut -f 2 psiblastBIGit10e-10.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" > psiblastBIGit10e-10.finpdb1 grep -P "\w{4}_" psiblastBIGit10e-10.pdbmapping --only-matching | sed "s/_//" > psiblastBIGit10e-10.finpdb2 cat psiblastBIGit10e-10.finpdb1 psiblastBIGit10e-10.finpdb2 | sort | uniq > psiblastBIGit10e-10.finpdb
cut -f 2 psiblastBIGit2e002.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" > psiblastBIGit2e002.finpdb1 grep -P "\w{4}_" psiblastBIGit2e002.pdbmapping --only-matching | sed "s/_//" > psiblastBIGit2e002.finpdb2 cat psiblastBIGit2e002.finpdb1 psiblastBIGit2e002.finpdb2 | sort | uniq > psiblastBIGit2e002.finpdb
cut -f 2 psiblastBIGit2e-10.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" > psiblastBIGit2e-10.finpdb1 grep -P "\w{4}_" psiblastBIGit2e-10.pdbmapping --only-matching | sed "s/_//" > psiblastBIGit2e-10.finpdb2 cat psiblastBIGit2e-10.finpdb1 psiblastBIGit2e-10.finpdb2 | sort | uniq > psiblastBIGit2e-10.finpdb
cut -f 2 hhblits__460.mapped | tr '[A-Z]' '[a-z]' | sort | uniq | grep -P "\w{4}" > hhbtemp sed "s/_\w//g" hhblits_pdb_1500.pdbmapping > hhbtemp2 cat hhbtemp hhbtemp2 | sed "s/\s//g" | sort | uniq > hhblits.finpdb rm -f hhbtemp hhbtemp2 # dont' have to do the special stuff here, the normal one doesn't find any because it's only on uniprot
</source>
COPS Parsing
The following script will read the COPS clusters (-c) and print out for every classification the members of the cluster that one or several IDs of interest (-i) are contained in. Additionally, if given a list of hits (-hi), it will count the number of hits that fall in the according cluster and the ones that don't and print out that information to a file for plotting e.g. with R. Note that you can ignore chains with the '-nc' option.
Also be aware that ignoring chains might lead to a merge of clusters. That is if given xxxx als protein of interest and xxxx,A as well was xxxx,B are contained in COPS, the final cluster which is used to calculate how many hits fall into it, will be a combination of the cluster from xxxx,A and the one from xxxx,B. That of course might be overestimating the number of hits that fall into a cluster.
Sample call: ./parseCOPSClusters.pl -c ~/Desktop/COPS-ChainHierarchy.txt -i ./interestIds -nc -hi ./union.finpdb
<source lang="perl">
#!/usr/bin/env perl use strict; use warnings; use sigtrap; use autodie; use diagnostics; use Pod::Usage; use Getopt::Long qw(:config require_order); use 5.010; #For smart match, which makes this script unbelievably slow, but I'm lazy :) ##Program variables my %clusters; my @clusterOrder; my @interestIDs = (); my %interestIDLocations; my @hitsIDs = (); ##Argument my $copsPath = '/mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt'; my $nochain = 0; my $interestPath = './interestIds'; my $hitsPath = './union.finpdb'; ######################## ###Argument Handling#### ######################## my $result = GetOptions( 'help|h|?' => sub { pod2usage( -verbose => 1 ); }, 'man|m' => sub { pod2usage( -verbose => 2 ); }, 'c|cops=s' => \$copsPath, 'nc|nocain' => \$nochain, 'i|interest=s' => \$interestPath, 'hi|hits=s' => \$hitsPath ) or pod2usage( -verbose => 1 ) && exit; #Read IDs to look out for my $fhi; open( $fhi, '<', $interestPath ); while ( my $line = <$fhi> ) { if ( $line =~ m/(\w{4},\w)/ || $line =~ m/(\w{4})/ ) { my $iid = lc($1); push( @interestIDs, $iid ); $interestIDLocations{$iid} = {}; } } close($fhi); #Read ids that were found by prediction method(s) my $fhh; open($fhh, '<', $hitsPath); while(my $line =<$fhh>) { if($line =~ m/^(\w{4})$/) { push(@hitsIDs, lc($1)); } } close($fhh); print "Found # hits: " . scalar(@hitsIDs) . "\n"; #Parse the COPS structure my $fh; open( $fh, '<', $copsPath ); while ( my $line = <$fh> ) { if ( $line =~ m/^Chain (.+)$/ ) { #Header line found. Inititialize the cluster hash my @matches = $1 =~ m/\w+/g; for my $clusterName (@matches) { $clusters{$clusterName} = {}; } @clusterOrder = @matches; } elsif ( $line =~ m/\w{4},/ ) { #Cluster info line my @matches = $line =~ m/(\w{4},\w)/g; my $currentId = shift(@matches); # print "current id is $currentId\n"; $currentId = sanitizePdbIDs($currentId); my $count = 0; for my $representative (@matches) { $representative = sanitizePdbIDs($representative); my $currentCluster = $clusterOrder[ $count++ ]; # print "currentCluster is $currentCluster\n"; if ( !exists $clusters{$currentCluster} || !defined $clusters{$currentCluster} ) { print "this should not have happened :)\n"; next; } if ( !exists $clusters{$currentCluster}{$representative} ) { #Inititialize the cluster with the representative $clusters{$currentCluster}{$representative} = [$representative]; } #Add the current pdb id if ( !( $currentId ~~ @{ $clusters{$currentCluster}{$representative} } ) ) { #This case can only occur when we do not use the chains push( @{ $clusters{$currentCluster}{$representative} }, $currentId ); } if ( $currentId ~~ @interestIDs ) { if(!defined $interestIDLocations{$currentId}{$currentCluster}) { $interestIDLocations{$currentId}{$currentCluster}= (); } push(@{$interestIDLocations{$currentId}{$currentCluster} }, $representative); } } } } close($fh); #Do some maybe interesting output for my $intId (@interestIDs) { printClustersForId($intId); } sub printClustersForId { my $id = shift; my @allIn = (); my @allOut = (); for my $cluster ( sort(keys( $interestIDLocations{$id} ) ) ) { my %clusterMembersMerged = (); print "MEMBERS: $id -- $cluster : "; my @currentRepresentatives = @{$interestIDLocations{$id}{$cluster}}; for my $currentRepresentative (@currentRepresentatives) { my @clusterMembers = @{ $clusters{$cluster}{$currentRepresentative} }; for my $member (@clusterMembers) { # print "$member "; $clusterMembersMerged{$member} = ; } # print "\n"; } my @clusterMembersMergedArr= keys(%clusterMembersMerged); for my $e (@clusterMembersMergedArr) { print $e." "; } print "\n"; my $hitsIn = 0; my $hitsOut = 0; print "OVERLAPS: $id -- $cluster : "; for my $hit (@hitsIDs) { if($hit ~~ @clusterMembersMergedArr) { $hitsIn++; } else { $hitsOut++; } } print "In: $hitsIn, Out: $hitsOut\n"; push(@allIn, $hitsIn); push(@allOut, $hitsOut); } #Write Out In/Out over Cluster stats for plotting in R my $fho; open($fho, '>', "stats$id.plotme"); for my $cluster (sort (keys( $interestIDLocations{$id}))) { print $fho "\t$cluster"; } print $fho "\n"; print $fho "IN"; for my $elem (@allIn) { print $fho "\t$elem"; } print $fho "\n"; print $fho "OUT"; for my $elem (@allOut) { print $fho "\t$elem"; } print $fho "\n"; close($fho); } sub sanitizePdbIDs { my $id = shift; if ($nochain) { $id = substr( $id, 0, 4, ); } return $id; }
</source>
MSA
Dataset creation
<source lang="bash">
# Get details for the pdb proteins union grep -f pdbSets/union.finpdb m8blastpgpBIG_3800*.out > pdbunionBlast
</source>
Runs
<source lang="bash">
p="/mnt/home/student/meiera/1_SeqAli/2_MSA" export PATH=$PATH:/mnt/opt/T-Coffee/bin/
clustalw -align -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_msaset_40.aln clustalw -align -infile=$p/datasets/msaset_60.fasta -outfile=$p/clustalw_msaset_60.aln clustalw -align -infile=$p/datasets/msaset_all.fasta -outfile=$p/clustalw_msaset_all.aln clustalw -infile=$p/datasets/msaset_40.fasta -outfile=$p/clustalw_TESTmsaset_40.aln
muscle -in $p/datasets/msaset_40.fasta -out $p/muscle_msaset_40.msa -quiet muscle -in $p/datasets/msaset_60.fasta -out $p/muscle_msaset_60.msa -quiet muscle -in $p/datasets/msaset_all.fasta -out $p/muscle_msaset_all.msa -quiet
t_coffee $p/datasets/msaset_40.fasta -outfile $p/tcdef_40.aln t_coffee $p/datasets/msaset_60.fasta -outfile $p/tcdef_60.aln t_coffee $p/datasets/msaset_all.fasta -outfile $p/tcdef_all.aln t_coffee $p/datasets/msaset_40_struct.fasta -outfile $p/tcdef_40_struct.aln
t_coffee $p/datasets/msaset_40_struct.fasta -method pdb_pair -template_file $p/datasets/msa40struct.temp -outfile $p/tc3d_msa40struct.aln &> $p/tc3d_msa40struct.log </source>
The template file looks like this:
<source lang="bash"> >sp|P06865|HEXA_HUMAN Beta-hexosaminidase subunit alpha OS=Homo sapiens GN=HEXA PE=1 SV=2 _P_ 2gjxA >tr|Q06GJ0|Q06GJ0_OSTFU N-acetylglucosaminidase OS=Ostrinia furnacalis PE=1 SV=1 _P_ 3nsnA >tr|D0VX21|D0VX21_PAESP Beta-hexosaminidase OS=Paenibacillus sp. GN=Hex1 PE=1 SV=1 _P_ 3gh5A </source>
Plotting
Distribution of hits that full in and out of clusters
<source lang="bash">
library(gplots) #Needs caTools, bitops and a few more. Gives us: barplot2 cbarnames <- 2.5 clwd <- 5 clwdaxis <- 20 ccaxis <- 2 maindat <- read.table('stats2gjx.plotme') mainmat <- as.matrix(maindat, header=TRUE) lab <- c("IN", "OUT") lab <- row.names(mainmat) len <- length(lab) method_col <- c("red", "blue") png("./tsd_inoutdistri.png", height=800, width=800) # par(xpd=T, mar=par()$mar+c(0,0,0,13)) barplot2(mainmat, beside=TRUE, col=method_col, cex.axis=ccaxis, cex.names=cbarnames, xpd=F, plot.ci=FALSE,cex.main=cbarnames, main="Overlap of union PDB hits with COPS Clusters") #names.arg=c("IN","OUT","IN","OUT","IN","OUT","IN","OUT","IN","OUT") legend(4,70, lab, fill=method_col, cex=2, ncol=1) par(mar=c(5, 4, 4, 2) + 0.1) # Restore default clipping rect dev.off()
</source>
Overlap of PDB hits found by the different methods
<source lang="bash">
library("VennDiagram") main1<-"Overlap of PDB hits" sub1<-"BIG database" hhb <- read.table("data/hhblits.finpdb") psiit10e002 <- read.table("data/psiblastBIGit10e002.finpdb") psiit10e10<-read.table("data/psiblastBIGit10e-10.finpdb") psiit2e002<-read.table("data/psiblastBIGit2e002.finpdb") # psiit2e10<-read.table("data/psiblastBIGit2e-10.finpdb") psi=list("HHblits"=hhb[,1], "It10, Ev10e-10"=psiit10e10[,1],"It10, Ev002"=psiit10e002[,1],"It2, Ev10e-10/Ev002"=psiit2e002[,1]) venn.diagram(psi,"pdbHitsOverlap.tif", scaled = TRUE,col=c("darkblue", "darkgreen", "orange","darkorchid4"), fill=c("cornflowerblue", "green", "yellow","darkorchid1"), fontfamily = "serif", fontface = "bold", cat.col = c("darkblue", "darkgreen", "orange","darkorchid4"),cat.cex = 1.5, cat.pos = 0, cat.fontfamily = "serif", rotation.degree = 200, main=main1, sub=sub1,main.cex=2,sub.cex=1.3,lwd=1,euler.d=2,lty=1, cat.dist=c(0.1,0.1,0.13,0.1))
</source>
e-Value and identity distributions
<source lang="bash">
ccmain <- 1.7 ccaxis <- 2 cclab <- 2 col1 <- "#0000ff" col2 <- "#ff0000" col3 <- "#00ff00" col4 <- "#808080" col1t <- "#0000FF80" col2t <- "#FF000080" col3t <- "#00ff0080" col4t <- "#80808080" ###Blast d <- read.table("./temp/blastall1200_IdEvfixed.plot") d_id <- d$V1 d_ev <- d$V2 #Plot identities png(width=600, height=600, "tsd_blastallIdent.png") par(mar=par()$mar+c(0,1,-1,0)) hist(d_id, breaks=100,xlim=c(0,100), main="Blast - Histogram of identities",xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1) dev.off() #Plot evalues png(width=600, height=600, "tsd_blastallEval.png") par(mar=par()$mar+c(0,1,-1,0)) hist(log(d_ev), breaks=100, main="Blast - Histogram of eValue distribution",xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1) dev.off() ###Psi-Blast normal d1 <- read.table("./temp/blastpgp1200_it2e002_IdEvfixed.plot") d2 <- read.table("./temp/blastpgp1200_it10e002_IdEvfixed.plot") d3 <- read.table("./temp/blastpgp1200_it2e-10_IdEvfixed.plot") d4 <- read.table("./temp/blastpgp1200_it10e-10_IdEvfixed.plot") d1_id <- d1$V1 d1_ev <- d1$V2 d2_id <- d2$V1 d2_ev <- d2$V2 d3_id <- d3$V1 d3_ev <- d3$V2 d4_id <- d4$V1 d4_ev <- d4$V2 lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10") br <- seq(-450,0,5) png("tsd_psiblastEvals_allfour.png", width=600, height=600) par(mar=par()$mar+c(0,1,0,0)) hist(log(d1_ev), breaks=br, ylim=c(0,70), xlim=c(-450,0) ,main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) mtext("BIG80 database", cex=1.4) hist(log(d2_ev), breaks=br, ylim=c(0,70),xlim=c(-450,0) ,col=col2t, add=T) hist(log(d3_ev), breaks=br, ylim=c(0,70),xlim=c(-450,0) ,col=col3t, add=T) hist(log(d4_ev), breaks=br, ylim=c(0,70),xlim=c(-450,0) ,col=col4t, add=T) legend("topright", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1) dev.off() lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10") br <- seq(0,100,2.5) png("tsd_psiblastIdents_allfour.png", width=600, height=600) par(mar=par()$mar+c(0,1,0,0)) hist(d1_id, breaks=br , ylim=c(0,400),main="PSI-Blast - Histogram of identities", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) mtext("BIG80 database", cex=1.4) hist(d2_id, breaks=br,col=col2t, add=T) hist(d3_id, breaks=br, col=col3t, add=T) hist(d4_id, breaks=br, col=col4t, add=T) legend("topright", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1) dev.off() ###PSI-Blast BIG d1 <- read.table("./temp/blastpgpBIG_3800_it2e002_IdEvfixed.plot") d2 <- read.table("./temp/blastpgpBIG_3800_it10e002_IdEvfixed.plot") d3 <- read.table("./temp/blastpgpBIG_3800_it2e-10_IdEvfixed.plot") d4 <- read.table("./temp/blastpgpBIG_3800_it10e-10_IdEvfixed.plot") d1_id <- d1$V1 d1_ev <- d1$V2 d2_id <- d2$V1 d2_ev <- d2$V2 d3_id <- d3$V1 d3_ev <- d3$V2 d4_id <- d4$V1 d4_ev <- d4$V2 lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10") br <- seq(-450,0,5) png("tsd_psiblastBIGEvals_allfour.png", width=600, height=600) par(mar=par()$mar+c(0,1,0,0)) hist(log(d1_ev), breaks=br, ylim=c(0,260), xlim=c(-450,0) ,main="PSI-Blast - Histogram of eValue distributions", xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) mtext("BIG database", cex=1.4) hist(log(d2_ev), breaks=br, ylim=c(0,260),xlim=c(-450,0) ,col=col2t, add=T) hist(log(d3_ev), breaks=br, ylim=c(0,260),xlim=c(-450,0) ,col=col3t, add=T) hist(log(d4_ev), breaks=br, ylim=c(0,260),xlim=c(-450,0) ,col=col4t, add=T) legend("topleft", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1) dev.off() lab = c("2 iterations, e002", "10 iterations, e002", "2 Iterations, 10e-10", "10 Iterations, 10e-10") br <- seq(0,100,2.5) png("tsd_psiblastBIGIdents_allfour.png", width=600, height=600) par(mar=par()$mar+c(0,1,0,0)) hist(d1_id, breaks=br , ylim=c(0,1100),main="PSI-Blast - Histogram of identities", xlab="% sequence identity in Blast alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1t) mtext("BIG database", cex=1.4) hist(d2_id, breaks=br,col=col2t, add=T) hist(d3_id, breaks=br, col=col3t, add=T) hist(d4_id, breaks=br, col=col4t, add=T) legend("topright", lab, fill=c(col1t, col2t, col3t, col4t), cex=1.6, ncol=1) dev.off() ###Hblits d <- read.table("./temp/hhblits_460_P06865_IdEvfixed.plot") d_id <- d$V1 d_ev <- d$V2 #Plot identities png(width=600, height=600, "tsd_hhblitsIdent.png") par(mar=par()$mar+c(0,1,-1,0)) hist(d_id, breaks=100, xlim=c(0,100), main="HHblits - Histogram of identities",xlab="% sequence identity in HHblits alignment", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1) dev.off() #Plot evalues png(width=600, height=600, "tsd_hhblitsEval.png") par(mar=par()$mar+c(0,1,-1,0)) hist(log(d_ev), breaks=100, main="HHblits - Histogram of eValue distribution",xlab="log(eValue)", cex.axis=ccaxis, cex.main=ccmain, cex.lab=cclab, col=col1) dev.off()
</source> Boxplot for MSA gaps
<source lang="bash">
main1<-"Gaps in MSA" png("testall.png") boxplot(main=main1,col=c("blue","blue","blue","purple4","purple4","purple4","darkgreen","darkgreen","darkgreen"),names=c("<40%",">60%","Range", "<40%",">60%","Range", "<40%",">60%","Range"), clustal.m40,clustal.m60,clustal.mall,muscle.m40,muscle.m60,muscle.mall,tcoffee.m40,tcoffee.m60,tcoffee.mall,las=2) legend("topleft",legend=c("ClustalW","Muscle","T-Coffee"),fill=c("blue","purple4","darkgreen")) dev.off()
</source>