Difference between revisions of "ARSA search protocol"
From Bioinformatikpedia
m (→PSI-Blast) |
m (→HHblits) |
||
(9 intermediate revisions by the same user not shown) | |||
Line 42: | Line 42: | ||
2040.800u 54.190s 36:10.13 96.5% 0+0k 19511152+12824io 2375pf+0w |
2040.800u 54.190s 36:10.13 96.5% 0+0k 19511152+12824io 2375pf+0w |
||
− | # To evaluate the final results, we have to dissect the output into separate files: Look for the first hit, find at what line numbers it occurs and then cut the files accordingly. |
+ | # To evaluate the final results for big_80, we have to dissect the output into separate files: Look for the first hit, find at what line numbers it occurs and then cut the files accordingly. |
grep -n G3IH84 ARSA.psiBlast.j2.hDefault |
grep -n G3IH84 ARSA.psiBlast.j2.hDefault |
||
tail -n +4679 ARSA.psiBlast.j2.hDefault > ARSA.psiBlast.j2.hDefault.lastIter |
tail -n +4679 ARSA.psiBlast.j2.hDefault > ARSA.psiBlast.j2.hDefault.lastIter |
||
− | |||
# Extract unique matches as above for Blast |
# Extract unique matches as above for Blast |
||
uniq -w 44 ARSA.psiBlast.j2.hDefault.lastIter > ARSA.psiBlast.j2.hDefault.lastIter.uniq |
uniq -w 44 ARSA.psiBlast.j2.hDefault.lastIter > ARSA.psiBlast.j2.hDefault.lastIter.uniq |
||
+ | # Or directly work on the run with the complete big, in order to compare to hhblits |
||
− | # To evaluate the true/false positives use the checkpoint files to search against pdb |
||
− | blastpgp -d |
+ | blastpgp -d /mnt/project/pracstrucfunc12/data/big/big -m 8 -b 100000 -j 1 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault.big -R ARSA.psiBlast.j2.hDefault.chk |
... |
... |
||
− | + | uniq -w 44 ARSA.psiBlast.j2.hDefault.big > ARSA.psiBlast.j2.hDefault.lastIter.big |
|
+ | cut -f 2 ARSA.psiBlast.j2.hDefault.big | uniq > ARSA.psiBlast.j2.hDefault.big.ids.uniq |
||
... |
... |
||
− | # generate histogram of e-value distributions in R: |
+ | # generate histogram of e-value distributions in R: (see psiBlast.evalueHist.Rscript) |
+ | > Rscript psiBlast.evalueHist.Rscript |
||
− | #colors |
||
+ | |||
cols4 <- hcl(h = seq(30, by=360 / 4, length = 4), l = 65, alpha = 0.5) |
cols4 <- hcl(h = seq(30, by=360 / 4, length = 4), l = 65, alpha = 0.5) |
||
+ | br <- seq(-180,15,1.25) |
||
− | # read data |
||
j2_hDefault = read.table("ARSA.psiBlast.j2.hDefault.big.uniq") |
j2_hDefault = read.table("ARSA.psiBlast.j2.hDefault.big.uniq") |
||
j10_hDefault = read.table("ARSA.psiBlast.j10.hDefault.big.uniq") |
j10_hDefault = read.table("ARSA.psiBlast.j10.hDefault.big.uniq") |
||
j2_h10 = read.table("ARSA.psiBlast.j2.h1e-10.big.uniq") |
j2_h10 = read.table("ARSA.psiBlast.j2.h1e-10.big.uniq") |
||
− | j10_h10 = read.table("ARSA.psiBlast.j10.h1e-10.big.uniq") |
+ | j10_h10 = read.table("ARSA.psiBlast.j10.h1e-10.big.uniq") |
+ | pdf("ARSA_psiBlastHistogram_eVal.pdf") |
||
− | # hist1 |
||
− | hist(log10(j2_hDefault$V11),border=cols4[1],col=cols4[1],breaks= |
+ | hist(log10(j2_hDefault$V11),border=cols4[1],col=cols4[1],breaks=br,main='PSI-BLAST logarithm of E-values',panel.first = grid(),xlab='lg(E-value)',axes=FALSE,xlim=range(-150,1),ylim=range(0,2500)) |
+ | hist(log10(j10_hDefault$V11),border=cols4[2],col=cols4[2],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE) |
||
− | # hist2-4 |
||
− | hist(log10( |
+ | hist(log10(j2_h10$V11),border=cols4[3],col=cols4[3],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE) |
− | hist(log10( |
+ | hist(log10(j10_h10$V11),border=cols4[4],col=cols4[4],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE) |
− | hist(log10(j10_h10$V11),border=cols4[2],col=cols4[2],breaks=151,add=TRUE) |
||
− | #axes |
||
axis(1, las=1, labels=seq(-150,0,50), at=seq(-150,0,50), tcl=0.5) |
axis(1, las=1, labels=seq(-150,0,50), at=seq(-150,0,50), tcl=0.5) |
||
− | axis(2, las=1, labels=seq(0, |
+ | axis(2, las=1, labels=seq(0,2500,500), at=seq(0,2500,500), tcl=0.5) |
+ | legend(-150,1500,c('j 2, h 0.002','j 10, h 0.002','j 2, h 1e-10','j 10, h 1e-10'),col=cols4,fill=cols4,border=cols4) |
||
− | #legend |
||
+ | dev.off() |
||
− | legend(0.8,100,c('j 2, h 0.002','j 10, h 0.002','j 2, h 1e-10','j 10, h 1e-10'),col=cols4,fill=cols4,border=cols4) |
||
+ | |||
+ | |||
+ | |||
+ | # find the pdb ids of the different structure groups in COPS: |
||
+ | > head -1 ARSA.psiBlast.j10.h1e-10.pdb.ids.uniq |
||
+ | # "1n2l" is the first hit. Search for it in COPS |
||
+ | > grep -i 1n2l /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt |
||
+ | # "3nkm" is the first L30 representative, "1p49" the L40 representative and also the L60 representative. |
||
+ | > grep 3nkm /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L30.pdbIds |
||
+ | # count the structures in L30 |
||
+ | > wc -l COPS-ARSA-L30.pdbIds |
||
+ | 58 COPS-ARSA-L30.pdbIds |
||
+ | > grep 1p49 /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L40.pdbIds |
||
+ | # count the structures in L40 (... similarly in L30) |
||
+ | > wc -l COPS-ARSA-L40.pdbIds |
||
+ | 29 COPS-ARSA-L40.pdbIds |
||
+ | > grep "1p49,A 1p49,A" /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L60.pdbIds |
||
+ | |||
+ | # To evaluate the true/false positives use the checkpoint files to search against pdb |
||
+ | blastpgp -d pdb_seqres.fasta -m 8 -b 10000 -j 1 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault.pdb -R ARSA.psiBlast.j2.hDefault.chk |
||
+ | ... |
||
+ | cut -f 2 ARSA.psiBlast.j2.hDefault.pdb | cut -c 9-14 | uniq > ARSA.psiBlast.j2.hDefault.pdb.ids.uniq |
||
+ | ... |
||
+ | cut -c 1-4 ARSA.psiBlast.j2.hDefault.pdb.ids.uniq | sort | uniq > ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 |
||
+ | ... |
||
+ | |||
+ | # Now we can just do a diff to find out which / how many pdb ids are missing / too much |
||
+ | # lines with "<" indicate structures only found in COPS (false negatives), lines with ">" structures only found in the search (false positives), |
||
+ | # lines with "|" changes (false positives AND negatives) |
||
+ | # we look for overlapping (true positives) in L30 and L40 |
||
+ | > diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 | grep -v ">" | grep -v "|" | grep -v "<" | wc -l |
||
+ | ... |
||
+ | # for accuracy, count all found hits: |
||
+ | > wc -l ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 |
||
+ | ... |
||
+ | |||
+ | # to get an overview of the e-values, look at the true / false positives |
||
+ | diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.h1e-10.pdb.ids.uniq2 | grep -v ">" | grep -v "|" | grep -v "<" > ARSA.psiBlast.j2.h1e-10.pdb.truePos.raw cut -c 13-17 |
||
+ | cut -c 13-17 ARSA.psiBlast.j2.h1e-10.pdb.truePos.raw > ARSA.psiBlast.j2.h1e-10.pdb.truePos |
||
+ | foreach pdb ( `cat ARSA.psiBlast.j2.h1e-10.pdb.truePos` ) |
||
+ | grep $pdb ARSA.psiBlast.psiBlast.j2.h1e-10.pdb |
||
+ | end |
||
+ | diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.h1e-10.pdb.ids.uniq2 | grep ">" > ARSA.psiBlast.j2.h1e-10.pdb.falsePos.raw |
||
+ | cut -c 16-20 ARSA.psiBlast.j2.h1e-10.pdb.falsePos.raw > ARSA.psiBlast.j2.h1e-10.pdb.falsePos |
||
+ | foreach pdb ( `cat ARSA.psiBlast.j2.h1e-10.pdb.falsePos` ) |
||
+ | grep $pdb ARSA.psiBlast.j2.h1e-10.pdb |
||
+ | end |
||
+ | |||
+ | == HHblits == |
||
+ | |||
+ | # run with standard parameters (standard iterations: 2, standard e-value: 10^-3) |
||
+ | > time hhblits -i ../ARSA.fas -oa3m ARSA.hhblits.n2.e3.a3m -E 100 -atab ARSA.hhblits.n2.e3.tab -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 10000 -B 10000 > ARSA.hhblits.n2.e3.stdout |
||
+ | 450.330u 10.540s 6:31.66 117.6% 0+0k 3378216+41032io 36pf+0w |
||
+ | > time hhblits -i ../ARSA.fas -oa3m ARSA.hhblits.n8.e3.a3m -E 100 -atab ARSA.hhblits.n2.e3.tab -n 8 -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 10000 -B 10000 > ARSA.hhblits.n8.e3.stdout |
||
+ | |||
+ | # search against pdb |
||
+ | > time hhsearch -i ARSA.hhblits.n2.e3.a3m -oa3m ARSA.hhblits.n2.e3.pdb.a3m -E 100 -d /mnt/project/pracstrucfunc12/data/hhblits/pdb70_current_hhm_db -Z 10000 -B 10000 > ARSA.hhblits.n2.e-3.pdb.stdout |
Latest revision as of 14:26, 4 May 2012
Blast
# Run blast: blastall -p blastp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -i ARSA.fas > ARSA.blast
# find unique matches cut -f 2 ARSA.blast | uniq > ARSA.blast.matchIDs.uniq uniq -w 44 ARSA.blast > ARSA.blast.uniq
# use R to plot: # within R: histogram of log10 Evalue for blast run blastTable = read.table("ARSA.blast.uniq") pdf("ARSA_blastHistogram_eVal.pdf") logeval.hist <- hist(log10(blastTable$V11), breaks=100, plot=FALSE) plot(logeval.hist, col="blue", main="BLAST logarithm of E-values", xlab="lg(E-Value)") dev.off() # within R: histogram of %identity for blast run blastTable = read.table("ARSA.blast.uniq") pdf("ARSA_blastHistogram_id.pdf") id.hist <- hist(blastTable$V3, breaks=5, plot=FALSE) plot(id.hist, col="blue", main="BLAST sequence identity distribution", xlab="percent identity") dev.off()
# create lists of Uniprot IDs for analysis for GO annotations grep -n "1e-10" ARSA.blast.uniq tail -n +1721 ARSA.blast.uniq > ARSA.blast.uniq.worseE10 cut -f 2 ARSA.blast.uniq.worseE10 > ARSA.blast.uniq.worseE10.matchIDs cut -d "|" -f 2 ARSA.blast.uniq.worseE10.matchIDs > ARSA.blast.uniq.worseE10.matchIDs.clean
PSI-Blast
# run PSI-Blast with different combinations of parameters (generate checkpoint files for scanning PDB) > blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 2 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault -C ARSA.psiBlast.j2.hDefault.chk 280.470u 30.420s 5:55.56 87.4% 0+0k 18005712+1880io 792pf+0w > blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 10 -i ARSA.fas -o ARSA.psiBlast.j10.hDefault -C ARSA.psiBlast.j10.hDefault.chk [blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048 2111.750u 44.740s 36:55.25 97.3% 0+0k 12951072+11856io 1271pf+0w > blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 2 -h 1e-10 -i ARSA.fas -o ARSA.psiBlast.j2.h1e-10 -C ARSA.psiBlast.j2.h1e-10.chk 280.140u 6.580s 4:55.89 96.9% 0+0k 2536896+2000io 258pf+0w > blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 10 -h 1e-10 -i ARSA.fas -o ARSA.psiBlast.j10.h1e-10 -C ARSA.psiBlast.j10.h1e-10.chk [blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048 2040.800u 54.190s 36:10.13 96.5% 0+0k 19511152+12824io 2375pf+0w
# To evaluate the final results for big_80, we have to dissect the output into separate files: Look for the first hit, find at what line numbers it occurs and then cut the files accordingly. grep -n G3IH84 ARSA.psiBlast.j2.hDefault tail -n +4679 ARSA.psiBlast.j2.hDefault > ARSA.psiBlast.j2.hDefault.lastIter # Extract unique matches as above for Blast uniq -w 44 ARSA.psiBlast.j2.hDefault.lastIter > ARSA.psiBlast.j2.hDefault.lastIter.uniq
# Or directly work on the run with the complete big, in order to compare to hhblits blastpgp -d /mnt/project/pracstrucfunc12/data/big/big -m 8 -b 100000 -j 1 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault.big -R ARSA.psiBlast.j2.hDefault.chk ... uniq -w 44 ARSA.psiBlast.j2.hDefault.big > ARSA.psiBlast.j2.hDefault.lastIter.big cut -f 2 ARSA.psiBlast.j2.hDefault.big | uniq > ARSA.psiBlast.j2.hDefault.big.ids.uniq ...
# generate histogram of e-value distributions in R: (see psiBlast.evalueHist.Rscript) > Rscript psiBlast.evalueHist.Rscript cols4 <- hcl(h = seq(30, by=360 / 4, length = 4), l = 65, alpha = 0.5) br <- seq(-180,15,1.25) j2_hDefault = read.table("ARSA.psiBlast.j2.hDefault.big.uniq") j10_hDefault = read.table("ARSA.psiBlast.j10.hDefault.big.uniq") j2_h10 = read.table("ARSA.psiBlast.j2.h1e-10.big.uniq") j10_h10 = read.table("ARSA.psiBlast.j10.h1e-10.big.uniq") pdf("ARSA_psiBlastHistogram_eVal.pdf") hist(log10(j2_hDefault$V11),border=cols4[1],col=cols4[1],breaks=br,main='PSI-BLAST logarithm of E-values',panel.first = grid(),xlab='lg(E-value)',axes=FALSE,xlim=range(-150,1),ylim=range(0,2500)) hist(log10(j10_hDefault$V11),border=cols4[2],col=cols4[2],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE) hist(log10(j2_h10$V11),border=cols4[3],col=cols4[3],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE) hist(log10(j10_h10$V11),border=cols4[4],col=cols4[4],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE) axis(1, las=1, labels=seq(-150,0,50), at=seq(-150,0,50), tcl=0.5) axis(2, las=1, labels=seq(0,2500,500), at=seq(0,2500,500), tcl=0.5) legend(-150,1500,c('j 2, h 0.002','j 10, h 0.002','j 2, h 1e-10','j 10, h 1e-10'),col=cols4,fill=cols4,border=cols4) dev.off()
# find the pdb ids of the different structure groups in COPS: > head -1 ARSA.psiBlast.j10.h1e-10.pdb.ids.uniq # "1n2l" is the first hit. Search for it in COPS > grep -i 1n2l /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt # "3nkm" is the first L30 representative, "1p49" the L40 representative and also the L60 representative. > grep 3nkm /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L30.pdbIds # count the structures in L30 > wc -l COPS-ARSA-L30.pdbIds 58 COPS-ARSA-L30.pdbIds > grep 1p49 /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L40.pdbIds # count the structures in L40 (... similarly in L30) > wc -l COPS-ARSA-L40.pdbIds 29 COPS-ARSA-L40.pdbIds > grep "1p49,A 1p49,A" /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L60.pdbIds
# To evaluate the true/false positives use the checkpoint files to search against pdb blastpgp -d pdb_seqres.fasta -m 8 -b 10000 -j 1 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault.pdb -R ARSA.psiBlast.j2.hDefault.chk ... cut -f 2 ARSA.psiBlast.j2.hDefault.pdb | cut -c 9-14 | uniq > ARSA.psiBlast.j2.hDefault.pdb.ids.uniq ... cut -c 1-4 ARSA.psiBlast.j2.hDefault.pdb.ids.uniq | sort | uniq > ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 ...
# Now we can just do a diff to find out which / how many pdb ids are missing / too much # lines with "<" indicate structures only found in COPS (false negatives), lines with ">" structures only found in the search (false positives), # lines with "|" changes (false positives AND negatives) # we look for overlapping (true positives) in L30 and L40 > diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 | grep -v ">" | grep -v "|" | grep -v "<" | wc -l ... # for accuracy, count all found hits: > wc -l ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 ...
# to get an overview of the e-values, look at the true / false positives diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.h1e-10.pdb.ids.uniq2 | grep -v ">" | grep -v "|" | grep -v "<" > ARSA.psiBlast.j2.h1e-10.pdb.truePos.raw cut -c 13-17 cut -c 13-17 ARSA.psiBlast.j2.h1e-10.pdb.truePos.raw > ARSA.psiBlast.j2.h1e-10.pdb.truePos foreach pdb ( `cat ARSA.psiBlast.j2.h1e-10.pdb.truePos` ) grep $pdb ARSA.psiBlast.psiBlast.j2.h1e-10.pdb end diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.h1e-10.pdb.ids.uniq2 | grep ">" > ARSA.psiBlast.j2.h1e-10.pdb.falsePos.raw cut -c 16-20 ARSA.psiBlast.j2.h1e-10.pdb.falsePos.raw > ARSA.psiBlast.j2.h1e-10.pdb.falsePos foreach pdb ( `cat ARSA.psiBlast.j2.h1e-10.pdb.falsePos` ) grep $pdb ARSA.psiBlast.j2.h1e-10.pdb end
HHblits
# run with standard parameters (standard iterations: 2, standard e-value: 10^-3) > time hhblits -i ../ARSA.fas -oa3m ARSA.hhblits.n2.e3.a3m -E 100 -atab ARSA.hhblits.n2.e3.tab -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 10000 -B 10000 > ARSA.hhblits.n2.e3.stdout 450.330u 10.540s 6:31.66 117.6% 0+0k 3378216+41032io 36pf+0w > time hhblits -i ../ARSA.fas -oa3m ARSA.hhblits.n8.e3.a3m -E 100 -atab ARSA.hhblits.n2.e3.tab -n 8 -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 10000 -B 10000 > ARSA.hhblits.n8.e3.stdout
# search against pdb > time hhsearch -i ARSA.hhblits.n2.e3.a3m -oa3m ARSA.hhblits.n2.e3.pdb.a3m -E 100 -d /mnt/project/pracstrucfunc12/data/hhblits/pdb70_current_hhm_db -Z 10000 -B 10000 > ARSA.hhblits.n2.e-3.pdb.stdout