Difference between revisions of "ARSA search protocol"

From Bioinformatikpedia
m (HHblits)
m (PSI-Blast)
Line 82: Line 82:
 
# "3nkm" is the first L30 representative, "1p49" the L40 representative and also the L60 representative.
 
# "3nkm" is the first L30 representative, "1p49" the L40 representative and also the L60 representative.
 
> grep 3nkm /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L30.pdbIds
 
> grep 3nkm /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L30.pdbIds
  +
# count the structures in L30
  +
> wc -l COPS-ARSA-L30.pdbIds
  +
58 COPS-ARSA-L30.pdbIds
 
> grep 1p49 /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L40.pdbIds
 
> grep 1p49 /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L40.pdbIds
# count the structures in L40
+
# count the structures in L40 (... similarly in L30)
 
> wc -l COPS-ARSA-L40.pdbIds
 
> wc -l COPS-ARSA-L40.pdbIds
 
29 COPS-ARSA-L40.pdbIds
 
29 COPS-ARSA-L40.pdbIds
Line 101: Line 104:
 
# we look for overlapping (true positives) in L30 and L40
 
# we look for overlapping (true positives) in L30 and L40
 
> diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 | grep -v ">" | grep -v "|" | grep -v "<" | wc -l
 
> diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2 | grep -v ">" | grep -v "|" | grep -v "<" | wc -l
  +
...
  +
# for accuracy, count all found hits:
  +
> wc -l ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2
 
...
 
...
   

Revision as of 12:01, 3 May 2012

Blast

# Run blast:
blastall -p blastp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -i ARSA.fas > ARSA.blast
# find unique matches
cut -f 2 ARSA.blast | uniq > ARSA.blast.matchIDs.uniq
uniq -w 44 ARSA.blast > ARSA.blast.uniq
# use R to plot: 
# within R: histogram of log10 Evalue for blast run
blastTable = read.table("ARSA.blast.uniq")
pdf("ARSA_blastHistogram_eVal.pdf")
logeval.hist <- hist(log10(blastTable$V11), breaks=100, plot=FALSE) 
plot(logeval.hist, col="blue", main="BLAST logarithm of E-values", xlab="lg(E-Value)")
dev.off()

# within R: histogram of %identity for blast run
blastTable = read.table("ARSA.blast.uniq")
pdf("ARSA_blastHistogram_id.pdf")
id.hist <- hist(blastTable$V3, breaks=5, plot=FALSE)
plot(id.hist, col="blue", main="BLAST sequence identity distribution", xlab="percent identity")
dev.off()
 # create lists of Uniprot IDs for analysis for GO annotations
 grep -n "1e-10" ARSA.blast.uniq
 tail -n +1721 ARSA.blast.uniq > ARSA.blast.uniq.worseE10
 cut -f 2 ARSA.blast.uniq.worseE10  > ARSA.blast.uniq.worseE10.matchIDs
 cut -d "|" -f 2 ARSA.blast.uniq.worseE10.matchIDs > ARSA.blast.uniq.worseE10.matchIDs.clean

PSI-Blast

# run PSI-Blast with different combinations of parameters (generate checkpoint files for scanning PDB)
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 2 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault -C ARSA.psiBlast.j2.hDefault.chk
280.470u 30.420s 5:55.56 87.4%  0+0k 18005712+1880io 792pf+0w
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 10 -i ARSA.fas -o ARSA.psiBlast.j10.hDefault -C ARSA.psiBlast.j10.hDefault.chk
[blastpgp] ERROR: ncbiapi [000.000]  ObjMgrNextAvailEntityID failed with idx 2048
2111.750u 44.740s 36:55.25 97.3%        0+0k 12951072+11856io 1271pf+0w 
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 2 -h 1e-10 -i ARSA.fas -o ARSA.psiBlast.j2.h1e-10  -C ARSA.psiBlast.j2.h1e-10.chk
280.140u 6.580s 4:55.89 96.9%   0+0k 2536896+2000io 258pf+0w
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 10 -h 1e-10 -i ARSA.fas -o ARSA.psiBlast.j10.h1e-10 -C ARSA.psiBlast.j10.h1e-10.chk
[blastpgp] ERROR: ncbiapi [000.000]  ObjMgrNextAvailEntityID failed with idx 2048
2040.800u 54.190s 36:10.13 96.5%        0+0k 19511152+12824io 2375pf+0w
# To evaluate the final results for big_80, we have to dissect the output into separate files: Look for the first hit, find at what line numbers it occurs and then cut the files accordingly.
grep -n G3IH84 ARSA.psiBlast.j2.hDefault
tail -n +4679 ARSA.psiBlast.j2.hDefault > ARSA.psiBlast.j2.hDefault.lastIter
# Extract unique matches as above for Blast
uniq -w 44 ARSA.psiBlast.j2.hDefault.lastIter > ARSA.psiBlast.j2.hDefault.lastIter.uniq
# Or directly work on the run with the complete big, in order to compare to hhblits
blastpgp -d /mnt/project/pracstrucfunc12/data/big/big -m 8 -b 100000 -j 1 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault.big -R ARSA.psiBlast.j2.hDefault.chk
...
uniq -w 44 ARSA.psiBlast.j2.hDefault.big > ARSA.psiBlast.j2.hDefault.lastIter.big
cut -f 2 ARSA.psiBlast.j2.hDefault.big | uniq > ARSA.psiBlast.j2.hDefault.big.ids.uniq
...
# generate histogram of e-value distributions in R: (see psiBlast.evalueHist.Rscript)
> Rscript  psiBlast.evalueHist.Rscript

cols4 <- hcl(h = seq(30, by=360 / 4, length = 4), l = 65, alpha = 0.5)
br <- seq(-180,15,1.25)
j2_hDefault = read.table("ARSA.psiBlast.j2.hDefault.big.uniq")
j10_hDefault = read.table("ARSA.psiBlast.j10.hDefault.big.uniq")
j2_h10 = read.table("ARSA.psiBlast.j2.h1e-10.big.uniq")
j10_h10 = read.table("ARSA.psiBlast.j10.h1e-10.big.uniq") 
pdf("ARSA_psiBlastHistogram_eVal.pdf")
hist(log10(j2_hDefault$V11),border=cols4[1],col=cols4[1],breaks=br,main='PSI-BLAST logarithm of E-values',panel.first = grid(),xlab='lg(E-value)',axes=FALSE,xlim=range(-150,1),ylim=range(0,2500))
hist(log10(j10_hDefault$V11),border=cols4[2],col=cols4[2],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE)
hist(log10(j2_h10$V11),border=cols4[3],col=cols4[3],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE)
hist(log10(j10_h10$V11),border=cols4[4],col=cols4[4],breaks=br,xlim=range(-150,1),ylim=range(0,2500),add=TRUE)
axis(1, las=1, labels=seq(-150,0,50), at=seq(-150,0,50), tcl=0.5)
axis(2, las=1, labels=seq(0,2500,500), at=seq(0,2500,500), tcl=0.5)
legend(-150,1500,c('j 2, h 0.002','j 10, h 0.002','j 2, h 1e-10','j 10, h 1e-10'),col=cols4,fill=cols4,border=cols4)
dev.off() 


# find the pdb ids of the different structure groups in COPS:
> head -1 ARSA.psiBlast.j10.h1e-10.pdb.ids.uniq
# "1n2l" is the first hit. Search for it in COPS
> grep -i 1n2l /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt
# "3nkm" is the first L30 representative, "1p49" the L40 representative and also the L60 representative. 
> grep 3nkm /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L30.pdbIds
# count the structures in L30
> wc -l COPS-ARSA-L30.pdbIds
58 COPS-ARSA-L30.pdbIds
> grep 1p49 /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L40.pdbIds
# count the structures in L40 (... similarly in L30)
> wc -l COPS-ARSA-L40.pdbIds
29 COPS-ARSA-L40.pdbIds
> grep "1p49,A 1p49,A" /mnt/project/pracstrucfunc12/data/COPS/COPS-ChainHierarchy.txt | cut -c 1-4 | sort | uniq > COPS-ARSA-L60.pdbIds
# To evaluate the true/false positives use the checkpoint files to search against pdb
blastpgp -d pdb_seqres.fasta -m 8 -b 10000 -j 1 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault.pdb -R ARSA.psiBlast.j2.hDefault.chk
...
cut -f 2 ARSA.psiBlast.j2.hDefault.pdb | cut -c 9-14 | uniq > ARSA.psiBlast.j2.hDefault.pdb.ids.uniq
...
cut -c 1-4 ARSA.psiBlast.j2.hDefault.pdb.ids.uniq | sort | uniq > ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2
...
# Now we can just do a diff to find out which / how many pdb ids are missing / too much
# lines with "<" indicate structures only found in COPS (false negatives), lines with ">" structures only found in the search (false positives),
# lines with "|" changes (false positives AND negatives)
# we look for overlapping (true positives) in L30 and L40
> diff -y COPS-ARSA-L40.pdbIds ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2  | grep -v ">" | grep -v "|" | grep -v "<" | wc -l
...
# for accuracy, count all found hits:
> wc -l ARSA.psiBlast.j2.hDefault.pdb.ids.uniq2
...

HHblits

# run with standard parameters (standard iterations: 2, standard e-value: 10^-3)
time hhblits -i ../ARSA.fas -o ARSA.hhblits.n2.e3.hhr -ofas ARSA.hhblits.n2.e3.fas -ohmm  ARSA.hhblits.n2.e3.hmm -E 100 -atab ARSA.hhblits.n2.e3.tab -d /mnt/project/pracstrucfunc12/data/hhblits/uniprot20_current -Z 10000 -B 10000 > ARSA.hhblits.n2.e-3.stdout