ARSA search protocol
BLAST
# Run BLAST:
blastall -p blastp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -i ARSA.fas > ARSA.blast
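# Find unique matches (note: uniq only collapses adjacent duplicate lines; -w 44 compares just the first 44 characters of each line):
cut -f 2 ARSA.blast | uniq > ARSA.blast.matchIDs.uniq
uniq -w 44 ARSA.blast > ARSA.blast.uniq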
# Use R to plot:
# within R: histogram of log10 E-value for the BLAST run
blastTable = read.table("ARSA.blast.uniq")
pdf("ARSA_blastHistogram_eVal.pdf")
logeval.hist <- hist(log10(blastTable$V11), breaks=100, plot=FALSE)
plot(logeval.hist, col="blue", main="BLAST logarithm of E-values", xlab="lg(E-Value)")
dev.off()
# within R: histogram of % identity for the BLAST run
blastTable = read.table("ARSA.blast.uniq")
pdf("ARSA_blastHistogram_id.pdf")
id.hist <- hist(blastTable$V3, breaks=5, plot=FALSE)
plot(id.hist, col="blue", main="BLAST sequence identity distribution", xlab="percent identity")
dev.off()
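# Create lists of UniProt IDs for the GO annotation analysis: locate the line on which the string "1e-10" appears (grep -n), keep everything from that line onward (1721 is presumably the line number grep reported for this run), then extract and clean the match IDs.
grep -n "1e-10" ARSA.blast.uniq
tail -n +1721 ARSA.blast.uniq > ARSA.blast.uniq.worseE10
cut -f 2 ARSA.blast.uniq.worseE10 > ARSA.blast.uniq.worseE10.matchIDs
cut -d "|" -f 2 ARSA.blast.uniq.worseE10.matchIDs > ARSA.blast.uniq.worseE10.matchIDs.clean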
PSI-BLAST
# Run PSI-BLAST with different combinations of parameters (-j: maximum number of iterations, -h: E-value threshold for inclusion in the profile). The line(s) after each command are the shell's timing/error output for that run:
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 2 -i ARSA.fas -o ARSA.psiBlast.j2.hDefault
280.470u 30.420s 5:55.56 87.4% 0+0k 18005712+1880io 792pf+0w
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 10 -i ARSA.fas -o ARSA.psiBlast.j10.hDefault
[blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048
2111.750u 44.740s 36:55.25 97.3% 0+0k 12951072+11856io 1271pf+0w
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 2 -h 1e-10 -i ARSA.fas -o ARSA.psiBlast.j2.h1e-10
280.140u 6.580s 4:55.89 96.9% 0+0k 2536896+2000io 258pf+0w
> blastpgp -d /mnt/project/pracstrucfunc12/data/big/big_80 -m 8 -b 10000 -j 10 -h 1e-10 -i ARSA.fas -o ARSA.psiBlast.j10.h1e-10
# To evaluate the final results, we have to split the output into separate files: look for the first hit, find the line numbers at which it occurs, and then cut the file accordingly (here the last occurrence starts at line 4679).
grep -n G3IH84 ARSA.psiBlast.j2.hDefault
tail -n +4679 ARSA.psiBlast.j2.hDefault > ARSA.psiBlast.j2.hDefault.lastIter
# Extract unique matches, as above for BLAST:
uniq -w 44 ARSA.psiBlast.j2.hDefault.lastIter > ARSA.psiBlast.j2.hDefault.lastIter.uniq
# Generate histogram of e-value distributions in R:
# colors
> cols4 <- hcl(h = seq(30, by=360 / 4, length = 4), l = 65, alpha = 0.5)
- hist1
> hist(result_tm[which(result_tm$hhb_tm>0.5),]$hhb_tm,border=cols4[1],col=cols4[1],breaks=20,main='TM-Scores > 0.5',panel.first = grid(),,xlab='TM-Score',axes=FALSE,xlim=range(0.5,1),ylim=range(0,150))
- hist2
> hist(result_tm[which(result_tm$psi_tm>0.5),]$psi_tm,border=cols4[2],col=cols4[2],breaks=20,add=TRUE)
- axes
> axis(1, las=1, labels=seq(0.5,1.0,0.1), at=seq(0.5,1.0,0.1), tcl=0.5) > axis(2, las=1, labels=seq(0,150,50), at=seq(0,150,50), tcl=0.5)
- hist3
> hist(result_tm[which(result_tm$pfam_tm>0.5),]$pfam_tm,border=cols4[3],col=cols4[3],breaks=20,add=TRUE)
- hist4
> hist(result_tm[which(result_tm$hs_tm>0.5),]$hs_tm,border=cols4[4],col=cols4[4],breaks=20,add=TRUE)
- legend
> legend(0.8,100,c('HHBlits','PSI-BLAST','PFAM','HSSP'),col=cols4,fill=cols4,border=cols4)