ARSA search protocol
From Bioinformatikpedia
# Run BLAST: search the ARSA query sequence against the big_80 database.
#   -p blastp  protein query against a protein database
#   -m 8       tabular output, one hit per line
#   -b 10000   allow up to 10000 database sequences in the output
blastall \
  -p blastp \
  -d /mnt/project/pracstrucfunc12/data/big/big_80 \
  -i ARSA.fas \
  -m 8 \
  -b 10000 \
  > ARSA.blast
# Find unique matches.
#
# Uniqueness is decided on the ID columns themselves rather than on a fixed
# character width (the previous `uniq -w 44`), so IDs of any length are
# compared in full and duplicates are removed even when they are not on
# adjacent lines. The first row seen for each ID is kept and the input order
# is preserved.
# NOTE: row counts may differ from the width-based approach; re-derive any
# line offsets that were taken from ARSA.blast.uniq.

# Unique subject (match) IDs: column 2 of the tabular BLAST output.
cut -f 2 ARSA.blast | awk '!seen[$0]++' > ARSA.blast.matchIDs.uniq

# One full result row per query/subject pair (columns 1 and 2).
awk -F'\t' '!seen[$1 FS $2]++' ARSA.blast > ARSA.blast.uniq
# Use R to plot the BLAST results (run the following within R).

# Read the de-duplicated tabular BLAST output once. The file is
# tab-delimited, so parse it strictly: no quote or comment characters that
# could silently truncate a row. Columns are auto-named V1..Vn.
blastTable <- read.table(
  "ARSA.blast.uniq",
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

# Draw a blue histogram of `values` into the PDF `pdf_file`.
# `breaks`, `main` and `xlab` are passed on to hist() / plot().
# The PDF device is closed even if plotting fails.
# Returns the histogram object invisibly.
plot_blast_hist <- function(values, pdf_file, breaks, main, xlab) {
  pdf(pdf_file)
  on.exit(dev.off(), add = TRUE)
  histogram <- hist(values, breaks = breaks, plot = FALSE)
  plot(histogram, col = "blue", main = main, xlab = xlab)
  invisible(histogram)
}

# Histogram of log10 E-value for the BLAST run (column V11).
# An E-value of 0 has no finite logarithm and cannot be shown in the
# histogram; report how many hits are left out instead of losing them
# silently.
evalues <- blastTable$V11
n_nonpositive <- sum(evalues <= 0, na.rm = TRUE)
if (n_nonpositive > 0) {
  message(
    n_nonpositive,
    " hit(s) with E-value <= 0 omitted from the log10 histogram"
  )
}
logeval.hist <- plot_blast_hist(
  log10(evalues[!is.na(evalues) & evalues > 0]),
  pdf_file = "ARSA_blastHistogram_eVal.pdf",
  breaks = 100,
  main = "BLAST logarithm of E-values",
  xlab = "lg(E-Value)"
)

# Histogram of percent identity for the BLAST run (column V3).
id.hist <- plot_blast_hist(
  blastTable$V3,
  pdf_file = "ARSA_blastHistogram_id.pdf",
  breaks = 5,
  main = "BLAST sequence identity distribution",
  xlab = "percent identity"
)
# Create lists of UniProt IDs for the GO-annotation analysis.

# Step 1: print, with line numbers, every row containing the text "1e-10",
# to locate where the E-value cutoff falls in the file.
# Note: this is a substring match, so rows containing e.g. "1e-100" are
# listed as well.
grep -n "1e-10" ARSA.blast.uniq

# Step 2: keep everything from line 1721 onwards and derive the ID lists.
# NOTE(review): 1721 is specific to one particular ARSA.blast.uniq and
# presumably came from the grep output above; it also assumes the rows are
# ordered by E-value. Re-check it whenever the input file changes.
#   *.worseE10                  full result rows from line 1721 on
#   *.worseE10.matchIDs         column 2 of those rows
#   *.worseE10.matchIDs.clean   second "|"-separated field of each ID
tail -n +1721 ARSA.blast.uniq \
  | tee ARSA.blast.uniq.worseE10 \
  | cut -f 2 \
  | tee ARSA.blast.uniq.worseE10.matchIDs \
  | cut -d "|" -f 2 \
  > ARSA.blast.uniq.worseE10.matchIDs.clean