Difference between revisions of "Lab journal"
(11 intermediate revisions by the same user not shown) | |||
Line 33: | Line 33: | ||
# determine pairs that have a minimum distance of below 5 A |
# determine pairs that have a minimum distance of below 5 A |
||
+ | |||
− | [[ras_ev_hotspots| Ras hotspots]] |
||
− | [[mhc_ev_hotspots| MHC I hotspots]] |
||
− | [[imm_ev_hotspots| Ig C1 hotspots]] |
||
== Calculate structural models == |
== Calculate structural models == |
||
+ | *[[Ras_DI_Score_Configuration]] |
||
+ | |||
+ | The configuration for the model calculation with PLM scoring was the same, except of course for the Contact scoring method. |
||
+ | |||
+ | === Contact Maps === |
||
+ | |||
+ | |||
+ | <figtable id="ras_fold_cm"> |
||
+ | |||
+ | {| |
||
+ | | [[File:Ras_di_76.png|center|thumb|300px|DI score and 76 EC constraints]] || [[File:Ras_di_123.png|center|thumb|300px|DI score and 123 EC constraints]] || [[File:Ras_di_189.png|center|thumb|300px|DI score and 189 EC constraints]] |
||
+ | |- |
||
+ | | [[File:Ras_plm_76.png|center|thumb|300px|PLM score and 76 EC constraints]] || [[File:Ras_plm_123.png|center|thumb|300px|PLM score and 123 EC constraints]] || [[File:Ras_plm_189.png|center|thumb|300px|PLM score and 189 EC constraints]] |
||
+ | |– |
||
+ | |+ style="caption-side: bottom; text-align: left" |<font size=2>'''Figure 1:''' Contact maps of EVfold predictions for Ras using DI and PLM score. |
||
+ | |} |
||
+ | </figtable> |
||
+ | |||
+ | == Scripts == |
||
+ | |||
+ | === contact_util.R === |
||
+ | |||
+ | <source lang="php"> |
||
+ | |||
+ | cDist <- function(a,b) { |
||
+ | |||
+ | if(abs(a-b) > 5) { |
||
+ | return(TRUE) |
||
+ | } |
||
+ | else { |
||
+ | return(FALSE) |
||
+ | } |
||
+ | } |
||
+ | |||
+ | getValid <- function(dat,col1,col2) { |
||
+ | valid <- with(dat[,c(col1,col2)], mapply(cDist,V1,V3)) |
||
+ | validPairs <- dat[valid,] |
||
+ | return(validPairs) |
||
+ | } |
||
+ | |||
+ | |||
+ | getHS <- function(a,histName="cn_score_hist.png") { |
||
+ | |||
+ | |||
+ | #get pairs with residue distance > 5 |
||
+ | valid <- with(a[,c(1,3)], mapply(cDist,V1,V3)) |
||
+ | validPairs <- a[valid,] |
||
+ | |||
+ | #order by decreasing cn value |
||
+ | ord <- order(validPairs[,6],decreasing=TRUE) |
||
+ | vSorted <- validPairs[ord,] |
||
+ | #plot score distribution |
||
+ | png(histName,width=500,height=400) |
||
+ | hist(vSorted[,6],100,col="lightblue",xlab="CN score", main="",xlim=c(-1,max(vSorted[,6]))) |
||
+ | dev.off() |
||
+ | |||
+ | |||
+ | return(vSorted) |
||
+ | } |
||
+ | |||
+ | |||
+ | getCoords <- function(hs,offset) { |
||
+ | xVal <- sapply(hs[,1],function(x){ x+offset }) |
||
+ | yVal <- sapply(hs[,3],function(x){ x+offset }) |
||
+ | return(cbind(xVal,yVal)) |
||
+ | } |
||
+ | |||
+ | mapAA <- function(dat) { |
||
+ | x <- dat[,c(1,2)] |
||
+ | y <- dat[,c(3,4)] |
||
+ | colnames(y) <- colnames(x) |
||
+ | z <- rbind(x,y) |
||
+ | map <- list() |
||
+ | for (i in 1:nrow(dat)) { |
||
+ | |||
+ | map[[z[i,1]]] <- z[i,2] |
||
+ | } |
||
+ | return(map) |
||
+ | } |
||
+ | |||
+ | normScores <- function(datSorted,l){ |
||
+ | lDat <- datSorted[1:l,] |
||
+ | AAmap <- mapAA(lDat) |
||
+ | nFactor <- mean(lDat[,6]) |
||
+ | x <- lDat[,c(1,6)] |
||
+ | y <- lDat[,c(3,6)] |
||
+ | colnames(y) <- colnames(x) |
||
+ | z <- rbind(x,y) |
||
+ | scSums <- aggregate(z[,2], by=list(z[,1]) , FUN=sum) |
||
+ | ord <- order(scSums[,2],decreasing=T) |
||
+ | scSums <- scSums[ord,] |
||
+ | |||
+ | return(scSums) |
||
+ | |||
+ | } |
||
+ | </source> |
||
+ | |||
+ | === get_contactMap.R === |
||
+ | |||
+ | <source lang="php"> |
||
+ | |||
+ | source("atom.select.R") |
||
+ | source("read.pdb.R") |
||
+ | source("dm.R") |
||
+ | source("dm.xyz.R") |
||
+ | |||
+ | |||
+ | minDist <- function(a,b,distAtom,allRes) { |
||
+ | aIND <- which(allRes==a) |
||
+ | bIND <- which(allRes==b) |
||
+ | |||
+ | |||
+ | return(min(distAtom[aIND,bIND])) |
||
+ | } |
||
+ | |||
+ | contactMap_A <- function(pdbFile) { |
||
+ | prot <- read.pdb(pdbFile) |
||
+ | distAtom <- dm(prot,selection="//A/////") |
||
+ | |||
+ | chainA <- prot$atom[prot$atom[,5]=="A",] |
||
+ | |||
+ | resVec <- unique(chainA[,6]) |
||
+ | |||
+ | distRes <- sapply(resVec, function(x,y){ sapply(y,function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x)},y=resVec) |
||
+ | |||
+ | #distRes <- sapply(resVec, function(x,y){ sapply(y[(as.integer(x)+1):length(y)],function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x) },y=resVec) |
||
+ | |||
+ | |||
+ | return(distRes) |
||
+ | } |
||
+ | |||
+ | </source> |
||
+ | |||
+ | === get_contacts_clean.R === |
||
+ | |||
+ | <source lang="php"> |
||
+ | |||
+ | args <- commandArgs(trailingOnly = TRUE) |
||
+ | |||
+ | source("get_contactMap.R") |
||
+ | source("contact_util.R") |
||
+ | |||
+ | #~ contactFile = "../ras_contacts.out" |
||
+ | #~ pdbFile = "5P21.pdb" |
||
+ | #~ seqL = 160 |
||
+ | #~ offset=0 |
||
+ | #~ pdbOff = 4 |
||
+ | |||
+ | contactFile = "../mhc_contacts.out" |
||
+ | pdbFile="1A6Z.pdb" |
||
+ | seqL=174 |
||
+ | offset=-25 |
||
+ | pdbOff=1 |
||
+ | |||
+ | #~ contactFile = "../imm_contacts.out" |
||
+ | #~ pdbFile="1A6Z.pdb" |
||
+ | #~ seqL=76 |
||
+ | #~ offset=-25 |
||
+ | #~ pdbOff=191 |
||
+ | |||
+ | |||
+ | |||
+ | #define borders of domains |
||
+ | r <- c(pdbOff+1,pdbOff+1,pdbOff+seqL,pdbOff+seqL) |
||
+ | |||
+ | |||
+ | dat <- read.table(contactFile) |
||
+ | datSorted <- getHS(dat) |
||
+ | #get high scoring pairs |
||
+ | hs <- datSorted[datSorted[,6] >= 1,] |
||
+ | |||
+ | #~ #get score ~ tp/fp correlation plot |
||
+ | #~ png("score_correlation.png",500,600) |
||
+ | #~ boxplot(hs[,6] ~ hs[,7],names=c("FP","TP"),ylab="CN score",cex.axis=1.3,cex.lab=1.3,main=paste("Pearson correlation: ",round(cc,3)),cex.main=1.3) |
||
+ | #~ dev.off() |
||
+ | |||
+ | |||
+ | |||
+ | #-4+4 -0 = 0 |
||
+ | xy_dom <- getCoords(hs,offset) |
||
+ | |||
+ | |||
+ | #load PDB |
||
+ | distRes <- contactMap_A(pdbFile) |
||
+ | l <- dim(distRes)[1] |
||
+ | dRes5 <- apply(distRes,c(1,2),function(x){ if(x<=5) { return(1) } else {return(0)}}) |
||
+ | predC <- matrix(rep(0,l*l),nrow=l) |
||
+ | |||
+ | |||
+ | |||
+ | |||
+ | true_contacts <- NULL |
||
+ | for (i in 1:dim(xy_dom)[1]) { |
||
+ | predC[xy_dom[i,1],xy_dom[i,2]] <- 1 |
||
+ | predC[xy_dom[i,2],xy_dom[i,1]] <- 1 |
||
+ | true_contacts <- c(true_contacts,dRes5[xy_dom[i,1],xy_dom[i,2]]) |
||
+ | } |
||
+ | |||
+ | hs <- cbind(hs,true_contacts) |
||
+ | |||
+ | TP <- table(hs[,7])[2] |
||
+ | FP <- length(hs[,7]) - TP |
||
+ | print(table(hs[,7])) |
||
+ | |||
+ | |||
+ | print(paste("TP: ",TP)) |
||
+ | print(paste("FP: ",FP)) |
||
+ | |||
+ | #cor(hs[,6],hs[,7]) |
||
+ | |||
+ | png("contactMap.png",width=800,height=800) |
||
+ | par(mar=c(5,5,3,3)) |
||
+ | dr <- which(dRes5==1,arr.ind=T) |
||
+ | pc <- which(predC==1,arr.ind=T) |
||
+ | plot(dr,pch=16,col="grey",xlab="residues",ylab="residues",cex.lab=1.7,main="") |
||
+ | points(pc,col="red",pch=3) |
||
+ | rect(r[1],r[2],r[3],r[4]) |
||
+ | dev.off() |
||
+ | |||
+ | |||
+ | #normalise scores |
||
+ | nScores <- normScores(datSorted,seqL) |
||
+ | print("normalised Scores") |
||
+ | print(nScores) |
||
+ | |||
+ | </source> |
Latest revision as of 20:38, 27 August 2013
Contents
Multiple sequence alignment
The HFE protein consists of 3 domains: alpha 1 and 2 and a Immunoglobulin C1-set domain. Therefore we found two different pfam families that both match our protein:
Class I Histocompatibility antigen, domains alpha 1 and 2 (http://pfam.sanger.ac.uk/family/PF00129) and Immunoglobulin C1-set domain (http://pfam.sanger.ac.uk/family/PF07654#tabview=tab0).
We checked how much of out proteins sequence is covered by both fimilies:
immunoglobulin
# GS HFE_HUMAN/211-294 DR PDB; 1A6Z A; 189-272;
mhc class 1
# GS HFE_HUMAN/26-202 DR PDB; 1A6Z A; 4-180;
We downloaded the alignments in fasta format. Therefore, we first formatted the alignment to fasta format and then downloaded the .txt file. The alignment were then formatted again with a2m2aln:
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < imm.txt > imm.aln /usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < mhc.txt > mhc.aln /usr/share/freecontact/a2m2aln -q '^RASH_HUMAN/(\d+)' --quiet < ras.txt > ras.aln
freecontact with standard parameters and evfold as output format this was done with the following command:
freecontact -o evfold < imm.aln > imm_contacts.out freecontact -o evfold < mhc.aln > mhc_contacts.out freecontact -o evfold < ras.aln > ras_contacts.out
The contact predictions were then analysed using the following pipeline
- Remove contacts that have a distance of <= 5 residues in sequence
- Order descending by CN score and plot score distribution
- Select high scoring pairs
- Load reference PDB and determine minimum atom distance between all residue pairs
- determine pairs that have a minimum distance of below 5 A
Calculate structural models
The configuration for the model calculation with PLM scoring was the same, except of course for the Contact scoring method.
Contact Maps
<figtable id="ras_fold_cm">
– |
</figtable>
Scripts
contact_util.R
<source lang="php">
cDist <- function(a,b) {
if(abs(a-b) > 5) { return(TRUE) } else { return(FALSE) } }
getValid <- function(dat,col1,col2) { valid <- with(dat[,c(col1,col2)], mapply(cDist,V1,V3)) validPairs <- dat[valid,] return(validPairs) }
getHS <- function(a,histName="cn_score_hist.png") {
- get pairs with residue distance > 5
valid <- with(a[,c(1,3)], mapply(cDist,V1,V3)) validPairs <- a[valid,]
- order by decreasing cn value
ord <- order(validPairs[,6],decreasing=TRUE) vSorted <- validPairs[ord,]
- plot score distribution
png(histName,width=500,height=400) hist(vSorted[,6],100,col="lightblue",xlab="CN score", main="",xlim=c(-1,max(vSorted[,6]))) dev.off()
return(vSorted)
}
getCoords <- function(hs,offset) {
xVal <- sapply(hs[,1],function(x){ x+offset })
yVal <- sapply(hs[,3],function(x){ x+offset })
return(cbind(xVal,yVal))
}
mapAA <- function(dat) { x <- dat[,c(1,2)] y <- dat[,c(3,4)] colnames(y) <- colnames(x) z <- rbind(x,y) map <- list() for (i in 1:nrow(dat)) {
map[[z[i,1]]] <- z[i,2] } return(map) }
normScores <- function(datSorted,l){ lDat <- datSorted[1:l,] AAmap <- mapAA(lDat) nFactor <- mean(lDat[,6]) x <- lDat[,c(1,6)] y <- lDat[,c(3,6)] colnames(y) <- colnames(x) z <- rbind(x,y) scSums <- aggregate(z[,2], by=list(z[,1]) , FUN=sum) ord <- order(scSums[,2],decreasing=T) scSums <- scSums[ord,]
return(scSums)
} </source>
get_contactMap.R
<source lang="php">
source("atom.select.R") source("read.pdb.R") source("dm.R") source("dm.xyz.R")
minDist <- function(a,b,distAtom,allRes) {
aIND <- which(allRes==a)
bIND <- which(allRes==b)
return(min(distAtom[aIND,bIND]))
}
contactMap_A <- function(pdbFile) { prot <- read.pdb(pdbFile) distAtom <- dm(prot,selection="//A/////")
chainA <- prot$atom[prot$atom[,5]=="A",]
resVec <- unique(chainA[,6])
distRes <- sapply(resVec, function(x,y){ sapply(y,function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x)},y=resVec)
- distRes <- sapply(resVec, function(x,y){ sapply(y[(as.integer(x)+1):length(y)],function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x) },y=resVec)
return(distRes)
}
</source>
get_contacts_clean.R
<source lang="php">
args <- commandArgs(trailingOnly = TRUE)
source("get_contactMap.R") source("contact_util.R")
- ~ contactFile = "../ras_contacts.out"
- ~ pdbFile = "5P21.pdb"
- ~ seqL = 160
- ~ offset=0
- ~ pdbOff = 4
contactFile = "../mhc_contacts.out" pdbFile="1A6Z.pdb" seqL=174 offset=-25 pdbOff=1
- ~ contactFile = "../imm_contacts.out"
- ~ pdbFile="1A6Z.pdb"
- ~ seqL=76
- ~ offset=-25
- ~ pdbOff=191
- define borders of domains
r <- c(pdbOff+1,pdbOff+1,pdbOff+seqL,pdbOff+seqL)
dat <- read.table(contactFile)
datSorted <- getHS(dat)
- get high scoring pairs
hs <- datSorted[datSorted[,6] >= 1,]
- ~ #get score ~ tp/fp correlation plot
- ~ png("score_correlation.png",500,600)
- ~ boxplot(hs[,6] ~ hs[,7],names=c("FP","TP"),ylab="CN score",cex.axis=1.3,cex.lab=1.3,main=paste("Pearson correlation: ",round(cc,3)),cex.main=1.3)
- ~ dev.off()
- -4+4 -0 = 0
xy_dom <- getCoords(hs,offset)
- load PDB
distRes <- contactMap_A(pdbFile) l <- dim(distRes)[1] dRes5 <- apply(distRes,c(1,2),function(x){ if(x<=5) { return(1) } else {return(0)}}) predC <- matrix(rep(0,l*l),nrow=l)
true_contacts <- NULL
for (i in 1:dim(xy_dom)[1]) {
predC[xy_dom[i,1],xy_dom[i,2]] <- 1
predC[xy_dom[i,2],xy_dom[i,1]] <- 1
true_contacts <- c(true_contacts,dRes5[xy_dom[i,1],xy_dom[i,2]])
}
hs <- cbind(hs,true_contacts)
TP <- table(hs[,7])[2] FP <- length(hs[,7]) - TP print(table(hs[,7]))
print(paste("TP: ",TP))
print(paste("FP: ",FP))
- cor(hs[,6],hs[,7])
png("contactMap.png",width=800,height=800) par(mar=c(5,5,3,3)) dr <- which(dRes5==1,arr.ind=T) pc <- which(predC==1,arr.ind=T) plot(dr,pch=16,col="grey",xlab="residues",ylab="residues",cex.lab=1.7,main="") points(pc,col="red",pch=3) rect(r[1],r[2],r[3],r[4]) dev.off()
- normalise scores
nScores <- normScores(datSorted,seqL) print("normalised Scores") print(nScores)
</source>