Lab journal
Contents
Multiple sequence alignment
The HFE protein consists of 3 domains: alpha 1 and 2 and a Immunoglobulin C1-set domain. Therefore we found two different pfam families that both match our protein:
Class I Histocompatibility antigen, domains alpha 1 and 2 (http://pfam.sanger.ac.uk/family/PF00129) and Immunoglobulin C1-set domain (http://pfam.sanger.ac.uk/family/PF07654#tabview=tab0).
We checked how much of out proteins sequence is covered by both fimilies:
immunoglobulin
# GS HFE_HUMAN/211-294 DR PDB; 1A6Z A; 189-272;
mhc class 1
# GS HFE_HUMAN/26-202 DR PDB; 1A6Z A; 4-180;
We downloaded the alignments in fasta format. Therefore, we first formatted the alignment to fasta format and then downloaded the .txt file. The alignment were then formatted again with a2m2aln:
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < imm.txt > imm.aln /usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < mhc.txt > mhc.aln /usr/share/freecontact/a2m2aln -q '^RASH_HUMAN/(\d+)' --quiet < ras.txt > ras.aln
freecontact with standard parameters and evfold as output format this was done with the following command:
freecontact -o evfold < imm.aln > imm_contacts.out freecontact -o evfold < mhc.aln > mhc_contacts.out freecontact -o evfold < ras.aln > ras_contacts.out
The contact predictions were then analysed using the following pipeline
- Remove contacts that have a distance of <= 5 residues in sequence
- Order descending by CN score and plot score distribution
- Select high scoring pairs
- Load reference PDB and determine minimum atom distance between all residue pairs
- determine pairs that have a minimum distance of below 5 A
Calculate structural models
The configuration for the model calculation with PLM scoring was the same, except of course for the Contact scoring method.
Scripts
contact_util.R
<source lang="php">
cDist <- function(a,b) {
if(abs(a-b) > 5) { return(TRUE) } else { return(FALSE) } }
getValid <- function(dat,col1,col2) { valid <- with(dat[,c(col1,col2)], mapply(cDist,V1,V3)) validPairs <- dat[valid,] return(validPairs) }
getHS <- function(a,histName="cn_score_hist.png") {
- get pairs with residue distance > 5
valid <- with(a[,c(1,3)], mapply(cDist,V1,V3)) validPairs <- a[valid,]
- order by decreasing cn value
ord <- order(validPairs[,6],decreasing=TRUE) vSorted <- validPairs[ord,]
- plot score distribution
png(histName,width=500,height=400) hist(vSorted[,6],100,col="lightblue",xlab="CN score", main="",xlim=c(-1,max(vSorted[,6]))) dev.off()
return(vSorted)
}
getCoords <- function(hs,offset) {
xVal <- sapply(hs[,1],function(x){ x+offset })
yVal <- sapply(hs[,3],function(x){ x+offset })
return(cbind(xVal,yVal))
}
mapAA <- function(dat) { x <- dat[,c(1,2)] y <- dat[,c(3,4)] colnames(y) <- colnames(x) z <- rbind(x,y) map <- list() for (i in 1:nrow(dat)) {
map[[z[i,1]]] <- z[i,2] } return(map) }
normScores <- function(datSorted,l){ lDat <- datSorted[1:l,] AAmap <- mapAA(lDat) nFactor <- mean(lDat[,6]) x <- lDat[,c(1,6)] y <- lDat[,c(3,6)] colnames(y) <- colnames(x) z <- rbind(x,y) scSums <- aggregate(z[,2], by=list(z[,1]) , FUN=sum) ord <- order(scSums[,2],decreasing=T) scSums <- scSums[ord,]
return(scSums)
} </source>
get_contactMap.R
<source lang="php">
source("atom.select.R") source("read.pdb.R") source("dm.R") source("dm.xyz.R")
minDist <- function(a,b,distAtom,allRes) {
aIND <- which(allRes==a)
bIND <- which(allRes==b)
return(min(distAtom[aIND,bIND]))
}
contactMap_A <- function(pdbFile) { prot <- read.pdb(pdbFile) distAtom <- dm(prot,selection="//A/////")
chainA <- prot$atom[prot$atom[,5]=="A",]
resVec <- unique(chainA[,6])
distRes <- sapply(resVec, function(x,y){ sapply(y,function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x)},y=resVec)
- distRes <- sapply(resVec, function(x,y){ sapply(y[(as.integer(x)+1):length(y)],function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x) },y=resVec)
return(distRes)
}
</source>
get_contacts_clean.R
<source lang="php">
args <- commandArgs(trailingOnly = TRUE)
source("get_contactMap.R") source("contact_util.R")
- ~ contactFile = "../ras_contacts.out"
- ~ pdbFile = "5P21.pdb"
- ~ seqL = 160
- ~ offset=0
- ~ pdbOff = 4
contactFile = "../mhc_contacts.out" pdbFile="1A6Z.pdb" seqL=174 offset=-25 pdbOff=1
- ~ contactFile = "../imm_contacts.out"
- ~ pdbFile="1A6Z.pdb"
- ~ seqL=76
- ~ offset=-25
- ~ pdbOff=191
- define borders of domains
r <- c(pdbOff+1,pdbOff+1,pdbOff+seqL,pdbOff+seqL)
dat <- read.table(contactFile)
datSorted <- getHS(dat)
- get high scoring pairs
hs <- datSorted[datSorted[,6] >= 1,]
- ~ #get score ~ tp/fp correlation plot
- ~ png("score_correlation.png",500,600)
- ~ boxplot(hs[,6] ~ hs[,7],names=c("FP","TP"),ylab="CN score",cex.axis=1.3,cex.lab=1.3,main=paste("Pearson correlation: ",round(cc,3)),cex.main=1.3)
- ~ dev.off()
- -4+4 -0 = 0
xy_dom <- getCoords(hs,offset)
- load PDB
distRes <- contactMap_A(pdbFile) l <- dim(distRes)[1] dRes5 <- apply(distRes,c(1,2),function(x){ if(x<=5) { return(1) } else {return(0)}}) predC <- matrix(rep(0,l*l),nrow=l)
true_contacts <- NULL
for (i in 1:dim(xy_dom)[1]) {
predC[xy_dom[i,1],xy_dom[i,2]] <- 1
predC[xy_dom[i,2],xy_dom[i,1]] <- 1
true_contacts <- c(true_contacts,dRes5[xy_dom[i,1],xy_dom[i,2]])
}
hs <- cbind(hs,true_contacts)
TP <- table(hs[,7])[2] FP <- length(hs[,7]) - TP print(table(hs[,7]))
print(paste("TP: ",TP))
print(paste("FP: ",FP))
- cor(hs[,6],hs[,7])
png("contactMap.png",width=800,height=800) par(mar=c(5,5,3,3)) dr <- which(dRes5==1,arr.ind=T) pc <- which(predC==1,arr.ind=T) plot(dr,pch=16,col="grey",xlab="residues",ylab="residues",cex.lab=1.7,main="") points(pc,col="red",pch=3) rect(r[1],r[2],r[3],r[4]) dev.off()
- normalise scores
nScores <- normScores(datSorted,seqL) print("normalised Scores") print(nScores)
</source>