Lab journal

Multiple sequence alignment

The HFE protein consists of 3 domains: alpha 1 and 2 and a Immunoglobulin C1-set domain. Therefore we found two different pfam families that both match our protein: Class I Histocompatibility antigen, domains alpha 1 and 2 (http://pfam.sanger.ac.uk/family/PF00129) and Immunoglobulin C1-set domain (http://pfam.sanger.ac.uk/family/PF07654#tabview=tab0). We checked how much of out proteins sequence is covered by both fimilies: immunoglobulin
# GS HFE_HUMAN/211-294 DR PDB; 1A6Z A; 189-272;

mhc class 1
# GS HFE_HUMAN/26-202 DR PDB; 1A6Z A; 4-180;

We downloaded the alignments in fasta format. Therefore, we first formatted the alignment to fasta format and then downloaded the .txt file. The alignment were then formatted again with a2m2aln:

/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < imm.txt > imm.aln
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < mhc.txt > mhc.aln
/usr/share/freecontact/a2m2aln -q '^RASH_HUMAN/(\d+)' --quiet < ras.txt > ras.aln

Calculate and analyze correlated mutations

freecontact with standard parameters and evfold as output format this was done with the following command:

freecontact -o evfold < imm.aln > imm_contacts.out
freecontact -o evfold < mhc.aln > mhc_contacts.out
freecontact -o evfold < ras.aln > ras_contacts.out

The contact predictions were then analysed using the following pipeline

Remove contacts that have a distance of <= 5 residues in sequence
Order descending by CN score and plot score distribution
Select high scoring pairs
Load reference PDB and determine minimum atom distance between all residue pairs
determine pairs that have a minimum distance of below 5 A

Calculate structural models

Ras_DI_Score_Configuration

The configuration for the model calculation with PLM scoring was the same, except of course for the Contact scoring method.

Scripts

contact_util.R

cDist <- function(a,b) {

if(abs(a-b) > 5) { return(TRUE) } else { return(FALSE) } }

getValid <- function(dat,col1,col2) { valid <- with(dat[,c(col1,col2)], mapply(cDist,V1,V3)) validPairs <- dat[valid,] return(validPairs) }

getHS <- function(a,histName="cn_score_hist.png") {

get pairs with residue distance > 5

valid <- with(a[,c(1,3)], mapply(cDist,V1,V3)) validPairs <- a[valid,]

order by decreasing cn value

ord <- order(validPairs[,6],decreasing=TRUE) vSorted <- validPairs[ord,]

plot score distribution

png(histName,width=500,height=400) hist(vSorted[,6],100,col="lightblue",xlab="CN score", main="",xlim=c(-1,max(vSorted[,6]))) dev.off()

return(vSorted) }

getCoords <- function(hs,offset) { xVal <- sapply(hs[,1],function(x){ x+offset }) yVal <- sapply(hs[,3],function(x){ x+offset }) return(cbind(xVal,yVal)) }

mapAA <- function(dat) { x <- dat[,c(1,2)] y <- dat[,c(3,4)] colnames(y) <- colnames(x) z <- rbind(x,y) map <- list() for (i in 1:nrow(dat)) {

map[[z[i,1]]] <- z[i,2] } return(map) }

normScores <- function(datSorted,l){ lDat <- datSorted[1:l,] AAmap <- mapAA(lDat) nFactor <- mean(lDat[,6]) x <- lDat[,c(1,6)] y <- lDat[,c(3,6)] colnames(y) <- colnames(x) z <- rbind(x,y) scSums <- aggregate(z[,2], by=list(z[,1]) , FUN=sum) ord <- order(scSums[,2],decreasing=T) scSums <- scSums[ord,]

return(scSums)

} </source>

get_contactMap.R

source("atom.select.R") source("read.pdb.R") source("dm.R") source("dm.xyz.R")

minDist <- function(a,b,distAtom,allRes) { aIND <- which(allRes==a) bIND <- which(allRes==b)

return(min(distAtom[aIND,bIND])) }

contactMap_A <- function(pdbFile) { prot <- read.pdb(pdbFile) distAtom <- dm(prot,selection="//A/////")

chainA <- prot$atom[prot$atom[,5]=="A",]

resVec <- unique(chainA[,6])

distRes <- sapply(resVec, function(x,y){ sapply(y,function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x)},y=resVec)

distRes <- sapply(resVec, function(x,y){ sapply(y[(as.integer(x)+1):length(y)],function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x) },y=resVec)

return(distRes) }

</source>

get_contacts_clean.R

args <- commandArgs(trailingOnly = TRUE)

source("get_contactMap.R") source("contact_util.R")

~ contactFile = "../ras_contacts.out"
~ pdbFile = "5P21.pdb"
~ seqL = 160
~ offset=0
~ pdbOff = 4

contactFile = "../mhc_contacts.out" pdbFile="1A6Z.pdb" seqL=174 offset=-25 pdbOff=1

~ contactFile = "../imm_contacts.out"
~ pdbFile="1A6Z.pdb"
~ seqL=76
~ offset=-25
~ pdbOff=191

define borders of domains

r <- c(pdbOff+1,pdbOff+1,pdbOff+seqL,pdbOff+seqL)

dat <- read.table(contactFile) datSorted <- getHS(dat)

get high scoring pairs

hs <- datSorted[datSorted[,6] >= 1,]

~ #get score ~ tp/fp correlation plot
~ png("score_correlation.png",500,600)
~ boxplot(hs[,6] ~ hs[,7],names=c("FP","TP"),ylab="CN score",cex.axis=1.3,cex.lab=1.3,main=paste("Pearson correlation: ",round(cc,3)),cex.main=1.3)
~ dev.off()

-4+4 -0 = 0

xy_dom <- getCoords(hs,offset)

load PDB

distRes <- contactMap_A(pdbFile) l <- dim(distRes)[1] dRes5 <- apply(distRes,c(1,2),function(x){ if(x<=5) { return(1) } else {return(0)}}) predC <- matrix(rep(0,l*l),nrow=l)

true_contacts <- NULL for (i in 1:dim(xy_dom)[1]) { predC[xy_dom[i,1],xy_dom[i,2]] <- 1 predC[xy_dom[i,2],xy_dom[i,1]] <- 1 true_contacts <- c(true_contacts,dRes5[xy_dom[i,1],xy_dom[i,2]]) }

hs <- cbind(hs,true_contacts)

TP <- table(hs[,7])[2] FP <- length(hs[,7]) - TP print(table(hs[,7]))

print(paste("TP: ",TP)) print(paste("FP: ",FP))

cor(hs[,6],hs[,7])

png("contactMap.png",width=800,height=800) par(mar=c(5,5,3,3)) dr <- which(dRes5==1,arr.ind=T) pc <- which(predC==1,arr.ind=T) plot(dr,pch=16,col="grey",xlab="residues",ylab="residues",cex.lab=1.7,main="") points(pc,col="red",pch=3) rect(r[1],r[2],r[3],r[4]) dev.off()

normalise scores

nScores <- normScores(datSorted,seqL) print("normalised Scores") print(nScores)

</source>

Lab journal

Contents

Multiple sequence alignment

Calculate and analyze correlated mutations

Calculate structural models

Scripts

contact_util.R

get_contactMap.R

get_contacts_clean.R

Navigation menu

Views

Personal tools

Bioinformatik navigation

MediaWiki navigation

Search

Tools