Difference between revisions of "Lab journal"

From Bioinformatikpedia
 
(17 intermediate revisions by the same user not shown)
Line 1: Line 1:
 
== Multiple sequence alignment ==
 
== Multiple sequence alignment ==
 
The HFE protein consists of 3 domains: alpha 1 and 2 and a Immunoglobulin C1-set domain. Therefore we found two different pfam families that both match our protein:
 
The HFE protein consists of 3 domains: alpha 1 and 2 and a Immunoglobulin C1-set domain. Therefore we found two different pfam families that both match our protein:
Class I Histocompatibility antigen, domains alpha 1 and 2 (http://pfam.sanger.ac.uk/family/PF07654#tabview=tab0) and Immunoglobulin C1-set domain (http://pfam.sanger.ac.uk/family/PF07654#tabview=tab0).
+
Class I Histocompatibility antigen, domains alpha 1 and 2 (http://pfam.sanger.ac.uk/family/PF00129) and Immunoglobulin C1-set domain (http://pfam.sanger.ac.uk/family/PF07654#tabview=tab0).
 
We checked how much of out proteins sequence is covered by both fimilies:
 
We checked how much of out proteins sequence is covered by both fimilies:
 
immunoglobulin <br>
 
immunoglobulin <br>
Line 7: Line 7:
   
 
mhc class 1 <br>
 
mhc class 1 <br>
<!-- # -->=GS HFE_HUMAN/26-202 DR PDB; 1A6Z A; 4-180;
+
<nowiki>#</nowiki> GS HFE_HUMAN/26-202 DR PDB; 1A6Z A; 4-180;
 
   
 
We downloaded the alignments in fasta format. Therefore, we first formatted the alignment to fasta format and then downloaded the .txt file.
 
We downloaded the alignments in fasta format. Therefore, we first formatted the alignment to fasta format and then downloaded the .txt file.
 
The alignment were then formatted again with a2m2aln:
 
The alignment were then formatted again with a2m2aln:
   
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < imm.txt > imm.aln
+
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < imm.txt > imm.aln
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < mhc.txt > mhc.aln
+
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < mhc.txt > mhc.aln
/usr/share/freecontact/a2m2aln -q '^RASH_HUMAN/(\d+)' --quiet < ras.txt > ras.aln
+
/usr/share/freecontact/a2m2aln -q '^RASH_HUMAN/(\d+)' --quiet < ras.txt > ras.aln
   
 
== Calculate and analyze correlated mutations ==
 
== Calculate and analyze correlated mutations ==
Line 21: Line 20:
 
freecontact with standard parameters and evfold as output format
 
freecontact with standard parameters and evfold as output format
 
this was done with the following command:
 
this was done with the following command:
  +
  +
freecontact -o evfold < imm.aln > imm_contacts.out
  +
freecontact -o evfold < mhc.aln > mhc_contacts.out
  +
freecontact -o evfold < ras.aln > ras_contacts.out
  +
  +
The contact predictions were then analysed using the following pipeline
  +
  +
# Remove contacts that have a distance of <= 5 residues in sequence
  +
# Order descending by CN score and plot score distribution
  +
# Select high scoring pairs
  +
# Load reference PDB and determine minimum atom distance between all residue pairs
  +
# determine pairs that have a minimum distance of below 5 A
   
   
freecontact -o evfold < imm.aln > imm_contacts.out
 
freecontact -o evfold < mhc.aln > mhc_contacts.out
 
freecontact -o evfold < ras.aln > ras_contacts.out
 
   
 
== Calculate structural models ==
 
== Calculate structural models ==
  +
*[[Ras_DI_Score_Configuration]]
  +
  +
The configuration for the model calculation with PLM scoring was the same, except of course for the Contact scoring method.
  +
  +
=== Contact Maps ===
  +
  +
  +
<figtable id="ras_fold_cm">
  +
  +
{|
  +
| [[File:Ras_di_76.png|center|thumb|300px|DI score and 76 EC constraints]] || [[File:Ras_di_123.png|center|thumb|300px|DI score and 123 EC constraints]] || [[File:Ras_di_189.png|center|thumb|300px|DI score and 189 EC constraints]]
  +
|-
  +
| [[File:Ras_plm_76.png|center|thumb|300px|PLM score and 76 EC constraints]] || [[File:Ras_plm_123.png|center|thumb|300px|PLM score and 123 EC constraints]] || [[File:Ras_plm_189.png|center|thumb|300px|PLM score and 189 EC constraints]]
  +
|–
  +
|+ style="caption-side: bottom; text-align: left" |<font size=2>'''Figure 1:''' Contact maps of EVfold predictions for Ras using DI and PLM score.
  +
|}
  +
</figtable>
  +
  +
== Scripts ==
  +
  +
=== contact_util.R ===
  +
  +
<source lang="php">
  +
  +
cDist <- function(a,b) {
  +
  +
if(abs(a-b) > 5) {
  +
return(TRUE)
  +
}
  +
else {
  +
return(FALSE)
  +
}
  +
}
  +
  +
getValid <- function(dat,col1,col2) {
  +
valid <- with(dat[,c(col1,col2)], mapply(cDist,V1,V3))
  +
validPairs <- dat[valid,]
  +
return(validPairs)
  +
}
  +
  +
  +
getHS <- function(a,histName="cn_score_hist.png") {
  +
  +
  +
#get pairs with residue distance > 5
  +
valid <- with(a[,c(1,3)], mapply(cDist,V1,V3))
  +
validPairs <- a[valid,]
  +
  +
#order by decreasing cn value
  +
ord <- order(validPairs[,6],decreasing=TRUE)
  +
vSorted <- validPairs[ord,]
  +
#plot score distribution
  +
png(histName,width=500,height=400)
  +
hist(vSorted[,6],100,col="lightblue",xlab="CN score", main="",xlim=c(-1,max(vSorted[,6])))
  +
dev.off()
  +
  +
  +
return(vSorted)
  +
}
  +
  +
  +
getCoords <- function(hs,offset) {
  +
xVal <- sapply(hs[,1],function(x){ x+offset })
  +
yVal <- sapply(hs[,3],function(x){ x+offset })
  +
return(cbind(xVal,yVal))
  +
}
  +
  +
mapAA <- function(dat) {
  +
x <- dat[,c(1,2)]
  +
y <- dat[,c(3,4)]
  +
colnames(y) <- colnames(x)
  +
z <- rbind(x,y)
  +
map <- list()
  +
for (i in 1:nrow(dat)) {
  +
  +
map[[z[i,1]]] <- z[i,2]
  +
}
  +
return(map)
  +
}
  +
  +
normScores <- function(datSorted,l){
  +
lDat <- datSorted[1:l,]
  +
AAmap <- mapAA(lDat)
  +
nFactor <- mean(lDat[,6])
  +
x <- lDat[,c(1,6)]
  +
y <- lDat[,c(3,6)]
  +
colnames(y) <- colnames(x)
  +
z <- rbind(x,y)
  +
scSums <- aggregate(z[,2], by=list(z[,1]) , FUN=sum)
  +
ord <- order(scSums[,2],decreasing=T)
  +
scSums <- scSums[ord,]
  +
  +
return(scSums)
  +
  +
}
  +
</source>
  +
  +
=== get_contactMap.R ===
  +
  +
<source lang="php">
  +
  +
source("atom.select.R")
  +
source("read.pdb.R")
  +
source("dm.R")
  +
source("dm.xyz.R")
  +
  +
  +
minDist <- function(a,b,distAtom,allRes) {
  +
aIND <- which(allRes==a)
  +
bIND <- which(allRes==b)
  +
  +
  +
return(min(distAtom[aIND,bIND]))
  +
}
  +
  +
contactMap_A <- function(pdbFile) {
  +
prot <- read.pdb(pdbFile)
  +
distAtom <- dm(prot,selection="//A/////")
  +
  +
chainA <- prot$atom[prot$atom[,5]=="A",]
  +
  +
resVec <- unique(chainA[,6])
  +
  +
distRes <- sapply(resVec, function(x,y){ sapply(y,function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x)},y=resVec)
  +
  +
#distRes <- sapply(resVec, function(x,y){ sapply(y[(as.integer(x)+1):length(y)],function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x) },y=resVec)
  +
  +
  +
return(distRes)
  +
}
  +
  +
</source>
  +
  +
=== get_contacts_clean.R ===
  +
  +
<source lang="php">
  +
  +
args <- commandArgs(trailingOnly = TRUE)
  +
  +
source("get_contactMap.R")
  +
source("contact_util.R")
  +
  +
#~ contactFile = "../ras_contacts.out"
  +
#~ pdbFile = "5P21.pdb"
  +
#~ seqL = 160
  +
#~ offset=0
  +
#~ pdbOff = 4
  +
  +
contactFile = "../mhc_contacts.out"
  +
pdbFile="1A6Z.pdb"
  +
seqL=174
  +
offset=-25
  +
pdbOff=1
  +
  +
#~ contactFile = "../imm_contacts.out"
  +
#~ pdbFile="1A6Z.pdb"
  +
#~ seqL=76
  +
#~ offset=-25
  +
#~ pdbOff=191
  +
  +
  +
  +
#define borders of domains
  +
r <- c(pdbOff+1,pdbOff+1,pdbOff+seqL,pdbOff+seqL)
  +
  +
  +
dat <- read.table(contactFile)
  +
datSorted <- getHS(dat)
  +
#get high scoring pairs
  +
hs <- datSorted[datSorted[,6] >= 1,]
  +
  +
#~ #get score ~ tp/fp correlation plot
  +
#~ png("score_correlation.png",500,600)
  +
#~ boxplot(hs[,6] ~ hs[,7],names=c("FP","TP"),ylab="CN score",cex.axis=1.3,cex.lab=1.3,main=paste("Pearson correlation: ",round(cc,3)),cex.main=1.3)
  +
#~ dev.off()
  +
  +
  +
  +
#-4+4 -0 = 0
  +
xy_dom <- getCoords(hs,offset)
  +
  +
  +
#load PDB
  +
distRes <- contactMap_A(pdbFile)
  +
l <- dim(distRes)[1]
  +
dRes5 <- apply(distRes,c(1,2),function(x){ if(x<=5) { return(1) } else {return(0)}})
  +
predC <- matrix(rep(0,l*l),nrow=l)
  +
  +
  +
  +
  +
true_contacts <- NULL
  +
for (i in 1:dim(xy_dom)[1]) {
  +
predC[xy_dom[i,1],xy_dom[i,2]] <- 1
  +
predC[xy_dom[i,2],xy_dom[i,1]] <- 1
  +
true_contacts <- c(true_contacts,dRes5[xy_dom[i,1],xy_dom[i,2]])
  +
}
  +
  +
hs <- cbind(hs,true_contacts)
  +
  +
TP <- table(hs[,7])[2]
  +
FP <- length(hs[,7]) - TP
  +
print(table(hs[,7]))
  +
  +
  +
print(paste("TP: ",TP))
  +
print(paste("FP: ",FP))
  +
  +
#cor(hs[,6],hs[,7])
  +
  +
png("contactMap.png",width=800,height=800)
  +
par(mar=c(5,5,3,3))
  +
dr <- which(dRes5==1,arr.ind=T)
  +
pc <- which(predC==1,arr.ind=T)
  +
plot(dr,pch=16,col="grey",xlab="residues",ylab="residues",cex.lab=1.7,main="")
  +
points(pc,col="red",pch=3)
  +
rect(r[1],r[2],r[3],r[4])
  +
dev.off()
  +
  +
  +
#normalise scores
  +
nScores <- normScores(datSorted,seqL)
  +
print("normalised Scores")
  +
print(nScores)
  +
  +
</source>

Latest revision as of 20:38, 27 August 2013

Multiple sequence alignment

The HFE protein consists of 3 domains: alpha 1 and 2 and a Immunoglobulin C1-set domain. Therefore we found two different pfam families that both match our protein: Class I Histocompatibility antigen, domains alpha 1 and 2 (http://pfam.sanger.ac.uk/family/PF00129) and Immunoglobulin C1-set domain (http://pfam.sanger.ac.uk/family/PF07654#tabview=tab0). We checked how much of out proteins sequence is covered by both fimilies: immunoglobulin
# GS HFE_HUMAN/211-294 DR PDB; 1A6Z A; 189-272;

mhc class 1
# GS HFE_HUMAN/26-202 DR PDB; 1A6Z A; 4-180;

We downloaded the alignments in fasta format. Therefore, we first formatted the alignment to fasta format and then downloaded the .txt file. The alignment were then formatted again with a2m2aln:

/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < imm.txt > imm.aln
/usr/share/freecontact/a2m2aln -q '^HFE_HUMAN/(\d+)' --quiet < mhc.txt > mhc.aln
/usr/share/freecontact/a2m2aln -q '^RASH_HUMAN/(\d+)' --quiet < ras.txt > ras.aln

Calculate and analyze correlated mutations

freecontact with standard parameters and evfold as output format this was done with the following command:

freecontact -o evfold < imm.aln > imm_contacts.out
freecontact -o evfold < mhc.aln > mhc_contacts.out
freecontact -o evfold < ras.aln > ras_contacts.out

The contact predictions were then analysed using the following pipeline

  1. Remove contacts that have a distance of <= 5 residues in sequence
  2. Order descending by CN score and plot score distribution
  3. Select high scoring pairs
  4. Load reference PDB and determine minimum atom distance between all residue pairs
  5. determine pairs that have a minimum distance of below 5 A


Calculate structural models

The configuration for the model calculation with PLM scoring was the same, except of course for the Contact scoring method.

Contact Maps

<figtable id="ras_fold_cm">

DI score and 76 EC constraints
DI score and 123 EC constraints
DI score and 189 EC constraints
PLM score and 76 EC constraints
PLM score and 123 EC constraints
PLM score and 189 EC constraints
Figure 1: Contact maps of EVfold predictions for Ras using DI and PLM score.

</figtable>

Scripts

contact_util.R

<source lang="php">

cDist <- function(a,b) {

if(abs(a-b) > 5) { return(TRUE) } else { return(FALSE) } }

getValid <- function(dat,col1,col2) { valid <- with(dat[,c(col1,col2)], mapply(cDist,V1,V3)) validPairs <- dat[valid,] return(validPairs) }


getHS <- function(a,histName="cn_score_hist.png") {


  1. get pairs with residue distance > 5

valid <- with(a[,c(1,3)], mapply(cDist,V1,V3)) validPairs <- a[valid,]

  1. order by decreasing cn value

ord <- order(validPairs[,6],decreasing=TRUE) vSorted <- validPairs[ord,]

  1. plot score distribution

png(histName,width=500,height=400) hist(vSorted[,6],100,col="lightblue",xlab="CN score", main="",xlim=c(-1,max(vSorted[,6]))) dev.off()


return(vSorted) }


getCoords <- function(hs,offset) { xVal <- sapply(hs[,1],function(x){ x+offset }) yVal <- sapply(hs[,3],function(x){ x+offset }) return(cbind(xVal,yVal)) }

mapAA <- function(dat) { x <- dat[,c(1,2)] y <- dat[,c(3,4)] colnames(y) <- colnames(x) z <- rbind(x,y) map <- list() for (i in 1:nrow(dat)) {

map[[z[i,1]]] <- z[i,2] } return(map) }

normScores <- function(datSorted,l){ lDat <- datSorted[1:l,] AAmap <- mapAA(lDat) nFactor <- mean(lDat[,6]) x <- lDat[,c(1,6)] y <- lDat[,c(3,6)] colnames(y) <- colnames(x) z <- rbind(x,y) scSums <- aggregate(z[,2], by=list(z[,1]) , FUN=sum) ord <- order(scSums[,2],decreasing=T) scSums <- scSums[ord,]

return(scSums)

} </source>

get_contactMap.R

<source lang="php">

source("atom.select.R") source("read.pdb.R") source("dm.R") source("dm.xyz.R")


minDist <- function(a,b,distAtom,allRes) { aIND <- which(allRes==a) bIND <- which(allRes==b)


return(min(distAtom[aIND,bIND])) }

contactMap_A <- function(pdbFile) { prot <- read.pdb(pdbFile) distAtom <- dm(prot,selection="//A/////")

chainA <- prot$atom[prot$atom[,5]=="A",]

resVec <- unique(chainA[,6])

distRes <- sapply(resVec, function(x,y){ sapply(y,function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x)},y=resVec)

  1. distRes <- sapply(resVec, function(x,y){ sapply(y[(as.integer(x)+1):length(y)],function(i,j){ return(minDist(i,j,distAtom,chainA[,6])) },j=x) },y=resVec)


return(distRes) }

</source>

get_contacts_clean.R

<source lang="php">

args <- commandArgs(trailingOnly = TRUE)

source("get_contactMap.R") source("contact_util.R")

  1. ~ contactFile = "../ras_contacts.out"
  2. ~ pdbFile = "5P21.pdb"
  3. ~ seqL = 160
  4. ~ offset=0
  5. ~ pdbOff = 4

contactFile = "../mhc_contacts.out" pdbFile="1A6Z.pdb" seqL=174 offset=-25 pdbOff=1

  1. ~ contactFile = "../imm_contacts.out"
  2. ~ pdbFile="1A6Z.pdb"
  3. ~ seqL=76
  4. ~ offset=-25
  5. ~ pdbOff=191


  1. define borders of domains

r <- c(pdbOff+1,pdbOff+1,pdbOff+seqL,pdbOff+seqL)


dat <- read.table(contactFile) datSorted <- getHS(dat)

  1. get high scoring pairs

hs <- datSorted[datSorted[,6] >= 1,]

  1. ~ #get score ~ tp/fp correlation plot
  2. ~ png("score_correlation.png",500,600)
  3. ~ boxplot(hs[,6] ~ hs[,7],names=c("FP","TP"),ylab="CN score",cex.axis=1.3,cex.lab=1.3,main=paste("Pearson correlation: ",round(cc,3)),cex.main=1.3)
  4. ~ dev.off()


  1. -4+4 -0 = 0

xy_dom <- getCoords(hs,offset)


  1. load PDB

distRes <- contactMap_A(pdbFile) l <- dim(distRes)[1] dRes5 <- apply(distRes,c(1,2),function(x){ if(x<=5) { return(1) } else {return(0)}}) predC <- matrix(rep(0,l*l),nrow=l)



true_contacts <- NULL for (i in 1:dim(xy_dom)[1]) { predC[xy_dom[i,1],xy_dom[i,2]] <- 1 predC[xy_dom[i,2],xy_dom[i,1]] <- 1 true_contacts <- c(true_contacts,dRes5[xy_dom[i,1],xy_dom[i,2]]) }

hs <- cbind(hs,true_contacts)

TP <- table(hs[,7])[2] FP <- length(hs[,7]) - TP print(table(hs[,7]))


print(paste("TP: ",TP)) print(paste("FP: ",FP))

  1. cor(hs[,6],hs[,7])

png("contactMap.png",width=800,height=800) par(mar=c(5,5,3,3)) dr <- which(dRes5==1,arr.ind=T) pc <- which(predC==1,arr.ind=T) plot(dr,pch=16,col="grey",xlab="residues",ylab="residues",cex.lab=1.7,main="") points(pc,col="red",pch=3) rect(r[1],r[2],r[3],r[4]) dev.off()


  1. normalise scores

nScores <- normScores(datSorted,seqL) print("normalised Scores") print(nScores)

</source>