Difference between revisions of "Task 10 - Journal (PKU)"

From Bioinformatikpedia
(Ramachandran Plot)
(Distance RMSD)
 
Line 492: Line 492:
 
echo 1|g_rmsdist -s _md.tpr -f _traj_norot.xtc -o _distance-rmsd.xvg
 
echo 1|g_rmsdist -s _md.tpr -f _traj_norot.xtc -o _distance-rmsd.xvg
 
</source>
 
</source>
  +
  +
[[Category: Phenylketonuria 2012]]

Latest revision as of 11:58, 29 August 2012

Initial checks

Verify completion, frame number and frame rate: <source lang="bash"> gmxcheck -f _md.xtc </source>

performance statistics: <source lang="bash"> tail _md.log </source>

Create Trajectories

<source lang="bash">

  1. fix jumps over the periodic boundary

trjconv -f _md.xtc -s _solv.pdb -pbc nojump -o _traj_nojump.xtc<<EOF0\nEOF

  1. center protein in the box

trjconv -f _traj_nojump.xtc -s _solv.pdb -center -o _traj_center.xtc<<EOF1\n0\nEOF

  1. eliminate rotations

trjconv -f _traj_center.xtc -s _solv.pdb -fit rot+trans -o _traj_norot.xtc<<EOF1\n0\nEOF

  1. create pdb file

trjconv -f _traj_norot.xtc -s _solv.pdb -o _traj.pdb<<EOF1\nEOF

  1. reduce fluctuations and write every fifth frame

g_filter -f _traj.pdb -s _solv.pdb -ol _traj_filtered.pdb -fit -nf 5<<EOF1\nEOF

</source>

Plot Energies

<source lang="bash"> echo 12 0 | g_energy -f _md.edr -o _temperature.xvg echo 13 0 | g_energy -f _md.edr -o _pressure.xvg echo -e "9\n10\n11 0" | g_energy -f _md.edr -o _energy.xvg echo 18 0 | g_energy -f _md.edr -o _volume.xvg echo 19 0 | g_energy -f _md.edr -o _density.xvg echo -e "15\n16\n17 0" | g_energy -f _md.edr -o _box.xvg echo -e "48\n50 0" | g_energy -f _md.edr -o _coulomb-inter.xvg echo -e "49\n51 0" | g_energy -f _md.edr -o _vanderwaals-inter.xvg </source>

Minimal Distance of Interactions

<source lang="bash"> echo 1 0 |g_mindist -f _md.xtc -s _md.tpr -od _minimal_periodic_distance.xvg -pi echo 3 0 |g_mindist -f _md.xtc -s _md.tpr -od _minimal_periodic_c_alpha_distance.xvg -pi </source>

RMSD Calculations

<source lang="bash"> echo 1 1|g_rmsf -f _md.xtc -s _md.tpr -o _rmsf-per-residue.xvg -ox _average.pdb -oq _bfactors.pdb -res echo 1 1 |g_rms -f _traj_norot.xtc -s _md.tpr -o _rmsd-all-atom-vs-start.xvg echo 4 4 |g_rms -f _traj_norot.xtc -s _md.tpr -o _rmsd-backbone-vs-start.xvg echo 1 |trjconv -f _traj_norot.xtc -s _md.tpr -o _protein.xtc echo 1 1 |g_rms -f _protein.xtc -s _average.pdb -o _rmsd-all-atom-vs-average.xvg echo 4 4 |g_rms -f _protein.xtc -s _average.pdb -o _rmsd-backbone-vs-average.xvg </source>

Difference of RMSF

distributed by Marc: <source lang="perl">

  1. !/usr/bin/perl -w

$swExit = 0;

if(@ARGV < 2){ print STDERR "statTD.pl <RMSF FILE1> <RMSF FILE2> [START] [END]\n"; exit; } else{ if(! -e "$ARGV[0]"){ print STDERR "File $ARGV[0] does not exist!\n"; $swExit = 1; } if(! -e "$ARGV[1]"){ print STDERR "File $ARGV[1] does not exist!\n"; $swExit = 1; } } if(defined $ARGV[2]){

if(not defined $ARGV[3]){ print STDERR "END value missing!\n"; $swExit = 1; } elsif($ARGV[2] !~ m/[0-9]+/ || $ARGV[3] !~ m/[0-9]+/){ print STDERR "Problem with the ranges please check!\n"; $swExit = 1; } } if($swExit){exit;} if(defined $ARGV[2] && defined $ARGV[3] && $ARGV[2] > $ARGV[3]){ $tmp = $ARGV[2]; $ARGV[2] = $ARGV[3]; $ARGV[3] = $tmp; }


open(FILE,$ARGV[0]); while(<FILE>){ chomp($_); if(/^\s*([0-9]+)\s+([0-9]+\.[0-9]+)/){ #print "$1 $2\t"; if(defined $ARGV[2] && defined $ARGV[3] && $1 >= $ARGV[2] && $1 <= $ARGV[3]){ $in1{$1} = $2; #print "1\n"; } elsif(not defined $ARGV[2]){ $in1{$1} = $2; #print "$ARGV[2] $ARGV[3] 3\n"; }


} } close FILE; open(FILE,$ARGV[1]); while(<FILE>){ chomp($_); if(/^\s*([0-9]+)\s+([0-9]+\.[0-9]+)/){ #print "$1 $2\n"; if(defined $ARGV[2] && defined $ARGV[3] && $1 >= $ARGV[2] && $1 <= $ARGV[3]) { $in2{$1} = $2; } elsif(not defined $ARGV[2]){ $in2{$1} = $2; } } } close FILE; if((keys %in1) != (keys %in2)){ print STDERR "Number of datapoints not identical!\n"; exit; }

$value = &getPV(\%in1,\%in2);

print "$value"; if($value <= 0.001){ print "**\n"; } elsif($value <= 0.05){ print "*\n"; } else{ print "-\n"; } sub getPV{ my (%data1) = %{$_[0]}; my (%data2) = %{$_[1]}; my $avg = 0; my $cnt = 0; my $std = 0; my $ste = 0; my $ratio = 0;

foreach $key (sort keys %data1){ #print "$key\n"; $avg += ($data1{$key} - $data2{$key}); $cnt++; } $avg /= $cnt; foreach $key (sort keys %data1){ $std += (($data1{$key} - $data2{$key}) - $avg)**2; } $std /= $cnt; $std = sqrt($std); $ste = $std/sqrt($cnt); $ratio = abs($avg/$ste); open(OUT,">/tmp/rscript.tmp"); print OUT "ratio <- c($ratio);\n"; print OUT "dfree <- c($cnt);\n"; print OUT "dt(ratio,dfree);\n"; close OUT; $value = `Rscript /tmp/rscript.tmp`; chomp($value); `rm -f /tmp/rscript.tmp`; return (split/\s+/,$value)[1]; }

</source>

Radius of Gyration

<source lang="bash"> echo 1 0 |g_gyrate -f _traj_norot.xtc -s _md.tpr -o _radius-of-gyration.xvg echo 1 0 |g_gyrate -f _traj_norot.xtc -s _md.tpr -o _moments_of_inertia.xvg -moi </source>

Accessible Surface

<source lang="bash"> echo 1 1 |g_sas -f _traj_norot.xtc -s _md.tpr -o _solvent-accessible-surface.xvg -oa _atomic-sas.xvg -or _residue-sas.xvg </source>

Hydrogen Bonds

<source lang="bash"> echo 1 1 | g_hbond -f _traj_norot.xtc -s _md.tpr -num _hydrogen-bonds-intra-protein.xvg echo 1 12 | g_hbond -f _traj_norot.xtc -s _md.tpr -num _hydrogen-bonds-protein-water.xvg </source>

Secondary Structure

<source lang="bash"> wget http://swift.cmbi.ru.nl/gv/dssp/HTML/dsspcmbi chmod +x dsspcmbi export DSSP=$HOME/dsspcmbi echo 1|do_dssp -f _traj_norot.xtc -s _md.tpr -o _secondary-structure.xpm -sc _secondary-structure.xvg -dt 10 xpm2ps -f _secondary-structure.xpm -o _secondary-structure.eps </source>

Ramachandran Plot

<source lang="bash"> g_rama -f _traj_norot.xtc -s _md.tpr -o _ramachandran.xvg </source>

Somewhat nicer and modified from here. Extract the desired time steps in the traj_filtered.pdb with your favorite method, clean residues unknown to the script and use the script below with the pdb-file as parameter. Or use the whole trajectory-pdb, if you have the memory and want a nice black page. :-) Requires numpy, rpy (modules available in the ubuntu repositories) and MMTK (download). <source lang="bash"> python ramachandran.py <time-step> </source>

<source lang="python">

  1. !/usr/bin/env python
  2. usage: ramachandran.py <pdb-file>

rama_GENERAL = "General" rama_GLYCINE = "Glycine" rama_PROLINE = "Proline" rama_PRE_PRO = "Pre-Pro" ramachandran_types = [rama_GENERAL,rama_GLYCINE,rama_PROLINE,rama_PRE_PRO]

  1. I have used the same colours as RAMPAGE
  2. http://raven.bioc.cam.ac.uk/rampage.php

rama_settings = {"General" : ([0, 0.0005, 0.02, 1],

                             ['#FFFFFF','#B3E8FF','#7FD9FF'],
                             "pct/rama/rama500-general.data"),
                             # or rama500-general-nosec.data
                "Glycine" : ([0, 0.002,  0.02, 1],
                             ['#FFFFFF','#FFE8C5','#FFCC7F'],
                             "pct/rama/rama500-gly-sym.data"),
                             # or rama500-gly-sym-nosec.data
                "Proline" : ([0, 0.002,  0.02, 1],
                             ['#FFFFFF','#D0FFC5','#7FFF8C'],
                             "pct/rama/rama500-pro.data"),
                "Pre-Pro" : ([0, 0.002,  0.02, 1],
                             ['#FFFFFF','#B3E8FF','#7FD9FF'],
                             "pct/rama/rama500-prepro.data")}
                #P.S. Also rama500-ala-nosec.data

import numpy def load_data_file(filename) :

   STEP=2
   HALF_STEP=1
   STEP = HALF_STEP*2
   lower_bounds = range(-180, 180, STEP)
   mid_points = range(-180+HALF_STEP, 180+HALF_STEP, STEP)
   upper_bounds = range(-180+STEP, 180+STEP, STEP)
   data = numpy.array([[0.0 for x in mid_points] for y in mid_points],
                        dtype=numpy.float)
   """
   # Table name/description: "Top500 General case (not Gly, Pro, or pre-Pro) B<30"
   # Number of dimensions: 2
   # For each dimension, 1 to 2: lower_bound  upper_bound  number_of_bins  wrapping
   #   x1: -180.0 180.0 180 true
   #   x2: -180.0 180.0 180 true
   # List of table coordinates and values. (Value is last number on each line.)
   -179.0 -179.0 0.0918642445114388
   -179.0 -177.0 0.07105717866463215
   ...
   """
   input_file = open(filename,"r")
   for line in input_file :
       #Strip the newline character(s) from the end of the line
       if line[-1]=="\n" : line = line[:-1]
       if line[-1]=="\r" : line = line[:-1]
       if line[0]=="#" :
           #comment
           pass
       else :
           #data
           parts = line.split()
           assert len(parts)==3
           
           x1 = float(parts[0]) #phi
           x2 = float(parts[1]) #psi
           value = float(parts[2])
           
           assert x1 == float(int(x1))
           assert x2 == float(int(x2))
           i1 = mid_points.index(int(x1))
           i2 = mid_points.index(int(x2))
           
           data[i1,i2]=value
   input_file.close()
   return (data, lower_bounds, mid_points, upper_bounds)
  1. filename = "stat/rama/rama500-general.data"
  2. data, lower_bounds, mid_points, upper_bounds = load_data_file(filename)
  3. print sum(sum(data))

from rpy import * r.library("MASS")

print "Creating R function", r(""" ramachandran.plot <- function(x.scatter, y.scatter,

   x.grid = seq(0, 1, len = nrow(z)), y.grid = seq(0, 1, len = ncol(z)), z.grid,
   xlim = range(x.grid, finite = TRUE), ylim = range(y.grid, finite = TRUE),
   zlim = range(z.grid, finite = TRUE), levels = pretty(zlim, nlevels),
   nlevels = 20, color.palette = cm.colors, col = color.palette(length(levels) -
       1), plot.title="", plot.axes, key.title, key.axes, asp = NA,
   xaxs = "i", yaxs = "i", las = 1, axes = TRUE, frame.plot = axes,
   ...)

{

   if (missing(z.grid)) {
       stop("no 'z.grid' matrix specified")
   }
   else if (is.list(x.grid)) {
       y.grid <- x.grid$y
       x.grid <- x.grid$x
   }
   if (any(diff(x.grid) <= 0) || any(diff(y.grid) <= 0))
       stop("increasing 'x.grid' and 'y.grid' values expected")
   plot.new()
   plot.window(xlim, ylim, "", xaxs = xaxs, yaxs = yaxs, asp = asp)
   if (!is.matrix(z.grid) || nrow(z.grid) <= 1 || ncol(z.grid) <= 1)
       stop("no proper 'z.grid' matrix specified")
   if (!is.double(z.grid))
       storage.mode(z.grid) <- "double"
   .Internal(filledcontour(as.double(x.grid), as.double(y.grid), z.grid, as.double(levels),
       col = col))
   if (!(missing(x.scatter)) && !(missing(y.scatter))) {
       plot.xy(xy.coords(x.scatter,y.scatter,NULL,NULL,NULL,NULL),
               xlim=xlim, ylim=ylim, xlab="", ylab="", asp=asp,
               type="p", pch=20, cex=0.1)
   }
       
   if (missing(plot.axes)) {
       if (axes) {
           title(main=plot.title, xlab=expression(phi), ylab=expression(psi))
           axis(1, at=c(-180,-90,0,90,180))
           axis(2, at=c(-180,-90,0,90,180))
       }
   }
   else plot.axes
   if (frame.plot)
       box()
   if (missing(plot.title))
       title(...)
   else plot.title
   invisible()

} """) print "Done"


import math def degrees(rad_angle) :

   """Converts and angle in radians to degrees, mapped to the range [-180,180]"""
   angle = rad_angle * 180 / math.pi
   #Note this assume the radians angle is positive as that's what MMTK does
   while angle > 180 :
       angle = angle - 360
   return angle

def next_residue(residue) :

   """Expects an MMTK residue, returns the next residue in the chain, or None"""
   #Proteins go N terminal --> C terminal
   #The next reside is bonded to the C of this atom...
   for a in residue.peptide.C.bondedTo():
       if a.parent.parent != residue:
           return a.parent.parent
   return None


def residue_amino(residue) :

   """Expects an MMTK residue, returns the three letter amino acid code in upper case"""
   if residue :
       return residue.name[0:3].upper()
   else :
       return None

def residue_ramachandran_type(residue) :

   """Expects an MMTK residue, returns ramachandran 'type' (General, Glycine, Proline or Pre-Pro)"""
   if residue_amino(residue)=="GLY" :
       return rama_GLYCINE
   elif residue_amino(residue)=="PRO" :
       return rama_PROLINE
   elif residue_amino(next_residue(residue))=="PRO" :
       #exlcudes those that are Pro or Gly
       return rama_PRE_PRO
   else :
       return rama_GENERAL

scatter_phi = dict() scatter_psi = dict() for ramachandran_type in ramachandran_types :

   scatter_phi[ramachandran_type]=[]
   scatter_psi[ramachandran_type]=[]
   

pdb=sys.argv[1] pdb_filename = "%s.pdb" % pdb

print "Loading PDB file: " + pdb_filename import MMTK.PDB import MMTK.Proteins

  1. protein = MMTK.Proteins.Protein("1HMP.pdb", model="no_hydrogens")
  2. Load the PDB file, ignore the hydrogrens, and then build a model of the peptides:

configuration = MMTK.PDB.PDBConfiguration(pdb_filename) configuration.deleteHydrogens() protein = MMTK.Proteins.Protein(configuration.createPeptideChains(model = "no_hydrogens")) for chain in protein :

   print chain.name
   for residue in chain :
       phi, psi = residue.phiPsi()
       #print residue.name, phi, psi
       if phi and psi :
           ramachandran_type = residue_ramachandran_type(residue)
           assert ramachandran_type in ramachandran_types
           scatter_phi[ramachandran_type].append(degrees(phi))
           scatter_psi[ramachandran_type].append(degrees(psi))
       assert len(scatter_phi) == len(scatter_psi)
   

print "Done"

  1. pdf_filename = "../%s.pdf" % pdb
  2. r.pdf(pdf_filename)

png_filename = "%s.png" % pdb r.png(png_filename)

  1. To get four plots on one page, you could use :
  2. r.split_screen([2,2]) #split into two by two screen
  3. Or:
  4. r.layout(Numeric.array([[1,2],[3,4]]), respect=True)
  5. But I went for simply:

r.par(mfrow=[2,2])

for (i,ramachandran_type) in enumerate(ramachandran_types) :

   #pdf_filename = "../%s_%s.pdf" % (pdb, ramachandran_type)
   (rama_levels, rama_colors, rama_filename) = rama_settings[ramachandran_type]
   
   print "Loading data file: " + rama_filename,
   data, lower_bounds, mid_points, upper_bounds = load_data_file(rama_filename)
   print "Done"
   #print "Creating PDF output file: " + pdf_filename,
   #r.pdf(pdf_filename)
   #r.plot(scatter_phi, scatter_psi)
   print "Generating quadrant %i, %s" % (i+1, ramachandran_type)
   #r.screen(i+1)
   #Use small margins to make the plots nice and big,
   #and specify a SQUARE plot area (to go with aspect ratio, asp=1)
   r.par(mar = [2, 2, 2, 2], pty="s")
   #This function will do a Ramachandran plot in the next quadrant
   #which we setup using par(mfrow-...)
   r.ramachandran_plot(x_scatter=scatter_phi[ramachandran_type],
                       y_scatter=scatter_psi[ramachandran_type], 
                       x_grid=mid_points, y_grid=mid_points, z_grid=data,
                       xlim=[-180,180], ylim=[-180,180], asp=1.0,
                       plot_title=ramachandran_type, drawlabels=False,
                       levels=rama_levels, col=rama_colors)
   print ramachandran_type + " Done"

r.dev_off() print "Done" </source>

more RMSDs

<source lang="bash"> echo -e "1\n1\n\n"|g_rms -s _md.tpr -f _traj_norot.xtc -f2 _traj_norot.xtc -m _rmsd-matrix.xpm -dt 10 echo -e "6\n6\n\n"|g_rms -s _md.tpr -f _traj_norot.xtc -f2 _traj_norot.xtc -m _rmsd-matrix_chain-c_b.xpm -dt 10 xpm2ps -f _rmsd-matrix.xpm -o _rmsd-matrix.eps -rainbow blue xpm2ps -f _rmsd-matrix_chain-c_b.xpm -o _rmsd-matrix_chain-c_b.eps -rainbow blue </source>

Cluster Analysis

<source lang="bash"> echo 6 6 | g_cluster -s _md.tpr -f _traj_norot.xtc -dm _rmsd-matrix_chain-c_b.xpm -dist _rmsd-distribution.xvg -o _clusters.xpm -sz _cluster-sizes.xvg -tr _cluster-transitions.xpm -ntr _cluster-transitions.xvg -clid _cluster-id-over-time.xvg -cl _clusters.pdb -cutoff 0.1 -method gromos -dt 10 </source>

Distance RMSD

<source lang="bash"> echo 1|g_rmsdist -s _md.tpr -f _traj_norot.xtc -o _distance-rmsd.xvg </source>