Difference between revisions of "Run.pl"

From Bioinformatikpedia
(Created page with "You can find the script run.pl here on biocluster: <code>/mnt/home/student/kalemanovm/master_practical/Assignment2_Alignments/scripts/task1</code>. The usage of the script with e…")
 
 
(12 intermediate revisions by the same user not shown)
Line 1: Line 1:
  +
Performs a BLAST/PSI-BLAST/HHblits run with custom parameters on biolab computers.
You can find the script run.pl here on biocluster: <code>/mnt/home/student/kalemanovm/master_practical/Assignment2_Alignments/scripts/task1</code>. The usage of the script with examples can also be found in the header of the file itself.
 
   
  +
You can find the script <code>run.pl</code> here on biocluster: <code>/mnt/home/student/kalemanovm/master_practical/Assignment2_Alignments/scripts/task1</code>. The usage of the script with examples can also be found in the header of the file itself.
Usage: perl run.pl --in <input sequence file> --out_b <output blast file> --out_p <output psiblast file> --out_h <output hhblits file> [options]
 
  +
--db_b <database blast/psiblast> default=big_80
 
  +
==Usage==
--db_h <database hhblits> default=uniprot20
 
  +
perl run.pl --in <input sequence file> --out_b <output blast file> --out_p <output psiblast file> --out_h <output hhblits file> [options] <br>
--iter_p <number of iterations psiblast> default=2 (in this script)
 
--iter_h <number of iterations hhblits,[1,8]> default=2
+
--db_b <database BLAST/Psi-BLAST> default=big_80 <br>
  +
--db_h <database HHblits> default=uniprot20 <br>
--c <checkpoins file name. ckeckpoint PSSM file should be created to search in other iterations (in other database)
 
  +
--iter_p <number of iterations Psi-BLAST> default=2 (in this script) <br>
--r <checkpoins file name. ckeckpoint PSSM file to be reused to search in other iterations (in other database)
 
  +
--iter_h <number of iterations HHblits,[1,8]> default=2 <br>
--h prints this usage
 
  +
--eval_p <E-Value cutoff for Psi-BLAST, [0,1]> default=0.002 <br>
  +
--eval_h <E-Value cutoff for HHblits, [0,1]> default=0.001 <br>
  +
--c <checkpoins Psi-BLAST file name> ckeckpoint PSSM file should be created to search in other iterations (in other database) <br>
  +
--r <checkpoins Psi-BLAST file name> ckeckpoint PSSM file to be reused to search in other iterations (in other database) <br>
  +
--h prints this usage <br>
  +
  +
==Code==
  +
<source lang="perl">
  +
#!/usr/bin/perl -w
  +
#################################################################################################################################################
  +
# run.pl
  +
# Performs a BLAST/PSI-BLAST/HHblits run with custom parameters on biolab computers.
  +
# @autor: Maria Kalemanov
  +
  +
#example usages: ( from directory /mnt/home/student/kalemanovm/master_practical/Assignment2_Alignments/output/task1, but please do not try to execute it from there, use you own directories;) )
  +
  +
#BLAST:
  +
#../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_b blast-big_80
  +
#../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_b blast-big -db_b big
  +
  +
#Psi-BLAST:
  +
#../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_p psiblast-2iter-big_80 --c psiblast-1iter-big_80.chk
  +
#../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_p psiblast-1iter-big_80--2iter_big --db_b big --r psiblast-1iter-big_80.chk --c psiblast-1iter-big_80--1iter_big.chk
  +
#../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_p psiblast-1iter-big_80--1iter_big--1iter_pdb --db_b pdb --iter_p 1 --r psiblast-1iter-big_80--1iter_big.chk
  +
  +
#HHblits:
  +
#../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_h hhblits-2iter-uniprot20
  +
#../../scripts/task1/run.pl --in hhblits-2iter-uniprot20.oa3m --out_h hhblits-2iter-uniprot20--1iter-pdb --db_h pdb --iter_h 1
  +
#################################################################################################################################################
  +
use strict;
  +
use File::chdir;
  +
use Getopt::Long;
  +
  +
  +
# Parse command line parameters
  +
our($in, $out_b, $out_p, $out_h, $db_b, $db_h, $iter_p, $iter_h, $eval_p, $eval_h, $c, $r, $h);
  +
  +
my $args_ok = GetOptions('in=s' => \$in, #full path of the input file with a protein sequence
  +
'out_b=s' => \$out_b, #path where the output file of BLAST should be saved
  +
'out_p=s' => \$out_p, #path where the output file of Psi-BLAST should be saved
  +
'out_h=s' => \$out_h, #path where the output file of HHblits should be saved
  +
#optional parameters:
  +
'db_b=s' => \$db_b, #BLAST/Psi-BLAST DB, e.g. big_80 or big
  +
'db_h=s' => \$db_h, #HHblits DB, e.g. uniprot20
  +
'iter_p=i' => \$iter_p, #Psi-BLAST number of iterations
  +
'iter_h=i' => \$iter_h, #HHblits number of iterations
  +
'eval_p=s' => \$eval_p, #E-Value cutoff for Psi-BLAST
  +
'eval_h=s' => \$eval_h, #E-Value cutoff for HHblits
  +
'c=s' => \$c, #Psi-BLAST: create a checkpoint PSSM file to use in further searches
  +
'r=s' => \$r, #Psi-BLAST: reuse a created checkpoint file
  +
'h' => \$h #print usage
  +
);
  +
  +
  +
sub print_usage {
  +
print "Usage: perl run.pl --in <input sequence file> --out_b <output blast file> --out_p <output psiblast file> --out_h <output hhblits file> [options]\n" .
  +
"--db_b <database BLAST/Psi-BLAST>\tdefault=big_80\n" .
  +
"--db_h <database HHblits>\tdefault=uniprot20\n" .
  +
"--iter_p <number of iterations Psi-BLAST>\tdefault=2 (in this script)\n" .
  +
"--iter_h <number of iterations HHblits,[1,8]>\tdefault=2\n" .
  +
"--eval_p <E-Value cutoff for Psi-BLAST, [0,1]>\tdefault=0.002\n".
  +
"--eval_h <E-Value cutoff for HHblits, [0,1]>\tdefault=0.001\n".
  +
"--c <checkpoins Psi-BLAST file name>\tckeckpoint PSSM file should be created to search in other iterations (in other database)\n" .
  +
"--r <checkpoins Psi-BLAST file name>\tckeckpoint PSSM file to be reused to search in other iterations (in other database)\n" .
  +
"--h prints this usage\n";
  +
}
  +
  +
sub print_err {
  +
print "Failed to execute run.pl\n";
  +
print_usage();
  +
exit;
  +
}
  +
  +
  +
if ($h){
  +
print_usage();
  +
exit;
  +
}
  +
if (!$in || !($out_b || $out_p || $out_h)){
  +
print_err();
  +
}
  +
  +
  +
#Databases:
  +
#for (Psi-)BLAST:
  +
my $big_80 = "/mnt/project/rost_db/data/big/big_80";
  +
my $big = "/mnt/project/rost_db/data/big/big";
  +
my $pdb_seqres = "/mnt/home/rost/kloppmann/data/blast_db/pdb_seqres";
  +
#for HHblits:
  +
my $uniprot20 = "/mnt/project/rost_db/data/hhblits/uniprot20_02Sep11";
  +
my $pdb_full = "/mnt/project/rost_db/data/hhblits/pdb_full";
  +
  +
  +
#Set $db_b from short databases names:
  +
if ($db_b){
  +
if ($db_b eq "big_80"){
  +
$db_b = $big_80;
  +
}elsif ($db_b eq "big"){
  +
$db_b = $big;
  +
}elsif ($db_b eq "pdb" || $db_b eq "pdb_seqres"){
  +
$db_b = $pdb_seqres;
  +
}
  +
}
  +
if ($db_h){
  +
if ($db_h eq "uniprot" || $db_h eq "uniprot20"){
  +
$db_h = $uniprot20;
  +
}elsif ($db_h eq "pdb" || $db_h eq "pdb_full"){
  +
$db_h = $pdb_full;
  +
}
  +
}
  +
  +
  +
#Set default parameters:
  +
my $num_shown_hits = 10000;
  +
if (!$db_b){
  +
$db_b = $big_80;
  +
}
  +
if (!$db_h){
  +
$db_h = $uniprot20;
  +
}
  +
if (!$iter_p){
  +
$iter_p = 2;
  +
}
  +
if (!$iter_h){
  +
$iter_h = 2;
  +
}
  +
if (!$eval_p){
  +
$eval_p = 0.002;
  +
}
  +
if (!$eval_h){
  +
$eval_h = 0.001;
  +
}
  +
  +
#Other variables:
  +
my $pssm_p = "";
  +
if ($out_p){
  +
$pssm_p = $out_p . "_pssm";
  +
}
  +
my $pssm_h = "";
  +
my $hhm = "";
  +
if ($out_h){
  +
$pssm_h = $out_h . ".a3m";
  +
$hhm = $out_h . ".hhm";
  +
$out_h = $out_h . ".hhr";
  +
}
  +
my $cmd_b; #command line parameters for BLAST
  +
my $cmd_p; #command line parameters for Psi-BLAST
  +
my $cmd_h; #command line parameters for HHblits
  +
  +
  +
#Run BLAST:
  +
if ($in && $out_b){
  +
$cmd_b = "time blastall -p blastp -i $in -d $db_b -v $num_shown_hits -b $num_shown_hits -o $out_b";
  +
system($cmd_b) && print "Failed to execute $cmd_b";
  +
}
  +
#if ($in && $out_b){
  +
# $cmd_b = "time blastpgp -i $in -d $db_b -j 1 -v $num_shown_hits -b $num_shown_hits -o $out_b";
  +
# system($cmd_b) && print "Failed to execute $cmd_b";
  +
#}
  +
  +
  +
# Run Psi-BLAST:
  +
if ($in && $out_p){
  +
if($c){
  +
$cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p -C $c";
  +
system($cmd_p) && print "Failed to execute $cmd_p";
  +
}elsif($r){
  +
$cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p -R $r -t 1";
  +
system($cmd_p) && print "Failed to execute $cmd_p";
  +
}elsif($r && $c){
  +
$cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p -R $r -t 1 -C $c";
  +
system($cmd_p) && print "Failed to execute $cmd_p";
  +
}else{ #normal search without a checkpoint file
  +
$cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p";
  +
system($cmd_p) && print "Failed to execute $cmd_p";
  +
}
  +
}
  +
  +
  +
# Run Hhblits:
  +
if ($in && $out_h){
  +
$cmd_h = "time hhblits -i $in -d $db_h -n $iter_h -e $eval_h -Z $num_shown_hits -B $num_shown_hits -o $out_h -oa3m $pssm_h -ohhm $hhm";
  +
system($cmd_h) && print "Failed to execute $cmd_h";
  +
}
  +
</source>

Latest revision as of 12:51, 21 May 2013

Performs a BLAST/PSI-BLAST/HHblits run with custom parameters on biolab computers.

You can find the script run.pl here on biocluster: /mnt/home/student/kalemanovm/master_practical/Assignment2_Alignments/scripts/task1. The usage of the script with examples can also be found in the header of the file itself.

Usage

perl run.pl --in <input sequence file> --out_b <output blast file> --out_p <output psiblast file> --out_h <output hhblits file> [options] 
--db_b <database BLAST/Psi-BLAST> default=big_80
--db_h <database HHblits> default=uniprot20
--iter_p <number of iterations Psi-BLAST> default=2 (in this script)
--iter_h <number of iterations HHblits,[1,8]> default=2
--eval_p <E-Value cutoff for Psi-BLAST, [0,1]> default=0.002
--eval_h <E-Value cutoff for HHblits, [0,1]> default=0.001
--c <checkpoins Psi-BLAST file name> ckeckpoint PSSM file should be created to search in other iterations (in other database)
--r <checkpoins Psi-BLAST file name> ckeckpoint PSSM file to be reused to search in other iterations (in other database)
--h prints this usage

Code

<source lang="perl">

  1. !/usr/bin/perl -w
  2. run.pl
  3. Performs a BLAST/PSI-BLAST/HHblits run with custom parameters on biolab computers.
  4. @autor: Maria Kalemanov
  1. example usages: ( from directory /mnt/home/student/kalemanovm/master_practical/Assignment2_Alignments/output/task1, but please do not try to execute it from there, use you own directories;) )
  1. BLAST:
  2. ../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_b blast-big_80
  3. ../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_b blast-big -db_b big
  1. Psi-BLAST:
  2. ../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_p psiblast-2iter-big_80 --c psiblast-1iter-big_80.chk
  3. ../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_p psiblast-1iter-big_80--2iter_big --db_b big --r psiblast-1iter-big_80.chk --c psiblast-1iter-big_80--1iter_big.chk
  4. ../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_p psiblast-1iter-big_80--1iter_big--1iter_pdb --db_b pdb --iter_p 1 --r psiblast-1iter-big_80--1iter_big.chk
  1. HHblits:
  2. ../../scripts/task1/run.pl --in ../../gaucher_data/P04062.fasta --out_h hhblits-2iter-uniprot20
  3. ../../scripts/task1/run.pl --in hhblits-2iter-uniprot20.oa3m --out_h hhblits-2iter-uniprot20--1iter-pdb --db_h pdb --iter_h 1

use strict; use File::chdir; use Getopt::Long;


  1. Parse command line parameters

our($in, $out_b, $out_p, $out_h, $db_b, $db_h, $iter_p, $iter_h, $eval_p, $eval_h, $c, $r, $h);

my $args_ok = GetOptions('in=s' => \$in, #full path of the input file with a protein sequence 'out_b=s' => \$out_b, #path where the output file of BLAST should be saved 'out_p=s' => \$out_p, #path where the output file of Psi-BLAST should be saved 'out_h=s' => \$out_h, #path where the output file of HHblits should be saved #optional parameters: 'db_b=s' => \$db_b, #BLAST/Psi-BLAST DB, e.g. big_80 or big 'db_h=s' => \$db_h, #HHblits DB, e.g. uniprot20

            			                'iter_p=i'		=> \$iter_p, #Psi-BLAST number of iterations

'iter_h=i' => \$iter_h, #HHblits number of iterations 'eval_p=s' => \$eval_p, #E-Value cutoff for Psi-BLAST 'eval_h=s' => \$eval_h, #E-Value cutoff for HHblits 'c=s' => \$c, #Psi-BLAST: create a checkpoint PSSM file to use in further searches 'r=s' => \$r, #Psi-BLAST: reuse a created checkpoint file 'h' => \$h #print usage );


sub print_usage { print "Usage: perl run.pl --in <input sequence file> --out_b <output blast file> --out_p <output psiblast file> --out_h <output hhblits file> [options]\n" . "--db_b <database BLAST/Psi-BLAST>\tdefault=big_80\n" . "--db_h <database HHblits>\tdefault=uniprot20\n" . "--iter_p <number of iterations Psi-BLAST>\tdefault=2 (in this script)\n" . "--iter_h <number of iterations HHblits,[1,8]>\tdefault=2\n" . "--eval_p <E-Value cutoff for Psi-BLAST, [0,1]>\tdefault=0.002\n". "--eval_h <E-Value cutoff for HHblits, [0,1]>\tdefault=0.001\n". "--c <checkpoins Psi-BLAST file name>\tckeckpoint PSSM file should be created to search in other iterations (in other database)\n" . "--r <checkpoins Psi-BLAST file name>\tckeckpoint PSSM file to be reused to search in other iterations (in other database)\n" . "--h prints this usage\n"; }

sub print_err { print "Failed to execute run.pl\n"; print_usage(); exit; }


if ($h){ print_usage(); exit; } if (!$in || !($out_b || $out_p || $out_h)){ print_err(); }


  1. Databases:

#for (Psi-)BLAST: my $big_80 = "/mnt/project/rost_db/data/big/big_80"; my $big = "/mnt/project/rost_db/data/big/big"; my $pdb_seqres = "/mnt/home/rost/kloppmann/data/blast_db/pdb_seqres"; #for HHblits: my $uniprot20 = "/mnt/project/rost_db/data/hhblits/uniprot20_02Sep11"; my $pdb_full = "/mnt/project/rost_db/data/hhblits/pdb_full";


  1. Set $db_b from short databases names:

if ($db_b){ if ($db_b eq "big_80"){ $db_b = $big_80; }elsif ($db_b eq "big"){ $db_b = $big; }elsif ($db_b eq "pdb" || $db_b eq "pdb_seqres"){ $db_b = $pdb_seqres; } } if ($db_h){ if ($db_h eq "uniprot" || $db_h eq "uniprot20"){ $db_h = $uniprot20; }elsif ($db_h eq "pdb" || $db_h eq "pdb_full"){ $db_h = $pdb_full; } }


  1. Set default parameters:

my $num_shown_hits = 10000; if (!$db_b){ $db_b = $big_80; } if (!$db_h){ $db_h = $uniprot20; } if (!$iter_p){ $iter_p = 2; } if (!$iter_h){ $iter_h = 2; } if (!$eval_p){ $eval_p = 0.002; } if (!$eval_h){ $eval_h = 0.001; }

  1. Other variables:

my $pssm_p = ""; if ($out_p){ $pssm_p = $out_p . "_pssm"; } my $pssm_h = ""; my $hhm = ""; if ($out_h){ $pssm_h = $out_h . ".a3m"; $hhm = $out_h . ".hhm"; $out_h = $out_h . ".hhr"; } my $cmd_b; #command line parameters for BLAST my $cmd_p; #command line parameters for Psi-BLAST my $cmd_h; #command line parameters for HHblits


  1. Run BLAST:

if ($in && $out_b){ $cmd_b = "time blastall -p blastp -i $in -d $db_b -v $num_shown_hits -b $num_shown_hits -o $out_b"; system($cmd_b) && print "Failed to execute $cmd_b"; }

  1. if ($in && $out_b){
  2. $cmd_b = "time blastpgp -i $in -d $db_b -j 1 -v $num_shown_hits -b $num_shown_hits -o $out_b";
  3. system($cmd_b) && print "Failed to execute $cmd_b";
  4. }


  1. Run Psi-BLAST:

if ($in && $out_p){ if($c){ $cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p -C $c"; system($cmd_p) && print "Failed to execute $cmd_p"; }elsif($r){ $cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p -R $r -t 1"; system($cmd_p) && print "Failed to execute $cmd_p"; }elsif($r && $c){ $cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p -R $r -t 1 -C $c"; system($cmd_p) && print "Failed to execute $cmd_p"; }else{ #normal search without a checkpoint file $cmd_p = "time blastpgp -i $in -d $db_b -j $iter_p -h $eval_p -v $num_shown_hits -b $num_shown_hits -o $out_p -Q $pssm_p"; system($cmd_p) && print "Failed to execute $cmd_p"; } }


  1. Run Hhblits:

if ($in && $out_h){ $cmd_h = "time hhblits -i $in -d $db_h -n $iter_h -e $eval_h -Z $num_shown_hits -B $num_shown_hits -o $out_h -oa3m $pssm_h -ohhm $hhm";

   system($cmd_h) && print "Failed to execute $cmd_h";

} </source>