Difference between revisions of "Task2 Hemochromatosis Protocol"

From Bioinformatikpedia
(HHblits)
(Scripts)
Line 310: Line 310:
   
 
=== MSA ===
 
=== MSA ===
  +
  +
=== Other ===

Revision as of 17:04, 7 May 2012

Sequence Searches

Blast

TSK="/mnt/home/student/bernhoferm/task2"
BIG="/mnt/project/pracstrucfunc12/data/big"

time blastall -p blastp -d $BIG/big_80 -i $TSK/Q30201.fasta -b 1500 -m 8 -o $TSK/align2/blast_default_1500.out

real	1m24.100s
user	1m5.950s
sys	0m3.500s

PSI-Blast

TSK="/mnt/home/student/bernhoferm/task2"
BIG="/mnt/project/pracstrucfunc12/data/big"


time blastpgp -C $TSK/h002j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j2_10000.out

real	3m21.031s
user	2m44.750s
sys	0m10.270s


time blastpgp -C $TSK/h002j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j10_10000.out

real	16m39.699s
user	14m22.150s
sys	0m41.310s


time blastpgp -C $TSK/h1e-10j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j2_10000.out

real	3m6.093s
user	2m48.260s
sys	0m6.020s


time blastpgp -C $TSK/h1e-10j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j10_10000.out

real	16m41.797s
user	15m18.350s
sys	0m28.890s


time blastpgp -R $TSK/h002j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j2_100000.out

real	28m17.459s
user	18m26.930s
sys	2m46.200s


time blastpgp -R $TSK/h002j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j10_100000.out

real	367m15.021s (srsly...)
user	310m31.210s
sys	14m39.460s
...and some thousand "[blastpgp] ERROR: ncbiapi [000.000]  ObjMgrNextAvailEntityID failed with idx 2048" messages


time blastpgp -R $TSK/h1e-10j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j2_100000.out

real	26m43.350s
user	19m54.380s
sys	2m3.900s


time blastpgp -R $TSK/h1e-10j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j10_100000.out

real	64m4.575s
user	47m16.970s
sys	4m54.890s
...and some thousand "[blastpgp] ERROR: ncbiapi [000.000]  ObjMgrNextAvailEntityID failed with idx 2048" messages

HHblits

TSK="/mnt/home/student/bernhoferm/task2"
HHB="/mnt/project/pracstrucfunc12/data/hhblits"


time hhblits -d $HHB/uniprot20_current -i $TSK/Q30201.fasta -Z 600 -B 600 -o $TSK/align2/hhblits_default_600.out

real	13m13.813s
user	5m4.400s
sys	2m45.520s

MSA

#!/bin/bash
time t_coffee /mnt/home/student/joerdensv/newGroups/group0-40.fasta -output=fasta_aln
time t_coffee /mnt/home/student/joerdensv/newGroups/group0-99.fasta -output=fasta_aln
time t_coffee /mnt/home/student/joerdensv/newGroups/group60-99.fasta -output=fasta_aln
time muscle -in /mnt/home/student/joerdensv/newGroups/group60-99.fasta -out group60-99muscle.fasta -quiet
time muscle -in /mnt/home/student/joerdensv/newGroups/group0-99.fasta -out group0-99muscle.fasta -quiet
time muscle -in /mnt/home/student/joerdensv/newGroups/group0-40.fasta -out group0-40muscle.fasta -quiet
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group60-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group60-99clustalw.fasta -output=fasta
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-99clustalw.fasta -output=fasta
time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-40.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-40clustalw.fasta -output=fasta

Scripts

(PSI-)BLAST

The output files from BLAST and PSI-BLAST were parsed and filtered with the following perl script. The PSI-BLAST output "filtered" for the last iteration step by hand (before using the script). If mutliple hits with the same ID were found, only the first one was considered (the one with the best e-Value).


>getM8StatsAndIDs.pl <source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $filename = shift; my $outputFile = "$filename.stats"; my $outputFile2 = "$filename.ids";

my %IDtoEVal; my %IDtoIdent;

my $id; my $eVal; my $ident;

open (FILE, "<", $filename) or die;

while (<FILE>) { my $line = $_;

my @content = split(/\t/, $line);

$id = $content[1]; $eVal = $content[10]; $ident = $content[2];

$id =~s/\s//gi; $eVal =~s/\s//gi; $ident =~s/\s//gi;

$id =~m/.*?\|(.*?)\|(.*?)$/gi;

my $uniID = $1; my $pdbID = $2;

if ($uniID=~m/^pdb$/gi) { $id = "PDB:".$pdbID; } else { $id = $uniID; }

if ($eVal=~m/^e/) { $eVal="1".$eVal; }

if (not defined $IDtoEVal{$id}) { $IDtoEVal{$id} = $eVal; $IDtoIdent{$id} = $ident; } }

close FILE;


  1. Sort Data
  2. my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;

my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;


  1. Thresholds

my $EValThreshold = 0.002; my $IdentThreshold = 0;


open (OUTFILE, ">", $outputFile) or die; open (OUTFILE2, ">", $outputFile2) or die;

foreach my $key (@keys) { if ($IDtoEVal{$key} <= $EValThreshold && $IDtoIdent{$key} >= $IdentThreshold) { print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n"; print OUTFILE2 "$key\n"; } }

close OUTFILE; close OUTFILE2; </source>

HHblits

The perl script bellow was used to parse and filter the HHblits output. For multiple hits again only the first one was considered.


<source lang="perl">

  1. !/usr/local/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

my $filename = shift; my $outputFile = "$filename.stats"; my $outputFile2 = "$filename.ids";

my %IDtoEVal; my %IDtoIdent; my %UniprotIDs;

my $id; my $eVal; my $ident; my $count = 1;


open (FILE, "<", $filename) or die;

while (<FILE>) { my $line = $_;

if ($line=~m/>UP20\|(.*?)\|.+/gi) { $id = $1;

my @tmp = split(/\|/, $line); my $uid;

for (my $i=4; $i < scalar(@tmp); $i++) { $uid = $tmp[$i]; $uid =~ s/\s*\[.*?\]//; $uid =~ s/\.//; $uid =~ s/\s//; $UniprotIDs{$uid}="alive"; } }

if ($line=~m/.*?E-value\s*=\s*(\d+?\D*?\d*?\D*?\d*?)\s+/gi) { $eVal = $1;

if ($eVal=~m/^e/) { $eVal="1".$eVal; }

if (not defined $IDtoEVal{$id}) { $IDtoEVal{$id} = $eVal; } }

if ($line=~m/.*?Identities\s*=\s*(\d*?)\D/gi) { $ident = $1;

if (not defined $IDtoIdent{$id}) { $IDtoIdent{$id} = $ident/100; } } }

close FILE;


  1. Sort Data
  2. my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;

my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;


  1. Thresholds

my $EValThreshold = 0.003; #To include the only cluster with a worse e-Value than 0.001 which has 0.002!


open (OUTFILE, ">", $outputFile) or die;

foreach my $key (@keys) { if ($IDtoEVal{$key} <= $EValThreshold) { print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n"; } }

close OUTFILE;


open (OUTFILE2, ">", $outputFile2) or die;

foreach my $key (keys %UniprotIDs) { print OUTFILE2 "$key\n"; }

close OUTFILE2; </source>

MSA

Other