Task2 Hemochromatosis Protocol
Contents
Sequence Searches
Blast
TSK="/mnt/home/student/bernhoferm/task2" BIG="/mnt/project/pracstrucfunc12/data/big" time blastall -p blastp -d $BIG/big_80 -i $TSK/Q30201.fasta -b 1500 -m 8 -o $TSK/align2/blast_default_1500.out real 1m24.100s user 1m5.950s sys 0m3.500s
PSI-Blast
TSK="/mnt/home/student/bernhoferm/task2" BIG="/mnt/project/pracstrucfunc12/data/big" time blastpgp -C $TSK/h002j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j2_10000.out real 3m21.031s user 2m44.750s sys 0m10.270s time blastpgp -C $TSK/h002j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h0.002_j10_10000.out real 16m39.699s user 14m22.150s sys 0m41.310s time blastpgp -C $TSK/h1e-10j2.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j2_10000.out real 3m6.093s user 2m48.260s sys 0m6.020s time blastpgp -C $TSK/h1e-10j10.chk -d $BIG/big_80 -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 10000 -m 8 -o $TSK/align2/blastpgp_h1e-10_j10_10000.out real 16m41.797s user 15m18.350s sys 0m28.890s time blastpgp -R $TSK/h002j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j2_100000.out real 28m17.459s user 18m26.930s sys 2m46.200s time blastpgp -R $TSK/h002j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 0.002 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h0.002_j10_100000.out real 367m15.021s (srsly...) user 310m31.210s sys 14m39.460s ...and some thousand "[blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048" messages time blastpgp -R $TSK/h1e-10j2.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 2 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j2_100000.out real 26m43.350s user 19m54.380s sys 2m3.900s time blastpgp -R $TSK/h1e-10j10.chk -d $BIG/big -i $TSK/Q30201.fasta -h 1e-10 -j 10 -b 100000 -m 8 -o $TSK/align2/blastpgp_big_h1e-10_j10_100000.out real 64m4.575s user 47m16.970s sys 4m54.890s ...and some thousand "[blastpgp] ERROR: ncbiapi [000.000] ObjMgrNextAvailEntityID failed with idx 2048" messages
HHblits
TSK="/mnt/home/student/bernhoferm/task2" HHB="/mnt/project/pracstrucfunc12/data/hhblits" time hhblits -d $HHB/uniprot20_current -i $TSK/Q30201.fasta -Z 600 -B 600 -o $TSK/align2/hhblits_default_600.out real 13m13.813s user 5m4.400s sys 2m45.520s
MSA
#!/bin/bash time t_coffee /mnt/home/student/joerdensv/newGroups/group0-40.fasta -output=fasta_aln time t_coffee /mnt/home/student/joerdensv/newGroups/group0-99.fasta -output=fasta_aln time t_coffee /mnt/home/student/joerdensv/newGroups/group60-99.fasta -output=fasta_aln time muscle -in /mnt/home/student/joerdensv/newGroups/group60-99.fasta -out group60-99muscle.fasta -quiet time muscle -in /mnt/home/student/joerdensv/newGroups/group0-99.fasta -out group0-99muscle.fasta -quiet time muscle -in /mnt/home/student/joerdensv/newGroups/group0-40.fasta -out group0-40muscle.fasta -quiet time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group60-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group60-99clustalw.fasta -output=fasta time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-99.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-99clustalw.fasta -output=fasta time clustalw -align -infile=/mnt/home/student/joerdensv/newGroups/group0-40.fasta -outfile=/mnt/home/student/joerdensv/newGroups/group0-40clustalw.fasta -output=fasta
Scripts
(PSI-)BLAST output
The output files from BLAST and PSI-BLAST were parsed and filtered with the following perl script. The PSI-BLAST output "filtered" for the last iteration step by hand (before using the script). If mutliple hits with the same ID were found, only the first one was considered (the one with the best e-Value).
>getM8StatsAndIDs.pl
<source lang="perl">
- !/usr/local/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
my $filename = shift; my $outputFile = "$filename.stats"; my $outputFile2 = "$filename.ids";
my %IDtoEVal; my %IDtoIdent;
my $id; my $eVal; my $ident;
open (FILE, "<", $filename) or die;
while (<FILE>) { my $line = $_;
my @content = split(/\t/, $line);
$id = $content[1]; $eVal = $content[10]; $ident = $content[2];
$id =~s/\s//gi; $eVal =~s/\s//gi; $ident =~s/\s//gi;
$id =~m/.*?\|(.*?)\|(.*?)$/gi;
my $uniID = $1; my $pdbID = $2;
if ($uniID=~m/^pdb$/gi) { $id = "PDB:".$pdbID; } else { $id = $uniID; }
if ($eVal=~m/^e/) { $eVal="1".$eVal; }
if (not defined $IDtoEVal{$id}) { $IDtoEVal{$id} = $eVal; $IDtoIdent{$id} = $ident; } }
close FILE;
- Sort Data
- my @keys = sort{$IDtoEVal{$a} <=> $IDtoEVal{$b}} keys %IDtoEVal;
my @keys = sort{$IDtoIdent{$b} <=> $IDtoIdent{$a}} keys %IDtoIdent;
- Thresholds
my $EValThreshold = 0.002; my $IdentThreshold = 0;
open (OUTFILE, ">", $outputFile) or die;
open (OUTFILE2, ">", $outputFile2) or die;
foreach my $key (@keys) { if ($IDtoEVal{$key} <= $EValThreshold && $IDtoIdent{$key} >= $IdentThreshold) { print OUTFILE "$key\t$IDtoEVal{$key}\t$IDtoIdent{$key}\n"; print OUTFILE2 "$key\n"; } }
close OUTFILE; close OUTFILE2; </source>