TSD createMap.pl
<source lang="perl">
- !/usr/bin/perl
use strict; use warnings; use diagnostics; use sigtrap; use autodie;
sub updateMissingDiseaseInfo; sub readInput; sub checkForDiseaseInfo; sub printDatabasesPerPosition; sub printAminoAcidSourcesAndTargets; sub printMutationMap; sub printSubsetsForPymol;
my $SEQUENCE_LENGTH = 529; my $REF_SEQUENCE = "MTSSRLWFSLLLAAAFAGRATALWPWPQNFQTSDQRYVLYPNNFQFQYDVSSAAQPGCSVLDEAFQRYRDLLFGSGSWPRPYLTGKRHTLEKNVLVVSVVTPGCNQLPTLESVENYTLTINDDQCLLLSETVWGALRGLETFSQLVWKSAEGTFFINKTEIEDFPRFPHRGLLLDTSRHYLPLSSILDTLDVMAYNKLNVFHWHLVDDPSFPYESFTFPELMRKGSYNPVTHIYTAQDVKEVIEYARLRGIRVLAEFDTPGHTLSWGPGIPGLLTPCYSGSEPSGTFGPVNPSLNNTYEFMSTFFLEVSSVFPDFYLHLGGDEVDFTCWKSNPEIQDFMRKKGFGEDFKQLESFYIQTLLDIVSSYGKGYVVWQEVFDNKVKIQPDTIIQVWREDIPVNYMKELELVTKAGFRALLSAPWYLNRISYGPDWKDFYIVEPLAFEGTPEQKALVIGGEACMWGEYVDNTNLVPRLWPRAGAVAERLWSNKLTSDLTFAYERLSHFRCELLRRGVQAQPLNVGFCEQEFEQT"; my @A_REF_SEQ = split(//, $REF_SEQUENCE);
my $INDEX_ID = 0; my $INDEX_DISEASE = 1; my $INDEX_MISSYN = 2;
my $SYM_NOT_DISEASE = "N/A";
my %mutations = (); my %aaPropensities = (); my @diseaseAnnoSources = ('OMIM', 'HGMD', 'snpdbe'); my $noDiseaseAnnoSources = 'dbSNP|SNPedia';
my $in = shift; #all mutations in one combined file
readInput(); updateMissingDiseaseInfo();
- Output
printDatabasesPerPosition(); printAminoAcidSourcesAndTargets(); printMutationMap(); printSubsetsForPymol();
sub printSubsetsForPymol {
my $fhs; my $fhd; my $fhnd; open($fhs, '>', 'resi_synonly'); open($fhd, '>', 'resi_nsyndisonly'); open($fhnd, '>','resi_nsynndisonly'); for my $pos (keys (%mutations)) { for my $mut (keys %{ $mutations{$pos} } ) { my $dis = 1; my $syn = 0; for my $db (keys %{ $mutations{$pos}{$mut} } ) { if($mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] =~ m|$SYM_NOT_DISEASE|) { $dis = 0; } if($mutations{$pos}{$mut}{$db}[$INDEX_MISSYN] =~ m|synonymous|) { $syn = 1; } } if($syn){ print $fhs $pos."\n"; } if($dis) { if(!$syn) { print $fhd $pos."\n"; } } else { if(!$syn) { print $fhnd $pos."\n"; } } } } close($fhs); close($fhd); close($fhnd);
}
sub printMutationMap
{
#This is just for statistics my $max_cnt_disease = 0; my $max_cnt_ndisease = 0; for my $pos (keys(%mutations)) { my $cnt_disease = 0; my $cnt_ndisease = 0; for my $mut (keys %{ $mutations{$pos} } ) { my $is_disease = 1; for my $db (keys %{ $mutations{$pos}{$mut} } ) {
- print $pos." $mut $db\t";
- print $mutations{$pos}{$mut}{$db}[$INDEX_DISEASE]." " .$mutations{$pos}{$mut}{$db}[$INDEX_ID] . " ". $mutations{$pos}{$mut}{$db}[$INDEX_MISSYN] . "\n";
if($mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] =~ m|$SYM_NOT_DISEASE|) { $is_disease = 0; } } (!$is_disease) ? $cnt_ndisease++ : $cnt_disease++; $max_cnt_disease = ($cnt_disease > $max_cnt_disease) ? $cnt_disease : $max_cnt_disease; $max_cnt_ndisease = ($cnt_ndisease > $max_cnt_ndisease) ? $cnt_ndisease : $max_cnt_ndisease; } }
- print "MAx ndisease $max_cnt_ndisease\n"; #MAx ndisease 2
- print "Max disease $max_cnt_disease\n"; #Max disease 3
#Let the hardcoding fun begin. We need one line in the middle and then three above for disease mutations and two below for non-disease mutations my @disease_arrays = ( [], [], [] ); my @ndisease_arrays = ( [], []); for my $pos (keys (%mutations)) { for my $mut (keys %{ $mutations{$pos} } ) { my @split_mut = split(//, $mut); my $aa_wt = $split_mut[0]; my $aa_mt = $split_mut[-1]; #Handling all differnet mutations at the current position independent of dis/ndis my $dis = 1; my $syn = 0; #Just enter this to see if it is disease for my $db (keys %{ $mutations{$pos}{$mut} } ) { if($mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] =~ m|$SYM_NOT_DISEASE|) { $dis = 0; } if($mutations{$pos}{$mut}{$db}[$INDEX_MISSYN] =~ m|synonymous|) { $syn = 1; } } if($syn){ $aa_mt = "$aa_mt"; } if($dis) { if(!$syn) { $aa_mt = "$aa_mt"; } addToArrays(\@disease_arrays, $pos, $aa_mt); } else { if(!$syn) { $aa_mt = "$aa_mt"; } addToArrays(\@ndisease_arrays, $pos, $aa_mt); } } } #Now recurse the arrays and fill the gaps for(my $i = 1; $i <= $SEQUENCE_LENGTH; $i++) { #Work down from upper disease layer to sequence to ndisease layer. I know, the efficiency is breathtaking fillLineArrays(\@disease_arrays, 2, $i); fillLineArrays(\@disease_arrays, 1, $i); fillLineArrays(\@disease_arrays, 0, $i);
- print $A_REF_SEQ[$i-1];
fillLineArrays(\@ndisease_arrays, 0, $i); fillLineArrays(\@ndisease_arrays, 1, $i); }
my $LINE_LENGTH = 100; for(my $i=0; $i < 6; $i++) # NOTE REALLY 6? { my $base = $i * 100; my $ind; for my $aref (reverse sort (@disease_arrays)) { my @a = @{$aref}; print " "; for(my $j = 1 ; $j <= $LINE_LENGTH ; $j++) { $ind = $base + $j; print $a[$ind]; } print "\t$ind\n"; } print "P06865 "; print ""; for(my $j = 1 ; $j <= $LINE_LENGTH ; $j++) { $ind = $base + $j -1; print $A_REF_SEQ[$ind]; } $ind++; print "\t$ind\n";
for my $aref (sort (@ndisease_arrays)) { my @a = @{$aref}; print " "; for(my $j = 1 ; $j <= $LINE_LENGTH ; $j++) { $ind = $base + $j; print $a[$ind]; } print "\t$ind\n"; } print "\n\n"; }
}
sub fillLineArrays {
my $arrayListRef = shift; my $layer = shift; my $pos = shift; my @arrayList = @{$arrayListRef}; $arrayList[$layer]->[$pos] = (exists $arrayList[$layer]->[$pos]) ? $arrayList[$layer]->[$pos] : ' ';
}
sub addToArrays {
my $arrayListRef = shift; my $pos = shift; my $aa = shift; my @arrayList = @{$arrayListRef}; for my $arr_ref (sort @arrayList) { if(!exists $arr_ref->[$pos]) { $arr_ref->[$pos] = $aa; last; } }
}
sub printAminoAcidSourcesAndTargets {
#Collect Data for my $pos (keys(%mutations)) { for my $mut ( keys ( %{$mutations{$pos}} ) ) { my @split = split(//,$mut); my $source = $split[0]; my $target = $split[-1]; if(!exists $aaPropensities{$source}) { $aaPropensities{$source} = [0 , 0]; } if(!exists $aaPropensities{$target}) { $aaPropensities{$target} = [0 , 0]; } $aaPropensities{$source}[0] += 1; $aaPropensities{$target}[1] += 1; } } my $fh; open($fh, '>', 'aaPropensities'); for my $aa (sort (keys(%aaPropensities))) { print $fh "\t$aa"; } print $fh "\nSource"; for my $aa (sort (keys(%aaPropensities))) { print $fh "\t". $aaPropensities{$aa}[0]; } print $fh "\nTarget"; for my $aa (sort (keys(%aaPropensities))) { print $fh "\t". $aaPropensities{$aa}[1]; } close($fh);
}
sub printDatabasesPerPosition {
my $fh; open($fh, '>', 'databasesPerPosition'); #For every position, print the number of databases that annotate a SNP at this position for(my $i = 0; $i<=$SEQUENCE_LENGTH; $i++) #Good old c-style for loop... ;) { my @foundDBs = (); if(exists $mutations{$i}) { for my $mut (keys %{$mutations{$i} } ) { for my $db (keys %{$mutations{$i}{$mut} } ) { if(! ($db ~~ @foundDBs ) ) { push(@foundDBs, $db); } } } }
my $num = scalar(@foundDBs); print $fh "$i\t$num\n"; } close($fh);
}
sub updateMissingDiseaseInfo {
for my $pos (keys(%mutations)) { for my $mut (keys %{ $mutations{$pos} } ) { for my $db (keys %{ $mutations{$pos}{$mut} } ) { if($db =~m/^$noDiseaseAnnoSources$/) { my $udpdatedInfo = checkForDiseaseInfo($pos, $mut, $db); $mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] = $udpdatedInfo; } } } }
}
sub checkForDiseaseInfo {
my $pos = shift; my $mut = shift; my $dbc = shift;
my @dbs = keys(%{$mutations{$pos}{$mut}}); my $updatedInfo = $SYM_NOT_DISEASE; # NOTE we assume no link to disease if we have no evidence it is linked to TSD for my $dball (@dbs) { if($dball ~~ @diseaseAnnoSources) { $updatedInfo = $mutations{$pos}{$mut}{$dball}[$INDEX_DISEASE]; last; } } return $updatedInfo;
}
sub readInput {
my $fh; open($fh, '<', $in); while(my $line = <$fh>) { if($line =~ m/(\w)(\d+)(\w)\t(.+)\t(.*)\t(.*)\t(missense|synonymous)/) #L127F HGMD CM970716 Tay-Sachs disease missense { my $mut = $1.$2.$3; my $db = $4; my $id = $5; my $disease = $6; my $mssyn = $7; my $pos = $2; if($db =~ m/snpdbe/) { $db = 'snpdbe'; } if(!exists $mutations{$pos}) { $mutations{$pos} = {};} if(!exists $mutations{$pos}{$mut}) { $mutations{$pos}{$mut} = {}; } $mutations{$pos}{$mut}{$db} = [ $id, $disease, $mssyn ]; } else { print "Cannot read line $line\n"; } } close($fh);
} </perl>