TSD createMap.pl

From Bioinformatikpedia

<source lang="perl">

  1. !/usr/bin/perl

use strict; use warnings; use diagnostics; use sigtrap; use autodie;

sub updateMissingDiseaseInfo; sub readInput; sub checkForDiseaseInfo; sub printDatabasesPerPosition; sub printAminoAcidSourcesAndTargets; sub printMutationMap; sub printSubsetsForPymol;

my $SEQUENCE_LENGTH = 529; my $REF_SEQUENCE = "MTSSRLWFSLLLAAAFAGRATALWPWPQNFQTSDQRYVLYPNNFQFQYDVSSAAQPGCSVLDEAFQRYRDLLFGSGSWPRPYLTGKRHTLEKNVLVVSVVTPGCNQLPTLESVENYTLTINDDQCLLLSETVWGALRGLETFSQLVWKSAEGTFFINKTEIEDFPRFPHRGLLLDTSRHYLPLSSILDTLDVMAYNKLNVFHWHLVDDPSFPYESFTFPELMRKGSYNPVTHIYTAQDVKEVIEYARLRGIRVLAEFDTPGHTLSWGPGIPGLLTPCYSGSEPSGTFGPVNPSLNNTYEFMSTFFLEVSSVFPDFYLHLGGDEVDFTCWKSNPEIQDFMRKKGFGEDFKQLESFYIQTLLDIVSSYGKGYVVWQEVFDNKVKIQPDTIIQVWREDIPVNYMKELELVTKAGFRALLSAPWYLNRISYGPDWKDFYIVEPLAFEGTPEQKALVIGGEACMWGEYVDNTNLVPRLWPRAGAVAERLWSNKLTSDLTFAYERLSHFRCELLRRGVQAQPLNVGFCEQEFEQT"; my @A_REF_SEQ = split(//, $REF_SEQUENCE);

my $INDEX_ID = 0; my $INDEX_DISEASE = 1; my $INDEX_MISSYN = 2;

my $SYM_NOT_DISEASE = "N/A";

my %mutations = (); my %aaPropensities = (); my @diseaseAnnoSources = ('OMIM', 'HGMD', 'snpdbe'); my $noDiseaseAnnoSources = 'dbSNP|SNPedia';


my $in = shift; #all mutations in one combined file

readInput(); updateMissingDiseaseInfo();

  1. Output

printDatabasesPerPosition(); printAminoAcidSourcesAndTargets(); printMutationMap(); printSubsetsForPymol();

sub printSubsetsForPymol {

   my $fhs;
   my $fhd;
   my $fhnd;
   
   open($fhs, '>', 'resi_synonly');
   open($fhd, '>', 'resi_nsyndisonly');
   open($fhnd, '>','resi_nsynndisonly');
   
   for my $pos (keys (%mutations))
   {
       for my $mut (keys %{ $mutations{$pos} } )
       {
           my $dis = 1;
           my $syn = 0;
           for my $db (keys %{ $mutations{$pos}{$mut} } )
           {
               if($mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] =~ m|$SYM_NOT_DISEASE|)
               {
                   $dis = 0;
               }
               if($mutations{$pos}{$mut}{$db}[$INDEX_MISSYN] =~ m|synonymous|)
               {
                   $syn = 1;
               }
           }
           
           if($syn){ print $fhs $pos."\n"; } 
           
           if($dis)
           {
               if(!$syn) { print $fhd $pos."\n"; }
           }
           else
           {
               if(!$syn) { print $fhnd $pos."\n"; }
           }
       }
   }
   
   close($fhs);
   close($fhd);
   close($fhnd);
   

}


sub printMutationMap {

   #This is just for statistics
   my $max_cnt_disease = 0;
   my $max_cnt_ndisease = 0;
   for my $pos (keys(%mutations))
   {
       my $cnt_disease = 0;
       my $cnt_ndisease = 0;
       
       for my $mut (keys %{ $mutations{$pos} } )
       {
           my $is_disease = 1;
           for my $db (keys %{ $mutations{$pos}{$mut} } )
           {
  1. print $pos." $mut $db\t";
  2. print $mutations{$pos}{$mut}{$db}[$INDEX_DISEASE]." " .$mutations{$pos}{$mut}{$db}[$INDEX_ID] . " ". $mutations{$pos}{$mut}{$db}[$INDEX_MISSYN] . "\n";
               if($mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] =~ m|$SYM_NOT_DISEASE|)
               {
                   $is_disease = 0;
               }
           }
           
           (!$is_disease) ? $cnt_ndisease++ : $cnt_disease++;
           
           $max_cnt_disease = ($cnt_disease > $max_cnt_disease) ? $cnt_disease : $max_cnt_disease;
           $max_cnt_ndisease = ($cnt_ndisease > $max_cnt_ndisease) ? $cnt_ndisease : $max_cnt_ndisease;
       }
   }
   
  1. print "MAx ndisease $max_cnt_ndisease\n"; #MAx ndisease 2
  2. print "Max disease $max_cnt_disease\n"; #Max disease 3
   #Let the hardcoding fun begin. We need one line in the middle and then three above for disease mutations and two below for non-disease mutations
   my @disease_arrays = ( [], [], [] );
   my @ndisease_arrays = ( [], []);
   
   for my $pos (keys (%mutations))
   {
       for my $mut (keys %{ $mutations{$pos} } )
       {
           my @split_mut = split(//, $mut);
           my $aa_wt = $split_mut[0];
           my $aa_mt = $split_mut[-1];
           
           #Handling all differnet mutations at the current position independent of dis/ndis
           my $dis = 1;
           my $syn = 0;
           #Just enter this to see if it is disease
           for my $db (keys %{ $mutations{$pos}{$mut} } )
           {
               if($mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] =~ m|$SYM_NOT_DISEASE|)
               {
                   $dis = 0;
               }
               if($mutations{$pos}{$mut}{$db}[$INDEX_MISSYN] =~ m|synonymous|)
               {
                   $syn = 1;
               }
           }
           
           if($syn){ $aa_mt = "$aa_mt"; } 
           
           if($dis)
           {
               if(!$syn) { $aa_mt = "$aa_mt"; }
               addToArrays(\@disease_arrays, $pos, $aa_mt);
           }
           else
           {
               if(!$syn) { $aa_mt = "$aa_mt"; }
               addToArrays(\@ndisease_arrays, $pos, $aa_mt);
           }
       }
   }
   
   
   #Now recurse the arrays and fill the gaps
   for(my $i = 1; $i <= $SEQUENCE_LENGTH; $i++)
   {
       #Work down from upper disease layer to sequence to ndisease layer. I know, the efficiency is breathtaking
       fillLineArrays(\@disease_arrays, 2, $i);
       fillLineArrays(\@disease_arrays, 1, $i);
       fillLineArrays(\@disease_arrays, 0, $i);
  1. print $A_REF_SEQ[$i-1];
       fillLineArrays(\@ndisease_arrays, 0, $i);
       fillLineArrays(\@ndisease_arrays, 1, $i);
   }
   my $LINE_LENGTH = 100;
   for(my $i=0; $i < 6; $i++)      # NOTE REALLY 6?
   {
       my $base = $i * 100;
      
        my $ind;
       for my $aref (reverse sort (@disease_arrays))
       {
           my @a = @{$aref};
           print "          ";
           for(my $j = 1 ; $j <= $LINE_LENGTH ; $j++)
           {
               $ind = $base + $j;     
               print $a[$ind];
           }
           print "\t$ind\n";
       }
       
       print "P06865    ";
      
       print "";
       for(my $j = 1 ; $j <= $LINE_LENGTH ; $j++)
       {
           $ind = $base + $j -1;     
           print $A_REF_SEQ[$ind];
       }
       $ind++;
       print "\t$ind\n";
       for my $aref (sort (@ndisease_arrays))
       {
           my @a = @{$aref};
           
           print "          ";
           for(my $j = 1 ; $j <= $LINE_LENGTH ; $j++)
           {
               $ind = $base + $j;     
               print $a[$ind];
           }
           print "\t$ind\n";
       }
       
       print "\n\n";
       
   }
   

}

sub fillLineArrays {

   my $arrayListRef = shift;
   my $layer = shift;
   my $pos = shift;
   my @arrayList = @{$arrayListRef};
   
   $arrayList[$layer]->[$pos] = (exists $arrayList[$layer]->[$pos]) ? $arrayList[$layer]->[$pos] : ' ';

}

sub addToArrays {

   my $arrayListRef = shift;
   my $pos = shift;
   my $aa = shift;
   
   my @arrayList = @{$arrayListRef};
   
   for my $arr_ref (sort @arrayList)
   {
       if(!exists $arr_ref->[$pos])
       {
           $arr_ref->[$pos] = $aa;
           last;
       }
   }

}

sub printAminoAcidSourcesAndTargets {

   #Collect Data
   for my $pos (keys(%mutations))
   {
       for my $mut ( keys ( %{$mutations{$pos}} ) )
       {
           my @split = split(//,$mut);
           my $source = $split[0];
           my $target = $split[-1];
           
           if(!exists $aaPropensities{$source})
           {
               $aaPropensities{$source} = [0 , 0];
           }
           if(!exists $aaPropensities{$target})
           {
               $aaPropensities{$target} = [0 , 0];
           }
           
           $aaPropensities{$source}[0] += 1;
           $aaPropensities{$target}[1] += 1;
       }
   }
   
   my $fh;
   open($fh, '>', 'aaPropensities');
   
   for my $aa (sort (keys(%aaPropensities)))
   {
       print $fh "\t$aa";
   }
   print $fh "\nSource";
   
   for my $aa (sort (keys(%aaPropensities)))
   {
       print $fh "\t". $aaPropensities{$aa}[0];
   }
   print $fh "\nTarget";
   
   for my $aa (sort (keys(%aaPropensities)))
   {
       print $fh "\t". $aaPropensities{$aa}[1];
   }
   
   close($fh);
   

}

sub printDatabasesPerPosition {

   my $fh;
   open($fh, '>', 'databasesPerPosition');
   #For every position, print the number of databases that annotate a SNP at this position
   for(my $i = 0; $i<=$SEQUENCE_LENGTH; $i++)  #Good old c-style for loop... ;)
   {
       my @foundDBs = ();
       if(exists $mutations{$i})
       {
           for my $mut (keys %{$mutations{$i} } )
           {
               for my $db (keys %{$mutations{$i}{$mut} } )
               {
                   if(! ($db ~~ @foundDBs ) )
                   {
                       push(@foundDBs, $db);
                   }
               }
           }
       }
       my $num = scalar(@foundDBs);
       print $fh "$i\t$num\n";
   }
   close($fh);

}

sub updateMissingDiseaseInfo {

   for my $pos (keys(%mutations))
   {
       for my $mut (keys %{ $mutations{$pos} } )
       {
           for my $db (keys %{ $mutations{$pos}{$mut} } )
           {
               if($db =~m/^$noDiseaseAnnoSources$/)
               {
                   my $udpdatedInfo = checkForDiseaseInfo($pos, $mut, $db);
                   $mutations{$pos}{$mut}{$db}[$INDEX_DISEASE] = $udpdatedInfo;
               }
           }
       }
   }

}

sub checkForDiseaseInfo {

   my $pos = shift;
   my $mut = shift;
   my $dbc = shift;
   my @dbs = keys(%{$mutations{$pos}{$mut}});
   my $updatedInfo = $SYM_NOT_DISEASE;       # NOTE we assume no link to disease if we have no evidence it is linked to TSD
   
   for my $dball (@dbs)
   {
       if($dball ~~ @diseaseAnnoSources)
       {
           $updatedInfo = $mutations{$pos}{$mut}{$dball}[$INDEX_DISEASE];
           last;
       }
   }
   return $updatedInfo;

}

sub readInput {

   my $fh;
   open($fh, '<', $in);
   while(my $line = <$fh>) 
   {
       if($line =~ m/(\w)(\d+)(\w)\t(.+)\t(.*)\t(.*)\t(missense|synonymous)/) #L127F   HGMD    CM970716    Tay-Sachs disease   missense
       {
           my $mut = $1.$2.$3;
           my $db = $4;
           my $id = $5;
           my $disease = $6;
           my $mssyn = $7;
           my $pos = $2;
           
           if($db =~ m/snpdbe/)
           {
               $db = 'snpdbe';
           }
           if(!exists $mutations{$pos}) { $mutations{$pos} = {};}
           if(!exists $mutations{$pos}{$mut}) { $mutations{$pos}{$mut} = {}; }
           $mutations{$pos}{$mut}{$db} = [ $id, $disease, $mssyn ];
       }
       else
       {
           print "Cannot read line $line\n";
       }
   }
   close($fh);

} </perl>