#!/usr/bin/perl -w
# 27 September 2004
# Michael E Sparks (mespar1@iastate.edu)

# A tool to scrape out conserved nucleotide blocks in a T-COFFEE alignment according to
# CORE index / minimum block length thresholds.  Will ablate $ABLATE residues from both
# ends of a conserved block.
#
# Assumes T-COFFEE was run with at least the following parameters:
#   -seqnos=on \
#   -output=t_coffee,score_ascii,fasta_aln
#
# Use mrtrans or a clone (e.g., bpmrtrans.PLS of BioPerl) to create a DNA fasta alignment
#   from the protein alignment.
#
# Finally, use this script to extract blocks.

# Example usage for a collection of cDNA sequences ``cluster.dna.fst" and
#   their translation products ``cluster.pep.fst" :
# $ cat cluster.pep.fst | t_coffee \
#    -infile=stdin \
#    -outfile=PROT \
#    -type=PROTEIN \
#    -seqnos=on    \
#    -output=t_coffee,fasta_aln,score_ascii
# $ bp_mrtrans.PLS \
#    -i PROT.fasta_aln  \
#    -if fasta          \
#    -o DNA.aln.fst     \
#    -of fasta          \
#    -s cluster.dna.fst \
#    -sf fasta
# $ block-scraper.pl \
#  7  \
#  20 \
#  5  \
#  PROT.t_coffee \
#  PROT.score_ascii \
#  DNA.aln.fst > nucleotide.blocks

# Copyright (c) Michael E Sparks
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in
#       the documentation and/or other materials provided with the
#       distribution.
#     * Neither the name of the Iowa State University nor the names of
#       its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

use strict;

my $MINCORE=shift;
my $MINLEN=shift;
my $ABLATE=shift;
my $tcoff_file=shift;
my $score_file=shift;
my $dna_file=shift || die 
  "Usage: ./block-scraper.pl min_CORE minblocklen ablatelen *.t_coffee *.score_ascii DNA.aln.fst\n";

open(TCOFF,"<$tcoff_file") or die "Can't open $tcoff_file! $!\n";
open(SCORE,"<$score_file") or die "Can't open $score_file! $!\n";
open(DNA,"<$dna_file") or die "Can't open $dna_file! $!\n";

my ($line,$seqid,$scorestring,$alnseq)="";
my (%scores,%alnseqs,%alnseqsd)=();

# PARSE CORE INDEX STRINGS
until ($line=~m/:\s+\d+$/) { $line=<SCORE>; } # Scroll past overall scores
until ($line=~m/^$/) { $line=<SCORE>; }

$line=<SCORE>;
until ($line=~m/^$/) {
  $line=~m/^(\S{1,15}).*?\d+\s(.*?)\s+\d+\s$/;
  ($seqid,$scorestring)=($1,$2);
  $scores{$seqid}=$scorestring;
  $line=<SCORE>;
}
until ($line=~/^Cons/) { $line=<SCORE>; }
$line=<SCORE>;

while(defined(<SCORE>)) { 
  $line=<SCORE>;
  if ($line=~m/^$/) { last; }
  until ($line=~m/^$/) {
    $line=~m/^(\S{1,15}).*?\d+\s(.*?)\s+\d+\s$/;
    ($seqid,$scorestring)=($1,$2);
    $scores{$seqid}.=$scorestring;
    $line=<SCORE>;
  }
  until ($line=~/^Cons/) { $line=<SCORE>; }
  $line=<SCORE>;
}
close SCORE;

# PARSE PROTEIN SEQUENCE STRINGS FROM ALIGNMENT
until ($line=~m/^LEN /) { $line=<TCOFF>; } # Scroll past header
$line=~m/^LEN\s+(\d+)/; my $protalnlength=$1;
until ($line=~m/^$/) { $line=<TCOFF>; }

$line=<TCOFF>;
until ($line=~m/^\s+.*?\d+$/) {
  $line=~m/^(\S{1,15}).*?\s+\d+\s(.*?)\s+\d+\s$/;
  ($seqid,$alnseq)=($1,$2);
  $alnseqs{$seqid}=$alnseq;
  $line=<TCOFF>;
}

while(defined(<TCOFF>)) { 
  $line=<TCOFF>;
  if ($line=~m/^$/) { last; }
  until ($line=~m/^\s+.*?\d+$/) {
    $line=~m/^(\S{1,15}).*?\s+\d+\s(.*?)\s+\d+\s$/;
    ($seqid,$alnseq)=($1,$2);
    $alnseqs{$seqid}.=$alnseq;
    $line=<TCOFF>;
  }
}
close TCOFF;

#PARSE DNA ALIGNMENT STRINGS
$seqid=""; my $tempstring="";
while ($line=<DNA>) {
  if ($line=~m/^>/) {
    $line=~m/^>.*?(\S{1,15}).*?$/;
    $seqid=$1;
  }
  else {
    chomp($line);
    $alnseqsd{$seqid}.=$line;
  }
}
close DNA;

# Print strings for debug if needed
foreach my $key (sort keys %alnseqs) {
 if (length($alnseqs{$key}) != $protalnlength) { die "Inconsistency\n"; }
# print $key,"  ",$alnseqs{$key},"\n";
}
foreach my $key (sort keys %alnseqsd) {
 if (length($alnseqsd{$key}) != (3*$protalnlength)) { die "Inconsistency\n"; }
# print $key,"  ",$alnseqsd{$key},"\n";
}
foreach my $key (sort keys %scores)  {
 if (length($scores{$key}) != $protalnlength) { die "Inconsistency\n"; }
# print $key,"  ",$scores{$key},"\n";
}

# NOW, SCRAPE OUT THE BLOCKS
my $block_len=0;
my $block_ct=1;
my $blockcoltrue=1;
my $blocksettrue=0;
my $i=0;
my ($start,$stop)=0;
my @keys=keys(%alnseqs); # identical to keys(%scores)

for ($i=1;$i<=$protalnlength;++$i) {
  KEYPROGRESSION : foreach my $key (@keys) {
    my $char2test=substr($scores{$key},$i-1,1);
    if (($char2test eq '-')     ||
        ($char2test !~ m/[\d]/) ||
        ($char2test < $MINCORE))
      {
      if ($block_len >= $MINLEN) {
        $stop=$i-1;
        #Report it
        print "\n";
        $start+=$ABLATE; $stop-=$ABLATE;
        print "Block $block_ct ($start,$stop):\n";
        foreach my $key2 (@keys) {
          my $protstring=substr($alnseqs{$key2},$start-1,$stop-$start+1);
          my @protstring=split(//,$protstring);
          my $preprintstring="$key2                                                 ";
          $preprintstring=~m/^(.{20})/;
          print "$1";
          foreach my $protpiece (@protstring) {
            print "$protpiece   ";
          }
          print "\n";
          # Now report the coding data
          my $dnastring=substr($alnseqsd{$key2},3*($start-1),3*($stop-$start+1));
          my @dnastring=split(//,$dnastring);
          print "                   ";
          my $j=0;
          foreach my $dnapiece (@dnastring) {
            print "$dnapiece";
            ++$j; 
            if ($j%3 == 0 && $j>0) { print " "; }
          }
          print "\n";
        }
        ++$block_ct;
      }
      $block_len=0;
      $blockcoltrue=0;
      $blocksettrue=0;
      last KEYPROGRESSION;
    }
  } # end foreach
  if ($blockcoltrue) {
    if (!$blocksettrue) {
      $start=$i;
      $blocksettrue=1;
    }
    ++$block_len;
  }
  else {
    $blockcoltrue=1; # An alignment position has to be fouled to get set to 0
  }
}

# Does block persist until end of alignment?
if ($blocksettrue) {
  if ($block_len >= $MINLEN) {
    $stop=$protalnlength;
    #Report it
    print "\n";
    $start+=$ABLATE; $stop-=$ABLATE;
    print "Block $block_ct ($start,$stop):\n";
    foreach my $key2 (@keys) {
      my $protstring=substr($alnseqs{$key2},$start-1,$stop-$start+1);
      my @protstring=split(//,$protstring);
      my $preprintstring="$key2                                                 ";
      $preprintstring=~m/^(.{20})/;
      print "$1";
      foreach my $protpiece (@protstring) {
        print "$protpiece   ";
      }
      print "\n";
      # Now report the coding data
      my $dnastring=substr($alnseqsd{$key2},3*($start-1),3*($stop-$start+1));
      my @dnastring=split(//,$dnastring);
      print "                   ";
      my $j=0;
      foreach my $dnapiece (@dnastring) {
        print "$dnapiece";
        ++$j; 
        if ($j%3 == 0 && $j>0) { print " "; }
      }
      print "\n";
    }
    ++$block_ct;
  }
}
print "\n";

exit 0;

