#!/usr/bin/perl
# Create "grammars" for Asterisk-Sphinx integration.
# (c) 2009, Christopher Jansen
#
# Based on SimpleLM by Ricky Houghton
#
use strict;
use FileHandle;

# Require all three positional arguments before doing any work.
# The argument count is tested (instead of the truth of $ARGV[2]) so that an
# OUTGRAMMAR literally named "0" is accepted; an empty OUTGRAMMAR is still
# rejected. Usage goes to STDERR and the exit status is non-zero so that a
# calling script can tell that nothing was generated.
if(@ARGV < 3 || $ARGV[2] eq '')
{
  print STDERR ("usage: $0 INTEXT INOUTDICT OUTGRAMMAR\nexample: $0 ./test.txt ./text.dict mygrammar\n");
  print STDERR ("  INTEXT: plain-text file with one sentence to recognize per line\n");
  print STDERR ("  INOUTDICT: dictionary file to create; note that switching grammars requires all grammars to share a dictionary\n");
  print STDERR ("  OUTGRAMMAR: grammar file to create\n\n");
  print STDERR ("Edit script to change location of master dictionary, temp files, or sphinx binaries.\n");
  exit(1);
}

# Positional arguments, taken off the front of @ARGV in order:
#   $intext         - file of sentences to recognize
#   $outdictfile    - dictionary file to create or update
#   $outgrammarfile - grammar (language model) file to write
my ($intext, $outdictfile, $outgrammarfile) = splice(@ARGV, 0, 3);

# Site settings - edit these to match the local installation.
my $indictfile   = "/opt/sphinx/cmudict/dict.0.7a";  # master pronunciation dictionary (read only)
my $sphinxbindir = "/opt/sphinx/bin";                # directory containing the toolkit binaries
my $workdir      = "/tmp";                           # where intermediate files are written
my $CLEANUP      = 1;                                # true: delete the intermediate files at the end

# Normalize the input sentences and write them to $workdir/text.
# Each line is reduced to letters, apostrophes and spaces, lowercased, and
# wrapped in <s> ... </s> sentence markers.
print "Creating $workdir/text\n";
# The open mode is passed as a separate argument so that characters in the
# file name can never be interpreted as part of the mode.
my $ifh = FileHandle->new($intext, "r") || die("Cannot read: $intext: $!");
my $tfh = FileHandle->new("$workdir/text", "w") || die("Cannot write: $workdir/text: $!");
while(my $line = <$ifh>)
{
  chomp $line;
  # The earlier pattern [^[A-Z][a-z] ] was parsed as a negated class, a second
  # class and a literal " ]", so it removed nothing. This single negated class
  # strips everything except letters, apostrophes and spaces. Apostrophes are
  # kept so contractions stay intact for the dictionary lookup.
  $line =~ s/[^A-Za-z' ]//g;
  # The dictionary is written in lowercase, so the training text must match.
  $line = lc($line);
  # A line with no letters left is not a sentence; emitting it would add an
  # empty "<s>  </s>" entry to the training text.
  next if($line !~ m/[a-z]/);
  $tfh->print("<s> $line </s>\n");
}
$ifh->close;
# Buffered write errors only surface at close, so check it.
$tfh->close || die("Cannot write: $workdir/text: $!");

# Create the file that is later handed to idngram2lm through its -context
# option. It contains only the sentence-start marker <s>.
# NOTE(review): presumably this tells the toolkit to treat <s> as a
# context-only token - confirm against the idngram2lm documentation.
# The file is written only when it is missing or empty.
if(!-s "$workdir/ccs.ccs")
{
  print "Creating $workdir/ccs.ccs\n";
  my $ccs = FileHandle->new("$workdir/ccs.ccs", "w") || die("Cannot write to $workdir/ccs.ccs\n");
  $ccs->print("<s>");
  $ccs->close;
}

# Create the word-frequency list (wfreq) and, from it, the vocabulary (vocab)
# by running the toolkit binaries through the shell.
# Each stage is [ file it should produce, shell command ]. The commands send
# their own diagnostics to /dev/null, so a failed stage is reported here
# instead of being silently ignored. Only the exit status of the last command
# in a pipeline is visible, so an empty output file is treated as a failure
# signal as well. Failures are warnings: the script carries on as before.
my $cmdline;     # most recent shell command (reused by the later stages)
my $progoutput;  # captured stdout of that command (reused by the later stages)
foreach my $stage (
  ["$workdir/wfreq", "cat $workdir/text | $sphinxbindir/text2wfreq 2>/dev/null | sort -T . > $workdir/wfreq"],
  ["$workdir/vocab", "cat $workdir/wfreq | $sphinxbindir/wfreq2vocab 2>/dev/null > $workdir/vocab"],
)
{
  my $target;
  ($target, $cmdline) = @$stage;
  print "Creating $target\n";
  $progoutput = qx/$cmdline/;
  if($? != 0)
  {
    print STDERR "Warning: command failed (wait status $?): $cmdline\n";
  }
  elsif(!-s $target)
  {
    print STDERR "Warning: command produced no output in $target: $cmdline\n";
  }
}

# Create or update dict.
# %vhash is the set of (lowercased) words the output dictionary must contain.
# If the output dictionary already exists, its words are loaded first so they
# stay in the regenerated file. Only the words are remembered: pronunciations
# are looked up again in the master dictionary when the file is rewritten.
my %vhash = (); # lowercased word => 1
if(-e $outdictfile)
{
  # print, not printf: the file name must not be used as a format string.
  print("Dictionary $outdictfile exists, so preserving existing entries.\n");
  my $fh = FileHandle->new($outdictfile, "r") || die("Cannot read: $outdictfile\n");
  while(my $line=<$fh>)
  {
    chomp $line;
    # Capture the word: everything up to the first whitespace or "(" (which
    # starts a variant marker such as "(2)"); a leading paren character is
    # allowed as part of the word itself.
    if($line =~ m/^([()]?[^(\s]+)/)
    {
      $vhash{lc($1)} = 1;
    }
    else
    {
      print STDERR "Error parsing line from existing output dict: '$line'\n";
    }
  }
  $fh->close();
}

# Fold every line of the freshly generated vocab file, lowercased, into the
# vocabulary hash.
my $vfh = FileHandle->new("< $workdir/vocab") || die("Cannot open: $workdir/vocab");
my @vocab_lines = $vfh->getlines;
$vfh->close();
chomp(@vocab_lines);
$vhash{lc($_)} = 1 foreach @vocab_lines;

# Write the output dictionary: every entry of the master dictionary whose
# lowercased word is in %vhash, including numbered variants such as WORD(2).
print("Outputting new dictionary $outdictfile\n");
# Open the master dictionary first. Opening the output file truncates it, so
# doing that before the master is known to be readable could destroy an
# existing dictionary and leave nothing in its place.
my $idfh   = FileHandle->new($indictfile, "r") || die("Cannot open: $indictfile\n");
my $dofh   = FileHandle->new($outdictfile, "w") || die("Cannot open for writing: $outdictfile\n");
while(my $line = <$idfh>)
{
  next if($line =~ m/^;;;/); # Skip comment lines
  chomp($line);

  # $1 = word, $2 = optional variant marker "(N)", $3 = pronunciation.
  # The pronunciation must contain at least one non-space character; a line
  # without one is reported as unparseable rather than written as an entry
  # with no pronunciation.
  if($line =~ m/^([()]?[^(\s]+)(\(\d+\))?\s+(\S.*)$/)
  {
    my ($word, $variant, $phones) = (lc($1), defined($2) ? $2 : "", $3);
    if(exists $vhash{$word})
    {
      $dofh->print(sprintf("%-30s%s\n", $word.$variant, $phones));
    }
  }
  else
  {
    print STDERR "Cannot parse dictionary line: '$line'\n";
  }
}
$idfh->close();
# Buffered write errors only surface at close, so check it.
$dofh->close() || die("Cannot finish writing: $outdictfile: $!\n");

# Create the word n-grams (wngram), the id n-grams (idngram) and finally the
# grammar (ARPA-format language model) by running the toolkit binaries
# through the shell.
# Each stage is [ file it should produce, shell command ]. The commands send
# their own diagnostics to /dev/null, so a failed stage is reported here
# instead of being silently ignored. Only the exit status of the last command
# in a pipeline is visible, so an empty output file is treated as a failure
# signal as well. Failures are warnings: the script carries on as before.
foreach my $stage (
  ["$workdir/wngram", "cat $workdir/text | $sphinxbindir/text2wngram -temp $workdir 2>/dev/null > $workdir/wngram"],
  ["$workdir/idngram", "cat $workdir/wngram | $sphinxbindir/wngram2idngram -vocab $workdir/vocab -temp $workdir 2>/dev/null > $workdir/idngram"],
  [$outgrammarfile, "$sphinxbindir/idngram2lm -vocab $workdir/vocab -idngram $workdir/idngram -arpa $outgrammarfile -vocab_type 1 -good_turing -disc_ranges 1 7 7 -calc_mem -context $workdir/ccs.ccs -four_byte_counts -verbosity 1 2>/dev/null "],
)
{
  my $target;
  ($target, $cmdline) = @$stage;
  print "Creating $target\n";
  $progoutput = qx/$cmdline/;
  if($? != 0)
  {
    print STDERR "Warning: command failed (wait status $?): $cmdline\n";
  }
  elsif(!-s $target)
  {
    print STDERR "Warning: command produced no output in $target: $cmdline\n";
  }
}

# Optionally delete interim files from the work directory.
if($CLEANUP)
{
  foreach my $tfn (qw{ccs.ccs text wngram vocab idngram wfreq})
  {
    # print, not printf: the path must not be used as a format string.
    print("Deleting: $workdir/$tfn\n");
    # Report a file that could not be removed; a file that was never created
    # (for example because an earlier stage failed) is not an error here.
    if(!unlink("$workdir/$tfn") && -e "$workdir/$tfn")
    {
      print STDERR "Could not delete $workdir/$tfn: $!\n";
    }
  }
}

