#!/usr/bin/perl -w
use English; # things like MATCH

# --------------------------------------------------------------

# txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho
# the generated phoneme file is suitable for use with MBROLA,
# but you have to use the -e option in MBROLA to skip over
# spurious unpronounceable phoneme pairs (e.g. caused by English
# words in your Dutch text file!).
# Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play"

# This is GPLed software (open source freeware) by
# Eric Auer <eric@coli.uni-sb.REMOVEthisIFyouAREnoSPAMMER.de>, the license is the GNU GPL
# version 2 or later, also available as copying.txt in this
# directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002)

# Please give me some feedback: As I am no native speaker
# of Dutch, this txt2phoNL definitely need some improvement!

# --------------------------------------------------------------

# new version 14 feb 2002:
# - sanitize away illegal phone pairs in a last step,
#   includes devoicing of consonants before a break.
# - intermediate repn uses one char per phoneme.
# - simpler rewrite mechanism eats all matched chars
#   and produces only phones - so the text string is constant.
#   BUT: restart from " " if the rule input ended in " " !
# - steps: 1. digit/... names
#          2. sound pattern rules (preferring long matches,
#             walking the string and trying all rules per char)
#          3. sanitize and get final repn from intermediate one

my $XLATEDEBUG = 4; # show all translation rule applications
                    # of at least this size

# special one char repn:
# _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5,
# oi is  6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question
# , is comma
 
open(STRING,">/dev/stderr") || die "cannot open debug log\n";
# open(STRING,">nl2pho.log") || die "cannot open debug log\n";
my $foo;

$OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every
                       # write/print, do not buffer output
$/ = undef;       # do not split on line breaks
                  # $/ is $RS, record separator in use English
my $text0 = <STDIN>;   # read stdin
my $text = " ";  # other stage (start with a space)
my $phones = " "; # phoneme one-char-per-phoneme repn

# by the way: a "^>*" remover would be nice for mails...


# g vs G vs x: regen [reG@n] goal [goL]  gage [xaZe]
# where the G (voiced "ch") is a dialect alternative to x ("ch")
# and the g only occurs in foreign words.

# e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver]  pet [pEt]
# this is the len: e is long, is ee or e-at-end-of-syll.
# E is short, is default, kind of.


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# first step: reduce the alphabet by spelling out specials
# result: a plain [a-z.? ]* string

my %special = ("0","null",   "=","is",
               "1","een",    "!","uitroepteken.",
               "2","twee",   '"',"aanhaalingsteken",
               "3","drie",
               "4","vier",   "\$","dollar",
               "5","vijf",   "%","percent",
               "6","zes",    "&","en",
               "7","zeven",  "/","door",
               "8","acht",   "(","haakje?",
               "9","negen",  ")","eind haakje,",
               "*","ster",   "\\","backslash",
               "+","plus",   "?","vraagteken?",
               "#","hekje",  "|","pijpteken?",
               ".","punt.",  "_","onderstreepje",
               ",","koma,",  "-","streepje",
               ">","groter", ";","semikolon?",
               "<","kleiner",":","dubbele punt?",
               "^","dakje",  "@","aapestaartje",
               "°","grad",   "{","accolade",
               "[","hoekje", "]","eind hoekje,",
               "~","tilde",  "}","eind accolade,"
              );

# use this: punt. koma, vraagteken?
#  or that: .     ,     ?
# the latter has the problem that a . or , or ?
# surrounded by spaces just sounds like a space...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

$text0 =~ s/^[>]*//g; # un-mailify the text :-)
$text0 =~ s{://}{ }g; # http:// and similar stuff
$text0 =~ s{:-[)]}{lachend gezicht}g;    # smiley
$text0 =~ s{:-[(]}{treurig gezicht}g;    # smiley
$text0 =~ s{;-[)]}{knipoogend gezicht}g; # smiley


for my $char (split(//,$text0)) {
  $char = lc($char);
  $char = "eu" if ($char =~ /öÖ/); # approximately :-)
  $char = "ae" if ($char =~ /äÄ/); # could be better
  $char = "uu" if ($char =~ /Üü/); # should also be for &euml;
  if (defined $special{$char}) {
    $text .= " " unless ($text =~ / $/);
    $text .= $special{$char} . " ";
  } elsif ($char =~ /[a-z]/) {
    $text .= $char;
  } else {
    $text .= " " unless ($text =~ / $/);
  }   # simplify all whitespace/linebreak stretches
      # and other special chars to single spaces
}
$text .= " " x 5; # end with spaces!

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# second step: apply phoneme pattern rules (prefer longest
# match, eat up all left side apart from trailing space,
# produce pure phoneme list)

my %five  = (" lijk"," l1k", "lijk ",'l@k ',
             "elijk","El1k",
             "+size+",5
            );

my %four  = ("http","ha te te pe ",
             "html","ha te em el ",
             " je "," je ", "agen","axEn",
             "ooie","oie",  "ooit","oIt",
             "hou ","h4w",  " pc ","pe se ",
             "even",'ev@n',
             "+size+",4
            );

my %three = ("aai","5" , "ooi","6" ,  "oei","7",
             "ai ","8" , "oi ","9",
             "tje",'c@', "age","aZe",
             "ch ","x" , "ftp","ef te pe ",
             "www","we we we ",
             "htm","ha te em ",
             "://","  ", "je ",'j@',
             "tp:", "te pe ",
             "eeu", "e2", "en ",'@n', "he ","he","eij","1",
             "+size+",3
            ); # :// does not work, because at this point
               # is is "dubbele punt door door" already!

my %two   = ("ie","i" , "oe","u" , "uu","y",
             "aa","a" , "ee","e" , "oo","o",
             "eu","2" ,            "ei","1",
             "ui","3" , "ou","4" , "ij","1",
             "sj","S" , "g ","x" , "nj","J",
             "l ","l" , "ng","N" ,
             "dt","t" , "ch","x" , "iu","ju",
             "dl",'d@l',           "lf",'l@f',
             "bb","b" , "dd","d" , "e ",'@',
             "d ","t" , "hr","r" , "hl","l",
             "o ","o" , "e ","e" , "yl","1l",
             "zl","z l",
             "+size+",2
            ); # hr/hl/yl/zl: sane processing
               # of foreign words

my %one   = ("a","A", "b","b", "c","k",
             "d","d", "e","E", "f","f",
             "g","x", "h","h", "i","I",
             "j","j", "k","k", "l","l",
             "m","m", "n","n", "o","O",
             "p","p", "q","k", "r","r",
             "s","s", "t","t", "u","Y",
             "v","v", "w","w", "x","ks",
             "y","j", "z","z", " "," ",
             ".",".", "?","?", ",",",",
             "+size+",1
            ); # prosody with [ ?.,] is a later step

my @todo = (\%five, \%four, \%three, \%two, \%one);

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

$phones = " ";
my $x = 0;     # string index
while ($x < (length($text)-5)) {
  my $y = 0;
  for my $hashref (@todo) {    # do l-longest rules first...
    if ($y != 0) { next; }
    my $check = substr($text,$x,$hashref->{"+size+"});
    if (defined $hashref->{$check}) {
      $phones .= $hashref->{$check};
      $x += $hashref->{"+size+"};
      $x-- if (($check =~ / $/) && ($check ne " "));
      # skip over matched part, but rewind on " " suffix
      $y++;
      print STDERR "Translate: <$check> to /"
        .   $hashref->{$check} . "/\n"
        if  (length($check) >= $XLATEDEBUG);
    }
  }
  if ($y == 0) {
    print STDERR "Had to translate first char to NIL:\n";
    print STDERR "<" . substr($text,$x,10) . "...>\n";
    $phones .= " ";
    $x++;
  }
}

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# third step: convert to SAMPA alphabet and apply constraints
# on phoneme pairings (input: $phones string)

my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai",
             "6","oi", "7","ui", "8","Ai", "9","Oi",
             " ","_",  "?","_",  ".","_",  ",","_"
            );

my $Pvowel  = "aeiouAEIOy2Y13456789";
my $Pdipht  = "56789";
my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience
my $Pconson = "ptkbdgcfvszSZxGhmn";
my $Pvoice2 = "czZGhJj";
my $Psemi   = "GNJL";
my $Pspace  = ".?,_ ";

# rules:

# handled above: no "EY" or "IY" (replace by eY and iY)
# handled above: no d before l (add schwa)
# handled above: common case of bb and dd (replace by b and d)
# handled above: commod case of d_ (devoice to t_)

# no voiced/semi doubled (replace by single occurance)
# no schwa   before OR AFTER dipht (remove schwa)
# no voice2  before l, r or j (add schwa)
# no voiced  before conson (add schwa ; duplication rule first)
# no conson  before semi (add schwa)
# special case of next rule: j-E (replace by j-@)
# no dipht before or after vowel/j (insert " ", see above)
# no voiced  at the end of a word (devoice)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

$text = "";
my $adder = ""; # buffer before we really add the phone!
my $freq = 200; # freq in Hz, only used at " " for now
my $dur = 100;  # duration in msec
my $ph;         # current phoneme
my $ph0 = " ";  # previous phoneme
# the prosody and rhythm are still extremely simple

foreach $ph (split(//,$phones)) {

  if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) {
    print STDERR "${ph}-$ph: removeone $ph\n";
    $adder = ""; # ignore first copy of double phoneme
  } elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) {
    # remove the previous schwa
    # (or just insert a short "h")
    $adder = "";
    print STDERR "\@-$ph: remove \@\n";
  } elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) {
    # remove the current schwa
    $ph = "";
    print STDERR "${ph0}-\@: remove \@\n";
  } elsif (  (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/))
          || (($ph0 =~ /[${Pvoiced}]/) && 
              ($ph  =~ /[${Pconson}]/))
          || (($ph0 =~ /[${Pconson}]/) &&
              ($ph  =~ /[${Psemi}]/))
          ) {
    $adder .= "\@ 50\n";
    print STDERR "${ph0}-$ph: insert schwa\n";
  } elsif (($ph0 eq "j") && ($ph eq "E")) {
    print STDERR "j-E: changeto j-\@\n";
    $ph = "@"; # modify this part this time...
  } elsif (  (($ph0 =~ /[${Pdipht}j]/) && 
              ($ph  =~ /[${Pvowel}j]/))
          || (($ph0 =~ /[${Pvowel}j]/) && 
              ($ph  =~ /[${Pdipht}j]/))
          ) {
    if ($ph0 eq "j") {
      $adder = "i 100\n";
      print STDERR "${ph0}-$ph: changeto i-$ph\n";
    }
    if ($ph eq "j") {
      print STDERR "${ph0}-$ph: changeto ${ph0}-i\n";
      $ph = "i";
    }
    if (($ph0 ne "j") && ($ph ne "j")) {
      $adder .= "_ 50\n";
      print STDERR "${ph0}-$ph: insert break\n";
    }
  } elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) {
    my $de = $ph0;
    $de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/;
          #  ptxfsSx IIk
    print STDERR "${ph0}-_: changeto ${de}-_\n";
    $adder = "$de 100\n";
  }

  if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) {
    print STDERR "j-_: insert \@\n";
    $adder .= "\@ 100\n";
  }

  if ($adder) {
    $text .= $adder; # add possibly corrected recent phoneme
    $adder =~ s{^([^ ]*).*$}
               {$1}gm;   # reduce to phonemes, multiline
    die "<$adder> ?\n" if ($adder =~ / /);
    $adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b-
    print STRING "${adder}-";
  }

  if ($ph) {
    $dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100;
                                  # longer for long vowels/rmnNJ
    $freq = 200 if ($ph eq " ");  # default freq
    $freq = 252 if ($ph eq "?");  # go up for questions
    $freq = 159 if ($ph eq ".");  # go down for boundaries
    $freq = 178 if ($ph eq ",");  # go down a bit for commas
    if ($ph =~ /[${Pspace}]/) {   # various breaks
      $adder = "_ 100 (50 , $freq)\n";
    } else {
      $adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph );
                                  # use 1..2 char phone names
      $adder .= " $dur\n";
    }
  } else {
    print STDERR "Skip\n";
    $adder = "";
  }

  $ph0 = $ph;

}

print "$text\n";

print STRING "\n";
close STRING;

