#!/usr/bin/perl -w use English; # things like MATCH # -------------------------------------------------------------- # txt2phoNL - usage: perl txt2phoNL phonemes.pho # the generated phoneme file is suitable for use with MBROLA, # but you have to use the -e option in MBROLA to skip over # spurious unpronounceable phoneme pairs (e.g. caused by English # words in your Dutch text file!). # Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play" # This is GPLed software (open source freeware) by # Eric Auer , the license is the GNU GPL # version 2 or later, also available as copying.txt in this # directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002) # Please give me some feedback: As I am no native speaker # of Dutch, this txt2phoNL definitely need some improvement! # -------------------------------------------------------------- # new version 14 feb 2002: # - sanitize away illegal phone pairs in a last step, # includes devoicing of consonants before a break. # - intermediate repn uses one char per phoneme. # - simpler rewrite mechanism eats all matched chars # and produces only phones - so the text string is constant. # BUT: restart from " " if the rule input ended in " " ! # - steps: 1. digit/... names # 2. sound pattern rules (preferring long matches, # walking the string and trying all rules per char) # 3. sanitize and get final repn from intermediate one # new version 2003-04-05 by Marc Spoorendonk marc@spoorendonk.com (native Dutch speaker) # - changed to much to mention. Very acceptable translation now. my $XLATEDEBUG = 4; # show all translation rule applications # of at least this size # special one char repn: # _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5, # oi is 6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question # , is comma open(STRING,">/dev/stderr") || die "cannot open debug log\n"; # open(STRING,">nl2pho.log") || die "cannot open debug log\n"; my $foo; $OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every # write/print, do not buffer output $/ = undef; # do not split on line breaks # $/ is $RS, record separator in use English my $text0 = ; # read stdin my $text = " "; # other stage (start with a space) my $phones = " "; # phoneme one-char-per-phoneme repn # by the way: a "^>*" remover would be nice for mails... # g vs G vs x: regen [reG@n] goal [goL] gage [xaZe] # where the G (voiced "ch") is a dialect alternative to x ("ch") # and the g only occurs in foreign words. # e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver] pet [pEt] # this is the len: e is long, is ee or e-at-end-of-syll. # E is short, is default, kind of. # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # first step: reduce the alphabet by spelling out specials # result: a plain [a-z.? ]* string my %special = ("0","null", "=","is", "1","een", "!","!", "2","twee", '"',"aanhaalingsteken", "3","drie", "4","vier", "\$","dollar", "5","vijf", "%","procent", "6","zes", "&","en", "7","zeven", "/","slesh", #phonetically "8","acht", "(","haakje openen", "9","negen", ")","haakje sluiten,", "*","ster", "\\","beckslesh", #phonetically "+","plus", "?","?", "#","hekje", "|","paip", #phonetically ".",".", "_","underscoor", #phonetically ",",",", "-","", ">","groter", ";",";", "<","kleiner",":",":", "^","dakje", "@","aapestaartje", "°","grad", "{","accolade openen", "[","hoekje", "]","hoekje sluiten,", "~","tilde", "}","accolade sluiten," ); # use this: punt. koma, vraagteken? # or that: . , ? # the latter has the problem that a . or , or ? # surrounded by spaces just sounds like a space... # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #Marc> Prefix with space for easyer matching. $text0 =~ s/^/ /g; $text0 =~ s/$/ /g; $text0 =~ s/^[>]*//g; # un-mailify the text :-) $text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff $text0 =~ s{:-[)]}{,lachend gezicht}g; # smiley $text0 =~ s{:[)]}{,lachend gezicht}g; # smiley $text0 =~ s{:-[(]}{,treurig gezicht}g; # smiley $text0 =~ s{:[(]}{,treurig gezicht}g; # smiley $text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley $text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley $text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie #Marc> betaal -> betaal #Marc> betalen -> betaalen #Marc> It keeps metten, marren, matten as they are. #Marc> betaling -> betaaling # b e t a l i ng bet a a li ng $text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g; #Marc> meten -> meeten maren -> maaren # m e t e n m e e ten $text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g; #Marc> k.n.m.i. -> k n m i $text0 =~ s/([^a-z])([a-z])\./$1$2 /g; $text0 =~ s/([^a-z])([a-z])\./$1$2 /g; #Marc> remove lines from input: "-----------------------------" -> "" $text0 =~ s/[-_=+]{3,}//g; #Marc> www.bla.com -> www punt bla punt com $text0 =~ s/\.([^ \n\t])/punt $1/g; #Marc> translate some numbers. (write a function for this once) $text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g; $text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g; $text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g; $text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g; $text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g; $text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g; $text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g; $text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g; $text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g; $text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g; $text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g; $text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g; $text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g; $text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g; $text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g; $text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g; $text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g; $text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g; $text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g; $text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g; $text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g; $text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g; $text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g; $text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g; $text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g; $text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g; $text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g; $text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g; $text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g; $text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g; print STDERR "Text: $text0"; for my $char (split(//,$text0)) { $char = lc($char); $char = "eu" if ($char =~ /öÖ/); # approximately :-) $char = "ae" if ($char =~ /äÄ/); # could be better $char = "uu" if ($char =~ /Üü/); # should also be for ë if (defined $special{$char}) { $text .= " " unless ($text =~ / $/); $text .= $special{$char} . " "; } elsif ($char =~ /[a-z]/) { $text .= $char; } else { $text .= " " unless ($text =~ / $/); } # simplify all whitespace/linebreak stretches # and other special chars to single spaces } $text .= " " x 5; # end with spaces! # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # second step: apply phoneme pattern rules (prefer longest # match, eat up all left side apart from trailing space, # produce pure phoneme list) my %five = ( " lijk"," l1k", "lijk ",'l@k ', "elijk","El1k", " bent", " bEnt", "atie ","atsi", #informatie "+size+",5 ); my %four = ( "http","ha te te pe ", "html","ha te Em El ", "agen","axEn", "ooie","oi@", # mooie "ooit","oIt", " er "," Er ", " en "," En ", " nl "," EnEl ", " he "," hE ", " ok "," oke ", "hou ","h4w ", "ouch","uS", # douche "oush","uS", # kianoush " pc "," pe se ", "even",'ev@n', "tie ","tsi", #vakantie " chi"," Si", # china "+size+",4 ); my %three = ( "aai","5" , "ooi","oi" , "oei","7", "cee","se", "ai ","8" , "oi ","9", "age","aZe", "ch ","x" , "ftp","ef te pe ", "www","we we we ", "htm","ha te em ", "tp:", "te pe ", "mp ", "Em pe ", # mp3 "mb ", "Embe ", # mp3 "eeu", "e2", "en ",'@n', "he ","he", "eij","1", #pronounciation of E before double dissonant "ett","Et", #letter "epp","Ep", "ett","Et", "err","Er", "ekk","Ek", #lekker "emm","Em", "ess","Es", "eff","Ef", "ell","El", "ebb","Eb", "enn","En", #Marc> distinct letters: k.n.m.i a.u.b. " a ", "a", " b ", "be", " c ", "se", " d ", "de", " e ", "e", " f ", "Ef", " g ", "xe", " h ", "ha", " i ", "i", " j ", "ie", " k ", "ka", " l ", "El", " m ", "Em", " n ", "En", " o ", "o", " p ", "pe", " q ", "ky", " r ", "Er", " s ", "Es", " t ", "te", " u ", "y", " v ", "ve", " w ", "we", " x ", "Iks", " y ", "1", " z ", "zEt", "+size+",3 ); my %two = ("ie","i" , "oe","u" , "uu","y", "aa","a" , "ee","e" , "oo","o", "eu","2" , "ei","1", "ui","3" , "ou","4" , "ij","1", "sj","S" , "g ","x" , "nj","J", "ce","sE", "l ","l" , "ng","N" , "dt","t" , "ch","x" , "iu","ju", "dl",'d@l', "lf",'l@f', "bb","b" , "dd","d" , "e ",'@', "d ","t" , "hr","r" , "hl","l", "o ","o" , "a ", "a", "yl","1l", "zl","z l", "mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n. "pp", "p", "nn", "n", "rr", "r", "kk", "k", "tt", "t", "+size+",2 ); # hr/hl/yl/zl: sane processing # of foreign words my %one = ("a","A", "b","b", "c","k", "d","d", "e",'E', "f","f", "g","x", "h","h", "i","I", "j","j", "k","k", "l","l", "m","m", "n","n", "o","O", "p","p", "q","k", "r","r", "s","s", "t","t", "u","Y", "v","v", "w","w", "x","ks", "y","j", "z","z", " "," ", ".",".", "?","?", ",",",", "+size+",1 ); # prosody with [ ?.,] is a later step my @todo = (\%five, \%four, \%three, \%two, \%one); # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ $phones = " "; my $x = 0; # string index while ($x < (length($text)-5)) { my $y = 0; for my $hashref (@todo) { # do l-longest rules first... if ($y != 0) { next; } my $check = substr($text,$x,$hashref->{"+size+"}); if (defined $hashref->{$check}) { $phones .= $hashref->{$check}; $x += $hashref->{"+size+"}; $x-- if (($check =~ / $/) && ($check ne " ")); # skip over matched part, but rewind on " " suffix $y++; print STDERR "Translate: <$check> to /" . $hashref->{$check} . "/\n" if (length($check) >= $XLATEDEBUG); } } if ($y == 0) { print STDERR "Had to translate first char to NIL:\n"; print STDERR "<" . substr($text,$x,10) . "...>\n"; $phones .= " "; $x++; } } # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # third step: convert to SAMPA alphabet and apply constraints # on phoneme pairings (input: $phones string) my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai", "6","oi", "7","ui", "8","Ai", "9","Oi", " ","_", "?","_", ".","_", ",","_" ); my $Pvowel = "aeiouAEIOy2Y13456789"; my $Pdipht = "56789"; my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience my $Pconson = "ptkbdgcfvszSZxGhmn"; my $Pvoice2 = "czZGhJj"; my $Psemi = "GNJL"; my $Pspace = ".?,_ "; # rules: # handled above: no "EY" or "IY" (replace by eY and iY) # handled above: no d before l (add schwa) # handled above: common case of bb and dd (replace by b and d) # handled above: commod case of d_ (devoice to t_) # no voiced/semi doubled (replace by single occurance) # no schwa before OR AFTER dipht (remove schwa) # no voice2 before l, r or j (add schwa) # no voiced before conson (add schwa ; duplication rule first) # no conson before semi (add schwa) # special case of next rule: j-E (replace by j-@) # no dipht before or after vowel/j (insert " ", see above) # no voiced at the end of a word (devoice) # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ $text = ""; my $adder = ""; # buffer before we really add the phone! my $freq = 200; # freq in Hz, only used at " " for now my $dur = 100; # duration in msec my $ph; # current phoneme my $ph0 = " "; # previous phoneme # the prosody and rhythm are still extremely simple foreach $ph (split(//,$phones)) { if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) { print STDERR "${ph}-$ph: removeone $ph\n"; $adder = ""; # ignore first copy of double phoneme } elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) { # remove the previous schwa # (or just insert a short "h") $adder = ""; print STDERR "\@-$ph: remove \@\n"; } elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) { # remove the current schwa $ph = ""; print STDERR "${ph0}-\@: remove \@\n"; } elsif ( (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/)) || (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pconson}]/)) || (($ph0 =~ /[${Pconson}]/) && ($ph =~ /[${Psemi}]/)) ) { $adder .= "\@ 50\n"; print STDERR "${ph0}-$ph: insert schwa\n"; } elsif (($ph0 eq "j") && ($ph eq "E")) { print STDERR "j-E: changeto j-\@\n"; $ph = "@"; # modify this part this time... } elsif ( (($ph0 =~ /[${Pdipht}j]/) && ($ph =~ /[${Pvowel}j]/)) || (($ph0 =~ /[${Pvowel}j]/) && ($ph =~ /[${Pdipht}j]/)) ) { if ($ph0 eq "j") { $adder = "i 100\n"; print STDERR "${ph0}-$ph: changeto i-$ph\n"; } if ($ph eq "j") { print STDERR "${ph0}-$ph: changeto ${ph0}-i\n"; $ph = "i"; } if (($ph0 ne "j") && ($ph ne "j")) { $adder .= "_ 50\n"; print STDERR "${ph0}-$ph: insert break\n"; } } elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) { my $de = $ph0; $de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/; # ptxfsSx IIk print STDERR "${ph0}-_: changeto ${de}-_\n"; $adder = "$de 100\n"; } if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) { print STDERR "j-_: insert \@\n"; $adder .= "\@ 100\n"; } if ($adder) { $text .= $adder; # add possibly corrected recent phoneme $adder =~ s{^([^ ]*).*$} {$1}gm; # reduce to phonemes, multiline die "<$adder> ?\n" if ($adder =~ / /); $adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b- print STRING "${adder}-"; } if ($ph) { $dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100; # longer for long vowels/rmnNJ $freq = 200 if ($ph eq " "); # default freq $freq = 252 if ($ph eq "?"); # go up for questions $freq = 159 if ($ph eq "."); # go down for boundaries $freq = 178 if ($ph eq ","); # go down a bit for commas if ($ph =~ /[${Pspace}]/) { # various breaks $adder = "_ 100 (50 , $freq)\n"; } else { $adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph ); # use 1..2 char phone names $adder .= " $dur\n"; } } else { print STDERR "Skip\n"; $adder = ""; } $ph0 = $ph; } print "$text\n"; print STRING "\n"; close STRING;