#!/usr/bin/perl -w
=head1

utf8ify_pinyin.pl
Copyright 2003 by Forrest Cahoon (hanziquiz@abstractfactory.org)
Modified (2008) by Jos van Wolput (wolput@onsneteindhoven.nl)
to include capitals and u: (as in nu:3).

This program converts anything that looks like ASCII pinyin with tone
numbers at the end of the word into utf-8 with proper pinyin tone marks.
It also works when the pinyin syllables are not separated
(e.g. both "fei1 chang2" and "fei1chang2" will be converted).

The original file is backed up to a file ending in "_bak"
(e.g. "hanzicards.js_bak"), then replaced with the new version.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut
use strict;

# Tone-marked forms of every pinyin vowel, stored as raw UTF-8 byte strings.
# Each row below reads: plain vowel, then the forms for tone 1 (macron),
# tone 2 (acute), tone 3 (caron), tone 4 (grave) and finally the neutral
# tone.  "v" and "V" are the internal placeholders for u-umlaut.
# The resulting hash maps vowel => [tone1, tone2, tone3, tone4, neutral].
my %UTF8_PINYIN_TONES;
for my $row (
   [ 'a', "\xc4\x81", "\xc3\xa1", "\xc7\x8e", "\xc3\xa0", "a" ],
   [ 'o', "\xc5\x8d", "\xc3\xb3", "\xc7\x92", "\xc3\xb2", "o" ],
   [ 'e', "\xc4\x93", "\xc3\xa9", "\xc4\x9b", "\xc3\xa8", "e" ],
   [ 'i', "\xc4\xab", "\xc3\xad", "\xc7\x90", "\xc3\xac", "i" ],
   [ 'u', "\xc5\xab", "\xc3\xba", "\xc7\x94", "\xc3\xb9", "u" ],
   [ 'v', "\xc7\x96", "\xc7\x98", "\xc7\x9a", "\xc7\x9c", "\xc3\xbc" ],
   [ 'A', "\xc4\x80", "\xc3\x81", "\xc7\x8d", "\xc3\x80", "A" ],
   [ 'O', "\xc5\x8c", "\xc3\x93", "\xc7\x91", "\xc3\x92", "O" ],
   [ 'E', "\xc4\x92", "\xc3\x89", "\xc4\x9a", "\xc3\x88", "E" ],
   [ 'I', "\xc4\xaa", "\xc3\x8d", "\xc7\x8f", "\xc3\x8c", "I" ],
   [ 'U', "\xc5\xaa", "\xc3\x9a", "\xc7\x93", "\xc3\x99", "U" ],
   [ 'V', "\xc7\x95", "\xc7\x97", "\xc7\x99", "\xc7\x9b", "\xc3\x9c" ],
) {
   my ($vowel, @forms) = @$row;
   $UTF8_PINYIN_TONES{$vowel} = \@forms;
}

# Maps every recognised pinyin final (the part of a syllable that follows
# the initial consonants) to the single letter in it that carries the tone
# mark.  The table is written grouped by that letter; "v"/"V" are the
# internal placeholders for u-umlaut.  Lower-case, upper-case and a few
# capitalised spellings are listed explicitly; anything not listed here is
# treated by fix_pinyin as "not pinyin".
my %FINAL_ACCENT_LETTERS;
{
   my @finals_by_accent = (
      [ 'a' => qw(a ai an ang ao ia ian iang iao ua uai uan uang) ],
      [ 'A' => qw(A AI AN ANG AO Ai An Ang Ao
                  IA IAN IANG IAO UA UAI UAN UANG) ],
      [ 'e' => qw(e ei en er eng ie ue ve) ],
      [ 'E' => qw(E EI EN ER ENG Ei En Er Eng IE UE VE) ],
      [ 'i' => qw(i in ing ui) ],
      [ 'I' => qw(I IN ING UI) ],
      [ 'o' => qw(o ong ou iong uo) ],
      [ 'O' => qw(O ONG OU Ou IONG UO) ],
      [ 'u' => qw(u iu un) ],
      [ 'U' => qw(U IU UN) ],
      [ 'v' => qw(v) ],
      [ 'V' => qw(V) ],
   );
   for my $group (@finals_by_accent) {
      my ($accent, @finals) = @$group;
      # Hash slice: give every final in the group the same accent letter.
      @FINAL_ACCENT_LETTERS{@finals} = ($accent) x @finals;
   }
}

# Main program: convert the file named on the command line (default "file").
# The converted text is first written to "<name>_utf8"; only after that
# succeeded is the original renamed to "<name>_bak" and the converted file
# moved into its place.
my $in_fname  = $ARGV[0] || "file";
my $out_fname = $in_fname . "_utf8";

# Use the low-precedence "or" here: with "||" the die would attach to the
# filename expression (which is always true) and an open failure would go
# unnoticed.  Three-argument open keeps odd characters in the filename from
# being interpreted as an open mode.
open(my $in_fh, '<', $in_fname)
  or die "Couldn't open $in_fname for reading: $!\n";
open(my $out_fh, '>', $out_fname)
  or die "Couldn't open $out_fname for writing: $!\n";

while (my $line = <$in_fh>) {
   # Plain syllables and run-together words ending in a tone digit 0-5.
   $line =~ s/(\b\w+[0-5]\b)/fix_pinyin($1)/eg;
   # Words where a colon precedes the tone digit, i.e. "u:" as in "nu:3".
   $line =~ s/(\b\w+[:]+[0-5]\b)/fix_pinyin($1)/eg; #read u:
   print {$out_fh} $line
     or die "Couldn't write to $out_fname: $!\n";
}
close($in_fh);
# Buffered write errors (e.g. disk full) only show up when closing, and must
# stop us before the original file gets replaced.
close($out_fh)
  or die "Couldn't finish writing $out_fname: $!\n";

rename($in_fname, $in_fname . "_bak")
  or die "Couldn't back up $in_fname to ${in_fname}_bak: $!\n";
rename($out_fname, $in_fname)
  or die "Couldn't move $out_fname to $in_fname: $!"
       . " (the original is in ${in_fname}_bak)\n";

###############################################################################

# fix_pinyin($word)
#
# Converts one token of numbered ASCII pinyin (one syllable such as "fei1",
# or several run together such as "fei1chang2") into pinyin with tone marks,
# encoded as UTF-8 bytes.
#
# Parameter:
#   $word - the matched text; every syllable must end in a tone digit.
#           u-umlaut may be written "u:", "uu", "v" or as the iso-8859-1
#           byte (and likewise in upper case).
#
# Returns the converted text, with run-together syllables separated by a
# single space.  If any syllable is not recognised (its final is missing
# from %FINAL_ACCENT_LETTERS, or its tone has no entry in
# %UTF8_PINYIN_TONES) the original argument is returned unchanged.
sub fix_pinyin {
   my ($orig_word) = @_;

   # If anything goes wrong we return the original word unchanged,
   # so work on a copy.
   my $text = $orig_word;

   # Convert common representations of u with umlaut
   # ("u:", "uu" and the iso-8859-1 codepoint)
   # to our preferred internal representation "v" / "V".
   $text =~ s/(u[:u]|\xfc)/v/g;
   $text =~ s/(U[:U]|\xdc)/V/g;

   # A tone digit always ends a syllable, so putting a space after each
   # one lets split() break compound words into single syllables.
   $text =~ s/([0-5])/$1 /g;
   my @syllables = split(" ", $text);

   # $syllable aliases the array element, so assigning to it below
   # rewrites @syllables in place.
   foreach my $syllable (@syllables) {
      # $1: syllable without its tone digit; $2: the final, i.e. everything
      # after the leading non-vowel characters; $3: the tone digit.
      if ($syllable !~ /^([^aeiouvAEIOUV]*(\D+))(\d)$/) {
         return $orig_word;
      }
      my ($body, $final, $tone) = ($1, $2, $3);

      my $accent_loc = $FINAL_ACCENT_LETTERS{$final};
      if (!defined($accent_loc)) { return $orig_word; }

      # Tones 1-4 use slots 0-3; both 0 and 5 select the unmarked neutral
      # form in slot 4.  Any other digit has no slot and is rejected.
      my $tone_slot = $tone == 0 ? 4 : $tone - 1;
      my $accented_letter = $UTF8_PINYIN_TONES{$accent_loc}->[$tone_slot];
      if (!defined($accented_letter)) { return $orig_word; }

      $body =~ s/$accent_loc/$accented_letter/;

      # Finally, change any placeholder that did not itself receive the
      # tone mark into a proper utf8 u with umlaut.  The upper-case form
      # is needed for finals such as "VE", where the mark goes on the E
      # and the V would otherwise be left in the output as a literal "V".
      $body =~ s/v/\xc3\xbc/g;
      $body =~ s/V/\xc3\x9c/g;

      $syllable = $body;
   }

   # Join the syllables back into one string.
   return join(' ', @syllables);
}

###############################################################################

__END__
