#!/usr/bin/perl -w
=head1
UTF8 encoded

utf8ify_pinyin.pl
Copyright 2003 by Forrest Cahoon (hanziquiz@abstractfactory.org)
Modified (2008) by Jos van Wolput (wolput@onsneteindhoven.nl)
to include capitals and u: (as in nu:3) and to convert compound pinyin words.
Modified (2011) by JvW to avoid converting chemical formulas (such as CO2 etc.)

This program converts anything that looks like ASCII pinyin with tone
numbers at the end of the word into utf-8 with proper pinyin tone marks.
It also works when the pinyin syllables are not separated by spaces
(e.g. both "fei1 chang2" and "fei1chang2" will be converted).

The original file is backed up to a file ending in "_bak"
(e.g. "hanzicards.js_bak"), then replaced with the new version.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut
use strict;

# Tone-marked forms of every letter that can carry a pinyin tone mark.
# Each row holds five entries: tones 1, 2, 3, 4, then the neutral form.
# "v"/"V" is the internal stand-in for u-umlaut. The "r"/"R" rows only
# provide a neutral form; their tone 1-4 slots are empty strings.
my %UTF8_PINYIN_TONES = (
    'a' => [ 'ā', 'á', 'ǎ', 'à', 'a' ],
    'e' => [ 'ē', 'é', 'ě', 'è', 'e' ],
    'i' => [ 'ī', 'í', 'ǐ', 'ì', 'i' ],
    'o' => [ 'ō', 'ó', 'ǒ', 'ò', 'o' ],
    'u' => [ 'ū', 'ú', 'ǔ', 'ù', 'u' ],
    'v' => [ 'ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü' ],

    'A' => [ 'Ā', 'Á', 'Ǎ', 'À', 'A' ],
    'E' => [ 'Ē', 'É', 'Ě', 'È', 'E' ],
    'I' => [ 'Ī', 'Í', 'Ǐ', 'Ì', 'I' ],
    'O' => [ 'Ō', 'Ó', 'Ǒ', 'Ò', 'O' ],
    'U' => [ 'Ū', 'Ú', 'Ǔ', 'Ù', 'U' ],
    'V' => [ 'Ǖ', 'Ǘ', 'Ǚ', 'Ǜ', 'Ü' ],

    'r' => [ ('') x 4, 'er' ],
    'R' => [ ('') x 4, 'ER' ],
);

# Maps a syllable final (the part from the first vowel onward, written
# with "v" for u-umlaut) to the letter inside it that carries the tone
# mark. Finals are grouped here by the letter that receives the mark.
my %FINAL_ACCENT_LETTERS = (
    (map { $_ => 'a' } qw(a ai an ang ao ia ian iang iao ua uai uan uang)),
    (map { $_ => 'A' } qw(A AI AN ANG AO Ai An Ang Ao
                          IA IAN IANG IAO UA UAI UAN UANG)),

    (map { $_ => 'e' } qw(e ei en er eng ie ue ve)),
    (map { $_ => 'E' } qw(E EI EN ER ENG Ei En Er Eng IE UE VE)),

    (map { $_ => 'i' } qw(i in ing ui)),
    (map { $_ => 'I' } qw(I IN ING UI)),

    (map { $_ => 'o' } qw(o ong ou iong uo)),
    (map { $_ => 'O' } qw(O ONG OU Ou IONG UO)),

    (map { $_ => 'u' } qw(u iu un)),
    (map { $_ => 'U' } qw(U IU UN)),

    # Letters that stand for themselves.
    (map { $_ => $_ } qw(v V r R)),
);

# Main driver: convert the file named by the first command-line argument
# (default "file") line by line into "<name>_utf8", then keep the original
# as "<name>_bak" and move the converted file into its place.
my $in_fname = $ARGV[0] || "file";
my $out_fname = $in_fname . "_utf8";

# "or" (not "||") so that the die applies to open() itself rather than to
# the filename operand; three-argument open keeps odd filenames from being
# interpreted as open modes.
open(my $in_fh, '<', $in_fname)
    or die "Couldn't open $in_fname for reading: $!\n";
open(my $out_fh, '>', $out_fname)
    or die "Couldn't open $out_fname for writing: $!\n";

while (my $line = <$in_fh>) {
    # Any run of word characters (plus ":" and u-umlaut bytes) that ends
    # in a tone digit 0-5 is handed to fix_pinyin for conversion.
    $line =~ s/(\b(\w|[ü:Ü])+[0-5]\b)/fix_pinyin($1)/eg;
    print {$out_fh} $line;
}
close $in_fh;

# Buffered write errors surface at close time, so check it before the
# original file is touched.
close($out_fh)
    or die "Couldn't finish writing $out_fname: $!\n";

# Stop if the backup cannot be made, so the original is never overwritten
# without a copy.
rename($in_fname, $in_fname . "_bak")
    or die "Couldn't back up $in_fname to ${in_fname}_bak: $!\n";
rename($out_fname, $in_fname)
    or die "Couldn't rename $out_fname to $in_fname: $!\n";

###############################################################################

# fix_pinyin($orig_word)
#
# Converts one token of tone-numbered ASCII pinyin (for example "ma3",
# "nu:3" or the compound "fei1chang2") into pinyin with tone marks, using
# %FINAL_ACCENT_LETTERS to find the letter that takes the mark and
# %UTF8_PINYIN_TONES to find its marked form.
#
# Parameter:
#   $orig_word - the token to convert; expected to end in a tone digit.
#
# Returns the converted string. If any syllable of the token cannot be
# converted (unknown final, tone with no marked form, or the token looks
# like a chemical formula), the original token is returned unchanged.
sub fix_pinyin {
    my ($orig_word) = @_;

    # don't convert chemical formulas, such as CO2 etc!
    return $orig_word if $orig_word =~ /^.*?O[2-5]/;

    # Work on a copy so the original can be returned on any failure.
    my $word = $orig_word;

    # Convert common representations of u with umlaut ("u:", "uu" and the
    # literal character) to the internal representation "v".
    $word =~ s/(u[:u]|ü)/v/g;
    $word =~ s/(U[:U]|Ü)/V/g;

    # A tone digit 0-5 ends a syllable: put a space after each one so that
    # compound words can be split into their syllables.
    $word =~ s/([0-5])/$1 /g;
    my @syllables = split(" ", $word);

    for my $syllable (@syllables) {
        # $1 = the letters of the syllable, $2 = the final (everything
        # from the first vowel onward), $3 = the tone digit.
        if ($syllable !~ /^([^aeiouvAEIOUV]*(\D+))(\d)$/) {
            return $orig_word;
        }
        my ($letters, $final, $tone) = ($1, $2, $3);

        my $accent_loc = $FINAL_ACCENT_LETTERS{$final};
        return $orig_word if !defined($accent_loc);

        # Tones 1-4 select columns 0-3; tone 0 is treated like tone 5 and
        # selects the neutral form in the last column.
        my $tone_index = ($tone == 0) ? 4 : $tone - 1;
        my $accented_letter = $UTF8_PINYIN_TONES{$accent_loc}->[$tone_index];

        # An empty table entry (the "r"/"R" rows for tones 1-4) means there
        # is no marked form; substituting it would erase the text, so treat
        # it like a missing entry.
        if (!defined($accented_letter) || $accented_letter eq '') {
            return $orig_word;
        }

        # Mark the first occurrence of the accent letter only.
        $letters =~ s/\Q$accent_loc\E/$accented_letter/;

        # Finally, change any remaining "v" to a proper u with umlaut.
        $letters =~ s/v/ü/g;
        $letters =~ s/V/Ü/g;

        # $syllable aliases the array element, so this updates @syllables.
        $syllable = $letters;
    }

    # Join the syllables back into one string, without separators.
    return join('', @syllables);
}

###############################################################################

__END__

