#!/usr/bin/perl -w #Encoding: UTF-8 # # cedictlookup originally by David Hiebeler, modified by Jos van Wolput # Copyright (c) 1998,1999,2005 David Hiebeler # Copyright (c) 2007,2008 Jos van Wolput # For licensing information, see the "printLicense" function below. # # File: cedictlookup, Version 1.2.1-a.1 # By: David Hiebeler # Dept of Mathematics and Statistics # University of Maine # Orono, ME 04469-5752 # http://www.math.umaine.edu/faculty/hiebeler # modified by: # Jos van Wolput, Eindhoven, the Netherlands # wolput@onsneteindhoven.nl # http://homepages.onsneteindhoven.nl/~wolput # # Version 1.2.1-a.1: July 2008 (modified version, by JvW) # Version 1.2.1-a: October 2007 (modified version, by JvW) # Version 1.2.1: July 2005 # Version 1.2: June 2005 # Version 1.1.01: June 2005 # Version 1.1b: July 2004 (modified version, by JvW) # Version 1.1a: November 2003 (modified version, by JvW) # Version 1.1: June 1999 # Version 1.0: August 1998 (under the name `chnvlookup') # # Any comments about this software are appreciated, especially suggested improvements. # # Perl script for doing Chinese vocabulary lookup (by Chinese, pinyin, or English) from CEDICT-format vocabulary files. # (See "http://www.mandarintools.com/cedict.html" or "http://www.xuezhongwen.net/chindict/chindict.php?page=cedict" for information about CEDICT.) ##### # Modifications by JvW: # Script looks for vocab dir $HOME/chinese/cedict, vocab file cedict, (adapt script to your needs!) # Script modified to lookup the new CEDICT-format vocabulary files containing both traditional and simplified Chinese in UTF8 encoding. # Can also be used to lookup cedict files converted to tone marks pinyin, such as those from Matti Tukiainen, http://ktmatu.com/chinese/cedict. # # Added output and input conversion from tone numbers to tone marks pinyin (UTF-8 only) using a modified perl script # utf8ify_pinyin.pl, copyright 2003 by Forrest Cahoon (hanziquiz@abstractfactory.org), # added use of capitals in tone marks pinyin. # Also added user input "v" and "u:" for "ü". # # This script should work correctly on UTF-8 and GB files. It automatically tries to find the Chinese encoding. # If encoding is GB then $tone_marks_pinyin is disabled, however it also can be disabled manually in the script. # It is not tested on BIG5 encoding. # Have a look at Set up default values for parameters and adapt them to your needs! # Traditional or simplified Chinese can be disabled in the script. ##### # See cedictlookup.doc for documentation. # Quick & dirty "how-to" for those who don't like documentation: # cedictlookup -vf /my/chinese/vocab/file.gb # Then type in words in Chinese, tone numbers pinyin, or English, and it will try # to look them up in your vocab file. You'll have to read the docs (or # try to decipher the output of "cedictlookup -help") if you want to use # multiple vocabulary files simultaneously, do non-exact matches, or # enable/disable FastExact searching. # # Note that this script will ignore any lines it encounters in # vocabulary files which are not in cedict format. In particular, it # will ignore any blank lines, and discard any comments which # begin with '#' (whether the comment is the only thing on a line, or # at the end of a line). You may want to use the "cedictcheckformat" # script to catch any lines in your vocabulary files which are not in # strict CEDICT format. If you don't have it already, it's also available # at my web site, at the URL above. # # # Wishlist / known bugs: # o) Allow FastExact match searching for English (although that will probably # turn out to be too much of a memory hog). # o) Allow you to change the FastExact settings without restarting # (as you can change Match Mode and Anchor Mode). # o) "0-tone" (or "forgotten-tone") pinyin lookups do not work with # Match Mode = Shorter. # o) Further improve performance of vocabulary lookups. # o) Allow some options to sort the output in different ways, e.g. # alphabetically by pinyin (using the algorithm that "cedictsort" uses), # or maybe first by length of Chinese and then alphabetically within that. # It should also remove duplicate entries after sorting. # # History: # July 2008: Version 1.2.1-a.1 (JvW), revoving small bug, now also reads compound pinyin words. # October 2007 : Version 1.2.1-a (JvW), vocabulary lookup for different newer versions of cedict containing both traditional and simplified Chinese, tone pinyin added. # 22 July 2005: version 1.2.1 # 24 June 2005: version 1.2 # Just renumberings to stay synchronized with the whole package. # 24 June 2005: version 1.1.01 # Updated my address info above # July 2004: Version 1.1b (JvW): modified exact and longer mode for more results and added user input v for ü. # November 2003: Version 1.1a (JvW): vocabulary lookup for cedict version with tone marks pinyin (Matti Tukiainen, http://ktmatu.com/chinese/cedict) # # 08 June 1999: version 1.1 # Added "fastexact" matches in Chinese and pinyin. These use # Perl's built-in hashing algorithm to find exact matches to Chinese # words much more quickly than the code was able to do by searching # through the entire vocabulary list. # 10 Dec 1998: version 1.0.1 # added code to turn "uu" into "u:" or vice-versa in the # pinyin field if the user requests it, to handle the fact that both # forms were present in cedict for some time, or may be present in # various Chinese documents you encounter. Note, if you activate # this conversion, it will be done both when reading in the # vocabulary files, and when processing pinyin user input. # You can activate conversion by using either the "-uu2u:" or "-u:2uu" # command-line arguments. # 05 August 1998: original version, 1.0 # ################################################################ # I was using the following for timing the code; commented out for now. #use Benchmark; # Define a couple of constants $uu2uc = 1; $uc2uu = 2; $Code = 0; sub printLicense { print <<"END_OF_LICENSE"; cedictlookup Version 1.2.1-a.1, July 24, 2008 Copyright (C) 1998,1999,2005 David Hiebeler Dept of Mathematics and Statistics University of Maine Orono, ME 04469-5752 http://www.math.umaine.edu/faculty/hiebeler Copyright (C) 2007,2008 Jos van Wolput, Eindhoven, Netherlands http://homepages.onsneteindhoven.nl/~wolput This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA END_OF_LICENSE } # # Set up default values for parameters # sub setdefaults { #Adapt the following values to your needs # if cedict contains tone marks, show them else convert tone numbers to tone marks (UTF-8 only) #Set $tone_marks_pinyin to 0 for GB encoding! $tone_marks_pinyin = 1; # 1= add/show tone marks pinyin, 0= don't add and/or don't show. $tone_numbers_pinyin = 1; # 1= show, 0= hide tone numbers pinyin # if cedict contains both traditional and simplified Chinese: $CnT = 1; # 1= show, 0= hide traditional Chinese $CnS = 1; # 1= show, 0= hide simplified Chinese # 1=case-insensitive matching of English words, 0=case-sensitive matching $caseInsensitive = 1; # whether we are logging to a file (for debugging) $doLog = 0; # whether to give some extra output (for debugging) $verbose = 0; # we slightly change our behavior when called by emacs $underEmacs = 0; $HOME = $ENV{'HOME'}; # directory where vocab files are #$vocabDir = "$HOME/chinese/words/vocab"; $vocabDir = "$HOME/chinese/cedict"; # colon-separated list of vocab files # $vocabFiles = "vocabulary.gb:yuwen5.gb:yuyan.gb:cedict.gb"; $vocabFiles ="cedict"; if (!(-f $vocabDir."/cedict")) { print "No vocab file is found in $vocabDir/$vocabFiles\n"; exit 0; } # default MatchMode and AnchorMode to use $matchMode = "exact"; $anchorMode = "start"; # What kind (if any) of conversion to do in the pinyin field # (converting "uu" to "u:" or vice-versa). $uConvert = 0; # default is to do no conversion # whether or not to remove skill level information from English field $removeSkillLevels = 0; # What kind fast exact matches to do. For now, only Chinese fast exact # matches are supported (and enabled by default). # Note that if you want the default to be "c+" for example, you should # still set this value to "c", and set the next variable to 1 (i.e. # the continueAfterFastMatch variable indicates whether or not you # want the '+'). $fastExactMatches = "c"; #$fastExactMatches = "p"; # whether or not to continue with general matches after a successful # fast match $continueAfterFastMatch = 0; # explicitly initialize empty hashes %fastMatchChineseHash = (); %fastMatchChinese2Hash = (); %fastMatchPinyinHash = (); %fastMatchPinyin2Hash = (); } # # Print parameter values # sub printsettings { print "verbose = $verbose, underEmacs = $underEmacs, caseInsensitive = $caseInsensitive\n"; print "vocabDir = `$vocabDir'\n"; print "vocabFiles = `$vocabFiles'\n"; print "matchMode = `$matchMode', anchorMode = `$anchorMode'\n"; print "uConvert = "; if ($uConvert == 0) { print "none\n"; } elsif ($uConvert == $uu2uc) { print "uu2u:\n"; } elsif ($uConvert == $uc2uu) { print "u:2uu\n"; } else { die "Illegal value: $uConvert\n"; } print "rmskill = $removeSkillLevels\n"; print "fastExactMatches = "; if ($fastExactMatches eq "") { print "(none)\n"; } else { if ($fastExactMatches =~ m/c/) { print "chinese "; } if ($fastExactMatches =~ m/p/) { print "pinyin "; } if ($continueAfterFastMatch) { print "+\n"; } else { print "\n"; } } } # # Print usage message and exit # sub printusage { print "Usage: $0 [-v] [-vd path] [-i | +i] [-emacs] [-uu2u: | -u:2uu]\n"; print " [-vf fname1:fname2:...:fnameN] [-rmskill 0|1] [-mm e|s|l] [-am s|e|n]\n"; print " [-fastexact 0|c] [-license]\n"; print "\n"; print " -v : verbose mode (print extra info at startup and when scanning user input)\n"; print " -emacs : for use when invoked as subprocess under emacs\n"; print " -i : case-insensitive matching for English (enabled by default)\n"; print " +i : do case-sensitive matching of English words\n"; print " -vd path : set vocabDir, directory containing vocabulary files\n"; print " -vf fname1:fname2:...:fnameN : colon-separated list of vocabulary filenames\n"; print " -rmskill 0|1 : do(1) or don't(0) remove skill level info from English field\n"; print " -mm e|s|l : match mode -- (e)xact, (s)horter, (l)onger\n"; print " -am s|e|n : anchor mode -- (s)tart, (e)nd, or (n)one\n"; print " Only used for non-exact match mode; e.g. anchor mode of `start'\n"; print " means your word must be at the beginning of the match\n."; print " -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)\n"; print " -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)\n"; print " -fastexact 0|c : Do fast exact matches in addition to slower pattern matches\n"; print " (This option can also be abbreviated as `-fe').\n"; print " `0' means don't do any fast exact matches.\n"; print " `c' means do fast exact matches for Chinese (which uses extra memory).\n"; print " -license: Print license information and exit.\n"; exit 2; } # # Read a line from a file or stdin, removing comments which begin with "#", # and ignoring empty lines (or lines which only have a comment). # sub getline { if ($#_ == -1) { while (<>) { next if /^\s*#/; next if /^\s*$/; s/#.*$//; chop; return $_; } return undef; } elsif ($#_ == 0) { $fh = $_[0]; } else { die "getlinefp must be called with a single argument or no arguments"; } while (<$fh>) { next if /^\s*#/; next if /^\s*$/; s/#.*$//; chop; return $_; } return undef; } # # Read in a cedict-format vocabulary file # sub readvocabfile { my $chinese; my $chinese2; my $pinyin; my $pinyin2; my $english; my $ChineseLength; my $i = 0; $marks = 0; $convert_input = 0; if ($tone_numbers_pinyin) {$pinyin_1 = 1} else {$pinyin_1 = 0} if ($tone_marks_pinyin) {$pinyin_2 = 1} else {$pinyin_2 = 0} if ($CnT) {$Chinese_1 = 1} else {$Chinese_1 = 0} if ($CnS) {$Chinese_2 = 1} else {$Chinese_2 = 0} open(INFILE, $_[0]) or die "Couldn't open infile `$_[0]'"; READVOCABLOOP: while ($line=getline("INFILE")) { # handle case where line has skill level(s) at beginning (with traditonal and simplified Chinese) if ($line =~ m@^\s*[0-9]+\s*(.+)\s*\ (.+)\ \s*\[(.+)\]\s*(/.*/)\s*$@) { ($chinese,$chinese2,$pinyin,$english) = ($1,$2,$3,$4);$pinyin2=""; $chinese =~ s/\s+$//; # truncate trailing spaces on chinese $chinese2 =~ s/\s+$//; } # line doen't have skill level numbers at beginning (with traditonal and simplified Chinese) elsif ($line =~ m@^\s*(.+)\s*\ (.+)\ \s*\[(.+)\]\s*(/.*/)\s*$@) { ($chinese,$chinese2,$pinyin,$english) = ($1,$2,$3,$4);$pinyin2=""; $chinese =~ s/\s+$//; $chinese2 =~ s/\s+$//; } # handle case where line has skill level(s) at beginning (with tone numbers and marks) elsif ($line =~ m@^\s*[0-9]+\s*(.+)\s*\[(.+)\]\s*\{(.+)\}\s*(/.*/)\s*$@) { ($chinese,$pinyin,$pinyin2,$english) = ($1,$2,$3,$4);$chinese2="";$marks =1; $chinese =~ s/\s+$//; } # line doesn't have skill level numbers at beginning (with tone numbers and marks) elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*\{(.+)\}\s*(/.*/)\s*$@) { ($chinese,$pinyin,$pinyin2,$english) = ($1,$2,$3,$4);$chinese2="";$marks =1; $chinese =~ s/\s+$//; } # handle case where line has skill level(s) at beginning (with tone numbers only) elsif ($line =~ m@^\s*[0-9]+\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) { ($chinese,$pinyin,$english) = ($1,$2,$3);$chinese2="";$pinyin2=""; $chinese =~ s/\s+$//; } #line doesn't have skill level numbers at beginning (with tone numbers only) elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) { ($chinese,$pinyin,$english) = ($1,$2,$3);$chinese2="";$pinyin2=""; $chinese =~ s/\s+$//; } else { # die "Invalid line '$line'"; # ok, don't be so harsh, we will just ignore it next READVOCABLOOP; } # print "$chinese, $chinese2, $pinyin, $pinyin2, $english\n"; $ChineseLength = length($chinese); $firstChar = ord($chinese); $Code = $Code + $firstChar; # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert == $uu2uc) { $pinyin =~ s/uu/u:/; } elsif ($uConvert == $uc2uu) { $pinyin =~ s/u:/uu/; } # Remove skill level info from English field if requested if ($removeSkillLevels) { $english =~ s@/=[0-9]+=/$@/@; } # now put everything into the main array of hashes $wordList[$vocabIndex]->{"chinese"} = $chinese; $wordList[$vocabIndex]->{"chinese2"} = $chinese2; $wordList[$vocabIndex]->{"pinyin"} = $pinyin; $wordList[$vocabIndex]->{"pinyin2"} = $pinyin2; $wordList[$vocabIndex]->{"english"} = $english; $wordList[$vocabIndex]->{"ChineseLength"} = $ChineseLength; if ($fastExactMatches =~ m/c/) { # store index of this entry in a hash for fast lookup later. push @{$fastMatchChineseHash{$chinese}}, $vocabIndex; push @{$fastMatchChinese2Hash{$chinese2}}, $vocabIndex; } if ($fastExactMatches =~ m/p/) { # store index of this entry in a hash for fast lookup later. push @{$fastMatchPinyinHash{$pinyin}}, $vocabIndex; push @{$fastMatchPinyin2Hash{$pinyin2}}, $vocabIndex; } $vocabIndex++; $i++; } if ($chinese2 eq "") {$Chinese_1 = 1;$Chinese_2 = 0} if ($pinyin =~ m/[āáǎàēéěèīíǐìōóǒǒūúǔùǖǘǚǜĀÁǍÀĒÉĚÈŌÓǑÒ]/){$marks = 1; $pinyin_1 = 1; $pinyin_2 = 0; $convert_input = 1} if (($tone_marks_pinyin == 1) and ($marks == 0)) {$pyConvert = 1} else {$pyConvert = 0} # convert tone numbers to tone marks $Code = int($Code/$vocabIndex); # $Code >= 228 assuming encoding is UTF-8 if ($Code < 228){$tone_marks_pinyin = 0;$pyConvert = 0;$marks = 0; $pinyin_2 = 0} #assuming encoding is GB close INFILE; print "read $i words\n"; if ($doLog) { print LOGFP "read $i words\n"; } } # # Parse the command-line arguments # sub getargs { $thisarg = shift(); while (defined($thisarg)) { if ($thisarg eq "-v") { # verbose $verbose = 1; } elsif ($thisarg eq "-emacs") { # being run under emacs $underEmacs = 1; } elsif ($thisarg eq "-i") { # case-insensitive English matching $caseInsensitive = 1; } elsif ($thisarg eq "+i") { # case-sensitive English matching $caseInsensitive = 0; } elsif ($thisarg eq "-uu2u:") { $uConvert = $uu2uc; } elsif ($thisarg eq "-u:2uu") { $uConvert = $uc2uu; } elsif ($thisarg eq "-license") { printLicense(); exit(0); } else { $secondarg = shift; if (! defined($secondarg)) { printusage; } if ($thisarg eq "-vd") { # vocabulary directory $vocabDir = $secondarg; } elsif ($thisarg eq "-vf") { # vocabulary file(s) $vocabFiles = $secondarg; } elsif ($thisarg eq "-rmskill") { $removeSkillLevels = $secondarg; } elsif (($thisarg eq "-fastexact") || ($thisarg eq "-fe")) { if ($secondarg =~ s/0//) { $fastExactMatches = ""; } else { $fastExactMatches = ""; if ($secondarg =~ s/c//) { $fastExactMatches .= "c"; } if ($secondarg =~ s/p//) { $fastExactMatches .= "p"; } if ($secondarg =~ s/\+$//) { $continueAfterFastMatch = 1; } if ($secondarg =~ s/-$//) { $continueAfterFastMatch = 0; } } if ($secondarg ne "") { printusage; } } elsif ($thisarg eq "-mm") { # match mode $matchModeStr = $secondarg; if ($matchModeStr eq "e") { $matchMode = "exact"; } elsif ($matchModeStr eq "s") { $matchMode = "shorter"; } elsif ($matchModeStr eq "l") { $matchMode = "longer"; } else { printusage; } } elsif ($thisarg eq "-am") { # anchor mode $anchorModeStr = $secondarg; if ($anchorModeStr eq "s") { $anchorMode = "start"; } elsif ($anchorModeStr eq "e") { $anchorMode = "end"; } elsif ($anchorModeStr eq "n") { $anchorMode = "none"; } else { printusage; } } elsif ($thisarg eq "-log") { # do extra logging to a file # (for debugging) $logFname = $secondarg; $doLog = 1; } else { printusage; } } $thisarg = shift; } } # # Check to see whether a string contains Chinese, pinyin, or English. # This routine returns one of the following strings: # "english", "chinese", or "pinyin" # It also handles a couple of special flags, "-wp" or "-we" at the # beginning of the string, which indicates the user wants to do "wildcard # matching". # sub classifyCPE { $_[0]=~ s/[\.\,\(\)\[\]]+//g; # strip out periods, commas, parentheses and brackets $_[0] =~ s/^\s+//; # remove any leading spaces $_[0] =~ s/\s+$//; # and trailing spaces $firstChar = ord($_[0]); # if high bit on first char is set, assume chinese if ($firstChar >= 228){ $_[0]=~ s/[\ ]+//g; # strip out spaces return "chinese"; #UTF-8 and GB encoding } elsif ($firstChar >= 128){$A=substr($_[0],0,2); if (($marks == 1) and ($A =~ m/[āáǎàēéěèōóǒǒĀÁǍÀĒÉĚÈŌÓǑÒ]/)) {return "pinyin";} #first character is tone pinyin (UTF-8) $_[0]=~ s/[\ ]+//g; # strip out spaces return "chinese"; #GB encoding only } if (!(@words = split(" ", $_[0]))) { # split into words die "Fatal error in classifyCPE(), on `$_[0]'\n"; } # Special case -- if the beginning of the string is a "-wp", it # indicates "wildcard pinyin", i.e. the person wants to do partial matches, # letting "jin" match "jin4", "jing1", etc. if ($words[0] =~ m/^(?=-)(?:-\w+\s+)*-wp/) { $_[0] =~ s/^\s*-wp\s*//; # surgically remove the "-wp" from string $_[0] =~ s/v/uu/; #Convert "v" into "u:" # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert = $uu2uc) { $_[0] =~ s/uu/u:/; } elsif ($uConvert = $uc2uu) { $_[0] =~ s/u:/uu/;$word =~ s/\/+$//; } @words = split(" ", $_[0]); # split string into words again foreach $word (@words) { $word =~ s/^(.*)$/\[\^\\s\]\*$1/; $word =~ s/^(.*[^0-5])$/$1\[\^\\s\]\*/; # turn word # into a nice regexp so it can # be used for matching later # (unless this word already # has a tone on it) } $_[0] = join(' ', @words); # join words back into one string # See comment below about pinyin tone 0 for explanation of this line $_[0] =~ s/0/[1-5]/g; if ($convert_input){ $pyConvert = 1; mark_pinyin($_[0]);$_[0] = $py; $pyConvert = 0; } return "pinyin"; } # If the beginning of the string is a "-we", it # indicates "wildcard english", with partial matches (very much like # "-wp" above). if ($words[0]=~ m/^(?=-)(?:-\w+\s+)*-we/) { $_[0] =~ s/^\s*-we\s*//; # surgically remove "-we" from string @words = split(" ", $_[0]); foreach $word (@words) { $word =~ s/^\/+//; # remove any leading slashes $word =~ s/\/+$//; # and trailing slashes $word =~ s/^(.*)$/\.\*$1\.\*/; } $_[0] = join(' ', @words); return "english"; } # If we got this far, just check the first word and make a guess from that if ($words[0] =~ m/[a-zA-Zu:]+[012345]$/) { # if the first word is letters followed by a digit from 0-5 (presumably # a pinyin tone), then assume it really is pinyin. # But pinyin tone 0 is a special case, meaning the person doesn't # actually know which tone it is, so we should replace any zero's # by a character class matching any digit 1-5. if ($_[0] =~ m/0(\s|$)/) { $forgotTone = 1; $_[0] =~ s/0/[1-5]/g; } $_[0] =~ s/v/uu/; #Convert "v" into "u:" # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert = $uu2uc) { $_[0] =~ s/uu/u:/; } elsif ($uConvert = $uc2uu) { $_[0] =~ s/u:/uu/; } if ($convert_input){ $pyConvert = 1; mark_pinyin($_[0]);$_[0] = $py; $pyConvert = 0; } return "pinyin"; } elsif ($_[0] =~ m/[āáǎàēéěèīíǐìōóǒǒūúǔùǖǘǚǜĀÁǍÀĒÉĚÈŌÓǑÒ]/) #tone pinyin {return "pinyin"; } # otherwise, assume it's English else { $_[0] =~ s/^\/+//; # remove any leading slashes $_[0] =~ s/\/+$//; # and trailing slashes return "english";} } # # Used for sorting an array of references to hashes, by the Chinese length sub byChineseLength { $$a->{"ChineseLength"} <=> $$b->{"ChineseLength"}; } # # Look up a word. This is the main routine which does all the work. # The word to look up will be the first element of the argument array @_, # and the language to use (chinese, pinyin, or english) will be in $_[1]. # The matchMode and anchorMode parameters to use are passed in as # $_[2] and $_[3], respectively. # # Performance note: when MatchMode is "exact" or "longer", I use "eval" # to do the main search loop. This lets me build the pattern-matching # regexp dynamically each time this function is executed, but not have # the regexp be recompiled 12,000+ times as I loop through the # vocabulary file (or however big cedict is when you read this). # From my timings, however, this doesn't seem to make a very big difference. # If anyone has some suggestions for improving the performance of vocab # lookups, I'd appreciate hearing them, since in my opinion this routine # is somewhat slow (not surprising, since it's pretty simple-minded). # sub lookupWord { my ($i, $foundMatch, $numMatchesFound, $matchMode, $anchorMode, $fastExactMatches, $continueAfterFastMatch); my ($lastChinese,$lastChinese2,$lastPinyin,$lastPinyin2 , $lastEnglish, $headStr, $fastRef); my (@matchingWordList, @fastIndexArray); my %fastSeen; $lookFor = $_[0]; # just to make things more readable $languageToUse = $_[1]; # same here $caseStr = ""; if ($_[1] eq "english") { if ($caseInsensitive) { $caseStr= "(?i)"; } } elsif (($_[1] ne "chinese") && ($_[1] ne "pinyin")) { die "Unknown lookup method `$_[1]'"; } $matchMode = $_[2]; $anchorMode = $_[3]; $fastExactMatches = $_[4]; $continueAfterFastMatch = $_[5]; $foundMatch = 0; %fastSeen = (); if (($_[1] eq "chinese") && (($fastExactMatches =~ m/c/) || ($matchMode eq "exact"))) { $fastRef = $fastMatchChineseHash{$lookFor}; if (defined($fastRef)) { $foundMatch = 1; if ($matchMode ne "exact") { print "Fast exact matches:\n"; } @fastIndexArray = @{$fastRef}; foreach $i (@fastIndexArray) { $chinese=$wordList[$i]->{chinese};$chinese2=$wordList[$i]->{chinese2};$pinyin=$wordList[$i]->{pinyin};$english=$wordList[$i]->{english}; if ($pyConvert) {$pinyin2="$wordList[$i]->{pinyin}"} else {$pinyin2="$wordList[$i]->{pinyin2}"} mark_pinyin($pinyin2); print "$chinese $chinese2 $pinyin $pinyin2 $english\n"; $fastSeen{$i}++; } unless ($continueAfterFastMatch) { return; } if ($matchMode ne "exact") { print "Other matches:\n"; } } $fastRef = $fastMatchChinese2Hash{$lookFor}; if (defined($fastRef)) { $foundMatch = 1; if ($matchMode ne "exact") { print "Fast exact matches:\n"; } @fastIndexArray = @{$fastRef}; foreach $i (@fastIndexArray) { $chinese=$wordList[$i]->{chinese};$chinese2=$wordList[$i]->{chinese2};$pinyin=$wordList[$i]->{pinyin};$english=$wordList[$i]->{english}; if ($pyConvert) {$pinyin2="$wordList[$i]->{pinyin}"} else {$pinyin2="$wordList[$i]->{pinyin2}"} mark_pinyin($pinyin2); print "$chinese $chinese2 $pinyin $pinyin2 $english\n"; $fastSeen{$i}++; } unless ($continueAfterFastMatch) { return; } if ($matchMode ne "exact") { print "Other matches:\n"; } } } if (($_[1] eq "pinyin") && (($fastExactMatches =~ m/p/) || ($matchMode eq "exact"))) { $fastRef = $fastMatchPinyinHash{$lookFor}; if (defined($fastRef)) { $foundMatch = 1; if ($matchMode ne "exact") { print "Fast exact matches:\n"; } @fastIndexArray = @{$fastRef}; foreach $i (@fastIndexArray) { $chinese=$wordList[$i]->{chinese};$chinese2=$wordList[$i]->{chinese2};$pinyin=$wordList[$i]->{pinyin};$english=$wordList[$i]->{english}; if ($pyConvert) {$pinyin2="$wordList[$i]->{pinyin}"} else {$pinyin2="$wordList[$i]->{pinyin2}"} mark_pinyin($pinyin2); print "$chinese $chinese2 $pinyin $pinyin2 $english\n"; $fastSeen{$i}++; } unless ($continueAfterFastMatch) { return; } if ($matchMode ne "exact") { print "Other matches:\n"; } } $fastRef = $fastMatchPinyin2Hash{$lookFor}; if (defined($fastRef)) { $foundMatch = 1; if ($matchMode ne "exact") { print "Fast exact matches:\n"; } @fastIndexArray = @{$fastRef}; foreach $i (@fastIndexArray) { $chinese=$wordList[$i]->{chinese};$chinese2=$wordList[$i]->{chinese2};$pinyin=$wordList[$i]->{pinyin};$english=$wordList[$i]->{english}; if ($pyConvert) {$pinyin2="$wordList[$i]->{pinyin}"} else {$pinyin2="$wordList[$i]->{pinyin2}"} mark_pinyin($pinyin2); print "$chinese $chinese2 $pinyin $pinyin2 $english\n"; $fastSeen{$i}++; } unless ($continueAfterFastMatch) { return; } if ($matchMode ne "exact") { print "Other matches:\n"; } } } if ($matchMode eq "exact") { if ($_[1] eq "chinese") {$matchStr = "^$lookFor\$";} if ($_[1] eq "pinyin") {$matchStr = "^$lookFor\$";} if ($_[1] eq "english") {$matchStr = "$caseStr/$lookFor/";} # turn '/' into '\/' in search string, since we are going to eval # the pattern-match, rather than run it directly $matchStr =~ s@/@\\/@g; if ($verbose) {print "matchStr = `$matchStr'\n";} $searchCmd = ""; $searchCmd .= "for (\$i=0; \$i < $numVocabWords; \$i++) {"; $searchCmd .= "if (\$wordList[\$i]->{$_[1]} =~ m/$matchStr/) {"; $searchCmd .= "\$chinese = \$wordList[\$i]->{chinese};\$chinese2 = \$wordList[\$i]->{chinese2}; \$pinyin = \$wordList[\$i]->{pinyin}; \$english = \$wordList[\$i]->{english};"; $searchCmd .= "if (\$pyConvert) {\$pinyin2=\$wordList[\$i]->{pinyin};} else {\$pinyin2=\$wordList[\$i]->{pinyin2};} mark_pinyin(\$pinyin2);"; $searchCmd .= "print \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; if ($doLog) { $searchCmd .= "print LOGFP \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; } $searchCmd .= "\$foundMatch = 1;"; $searchCmd .= "}"; $searchCmd .= "}"; eval $searchCmd; if ($_[1] eq "chinese") {$_[1] = "chinese2";$matchStr = "^$lookFor\$";} if ($_[1] eq "pinyin") {$_[1] = "pinyin2";$matchStr = "^$lookFor\$";} if (($_[1] eq "chinese2") or ($_[1] eq "pinyin2")) { # turn '/' into '\/' in search string, since we are going to eval # the pattern-match, rather than run it directly $matchStr =~ s@/@\\/@g; if ($verbose) {print "matchStr = `$matchStr'\n";} $searchCmd = ""; $searchCmd .= "for (\$i=0; \$i < $numVocabWords; \$i++) {"; $searchCmd .= "if (\$wordList[\$i]->{$_[1]} =~ m/$matchStr/) {"; $searchCmd .= "\$chinese = \$wordList[\$i]->{chinese};\$chinese2 = \$wordList[\$i]->{chinese2}; \$pinyin = \$wordList[\$i]->{pinyin}; \$english = \$wordList[\$i]->{english};"; $searchCmd .= "if (\$pyConvert) {\$pinyin2=\$wordList[\$i]->{pinyin};} else {\$pinyin2=\$wordList[\$i]->{pinyin2};} mark_pinyin(\$pinyin2);"; $searchCmd .= "print \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; if ($doLog) { $searchCmd .= "print LOGFP \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; } $searchCmd .= "\$foundMatch = 1;"; $searchCmd .= "}"; $searchCmd .= "}"; eval $searchCmd; } } elsif ($matchMode eq "longer") { if ($anchorMode eq "start") { if ($_[1] eq "chinese") { $matchStr = "^$lookFor"; } elsif ($_[1] eq "pinyin") { $matchStr = "^$lookFor"; } elsif ($_[1] eq "english") { $matchStr = "$caseStr/${lookFor}[\\s/]"; } } elsif ($anchorMode eq "end") { # since Chinese characters are 2 bytes each, we need this little # bit of nonsense to avoid matching 2 bytes which straddle 2 # separate Chinese characters (it happens more than you might # expect). if ($_[1] eq "chinese") { $matchStr = "^(..)*$lookFor\$"; } elsif ($_[1] eq "pinyin") { $matchStr = "(^|\\s)$lookFor\$"; } elsif ($_[1] eq "english") { $matchStr = "[\\s/]$caseStr$lookFor/"; } } elsif ($anchorMode eq "none") { if ($_[1] eq "chinese") { $matchStr = "^(..)*$lookFor"; } elsif ($_[1] eq "pinyin") { $matchStr = "(^|\\s)$lookFor"; } elsif ($_[1] eq "english") { $matchStr = "[\\s/]$caseStr${lookFor}[\\s/]"; } } else { die "Unknown anchorMode `$anchorMode'\n"; } # turn '/' into '\/' in search string, since we are going to eval # the pattern-match, rather than run it directly $matchStr =~ s@/@\\/@g; if ($verbose) {print "matchStr = `$matchStr'\n";} $searchCmd = ""; $searchCmd .= "for (\$i=0; \$i < $numVocabWords; \$i++) {\n"; # for: i $searchCmd .= " unless (\$fastSeen{\$i}) {\n"; # unless: fastSeen $searchCmd .= " if (\$wordList[\$i]->{$_[1]} =~ m/$matchStr/) {\n"; # if: wordList $searchCmd .= "\$chinese = \$wordList[\$i]->{chinese};\$chinese2 = \$wordList[\$i]->{chinese2}; \$pinyin = \$wordList[\$i]->{pinyin}; \$english = \$wordList[\$i]->{english};"; $searchCmd .= "if (\$pyConvert) {\$pinyin2=\$wordList[\$i]->{pinyin};} else {\$pinyin2=\$wordList[\$i]->{pinyin2};} mark_pinyin(\$pinyin2);"; $searchCmd .= "print \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; if ($doLog) { $searchCmd .= "print LOGFP \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; } $searchCmd .= " \$foundMatch = 1;\n"; $searchCmd .= " }"; # if: wordList $searchCmd .= " }"; # unless: fastSeen $searchCmd .= "}"; # for: i if ($verbose) { print "searchCmd = `$searchCmd'\n"; } eval $searchCmd; if ($anchorMode eq "start") { if ($_[1] eq "chinese") {$_[1] = "chinese2"; $matchStr = "^$lookFor";} elsif ($_[1] eq "pinyin") {$_[1] = "pinyin2"; $matchStr = "^$lookFor"; } } elsif ($anchorMode eq "end") { # since Chinese characters are 2 bytes each, we need this little # bit of nonsense to avoid matching 2 bytes which straddle 2 # separate Chinese characters (it happens more than you might # expect) if ($_[1] eq "chinese") {$_[1] = "chinese2"; $matchStr = "^(..)*$lookFor\$"; } elsif ($_[1] eq "pinyin") {$_[1] = "pinyin2"; $matchStr = "(^|\\s)$lookFor\$"; } } elsif ($anchorMode eq "none") { if ($_[1] eq "chinese") {$_[1] = "chinese2"; $matchStr = "^(..)*$lookFor"; } elsif ($_[1] eq "pinyin") {$_[1] = "pinyin2"; $matchStr = "(^|\\s)$lookFor"; } } else { die "Unknown anchorMode `$anchorMode'\n"; } if (($_[1] eq "chinese2") or ($_[1] eq "pinyin2")) { # turn '/' into '\/' in search string, since we are going to eval # the pattern-match, rather than run it directly $matchStr =~ s@/@\\/@g; if ($verbose) {print "matchStr = `$matchStr'\n";} $searchCmd = ""; $searchCmd .= "for (\$i=0; \$i < $numVocabWords; \$i++) {\n"; # for: i $searchCmd .= " unless (\$fastSeen{\$i}) {\n"; # unless: fastSeen $searchCmd .= " if (\$wordList[\$i]->{$_[1]} =~ m/$matchStr/) {\n"; # if: wordList $searchCmd .= "\$chinese = \$wordList[\$i]->{chinese};\$chinese2 = \$wordList[\$i]->{chinese2}; \$pinyin = \$wordList[\$i]->{pinyin}; \$english = \$wordList[\$i]->{english};"; $searchCmd .= "if (\$pyConvert) {\$pinyin2=\$wordList[\$i]->{pinyin};} else {\$pinyin2=\$wordList[\$i]->{pinyin2};} mark_pinyin(\$pinyin2);"; $searchCmd .= "print \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; if ($doLog) { $searchCmd .= "print LOGFP \"\$chinese \$chinese2 \$pinyin \$pinyin2 \$english\\n\";"; } $searchCmd .= " \$foundMatch = 1;\n"; $searchCmd .= " }"; # if: wordList $searchCmd .= " }"; # unless: fastSeen $searchCmd .= "}"; # for: i if ($verbose) { print "searchCmd = `$searchCmd'\n"; } eval $searchCmd; } } elsif ($matchMode eq "shorter") { if (($_[1] ne "chinese") && ($_[1] ne "pinyin")) { print "`shorter' matchMode can only be used with chinese and pinyin lookups\n"; return; } $numMatchesFound = 0; if ($anchorMode eq "start") { $headStr = "^"; for ($i=0; $i < $numVocabWords; $i++) { unless ($fastSeen{$i}) { $matchStr = $headStr . $wordList[$i]->{$languageToUse}; if ($lookFor =~ m/$matchStr/) { $matchingWordList[$numMatchesFound] = \${wordList[$i]}; $numMatchesFound++; $foundMatch = 1; } } } } elsif ($anchorMode eq "end") { if ($_[1] eq "chinese") { $headStr = "^(..)*"; } else { $headStr = "(^|\\s)"; } for ($i=0; $i < $numVocabWords; $i++) { unless ($fastSeen{$i}) { $matchStr = $headStr . $wordList[$i]->{$languageToUse} . "\$"; if ($lookFor =~ m/$matchStr/) { $matchingWordList[$numMatchesFound] = \${wordList[$i]}; $numMatchesFound++; $foundMatch = 1; } } } } elsif ($anchorMode eq "none") { if ($_[1] eq "chinese") { $headStr = "^(..)*"; } else { $headStr = "(^|\\s)"; } for ($i=0; $i < $numVocabWords; $i++) { unless ($fastSeen{$i}) { $matchStr = $headStr . $wordList[$i]->{$languageToUse}; if ($lookFor =~ m/$matchStr/) { $matchingWordList[$numMatchesFound] = \${wordList[$i]}; $numMatchesFound++; $foundMatch = 1; } } } } # print out the matches sorted by length (we want longer matches # displayed last where they will be seen more easily, since they # are presumably closer to the whole phrase which we wanted to match). $lastChinese = ""; $lastChinese2 = ""; $lastPinyin = ""; $lastPinyin2 = ""; $lastEnglish = ""; foreach $i (sort byChineseLength @matchingWordList) { # If this entry is identical to the previous one, don't bother # printing it. Although for this to be maximally useful, # we should do alphabetical sorting first if (! ((${$i}->{chinese} eq $lastChinese) && (${$i}->{chinese2} eq $lastChinese2) && (${$i}->{pinyin} eq $lastPinyin) && (${$i}->{pinyin2} eq $lastPinyin2) && (${$i}->{english} eq $lastEnglish))) { print ${$i}->{"chinese"}, " ", ${$i}->{"chinese2"}, " ", ${$i}->{"pinyin"}, " ", ${$i}->{"pinyin2"}, " ", ${$i}->{"english"}, "\n"; if ($doLog) { print LOGFP ${$i}->{"chinese"}, " ", ${$i}->{"chinese2"}, " ", ${$i}->{"pinyin"}, " ", ${$i}->{"pinyin2"}, " ", ${$i}->{"english"}, "\n"; } $lastChinese = ${$i}->{chinese}; $lastChinese2 = ${$i}->{chinese2}; $lastPinyin = ${$i}->{pinyin}; $lastPinyin2 = ${$i}->{pinyin2}; $lastEnglish = ${$i}->{english}; } } } if ($foundMatch == 0) { print "No match found.\n"; } } sub prompt { if ($underEmacs) { print "====\n"; } else { print "\nEnter word (-h for help): "; } if ($doLog) { print LOGFP "\nEnter word (-h for help): "; } } sub mark_pinyin{ if ($pyConvert){ $word=$_[0]; #add spaces between compound pinyin words to separate them $word=~ s/1/1 /g;$word=~ s/2/2 /g;$word=~ s/3/3 /g;$word=~ s/4/4 /g;$word=~ s/5/5 /g; @words = split(' ', $word); # split string into words foreach $word (@words){ fix_pinyin($word); $word = $py_word; } $py = join(' ', @words); # join words back into one string if ($convert_input){return $py;} $pinyin2 = $py; } if ($pinyin_1) {$pinyin = "[$pinyin]"} else {$pinyin = ""} if ($pinyin_2) {$pinyin2 = "[$pinyin2]"} else {$pinyin2 = ""} if ($Chinese_1 == 0){$chinese = ""} if ($Chinese_2 == 0){$chinese2 = ""} } sub fix_pinyin { #Copyright 2003 by Forrest Cahoon (hanziquiz@abstractfactory.org), modified (2008) by Jos van Wolput (wolput@onsneteindhoven.nl) #This script converts anything that looks like ASCII pinyin with tone numbers #at the end of the word into utf-8 with proper pinyin tone marks. my %UTF8_PINYIN_TONES = (a => [ "\xc4\x81", "\xc3\xa1", "\xc7\x8e", "\xc3\xa0", "a" ], o => [ "\xc5\x8d", "\xc3\xb3", "\xc7\x92", "\xc3\xb2", "o" ], e => [ "\xc4\x93", "\xc3\xa9", "\xc4\x9b", "\xc3\xa8", "e" ], i => [ "\xc4\xab", "\xc3\xad", "\xc7\x90", "\xc3\xac", "i" ], u => [ "\xc5\xab", "\xc3\xba", "\xc7\x94", "\xc3\xb9", "u" ], v => [ "\xc7\x96", "\xc7\x98", "\xc7\x9a", "\xc7\x9c", "\xc3\xbc" ], A => [ "\xc4\x80", "\xc3\x81", "\xc7\x8d", "\xc3\x80", "A" ], O => [ "\xc5\x8c", "\xc3\x93", "\xc7\x91", "\xc3\x92", "O" ], E => [ "\xc4\x92", "\xc3\x89", "\xc4\x9a", "\xc3\x88", "E" ], I => [ "\xc4\xaa", "\xc3\x8d", "\xc7\x8f", "\xc3\x8c", "I" ], U => [ "\xc5\xaa", "\xc3\x9a", "\xc7\x93", "\xc3\x99", "U" ], V => [ "\xc7\x95", "\xc7\x97", "\xc7\x99", "\xc7\x9b", "\xc3\x9c" ]); my %FINAL_ACCENT_LETTERS = ("a" => "a", "ai" => "a", "an" => "a", "ang" => "a", "ao" => "a", "A" => "A", "AI" => "A", "AN" => "A", "ANG" => "A", "AO" => "A", "Ai" => "A", "An" => "A", "Ang" => "A", "Ao" => "A", "e" => "e", "ei" => "e", "en" => "e", "er" => "e", "eng" => "e", "E" => "E", "EI" => "E", "EN" => "E", "ER" => "E", "ENG" => "E", "Ei" => "E", "En" => "E", "Er" => "E", "Eng" => "E", "i" => "i", "ia" => "a", "ian" => "a", "iang" => "a", "iao" => "a", "I" => "I", "IA" => "A", "IAN" => "A", "IANG" => "A", "IAO" => "A", "ie" => "e", "in" => "i", "ing" => "i", "iong" => "o", "iu" => "u", "IE" => "E", "IN" => "I", "ING" => "I", "IONG" => "O", "IU" => "U", "o" => "o", "ong" => "o", "ou" => "o", "u" => "u", "ua" => "a", "O" => "O", "ONG" => "O", "OU" => "O", "Ou" => "O", "U" => "U", "UA" => "A", "uai" => "a", "uan" => "a", "uang" => "a", "ue" => "e", "ui" => "i", "UAI" => "A", "UAN" => "A", "UANG" => "A", "UE" => "E", "UI" => "I", "un" => "u", "uo" => "o", "v" => "v", "ve" => "e", "UN" => "U", "UO" => "O", "V" => "V", "VE" => "E"); my ($initial, $final, $tone, $accent_loc, $accented_letter); # if anything goes wrong, we return the original # word unchanged, so get a copy to work on. $py_word = $word; # Convert common representations of u with umlaut # ("u:", "uu" and the iso-8859-1 codepoint) # to our preferred internal representation "v" $word =~ s/(u[:u]|\xfc)/v/g; $word =~ s/(U[:U]|\xdc)/V/g; if ($word =~ /^([^aeiouvAEIOUV]*(\D+))(\d)$/) { $word = $1; $final = $2; $tone = $3; } else { return $py_word; } $accent_loc = $FINAL_ACCENT_LETTERS{$final}; if (!defined($accent_loc)) {return $py_word; } $accented_letter = $UTF8_PINYIN_TONES{$accent_loc}->[$tone - 1]; if (!defined($accented_letter)) {return $py_word; } $word =~ s/$accent_loc/$accented_letter/; # Finally, change any "v" to a proper utf8 u with umlaut: $word =~ s/v/\xc3\xbc/g; $py_word=$word; return $py_word; } ############## # Main program ############## my $thisTimeMatchMode; my $thisTimeAnchorMode; setdefaults(); # split apart arguments which have spaces in them; this is because when # this script is called from emacs, sometimes multiple arguments may # be packed into one string, separated by spaces (e.g. "-mm e"). @ARGV = split(" ", join(" ", @ARGV)); getargs(@ARGV); print "Chinese Vocabulary (Cedict) Lookup (Version 1.2.1-a.1, July 2008)\n"; print "cedictlookup comes with ABSOLUTELY NO WARRANTY.\n"; print "Enter `-license' for more information.\n"; print "Adapt the script options to your needs.\n"; use FileHandle; if ($doLog) { open(LOGFP, ">$logFname") or die "Couldn't open logfile `$logFname'\n"; select(LOGFP); $| = 1; # make unbuffered LOGFP->autoflush(1); print LOGFP "verbose = $verbose, underEmacs = $underEmacs, caseInsensitive = $caseInsensitive\n"; print LOGFP "vocabDir = `$vocabDir'\n"; print LOGFP "vocabFiles = `$vocabFiles'\n"; print LOGFP "matchMode = `$matchMode', anchorMode = `$anchorMode'\n"; } select(STDOUT); $| = 1; STDOUT->autoflush(1); print "cedictlookup: Reading Chinese vocabulary files...\n"; # Append "/" to vocabDir if it doesn't already have one if (! ($vocabDir =~ m@/$@)) { $vocabDir .= "/"; } if ($verbose) { printsettings(); } $vocabIndex = 0; foreach $vFile (split(/:/, $vocabFiles)) { $tmpStr = $vocabDir . $vFile; print "reading `$tmpStr'..."; if ($doLog) { print LOGFP "reading `$tmpStr'... "; } readvocabfile($tmpStr); } $numVocabWords = $vocabIndex; print "cedictlookup: Done reading vocabulary files.\n"; print "cedictlookup: Got $numVocabWords entries.\n"; prompt(); INPUTLOOP: while (defined($line = )) { chop $line; $line =~ s/^\s+//; # remove any leading spaces $line =~ s/\s+$//; # and trailing spaces if ($line eq "-h") { printOnlineHelp(); prompt(); next INPUTLOOP; } if ($line eq "q") { exit 0; } $thisTimeMatchMode = $matchMode; $thisTimeAnchorMode = $anchorMode; $thisTimeFastExactMatches = $fastExactMatches; $thisTimeContinueAfterFastMatch = $continueAfterFastMatch; if ($line eq "-license") { printLicense(); prompt(); next INPUTLOOP; } if ($line eq "-q") { print "matchMode = $matchMode, anchorMode = $anchorMode\n"; print "fastExactMatches = "; if ($fastExactMatches eq "") { print "(none)\n"; } else { if ($fastExactMatches =~ m/c/) { print "chinese "; } if ($fastExactMatches =~ m/p/) { print "pinyin "; } if ($continueAfterFastMatch) { print "+\n"; } else { print "\n"; } } prompt(); next INPUTLOOP; } while ($line =~ m/^(?=-)(?:-\w+\s+)*-(mm|am) (\S+)/) { if ($1 eq "mm") { if ($2 eq "e") { $thisTimeMatchMode = "exact"; } elsif ($2 eq "s") { $thisTimeMatchMode = "shorter"; } elsif ($2 eq "l") { $thisTimeMatchMode = "longer"; } $line =~ s/\-mm \w\s*//; } elsif ($1 eq "am") { if ($2 eq "s") { $thisTimeAnchorMode = "start"; } elsif ($2 eq "e") { $thisTimeAnchorMode = "end"; } elsif ($2 eq "n") { $thisTimeAnchorMode = "none"; } $line =~ s/\-am \w\s*//; } elsif (($1 eq "fe") || ($1 eq "fastexact")) { # This code will never actually get executed, because # the "fastexact" and "fe" flags aren't in the pattern match # above. Later I may allow the user to change the FastExact # mode in the middle of things, by building the hash table # if necessary. But not yet. $firstarg = $1; $secondarg = $2; if ($secondarg =~ s/0//) { $thisTimeFastExactMatches = ""; } else { $thisTimeFastExactMatches = ""; if ($secondarg =~ s/c//) { $thisTimeFastExactMatches .= "c"; } if ($secondarg =~ s/p//) { $thisTimeFastExactMatches .= "p"; } if ($secondarg =~ s/\+$//) {$thisTimeContinueAfterFastMatch=1;} if ($secondarg =~ s/\-$//) {$thisTimeContinueAfterFastMatch=0;} } if ($secondarg ne "") {print "Illegal option for `$firstarg' argument\n";} $line =~ s/\-$firstarg \S+\s*//; } else { die "That's funny, I shouldn't be able to die this way."; } } if ($verbose) { print "line = `$line'\n"; print "mm = $thisTimeMatchMode, am = $thisTimeAnchorMode, fastExactMatches = $thisTimeFastExactMatches, continueAfterFastMatch = $thisTimeContinueAfterFastMatch\n"; } if ($line eq "") { # special case -- if we encounter an empty line, set the local # values of matchMode and anchorMode to the global copies. This # allows you to enter e.g. the line "+mm l -am n" to "permanently" # set the matchMode to "longer", and the anchorMode to "none", # at least until the user changes them again. We also display # the new settings for matchMode and anchorMode. $matchMode = $thisTimeMatchMode; $anchorMode = $thisTimeAnchorMode; $fastExactMatches = $thisTimeFastExactMatches; $continueAfterFastMatch = $thisTimeContinueAfterFastMatch; print "matchMode = $matchMode, anchorMode = $anchorMode\n"; print "fastExactMatches = "; if ($fastExactMatches eq "") { print "(none)\n"; } else { if ($fastExactMatches =~ m/c/) { print "chinese "; } if ($fastExactMatches =~ m/p/) { print "pinyin "; } if ($continueAfterFastMatch) { print "+\n"; } else { print "\n"; } } prompt(); next INPUTLOOP; } $forgotTone = 0; $typeOfInput = classifyCPE($line); if ($line =~ m/^-/) { warn "Hmm, looks like you tried to specify a flag on the input,\n"; warn "but I didn't recognize it. Ignoring input; try again!\n"; warn "You can input '-h' for a short online help.\n"; prompt(); next INPUTLOOP; } if ($verbose) { print "typeOfInput = `$typeOfInput'\n"; print "line = `$line'\n"; } if ($doLog) { print LOGFP "typeOfInput = `$typeOfInput'\n"; print LOGFP "line = `$line'\n"; } if ($forgotTone && ($thisTimeMatchMode eq "shorter")) { print "0-tone (forgotten-tone) pinyin lookups currently do not\n"; print "work with Match Mode = Shorter, sorry.\n"; prompt(); next INPUTLOOP; } # timing code is commented out for now... # $t0 = new Benchmark; if ($typeOfInput eq "pinyin") { # Convert "uu" into "u:" or vice-versa in pinyin field, # if the user requested it. if ($uConvert = $uu2uc) { $line =~ s/v/uu/; $line =~ s/uu/u:/; } elsif ($uConvert = $uc2uu) { $line =~ s/v/uu/; $line =~ s/u:/uu/; } } if ($underEmacs) { print "Searching...\n"; } lookupWord($line, $typeOfInput, $thisTimeMatchMode, $thisTimeAnchorMode, $thisTimeFastExactMatches, $thisTimeContinueAfterFastMatch); # $t1 = new Benchmark; # $td = timediff($t1, $t0); # print "word lookup took: ", timestr($td), "\n"; prompt(); } sub printOnlineHelp { print "\nOnline Help:\n\n"; print "\tq Quit\n"; print "\t-q or Query current Match Mode and Anchor Mode\n"; print "\t-license Print license of this program\n"; print "\t-mm Set Match Mode, where can be\n"; print "\t e exact\n"; print "\t s shorter\n"; print "\t l longer\n"; print "\t-am Set Anchor Mode, where can be\n"; print "\t s start\n"; print "\t e end\n"; print "\t n none\n"; print "\t-wp pinyin word Pinyin wildcard search\n"; print "\t-we English word English wildcard search\n"; print "\n"; print "\tWhen tone is 0 in pinyin lookups, it works as a wildcard\n"; print "\tfor the tone (doesn't work with Match Mode = Shorter)\n"; }