package String::Multibyte::UHC;

use vars qw($VERSION);
$VERSION = '1.12';

# Charset definition for UHC.  The file evaluates to a hash reference with
# four entries: the charset name, a pattern matching exactly one character,
# a character comparator, and a successor function.
#
# HANGUL SYLLABLE 11172
#   [\x81-\xA0][\x41-\x5A\x61-\x7A\x81-\xFE]    32 * 178 = 5696
#   [\xA1-\xAF][\x41-\x5A\x61-\x7A\x81-\xA0]    15 *  84 = 1260
#   [\xB0-\xC5][\x41-\x5A\x61-\x7A\x81-\xFE]    22 * 178 = 3916
#   \xC6[\x41-\x52\xA1-\xFE]                    18 +  94 =  112
#   [\xC7-\xC8][\xA1-\xFE]                       2 *  94 =  188

# Pattern source for one character.  Kept as a single-quoted string so the
# \xHH escapes reach the regex engine literally.
my $one_char = '(?:' . join('|',
    '[\x00-\x7F]',                                  # single byte
    '[\x81-\xC5][\x41-\x5A\x61-\x7A\x81-\xFE]',     # lead 0x81..0xC5
    '\xC6[\x41-\x52\xA1-\xFE]',                     # lead 0xC6
    '[\xC7-\xFE][\xA1-\xFE]',                       # lead 0xC7..0xFE
) . ')';

# Order two characters by plain string comparison.
my $compare = sub {
    my ($left, $right) = @_;
    return $left cmp $right;
};

# Return the character that follows the given one in the charset order.
# Yields undef after "\xFE\xFE", and nothing at all for an argument that is
# neither one nor two bytes long.
my $successor = sub {
    my ($char) = @_;
    my $width  = length $char;

    if ($width == 1) {
        # The last single byte rolls over to the first double-byte character.
        return "\x81\x41" if $char eq "\x7F";
        return chr(1 + ord $char);
    }

    return unless $width == 2;

    my ($lead, $trail) = unpack 'CC', $char;

    # Explicit undef (a one-element list in list context), unlike the bare
    # return above; callers see exactly what they always have.
    return undef if $char eq "\xFE\xFE";

    # Row 0xC6 has a shorter first trail range: 0x52 is followed by 0xA1.
    return "\xC6\xA1" if $char eq "\xC6\x52";

    if ($trail == 0xFE) {
        # End of a row: move to the next lead byte.  Rows up to 0xC6 start
        # at trail 0x41, later rows start at trail 0xA1.
        my $first_trail = $lead < 0xC6 ? 0x41 : 0xA1;
        return pack('CC', $lead + 1, $first_trail);
    }

    # Skip the gaps between the trail ranges.
    return chr($lead) . "\x61" if $trail == 0x5A;
    return chr($lead) . "\x81" if $trail == 0x7A;

    return pack('CC', $lead, $trail + 1);
};

+{
    charset  => 'UHC',
    regexp   => $one_char,
    cmpchar  => $compare,
    nextchar => $successor,
};

__END__

=head1 NAME

String::Multibyte::UHC - internally used by String::Multibyte for UHC

=head1 SYNOPSIS

    use String::Multibyte;

    $uhc = String::Multibyte->new('UHC');
    $uhc_length = $uhc->length($uhc_string);

=head1 DESCRIPTION

C<String::Multibyte::UHC> is used for manipulation of strings in UHC
(Unified Hangul Code).

Byte range of single-byte characters:
C<0x00..0x7F>.

Leading byte range of double-byte characters:
C<0x81..0xFE>.

Trailing byte range of double-byte characters:
C<0x41..0x5A>, C<0x61..0x7A>, and C<0x81..0xFE>.

Unassigned code points out of EUC-KR are disabled.

Character order (invalid code points are excluded):
C<0x00..0x7F>, C<0x8141..0xFEFE>.

=head1 SEE ALSO

L<String::Multibyte>

=cut