aboutsummaryrefslogtreecommitdiffstats
path: root/codepage
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@zytor.com>2008-06-16 00:18:48 -0700
committerH. Peter Anvin <hpa@zytor.com>2008-06-16 00:18:48 -0700
commit21edbdacf38c9eddc9dcca5c555a8a587f9df6fb (patch)
treed40ee22cce6d5e279c3c8ea812fd294c70589320 /codepage
parent52ff420fba565c4803cd43260c2d36189ff3d71a (diff)
downloadsyslinux-elf-21edbdacf38c9eddc9dcca5c555a8a587f9df6fb.tar.gz
syslinux-elf-21edbdacf38c9eddc9dcca5c555a8a587f9df6fb.tar.xz
syslinux-elf-21edbdacf38c9eddc9dcca5c555a8a587f9df6fb.zip
codepage: strip accents during shortname upper-casing
If we don't have a direct upper-case equivalent to a character, but we *do* have the "naked" version of the upper-case character (defined as the first character of the canonical Unicode decomposition of the case-mapped character), then use it for the intracodepage table (used for shortnames.) This matches DOS behaviour. No obvious way, yet, to handle the fact that DOS doesn't uppercase the lowercase sigma, but that's not a huge problem; besides, lowercase sigma is mapped to position 0xE5, which would require dealing with the special handling of this character in the first position.
Diffstat (limited to 'codepage')
-rwxr-xr-xcodepage/cptable.pl25
1 files changed, 23 insertions, 2 deletions
diff --git a/codepage/cptable.pl b/codepage/cptable.pl
index 44c710ce..9a9a13da 100755
--- a/codepage/cptable.pl
+++ b/codepage/cptable.pl
@@ -7,12 +7,16 @@
# Usage:
# cptable.pl UnicodeData cpXXX.txt cpXXX.bin
#
+# Note: for the format of the UnicodeData file, see:
+# http://www.unicode.org/Public/UNIDATA/UCD.html
+#
($ucd, $cpin, $cpout) = @ARGV;
%ucase = ();
%lcase = ();
%tcase = ();
+%decomp = ();
open(UCD, '<', $ucd) or die;
while (defined($line = <UCD>)) {
@@ -22,6 +26,15 @@ while (defined($line = <UCD>)) {
$ucase{$n} = hex $f[12] if ($f[12] ne '');
$lcase{$n} = hex $f[13] if ($f[13] ne '');
$tcase{$n} = hex $f[14] if ($f[14] ne '');
+ if ($f[5] =~ /^[0-9A-F\s]+$/) {
+ # This character has a canonical decomposition.
+ # The regular expression rejects angle brackets, so other
+ # decompositions aren't permitted.
+ $decomp{$n} = [];
+ foreach my $dch (split(' ', $f[5])) {
+ push(@{$decomp{$n}}, hex $dch);
+ }
+ }
}
close(UCD);
@@ -53,8 +66,16 @@ print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
# Self (shortname) uppercase table
#
for ($i = 0; $i < 256; $i++) {
- $u = $tabx{$ucase{$xtab[$i]}};
- $u = $i unless (defined($u));
+ $uuc = $ucase{$xtab[$i]}; # Unicode upper case
+ if (defined($tabx{$uuc})) {
+ # Straight-forward conversion
+ $u = $tabx{$uuc};
+ } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
+ # Upper case equivalent stripped of accents
+ $u = $tabx{${$decomp{$uuc}}[0]};
+ } else {
+ $u = $i;
+ }
print CPOUT pack("C", $u);
}