summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@zytor.com>2013-02-26 03:10:15 (GMT)
committerH. Peter Anvin <hpa@zytor.com>2013-02-26 03:10:15 (GMT)
commit952ffec34648dcd77b15744b3d8c660112022c0d (patch)
tree3d75174e53d686668912965788a83b25fa2b6b67
parent1dd96271ad8fd8fb7661f27cbff8d425a8b9d698 (diff)
downloadvirtio9p-952ffec34648dcd77b15744b3d8c660112022c0d.zip
virtio9p-952ffec34648dcd77b15744b3d8c660112022c0d.tar.gz
virtio9p-952ffec34648dcd77b15744b3d8c660112022c0d.tar.bz2
virtio9p-952ffec34648dcd77b15744b3d8c660112022c0d.tar.xz
checkpoint: cp-to-utf conversion now includes case conversion
-rw-r--r--cp/Makefile4
-rwxr-xr-xcp/cptable.pl160
-rw-r--r--utf.S113
3 files changed, 236 insertions, 41 deletions
diff --git a/cp/Makefile b/cp/Makefile
index 90fa950..276f2f4 100644
--- a/cp/Makefile
+++ b/cp/Makefile
@@ -10,8 +10,8 @@ all: $(GENFILES)
# This generates codepage files where the display and filesystem
# codepages are both the same.
-%.uni: %.txt cptable.pl
- $(PERL) cptable.pl $< $@
+%.uni: %.txt cptable.pl UnicodeData
+ $(PERL) cptable.pl UnicodeData $< $< $@
default.uni: $(DEFAULT).uni
cp -f $< $@
diff --git a/cp/cptable.pl b/cp/cptable.pl
index 19e72a0..89c4040 100755
--- a/cp/cptable.pl
+++ b/cp/cptable.pl
@@ -11,62 +11,178 @@
# http://www.unicode.org/Public/UNIDATA/UCD.html
#
-($cpfs, $cpout) = @ARGV;
+($ucd, $cpco, $cpfs, $cpout) = @ARGV;
if (!defined($cpout)) {
- die "Usage: $0 UnicodeData cp.txt output.cp\n";
+ die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
}
+%ucase = ();
+%lcase = ();
+%tcase = ();
+%decomp = ();
+
+open(UCD, '<', $ucd)
+ or die "$0: could not open unicode data: $ucd: $!\n";
+while (defined($line = <UCD>)) {
+ chomp $line;
+ if ($line =~ /Name\:.*\bcp([0-9]+)\b/i) {
+ $cpnum = $1 + 0;
+ }
+ @f = split(/;/, $line);
+ $n = hex $f[0];
+ $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
+ $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
+ $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
+ if ($f[5] =~ /^[0-9A-F\s]+$/) {
+ # This character has a canonical decomposition.
+ # The regular expression rejects angle brackets, so other
+ # decompositions aren't permitted.
+ $decomp{$n} = [];
+ foreach my $dch (split(' ', $f[5])) {
+ push(@{$decomp{$n}}, hex $dch);
+ }
+ }
+}
+close(UCD);
+
#
-# Read the code pages
+# Filesystem and console codepages. The filesystem codepage is used
+# for FAT shortnames, whereas the console codepage is whatever is used
+# on the screen and keyboard.
#
@xtab = (undef) x 256;
%tabx = ();
open(CPFS, '<', $cpfs)
or die "$0: could not open fs codepage: $cpfs: $!\n";
while (defined($line = <CPFS>)) {
- if ($line =~ /\#.*Name\:.*\bcp([0-9]+)/i) {
- $cpnum = $1 + 0;
- next;
- }
$line =~ s/\s*(\#.*|)$//;
@f = split(/\s+/, $line);
next if (scalar @f != 2);
next if (hex $f[0] > 255);
$xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
- $tabx[hex $f[1]] = hex $f[0]; # Unicode -> Codepage
+ $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
}
close(CPFS);
-#
-# Check that ASCII is 1:1
-#
-for ($i = 0; $i < 128; $i++) {
- if ($xtab[$i] != $i) {
- die "$0: code page file $cpfs does not identity-map ASCII\n";
- }
+@ytab = (undef) x 256;
+%taby = ();
+open(CPCO, '<', $cpco)
+ or die "$0: could not open console codepage: $cpco: $!\n";
+while (defined($line = <CPCO>)) {
+ $line =~ s/\s*(\#.*|)$//;
+ @f = split(/\s+/, $line);
+ next if (scalar @f != 2);
+ next if (hex $f[0] > 255);
+ $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
+ $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
}
+close(CPCO);
open(CPOUT, '>', $cpout)
or die "$0: could not open output file: $cpout: $!\n";
+#
+# Magic number, in anticipation of being able to load these
+# files dynamically...
+#
+print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);
+# Header fields (some available for future use)
+print CPOUT pack("VVVVVV", $cpnum, 0, 0, 0, 0, 0);
+
+#
+# Self (shortname) uppercase table.
+# This depends both on the console codepage and the filesystem codepage;
+# the logical transcoding operation is:
+#
+# $tabx{$ucase{$ytab[$i]}}
#
-# Write a simple header
+# ... where @ytab is console codepage -> Unicode and
+# %tabx is Unicode -> filesystem codepage.
#
-print CPOUT pack("Vvv", 0x8740f99c, $cpnum, 0);
+@uctab = (undef) x 256;
+for ($i = 0; $i < 256; $i++) {
+ $uuc = $ucase{$ytab[$i]}; # Unicode upper case
+ if (defined($tabx{$uuc})) {
+ # Straight-forward conversion
+ $u = $tabx{$uuc};
+ } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
+ # Upper case equivalent stripped of accents
+ $u = $tabx{${$decomp{$uuc}}[0]};
+ } else {
+ # No equivalent at all found. Assume it is a lower-case-only
+ # character, like greek alpha in CP437.
+ $u = $i;
+ }
+ $uctab[$i] = $u;
+ print CPOUT pack("C", $u);
+}
+
+#
+# Self (shortname) lowercase table.
+# This depends both on the console codepage and the filesystem codepage;
+# the logical transcoding operation is:
+#
+# $taby{$lcase{$xtab[$i]}}
+#
+# ... where @ytab is console codepage -> Unicode and
+# %tabx is Unicode -> filesystem codepage.
+#
+@lctab = (undef) x 256;
+for ($i = 0; $i < 256; $i++) {
+ $llc = $lcase{$xtab[$i]}; # Unicode lower case
+ if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
+ # Straight-forward conversion
+ } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
+ # Lower case equivalent stripped of accents
+ } else {
+ # No equivalent at all found. Find *anything* that matches the
+ # bijection criterion...
+ for ($l = 0; $l < 256; $l++) {
+ last if ($uctab[$l] == $i);
+ }
+ $l = $i if ($l == 256); # If nothing, we're screwed anyway...
+ }
+ $lctab[$i] = $l;
+ print CPOUT pack("C", $l);
+}
#
# Unicode (longname) matching table.
# This only depends on the console codepage.
#
-for ($i = 128; $i < 256; $i++) {
- $p0 = $xtab[$i];
- if (!defined($xtab[$i]) || $p0 > 0xffff) {
- $p0 = 0xffff;
+$pp0 = ''; $pp1 = '';
+@lcbitmap = (0) x 32;
+
+for ($i = 0; $i < 256; $i++) {
+ if (!defined($ytab[$i])) {
+ $p0 = $p1 = 0xffff;
+ } else {
+ $p0 = $ytab[$i];
+ if ($ucase{$p0} != $p0) {
+ $p1 = $ucase{$p0};
+ } elsif ($lcase{$p0} != $p0) {
+ $p1 = $lcase{$p0};
+ $lcbitmap[$i >> 3] |= (1 << ($i & 7)) if ($p1 < 0xffff);
+ } elsif ($tcase{$p0} != $p0) {
+ $p1 = $tcase{$p0};
+ } else {
+ $p1 = $p0;
+ }
}
+ # Only the BMP is supported...
+ $p0 = 0xffff if ($p0 > 0xffff);
+ $p1 = 0xffff if ($p1 > 0xffff);
$pp0 .= pack("v", $p0);
+ $pp1 .= pack("v", $p1);
}
-print CPOUT $pp0;
+print CPOUT $pp0, $pp1;
+
+#
+# Finally, bitmap indicating which altcharacters are lower case
+#
+print CPOUT pack("C*", @lcbitmap);
+
close (CPOUT);
diff --git a/utf.S b/utf.S
index ce5a7e0..fcf77f5 100644
--- a/utf.S
+++ b/utf.S
@@ -1,30 +1,35 @@
+#include "v9fs.h"
+
.code16
.section ".rtext","ax"
/* ------------------------------------------------------------------------- *
* cptoutf
*
- * Convert a codepage byte in AL to UTF-8 at ES:DI, incrementing DI
- * Destroys AX
+ * Convert a codepage byte in AL to UTF-8 at ES:DI, incrementing DI.
+ * Destroys AX.
+ *
+ * This is also used for case conversion, so we do a table lookup even
+ * for plain ASCII characters.
* ------------------------------------------------------------------------- */
.globl cptoutf
cptoutf:
- and %al,%al
- js 1f /* If in ASCII range, this is easy */
+ pushw %bx
+ movzbw %al,%bx
+ addw %bx,%bx
+ movw cptoutftbl(%bx),%ax
+ cmpw $0x7f,%ax
+ ja 1f
stosb
+ popw %bx
retw
1:
- pushw %ax
- pushw %bx
- movzbw %al,%bx
- addb %bl,%bl /* This drops the top bit */
- movw cptoutftbl(%bx),%bx
- cmpw $0x800,%bx
- jae 2f
+ movw %ax,%bx
+ cmpw $0x7ff,%ax
+ ja 2f
/* Two-byte sequence */
- movw %bx,%ax
shrw $6,%ax
orb $0xc0,%al
stosb
@@ -32,7 +37,6 @@ cptoutf:
2:
/* Three-byte sequence */
- movw %bx,%ax
shrw $12,%ax
orb $0xe0,%al
stosb
@@ -49,9 +53,12 @@ cptoutf:
andb $0x3f,%al
orb $0x80,%al
stosb
-
+ popw %bx
retw
+ lrbuf cptoutftbl, 512+32, 2
+canonmap = cptoutftbl + 512
+
/* ------------------------------------------------------------------------- *
* Unicode table
*
@@ -59,7 +66,8 @@ cptoutf:
* by data loaded in from a file at install time if necessary
* ------------------------------------------------------------------------- */
- .section ".rdata","aw"
+
+ .section ".data","aw"
.globl unifile
.balign 2
unifile:
@@ -67,5 +75,76 @@ unifile:
.size unifile,.-unifile
.type unifile,@object
-codepage = unifile + 4
-cptoutftbl = unifile + 8
+/* The unifile has a 32-byte header */
+uctable = unifile + 32 /* Codepage upper-case table */
+lctable = uctable + 256 /* Codepage lower-case table */
+unitable = lctable + 256 /* Codepage to Unicode table */
+unicasetable = unitable + 2*256 /* Codepage case-swap table */
+lcbitmap = unicasetable + 2*256 /* Case-swap is to lower case */
+
+/* ------------------------------------------------------------------------- *
+ * Routines to fill in cptoutftbl
+ *
+ * The first time we simply copy unitable - this is used for the tag.
+ * After the tag is parsed, we create the case-converting table.
+ *
+ * XXX: These routines should query the system code page and load the
+ * appropriate conversion table if not the compiled-in one.
+ * ------------------------------------------------------------------------- */
+
+ .section ".text","ax"
+
+ .globl utf_init_plain
+utf_init_plain:
+ movw $unitable,%si
+ movw $cptoutftbl,%di
+ movw $(512/4),%cx
+ rep movsl
+ retw
+
+ .size utf_init_plain,.-utf_init_plain
+ .type utf_init_plain,@function
+
+ .globl utf_init_case
+
+ libuf uctbl, 5, 1
+
+utf_init_case:
+ pushaw
+
+ cmpw $0x31e,dos_version /* Use case table from DOS? */
+ jb 1f
+
+ movw $-1,%dx /* Current country */
+ movw %dx,%bx /* Current code page (XXX) */
+ movw $5,%cx /* Size of buffer */
+ movw $uctbl,%di
+ int $0x21
+ jc 1f
+
+ lfsw uctbl+1,%si
+ movw $uctable+128,%di
+ movw $(128/4),%cx
+ fs rep movsl
+1:
+ xorw %bx,%bx
+ movw $cptoutftbl,%di
+2:
+ movzbw uctable(%bx),%si
+ btrw %bx,canonmap
+ cmpw %bx,%si /* Is this canonical? */
+ jne 3f
+ btsw %bx,canonmap
+3:
+ btw %si,lcbitmap /* Is alternate case lower case? */
+ jnc 4f
+ addw $256,%si /* Use the alternate case table */
+4:
+ addw %si,%si
+ movw unitable(%si),%ax
+ stosw
+ incb %bl
+ jnz 2b
+
+ popaw
+ retw