summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@zytor.com>2013-02-26 04:01:49 (GMT)
committerH. Peter Anvin <hpa@zytor.com>2013-02-26 04:01:49 (GMT)
commit29fe360520f99651726b8ea1050ca1070ae4b497 (patch)
tree709924b345cda4b7f83ad129a6cbc922e2e477f4
parent952ffec34648dcd77b15744b3d8c660112022c0d (diff)
downloadvirtio9p-29fe360520f99651726b8ea1050ca1070ae4b497.zip
virtio9p-29fe360520f99651726b8ea1050ca1070ae4b497.tar.gz
virtio9p-29fe360520f99651726b8ea1050ca1070ae4b497.tar.bz2
virtio9p-29fe360520f99651726b8ea1050ca1070ae4b497.tar.xz
utf: proper conversions to and from Unicode, including case
Convert to and from Unicode, including keeping track of whether a character is canonical.
-rw-r--r--utf.S82
1 files changed, 81 insertions, 1 deletions
diff --git a/utf.S b/utf.S
index fcf77f5..c514ab2 100644
--- a/utf.S
+++ b/utf.S
@@ -56,6 +56,86 @@ cptoutf:
popw %bx
retw
+ .size cptoutf,.-cptoutf
+ .type cptoutf,@function
+
+/* ------------------------------------------------------------------------- *
+ * utftocp
+ *
+ * Convert a UTF-8 sequence in DS:SI to codepage form in AL.
+ *
+ * In:       DS:SI -> first byte of a UTF-8 sequence (1 to 3 bytes decoded)
+ * Out:      CF=0, AX = codepage character (AH = 0), SI advanced past the
+ *           sequence
+ *           CF=1 (AX and SI undefined) for any error, unknown or
+ *           noncanonical character
+ * Clobbers: DI, flags.  CX and DX are preserved.
+ *
+ * The table is searched with SCASW, which addresses it through ES:DI,
+ * while canonmap is read through DS; this presumably relies on ES == DS
+ * and on DF = 0 -- TODO confirm against the callers.
+ *
+ * For example, U+0041 ('A') is noncanonical, because a codepage 'A'
+ * gets converted to lowercase and transmitted as U+0061 ('a').
+ * ------------------------------------------------------------------------- */
+
+utftocp:
+	pushw %cx
+	pushw %dx
+
+	xorw %ax,%ax
+	xorw %cx,%cx		/* CX = smallest code point allowed for this length */
+	xorw %dx,%dx		/* DX = accumulator for the high-order bits */
+
+	lodsb
+	andb %al,%al
+	jns 1f			/* 1-byte sequence */
+
+	cmpb $0xc2,%al		/* 0x80-0xbf: stray continuation, 0xc0-0xc1: overlong */
+	jb 99f			/* Error! */
+
+	xorb $0xc0,%al
+	cmpb $0x20,%al
+	jb 2f			/* 2-byte sequence */
+
+	movb $0x08,%ch		/* U+0800 minimum */
+	xorb $0x20,%al
+	cmpb $0x10,%al
+	jb 3f			/* 3-byte sequence */
+99:
+	stc			/* Common error exit: CF=1 */
+	popw %dx
+	popw %cx
+	retw
+3:
+	movw %ax,%dx		/* Low 4 bits of the lead byte */
+	shlw $6,%dx
+	lodsb
+	xorb $0x80,%al		/* Continuation bytes must be 0x80-0xbf */
+	cmpb $0x40,%al
+	jae 99b
+
+2:
+	addw %ax,%dx		/* Merge in the 6 (or 5, for a lead byte) new bits */
+	shlw $6,%dx
+	lodsb
+	xorb $0x80,%al		/* Continuation bytes must be 0x80-0xbf */
+	cmpb $0x40,%al
+	jae 99b
+	addw %dx,%ax
+1:
+	cmpw %cx,%ax
+	jb 99b			/* Overlong sequence */
+
+	/* Now %ax has the Unicode code point */
+	movw $cptoutftbl,%di
+	movb $0x01,%ch		/* CL is 0 here, so CX = 256 table entries */
+91:
+	repne scasw
+	jne 99b			/* Table exhausted without a match: unknown */
+
+	notb %cl		/* CX = 255 - (entries left) = index of the match */
+	btw %cx,canonmap
+	jc 98f			/* 1 in map = canonical = it's good! */
+	notb %cl		/* Back to the number of entries left */
+	testw %cx,%cx		/* NOT leaves the flags alone, so test explicitly */
+	jnz 91b			/* The same code point may occur again further on */
+	jmp 99b
+98:
+	movw %cx,%ax
+	clc			/* BT left CF=1; success has to return CF=0 */
+	popw %dx
+	popw %cx
+	retw
+	.size utftocp,.-utftocp
+	.type utftocp,@function
+
lrbuf cptoutftbl, 512+32, 2
canonmap = cptoutftbl + 512
@@ -134,7 +214,7 @@ utf_init_case:
btrw %bx,canonmap
cmpw %bx,%si /* Is this canonical? */
jne 3f
- btsw %bx,canonmap
+ btsw %bx,canonmap /* 1 in map = canonical */
3:
btw %si,lcbitmap /* Is alternate case lower case? */
jnc 4f