I tried
Chat_GLM (https://chat.z.ai/)
today, so here are some results. The times when you had to code these by hand are over.
However, all of it is untested, so you will need to test it first.
UTF-16 to UTF-8
'######################################################################################################################
'######################################################################################################################
' ===============================================================================
' SUB: ASM_UTF16ToUTF8
' PURPOSE: Convert UTF-16 string to UTF-8 with full special case handling
' PARAMS:
' U01 (STRING In/Out, BYREF) - String to convert (UTF-16 encoded)
' NOTES:
' - Handles surrogate pairs for characters beyond BMP
' - Properly validates UTF-16 sequences
' - Uses assembly for maximum performance
' - Builds the result in a temporary buffer and then assigns it back to U01 (not in-place)
' - Handles all Unicode code points (U+0000 to U+10FFFF)
' ===============================================================================
' UTF-16 constants
' Four-digit hex literals whose top bit is set are written with a leading
' zero so that they are taken as unsigned values. Without it such a literal
' is read as a negative 16-bit integer, which sign-extends when the equate
' is used as a 32-bit immediate (e.g. "! sub eax, %HIGH_SURROGATE_START").
%HIGH_SURROGATE_START = &H0D800  ' first high (lead) surrogate code unit
%HIGH_SURROGATE_END   = &H0DBFF  ' last high (lead) surrogate code unit
%LOW_SURROGATE_START  = &H0DC00  ' first low (trail) surrogate code unit
%LOW_SURROGATE_END    = &H0DFFF  ' last low (trail) surrogate code unit
%UNICODE_MAX_BMP      = &H0FFFF  ' largest code point representable in one UTF-16 unit
%UNICODE_MAX          = &H10FFFF ' largest Unicode code point
%REPLACEMENT_CHAR     = &H0FFFD  ' U+FFFD, emitted for malformed input
' UTF-8 bit patterns (lead-byte prefixes and the continuation-byte prefix)
%UTF8_1BYTE = &H00 ' 0xxxxxxx
%UTF8_2BYTE = &HC0 ' 110xxxxx
%UTF8_3BYTE = &HE0 ' 1110xxxx
%UTF8_4BYTE = &HF0 ' 11110xxx
%UTF8_CONT  = &H80 ' 10xxxxxx
SUB ASM_UTF16ToUTF8(BYREF U01 AS STRING)
    ' Converts the UTF-16 (little-endian) code units stored in the bytes of U01
    ' to UTF-8 and assigns the result back to U01.
    '
    ' U01 (In/Out) - byte string holding UTF-16 code units; replaced by UTF-8.
    '
    ' Behaviour:
    '   - An empty U01 is left untouched.
    '   - A trailing odd byte is ignored; if that leaves nothing, U01 becomes "".
    '   - A valid high+low surrogate pair becomes one 4-byte sequence.
    '   - Any unpaired surrogate (a high one without a following low one, or a
    '     low one on its own) is replaced by U+FFFD.
    '
    ' Surrogate equates are only ever used with 16-bit operands here, so the
    ' routine gives the same result whether the assembler sees them as signed
    ' or unsigned 16-bit values.
    #REGISTER NONE
    LOCAL T01 AS LONG    ' input length in bytes, rounded down to an even count
    LOCAL T02 AS LONG    ' capacity of the output buffer in bytes
    LOCAL T03 AS DWORD   ' address of the first input byte
    LOCAL T04 AS DWORD   ' address of the first output byte
    LOCAL T05 AS LONG    ' number of UTF-8 bytes produced
    LOCAL S01 AS STRING  ' output buffer

    T01 = LEN(U01)
    IF T01 = 0 THEN
        EXIT SUB
    END IF

    ' An odd byte count cannot be whole UTF-16 units: drop the last byte.
    IF (T01 AND 1) THEN
        T01 = T01 - 1
        IF T01 = 0 THEN
            U01 = ""
            EXIT SUB
        END IF
    END IF

    ' Worst case is 3 output bytes per input unit (a surrogate pair uses two
    ' units for 4 bytes). The parentheses are required: "*" binds tighter
    ' than "\", so T01 \ 2 * 3 would evaluate as T01 \ 6.
    T02 = (T01 \ 2) * 3
    S01 = STRING$(T02, 0)

    T03 = STRPTR(U01)
    T04 = STRPTR(S01)
    T05 = 0

    ' Register roles inside the loop:
    '   esi = next input unit      edi = next output byte
    '   ecx = input bytes left     eax = current code point
    '   ebx = scratch
    ! push esi
    ! push edi
    ! push ebx
    ! mov esi, T03
    ! mov edi, T04
    ! mov ecx, T01

Lab_ConvertLoop:
    ! cmp ecx, 2
    ! jb Lab_Done

    ! movzx eax, word ptr [esi]       ; eax = current unit, upper half zero
    ! add esi, 2
    ! sub ecx, 2

    ' Anything outside D800..DFFF is an ordinary BMP character.
    ! cmp ax, %HIGH_SURROGATE_START
    ! jb Lab_EncodeBmp
    ! cmp ax, %LOW_SURROGATE_END
    ! ja Lab_EncodeBmp

    ' DC00..DFFF here means a low surrogate with no preceding high surrogate.
    ! cmp ax, %HIGH_SURROGATE_END
    ! ja Lab_Replace

    ' High surrogate: a low surrogate must follow.
    ! cmp ecx, 2
    ! jb Lab_Replace                  ; input ends after the high surrogate
    ! movzx ebx, word ptr [esi]
    ! cmp bx, %LOW_SURROGATE_START
    ! jb Lab_Replace
    ! cmp bx, %LOW_SURROGATE_END
    ! ja Lab_Replace

    ' Valid pair: consume the low surrogate and build the code point
    ' ((high - D800) << 10 | (low - DC00)) + 10000h.
    ! add esi, 2
    ! sub ecx, 2
    ! sub ax, %HIGH_SURROGATE_START   ; 0..3FF, upper half of eax stays zero
    ! shl eax, 10
    ! sub bx, %LOW_SURROGATE_START    ; 0..3FF, upper half of ebx stays zero
    ! or eax, ebx
    ! add eax, &H10000
    ! jmp Lab_Encode4Byte

Lab_Replace:
    ' Malformed sequence: emit U+FFFD. Only the offending unit has been
    ' consumed, so the following unit is examined on its own next time round.
    ! xor eax, eax
    ! mov ax, %REPLACEMENT_CHAR
    ! jmp Lab_Encode3Byte

Lab_EncodeBmp:
    ! cmp eax, &H7F
    ! ja Lab_Check2Byte
    ' 1 byte: 0xxxxxxx
    ! mov [edi], al
    ! inc edi
    ! jmp Lab_ConvertLoop

Lab_Check2Byte:
    ! cmp eax, &H7FF
    ! ja Lab_Encode3Byte
    ' 2 bytes: 110xxxxx 10xxxxxx
    ! mov ebx, eax
    ! shr eax, 6
    ! or al, %UTF8_2BYTE
    ! mov [edi], al
    ! and bl, &H3F
    ! or bl, %UTF8_CONT
    ! mov [edi+1], bl
    ! add edi, 2
    ! jmp Lab_ConvertLoop

Lab_Encode3Byte:
    ' 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx   (eax <= FFFFh here)
    ! mov ebx, eax
    ! shr eax, 12
    ! or al, %UTF8_3BYTE
    ! mov [edi], al
    ! mov eax, ebx
    ! shr eax, 6
    ! and al, &H3F
    ! or al, %UTF8_CONT
    ! mov [edi+1], al
    ! and bl, &H3F
    ! or bl, %UTF8_CONT
    ! mov [edi+2], bl
    ! add edi, 3
    ! jmp Lab_ConvertLoop

Lab_Encode4Byte:
    ' 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    ! mov ebx, eax
    ! shr eax, 18
    ! or al, %UTF8_4BYTE
    ! mov [edi], al
    ! mov eax, ebx
    ! shr eax, 12
    ! and al, &H3F
    ! or al, %UTF8_CONT
    ! mov [edi+1], al
    ! mov eax, ebx
    ! shr eax, 6
    ! and al, &H3F
    ! or al, %UTF8_CONT
    ! mov [edi+2], al
    ! and bl, &H3F
    ! or bl, %UTF8_CONT
    ! mov [edi+3], bl
    ! add edi, 4
    ! jmp Lab_ConvertLoop

Lab_Done:
    ' Bytes produced = distance the output pointer has travelled.
    ! sub edi, T04
    ! mov T05, edi
    ! pop ebx
    ! pop edi
    ! pop esi

    ' Trim the worst-case buffer down to what was actually written.
    U01 = LEFT$(S01, T05)
END SUB
Unicode to ANSI
' ===============================================================================
' SUB: ASM_UnicodeToAnsi
' PURPOSE: Convert Unicode string to ANSI with full special case handling
' PARAMS:
' U01 (STRING In/Out, BYREF) - String to convert (UTF-16 encoded)
' NOTES:
' - Handles surrogate pairs for characters beyond BMP
' - Properly validates UTF-16 sequences
' - Maps common Unicode characters to ANSI equivalents
' - Uses assembly for maximum performance
' - Builds the result in a temporary buffer and then assigns it back to U01 (not in-place)
' ===============================================================================
SUB ASM_UnicodeToAnsi(BYREF U01 AS STRING)
    ' Converts the UTF-16 (little-endian) code units stored in the bytes of U01
    ' to single-byte characters and assigns the result back to U01.
    '
    ' U01 (In/Out) - byte string holding UTF-16 code units; replaced by the
    '                converted bytes.
    '
    ' Behaviour:
    '   - An empty U01 is left untouched.
    '   - A trailing odd byte is ignored; if that leaves nothing, U01 becomes "".
    '   - A valid surrogate pair is combined into one code point; an unpaired
    '     surrogate is processed with its own 16-bit value as the code point.
    '   - %EURO_SIGN -> %EURO_ANSI, %BULLET -> %BULLET_ANSI,
    '     %ELLIPSIS -> three %DOT_CHAR bytes.
    '   - Other code points up to %ANSI_MAX are stored as their low byte.
    '   - Everything else is stored as "?" (&H3F).
    '
    ' NOTE(review): %EURO_SIGN, %BULLET, %ELLIPSIS, %ANSI_MAX, %EURO_ANSI,
    ' %BULLET_ANSI and %DOT_CHAR are not defined in the part of the file shown
    ' with this routine - confirm that they are defined elsewhere and that
    ' their values match the target code page.
    #REGISTER NONE
    LOCAL T01 AS LONG    ' input length in bytes, rounded down to an even count
    LOCAL T02 AS LONG    ' capacity of the output buffer in bytes
    LOCAL T03 AS DWORD   ' address of the first input byte
    LOCAL T04 AS DWORD   ' address of the first output byte
    LOCAL T05 AS LONG    ' number of output bytes produced
    LOCAL S01 AS STRING  ' output buffer

    T01 = LEN(U01)
    IF T01 = 0 THEN
        EXIT SUB
    END IF

    ' An odd byte count cannot be whole UTF-16 units: drop the last byte.
    IF (T01 AND 1) THEN
        T01 = T01 - 1
        IF T01 = 0 THEN
            U01 = ""
            EXIT SUB
        END IF
    END IF

    ' Worst case is 3 output bytes per input unit (the ellipsis expansion).
    ' The parentheses are required: "*" binds tighter than "\", so
    ' T01 \ 2 * 3 would evaluate as T01 \ 6. Because the buffer covers the
    ' worst case, no per-character space check is needed in the loop.
    T02 = (T01 \ 2) * 3
    S01 = STRING$(T02, 0)

    T03 = STRPTR(U01)
    T04 = STRPTR(S01)
    T05 = 0

    ' Register roles inside the loop:
    '   esi = next input unit      edi = next output byte
    '   ecx = input bytes left     eax = current code point / output byte in al
    '   ebx = scratch
    ! push esi
    ! push edi
    ! push ebx
    ! mov esi, T03
    ! mov edi, T04
    ! mov ecx, T01

Lab_ConvertLoop:
    ! cmp ecx, 2
    ! jb Lab_Done

    ! movzx eax, word ptr [esi]       ; eax = current unit, upper half zero
    ! add esi, 2
    ! sub ecx, 2

    ' Only a high surrogate followed by a low surrogate is combined; in every
    ' other case the unit itself is the code point. The surrogate equates are
    ' used with 16-bit operands only, so the result is the same whether the
    ' assembler sees them as signed or unsigned 16-bit values.
    ! cmp ax, %HIGH_SURROGATE_START
    ! jb Lab_ProcessCodePoint
    ! cmp ax, %HIGH_SURROGATE_END
    ! ja Lab_ProcessCodePoint
    ! cmp ecx, 2
    ! jb Lab_ProcessCodePoint         ; input ends after the high surrogate
    ! movzx ebx, word ptr [esi]
    ! cmp bx, %LOW_SURROGATE_START
    ! jb Lab_ProcessCodePoint
    ! cmp bx, %LOW_SURROGATE_END
    ! ja Lab_ProcessCodePoint

    ' Valid pair: consume the low surrogate and build the code point
    ' ((high - D800) << 10 | (low - DC00)) + 10000h.
    ! add esi, 2
    ! sub ecx, 2
    ! sub ax, %HIGH_SURROGATE_START   ; 0..3FF, upper half of eax stays zero
    ! shl eax, 10
    ! sub bx, %LOW_SURROGATE_START    ; 0..3FF, upper half of ebx stays zero
    ! or eax, ebx
    ! add eax, &H10000

Lab_ProcessCodePoint:
    ' Special mappings are checked before the range test.
    ! cmp eax, %EURO_SIGN
    ! je Lab_EuroSign
    ! cmp eax, %BULLET
    ! je Lab_Bullet
    ! cmp eax, %ELLIPSIS
    ! je Lab_Ellipsis
    ! cmp eax, %ANSI_MAX
    ! ja Lab_NonAnsi
    ! jmp Lab_StoreChar               ; in range: al already holds the byte

Lab_EuroSign:
    ! mov al, %EURO_ANSI
    ! jmp Lab_StoreChar

Lab_Bullet:
    ! mov al, %BULLET_ANSI
    ! jmp Lab_StoreChar

Lab_Ellipsis:
    ' One input character expands to three dots.
    ! mov byte ptr [edi], %DOT_CHAR
    ! mov byte ptr [edi+1], %DOT_CHAR
    ! mov byte ptr [edi+2], %DOT_CHAR
    ! add edi, 3
    ! jmp Lab_ConvertLoop

Lab_NonAnsi:
    ' No single-byte equivalent: store "?". (%REPLACEMENT_CHAR is the 16-bit
    ' value U+FFFD and cannot be stored in one byte.)
    ! mov al, &H3F

Lab_StoreChar:
    ! mov [edi], al
    ! inc edi
    ! jmp Lab_ConvertLoop

Lab_Done:
    ' Bytes produced = distance the output pointer has travelled.
    ! sub edi, T04
    ! mov T05, edi
    ! pop ebx
    ! pop edi
    ! pop esi

    ' Trim the worst-case buffer down to what was actually written.
    U01 = LEFT$(S01, T05)
END SUB