Interactive PowerBasic Forum

IT-Berater: Theo Gottwald (IT-Consultant) => Low Level Code Optimization => Low Level Code Optimization PB 10 => Topic started by: Theo Gottwald on September 20, 2025, 09:46:47 PM

Title: UNICODE-ASM Replacements
Post by: Theo Gottwald on September 20, 2025, 09:46:47 PM
I tried
Chat_GLM (https://chat.z.ai/)
today, so here are some results. The times when you had to code these by hand are over.

However, all of this is untested, so you will need to test it first.

UTF-16 to UTF-8
'######################################################################################################################
'######################################################################################################################
' ===============================================================================
' SUB: ASM_UTF16ToUTF8
' PURPOSE: Convert UTF-16 string to UTF-8 with full special case handling
' PARAMS:
'   U01 (STRING In/Out, BYREF) - String to convert (UTF-16 encoded)
' NOTES:
'   - Handles surrogate pairs for characters beyond BMP
'   - Properly validates UTF-16 sequences
'   - Uses assembly for maximum performance
'   - The result replaces the caller's string; a temporary output buffer is allocated during conversion
'   - Handles all Unicode code points (U+0000 to U+10FFFF)
' ===============================================================================
    ' UTF-16 constants
    %HIGH_SURROGATE_START = &HD800
    %HIGH_SURROGATE_END = &HDBFF
    %LOW_SURROGATE_START = &HDC00
    %LOW_SURROGATE_END = &HDFFF
    %UNICODE_MAX_BMP = &HFFFF
    %UNICODE_MAX = &H10FFFF
    %REPLACEMENT_CHAR = &HFFFD

    ' UTF-8 bit patterns
    %UTF8_1BYTE = &H00      ' 0xxxxxxx
    %UTF8_2BYTE = &HC0      ' 110xxxxx
    %UTF8_3BYTE = &HE0      ' 1110xxxx
    %UTF8_4BYTE = &HF0      ' 11110xxx
    %UTF8_CONT = &H80       ' 10xxxxxx
SUB ASM_UTF16ToUTF8(BYREF U01 AS STRING)
    ' Converts the UTF-16 code units stored in U01 to UTF-8 and assigns the
    ' result back to U01.
    '
    ' U01 (in/out) - on entry: raw UTF-16 code units, two bytes each, low
    '                byte first (they are read with a 16-bit x86 load).
    '                On exit: the UTF-8 bytes.
    '
    ' Behaviour:
    '   - An empty string is left untouched.
    '   - A trailing odd byte is ignored.
    '   - A valid surrogate pair becomes one 4-byte UTF-8 sequence.
    '   - A lead surrogate that is not followed by a trail surrogate, and a
    '     trail surrogate that is not preceded by a lead surrogate, are each
    '     replaced by U+FFFD (bytes EF BF BD). Only the offending unit is
    '     consumed, so the following unit is examined on its own.
    '
    ' Label names carry the U8_ prefix so they cannot clash with labels used
    ' by other procedures in the same source file.
    #REGISTER NONE

    LOCAL T01 AS LONG        ' Input length in bytes, rounded down to even
    LOCAL T02 AS LONG        ' Capacity of the output buffer in bytes
    LOCAL T03 AS DWORD       ' Address of the first input byte
    LOCAL T04 AS DWORD       ' Address of the first output byte
    LOCAL T05 AS LONG        ' Number of UTF-8 bytes produced
    LOCAL S01 AS STRING      ' Output buffer

    T01 = LEN(U01)
    IF T01 = 0 THEN
        EXIT SUB
    END IF

    ' An odd byte count cannot be whole UTF-16 units: drop the last byte.
    IF (T01 AND 1) THEN
        T01 = T01 - 1
        IF T01 = 0 THEN
            U01 = ""
            EXIT SUB
        END IF
    END IF

    ' Worst case is 3 output bytes per 16-bit unit: a BMP unit needs at most
    ' 3 bytes, a surrogate pair (2 units) needs 4, and a replaced unit needs 3.
    ' The parentheses are required: in BASIC precedence "\" binds less tightly
    ' than "*", so "T01 \ 2 * 3" would be evaluated as "T01 \ 6" and the
    ' buffer would be too small for the bytes written below.
    T02 = (T01 \ 2) * 3

    S01 = STRING$(T02, 0)

    T03 = STRPTR(U01)
    T04 = STRPTR(S01)

    ' Register roles inside the loop:
    '   esi = next input unit      edi = next output byte
    '   ecx = input bytes left     eax = current unit / code point
    '   ebx = trail surrogate      edx = scratch
    ' Surrogates are classified with shifts and bit tests instead of 32-bit
    ' compares against the 16-bit range equates, so the result does not depend
    ' on how those equates are widened to 32 bits.
    ! push esi
    ! push edi
    ! push ebx
    ! mov esi, T03
    ! mov edi, T04
    ! mov ecx, T01

U8_NextUnit:
    ! cmp ecx, 2
    ! jb U8_Finished                ; fewer than 2 bytes left: done

    ! movzx eax, word ptr [esi]     ; eax = current 16-bit unit, zero-extended
    ! add esi, 2
    ! sub ecx, 2

    ! cmp eax, &H80
    ! jb U8_Emit1                   ; U+0000..U+007F
    ! cmp eax, &H800
    ! jb U8_Emit2                   ; U+0080..U+07FF

    ! mov edx, eax
    ! shr edx, 11
    ! cmp edx, &H1B                 ; top five bits 11011b = D800..DFFF
    ! jne U8_Emit3                  ; ordinary BMP unit

    ! test eax, &H400               ; bit 10 set = DC00..DFFF (trail surrogate)
    ! jnz U8_EmitReplacement        ; trail surrogate without a lead
    ! cmp ecx, 2
    ! jb U8_EmitReplacement         ; lead surrogate is the last unit
    ! movzx ebx, word ptr [esi]     ; peek at the following unit
    ! mov edx, ebx
    ! shr edx, 10
    ! cmp edx, &H37                 ; top six bits 110111b = DC00..DFFF
    ! jne U8_EmitReplacement        ; lead not followed by a trail

    ! add esi, 2                    ; consume the trail surrogate as well
    ! sub ecx, 2
    ! and eax, &H3FF                ; low 10 bits of the lead surrogate
    ! shl eax, 10
    ! and ebx, &H3FF                ; low 10 bits of the trail surrogate
    ! or eax, ebx
    ! add eax, &H10000              ; eax = U+10000..U+10FFFF
    ! jmp U8_Emit4

U8_EmitReplacement:
    ' U+FFFD written directly as its fixed UTF-8 form EF BF BD.
    ! mov byte ptr [edi], &HEF
    ! mov byte ptr [edi+1], &HBF
    ! mov byte ptr [edi+2], &HBD
    ! add edi, 3
    ! jmp U8_NextUnit

U8_Emit1:
    ' 0xxxxxxx
    ! mov [edi], al
    ! inc edi
    ! jmp U8_NextUnit

U8_Emit2:
    ' 110xxxxx 10xxxxxx
    ! mov edx, eax
    ! shr edx, 6
    ! or dl, %UTF8_2BYTE
    ! mov [edi], dl
    ! and al, &H3F
    ! or al, %UTF8_CONT
    ! mov [edi+1], al
    ! add edi, 2
    ! jmp U8_NextUnit

U8_Emit3:
    ' 1110xxxx 10xxxxxx 10xxxxxx
    ! mov edx, eax
    ! shr edx, 12
    ! or dl, %UTF8_3BYTE
    ! mov [edi], dl
    ! mov edx, eax
    ! shr edx, 6
    ! and dl, &H3F
    ! or dl, %UTF8_CONT
    ! mov [edi+1], dl
    ! and al, &H3F
    ! or al, %UTF8_CONT
    ! mov [edi+2], al
    ! add edi, 3
    ! jmp U8_NextUnit

U8_Emit4:
    ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    ! mov edx, eax
    ! shr edx, 18
    ! or dl, %UTF8_4BYTE
    ! mov [edi], dl
    ! mov edx, eax
    ! shr edx, 12
    ! and dl, &H3F
    ! or dl, %UTF8_CONT
    ! mov [edi+1], dl
    ! mov edx, eax
    ! shr edx, 6
    ! and dl, &H3F
    ! or dl, %UTF8_CONT
    ! mov [edi+2], dl
    ! and al, &H3F
    ! or al, %UTF8_CONT
    ! mov [edi+3], al
    ! add edi, 4
    ! jmp U8_NextUnit

U8_Finished:
    ! sub edi, T04                  ; bytes written = end pointer - start pointer
    ! mov T05, edi
    ! pop ebx
    ! pop edi
    ! pop esi

    ' Hand the produced bytes back through the BYREF parameter.
    U01 = LEFT$(S01, T05)
END SUB


Unicode to ANSI
' ===============================================================================
' SUB: ASM_UnicodeToAnsi
' PURPOSE: Convert Unicode string to ANSI with full special case handling
' PARAMS:
'   U01 (STRING In/Out, BYREF) - String to convert (UTF-16 encoded)
' NOTES:
'   - Handles surrogate pairs for characters beyond BMP
'   - Properly validates UTF-16 sequences
'   - Maps common Unicode characters to ANSI equivalents
'   - Uses assembly for maximum performance
'   - The result replaces the caller's string; a temporary output buffer is allocated during conversion
' ===============================================================================
' Code points and target bytes used by ASM_UnicodeToAnsi. They carry their own
' U2A_ prefix so they cannot conflict with equates defined elsewhere.
%U2A_EURO_SIGN     = &H20AC     ' U+20AC EURO SIGN
%U2A_BULLET        = &H2022     ' U+2022 BULLET
%U2A_ELLIPSIS      = &H2026     ' U+2026 HORIZONTAL ELLIPSIS
%U2A_ANSI_MAX      = &HFF       ' highest code point copied byte-for-byte
%U2A_EURO_ANSI     = &H80       ' Euro sign in Windows-1252 (128)
%U2A_BULLET_ANSI   = &H95       ' bullet in Windows-1252 (149); byte 7 would be the BEL control code
%U2A_DOT_CHAR      = &H2E       ' "."
%U2A_UNMAPPED_CHAR = &H3F       ' "?" written for anything that has no mapping

SUB ASM_UnicodeToAnsi(BYREF U01 AS STRING)
    ' Converts the UTF-16 code units stored in U01 to single-byte text and
    ' assigns the result back to U01.
    '
    ' U01 (in/out) - on entry: raw UTF-16 code units, two bytes each, low
    '                byte first (they are read with a 16-bit x86 load).
    '                On exit: one output byte per unit, except as listed below.
    '
    ' Mapping:
    '   - U+0000..U+00FF     -> the same byte value
    '   - U+20AC (Euro)      -> %U2A_EURO_ANSI
    '   - U+2022 (bullet)    -> %U2A_BULLET_ANSI
    '   - U+2026 (ellipsis)  -> three "." bytes
    '   - valid surrogate pair -> a single "?" (both units are consumed)
    '   - everything else, including unpaired surrogates -> "?"
    '
    ' An empty string is left untouched; a trailing odd byte is ignored.
    '
    ' NOTE(review): U+0080..U+009F are copied as bytes 80h..9Fh. In
    ' Windows-1252 those bytes are printable characters, not C1 controls -
    ' confirm whether that is acceptable for the callers.
    '
    ' Label names carry the U2A_ prefix so they cannot clash with labels used
    ' by other procedures in the same source file.
    #REGISTER NONE

    LOCAL T01 AS LONG        ' Input length in bytes, rounded down to even
    LOCAL T02 AS LONG        ' Capacity of the output buffer in bytes
    LOCAL T03 AS DWORD       ' Address of the first input byte
    LOCAL T04 AS DWORD       ' Address of the first output byte
    LOCAL T05 AS LONG        ' Number of output bytes produced
    LOCAL S01 AS STRING      ' Output buffer

    T01 = LEN(U01)
    IF T01 = 0 THEN
        EXIT SUB
    END IF

    ' An odd byte count cannot be whole UTF-16 units: drop the last byte.
    IF (T01 AND 1) THEN
        T01 = T01 - 1
        IF T01 = 0 THEN
            U01 = ""
            EXIT SUB
        END IF
    END IF

    ' Worst case is 3 output bytes per unit (the ellipsis expands to "...").
    ' The parentheses are required: in BASIC precedence "\" binds less tightly
    ' than "*", so "T01 \ 2 * 3" would be evaluated as "T01 \ 6". Because the
    ' buffer covers the worst case, no per-character room check is needed.
    T02 = (T01 \ 2) * 3

    S01 = STRING$(T02, 0)

    T03 = STRPTR(U01)
    T04 = STRPTR(S01)

    ' Register roles inside the loop:
    '   esi = next input unit      edi = next output byte
    '   ecx = input bytes left     eax = current unit (al = byte to store)
    '   edx = scratch
    ! push esi
    ! push edi
    ! mov esi, T03
    ! mov edi, T04
    ! mov ecx, T01

U2A_NextUnit:
    ! cmp ecx, 2
    ! jb U2A_Finished               ; fewer than 2 bytes left: done

    ! movzx eax, word ptr [esi]     ; eax = current 16-bit unit, zero-extended
    ! add esi, 2
    ! sub ecx, 2

    ! cmp eax, %U2A_ANSI_MAX
    ! jbe U2A_Store                 ; U+0000..U+00FF: al already holds the byte

    ! cmp eax, %U2A_EURO_SIGN
    ! je U2A_Euro
    ! cmp eax, %U2A_BULLET
    ! je U2A_Bullet
    ! cmp eax, %U2A_ELLIPSIS
    ! je U2A_Ellipsis

    ' If this is a lead surrogate followed by a trail surrogate, swallow the
    ' trail too so that the pair yields one "?" rather than two.
    ! mov edx, eax
    ! shr edx, 10
    ! cmp edx, &H36                 ; top six bits 110110b = D800..DBFF
    ! jne U2A_Unmapped
    ! cmp ecx, 2
    ! jb U2A_Unmapped               ; lead surrogate is the last unit
    ! movzx edx, word ptr [esi]
    ! shr edx, 10
    ! cmp edx, &H37                 ; top six bits 110111b = DC00..DFFF
    ! jne U2A_Unmapped
    ! add esi, 2
    ! sub ecx, 2

U2A_Unmapped:
    ! mov al, %U2A_UNMAPPED_CHAR
    ! jmp U2A_Store

U2A_Euro:
    ! mov al, %U2A_EURO_ANSI
    ! jmp U2A_Store

U2A_Bullet:
    ! mov al, %U2A_BULLET_ANSI

U2A_Store:
    ! mov [edi], al
    ! inc edi
    ! jmp U2A_NextUnit

U2A_Ellipsis:
    ! mov byte ptr [edi], %U2A_DOT_CHAR
    ! mov byte ptr [edi+1], %U2A_DOT_CHAR
    ! mov byte ptr [edi+2], %U2A_DOT_CHAR
    ! add edi, 3
    ! jmp U2A_NextUnit

U2A_Finished:
    ! sub edi, T04                  ; bytes written = end pointer - start pointer
    ! mov T05, edi
    ! pop edi
    ! pop esi

    ' Hand the produced bytes back through the BYREF parameter.
    U01 = LEFT$(S01, T05)
END SUB