GetCodepointsFromUtf16String
Short summary
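This function decodes a UTF-16 string into an array of Unicode code points and writes them into a buffer supplied by the caller.
Attention: The code points are stored in the caller's codepointBuffer; the function does not allocate any memory, so nothing needs to be deleted after usage. The whole buffer is cleared before decoding, and decoding stops as soon as the buffer is full.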
Example:
myString :WString(10) := "Test" ;
codepoints :Array[0..10] OF UnicodeCodepoint;
codepointCount :UDINT;
------------------
GetCodepointsFromUtf16String(
utf16StringAddress := ADR(myString),
utf16StringWordCount := GetUtf16StringLength(ADR(myString)),
codepointBuffer := ADR(codepoints),
bufferSize := SIZEOF(codepoints),
codePointCount => codepointCount
);
- Return type:
BOOL
Parameters
| Name | Type | Comment | Kind |
|---|---|---|---|
| utf16StringAddress | POINTER TO WORD | address of the utf-16 string | input |
| utf16StringWordCount | UDINT | number of words of the utf-16 string | input |
| codepointBuffer | POINTER TO UnicodeCodePoint | buffer where the decoded codepoints are stored | input |
| bufferSize | UDINT | size of the codepoint buffer in bytes | input |
| codePointCount | UDINT | number of codepoints | output |
Code
Declaration
(* Decodes a UTF-16 string into Unicode codepoints and stores them in a caller-supplied buffer.
   The result is set to TRUE once decoding has run; this includes the case where the buffer
   became full before all input words were consumed. The result is not assigned when an
   argument is rejected (null address, zero word count, or a buffer smaller than one codepoint). *)
FUNCTION GetCodepointsFromUtf16String : BOOL
VAR_INPUT
    (* address of the first 16-bit word of the UTF-16 string *)
    utf16StringAddress      :POINTER TO WORD;
    (* number of 16-bit words of the UTF-16 string that are to be decoded *)
    utf16StringWordCount    :UDINT;
    (* caller-supplied buffer where the decoded codepoints are stored *)
    codepointBuffer         :POINTER TO UnicodeCodePoint;
    (* size of the codepoint buffer in bytes *)
    bufferSize              :UDINT;
END_VAR
VAR_OUTPUT
    (* number of codepoints written to codepointBuffer *)
    codePointCount          :UDINT := 0;
END_VAR
VAR
    (* offsets of the two halves of a surrogate pair relative to 16#D800 / 16#DC00 (10 significant bits each) *)
    highSurrogate           :WORD;
    lowSurrogate            :WORD;
    (* index of the input word that is decoded next *)
    wordIndex               :UDINT;
    (* capacity of codepointBuffer expressed in codepoints instead of bytes *)
    bufferSizeCodepoints    :UDINT;
END_VAR
Implementation
// Reject unusable arguments up front: null string address, empty string, null buffer or a buffer
// that cannot hold a single codepoint. The function result has not been assigned at this point.
RETURN((utf16StringAddress = 0) OR_ELSE (utf16StringWordCount = 0) OR_ELSE (codepointBuffer = 0) OR_ELSE bufferSize < SIZEOF(UnicodeCodepoint));

// Clear the complete caller buffer so that all entries behind the last decoded codepoint are zero.
Tc2_System.MEMSET(codepointBuffer, 0, bufferSize);
wordIndex := 0;
// Buffer capacity in codepoints; a trailing partial element is discarded by the integer division.
bufferSizeCodepoints := bufferSize/SIZEOF(UnicodeCodepoint);

// Decode until the input is consumed or the buffer is full. In the latter case the remaining
// input words are not decoded and the function still reports TRUE.
WHILE ((wordIndex < utf16StringWordCount) AND_THEN ( codePointCount < bufferSizeCodepoints )) DO
    CASE utf16StringAddress[wordIndex] OF
        16#D800..16#DBFF: // current word is a high surrogate
            // a high surrogate is only valid when it is directly followed by a low surrogate
            IF ((wordIndex + 1 < utf16StringWordCount)
                AND_THEN (utf16StringAddress[wordIndex+1] >= 16#DC00 )
                AND_THEN (utf16StringAddress[wordIndex+1] <= 16#DFFF))
            THEN
                highSurrogate := utf16StringAddress[wordIndex] - 16#D800;
                lowSurrogate := utf16StringAddress[wordIndex+1] - 16#DC00;
                // Widen to 32 bit BEFORE shifting: SHL works in the width of its operand, so shifting
                // the WORD value by 10 would cut off its upper bits and yield a wrong codepoint for
                // every character at or above U+20000 (high surrogate offset >= 16#40).
                codepointBuffer[codePointCount] := SHL(WORD_TO_DWORD(highSurrogate),10) + WORD_TO_DWORD(lowSurrogate) + 16#10000;
                codePointCount := codePointCount + 1;
                wordIndex := wordIndex + 2; // skip the next word as well, it was the low surrogate
            ELSE // no valid surrogate pair found
                codepointBuffer[codePointCount] := 16#FFFD; // replacement character U+FFFD, usually used to mark non displayable characters
                codePointCount := codePointCount + 1;
                wordIndex := wordIndex + 1;
            END_IF
        16#DC00..16#DFFF: // unpaired low surrogate found
            codepointBuffer[codePointCount] := 16#FFFD; // replacement character U+FFFD, usually used to mark non displayable characters
            codePointCount := codePointCount + 1;
            wordIndex := wordIndex + 1;
    ELSE
        // word is a single codepoint (basic multilingual plane, U+0000 to U+FFFF without the surrogate range)
        codepointBuffer[codePointCount] := utf16StringAddress[wordIndex];
        codePointCount := codePointCount + 1;
        wordIndex := wordIndex + 1;
    END_CASE
END_WHILE

GetCodepointsFromUtf16String := TRUE;