Skip to main content

GetCodepointsFromUtf16String

Short summary

This function creates a code point array from a UTF-16 string and returns the start address.

Attention: The returned addressOfCodePoints is a dynamically created pointer! It needs to be deleted after usage.

Example:

// UTF-16 source string that is converted in this example
myString :WString(10) := "Test" ;
// caller-owned destination buffer with room for 11 codepoints (indices 0..10)
codepoints :Array[0..10] OF UnicodeCodepoint;
// receives the number of codepoints written to the buffer
codepointCount :UDINT;

------------------
// Convert myString into codepoints. The BOOL result of the call is not evaluated in this example.
GetCodepointsFromUtf16String(
// address of the first 16-bit word of the string
utf16StringAddress := ADR(myString),
// number of 16-bit words to convert; GetUtf16StringLength is not shown here, presumably it
// returns the string length in words - verify against its documentation
utf16StringWordCount := GetUtf16StringLength(ADR(myString)),
// destination buffer owned by the caller
codepointBuffer := ADR(codepoints),
// buffer size in bytes, not in elements
bufferSize := SIZEOF(codepoints),
// output: number of codepoints stored in the buffer
codePointCount => codepointCount
);
  • Return type: BOOL

Parameters

Name | Type | Comment | Kind
utf16StringAddress | POINTER TO WORD | address of the utf-16 string | input
utf16StringWordCount | UDINT | number of words of the utf-16 string | input
codepointBuffer | POINTER TO UnicodeCodePoint | buffer where the encoded codepoints are stored | input
bufferSize | UDINT | size of the codepoint buffer in bytes | input
codePointCount | UDINT | number of codepoints | output

Code

Declaration

// Converts a UTF-16 string into Unicode codepoints and stores them in a caller-supplied buffer.
// The result is TRUE once the conversion loop has run. If an argument is rejected (null address,
// zero word count, null buffer, or a buffer smaller than one UnicodeCodepoint) the function
// returns early without assigning the result and without touching the buffer.
FUNCTION GetCodepointsFromUtf16String : BOOL
VAR_INPUT
(* address of the first 16-bit word of the utf-16 string *)
utf16StringAddress :POINTER TO WORD;
(* number of 16-bit words of the utf-16 string to convert *)
utf16StringWordCount :UDINT;
(* caller-owned buffer that receives the decoded codepoints; it is zero-filled before conversion *)
codepointBuffer :POINTER TO UnicodeCodePoint;
(* size of the codepoint buffer in bytes *)
bufferSize :UDINT;
END_VAR
VAR_OUTPUT
(* number of codepoints written to codepointBuffer *)
codePointCount :UDINT := 0;
END_VAR
VAR CONSTANT
(* NOTE(review): not referenced by the implementation shown; the loop is bounded by utf16StringWordCount only *)
END_OF_STRING :BYTE := 16#00;
END_VAR
VAR
(* offsets of the current surrogate pair relative to 16#D800 (high) and 16#DC00 (low) *)
highSurrogate, lowSurrogate :WORD;
(* read position within the utf-16 string, counted in words *)
wordIndex :UDINT;
(* capacity of codepointBuffer expressed in codepoints instead of bytes *)
bufferSizeCodepoints :UDINT;
END_VAR

Implementation

// Decodes the UTF-16 words at utf16StringAddress into codepoints stored in codepointBuffer.
// Valid surrogate pairs become one codepoint above U+FFFF, unpaired surrogates are replaced
// by U+FFFD, and every other word is copied unchanged.

// Reject unusable arguments: null string address, empty string, null buffer, or a buffer that
// cannot hold even one codepoint. The function result is not assigned on this path.
RETURN((utf16StringAddress = 0) OR_ELSE (utf16StringWordCount = 0) OR_ELSE (codepointBuffer = 0) OR_ELSE bufferSize < SIZEOF(UnicodeCodepoint));

// Zero the whole buffer so that entries behind the last decoded codepoint are 0.
Tc2_System.MEMSET(codepointBuffer, 0, bufferSize);

wordIndex := 0;
// bufferSize is given in bytes; convert it to a capacity in codepoints.
bufferSizeCodepoints := bufferSize/SIZEOF(UnicodeCodepoint);

// Stop when all input words are consumed or the buffer is full. In the latter case the
// remaining words are not converted and the function still reports TRUE.
WHILE ((wordIndex < utf16StringWordCount) AND_THEN ( codePointCount < bufferSizeCodepoints )) DO
    CASE utf16StringAddress[wordIndex] OF
        16#D800..16#DBFF: // current word is a high surrogate
            // a pair is only valid if the next word exists and is a low surrogate
            IF ((wordIndex + 1 < utf16StringWordCount)
                AND_THEN (utf16StringAddress[wordIndex+1] >= 16#DC00 )
                AND_THEN (utf16StringAddress[wordIndex+1] <= 16#DFFF))
            THEN
                highSurrogate := utf16StringAddress[wordIndex] - 16#D800;
                lowSurrogate := utf16StringAddress[wordIndex+1] - 16#DC00;
                // Widen to DWORD before shifting: SHL operates at the width of its operand, so
                // shifting the 10-bit offset inside a WORD would drop the upper bits for every
                // codepoint above U+1FFFF.
                codepointBuffer[codePointCount] := SHL(WORD_TO_DWORD(highSurrogate), 10) + WORD_TO_DWORD(lowSurrogate) + 16#10000;
                codePointCount := codePointCount + 1;
                wordIndex := wordIndex + 2; // skip the next word, it was the low surrogate
            ELSE // no valid surrogate pair found
                codepointBuffer[codePointCount] := 16#FFFD; // U+FFFD replacement character, usually used to mark non displayable characters
                codePointCount := codePointCount + 1;
                wordIndex := wordIndex + 1;
            END_IF
        16#DC00..16#DFFF: // unpaired low surrogate found
            codepointBuffer[codePointCount] := 16#FFFD; // U+FFFD replacement character, usually used to mark non displayable characters
            codePointCount := codePointCount + 1;
            wordIndex := wordIndex + 1;
    ELSE
        // word is a single codepoint (basic multilingual plane, U+0000 to U+FFFF)
        codepointBuffer[codePointCount] := utf16StringAddress[wordIndex];
        codePointCount := codePointCount + 1;
        wordIndex := wordIndex + 1;
    END_CASE
END_WHILE

GetCodepointsFromUtf16String := TRUE;