GetCodepointsFromUtf16String

Short summary

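This function decodes a UTF-16 string into Unicode code points and writes them into a caller-supplied buffer. It returns TRUE when decoding was performed and FALSE when an argument is unusable (null string address, zero word count, null buffer address, or a buffer too small to hold a single code point).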

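Attention: This function does not allocate memory. The code points are written into the caller-supplied codepointBuffer, which is zero-filled over its full bufferSize before decoding. If the buffer cannot hold all code points of the string, decoding stops once the buffer is full and the function still returns TRUE, so size the buffer for the worst case of one code point per UTF-16 word.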

Example:

// Declaration part: source string, destination buffer and result counter.
myString :WString(10) := "Test" ;
// Destination buffer with 11 elements (indices 0..10).
codepoints :Array[0..10] OF UnicodeCodepoint;
// Receives the number of code points written to the buffer.
codepointCount :UDINT;

------------------
// Implementation part: decode myString into codepoints.
GetCodepointsFromUtf16String(
	utf16StringAddress := ADR(myString),
	// presumably the length of the string in 16-bit words; verify against GetUtf16StringLength
	utf16StringWordCount := GetUtf16StringLength(ADR(myString)),
	codepointBuffer := ADR(codepoints),
	// bufferSize is given in bytes, hence SIZEOF of the whole array
	bufferSize := SIZEOF(codepoints),
	codePointCount => codepointCount
);

Return type: BOOL

Parameters

Name	Type	Comment	Kind
utf16StringAddress	POINTER TO WORD	address of the utf-16 string	input
utf16StringWordCount	UDINT	number of words of the utf-16 string	input
codepointBuffer	POINTER TO UnicodeCodePoint	buffer where the decoded codepoints are stored	input
bufferSize	UDINT	size of the codepoint buffer in bytes	input
codePointCount	UDINT	number of codepoints	output

Code

`Declaration`

(* Decodes a UTF-16 string into Unicode code points.
   The code points are written into the caller-supplied codepointBuffer; the
   function itself allocates nothing.
   Returns TRUE when decoding was performed, FALSE when an argument is unusable
   (null address, zero word count or a buffer smaller than one code point). *)
FUNCTION GetCodepointsFromUtf16String : BOOL
VAR_INPUT
	(* address of the utf-16 string *)
	utf16StringAddress	 :POINTER TO WORD;
	(* number of words of the utf-16 string *)
	utf16StringWordCount :UDINT;
	(* buffer where the decoded codepoints are stored *)
	codepointBuffer	:POINTER TO UnicodeCodePoint;
	(* size of the codepoint buffer in bytes *)
	bufferSize :UDINT;
END_VAR
VAR_OUTPUT	
	(* number of codepoints written to codepointBuffer *)
	codePointCount		:UDINT := 0;
END_VAR
VAR
	(* 10-bit payloads of the current surrogate pair (offsets already removed) *)
	highSurrogate, lowSurrogate :WORD;
	(* index of the utf-16 word currently being decoded *)
	wordIndex			:UDINT;
	(* capacity of codepointBuffer in codepoints (bufferSize / element size) *)
	bufferSizeCodepoints :UDINT;
END_VAR

`Implementation`

// Reject unusable arguments: null string, empty string, null buffer, or a
// buffer that cannot hold even one code point. The result is FALSE and
// codePointCount keeps its initial value 0; the buffer is not touched.
IF (utf16StringAddress = 0)
	OR_ELSE (utf16StringWordCount = 0)
	OR_ELSE (codepointBuffer = 0)
	OR_ELSE (bufferSize < SIZEOF(UnicodeCodepoint))
THEN
	GetCodepointsFromUtf16String := FALSE;
	RETURN;
END_IF

// Clear the whole destination buffer so unused elements read as 0.
Tc2_System.MEMSET(codepointBuffer, 0, bufferSize);

wordIndex := 0;
// Capacity of the buffer in code points; a trailing partial element is ignored.
bufferSizeCodepoints := bufferSize/SIZEOF(UnicodeCodepoint);

// Decode until the input is exhausted or the buffer is full (output is then truncated).
WHILE ((wordIndex < utf16StringWordCount) AND_THEN ( codePointCount < bufferSizeCodepoints )) DO
	CASE utf16StringAddress[wordIndex] OF
	16#D800..16#DBFF: 	// current word is a high surrogate
		// check that the next word exists and is a low surrogate
		IF ((wordIndex + 1 < utf16StringWordCount)
			AND_THEN (utf16StringAddress[wordIndex+1] >= 16#DC00 )
			AND_THEN (utf16StringAddress[wordIndex+1] <= 16#DFFF))
		THEN
			highSurrogate := utf16StringAddress[wordIndex] - 16#D800;
			lowSurrogate := utf16StringAddress[wordIndex+1] - 16#DC00;
			// Widen to UDINT BEFORE shifting: SHL operates on the width of its
			// operand, so shifting the WORD directly would drop the upper bits
			// of the 10-bit high-surrogate payload and corrupt every code point
			// from U+20000 upwards.
			codepointBuffer[codePointCount] := SHL(WORD_TO_UDINT(highSurrogate), 10) + WORD_TO_UDINT(lowSurrogate) + 16#10000;
			codePointCount := codePointCount + 1;
			wordIndex := wordIndex + 2; // skip next word as it was the low surrogate
		ELSE // no valid surrogate pair found
			codepointBuffer[codePointCount] := 16#FFFD; // U+FFFD REPLACEMENT CHARACTER marks the malformed input
			codePointCount := codePointCount + 1;
			wordIndex := wordIndex + 1;
		END_IF
	16#DC00..16#DFFF:	// unpaired low surrogate found
		codepointBuffer[codePointCount] := 16#FFFD; // U+FFFD REPLACEMENT CHARACTER marks the malformed input
		codePointCount := codePointCount + 1;
		wordIndex := wordIndex + 1;
	ELSE
		// word is a single codepoint (basic multilingual plane, U+0000 to U+FFFF)
		codepointBuffer[codePointCount] := utf16StringAddress[wordIndex];
		codePointCount := codePointCount + 1;
		wordIndex := wordIndex + 1;
	END_CASE
END_WHILE

GetCodepointsFromUtf16String := TRUE;

GetCodepointsFromUtf16String

Short summary​

Parameters​

Code​

Declaration​

Implementation​

Short summary

Parameters

Code

`Declaration`

`Implementation`