Skip to main content

NormalizeUtf8String

Short summary

This function transforms a given UTF-8 string into the selected normalization form and stores the normalized string in the provided buffer. To learn more about Unicode normalization forms, see Unicode Standard Annex #15 (Unicode Normalization Forms).

Attention: the provided normalizedStringBuffer must have an appropriate size, otherwise the normalized string will be cut off!

Example:


// Declaration part: source string, destination buffer and the variable that receives the result length
myString :String(10) := 'Täst';
myNormalizedString :String(30);
normalizedLength :UDINT;
------------------
// Implementation part: normalize myString to form NFD and store the result in myNormalizedString
NormalizeUtf8String(
utf8StringBuffer := ADR(myString),
normalizedStringBuffer := ADR(myNormalizedString),
bufferSize := SIZEOF(myNormalizedString),
normalForm := NormalizationForm.NFD,
normalizedStringLength => normalizedLength
);
  • Return type: UDINT

Parameters

| Name | Type | Comment | Kind |
|---|---|---|---|
| utf8StringBuffer | PVOID | address of the UTF-8 encoded string or byte array (must also be null-terminated!) | input |
| normalizedStringBuffer | PVOID | address of the buffer in which the normalized string is stored, e.g. ADR(myNormalizedString) | input |
| bufferSize | UDINT | size of the normalized string buffer, e.g. SIZEOF(myNormalizedString) | input |
| normalForm | NormalizationForm | normalization form | input |
| normalizedStringLength | UDINT | length of the normalized string. If the given bufferSize is smaller, the normalized string will be cut off! | output |

Code

Declaration

(* Normalizes a null-terminated UTF-8 string into the selected Unicode normalization form
   and writes the result as UTF-8 into a caller-provided buffer.
   The implementation only handles NormalizationForm.NFD and NormalizationForm.NFC. *)
FUNCTION NormalizeUtf8String 
VAR_INPUT
(* address of the UTF-8 encoded string or byte array (must also be null-terminated!) *)
utf8StringBuffer :PVOID;
(* address of the buffer in which the normalized string is stored,
e.g. ADR(myNormalizedString) *)
normalizedStringBuffer :PVOID;
(* size of the normalized string buffer, e.g. SIZEOF(myNormalizedString) *)
bufferSize :UDINT;
(* normalization form to apply; defaults to NFC *)
normalForm :NormalizationForm := NormalizationForm.NFC;
END_VAR
VAR_OUTPUT
(* length of the normalized string as reported by GetUtf8StringFromCodepoints.
If the given bufferSize is too small, the normalized string will be cut off! *)
normalizedStringLength :UDINT;
END_VAR
VAR
(* stringLength: byte count of the input string;
codePointCount / normalizedCodePointCount: number of code points before / after normalization *)
stringLength, codePointCount, normalizedCodePointCount :UDINT;
(* dynamically allocated (__NEW) buffer for the code points decoded from the input string *)
codePoints :POINTER TO UnicodeCodePoint;
(* dynamically allocated (__NEW) buffer for the normalized code points *)
normalizedCodepoints :POINTER TO UnicodeCodePoint;
END_VAR

Implementation

// Guard: both the source string and the destination buffer must be valid addresses.
IF (utf8StringBuffer = 0) OR_ELSE (normalizedStringBuffer = 0) THEN
    RETURN;
END_IF

// Determine the byte length of the input; an empty string leaves nothing to normalize.
GetUtf8StringLength(utf8StringBuffer, byteCount => stringLength);
IF stringLength = 0 THEN
    RETURN;
END_IF

// Reserve one code point slot per input byte for the decoded string.
codePoints := __NEW(UnicodeCodepoint, stringLength);
IF codePoints = 0 THEN
    RETURN; // allocation failed
END_IF

// Decode the UTF-8 bytes into code points.
GetCodepointsFromUtf8String(
    utf8StringAddress := utf8StringBuffer,
    utf8StringByteCount := stringLength,
    codePointBuffer := codePoints,
    bufferSize := stringLength * SIZEOF(UnicodeCodepoint),
    codePointsCount => codePointCount
);

// A count of zero means the string was not utf8 encoded; in that case only the cleanup below runs.
IF codePointCount <> 0 THEN
    // Work buffer for the normalized code points: four slots per input byte.
    normalizedCodepoints := __NEW(UnicodeCodepoint, (stringLength*4));

    // Without the work buffer nothing is written to the output; only the cleanup below runs.
    IF normalizedCodepoints <> 0 THEN
        IF normalForm = NormalizationForm.NFD THEN
            NormalizeCodepointsFormD(
                codePoints := codePoints,
                codePointsCount := codePointCount,
                normalizedCodepoints := normalizedCodepoints,
                bufferSize := SIZEOF(UnicodeCodepoint)*stringLength*4,
                normalizedCodepointsCount => normalizedCodePointCount
            );
        ELSIF normalForm = NormalizationForm.NFC THEN
            NormalizeCodepointsFormC(
                codePoints := codePoints,
                codePointsCount := codePointCount,
                normalizedCodepoints := normalizedCodepoints,
                bufferSize := SIZEOF(UnicodeCodepoint)*stringLength*4,
                normalizedCodepointsCount => normalizedCodePointCount
            );
        END_IF
        // Any other normalization form is not handled: no normalization call is made.

        // Encode the normalized code points back to UTF-8 into the caller's buffer.
        GetUtf8StringFromCodepoints(
            addressOfCodePoints := normalizedCodepoints,
            codePointCount := normalizedCodePointCount,
            utf8StringBuffer := normalizedStringBuffer,
            bufferSize := bufferSize,
            utf8StringByteCount => normalizedStringLength
        );

        __DELETE(normalizedCodepoints);
    END_IF
END_IF

// The decoded code point buffer is released on every path that reaches this point.
__DELETE(codePoints);