Skip to main content

NormalizeCodepointsFormC

Short summary

This function writes the Unicode Normalization Form C (NFC) equivalent of a given codepoint array into a caller-supplied buffer. To learn more about the Unicode normalization forms, see Unicode Standard Annex #15 (Unicode Normalization Forms).

  • Return type: BOOL

Parameters

| Name | Type | Comment | Kind |
| --- | --- | --- | --- |
| codePoints | POINTER TO BYTE | pointer to the unnormalized codepoint sequence | input |
| codepointsCount | UDINT | number of codepoints in the unnormalized codepoint sequence | input |
| normalizedCodePoints | POINTER TO UnicodeCodePoint | pointer to the buffer where the normalized sequence is stored | input |
| bufferSize | UDINT | size of the normalized buffer | input |
| normalizedCodepointsCount | UDINT | number of normalized codepoints | output |

Code

Declaration

(* Writes the normalization form C (NFC) representation of the code point
   sequence at codePoints into the buffer at normalizedCodePoints and reports
   the number of resulting code points in normalizedCodepointsCount.
   The implementation returns immediately, leaving normalizedCodepointsCount
   at 0, when either pointer is 0 or codepointsCount is 0. *)
FUNCTION NormalizeCodepointsFormC : BOOL
VAR_INPUT
(* pointer to the unnormalized codepoint sequence *)
codePoints :POINTER TO BYTE;
(* number of codepoints in the unnormalized codepoint sequence *)
codepointsCount :UDINT;
(* pointer to the buffer where the normalized sequence is stored *)
normalizedCodePoints :POINTER TO UnicodeCodePoint;
(* size of the normalized buffer; the implementation compares it against
   SIZEOF(UnicodeCodePoint) * codepointsCount, i.e. it is treated as a byte count *)
bufferSize :UDINT;
END_VAR
VAR_OUTPUT
(* number of normalized codepoints *)
normalizedCodepointsCount :UDINT := 0;
END_VAR
VAR
(* idx: position of the code point currently examined as a possible starter;
   nextIdx: position of the following code point tested for composition with it *)
idx, nextIdx: UDINT;
(* canonical combining class reported by CheckCombiningMark for the code point at idx *)
currentCCC :UDINT;
(* canonical combining class reported by CheckCombiningMark for the code point at nextIdx *)
nextCCC :UDINT;
(* composition result delivered by GetRecompositionFromCodePoints *)
composedCodePoint :UnicodeCodePoint;
(* combining class of the last code point that was passed over without being
   composed; -1 while no such code point exists (signed to allow this sentinel) *)
prevCCC :DINT;
(* number of bytes copied when the input is already in NFC *)
copySize :UDINT;
END_VAR

Implementation

(* Produces the NFC form of the input sequence in three steps:
   1. Fast path: if the quick check says the input is already NFC, copy it.
   2. Otherwise decompose into NFD (canonical ordering is required for step 3).
   3. Recompose in place: every starter (canonical combining class = 0) is
      combined with following code points wherever a composition exists and
      is not blocked by an intervening code point.
   bufferSize is treated as a byte count and is never exceeded. Because the
   NFD intermediate is written into the same buffer, the buffer has to be big
   enough for the decomposed sequence, which may be longer than the input.
   NOTE(review): the BOOL result of the function is never assigned in this
   body, so it keeps its initial value -- confirm whether callers expect a
   success flag. *)

// nothing to do for null pointers or an empty input sequence
RETURN((codePoints = 0) OR_ELSE (codepointsCount = 0) OR_ELSE (normalizedCodepoints = 0));

// run fast quickcheck first -- many strings are already in NFC form
IF (QuickCheckCodepointsNormalized(
        codePoints := codePoints,
        codepointCount := codepointsCount,
        formToCheck := NormalizationForm.NFC
    ) = NormalizationQuickCheckResult.YES)
THEN
    // copy only whole code points that fit into the buffer and report
    // exactly the number that was copied
    normalizedCodepointsCount := bufferSize / SIZEOF(UnicodeCodePoint);
    IF (normalizedCodepointsCount > codepointsCount) THEN
        normalizedCodepointsCount := codepointsCount;
    END_IF
    copySize := SIZEOF(UnicodeCodePoint) * normalizedCodepointsCount;
    Tc2_System.MEMCPY(normalizedCodePoints, codePoints, copySize);
    RETURN;
END_IF

// get NFD first - canonical order of codepoints is important!
// hand over the real buffer size so the decomposition cannot write past
// the memory provided by the caller
NormalizeCodepointsFormD(
    codePoints := codePoints,
    codePointsCount := codepointsCount,
    normalizedCodepoints := normalizedCodePoints,
    bufferSize := bufferSize,
    normalizedCodepointsCount => normalizedCodepointsCount
);

// decomposition produced nothing -> nothing to recompose
// (also protects the unsigned index arithmetic below)
IF (normalizedCodepointsCount = 0) THEN
    RETURN;
END_IF

(* loop through all unicode points and check if a starter codepoint
   can be combined with one of its following non starters
   starters have a canonical combining class = 0, non starters ccc is > 0 *)
idx := 0;
WHILE (idx < normalizedCodepointsCount) DO
    CheckCombiningMark(codePoint := ADR(normalizedCodePoints[idx]), canonicalCombiningClass => currentCCC);
    IF (currentCCC = 0) THEN // is a starter
        prevCCC := -1;       // no code point between starter and candidate yet
        nextIdx := idx + 1;
        WHILE (nextIdx < normalizedCodepointsCount) DO
            CheckCombiningMark(codePoint := ADR(normalizedCodePoints[nextIdx]), canonicalCombiningClass => nextCCC);
            // a composition is only allowed when the candidate is not blocked,
            // i.e. the last skipped code point has a lower combining class
            IF GetRecompositionFromCodePoints(
                    codePointBase := ADR(normalizedCodePoints[idx]),
                    codePointComb := ADR(normalizedCodePoints[nextIdx]),
                    result => composedCodePoint)
                AND_THEN (prevCCC < TO_DINT(nextCCC))
            THEN
                normalizedCodePoints[idx] := composedCodePoint;
                // close the gap left by the consumed code point; source and
                // destination overlap, therefore MEMMOVE instead of MEMCPY
                IF (nextIdx + 1 < normalizedCodepointsCount) THEN
                    Tc2_System.MEMMOVE(
                        ADR(normalizedCodePoints[nextIdx]),
                        ADR(normalizedCodePoints[nextIdx + 1]),
                        SIZEOF(UnicodeCodePoint) * (normalizedCodepointsCount - nextIdx - 1));
                END_IF
                normalizedCodepointsCount := normalizedCodepointsCount - 1;
                // clear the slot that is no longer part of the sequence
                normalizedCodePoints[normalizedCodepointsCount] := 0;
                // repeat all checks with new starter
                nextIdx := idx + 1;
                prevCCC := -1;
            ELSIF (nextCCC = 0) THEN
                // reached the next starter: continue the outer scan there
                // (idx is incremented once more below)
                idx := nextIdx - 1;
                EXIT;
            ELSE
                // non starter that stays in place; remember its class for
                // the blocking check and look at the following code point
                prevCCC := TO_DINT(nextCCC);
                nextIdx := nextIdx + 1;
            END_IF
        END_WHILE
    END_IF
    idx := idx + 1;
END_WHILE