NormalizeCodepointsFormC
Short summary
This function creates a codepoint array in Normalization Form C (NFC) from a given codepoint array. To learn more about Unicode normalization forms, see Unicode Standard Annex #15 (Unicode Normalization Forms).
- Return type:
BOOL
Parameters
| Name | Type | Comment | Kind |
|---|---|---|---|
| codePoints | POINTER TO BYTE | pointer to the unnormalized codepoint sequence | input |
| codepointsCount | UDINT | number of codepoints in the unnormalized codepoint sequence | input |
| normalizedCodePoints | POINTER TO UnicodeCodePoint | pointer to the buffer where the normalized sequence is stored | input |
| bufferSize | UDINT | size of the normalized buffer in bytes | input |
| normalizedCodepointsCount | UDINT | number of normalized codepoints | output |
Code
Declaration
(* Builds the Normalization Form C (NFC) representation of a codepoint sequence
   and stores it in a caller-provided buffer. *)
FUNCTION NormalizeCodepointsFormC : BOOL
VAR_INPUT
(* pointer to the unnormalized codepoint sequence; declared as POINTER TO BYTE,
   but the implementation reads it in units of SIZEOF(UnicodeCodePoint) *)
codePoints :POINTER TO BYTE;
(* number of codepoints in the unnormalized codepoint sequence *)
codepointsCount :UDINT;
(* pointer to the buffer where the normalized sequence is stored *)
normalizedCodePoints :POINTER TO UnicodeCodePoint;
(* size of the normalized buffer in bytes (the implementation compares it
   against SIZEOF(UnicodeCodePoint) * codepointsCount) *)
bufferSize :UDINT;
END_VAR
VAR_OUTPUT
(* number of normalized codepoints *)
normalizedCodepointsCount :UDINT := 0;
END_VAR
VAR
(* idx: position of the codepoint currently examined as a starter;
   nextIdx: position of the following codepoint tested for composition with it *)
idx, nextIdx: UDINT;
(* canonical combining class reported by CheckCombiningMark for the codepoint at idx *)
currentCCC :UDINT;
(* canonical combining class reported by CheckCombiningMark for the codepoint at nextIdx *)
nextCCC :UDINT;
(* composite returned by GetRecompositionFromCodePoints for the pair (idx, nextIdx) *)
composedCodePoint :UnicodeCodePoint;
(* combining class of the last non-starter that stayed uncomposed behind the current starter;
   signed so that -1 can mean "none yet" *)
prevCCC :DINT;
(* number of bytes copied to the output buffer when the input is already in NFC *)
copySize :UDINT;
END_VAR
Implementation
(* Produces NFC in three steps:
     1. if the quick check reports the input as already NFC, copy it through unchanged;
     2. otherwise decompose into NFD (canonical ordering is required for composition);
     3. compose each starter with the combining marks that follow it, in place.
   normalizedCodepointsCount receives the number of codepoints left in normalizedCodePoints.
   NOTE(review): the BOOL function result is never assigned in this body - confirm what
   callers expect before relying on it. *)
RETURN((codePoints = 0) OR_ELSE (codepointsCount = 0) OR_ELSE (normalizedCodepoints = 0));
// run fast quickcheck first -- many strings are already in NFC form
IF (QuickCheckCodepointsNormalized(
        codePoints := codePoints,
        codepointCount := codepointsCount,
        formToCheck := NormalizationForm.NFC
    ) = NormalizationQuickCheckResult.YES)
THEN
    // never copy more than the output buffer can hold, and only whole codepoints
    copySize := SIZEOF(UnicodeCodePoint) * codepointsCount;
    IF copySize > bufferSize THEN
        copySize := bufferSize - (bufferSize MOD SIZEOF(UnicodeCodePoint));
    END_IF
    Tc2_System.MEMCPY(normalizedCodePoints, codePoints, copySize);
    // report what was actually written, which is less than codepointsCount on truncation
    normalizedCodepointsCount := copySize / SIZEOF(UnicodeCodePoint);
    RETURN;
END_IF
// get NFD first - canonical order of codepoints is important!
// The caller's real buffer size is forwarded so the decomposition is bounded by the
// actual capacity of normalizedCodePoints.
// NOTE(review): how NormalizeCodepointsFormD reports a too-small buffer is not visible here.
NormalizeCodepointsFormD(
    codePoints := codePoints,
    codePointsCount := codepointsCount,
    normalizedCodepoints := normalizedCodePoints,
    bufferSize := bufferSize,
    normalizedCodepointsCount => normalizedCodepointsCount
);
(* walk through all codepoints and check whether a starter can be combined with one of
   the non starters following it.
   starters have a canonical combining class = 0, non starters have a ccc > 0.
   WHILE loops are used because both indices and the element count change inside the
   loop body; "idx < count" also stays safe when the count is 0. *)
idx := 0;
WHILE idx < normalizedCodepointsCount DO
    CheckCombiningMark(codePoint := ADR(normalizedCodePoints[idx]), canonicalCombiningClass => currentCCC);
    IF (currentCCC = 0) THEN // is a starter
        prevCCC := -1; // no uncomposed non starter behind this starter yet
        nextIdx := idx + 1;
        WHILE nextIdx < normalizedCodepointsCount DO
            CheckCombiningMark(codePoint := ADR(normalizedCodePoints[nextIdx]), canonicalCombiningClass => nextCCC);
            IF GetRecompositionFromCodePoints(
                    codePointBase := ADR(normalizedCodePoints[idx]),
                    codePointComb := ADR(normalizedCodePoints[nextIdx]),
                    result => composedCodePoint)
                AND_THEN (prevCCC < TO_DINT(nextCCC))
            THEN
                // replace the starter by the composite and remove the consumed codepoint
                normalizedCodePoints[idx] := composedCodePoint;
                normalizedCodePoints[nextIdx] := 0;
                IF (nextIdx + 1 < normalizedCodepointsCount) THEN
                    // source and destination overlap, so MEMMOVE is required here
                    Tc2_System.MEMMOVE(
                        ADR(normalizedCodePoints[nextIdx]),
                        ADR(normalizedCodePoints[nextIdx + 1]),
                        SIZEOF(UnicodeCodePoint) * (normalizedCodepointsCount - nextIdx - 1));
                END_IF
                normalizedCodepointsCount := normalizedCodepointsCount - 1;
                // repeat all checks with the new starter
                nextIdx := idx + 1;
                prevCCC := -1;
            ELSIF (nextCCC = 0) THEN
                // reached the next starter: the outer loop continues there after its increment
                idx := nextIdx - 1;
                EXIT;
            ELSE
                // uncomposed non starter: remember its class for the next comparison
                prevCCC := TO_DINT(nextCCC);
                nextIdx := nextIdx + 1;
            END_IF
        END_WHILE
    END_IF
    idx := idx + 1;
END_WHILE