NormalizeCodepointsFormC

Short summary

This function creates a codepoint array in Unicode Normalization Form C (NFC) from a given codepoint array. To learn more about the Unicode normalization forms, see Unicode Standard Annex #15 (Unicode Normalization Forms).

Return type: BOOL

Parameters

Name	Type	Comment	Kind
codePoints	POINTER TO BYTE	pointer to the unnormalized codepoint sequence	input
codepointsCount	UDINT	number of codepoints in the unnormalized codepoint sequence	input
normalizedCodePoints	POINTER TO UnicodeCodePoint	pointer to the buffer where the normalized sequence is stored	input
bufferSize	UDINT	size of the normalized buffer in bytes	input
normalizedCodepointsCount	UDINT	number of normalized codepoints	output

Code

`Declaration`

(* Writes the Normalization Form C (NFC) of a codepoint sequence into a caller-provided buffer.
   The implementation first runs a quick check and copies the input unchanged when it is
   already NFC; otherwise it decomposes via NormalizeCodepointsFormD and then recombines
   starters with their following combining marks in place.
   NOTE(review): the implementation never assigns the function result explicitly -
   confirm what value callers should expect from the BOOL return. *)
FUNCTION NormalizeCodepointsFormC : BOOL
VAR_INPUT
	(* pointer to the unnormalized codepoint sequence
	   (copied element-wise as UnicodeCodePoint by the implementation) *)
	codePoints :POINTER TO BYTE;
	(* number of codepoints in the unnormalized codepoint sequence *)
	codepointsCount :UDINT;
	(* pointer to the buffer where the normalized sequence is stored *)
	normalizedCodePoints :POINTER TO UnicodeCodePoint;
	(* size of the normalized buffer; the implementation compares it against
	   SIZEOF(UnicodeCodePoint) * codepointsCount, i.e. it is a size in bytes *)
	bufferSize :UDINT;
END_VAR
VAR_OUTPUT 
	(* number of normalized codepoints; stays 0 when the function leaves early *)
	normalizedCodepointsCount :UDINT := 0;
END_VAR	
VAR
	(* idx: index of the codepoint currently examined as a possible starter;
	   nextIdx: index of the following codepoint tested for recomposition with it *)
	idx, nextIdx: UDINT;
	(* canonical combining class of the codepoint at idx (0 = starter) *)
	currentCCC :UDINT;
	(* canonical combining class of the codepoint at nextIdx *)
	nextCCC :UDINT;
	(* result of recomposing the starter with the codepoint at nextIdx *)
	composedCodePoint :UnicodeCodePoint;
	(* combining class of the last codepoint that was skipped after the current starter;
	   signed so that -1 can mean "nothing skipped yet" *)
	prevCCC :DINT;
	(* number of bytes copied when the input is already in NFC *)
	copySize :UDINT;
END_VAR

`Implementation`

// Guard: null input pointer, empty input or null output buffer.
// NOTE(review): RETURN(<condition>) is presumably a conditional early return that leaves the
// function when the expression is TRUE - confirm against the compiler/library conventions.
RETURN((codePoints = 0) OR_ELSE (codepointsCount = 0) OR_ELSE (normalizedCodepoints = 0));
// run fast quickcheck first -- many strings are already in NFC form
IF (QuickCheckCodepointsNormalized(
		codePoints := codePoints,
		codepointCount := codepointsCount,
		formToCheck := NormalizationForm.NFC 
	) = NormalizationQuickCheckResult.YES) 
THEN
	// Already NFC: copy the input verbatim, but only as many WHOLE codepoints as fit into
	// the caller's buffer (bufferSize is in bytes), and report exactly that many codepoints
	// so the caller never reads past what was actually written.
	normalizedCodepointsCount := bufferSize / SIZEOF(UnicodeCodePoint);
	IF (codepointsCount < normalizedCodepointsCount) THEN
		normalizedCodepointsCount := codepointsCount;
	END_IF
	copySize := SIZEOF(UnicodeCodePoint) * normalizedCodepointsCount;
	Tc2_System.MEMCPY( normalizedCodePoints, codePoints, copySize);
	RETURN;
END_IF

// get NFD first - canonical order of codepoints is important!
// Pass the caller's real buffer size on, so the decomposition cannot be told that more
// memory is available than the caller actually provided.
// NOTE(review): how NormalizeCodepointsFormD reacts to a too small buffer is not visible
// here - confirm that it truncates/fails instead of writing out of bounds.
NormalizeCodepointsFormD(
	codePoints := codePoints,
	codePointsCount := codepointsCount,
	normalizedCodepoints := normalizedCodePoints,
	bufferSize := bufferSize,
	normalizedCodepointsCount => normalizedCodepointsCount 
);

// Nothing was decomposed: leave before "normalizedCodepointsCount - 1" below can wrap
// around (UDINT) and turn the loop bound into 16#FFFFFFFF.
IF (normalizedCodepointsCount = 0) THEN
	RETURN;
END_IF

(* loop through all unicode points and check if a starter codepoint 
 can be combined with one of its following non starters
 starters have a canonical combining class = 0, non starters ccc is > 0

 NOTE(review): normalizedCodepointsCount shrinks inside the loops and both loop counters
 are reassigned in the body; this assumes the FOR end value is re-evaluated on every
 iteration and that assignments to the counter take effect - confirm for the target compiler. *)

FOR idx := 0 TO normalizedCodepointsCount - 1 DO
	// -1 = no codepoint has been skipped after this starter yet
	prevCCC := -1;
	CheckCombiningMark( codePoint := ADR(normalizedCodePoints[idx]), canonicalCombiningClass => currentCCC);
	IF  (currentCCC = 0) THEN // is a starter
		FOR nextIdx := idx + 1 TO normalizedCodepointsCount - 1 DO
			// check if next sign is a non starter
			CheckCombiningMark(codePoint := ADR(normalizedCodePoints[nextIdx]), canonicalCombiningClass => nextCCC );
			// Recompose only if a composition exists and the candidate is not blocked, i.e. the
			// last skipped codepoint has a strictly lower combining class than the candidate.
			IF GetRecompositionFromCodePoints(
				codePointBase := ADR(normalizedCodePoints[idx]),
				codePointComb := ADR(normalizedCodePoints[nextIdx]),
				result => composedCodePoint)
			  AND_THEN (prevCCC < TO_DINT(nextCCC))
			THEN
				normalizedCodePoints[idx] := composedCodePoint;
				normalizedCodePoints[nextIdx] := 0;
				IF (nextIdx+1<normalizedCodepointsCount) THEN
					// Close the gap left by the consumed codepoint. Source and destination
					// overlap, therefore MEMMOVE (overlap-safe) is used instead of MEMCPY.
					Tc2_System.MEMMOVE(ADR(normalizedCodePoints[nextIdx]),ADR(normalizedCodePoints[nextIdx+1]),SIZEOF(UnicodeCodePoint)*(normalizedCodepointsCount-nextIdx-1));	
				END_IF
				normalizedCodepointsCount := normalizedCodepointsCount - 1;
				//repeat all checks with new starter
				nextIdx := idx; // the loop increment moves this to idx + 1
				prevCCC := -1;
			ELSIF (nextCCC = 0) THEN
				// Next starter found: continue the outer loop there
				// (the outer loop increment turns nextIdx - 1 into nextIdx).
				idx := nextIdx - 1; // check for next starter	
				EXIT;
			ELSE
				// Non starter that was not consumed: remember its class for the blocking check.
				prevCCC := TO_DINT(nextCCC);
			END_IF
		END_FOR
	END_IF
END_FOR

NormalizeCodepointsFormC

Short summary​

Parameters​

Code​

Declaration​

Implementation​

Short summary

Parameters

Code

`Declaration`

`Implementation`