{**************************************************************************************************} { } { Project JEDI Code Library (JCL) } { } { The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); } { you may not use this file except in compliance with the License. You may obtain a copy of the } { License at http://www.mozilla.org/MPL/ } { } { Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF } { ANY KIND, either express or implied. See the License for the specific language governing rights } { and limitations under the License. } { } { The Original Code is JclUnicode.pas. } { } { The Initial Developer of the Original Code is Mike Lischke (public att lischke-online dott de). } { Portions created by Mike Lischke are Copyright (C) 1999-2000 Mike Lischke. All Rights Reserved. } { } { Contributor(s): } { Marcel van Brakel } { Andreas Hausladen (ahuser) } { Mike Lischke } { Flier Lu (flier) } { Robert Marquardt (marquardt) } { Robert Rossmair (rrossmair) } { Olivier Sannier (obones) } { Matthias Thoma (mthoma) } { Petr Vones (pvones) } { Peter Schraut (http://www.console-dev.de) } { Florent Ouchet (outchy) } { } {**************************************************************************************************} { } { String conversion routines } { } {**************************************************************************************************} { } { Last modified: $Date:: 2009-08-09 15:08:29 +0200 (dim., 09 août 2009) $ } { Revision: $Rev:: 2921 $ } { Author: $Author:: outchy $ } { } {**************************************************************************************************} unit JclStringConversions; {$I jcl.inc} interface uses Classes, {$IFDEF UNITVERSIONING} JclUnitVersioning, {$ENDIF UNITVERSIONING} JclBase; type EJclStringConversionError = class(EJclError); EJclUnexpectedEOSequenceError = class (EJclStringConversionError) public constructor Create; end; // conversion routines between Ansi, UTF-16, UCS-4 and UTF8 strings // one shot conversion between PAnsiChar and PWideChar procedure ExpandASCIIString(const Source: PAnsiChar; Target: PWideChar; Count: SizeInt); // tpye of stream related functions type TJclStreamGetNextCharFunc = function(S: TStream; out Ch: UCS4): Boolean; TJclStreamSkipCharsFunc = function(S: TStream; var NbSeq: SizeInt): Boolean; TJclStreamSetNextCharFunc = function(S: TStream; Ch: UCS4): Boolean; // iterative conversions // UTF8GetNextChar = read next UTF8 sequence at StrPos // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF8 sequence) // StrPos will be incremented by the number of chars that were read function UTF8GetNextChar(const S: TUTF8String; var StrPos: SizeInt): UCS4; function UTF8GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; // UTF8SkipChars = skip NbSeq UTF8 sequences starting from StrPos // returns False if String is too small // if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF8 sequence) // StrPos will be incremented by the number of chars that were skipped // On return, NbSeq contains the number of UTF8 sequences that were skipped function UTF8SkipChars(const S: TUTF8String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; function UTF8SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean; // UTF8SetNextChar = append an UTF8 sequence at StrPos // returns False on error: // - if an UCS4 character cannot be stored to an UTF-8 string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, callee did nothing, caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written function UTF8SetNextChar(var S: TUTF8String; var StrPos: SizeInt; Ch: UCS4): Boolean; function UTF8SetNextCharToStream(S: TStream; Ch: UCS4): Boolean; // UTF16GetNextChar = read next UTF16 sequence at StrPos // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence) // StrPos will be incremented by the number of chars that were read function UTF16GetNextChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; overload; {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16GetNextChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; overload; {$ENDIF SUPPORTS_UNICODE_STRING} function UTF16GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; // UTF16GetPreviousChar = read previous UTF16 sequence starting at StrPos-1 // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence) // StrPos will be decremented by the number of chars that were read function UTF16GetPreviousChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; overload; {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16GetPreviousChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; overload; {$ENDIF SUPPORTS_UNICODE_STRING} // UTF16SkipChars = skip NbSeq UTF16 sequences starting from StrPos // returns False if String is too small // if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence) // StrPos will be incremented by the number of chars that were skipped // On return, NbChar contains the number of UTF16 sequences that were skipped function UTF16SkipChars(const S: TUTF16String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; overload; {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16SkipChars(const S: UnicodeString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; overload; {$ENDIF SUPPORTS_UNICODE_STRING} function UTF16SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean; // UTF16SetNextChar = append an UTF16 sequence at StrPos // returns False on error: // - if an UCS4 character cannot be stored to an UTF-16 string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written function UTF16SetNextChar(var S: TUTF16String; var StrPos: SizeInt; Ch: UCS4): Boolean; overload; {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16SetNextChar(var S: UnicodeString; var StrPos: SizeInt; Ch: UCS4): Boolean; overload; {$ENDIF SUPPORTS_UNICODE_STRING} function UTF16SetNextCharToStream(S: TStream; Ch: UCS4): Boolean; // AnsiGetNextChar = read next character at StrPos // StrPos will be incremented by the number of chars that were read (1) function AnsiGetNextChar(const S: AnsiString; var StrPos: SizeInt): UCS4; overload; function AnsiGetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; overload; // same as AnsiGetNextChar* with custom codepage function AnsiGetNextChar(const S: AnsiString; CodePage: Word; var StrPos: SizeInt): UCS4; overload; function AnsiGetNextCharFromStream(S: TStream; CodePage: Word; out Ch: UCS4): Boolean; overload; // AnsiSkipChars = skip NbSeq characters starting from StrPos // returns False if String is too small // StrPos will be incremented by the number of chars that were skipped // On return, NbChar contains the number of UTF16 sequences that were skipped function AnsiSkipChars(const S: AnsiString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; function AnsiSkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean; // AnsiSetNextChar = append a character at StrPos // returns False on error: // - if an UCS4 character cannot be stored to an ansi string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written (1) function AnsiSetNextChar(var S: AnsiString; var StrPos: SizeInt; Ch: UCS4): Boolean; overload; function AnsiSetNextCharToStream(S: TStream; Ch: UCS4): Boolean; overload; // same as AnsiSetNextChar* with custom codepage function AnsiSetNextChar(var S: AnsiString; CodePage: Word; var StrPos: SizeInt; Ch: UCS4): Boolean; overload; function AnsiSetNextCharToStream(S: TStream; CodePage: Word; Ch: UCS4): Boolean; overload; // StringGetNextChar = read next character/sequence at StrPos // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence for WideString) // StrPos will be incremented by the number of chars that were read function StringGetNextChar(const S: string; var StrPos: SizeInt): UCS4; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF} // StringSkipChars = skip NbSeq characters/sequences starting from StrPos // returns False if String is too small // if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence for WideString) // StrPos will be incremented by the number of chars that were skipped // On return, NbChar contains the number of UTF16 sequences that were skipped function StringSkipChars(const S: string; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF} // StringSetNextChar = append a character/sequence at StrPos // returns False on error: // - if an UCS4 character cannot be stored to a string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written function StringSetNextChar(var S: string; var StrPos: SizeInt; Ch: UCS4): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF} // one shot conversions between WideString and others function WideStringToUTF8(const S: WideString): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UTF8ToWideString(const S: TUTF8String): WideString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideStringToUCS4(const S: WideString): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UCS4ToWideString(const S: TUCS4Array): WideString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} // one shot conversions between AnsiString and others function AnsiStringToUTF8(const S: AnsiString): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UTF8ToAnsiString(const S: TUTF8String): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function AnsiStringToUTF16(const S: AnsiString): TUTF16String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UTF16ToAnsiString(const S: TUTF16String): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function AnsiStringToUCS4(const S: AnsiString): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UCS4ToAnsiString(const S: TUCS4Array): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} // one shot conversions between string and others function StringToUTF8(const S: string): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UTF8ToString(const S: TUTF8String): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function StringToUTF16(const S: string): TUTF16String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UTF16ToString(const S: TUTF16String): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function StringToUCS4(const S: string): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UCS4ToString(const S: TUCS4Array): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function TryStringToUTF8(const S: string; out D: TUTF8String): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function TryUTF8ToString(const S: TUTF8String; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function TryStringToUTF16(const S: string; out D: TUTF16String): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function TryUTF16ToString(const S: TUTF16String; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function TryStringToUCS4(const S: string; out D: TUCS4Array): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function TryUCS4ToString(const S: TUCS4Array; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function UTF8ToUTF16(const S: TUTF8String): TUTF16String; function UTF16ToUTF8(const S: TUTF16String): TUTF8String; function UTF8ToUCS4(const S: TUTF8String): TUCS4Array; function UCS4ToUTF8(const S: TUCS4Array): TUTF8String; function UTF16ToUCS4(const S: TUTF16String): TUCS4Array; function UCS4ToUTF16(const S: TUCS4Array): TUTF16String; function TryUTF8ToUTF16(const S: TUTF8String; out D: TUTF16String): Boolean; function TryUTF16ToUTF8(const S: TUTF16String; out D: TUTF8String): Boolean; function TryUTF8ToUCS4(const S: TUTF8String; out D: TUCS4Array): Boolean; function TryUCS4ToUTF8(const S: TUCS4Array; out D: TUTF8String): Boolean; function TryUTF16ToUCS4(const S: TUTF16String; out D: TUCS4Array): Boolean; function TryUCS4ToUTF16(const S: TUCS4Array; out D: TUTF16String): Boolean; // indexed conversions function UTF8CharCount(const S: TUTF8String): SizeInt; function UTF16CharCount(const S: TUTF16String): SizeInt; function UCS2CharCount(const S: TUCS2String): SizeInt; function UCS4CharCount(const S: TUCS4Array): SizeInt; // returns False if string is too small // if UNICODE_SILENT_FAILURE is not defined and an invalid UTFX sequence is detected, an exception is raised // returns True on success and Value contains UCS4 character that was read function GetUCS4CharAt(const UTF8Str: TUTF8String; Index: SizeInt; out Value: UCS4): Boolean; overload; function GetUCS4CharAt(const WideStr: TUTF16String; Index: SizeInt; out Value: UCS4; IsUTF16: Boolean = True): Boolean; overload; function GetUCS4CharAt(const UCS4Str: TUCS4Array; Index: SizeInt; out Value: UCS4): Boolean; overload; function UCS4ToAnsiChar(Value: UCS4): AnsiChar; function UCS4ToWideChar(Value: UCS4): WideChar; function UCS4ToChar(Value: UCS4): Char; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function AnsiCharToUCS4(Value: AnsiChar): UCS4; function WideCharToUCS4(Value: WideChar): UCS4; function CharToUCS4(Value: Char): UCS4; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} {$IFDEF UNITVERSIONING} const UnitVersioning: TUnitVersionInfo = ( RCSfile: '$URL: https://jcl.svn.sourceforge.net/svnroot/jcl/tags/JCL-2.1-Build3536/jcl/source/common/JclStringConversions.pas $'; Revision: '$Revision: 2921 $'; Date: '$Date: 2009-08-09 15:08:29 +0200 (dim., 09 août 2009) $'; LogPath: 'JCL\source\common'; Extra: ''; Data: nil ); {$ENDIF UNITVERSIONING} implementation uses {$IFDEF MSWINDOWS} Windows, {$ENDIF MSWINDOWS} JclResources; const MB_ERR_INVALID_CHARS = 8; constructor EJclUnexpectedEOSequenceError.Create; begin inherited CreateRes(@RsEUnexpectedEOSeq); end; function StreamReadByte(S: TStream; out B: Byte): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF} begin B := 0; Result := S.Read(B, SizeOf(B)) = SizeOf(B); end; function StreamWriteByte(S: TStream; B: Byte): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF} begin Result := S.Write(B, SizeOf(B)) = SizeOf(B); end; function StreamReadWord(S: TStream; out W: Word): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF} begin W := 0; Result := S.Read(W, SizeOf(W)) = SizeOf(W); end; function StreamWriteWord(S: TStream; W: Word): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF} begin Result := S.Write(W, SizeOf(W)) = SizeOf(W); end; //----------------- conversion routines ------------------------------------------------------------ // Converts the given source ANSI string into a Unicode string by expanding each character // from one byte to two bytes. // EAX contains Source, EDX contains Target, ECX contains Count procedure ExpandASCIIString(const Source: PAnsiChar; Target: PWideChar; Count: SizeInt); asm {$IFDEF CPU32} // --> EAX Source // EDX Target // ECX Count JECXZ @@Finish // go out if there is nothing to do (ECX = 0) PUSH ESI MOV ESI, EAX XOR EAX, EAX @@1: MOV AL, [ESI] INC ESI MOV [EDX], AX ADD EDX, 2 DEC ECX JNZ @@1 POP ESI {$ENDIF CPU32} {$IFDEF CPU64} // --> RCX Source // RDX Target // R8 Count MOV RAX, RCX MOV RCX, R8 JRCXZ @@Finish // go out if there is nothing to do (ECX = 0) MOV RSI, RAX XOR RAX, RAX @@1: MOV AL, [RSI] INC RSI MOV [RDX], AX ADD RDX, 2 DEC RCX JNZ @@1 {$ENDIF CPU64} @@Finish: end; const HalfShift: Integer = 10; HalfBase: UCS4 = $0010000; HalfMask: UCS4 = $3FF; procedure FlagInvalidSequence(var StrPos: SizeInt; Increment: SizeInt; out Ch: UCS4); overload; begin {$IFDEF UNICODE_SILENT_FAILURE} Ch := UCS4ReplacementCharacter; Inc(StrPos, Increment); {$ELSE ~UNICODE_SILENT_FAILURE} StrPos := -1; {$ENDIF ~UNICODE_SILENT_FAILURE} end; procedure FlagInvalidSequence(var StrPos: SizeInt; Increment: SizeInt); overload; begin {$IFDEF UNICODE_SILENT_FAILURE} Inc(StrPos, Increment); {$ELSE ~UNICODE_SILENT_FAILURE} StrPos := -1; {$ENDIF ~UNICODE_SILENT_FAILURE} end; procedure FlagInvalidSequence(out Ch: UCS4); overload; begin {$IFDEF UNICODE_SILENT_FAILURE} Ch := UCS4ReplacementCharacter; {$ELSE ~UNICODE_SILENT_FAILURE} raise EJclUnexpectedEOSequenceError.Create; {$ENDIF ~UNICODE_SILENT_FAILURE} end; procedure FlagInvalidSequence; overload; begin {$IFNDEF UNICODE_SILENT_FAILURE} raise EJclUnexpectedEOSequenceError.Create; {$ENDIF ~UNICODE_SILENT_FAILURE} end; // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF8 sequence) // StrPos will be incremented by the number of chars that were read function UTF8GetNextChar(const S: TUTF8String; var StrPos: SizeInt): UCS4; var StrLength: SizeInt; ChNext: UCS4; begin StrLength := Length(S); if (StrPos <= StrLength) and (StrPos > 0) then begin Result := UCS4(S[StrPos]); case Result of $00..$7F: // 1 byte to read Inc(StrPos); $C0..$DF: begin // 2 bytes to read if StrPos >= StrLength then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; ChNext := UCS4(S[StrPos + 1]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; Result := ((Result and $1F) shl 6) or (ChNext and $3F); Inc(StrPos, 2); end; $E0..$EF: begin // 3 bytes to read if (StrPos + 1) >= StrLength then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; ChNext := UCS4(S[StrPos + 1]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; Result := ((Result and $0F) shl 12) or ((ChNext and $3F) shl 6); ChNext := UCS4(S[StrPos + 2]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 2, Result); Exit; end; Result := Result or (ChNext and $3F); Inc(StrPos, 3); end; $F0..$F7: begin // 4 bytes to read if (StrPos + 2) >= StrLength then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; ChNext := UCS4(S[StrPos + 1]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; Result := ((Result and $07) shl 18) or ((ChNext and $3F) shl 12); ChNext := UCS4(S[StrPos + 2]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 2, Result); Exit; end; Result := Result or ((ChNext and $3F) shl 6); ChNext := UCS4(S[StrPos + 3]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 3, Result); Exit; end; Result := Result or (ChNext and $3F); Inc(StrPos, 4); end; $F8..$FB: begin // 5 bytes to read if (StrPos + 3) >= StrLength then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; ChNext := UCS4(S[StrPos + 1]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; Result := ((Result and $03) shl 24) or ((ChNext and $3F) shl 18); ChNext := UCS4(S[StrPos + 2]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 2, Result); Exit; end; Result := Result or ((ChNext and $3F) shl 12); ChNext := UCS4(S[StrPos + 3]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 3, Result); Exit; end; Result := Result or ((ChNext and $3F) shl 6); ChNext := UCS4(S[StrPos + 4]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 4, Result); Exit; end; Result := Result or (ChNext and $3F); Inc(StrPos, 5); end; $FC..$FD: begin // 6 bytes to read if (StrPos + 4) >= StrLength then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; ChNext := UCS4(S[StrPos + 1]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; Result := ((Result and $01) shl 30) or ((ChNext and $3F) shl 24); ChNext := UCS4(S[StrPos + 2]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 2, Result); Exit; end; Result := Result or ((ChNext and $3F) shl 18); ChNext := UCS4(S[StrPos + 3]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 3, Result); Exit; end; Result := Result or ((ChNext and $3F) shl 12); ChNext := UCS4(S[StrPos + 4]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 4, Result); Exit; end; Result := Result or ((ChNext and $3F) shl 6); ChNext := UCS4(S[StrPos + 5]); if (ChNext and $C0) <> $80 then begin FlagInvalidSequence(StrPos, 5, Result); Exit; end; Result := Result or (ChNext and $3F); Inc(StrPos, 6); end; else FlagInvalidSequence(StrPos, 1, Result); Exit; end; end else begin // StrPos > StrLength Result := 0; FlagInvalidSequence(StrPos, 0, Result); end; end; function UTF8GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; var B: Byte; begin Result := StreamReadByte(S,B); if not Result then Exit; Ch := UCS4(B); case Ch of $00..$7F: ; // 1 byte to read // nothing to do $C0..$DF: begin // 2 bytes to read Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := ((Ch and $1F) shl 6) or (B and $3F); end; $E0..$EF: begin // 3 bytes to read Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := ((Ch and $0F) shl 12) or ((B and $3F) shl 6); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or (B and $3F); end; $F0..$F7: begin // 4 bytes to read Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := ((Ch and $07) shl 18) or ((B and $3F) shl 12); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or ((B and $3F) shl 6); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or (B and $3F); end; $F8..$FB: begin // 5 bytes to read Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := ((Ch and $03) shl 24) or ((B and $3F) shl 18); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or ((B and $3F) shl 12); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or ((B and $3F) shl 6); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or (B and $3F); end; $FC..$FD: begin // 6 bytes to read Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := ((Ch and $01) shl 30) or ((B and $3F) shl 24); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or ((B and $3F) shl 18); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or ((B and $3F) shl 12); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or ((B and $3F) shl 6); Result := StreamReadByte(S,B); if not Result then Exit; if (B and $C0) <> $80 then begin FlagInvalidSequence(Ch); Exit; end; Ch := Ch or (B and $3F); end; else FlagInvalidSequence(Ch); Exit; end; end; // returns False if String is too small // if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF8 sequence) // StrPos will be incremented by the number of ansi chars that were skipped // On return, NbSeq contains the number of UTF8 sequences that were skipped function UTF8SkipChars(const S: TUTF8String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; var StrLength: SizeInt; Ch: UCS4; Index: SizeInt; begin Result := True; StrLength := Length(S); Index := 0; while (Index < NbSeq) and (StrPos > 0) do begin Ch := UCS4(S[StrPos]); case Ch of $00..$7F: // 1 byte to skip Inc(StrPos); $C0..$DF: // 2 bytes to skip if (StrPos >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then FlagInvalidSequence(StrPos, 1) else Inc(StrPos, 2); $E0..$EF: // 3 bytes to skip if ((StrPos + 1) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then FlagInvalidSequence(StrPos, 1) else if (UCS4(S[StrPos + 2]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 2) else Inc(StrPos, 3); $F0..$F7: // 4 bytes to skip if ((StrPos + 2) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then FlagInvalidSequence(StrPos, 1) else if (UCS4(S[StrPos + 2]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 2) else if (UCS4(S[StrPos + 3]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 3) else Inc(StrPos, 4); $F8..$FB: // 5 bytes to skip if ((StrPos + 3) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then FlagInvalidSequence(StrPos, 1) else if (UCS4(S[StrPos + 2]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 2) else if (UCS4(S[StrPos + 3]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 3) else if (UCS4(S[StrPos + 4]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 4) else Inc(StrPos, 5); $FC..$FD: // 6 bytes to skip if ((StrPos + 4) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then FlagInvalidSequence(StrPos, 1) else if (UCS4(S[StrPos + 2]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 2) else if (UCS4(S[StrPos + 3]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 3) else if (UCS4(S[StrPos + 4]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 4) else if (UCS4(S[StrPos + 5]) and $C0) <> $80 then FlagInvalidSequence(StrPos, 5) else Inc(StrPos, 6); else FlagInvalidSequence(StrPos, 1); end; if StrPos <> -1 then Inc(Index); if (StrPos > StrLength) and (Index < NbSeq) then begin Result := False; Break; end; end; NbSeq := Index; end; function UTF8SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean; var B: Byte; Index: SizeInt; begin Index := 0; while (Index < NbSeq) do begin Result := StreamReadByte(S, B); if not Result then Break; case B of $00..$7F: ; // 1 byte to skip // nothing to do $C0..$DF: // 2 bytes to skip begin Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; end; $E0..$EF: // 3 bytes to skip begin Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; end; $F0..$F7: // 4 bytes to skip begin Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; end; $F8..$FB: // 5 bytes to skip begin Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; end; $FC..$FD: // 6 bytes to skip begin Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; Result := StreamReadByte(S, B); if not Result then Break; if (B and $C0) <> $80 then FlagInvalidSequence; end; else FlagInvalidSequence; end; Inc(Index); end; Result := Index = NbSeq; NbSeq := Index; end; // returns False on error: // - if an UCS4 character cannot be stored to an UTF-8 string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written function UTF8SetNextChar(var S: TUTF8String; var StrPos: SizeInt; Ch: UCS4): Boolean; var StrLength: SizeInt; begin StrLength := Length(S); if Ch <= $7F then begin // 7 bits to store Result := (StrPos > 0) and (StrPos <= StrLength); if Result then begin S[StrPos] := AnsiChar(Ch); Inc(StrPos); end; end else if Ch <= $7FF then begin // 11 bits to store Result := (StrPos > 0) and (StrPos < StrLength); if Result then begin S[StrPos] := AnsiChar($C0 or (Ch shr 6)); // 5 bits S[StrPos + 1] := AnsiChar((Ch and $3F) or $80); // 6 bits Inc(StrPos, 2); end; end else if Ch <= $FFFF then begin // 16 bits to store Result := (StrPos > 0) and (StrPos < (StrLength - 1)); if Result then begin S[StrPos] := AnsiChar($E0 or (Ch shr 12)); // 4 bits S[StrPos + 1] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits S[StrPos + 2] := AnsiChar((Ch and $3F) or $80); // 6 bits Inc(StrPos, 3); end; end else if Ch <= $1FFFFF then begin // 21 bits to store Result := (StrPos > 0) and (StrPos < (StrLength - 2)); if Result then begin S[StrPos] := AnsiChar($F0 or (Ch shr 18)); // 3 bits S[StrPos + 1] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits S[StrPos + 2] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits S[StrPos + 3] := AnsiChar((Ch and $3F) or $80); // 6 bits Inc(StrPos, 4); end; end else if Ch <= $3FFFFFF then begin // 26 bits to store Result := (StrPos > 0) and (StrPos < (StrLength - 2)); if Result then begin S[StrPos] := AnsiChar($F8 or (Ch shr 24)); // 2 bits S[StrPos + 1] := AnsiChar(((Ch shr 18) and $3F) or $80); // 6 bits S[StrPos + 2] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits S[StrPos + 3] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits S[StrPos + 4] := AnsiChar((Ch and $3F) or $80); // 6 bits Inc(StrPos, 5); end; end else if Ch <= MaximumUCS4 then begin // 31 bits to store Result := (StrPos > 0) and (StrPos < (StrLength - 3)); if Result then begin S[StrPos] := AnsiChar($FC or (Ch shr 30)); // 1 bits S[StrPos + 1] := AnsiChar(((Ch shr 24) and $3F) or $80); // 6 bits S[StrPos + 2] := AnsiChar(((Ch shr 18) and $3F) or $80); // 6 bits S[StrPos + 3] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits S[StrPos + 4] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits S[StrPos + 5] := AnsiChar((Ch and $3F) or $80); // 6 bits Inc(StrPos, 6); end; end else begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter Result := (StrPos > 0) and (StrPos < (StrLength - 1)); if Result then begin S[StrPos] := AnsiChar($E0 or (UCS4ReplacementCharacter shr 12)); // 4 bits S[StrPos + 1] := AnsiChar(((UCS4ReplacementCharacter shr 6) and $3F) or $80); // 6 bits S[StrPos + 2] := AnsiChar((UCS4ReplacementCharacter and $3F) or $80); // 6 bits Inc(StrPos, 3); end; {$ELSE ~UNICODE_SILENT_FAILURE} StrPos := -1; Result := False; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; function UTF8SetNextCharToStream(S: TStream; Ch: UCS4): Boolean; begin if Ch <= $7F then // 7 bits to store Result := StreamWriteByte(S,Ch) else if Ch <= $7FF then // 11 bits to store Result := StreamWriteByte(S, $C0 or (Ch shr 6)) and // 5 bits StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits else if Ch <= $FFFF then // 16 bits to store Result := StreamWriteByte(S, $E0 or (Ch shr 12)) and // 4 bits StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits else if Ch <= $1FFFFF then // 21 bits to store Result := StreamWriteByte(S, $F0 or (Ch shr 18)) and // 3 bits StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits else if Ch <= $3FFFFFF then // 26 bits to store Result := StreamWriteByte(S, $F8 or (Ch shr 24)) and // 2 bits StreamWriteByte(S, ((Ch shr 18) and $3F) or $80) and // 6 bits StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits else if Ch <= MaximumUCS4 then // 31 bits to store Result := StreamWriteByte(S, $FC or (Ch shr 30)) and // 1 bits StreamWriteByte(S, ((Ch shr 24) and $3F) or $80) and // 6 bits StreamWriteByte(S, ((Ch shr 18) and $3F) or $80) and // 6 bits StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits else {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter Result := StreamWriteByte(S, $E0 or (UCS4ReplacementCharacter shr 12)) and // 4 bits StreamWriteByte(S, ((UCS4ReplacementCharacter shr 6) and $3F) or $80) and // 6 bits StreamWriteByte(S, (UCS4ReplacementCharacter and $3F) or $80); // 6 bits {$ELSE ~UNICODE_SILENT_FAILURE} Result := False; {$ENDIF ~UNICODE_SILENT_FAILURE} end; // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence) // StrPos will be incremented by the number of chars that were read function UTF16GetNextChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; var StrLength: SizeInt; ChNext: UCS4; begin StrLength := Length(S); if (StrPos <= StrLength) and (StrPos > 0) then begin Result := UCS4(S[StrPos]); case Result of SurrogateHighStart..SurrogateHighEnd: begin // 2 bytes to read if StrPos >= StrLength then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; ChNext := UCS4(S[StrPos + 1]); if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; Result := ((Result - SurrogateHighStart) shl HalfShift) + (ChNext - SurrogateLowStart) + HalfBase; Inc(StrPos, 2); end; SurrogateLowStart..SurrogateLowEnd: FlagInvalidSequence(StrPos, 1, Result); else // 1 byte to read Inc(StrPos); end; end else begin // StrPos > StrLength Result := 0; FlagInvalidSequence(StrPos, 0, Result); end; end; // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence) // StrPos will be incremented by the number of chars that were read {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16GetNextChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; var StrLength: SizeInt; ChNext: UCS4; begin StrLength := Length(S); if (StrPos <= StrLength) and (StrPos > 0) then begin Result := UCS4(S[StrPos]); case Result of SurrogateHighStart..SurrogateHighEnd: begin // 2 bytes to read if StrPos >= StrLength then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; ChNext := UCS4(S[StrPos + 1]); if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then begin FlagInvalidSequence(StrPos, 1, Result); Exit; end; Result := ((Result - SurrogateHighStart) shl HalfShift) + (ChNext - SurrogateLowStart) + HalfBase; Inc(StrPos, 2); end; SurrogateLowStart..SurrogateLowEnd: FlagInvalidSequence(StrPos, 1, Result); else // 1 byte to read Inc(StrPos); end; end else begin // StrPos > StrLength Result := 0; FlagInvalidSequence(StrPos, 0, Result); end; end; {$ENDIF SUPPORTS_UNICODE_STRING} function UTF16GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; var W: Word; begin Result := StreamReadWord(S, W); if not Result then Exit; Ch := UCS4(W); case W of SurrogateHighStart..SurrogateHighEnd: begin // 2 bytes to read Result := StreamReadWord(S, W); if not Result then Exit; if (W < SurrogateLowStart) or (W > SurrogateLowEnd) then begin FlagInvalidSequence(Ch); Exit; end; Ch := ((Ch - SurrogateHighStart) shl HalfShift) + (W - SurrogateLowStart) + HalfBase; end; SurrogateLowStart..SurrogateLowEnd: FlagInvalidSequence(Ch); else // 1 byte to read // nothing to do end; end; // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence) // StrPos will be decremented by the number of chars that were read function UTF16GetPreviousChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; var StrLength: SizeInt; ChPrev: UCS4; begin StrLength := Length(S); if (StrPos <= (StrLength + 1)) and (StrPos > 1) then begin Result := UCS4(S[StrPos - 1]); case Result of SurrogateHighStart..SurrogateHighEnd: FlagInvalidSequence(StrPos, -1, Result); SurrogateLowStart..SurrogateLowEnd: begin // 2 bytes to read if StrPos <= 2 then begin FlagInvalidSequence(StrPos, -1, Result); Exit; end; ChPrev := UCS4(S[StrPos - 2]); if (ChPrev < SurrogateHighStart) or (ChPrev > SurrogateHighEnd) then begin FlagInvalidSequence(StrPos, -1, Result); Exit; end; Result := ((ChPrev - SurrogateHighStart) shl HalfShift) + (Result - SurrogateLowStart) + HalfBase; Dec(StrPos, 2); end; else // 1 byte to read Dec(StrPos); end; end else begin // StrPos > StrLength Result := 0; FlagInvalidSequence(StrPos, 0, Result); end; end; // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence) // StrPos will be decremented by the number of chars that were read {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16GetPreviousChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; var StrLength: SizeInt; ChPrev: UCS4; begin StrLength := Length(S); if (StrPos <= (StrLength + 1)) and (StrPos > 1) then begin Result := UCS4(S[StrPos - 1]); case Result of SurrogateHighStart..SurrogateHighEnd: FlagInvalidSequence(StrPos, -1, Result); SurrogateLowStart..SurrogateLowEnd: begin // 2 bytes to read if StrPos <= 2 then begin FlagInvalidSequence(StrPos, -1, Result); Exit; end; ChPrev := UCS4(S[StrPos - 2]); if (ChPrev < SurrogateHighStart) or (ChPrev > SurrogateHighEnd) then begin FlagInvalidSequence(StrPos, -1, Result); Exit; end; Result := ((ChPrev - SurrogateHighStart) shl HalfShift) + (Result - SurrogateLowStart) + HalfBase; Dec(StrPos, 2); end; else // 1 byte to read Dec(StrPos); end; end else begin // StrPos > StrLength Result := 0; FlagInvalidSequence(StrPos, 0, Result); end; end; {$ENDIF SUPPORTS_UNICODE_STRING} // returns False if String is too small // if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence) // StrPos will be incremented by the number of chars that were skipped // On return, NbSeq contains the number of UTF16 sequences that were skipped function UTF16SkipChars(const S: TUTF16String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; var StrLength, Index: SizeInt; Ch, ChNext: UCS4; begin Result := True; StrLength := Length(S); Index := 0; if NbSeq >= 0 then while (Index < NbSeq) and (StrPos > 0) do begin Ch := UCS4(S[StrPos]); case Ch of SurrogateHighStart..SurrogateHighEnd: // 2 bytes to skip if StrPos >= StrLength then FlagInvalidSequence(StrPos, 1) else begin ChNext := UCS4(S[StrPos + 1]); if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then FlagInvalidSequence(StrPos, 1) else Inc(StrPos, 2); end; SurrogateLowStart..SurrogateLowEnd: // error FlagInvalidSequence(StrPos, 1); else // 1 byte to skip Inc(StrPos); end; if StrPos <> -1 then Inc(Index); if (StrPos > StrLength) and (Index < NbSeq) then begin Result := False; Break; end; end else while (Index > NbSeq) and (StrPos > 1) do begin Ch := UCS4(S[StrPos - 1]); case Ch of SurrogateHighStart..SurrogateHighEnd: // error FlagInvalidSequence(StrPos, -1); SurrogateLowStart..SurrogateLowEnd: // 2 bytes to skip if StrPos <= 2 then FlagInvalidSequence(StrPos, -1) else begin ChNext := UCS4(S[StrPos - 2]); if (ChNext < SurrogateHighStart) or (ChNext > SurrogateHighEnd) then FlagInvalidSequence(StrPos, -1) else Dec(StrPos, 2); end; else // 1 byte to skip Dec(StrPos); end; if StrPos <> -1 then Dec(Index); if (StrPos = 1) and (Index > NbSeq) then begin Result := False; Break; end; end; NbSeq := Index; end; // returns False if String is too small // if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence) // StrPos will be incremented by the number of chars that were skipped // On return, NbSeq contains the number of UTF16 sequences that were skipped {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16SkipChars(const S: UnicodeString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; var StrLength, Index: SizeInt; Ch, ChNext: UCS4; begin Result := True; StrLength := Length(S); Index := 0; if NbSeq >= 0 then while (Index < NbSeq) and (StrPos > 0) do begin Ch := UCS4(S[StrPos]); case Ch of SurrogateHighStart..SurrogateHighEnd: // 2 bytes to skip if StrPos >= StrLength then FlagInvalidSequence(StrPos, 1) else begin ChNext := UCS4(S[StrPos + 1]); if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then FlagInvalidSequence(StrPos, 1) else Inc(StrPos, 2); end; SurrogateLowStart..SurrogateLowEnd: // error FlagInvalidSequence(StrPos, 1); else // 1 byte to skip Inc(StrPos); end; if StrPos <> -1 then Inc(Index); if (StrPos > StrLength) and (Index < NbSeq) then begin Result := False; Break; end; end else while (Index > NbSeq) and (StrPos > 1) do begin Ch := UCS4(S[StrPos - 1]); case Ch of SurrogateHighStart..SurrogateHighEnd: // error FlagInvalidSequence(StrPos, -1); SurrogateLowStart..SurrogateLowEnd: // 2 bytes to skip if StrPos <= 2 then FlagInvalidSequence(StrPos, -1) else begin ChNext := UCS4(S[StrPos - 2]); if (ChNext < SurrogateHighStart) or (ChNext > SurrogateHighEnd) then FlagInvalidSequence(StrPos, -1) else Dec(StrPos, 2); end; else // 1 byte to skip Dec(StrPos); end; if StrPos <> -1 then Dec(Index); if (StrPos = 1) and (Index > NbSeq) then begin Result := False; Break; end; end; NbSeq := Index; end; {$ENDIF SUPPORTS_UNICODE_STRING} function UTF16SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean; var Index: SizeInt; W: Word; begin Index := 0; while Index < NbSeq do begin Result := StreamReadWord(S, W); if not Result then Break; case W of SurrogateHighStart..SurrogateHighEnd: // 2 bytes to skip begin Result := StreamReadWord(S, W); if not Result then Break; if (W < SurrogateLowStart) or (W > SurrogateLowEnd) then FlagInvalidSequence; end; SurrogateLowStart..SurrogateLowEnd: // error FlagInvalidSequence; else // 1 byte to skip // nothing to do end; Inc(Index); end; Result := Index = NbSeq; NbSeq := Index; end; // returns False on error: // - if an UCS4 character cannot be stored to an UTF-8 string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written function UTF16SetNextChar(var S: TUTF16String; var StrPos: SizeInt; Ch: UCS4): Boolean; var StrLength: SizeInt; begin StrLength := Length(S); if Ch <= MaximumUCS2 then begin // 16 bits to store in place Result := (StrPos > 0) and (StrPos <= StrLength); if Result then begin S[StrPos] := WideChar(Ch); Inc(StrPos); end; end else if Ch <= MaximumUTF16 then begin // stores a surrogate pair Result := (StrPos > 0) and (StrPos < StrLength); if Result then begin Ch := Ch - HalfBase; S[StrPos] := WideChar((Ch shr HalfShift) or SurrogateHighStart); S[StrPos + 1] := WideChar((Ch and HalfMask) or SurrogateLowStart); Inc(StrPos, 2); end; end else begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter Result := (StrPos > 0) and (StrPos <= StrLength); if Result then begin S[StrPos] := WideChar(UCS4ReplacementCharacter); Inc(StrPos, 1); end; {$ELSE ~UNICODE_SILENT_FAILURE} StrPos := -1; Result := False; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; {$IFDEF SUPPORTS_UNICODE_STRING} function UTF16SetNextChar(var S: UnicodeString; var StrPos: SizeInt; Ch: UCS4): Boolean; var StrLength: SizeInt; begin StrLength := Length(S); if Ch <= MaximumUCS2 then begin // 16 bits to store in place Result := (StrPos > 0) and (StrPos <= StrLength); if Result then begin S[StrPos] := WideChar(Ch); Inc(StrPos); end; end else if Ch <= MaximumUTF16 then begin // stores a surrogate pair Result := (StrPos > 0) and (StrPos < StrLength); if Result then begin Ch := Ch - HalfBase; S[StrPos] := WideChar((Ch shr HalfShift) + SurrogateHighStart); S[StrPos + 1] := WideChar((Ch and HalfMask) + SurrogateLowStart); Inc(StrPos, 2); end; end else begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter Result := (StrPos > 0) and (StrPos <= StrLength); if Result then begin S[StrPos] := WideChar(UCS4ReplacementCharacter); Inc(StrPos, 1); end; {$ELSE ~UNICODE_SILENT_FAILURE} StrPos := -1; Result := False; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; {$ENDIF SUPPORTS_UNICODE_STRING} function UTF16SetNextCharToStream(S: TStream; Ch: UCS4): Boolean; begin if Ch <= MaximumUCS2 then // 16 bits to store in place Result := StreamWriteWord(S, Ch) else if Ch <= MaximumUTF16 then // stores a surrogate pair Result := StreamWriteWord(S, (Ch shr HalfShift) or SurrogateHighStart) and StreamWriteWord(S, (Ch and HalfMask) or SurrogateLowStart) else begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter Result := StreamWriteWord(S, UCS4ReplacementCharacter); {$ELSE ~UNICODE_SILENT_FAILURE} Result := False; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; // AnsiGetNextChar = read next character at StrPos // StrPos will be incremented by the number of chars that were read (1) function AnsiGetNextChar(const S: AnsiString; var StrPos: SizeInt): UCS4; var StrLen, TmpPos: SizeInt; UTF16Buffer: TUTF16String; begin StrLen := Length(S); if (StrPos <= StrLen) and (StrPos > 0) then begin UTF16Buffer := WideString(S[StrPos]); TmpPos := 1; Result := UTF16GetNextChar(UTF16Buffer, TmpPos); if TmpPos = -1 then StrPos := -1 else Inc(StrPos); end else begin // StrPos > StrLength Result := 0; FlagInvalidSequence(StrPos, 0, Result); end; end; function AnsiGetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; var B: Byte; TmpPos: SizeInt; UTF16Buffer: TUTF16String; begin Result := StreamReadByte(S, B); if not Result then Exit; UTF16Buffer := WideString(AnsiString(Chr(B))); TmpPos := 1; Ch := UTF16GetNextChar(UTF16Buffer, TmpPos); Result := TmpPos <> -1; end; // AnsiGetNextChar = read next character at StrPos // StrPos will be incremented by the number of chars that were read (1) function AnsiGetNextChar(const S: AnsiString; CodePage: Word; var StrPos: SizeInt): UCS4; var StrLen, TmpPos: SizeInt; UTF16Buffer: TUTF16String; begin StrLen := Length(S); if (StrPos <= StrLen) and (StrPos > 0) then begin SetLength(UTF16Buffer, 2); if MultiByteToWideChar(CodePage, MB_PRECOMPOSED or MB_ERR_INVALID_CHARS, @S[StrPos], 1, PWideChar(UTF16Buffer), 2) = 0 then begin Result := UCS4ReplacementCharacter; FlagInvalidSequence(StrPos, 0, Result); Exit; end; TmpPos := 1; Result := UTF16GetNextChar(UTF16Buffer, TmpPos); if TmpPos = -1 then StrPos := -1 else Inc(StrPos); end else begin // StrPos > StrLength Result := 0; FlagInvalidSequence(StrPos, 0, Result); end; end; function AnsiGetNextCharFromStream(S: TStream; CodePage: Word; out Ch: UCS4): Boolean; var B: Byte; TmpPos: SizeInt; UTF16Buffer: TUTF16String; begin Result := StreamReadByte(S, B); if not Result then Exit; SetLength(UTF16Buffer, 2); if MultiByteToWideChar(CodePage, MB_PRECOMPOSED or MB_ERR_INVALID_CHARS, @B, 1, PWideChar(UTF16Buffer), 2) = 0 then begin Result := False; Ch := UCS4ReplacementCharacter; Exit; end; TmpPos := 1; Ch := UTF16GetNextChar(UTF16Buffer, TmpPos); Result := TmpPos <> -1; end; // AnsiSkipChars = skip NbSeq characters starting from StrPos // returns False if String is too small // StrPos will be incremented by the number of chars that were skipped // On return, NbChar contains the number of UTF16 sequences that were skipped function AnsiSkipChars(const S: AnsiString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; var StrLen: SizeInt; begin StrLen := Length(S); if StrPos > 0 then begin if StrPos + NbSeq > StrLen then begin NbSeq := StrLen + 1 - StrPos; StrPos := StrLen + 1; Result := False; end else begin // NbSeq := NbSeq; StrPos := StrLen + NbSeq; Result := True; end; end else begin // previous error NbSeq := 0; // StrPos := -1; Result := False; end; end; function AnsiSkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean; var Index: SizeInt; B: Byte; begin Index := 0; while Index < NbSeq do begin Result := StreamReadByte(S, B); if not Result then Break; Inc(Index); end; Result := Index = NbSeq; NbSeq := Index; end; // AnsiSetNextChar = append a character at StrPos // returns False on error: // - if an UCS4 character cannot be stored to an ansi string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written (1) function AnsiSetNextChar(var S: AnsiString; var StrPos: SizeInt; Ch: UCS4): Boolean; var StrLen, TmpPos: SizeInt; UTF16Buffer: TUTF16String; AnsiBuffer: AnsiString; begin StrLen := Length(S); Result := (StrPos > 0) and (StrPos <= StrLen); if Result then begin SetLength(UTF16Buffer, 2); TmpPos := 1; Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch); if Result and (TmpPos = 2) then begin // one wide character AnsiBuffer := AnsiString(WideString(UTF16Buffer[1])); S[StrPos] := AnsiBuffer[1]; Inc(StrPos); end else if Result and (TmpPos = 3) then begin // one surrogate pair AnsiBuffer := AnsiString(UTF16Buffer); S[StrPos] := AnsiBuffer[1]; Inc(StrPos); end else begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter S[StrPos] := AnsiReplacementCharacter; Inc(StrPos); {$ELSE ~UNICODE_SILENT_FAILURE} Result := False; StrPos := -1; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; end; function AnsiSetNextCharToStream(S: TStream; Ch: UCS4): Boolean; var TmpPos: SizeInt; UTF16Buffer: TUTF16String; AnsiBuffer: AnsiString; begin SetLength(UTF16Buffer, 2); TmpPos := 1; Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch); if Result and (TmpPos = 2) then begin // one wide character AnsiBuffer := AnsiString(WideString(UTF16Buffer[1])); Result := StreamWriteByte(S, Ord(AnsiBuffer[1])); end else if Result and (TmpPos = 3) then begin // one surrogate pair AnsiBuffer := AnsiString(UTF16Buffer); Result := StreamWriteByte(S, Ord(AnsiBuffer[1])); end else begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter Result := StreamWriteByte(S, Ord(AnsiReplacementCharacter)); {$ELSE ~UNICODE_SILENT_FAILURE} Result := False; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; function AnsiSetNextChar(var S: AnsiString; CodePage: Word; var StrPos: SizeInt; Ch: UCS4): Boolean; var StrLen, TmpPos: SizeInt; UTF16Buffer: TUTF16String; AnsiCharacter: AnsiChar; begin StrLen := Length(S); Result := (StrPos > 0) and (StrPos <= StrLen); if Result then begin SetLength(UTF16Buffer, 2); TmpPos := 1; Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch); Result := Result and (WideCharToMultiByte(CodePage, 0, PWideChar(UTF16Buffer), TmpPos-1, @AnsiCharacter, 1, nil, nil) > 0); if Result then begin S[StrPos] := AnsiCharacter; Inc(StrPos); end; if not Result then begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter S[StrPos] := AnsiReplacementCharacter; Inc(StrPos); {$ELSE ~UNICODE_SILENT_FAILURE} Result := False; StrPos := -1; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; end; function AnsiSetNextCharToStream(S: TStream; CodePage: Word; Ch: UCS4): Boolean; var TmpPos: SizeInt; UTF16Buffer: TUTF16String; AnsiCharacter: AnsiChar; begin SetLength(UTF16Buffer, 2); TmpPos := 1; Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch); Result := Result and (WideCharToMultiByte(CodePage, 0, PWideChar(UTF16Buffer), TmpPos-1, @AnsiCharacter, 1, nil, nil) > 0); if Result then Result := StreamWriteByte(S, Ord(AnsiCharacter)); if not Result then begin {$IFDEF UNICODE_SILENT_FAILURE} // add ReplacementCharacter Result := StreamWriteByte(S, Ord(AnsiReplacementCharacter)); {$ELSE ~UNICODE_SILENT_FAILURE} Result := False; {$ENDIF ~UNICODE_SILENT_FAILURE} end; end; // StringGetNextChar = read next character/sequence at StrPos // if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter // otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence for WideString) // StrPos will be incremented by the number of chars that were read function StringGetNextChar(const S: string; var StrPos: SizeInt): UCS4; begin {$IFDEF SUPPORTS_UNICODE} Result := UTF16GetNextChar(S, StrPos); {$ELSE ~SUPPORTS_UNICODE} Result := AnsiGetNextChar(S, StrPos); {$ENDIF ~SUPPORTS_UNICODE} end; // StringSkipChars = skip NbSeq characters/sequences starting from StrPos // returns False if String is too small // if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence for WideString) // StrPos will be incremented by the number of chars that were skipped // On return, NbChar contains the number of UTF16 sequences that were skipped function StringSkipChars(const S: string; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; begin {$IFDEF SUPPORTS_UNICODE} Result := UTF16SkipChars(S, StrPos, NbSeq); {$ELSE ~SUPPORTS_UNICODE} Result := AnsiSkipChars(S, StrPos, NbSeq); {$ENDIF ~SUPPORTS_UNICODE} end; // StringSetNextChar = append a character/sequence at StrPos // returns False on error: // - if an UCS4 character cannot be stored to a string: // - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added // - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1 // - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space // StrPos will be incremented by the number of chars that were written function StringSetNextChar(var S: string; var StrPos: SizeInt; Ch: UCS4): Boolean; begin {$IFDEF SUPPORTS_UNICODE} Result := UTF16SetNextChar(S, StrPos, Ch); {$ELSE ~SUPPORTS_UNICODE} Result := AnsiSetNextChar(S, StrPos, Ch); {$ENDIF ~SUPPORTS_UNICODE} end; function WideStringToUTF8(const S: WideString): TUTF8String; begin Result := UTF16ToUTF8(S); end; function UTF8ToWideString(const S: TUTF8String): WideString; begin Result := UTF8ToUTF16(S); end; function WideStringToUCS4(const S: WideString): TUCS4Array; begin Result := UTF16ToUCS4(S); end; function UCS4ToWideString(const S: TUCS4Array): WideString; begin Result := UCS4ToUTF16(S); end; function AnsiStringToUTF8(const S: AnsiString): TUTF8String; var WS: TUTF16String; begin WS := TUTF16String(S); Result := UTF16ToUTF8(WS); end; function UTF8ToAnsiString(const S: TUTF8String): AnsiString; var WS: TUTF16String; begin WS := UTF8ToUTF16(S); Result := AnsiString(WS); end; function AnsiStringToUTF16(const S: AnsiString): TUTF16String; begin Result := TUTF16String(S); end; function UTF16ToAnsiString(const S: TUTF16String): AnsiString; begin Result := AnsiString(S); end; function AnsiStringToUCS4(const S: AnsiString): TUCS4Array; var WS: TUTF16String; begin WS := TUTF16String(S); Result := UTF16ToUCS4(WS); end; function UCS4ToAnsiString(const S: TUCS4Array): AnsiString; var WS: TUTF16String; begin WS := UCS4ToUTF16(S); Result := AnsiString(WS); end; function StringToUTF8(const S: string): TUTF8String; var WS: TUTF16String; begin WS := TUTF16String(S); Result := UTF16ToUTF8(WS); end; function TryStringToUTF8(const S: string; out D: TUTF8String): Boolean; var WS: TUTF16String; begin WS := TUTF16String(S); Result := TryUTF16ToUTF8(WS, D); end; function UTF8ToString(const S: TUTF8String): string; var WS: TUTF16String; begin WS := UTF8ToUTF16(S); Result := string(WS); end; function TryUTF8ToString(const S: TUTF8String; out D: string): Boolean; var WS: TUTF16String; begin Result := TryUTF8ToUTF16(S, WS); D := string(WS); end; function StringToUTF16(const S: string): TUTF16String; begin Result := TUTF16String(S); end; function TryStringToUTF16(const S: string; out D: TUTF16String): Boolean; begin D := TUTF16String(S); Result := True; end; function UTF16ToString(const S: TUTF16String): string; begin Result := string(S); end; function TryUTF16ToString(const S: TUTF16String; out D: string): Boolean; begin D := string(S); Result := True; end; function StringToUCS4(const S: string): TUCS4Array; var WS: TUTF16String; begin WS := TUTF16String(S); Result := UTF16ToUCS4(WS); end; function TryStringToUCS4(const S: string; out D: TUCS4Array): Boolean; var WS: TUTF16String; begin WS := TUTF16String(S); Result := TryUTF16ToUCS4(WS, D); end; function UCS4ToString(const S: TUCS4Array): string; var WS: WideString; begin WS := UCS4ToUTF16(S); Result := string(WS); end; function TryUCS4ToString(const S: TUCS4Array; out D: string): Boolean; var WS: WideString; begin Result := TryUCS4ToUTF16(S, WS); D := string(WS); end; function UTF8ToUTF16(const S: TUTF8String): TUTF16String; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin if S = '' then Result := '' else begin SrcLength := Length(S); SetLength(Result, SrcLength); // create enough room SrcIndex := 1; DestIndex := 1; while SrcIndex <= SrcLength do begin Ch := UTF8GetNextChar(S, SrcIndex); if SrcIndex = -1 then raise EJclUnexpectedEOSequenceError.Create; UTF16SetNextChar(Result, DestIndex, Ch); end; SetLength(Result, DestIndex - 1); // now fix up length end; end; function TryUTF8ToUTF16(const S: TUTF8String; out D: TUTF16String): Boolean; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin Result := True; if S = '' then D := '' else begin SrcLength := Length(S); SetLength(D, SrcLength); // create enough room SrcIndex := 1; DestIndex := 1; while SrcIndex <= SrcLength do begin Ch := UTF8GetNextChar(S, SrcIndex); if SrcIndex = -1 then begin Result := False; D := ''; Exit; end; UTF16SetNextChar(D, DestIndex, Ch); end; SetLength(D, DestIndex - 1); // now fix up length end; end; function UTF16ToUTF8(const S: TUTF16String): TUTF8String; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin if S = '' then Result := '' else begin SrcLength := Length(S); SetLength(Result, SrcLength * 3); // worste case SrcIndex := 1; DestIndex := 1; while SrcIndex <= SrcLength do begin Ch := UTF16GetNextChar(S, SrcIndex); if SrcIndex = -1 then raise EJclUnexpectedEOSequenceError.Create; UTF8SetNextChar(Result, DestIndex, Ch); end; SetLength(Result, DestIndex - 1); // now fix up length end; end; function TryUTF16ToUTF8(const S: TUTF16String; out D: TUTF8String): Boolean; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin Result := True; if S = '' then D := '' else begin SrcLength := Length(S); SetLength(D, SrcLength * 3); // worste case SrcIndex := 1; DestIndex := 1; while SrcIndex <= SrcLength do begin Ch := UTF16GetNextChar(S, SrcIndex); if SrcIndex = -1 then begin Result := False; D := ''; Exit; end; UTF8SetNextChar(D, DestIndex, Ch); end; SetLength(D, DestIndex - 1); // now fix up length end; end; function UTF8ToUCS4(const S: TUTF8String): TUCS4Array; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin if S <> '' then begin SrcLength := Length(S); SetLength(Result, SrcLength); // create enough room SrcIndex := 1; DestIndex := 0; while SrcIndex <= SrcLength do begin Ch := UTF8GetNextChar(S, SrcIndex); if SrcIndex = -1 then raise EJclUnexpectedEOSequenceError.Create; Result[DestIndex] := Ch; Inc(DestIndex); end; SetLength(Result, DestIndex); // now fix up length end; end; function TryUTF8ToUCS4(const S: TUTF8String; out D: TUCS4Array): Boolean; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin Result := True; if S <> '' then begin SrcLength := Length(S); SetLength(D, SrcLength); // create enough room SrcIndex := 1; DestIndex := 0; while SrcIndex <= SrcLength do begin Ch := UTF8GetNextChar(S, SrcIndex); if SrcIndex = -1 then begin Result := False; SetLength(D, 0); Exit; end; D[DestIndex] := Ch; Inc(DestIndex); end; SetLength(D, DestIndex); // now fix up length end; end; function UCS4ToUTF8(const S: TUCS4Array): TUTF8String; var SrcIndex, SrcLength, DestIndex: SizeInt; begin SrcLength := Length(S); if Length(S) = 0 then Result := '' else begin SetLength(Result, SrcLength * 3); // assume worst case DestIndex := 1; for SrcIndex := 0 to SrcLength - 1 do begin UTF8SetNextChar(Result, DestIndex, S[SrcIndex]); if DestIndex = -1 then raise EJclUnexpectedEOSequenceError.Create; end; SetLength(Result, DestIndex - 1); // set to actual length end; end; function TryUCS4ToUTF8(const S: TUCS4Array; out D: TUTF8String): Boolean; var SrcIndex, SrcLength, DestIndex: SizeInt; begin SrcLength := Length(S); Result := True; if Length(S) = 0 then D := '' else begin SetLength(D, SrcLength * 3); // assume worst case DestIndex := 1; for SrcIndex := 0 to SrcLength - 1 do begin UTF8SetNextChar(D, DestIndex, S[SrcIndex]); if DestIndex = -1 then begin Result := False; D := ''; Exit; end; end; SetLength(D, DestIndex - 1); // set to actual length end; end; function UTF16ToUCS4(const S: TUTF16String): TUCS4Array; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin if S <> '' then begin SrcLength := Length(S); SetLength(Result, SrcLength); // create enough room SrcIndex := 1; DestIndex := 0; while SrcIndex <= SrcLength do begin Ch := UTF16GetNextChar(S, SrcIndex); if SrcIndex = -1 then raise EJclUnexpectedEOSequenceError.Create; Result[DestIndex] := Ch; Inc(DestIndex); end; SetLength(Result, DestIndex); // now fix up length end; end; function TryUTF16ToUCS4(const S: TUTF16String; out D: TUCS4Array): Boolean; var SrcIndex, SrcLength, DestIndex: SizeInt; Ch: UCS4; begin Result := True; if S <> '' then begin SrcLength := Length(S); SetLength(D, SrcLength); // create enough room SrcIndex := 1; DestIndex := 0; while SrcIndex <= SrcLength do begin Ch := UTF16GetNextChar(S, SrcIndex); if SrcIndex = -1 then begin Result := False; SetLength(D, 0); Exit; end; D[DestIndex] := Ch; Inc(DestIndex); end; SetLength(D, DestIndex); // now fix up length end; end; function UCS4ToUTF16(const S: TUCS4Array): TUTF16String; var SrcIndex, SrcLength, DestIndex: SizeInt; begin SrcLength := Length(S); if SrcLength = 0 then Result := '' else begin SetLength(Result, SrcLength * 3); // assume worst case DestIndex := 1; for SrcIndex := 0 to SrcLength - 1 do begin UTF16SetNextChar(Result, DestIndex, S[SrcIndex]); if DestIndex = -1 then raise EJclUnexpectedEOSequenceError.Create; end; SetLength(Result, DestIndex - 1); // set to actual length end; end; function TryUCS4ToUTF16(const S: TUCS4Array; out D:TUTF16String): Boolean; var SrcIndex, SrcLength, DestIndex: SizeInt; begin SrcLength := Length(S); Result := True; if SrcLength = 0 then D := '' else begin SetLength(D, SrcLength * 3); // assume worst case DestIndex := 1; for SrcIndex := 0 to SrcLength - 1 do begin UTF16SetNextChar(D, DestIndex, S[SrcIndex]); if DestIndex = -1 then begin Result := False; D := ''; Exit; end; end; SetLength(D, DestIndex - 1); // set to actual length end; end; function UTF8CharCount(const S: TUTF8String): SizeInt; var StrPos: SizeInt; begin StrPos := 1; Result := Length(S); UTF8SkipChars(S, StrPos, Result); if StrPos = -1 then raise EJclUnexpectedEOSequenceError.Create; end; function UTF16CharCount(const S: TUTF16String): SizeInt; var StrPos: SizeInt; begin StrPos := 1; Result := Length(S); UTF16SkipChars(S, StrPos, Result); if StrPos = -1 then raise EJclUnexpectedEOSequenceError.Create; end; function UCS2CharCount(const S: TUCS2String): SizeInt; begin Result := Length(S); end; function UCS4CharCount(const S: TUCS4Array): SizeInt; begin Result := Length(S); end; function GetUCS4CharAt(const UTF8Str: TUTF8String; Index: SizeInt; out Value: UCS4): Boolean; overload; var StrPos: SizeInt; begin StrPos := 1; Result := Index >= 0; if Result then Result := UTF8SkipChars(UTF8Str, StrPos, Index); if StrPos = -1 then raise EJclUnexpectedEOSequenceError.Create; Result := Result and (StrPos <= Length(UTF8Str)); if Result then begin Value := UTF8GetNextChar(UTF8Str, StrPos); if StrPos = -1 then raise EJclUnexpectedEOSequenceError.Create; end; end; function GetUCS4CharAt(const WideStr: WideString; Index: SizeInt; out Value: UCS4; IsUTF16: Boolean): Boolean; overload; var StrPos: SizeInt; begin if IsUTF16 then begin StrPos := 1; Result := Index >= 0; if Result then Result := UTF16SkipChars(WideStr, StrPos, Index); if StrPos = -1 then raise EJclUnexpectedEOSequenceError.Create; Result := Result and (StrPos <= Length(WideStr)); if Result then begin Value := UTF16GetNextChar(WideStr, StrPos); if StrPos = -1 then raise EJclUnexpectedEOSequenceError.Create; end; end else begin Result := (Index >= 1) and (Index <= Length(WideStr)); Value := UCS4(WideStr[Index]); end; end; function GetUCS4CharAt(const UCS4Str: TUCS4Array; Index: SizeInt; out Value: UCS4): Boolean; overload; begin Result := (Index >= 0) and (Index < Length(UCS4Str)); if Result then Value := UCS4Str[Index]; end; function UCS4ToAnsiChar(Value: UCS4): AnsiChar; var Buf: WideString; StrPos: SizeInt; begin StrPos := 1; Buf := #0#0; if UTF16SetNextChar(Buf, StrPos, Value) then Result := AnsiString(Buf)[1] else Result := AnsiReplacementCharacter; end; function UCS4ToWideChar(Value: UCS4): WideChar; begin if Value <= MaximumUCS2 then Result := WideChar(Value) else Result := WideChar(UCS4ReplacementCharacter); end; function UCS4ToChar(Value: UCS4): Char; begin {$IFDEF SUPPORTS_UNICODE} Result := UCS4ToWideChar(Value); {$ELSE ~SUPPORTS_UNICODE} Result := UCS4ToAnsiChar(Value); {$ENDIF ~SUPPORTS_UNICODE} end; function AnsiCharToUCS4(Value: AnsiChar): UCS4; var Buf: WideString; StrPos: SizeInt; begin StrPos := 1; Buf := WideString(AnsiString(Value)); Result := UTF16GetNextChar(Buf, StrPos); end; function WideCharToUCS4(Value: WideChar): UCS4; begin Result := UCS4(Value); end; function CharToUCS4(Value: Char): UCS4; begin {$IFDEF SUPPORTS_UNICODE} Result := WideCharToUCS4(Value); {$ELSE ~SUPPORTS_UNICODE} Result := AnsiCharToUCS4(Value); {$ENDIF ~SUPPORTS_UNICODE} end; {$IFDEF UNITVERSIONING} initialization RegisterUnitVersion(HInstance, UnitVersioning); finalization UnregisterUnitVersion(HInstance); {$ENDIF UNITVERSIONING} end.