Componentes.Terceros.jcl/official/2.1.1/source/common/JclStringConversions.pas
2010-01-18 16:51:36 +00:00

2635 lines
82 KiB
ObjectPascal

{**************************************************************************************************}
{ }
{ Project JEDI Code Library (JCL) }
{ }
{ The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
{ you may not use this file except in compliance with the License. You may obtain a copy of the }
{ License at http://www.mozilla.org/MPL/ }
{ }
{ Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
{ ANY KIND, either express or implied. See the License for the specific language governing rights }
{ and limitations under the License. }
{ }
{ The Original Code is JclUnicode.pas. }
{ }
{ The Initial Developer of the Original Code is Mike Lischke (public att lischke-online dott de). }
{ Portions created by Mike Lischke are Copyright (C) 1999-2000 Mike Lischke. All Rights Reserved. }
{ }
{ Contributor(s): }
{ Marcel van Brakel }
{ Andreas Hausladen (ahuser) }
{ Mike Lischke }
{ Flier Lu (flier) }
{ Robert Marquardt (marquardt) }
{ Robert Rossmair (rrossmair) }
{ Olivier Sannier (obones) }
{ Matthias Thoma (mthoma) }
{ Petr Vones (pvones) }
{ Peter Schraut (http://www.console-dev.de) }
{ Florent Ouchet (outchy) }
{ }
{**************************************************************************************************}
{ }
{ String conversion routines }
{ }
{**************************************************************************************************}
{ }
{ Last modified: $Date:: 2009-08-09 15:08:29 +0200 (dim., 09 août 2009) $ }
{ Revision: $Rev:: 2921 $ }
{ Author: $Author:: outchy $ }
{ }
{**************************************************************************************************}
unit JclStringConversions;
{$I jcl.inc}
interface
uses
Classes,
{$IFDEF UNITVERSIONING}
JclUnitVersioning,
{$ENDIF UNITVERSIONING}
JclBase;
type
EJclStringConversionError = class(EJclError);
EJclUnexpectedEOSequenceError = class (EJclStringConversionError)
public
constructor Create;
end;
// conversion routines between Ansi, UTF-16, UCS-4 and UTF8 strings
// one shot conversion between PAnsiChar and PWideChar
procedure ExpandASCIIString(const Source: PAnsiChar; Target: PWideChar; Count: SizeInt);
// tpye of stream related functions
type
TJclStreamGetNextCharFunc = function(S: TStream; out Ch: UCS4): Boolean;
TJclStreamSkipCharsFunc = function(S: TStream; var NbSeq: SizeInt): Boolean;
TJclStreamSetNextCharFunc = function(S: TStream; Ch: UCS4): Boolean;
// iterative conversions
// UTF8GetNextChar = read next UTF8 sequence at StrPos
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF8 sequence)
// StrPos will be incremented by the number of chars that were read
function UTF8GetNextChar(const S: TUTF8String; var StrPos: SizeInt): UCS4;
function UTF8GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
// UTF8SkipChars = skip NbSeq UTF8 sequences starting from StrPos
// returns False if String is too small
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF8 sequence)
// StrPos will be incremented by the number of chars that were skipped
// On return, NbSeq contains the number of UTF8 sequences that were skipped
function UTF8SkipChars(const S: TUTF8String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
function UTF8SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
// UTF8SetNextChar = append an UTF8 sequence at StrPos
// returns False on error:
// - if an UCS4 character cannot be stored to an UTF-8 string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, callee did nothing, caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written
function UTF8SetNextChar(var S: TUTF8String; var StrPos: SizeInt; Ch: UCS4): Boolean;
function UTF8SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
// UTF16GetNextChar = read next UTF16 sequence at StrPos
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
// StrPos will be incremented by the number of chars that were read
function UTF16GetNextChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; overload;
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16GetNextChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; overload;
{$ENDIF SUPPORTS_UNICODE_STRING}
function UTF16GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
// UTF16GetPreviousChar = read previous UTF16 sequence starting at StrPos-1
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
// StrPos will be decremented by the number of chars that were read
function UTF16GetPreviousChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; overload;
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16GetPreviousChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; overload;
{$ENDIF SUPPORTS_UNICODE_STRING}
// UTF16SkipChars = skip NbSeq UTF16 sequences starting from StrPos
// returns False if String is too small
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence)
// StrPos will be incremented by the number of chars that were skipped
// On return, NbChar contains the number of UTF16 sequences that were skipped
function UTF16SkipChars(const S: TUTF16String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; overload;
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16SkipChars(const S: UnicodeString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; overload;
{$ENDIF SUPPORTS_UNICODE_STRING}
function UTF16SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
// UTF16SetNextChar = append an UTF16 sequence at StrPos
// returns False on error:
// - if an UCS4 character cannot be stored to an UTF-16 string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written
function UTF16SetNextChar(var S: TUTF16String; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16SetNextChar(var S: UnicodeString; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
{$ENDIF SUPPORTS_UNICODE_STRING}
function UTF16SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
// AnsiGetNextChar = read next character at StrPos
// StrPos will be incremented by the number of chars that were read (1)
function AnsiGetNextChar(const S: AnsiString; var StrPos: SizeInt): UCS4; overload;
function AnsiGetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; overload;
// same as AnsiGetNextChar* with custom codepage
function AnsiGetNextChar(const S: AnsiString; CodePage: Word; var StrPos: SizeInt): UCS4; overload;
function AnsiGetNextCharFromStream(S: TStream; CodePage: Word; out Ch: UCS4): Boolean; overload;
// AnsiSkipChars = skip NbSeq characters starting from StrPos
// returns False if String is too small
// StrPos will be incremented by the number of chars that were skipped
// On return, NbChar contains the number of UTF16 sequences that were skipped
function AnsiSkipChars(const S: AnsiString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
function AnsiSkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
// AnsiSetNextChar = append a character at StrPos
// returns False on error:
// - if an UCS4 character cannot be stored to an ansi string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written (1)
function AnsiSetNextChar(var S: AnsiString; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
function AnsiSetNextCharToStream(S: TStream; Ch: UCS4): Boolean; overload;
// same as AnsiSetNextChar* with custom codepage
function AnsiSetNextChar(var S: AnsiString; CodePage: Word; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
function AnsiSetNextCharToStream(S: TStream; CodePage: Word; Ch: UCS4): Boolean; overload;
// StringGetNextChar = read next character/sequence at StrPos
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence for WideString)
// StrPos will be incremented by the number of chars that were read
function StringGetNextChar(const S: string; var StrPos: SizeInt): UCS4; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
// StringSkipChars = skip NbSeq characters/sequences starting from StrPos
// returns False if String is too small
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence for WideString)
// StrPos will be incremented by the number of chars that were skipped
// On return, NbChar contains the number of UTF16 sequences that were skipped
function StringSkipChars(const S: string; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
// StringSetNextChar = append a character/sequence at StrPos
// returns False on error:
// - if an UCS4 character cannot be stored to a string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written
function StringSetNextChar(var S: string; var StrPos: SizeInt; Ch: UCS4): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
// one shot conversions between WideString and others
function WideStringToUTF8(const S: WideString): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UTF8ToWideString(const S: TUTF8String): WideString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideStringToUCS4(const S: WideString): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UCS4ToWideString(const S: TUCS4Array): WideString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
// one shot conversions between AnsiString and others
function AnsiStringToUTF8(const S: AnsiString): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UTF8ToAnsiString(const S: TUTF8String): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function AnsiStringToUTF16(const S: AnsiString): TUTF16String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UTF16ToAnsiString(const S: TUTF16String): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function AnsiStringToUCS4(const S: AnsiString): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UCS4ToAnsiString(const S: TUCS4Array): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
// one shot conversions between string and others
function StringToUTF8(const S: string): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UTF8ToString(const S: TUTF8String): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function StringToUTF16(const S: string): TUTF16String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UTF16ToString(const S: TUTF16String): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function StringToUCS4(const S: string): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UCS4ToString(const S: TUCS4Array): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function TryStringToUTF8(const S: string; out D: TUTF8String): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function TryUTF8ToString(const S: TUTF8String; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function TryStringToUTF16(const S: string; out D: TUTF16String): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function TryUTF16ToString(const S: TUTF16String; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function TryStringToUCS4(const S: string; out D: TUCS4Array): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function TryUCS4ToString(const S: TUCS4Array; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function UTF8ToUTF16(const S: TUTF8String): TUTF16String;
function UTF16ToUTF8(const S: TUTF16String): TUTF8String;
function UTF8ToUCS4(const S: TUTF8String): TUCS4Array;
function UCS4ToUTF8(const S: TUCS4Array): TUTF8String;
function UTF16ToUCS4(const S: TUTF16String): TUCS4Array;
function UCS4ToUTF16(const S: TUCS4Array): TUTF16String;
function TryUTF8ToUTF16(const S: TUTF8String; out D: TUTF16String): Boolean;
function TryUTF16ToUTF8(const S: TUTF16String; out D: TUTF8String): Boolean;
function TryUTF8ToUCS4(const S: TUTF8String; out D: TUCS4Array): Boolean;
function TryUCS4ToUTF8(const S: TUCS4Array; out D: TUTF8String): Boolean;
function TryUTF16ToUCS4(const S: TUTF16String; out D: TUCS4Array): Boolean;
function TryUCS4ToUTF16(const S: TUCS4Array; out D: TUTF16String): Boolean;
// indexed conversions
function UTF8CharCount(const S: TUTF8String): SizeInt;
function UTF16CharCount(const S: TUTF16String): SizeInt;
function UCS2CharCount(const S: TUCS2String): SizeInt;
function UCS4CharCount(const S: TUCS4Array): SizeInt;
// returns False if string is too small
// if UNICODE_SILENT_FAILURE is not defined and an invalid UTFX sequence is detected, an exception is raised
// returns True on success and Value contains UCS4 character that was read
function GetUCS4CharAt(const UTF8Str: TUTF8String; Index: SizeInt; out Value: UCS4): Boolean; overload;
function GetUCS4CharAt(const WideStr: TUTF16String; Index: SizeInt; out Value: UCS4; IsUTF16: Boolean = True): Boolean; overload;
function GetUCS4CharAt(const UCS4Str: TUCS4Array; Index: SizeInt; out Value: UCS4): Boolean; overload;
function UCS4ToAnsiChar(Value: UCS4): AnsiChar;
function UCS4ToWideChar(Value: UCS4): WideChar;
function UCS4ToChar(Value: UCS4): Char; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function AnsiCharToUCS4(Value: AnsiChar): UCS4;
function WideCharToUCS4(Value: WideChar): UCS4;
function CharToUCS4(Value: Char): UCS4; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
{$IFDEF UNITVERSIONING}
const
UnitVersioning: TUnitVersionInfo = (
RCSfile: '$URL: https://jcl.svn.sourceforge.net/svnroot/jcl/tags/JCL-2.1-Build3536/jcl/source/common/JclStringConversions.pas $';
Revision: '$Revision: 2921 $';
Date: '$Date: 2009-08-09 15:08:29 +0200 (dim., 09 août 2009) $';
LogPath: 'JCL\source\common';
Extra: '';
Data: nil
);
{$ENDIF UNITVERSIONING}
implementation
uses
{$IFDEF MSWINDOWS}
Windows,
{$ENDIF MSWINDOWS}
JclResources;
const MB_ERR_INVALID_CHARS = 8;
constructor EJclUnexpectedEOSequenceError.Create;
begin
inherited CreateRes(@RsEUnexpectedEOSeq);
end;
function StreamReadByte(S: TStream; out B: Byte): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
begin
B := 0;
Result := S.Read(B, SizeOf(B)) = SizeOf(B);
end;
function StreamWriteByte(S: TStream; B: Byte): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
begin
Result := S.Write(B, SizeOf(B)) = SizeOf(B);
end;
function StreamReadWord(S: TStream; out W: Word): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
begin
W := 0;
Result := S.Read(W, SizeOf(W)) = SizeOf(W);
end;
function StreamWriteWord(S: TStream; W: Word): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
begin
Result := S.Write(W, SizeOf(W)) = SizeOf(W);
end;
//----------------- conversion routines ------------------------------------------------------------
// Converts the given source ANSI string into a Unicode string by expanding each character
// from one byte to two bytes.
// EAX contains Source, EDX contains Target, ECX contains Count
procedure ExpandASCIIString(const Source: PAnsiChar; Target: PWideChar; Count: SizeInt);
asm
{$IFDEF CPU32}
// --> EAX Source
// EDX Target
// ECX Count
JECXZ @@Finish // go out if there is nothing to do (ECX = 0)
PUSH ESI
MOV ESI, EAX
XOR EAX, EAX
@@1:
MOV AL, [ESI]
INC ESI
MOV [EDX], AX
ADD EDX, 2
DEC ECX
JNZ @@1
POP ESI
{$ENDIF CPU32}
{$IFDEF CPU64}
// --> RCX Source
// RDX Target
// R8 Count
MOV RAX, RCX
MOV RCX, R8
JRCXZ @@Finish // go out if there is nothing to do (ECX = 0)
MOV RSI, RAX
XOR RAX, RAX
@@1:
MOV AL, [RSI]
INC RSI
MOV [RDX], AX
ADD RDX, 2
DEC RCX
JNZ @@1
{$ENDIF CPU64}
@@Finish:
end;
const
HalfShift: Integer = 10;
HalfBase: UCS4 = $0010000;
HalfMask: UCS4 = $3FF;
procedure FlagInvalidSequence(var StrPos: SizeInt; Increment: SizeInt; out Ch: UCS4); overload;
begin
{$IFDEF UNICODE_SILENT_FAILURE}
Ch := UCS4ReplacementCharacter;
Inc(StrPos, Increment);
{$ELSE ~UNICODE_SILENT_FAILURE}
StrPos := -1;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
procedure FlagInvalidSequence(var StrPos: SizeInt; Increment: SizeInt); overload;
begin
{$IFDEF UNICODE_SILENT_FAILURE}
Inc(StrPos, Increment);
{$ELSE ~UNICODE_SILENT_FAILURE}
StrPos := -1;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
procedure FlagInvalidSequence(out Ch: UCS4); overload;
begin
{$IFDEF UNICODE_SILENT_FAILURE}
Ch := UCS4ReplacementCharacter;
{$ELSE ~UNICODE_SILENT_FAILURE}
raise EJclUnexpectedEOSequenceError.Create;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
procedure FlagInvalidSequence; overload;
begin
{$IFNDEF UNICODE_SILENT_FAILURE}
raise EJclUnexpectedEOSequenceError.Create;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF8 sequence)
// StrPos will be incremented by the number of chars that were read
function UTF8GetNextChar(const S: TUTF8String; var StrPos: SizeInt): UCS4;
var
StrLength: SizeInt;
ChNext: UCS4;
begin
StrLength := Length(S);
if (StrPos <= StrLength) and (StrPos > 0) then
begin
Result := UCS4(S[StrPos]);
case Result of
$00..$7F:
// 1 byte to read
Inc(StrPos);
$C0..$DF:
begin
// 2 bytes to read
if StrPos >= StrLength then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
ChNext := UCS4(S[StrPos + 1]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
Result := ((Result and $1F) shl 6) or (ChNext and $3F);
Inc(StrPos, 2);
end;
$E0..$EF:
begin
// 3 bytes to read
if (StrPos + 1) >= StrLength then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
ChNext := UCS4(S[StrPos + 1]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
Result := ((Result and $0F) shl 12) or ((ChNext and $3F) shl 6);
ChNext := UCS4(S[StrPos + 2]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 2, Result);
Exit;
end;
Result := Result or (ChNext and $3F);
Inc(StrPos, 3);
end;
$F0..$F7:
begin
// 4 bytes to read
if (StrPos + 2) >= StrLength then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
ChNext := UCS4(S[StrPos + 1]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
Result := ((Result and $07) shl 18) or ((ChNext and $3F) shl 12);
ChNext := UCS4(S[StrPos + 2]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 2, Result);
Exit;
end;
Result := Result or ((ChNext and $3F) shl 6);
ChNext := UCS4(S[StrPos + 3]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 3, Result);
Exit;
end;
Result := Result or (ChNext and $3F);
Inc(StrPos, 4);
end;
$F8..$FB:
begin
// 5 bytes to read
if (StrPos + 3) >= StrLength then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
ChNext := UCS4(S[StrPos + 1]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
Result := ((Result and $03) shl 24) or ((ChNext and $3F) shl 18);
ChNext := UCS4(S[StrPos + 2]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 2, Result);
Exit;
end;
Result := Result or ((ChNext and $3F) shl 12);
ChNext := UCS4(S[StrPos + 3]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 3, Result);
Exit;
end;
Result := Result or ((ChNext and $3F) shl 6);
ChNext := UCS4(S[StrPos + 4]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 4, Result);
Exit;
end;
Result := Result or (ChNext and $3F);
Inc(StrPos, 5);
end;
$FC..$FD:
begin
// 6 bytes to read
if (StrPos + 4) >= StrLength then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
ChNext := UCS4(S[StrPos + 1]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
Result := ((Result and $01) shl 30) or ((ChNext and $3F) shl 24);
ChNext := UCS4(S[StrPos + 2]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 2, Result);
Exit;
end;
Result := Result or ((ChNext and $3F) shl 18);
ChNext := UCS4(S[StrPos + 3]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 3, Result);
Exit;
end;
Result := Result or ((ChNext and $3F) shl 12);
ChNext := UCS4(S[StrPos + 4]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 4, Result);
Exit;
end;
Result := Result or ((ChNext and $3F) shl 6);
ChNext := UCS4(S[StrPos + 5]);
if (ChNext and $C0) <> $80 then
begin
FlagInvalidSequence(StrPos, 5, Result);
Exit;
end;
Result := Result or (ChNext and $3F);
Inc(StrPos, 6);
end;
else
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
end
else
begin
// StrPos > StrLength
Result := 0;
FlagInvalidSequence(StrPos, 0, Result);
end;
end;
function UTF8GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
var
B: Byte;
begin
Result := StreamReadByte(S,B);
if not Result then
Exit;
Ch := UCS4(B);
case Ch of
$00..$7F: ;
// 1 byte to read
// nothing to do
$C0..$DF:
begin
// 2 bytes to read
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := ((Ch and $1F) shl 6) or (B and $3F);
end;
$E0..$EF:
begin
// 3 bytes to read
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := ((Ch and $0F) shl 12) or ((B and $3F) shl 6);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or (B and $3F);
end;
$F0..$F7:
begin
// 4 bytes to read
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := ((Ch and $07) shl 18) or ((B and $3F) shl 12);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or ((B and $3F) shl 6);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or (B and $3F);
end;
$F8..$FB:
begin
// 5 bytes to read
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := ((Ch and $03) shl 24) or ((B and $3F) shl 18);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or ((B and $3F) shl 12);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or ((B and $3F) shl 6);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or (B and $3F);
end;
$FC..$FD:
begin
// 6 bytes to read
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := ((Ch and $01) shl 30) or ((B and $3F) shl 24);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or ((B and $3F) shl 18);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or ((B and $3F) shl 12);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or ((B and $3F) shl 6);
Result := StreamReadByte(S,B);
if not Result then
Exit;
if (B and $C0) <> $80 then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := Ch or (B and $3F);
end;
else
FlagInvalidSequence(Ch);
Exit;
end;
end;
// returns False if String is too small
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF8 sequence)
// StrPos will be incremented by the number of ansi chars that were skipped
// On return, NbSeq contains the number of UTF8 sequences that were skipped
function UTF8SkipChars(const S: TUTF8String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
var
StrLength: SizeInt;
Ch: UCS4;
Index: SizeInt;
begin
Result := True;
StrLength := Length(S);
Index := 0;
while (Index < NbSeq) and (StrPos > 0) do
begin
Ch := UCS4(S[StrPos]);
case Ch of
$00..$7F:
// 1 byte to skip
Inc(StrPos);
$C0..$DF:
// 2 bytes to skip
if (StrPos >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
FlagInvalidSequence(StrPos, 1)
else
Inc(StrPos, 2);
$E0..$EF:
// 3 bytes to skip
if ((StrPos + 1) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
FlagInvalidSequence(StrPos, 1)
else
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 2)
else
Inc(StrPos, 3);
$F0..$F7:
// 4 bytes to skip
if ((StrPos + 2) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
FlagInvalidSequence(StrPos, 1)
else
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 2)
else
if (UCS4(S[StrPos + 3]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 3)
else
Inc(StrPos, 4);
$F8..$FB:
// 5 bytes to skip
if ((StrPos + 3) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
FlagInvalidSequence(StrPos, 1)
else
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 2)
else
if (UCS4(S[StrPos + 3]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 3)
else
if (UCS4(S[StrPos + 4]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 4)
else
Inc(StrPos, 5);
$FC..$FD:
// 6 bytes to skip
if ((StrPos + 4) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
FlagInvalidSequence(StrPos, 1)
else
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 2)
else
if (UCS4(S[StrPos + 3]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 3)
else
if (UCS4(S[StrPos + 4]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 4)
else
if (UCS4(S[StrPos + 5]) and $C0) <> $80 then
FlagInvalidSequence(StrPos, 5)
else
Inc(StrPos, 6);
else
FlagInvalidSequence(StrPos, 1);
end;
if StrPos <> -1 then
Inc(Index);
if (StrPos > StrLength) and (Index < NbSeq) then
begin
Result := False;
Break;
end;
end;
NbSeq := Index;
end;
function UTF8SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
var
B: Byte;
Index: SizeInt;
begin
Index := 0;
while (Index < NbSeq) do
begin
Result := StreamReadByte(S, B);
if not Result then
Break;
case B of
$00..$7F: ;
// 1 byte to skip
// nothing to do
$C0..$DF:
// 2 bytes to skip
begin
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
end;
$E0..$EF:
// 3 bytes to skip
begin
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
end;
$F0..$F7:
// 4 bytes to skip
begin
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
end;
$F8..$FB:
// 5 bytes to skip
begin
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
end;
$FC..$FD:
// 6 bytes to skip
begin
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
Result := StreamReadByte(S, B);
if not Result then
Break;
if (B and $C0) <> $80 then
FlagInvalidSequence;
end;
else
FlagInvalidSequence;
end;
Inc(Index);
end;
Result := Index = NbSeq;
NbSeq := Index;
end;
// returns False on error:
// - if an UCS4 character cannot be stored to an UTF-8 string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written
function UTF8SetNextChar(var S: TUTF8String; var StrPos: SizeInt; Ch: UCS4): Boolean;
var
StrLength: SizeInt;
begin
StrLength := Length(S);
if Ch <= $7F then
begin
// 7 bits to store
Result := (StrPos > 0) and (StrPos <= StrLength);
if Result then
begin
S[StrPos] := AnsiChar(Ch);
Inc(StrPos);
end;
end
else
if Ch <= $7FF then
begin
// 11 bits to store
Result := (StrPos > 0) and (StrPos < StrLength);
if Result then
begin
S[StrPos] := AnsiChar($C0 or (Ch shr 6)); // 5 bits
S[StrPos + 1] := AnsiChar((Ch and $3F) or $80); // 6 bits
Inc(StrPos, 2);
end;
end
else
if Ch <= $FFFF then
begin
// 16 bits to store
Result := (StrPos > 0) and (StrPos < (StrLength - 1));
if Result then
begin
S[StrPos] := AnsiChar($E0 or (Ch shr 12)); // 4 bits
S[StrPos + 1] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
S[StrPos + 2] := AnsiChar((Ch and $3F) or $80); // 6 bits
Inc(StrPos, 3);
end;
end
else
if Ch <= $1FFFFF then
begin
// 21 bits to store
Result := (StrPos > 0) and (StrPos < (StrLength - 2));
if Result then
begin
S[StrPos] := AnsiChar($F0 or (Ch shr 18)); // 3 bits
S[StrPos + 1] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits
S[StrPos + 2] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
S[StrPos + 3] := AnsiChar((Ch and $3F) or $80); // 6 bits
Inc(StrPos, 4);
end;
end
else
if Ch <= $3FFFFFF then
begin
// 26 bits to store
Result := (StrPos > 0) and (StrPos < (StrLength - 2));
if Result then
begin
S[StrPos] := AnsiChar($F8 or (Ch shr 24)); // 2 bits
S[StrPos + 1] := AnsiChar(((Ch shr 18) and $3F) or $80); // 6 bits
S[StrPos + 2] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits
S[StrPos + 3] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
S[StrPos + 4] := AnsiChar((Ch and $3F) or $80); // 6 bits
Inc(StrPos, 5);
end;
end
else
if Ch <= MaximumUCS4 then
begin
// 31 bits to store
Result := (StrPos > 0) and (StrPos < (StrLength - 3));
if Result then
begin
S[StrPos] := AnsiChar($FC or (Ch shr 30)); // 1 bits
S[StrPos + 1] := AnsiChar(((Ch shr 24) and $3F) or $80); // 6 bits
S[StrPos + 2] := AnsiChar(((Ch shr 18) and $3F) or $80); // 6 bits
S[StrPos + 3] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits
S[StrPos + 4] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
S[StrPos + 5] := AnsiChar((Ch and $3F) or $80); // 6 bits
Inc(StrPos, 6);
end;
end
else
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
Result := (StrPos > 0) and (StrPos < (StrLength - 1));
if Result then
begin
S[StrPos] := AnsiChar($E0 or (UCS4ReplacementCharacter shr 12)); // 4 bits
S[StrPos + 1] := AnsiChar(((UCS4ReplacementCharacter shr 6) and $3F) or $80); // 6 bits
S[StrPos + 2] := AnsiChar((UCS4ReplacementCharacter and $3F) or $80); // 6 bits
Inc(StrPos, 3);
end;
{$ELSE ~UNICODE_SILENT_FAILURE}
StrPos := -1;
Result := False;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
function UTF8SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
begin
if Ch <= $7F then
// 7 bits to store
Result := StreamWriteByte(S,Ch)
else
if Ch <= $7FF then
// 11 bits to store
Result := StreamWriteByte(S, $C0 or (Ch shr 6)) and // 5 bits
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
else
if Ch <= $FFFF then
// 16 bits to store
Result := StreamWriteByte(S, $E0 or (Ch shr 12)) and // 4 bits
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
else
if Ch <= $1FFFFF then
// 21 bits to store
Result := StreamWriteByte(S, $F0 or (Ch shr 18)) and // 3 bits
StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
else
if Ch <= $3FFFFFF then
// 26 bits to store
Result := StreamWriteByte(S, $F8 or (Ch shr 24)) and // 2 bits
StreamWriteByte(S, ((Ch shr 18) and $3F) or $80) and // 6 bits
StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
else
if Ch <= MaximumUCS4 then
// 31 bits to store
Result := StreamWriteByte(S, $FC or (Ch shr 30)) and // 1 bits
StreamWriteByte(S, ((Ch shr 24) and $3F) or $80) and // 6 bits
StreamWriteByte(S, ((Ch shr 18) and $3F) or $80) and // 6 bits
StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
else
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
Result := StreamWriteByte(S, $E0 or (UCS4ReplacementCharacter shr 12)) and // 4 bits
StreamWriteByte(S, ((UCS4ReplacementCharacter shr 6) and $3F) or $80) and // 6 bits
StreamWriteByte(S, (UCS4ReplacementCharacter and $3F) or $80); // 6 bits
{$ELSE ~UNICODE_SILENT_FAILURE}
Result := False;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
// StrPos will be incremented by the number of chars that were read
function UTF16GetNextChar(const S: TUTF16String; var StrPos: SizeInt): UCS4;
var
StrLength: SizeInt;
ChNext: UCS4;
begin
StrLength := Length(S);
if (StrPos <= StrLength) and (StrPos > 0) then
begin
Result := UCS4(S[StrPos]);
case Result of
SurrogateHighStart..SurrogateHighEnd:
begin
// 2 bytes to read
if StrPos >= StrLength then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
ChNext := UCS4(S[StrPos + 1]);
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
Result := ((Result - SurrogateHighStart) shl HalfShift) + (ChNext - SurrogateLowStart) + HalfBase;
Inc(StrPos, 2);
end;
SurrogateLowStart..SurrogateLowEnd:
FlagInvalidSequence(StrPos, 1, Result);
else
// 1 byte to read
Inc(StrPos);
end;
end
else
begin
// StrPos > StrLength
Result := 0;
FlagInvalidSequence(StrPos, 0, Result);
end;
end;
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
// StrPos will be incremented by the number of chars that were read
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16GetNextChar(const S: UnicodeString; var StrPos: SizeInt): UCS4;
var
StrLength: SizeInt;
ChNext: UCS4;
begin
StrLength := Length(S);
if (StrPos <= StrLength) and (StrPos > 0) then
begin
Result := UCS4(S[StrPos]);
case Result of
SurrogateHighStart..SurrogateHighEnd:
begin
// 2 bytes to read
if StrPos >= StrLength then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
ChNext := UCS4(S[StrPos + 1]);
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
begin
FlagInvalidSequence(StrPos, 1, Result);
Exit;
end;
Result := ((Result - SurrogateHighStart) shl HalfShift) + (ChNext - SurrogateLowStart) + HalfBase;
Inc(StrPos, 2);
end;
SurrogateLowStart..SurrogateLowEnd:
FlagInvalidSequence(StrPos, 1, Result);
else
// 1 byte to read
Inc(StrPos);
end;
end
else
begin
// StrPos > StrLength
Result := 0;
FlagInvalidSequence(StrPos, 0, Result);
end;
end;
{$ENDIF SUPPORTS_UNICODE_STRING}
function UTF16GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
var
W: Word;
begin
Result := StreamReadWord(S, W);
if not Result then
Exit;
Ch := UCS4(W);
case W of
SurrogateHighStart..SurrogateHighEnd:
begin
// 2 bytes to read
Result := StreamReadWord(S, W);
if not Result then
Exit;
if (W < SurrogateLowStart) or (W > SurrogateLowEnd) then
begin
FlagInvalidSequence(Ch);
Exit;
end;
Ch := ((Ch - SurrogateHighStart) shl HalfShift) + (W - SurrogateLowStart) + HalfBase;
end;
SurrogateLowStart..SurrogateLowEnd:
FlagInvalidSequence(Ch);
else
// 1 byte to read
// nothing to do
end;
end;
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
// StrPos will be decremented by the number of chars that were read
function UTF16GetPreviousChar(const S: TUTF16String; var StrPos: SizeInt): UCS4;
var
StrLength: SizeInt;
ChPrev: UCS4;
begin
StrLength := Length(S);
if (StrPos <= (StrLength + 1)) and (StrPos > 1) then
begin
Result := UCS4(S[StrPos - 1]);
case Result of
SurrogateHighStart..SurrogateHighEnd:
FlagInvalidSequence(StrPos, -1, Result);
SurrogateLowStart..SurrogateLowEnd:
begin
// 2 bytes to read
if StrPos <= 2 then
begin
FlagInvalidSequence(StrPos, -1, Result);
Exit;
end;
ChPrev := UCS4(S[StrPos - 2]);
if (ChPrev < SurrogateHighStart) or (ChPrev > SurrogateHighEnd) then
begin
FlagInvalidSequence(StrPos, -1, Result);
Exit;
end;
Result := ((ChPrev - SurrogateHighStart) shl HalfShift) + (Result - SurrogateLowStart) + HalfBase;
Dec(StrPos, 2);
end;
else
// 1 byte to read
Dec(StrPos);
end;
end
else
begin
// StrPos > StrLength
Result := 0;
FlagInvalidSequence(StrPos, 0, Result);
end;
end;
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
// StrPos will be decremented by the number of chars that were read
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16GetPreviousChar(const S: UnicodeString; var StrPos: SizeInt): UCS4;
var
StrLength: SizeInt;
ChPrev: UCS4;
begin
StrLength := Length(S);
if (StrPos <= (StrLength + 1)) and (StrPos > 1) then
begin
Result := UCS4(S[StrPos - 1]);
case Result of
SurrogateHighStart..SurrogateHighEnd:
FlagInvalidSequence(StrPos, -1, Result);
SurrogateLowStart..SurrogateLowEnd:
begin
// 2 bytes to read
if StrPos <= 2 then
begin
FlagInvalidSequence(StrPos, -1, Result);
Exit;
end;
ChPrev := UCS4(S[StrPos - 2]);
if (ChPrev < SurrogateHighStart) or (ChPrev > SurrogateHighEnd) then
begin
FlagInvalidSequence(StrPos, -1, Result);
Exit;
end;
Result := ((ChPrev - SurrogateHighStart) shl HalfShift) + (Result - SurrogateLowStart) + HalfBase;
Dec(StrPos, 2);
end;
else
// 1 byte to read
Dec(StrPos);
end;
end
else
begin
// StrPos > StrLength
Result := 0;
FlagInvalidSequence(StrPos, 0, Result);
end;
end;
{$ENDIF SUPPORTS_UNICODE_STRING}
// returns False if String is too small
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence)
// StrPos will be incremented by the number of chars that were skipped
// On return, NbSeq contains the number of UTF16 sequences that were skipped
function UTF16SkipChars(const S: TUTF16String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
var
StrLength, Index: SizeInt;
Ch, ChNext: UCS4;
begin
Result := True;
StrLength := Length(S);
Index := 0;
if NbSeq >= 0 then
while (Index < NbSeq) and (StrPos > 0) do
begin
Ch := UCS4(S[StrPos]);
case Ch of
SurrogateHighStart..SurrogateHighEnd:
// 2 bytes to skip
if StrPos >= StrLength then
FlagInvalidSequence(StrPos, 1)
else
begin
ChNext := UCS4(S[StrPos + 1]);
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
FlagInvalidSequence(StrPos, 1)
else
Inc(StrPos, 2);
end;
SurrogateLowStart..SurrogateLowEnd:
// error
FlagInvalidSequence(StrPos, 1);
else
// 1 byte to skip
Inc(StrPos);
end;
if StrPos <> -1 then
Inc(Index);
if (StrPos > StrLength) and (Index < NbSeq) then
begin
Result := False;
Break;
end;
end
else
while (Index > NbSeq) and (StrPos > 1) do
begin
Ch := UCS4(S[StrPos - 1]);
case Ch of
SurrogateHighStart..SurrogateHighEnd:
// error
FlagInvalidSequence(StrPos, -1);
SurrogateLowStart..SurrogateLowEnd:
// 2 bytes to skip
if StrPos <= 2 then
FlagInvalidSequence(StrPos, -1)
else
begin
ChNext := UCS4(S[StrPos - 2]);
if (ChNext < SurrogateHighStart) or (ChNext > SurrogateHighEnd) then
FlagInvalidSequence(StrPos, -1)
else
Dec(StrPos, 2);
end;
else
// 1 byte to skip
Dec(StrPos);
end;
if StrPos <> -1 then
Dec(Index);
if (StrPos = 1) and (Index > NbSeq) then
begin
Result := False;
Break;
end;
end;
NbSeq := Index;
end;
// returns False if String is too small
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence)
// StrPos will be incremented by the number of chars that were skipped
// On return, NbSeq contains the number of UTF16 sequences that were skipped
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16SkipChars(const S: UnicodeString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
var
StrLength, Index: SizeInt;
Ch, ChNext: UCS4;
begin
Result := True;
StrLength := Length(S);
Index := 0;
if NbSeq >= 0 then
while (Index < NbSeq) and (StrPos > 0) do
begin
Ch := UCS4(S[StrPos]);
case Ch of
SurrogateHighStart..SurrogateHighEnd:
// 2 bytes to skip
if StrPos >= StrLength then
FlagInvalidSequence(StrPos, 1)
else
begin
ChNext := UCS4(S[StrPos + 1]);
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
FlagInvalidSequence(StrPos, 1)
else
Inc(StrPos, 2);
end;
SurrogateLowStart..SurrogateLowEnd:
// error
FlagInvalidSequence(StrPos, 1);
else
// 1 byte to skip
Inc(StrPos);
end;
if StrPos <> -1 then
Inc(Index);
if (StrPos > StrLength) and (Index < NbSeq) then
begin
Result := False;
Break;
end;
end
else
while (Index > NbSeq) and (StrPos > 1) do
begin
Ch := UCS4(S[StrPos - 1]);
case Ch of
SurrogateHighStart..SurrogateHighEnd:
// error
FlagInvalidSequence(StrPos, -1);
SurrogateLowStart..SurrogateLowEnd:
// 2 bytes to skip
if StrPos <= 2 then
FlagInvalidSequence(StrPos, -1)
else
begin
ChNext := UCS4(S[StrPos - 2]);
if (ChNext < SurrogateHighStart) or (ChNext > SurrogateHighEnd) then
FlagInvalidSequence(StrPos, -1)
else
Dec(StrPos, 2);
end;
else
// 1 byte to skip
Dec(StrPos);
end;
if StrPos <> -1 then
Dec(Index);
if (StrPos = 1) and (Index > NbSeq) then
begin
Result := False;
Break;
end;
end;
NbSeq := Index;
end;
{$ENDIF SUPPORTS_UNICODE_STRING}
function UTF16SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
var
Index: SizeInt;
W: Word;
begin
Index := 0;
while Index < NbSeq do
begin
Result := StreamReadWord(S, W);
if not Result then
Break;
case W of
SurrogateHighStart..SurrogateHighEnd:
// 2 bytes to skip
begin
Result := StreamReadWord(S, W);
if not Result then
Break;
if (W < SurrogateLowStart) or (W > SurrogateLowEnd) then
FlagInvalidSequence;
end;
SurrogateLowStart..SurrogateLowEnd:
// error
FlagInvalidSequence;
else
// 1 byte to skip
// nothing to do
end;
Inc(Index);
end;
Result := Index = NbSeq;
NbSeq := Index;
end;
// returns False on error:
// - if an UCS4 character cannot be stored to an UTF-8 string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written
function UTF16SetNextChar(var S: TUTF16String; var StrPos: SizeInt; Ch: UCS4): Boolean;
var
StrLength: SizeInt;
begin
StrLength := Length(S);
if Ch <= MaximumUCS2 then
begin
// 16 bits to store in place
Result := (StrPos > 0) and (StrPos <= StrLength);
if Result then
begin
S[StrPos] := WideChar(Ch);
Inc(StrPos);
end;
end
else
if Ch <= MaximumUTF16 then
begin
// stores a surrogate pair
Result := (StrPos > 0) and (StrPos < StrLength);
if Result then
begin
Ch := Ch - HalfBase;
S[StrPos] := WideChar((Ch shr HalfShift) or SurrogateHighStart);
S[StrPos + 1] := WideChar((Ch and HalfMask) or SurrogateLowStart);
Inc(StrPos, 2);
end;
end
else
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
Result := (StrPos > 0) and (StrPos <= StrLength);
if Result then
begin
S[StrPos] := WideChar(UCS4ReplacementCharacter);
Inc(StrPos, 1);
end;
{$ELSE ~UNICODE_SILENT_FAILURE}
StrPos := -1;
Result := False;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
{$IFDEF SUPPORTS_UNICODE_STRING}
function UTF16SetNextChar(var S: UnicodeString; var StrPos: SizeInt; Ch: UCS4): Boolean;
var
StrLength: SizeInt;
begin
StrLength := Length(S);
if Ch <= MaximumUCS2 then
begin
// 16 bits to store in place
Result := (StrPos > 0) and (StrPos <= StrLength);
if Result then
begin
S[StrPos] := WideChar(Ch);
Inc(StrPos);
end;
end
else
if Ch <= MaximumUTF16 then
begin
// stores a surrogate pair
Result := (StrPos > 0) and (StrPos < StrLength);
if Result then
begin
Ch := Ch - HalfBase;
S[StrPos] := WideChar((Ch shr HalfShift) + SurrogateHighStart);
S[StrPos + 1] := WideChar((Ch and HalfMask) + SurrogateLowStart);
Inc(StrPos, 2);
end;
end
else
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
Result := (StrPos > 0) and (StrPos <= StrLength);
if Result then
begin
S[StrPos] := WideChar(UCS4ReplacementCharacter);
Inc(StrPos, 1);
end;
{$ELSE ~UNICODE_SILENT_FAILURE}
StrPos := -1;
Result := False;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
{$ENDIF SUPPORTS_UNICODE_STRING}
function UTF16SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
begin
if Ch <= MaximumUCS2 then
// 16 bits to store in place
Result := StreamWriteWord(S, Ch)
else
if Ch <= MaximumUTF16 then
// stores a surrogate pair
Result := StreamWriteWord(S, (Ch shr HalfShift) or SurrogateHighStart) and
StreamWriteWord(S, (Ch and HalfMask) or SurrogateLowStart)
else
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
Result := StreamWriteWord(S, UCS4ReplacementCharacter);
{$ELSE ~UNICODE_SILENT_FAILURE}
Result := False;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
// AnsiGetNextChar = read next character at StrPos
// StrPos will be incremented by the number of chars that were read (1)
function AnsiGetNextChar(const S: AnsiString; var StrPos: SizeInt): UCS4;
var
StrLen, TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
begin
StrLen := Length(S);
if (StrPos <= StrLen) and (StrPos > 0) then
begin
UTF16Buffer := WideString(S[StrPos]);
TmpPos := 1;
Result := UTF16GetNextChar(UTF16Buffer, TmpPos);
if TmpPos = -1 then
StrPos := -1
else
Inc(StrPos);
end
else
begin
// StrPos > StrLength
Result := 0;
FlagInvalidSequence(StrPos, 0, Result);
end;
end;
function AnsiGetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
var
B: Byte;
TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
begin
Result := StreamReadByte(S, B);
if not Result then
Exit;
UTF16Buffer := WideString(AnsiString(Chr(B)));
TmpPos := 1;
Ch := UTF16GetNextChar(UTF16Buffer, TmpPos);
Result := TmpPos <> -1;
end;
// AnsiGetNextChar = read next character at StrPos
// StrPos will be incremented by the number of chars that were read (1)
function AnsiGetNextChar(const S: AnsiString; CodePage: Word; var StrPos: SizeInt): UCS4;
var
StrLen, TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
begin
StrLen := Length(S);
if (StrPos <= StrLen) and (StrPos > 0) then
begin
SetLength(UTF16Buffer, 2);
if MultiByteToWideChar(CodePage, MB_PRECOMPOSED or MB_ERR_INVALID_CHARS, @S[StrPos], 1, PWideChar(UTF16Buffer), 2) = 0 then
begin
Result := UCS4ReplacementCharacter;
FlagInvalidSequence(StrPos, 0, Result);
Exit;
end;
TmpPos := 1;
Result := UTF16GetNextChar(UTF16Buffer, TmpPos);
if TmpPos = -1 then
StrPos := -1
else
Inc(StrPos);
end
else
begin
// StrPos > StrLength
Result := 0;
FlagInvalidSequence(StrPos, 0, Result);
end;
end;
function AnsiGetNextCharFromStream(S: TStream; CodePage: Word; out Ch: UCS4): Boolean;
var
B: Byte;
TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
begin
Result := StreamReadByte(S, B);
if not Result then
Exit;
SetLength(UTF16Buffer, 2);
if MultiByteToWideChar(CodePage, MB_PRECOMPOSED or MB_ERR_INVALID_CHARS, @B, 1, PWideChar(UTF16Buffer), 2) = 0 then
begin
Result := False;
Ch := UCS4ReplacementCharacter;
Exit;
end;
TmpPos := 1;
Ch := UTF16GetNextChar(UTF16Buffer, TmpPos);
Result := TmpPos <> -1;
end;
// AnsiSkipChars = skip NbSeq characters starting from StrPos
// returns False if String is too small
// StrPos will be incremented by the number of chars that were skipped
// On return, NbChar contains the number of UTF16 sequences that were skipped
function AnsiSkipChars(const S: AnsiString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
var
StrLen: SizeInt;
begin
StrLen := Length(S);
if StrPos > 0 then
begin
if StrPos + NbSeq > StrLen then
begin
NbSeq := StrLen + 1 - StrPos;
StrPos := StrLen + 1;
Result := False;
end
else
begin
// NbSeq := NbSeq;
StrPos := StrLen + NbSeq;
Result := True;
end;
end
else
begin
// previous error
NbSeq := 0;
// StrPos := -1;
Result := False;
end;
end;
function AnsiSkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
var
Index: SizeInt;
B: Byte;
begin
Index := 0;
while Index < NbSeq do
begin
Result := StreamReadByte(S, B);
if not Result then
Break;
Inc(Index);
end;
Result := Index = NbSeq;
NbSeq := Index;
end;
// AnsiSetNextChar = append a character at StrPos
// returns False on error:
// - if an UCS4 character cannot be stored to an ansi string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written (1)
function AnsiSetNextChar(var S: AnsiString; var StrPos: SizeInt; Ch: UCS4): Boolean;
var
StrLen, TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
AnsiBuffer: AnsiString;
begin
StrLen := Length(S);
Result := (StrPos > 0) and (StrPos <= StrLen);
if Result then
begin
SetLength(UTF16Buffer, 2);
TmpPos := 1;
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
if Result and (TmpPos = 2) then
begin
// one wide character
AnsiBuffer := AnsiString(WideString(UTF16Buffer[1]));
S[StrPos] := AnsiBuffer[1];
Inc(StrPos);
end
else
if Result and (TmpPos = 3) then
begin
// one surrogate pair
AnsiBuffer := AnsiString(UTF16Buffer);
S[StrPos] := AnsiBuffer[1];
Inc(StrPos);
end
else
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
S[StrPos] := AnsiReplacementCharacter;
Inc(StrPos);
{$ELSE ~UNICODE_SILENT_FAILURE}
Result := False;
StrPos := -1;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
end;
function AnsiSetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
var
TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
AnsiBuffer: AnsiString;
begin
SetLength(UTF16Buffer, 2);
TmpPos := 1;
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
if Result and (TmpPos = 2) then
begin
// one wide character
AnsiBuffer := AnsiString(WideString(UTF16Buffer[1]));
Result := StreamWriteByte(S, Ord(AnsiBuffer[1]));
end
else
if Result and (TmpPos = 3) then
begin
// one surrogate pair
AnsiBuffer := AnsiString(UTF16Buffer);
Result := StreamWriteByte(S, Ord(AnsiBuffer[1]));
end
else
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
Result := StreamWriteByte(S, Ord(AnsiReplacementCharacter));
{$ELSE ~UNICODE_SILENT_FAILURE}
Result := False;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
function AnsiSetNextChar(var S: AnsiString; CodePage: Word; var StrPos: SizeInt; Ch: UCS4): Boolean;
var
StrLen, TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
AnsiCharacter: AnsiChar;
begin
StrLen := Length(S);
Result := (StrPos > 0) and (StrPos <= StrLen);
if Result then
begin
SetLength(UTF16Buffer, 2);
TmpPos := 1;
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
Result := Result and (WideCharToMultiByte(CodePage, 0, PWideChar(UTF16Buffer), TmpPos-1, @AnsiCharacter, 1, nil, nil) > 0);
if Result then
begin
S[StrPos] := AnsiCharacter;
Inc(StrPos);
end;
if not Result then
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
S[StrPos] := AnsiReplacementCharacter;
Inc(StrPos);
{$ELSE ~UNICODE_SILENT_FAILURE}
Result := False;
StrPos := -1;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
end;
function AnsiSetNextCharToStream(S: TStream; CodePage: Word; Ch: UCS4): Boolean;
var
TmpPos: SizeInt;
UTF16Buffer: TUTF16String;
AnsiCharacter: AnsiChar;
begin
SetLength(UTF16Buffer, 2);
TmpPos := 1;
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
Result := Result and (WideCharToMultiByte(CodePage, 0, PWideChar(UTF16Buffer), TmpPos-1, @AnsiCharacter, 1, nil, nil) > 0);
if Result then
Result := StreamWriteByte(S, Ord(AnsiCharacter));
if not Result then
begin
{$IFDEF UNICODE_SILENT_FAILURE}
// add ReplacementCharacter
Result := StreamWriteByte(S, Ord(AnsiReplacementCharacter));
{$ELSE ~UNICODE_SILENT_FAILURE}
Result := False;
{$ENDIF ~UNICODE_SILENT_FAILURE}
end;
end;
// StringGetNextChar = read next character/sequence at StrPos
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence for WideString)
// StrPos will be incremented by the number of chars that were read
function StringGetNextChar(const S: string; var StrPos: SizeInt): UCS4;
begin
{$IFDEF SUPPORTS_UNICODE}
Result := UTF16GetNextChar(S, StrPos);
{$ELSE ~SUPPORTS_UNICODE}
Result := AnsiGetNextChar(S, StrPos);
{$ENDIF ~SUPPORTS_UNICODE}
end;
// StringSkipChars = skip NbSeq characters/sequences starting from StrPos
// returns False if String is too small
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence for WideString)
// StrPos will be incremented by the number of chars that were skipped
// On return, NbChar contains the number of UTF16 sequences that were skipped
function StringSkipChars(const S: string; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
begin
{$IFDEF SUPPORTS_UNICODE}
Result := UTF16SkipChars(S, StrPos, NbSeq);
{$ELSE ~SUPPORTS_UNICODE}
Result := AnsiSkipChars(S, StrPos, NbSeq);
{$ENDIF ~SUPPORTS_UNICODE}
end;
// StringSetNextChar = append a character/sequence at StrPos
// returns False on error:
// - if an UCS4 character cannot be stored to a string:
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
// StrPos will be incremented by the number of chars that were written
function StringSetNextChar(var S: string; var StrPos: SizeInt; Ch: UCS4): Boolean;
begin
{$IFDEF SUPPORTS_UNICODE}
Result := UTF16SetNextChar(S, StrPos, Ch);
{$ELSE ~SUPPORTS_UNICODE}
Result := AnsiSetNextChar(S, StrPos, Ch);
{$ENDIF ~SUPPORTS_UNICODE}
end;
function WideStringToUTF8(const S: WideString): TUTF8String;
begin
Result := UTF16ToUTF8(S);
end;
function UTF8ToWideString(const S: TUTF8String): WideString;
begin
Result := UTF8ToUTF16(S);
end;
function WideStringToUCS4(const S: WideString): TUCS4Array;
begin
Result := UTF16ToUCS4(S);
end;
function UCS4ToWideString(const S: TUCS4Array): WideString;
begin
Result := UCS4ToUTF16(S);
end;
function AnsiStringToUTF8(const S: AnsiString): TUTF8String;
var
WS: TUTF16String;
begin
WS := TUTF16String(S);
Result := UTF16ToUTF8(WS);
end;
function UTF8ToAnsiString(const S: TUTF8String): AnsiString;
var
WS: TUTF16String;
begin
WS := UTF8ToUTF16(S);
Result := AnsiString(WS);
end;
function AnsiStringToUTF16(const S: AnsiString): TUTF16String;
begin
Result := TUTF16String(S);
end;
function UTF16ToAnsiString(const S: TUTF16String): AnsiString;
begin
Result := AnsiString(S);
end;
function AnsiStringToUCS4(const S: AnsiString): TUCS4Array;
var
WS: TUTF16String;
begin
WS := TUTF16String(S);
Result := UTF16ToUCS4(WS);
end;
function UCS4ToAnsiString(const S: TUCS4Array): AnsiString;
var
WS: TUTF16String;
begin
WS := UCS4ToUTF16(S);
Result := AnsiString(WS);
end;
function StringToUTF8(const S: string): TUTF8String;
var
WS: TUTF16String;
begin
WS := TUTF16String(S);
Result := UTF16ToUTF8(WS);
end;
function TryStringToUTF8(const S: string; out D: TUTF8String): Boolean;
var
WS: TUTF16String;
begin
WS := TUTF16String(S);
Result := TryUTF16ToUTF8(WS, D);
end;
function UTF8ToString(const S: TUTF8String): string;
var
WS: TUTF16String;
begin
WS := UTF8ToUTF16(S);
Result := string(WS);
end;
function TryUTF8ToString(const S: TUTF8String; out D: string): Boolean;
var
WS: TUTF16String;
begin
Result := TryUTF8ToUTF16(S, WS);
D := string(WS);
end;
function StringToUTF16(const S: string): TUTF16String;
begin
Result := TUTF16String(S);
end;
function TryStringToUTF16(const S: string; out D: TUTF16String): Boolean;
begin
D := TUTF16String(S);
Result := True;
end;
function UTF16ToString(const S: TUTF16String): string;
begin
Result := string(S);
end;
function TryUTF16ToString(const S: TUTF16String; out D: string): Boolean;
begin
D := string(S);
Result := True;
end;
function StringToUCS4(const S: string): TUCS4Array;
var
WS: TUTF16String;
begin
WS := TUTF16String(S);
Result := UTF16ToUCS4(WS);
end;
function TryStringToUCS4(const S: string; out D: TUCS4Array): Boolean;
var
WS: TUTF16String;
begin
WS := TUTF16String(S);
Result := TryUTF16ToUCS4(WS, D);
end;
function UCS4ToString(const S: TUCS4Array): string;
var
WS: WideString;
begin
WS := UCS4ToUTF16(S);
Result := string(WS);
end;
function TryUCS4ToString(const S: TUCS4Array; out D: string): Boolean;
var
WS: WideString;
begin
Result := TryUCS4ToUTF16(S, WS);
D := string(WS);
end;
function UTF8ToUTF16(const S: TUTF8String): TUTF16String;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
if S = '' then
Result := ''
else
begin
SrcLength := Length(S);
SetLength(Result, SrcLength); // create enough room
SrcIndex := 1;
DestIndex := 1;
while SrcIndex <= SrcLength do
begin
Ch := UTF8GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
raise EJclUnexpectedEOSequenceError.Create;
UTF16SetNextChar(Result, DestIndex, Ch);
end;
SetLength(Result, DestIndex - 1); // now fix up length
end;
end;
function TryUTF8ToUTF16(const S: TUTF8String; out D: TUTF16String): Boolean;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
Result := True;
if S = '' then
D := ''
else
begin
SrcLength := Length(S);
SetLength(D, SrcLength); // create enough room
SrcIndex := 1;
DestIndex := 1;
while SrcIndex <= SrcLength do
begin
Ch := UTF8GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
begin
Result := False;
D := '';
Exit;
end;
UTF16SetNextChar(D, DestIndex, Ch);
end;
SetLength(D, DestIndex - 1); // now fix up length
end;
end;
function UTF16ToUTF8(const S: TUTF16String): TUTF8String;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
if S = '' then
Result := ''
else
begin
SrcLength := Length(S);
SetLength(Result, SrcLength * 3); // worste case
SrcIndex := 1;
DestIndex := 1;
while SrcIndex <= SrcLength do
begin
Ch := UTF16GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
raise EJclUnexpectedEOSequenceError.Create;
UTF8SetNextChar(Result, DestIndex, Ch);
end;
SetLength(Result, DestIndex - 1); // now fix up length
end;
end;
function TryUTF16ToUTF8(const S: TUTF16String; out D: TUTF8String): Boolean;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
Result := True;
if S = '' then
D := ''
else
begin
SrcLength := Length(S);
SetLength(D, SrcLength * 3); // worste case
SrcIndex := 1;
DestIndex := 1;
while SrcIndex <= SrcLength do
begin
Ch := UTF16GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
begin
Result := False;
D := '';
Exit;
end;
UTF8SetNextChar(D, DestIndex, Ch);
end;
SetLength(D, DestIndex - 1); // now fix up length
end;
end;
function UTF8ToUCS4(const S: TUTF8String): TUCS4Array;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
if S <> '' then
begin
SrcLength := Length(S);
SetLength(Result, SrcLength); // create enough room
SrcIndex := 1;
DestIndex := 0;
while SrcIndex <= SrcLength do
begin
Ch := UTF8GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
raise EJclUnexpectedEOSequenceError.Create;
Result[DestIndex] := Ch;
Inc(DestIndex);
end;
SetLength(Result, DestIndex); // now fix up length
end;
end;
function TryUTF8ToUCS4(const S: TUTF8String; out D: TUCS4Array): Boolean;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
Result := True;
if S <> '' then
begin
SrcLength := Length(S);
SetLength(D, SrcLength); // create enough room
SrcIndex := 1;
DestIndex := 0;
while SrcIndex <= SrcLength do
begin
Ch := UTF8GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
begin
Result := False;
SetLength(D, 0);
Exit;
end;
D[DestIndex] := Ch;
Inc(DestIndex);
end;
SetLength(D, DestIndex); // now fix up length
end;
end;
function UCS4ToUTF8(const S: TUCS4Array): TUTF8String;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
begin
SrcLength := Length(S);
if Length(S) = 0 then
Result := ''
else
begin
SetLength(Result, SrcLength * 3); // assume worst case
DestIndex := 1;
for SrcIndex := 0 to SrcLength - 1 do
begin
UTF8SetNextChar(Result, DestIndex, S[SrcIndex]);
if DestIndex = -1 then
raise EJclUnexpectedEOSequenceError.Create;
end;
SetLength(Result, DestIndex - 1); // set to actual length
end;
end;
function TryUCS4ToUTF8(const S: TUCS4Array; out D: TUTF8String): Boolean;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
begin
SrcLength := Length(S);
Result := True;
if Length(S) = 0 then
D := ''
else
begin
SetLength(D, SrcLength * 3); // assume worst case
DestIndex := 1;
for SrcIndex := 0 to SrcLength - 1 do
begin
UTF8SetNextChar(D, DestIndex, S[SrcIndex]);
if DestIndex = -1 then
begin
Result := False;
D := '';
Exit;
end;
end;
SetLength(D, DestIndex - 1); // set to actual length
end;
end;
function UTF16ToUCS4(const S: TUTF16String): TUCS4Array;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
if S <> '' then
begin
SrcLength := Length(S);
SetLength(Result, SrcLength); // create enough room
SrcIndex := 1;
DestIndex := 0;
while SrcIndex <= SrcLength do
begin
Ch := UTF16GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
raise EJclUnexpectedEOSequenceError.Create;
Result[DestIndex] := Ch;
Inc(DestIndex);
end;
SetLength(Result, DestIndex); // now fix up length
end;
end;
function TryUTF16ToUCS4(const S: TUTF16String; out D: TUCS4Array): Boolean;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
Ch: UCS4;
begin
Result := True;
if S <> '' then
begin
SrcLength := Length(S);
SetLength(D, SrcLength); // create enough room
SrcIndex := 1;
DestIndex := 0;
while SrcIndex <= SrcLength do
begin
Ch := UTF16GetNextChar(S, SrcIndex);
if SrcIndex = -1 then
begin
Result := False;
SetLength(D, 0);
Exit;
end;
D[DestIndex] := Ch;
Inc(DestIndex);
end;
SetLength(D, DestIndex); // now fix up length
end;
end;
function UCS4ToUTF16(const S: TUCS4Array): TUTF16String;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
begin
SrcLength := Length(S);
if SrcLength = 0 then
Result := ''
else
begin
SetLength(Result, SrcLength * 3); // assume worst case
DestIndex := 1;
for SrcIndex := 0 to SrcLength - 1 do
begin
UTF16SetNextChar(Result, DestIndex, S[SrcIndex]);
if DestIndex = -1 then
raise EJclUnexpectedEOSequenceError.Create;
end;
SetLength(Result, DestIndex - 1); // set to actual length
end;
end;
function TryUCS4ToUTF16(const S: TUCS4Array; out D:TUTF16String): Boolean;
var
SrcIndex, SrcLength, DestIndex: SizeInt;
begin
SrcLength := Length(S);
Result := True;
if SrcLength = 0 then
D := ''
else
begin
SetLength(D, SrcLength * 3); // assume worst case
DestIndex := 1;
for SrcIndex := 0 to SrcLength - 1 do
begin
UTF16SetNextChar(D, DestIndex, S[SrcIndex]);
if DestIndex = -1 then
begin
Result := False;
D := '';
Exit;
end;
end;
SetLength(D, DestIndex - 1); // set to actual length
end;
end;
function UTF8CharCount(const S: TUTF8String): SizeInt;
var
StrPos: SizeInt;
begin
StrPos := 1;
Result := Length(S);
UTF8SkipChars(S, StrPos, Result);
if StrPos = -1 then
raise EJclUnexpectedEOSequenceError.Create;
end;
function UTF16CharCount(const S: TUTF16String): SizeInt;
var
StrPos: SizeInt;
begin
StrPos := 1;
Result := Length(S);
UTF16SkipChars(S, StrPos, Result);
if StrPos = -1 then
raise EJclUnexpectedEOSequenceError.Create;
end;
function UCS2CharCount(const S: TUCS2String): SizeInt;
begin
Result := Length(S);
end;
function UCS4CharCount(const S: TUCS4Array): SizeInt;
begin
Result := Length(S);
end;
function GetUCS4CharAt(const UTF8Str: TUTF8String; Index: SizeInt; out Value: UCS4): Boolean; overload;
var
StrPos: SizeInt;
begin
StrPos := 1;
Result := Index >= 0;
if Result then
Result := UTF8SkipChars(UTF8Str, StrPos, Index);
if StrPos = -1 then
raise EJclUnexpectedEOSequenceError.Create;
Result := Result and (StrPos <= Length(UTF8Str));
if Result then
begin
Value := UTF8GetNextChar(UTF8Str, StrPos);
if StrPos = -1 then
raise EJclUnexpectedEOSequenceError.Create;
end;
end;
function GetUCS4CharAt(const WideStr: WideString; Index: SizeInt; out Value: UCS4; IsUTF16: Boolean): Boolean; overload;
var
StrPos: SizeInt;
begin
if IsUTF16 then
begin
StrPos := 1;
Result := Index >= 0;
if Result then
Result := UTF16SkipChars(WideStr, StrPos, Index);
if StrPos = -1 then
raise EJclUnexpectedEOSequenceError.Create;
Result := Result and (StrPos <= Length(WideStr));
if Result then
begin
Value := UTF16GetNextChar(WideStr, StrPos);
if StrPos = -1 then
raise EJclUnexpectedEOSequenceError.Create;
end;
end
else
begin
Result := (Index >= 1) and (Index <= Length(WideStr));
Value := UCS4(WideStr[Index]);
end;
end;
function GetUCS4CharAt(const UCS4Str: TUCS4Array; Index: SizeInt; out Value: UCS4): Boolean; overload;
begin
Result := (Index >= 0) and (Index < Length(UCS4Str));
if Result then
Value := UCS4Str[Index];
end;
function UCS4ToAnsiChar(Value: UCS4): AnsiChar;
var
Buf: WideString;
StrPos: SizeInt;
begin
StrPos := 1;
Buf := #0#0;
if UTF16SetNextChar(Buf, StrPos, Value) then
Result := AnsiString(Buf)[1]
else
Result := AnsiReplacementCharacter;
end;
function UCS4ToWideChar(Value: UCS4): WideChar;
begin
if Value <= MaximumUCS2 then
Result := WideChar(Value)
else
Result := WideChar(UCS4ReplacementCharacter);
end;
function UCS4ToChar(Value: UCS4): Char;
begin
{$IFDEF SUPPORTS_UNICODE}
Result := UCS4ToWideChar(Value);
{$ELSE ~SUPPORTS_UNICODE}
Result := UCS4ToAnsiChar(Value);
{$ENDIF ~SUPPORTS_UNICODE}
end;
function AnsiCharToUCS4(Value: AnsiChar): UCS4;
var
Buf: WideString;
StrPos: SizeInt;
begin
StrPos := 1;
Buf := WideString(AnsiString(Value));
Result := UTF16GetNextChar(Buf, StrPos);
end;
function WideCharToUCS4(Value: WideChar): UCS4;
begin
Result := UCS4(Value);
end;
function CharToUCS4(Value: Char): UCS4;
begin
{$IFDEF SUPPORTS_UNICODE}
Result := WideCharToUCS4(Value);
{$ELSE ~SUPPORTS_UNICODE}
Result := AnsiCharToUCS4(Value);
{$ENDIF ~SUPPORTS_UNICODE}
end;
{$IFDEF UNITVERSIONING}
initialization
RegisterUnitVersion(HInstance, UnitVersioning);
finalization
UnregisterUnitVersion(HInstance);
{$ENDIF UNITVERSIONING}
end.