git-svn-id: https://192.168.0.254/svn/Componentes.Terceros.jcl@20 c37d764d-f447-7644-a108-883140d013fb
2635 lines
82 KiB
ObjectPascal
2635 lines
82 KiB
ObjectPascal
{**************************************************************************************************}
|
|
{ }
|
|
{ Project JEDI Code Library (JCL) }
|
|
{ }
|
|
{ The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
|
|
{ you may not use this file except in compliance with the License. You may obtain a copy of the }
|
|
{ License at http://www.mozilla.org/MPL/ }
|
|
{ }
|
|
{ Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
|
|
{ ANY KIND, either express or implied. See the License for the specific language governing rights }
|
|
{ and limitations under the License. }
|
|
{ }
|
|
{ The Original Code is JclUnicode.pas. }
|
|
{ }
|
|
{ The Initial Developer of the Original Code is Mike Lischke (public att lischke-online dott de). }
|
|
{ Portions created by Mike Lischke are Copyright (C) 1999-2000 Mike Lischke. All Rights Reserved. }
|
|
{ }
|
|
{ Contributor(s): }
|
|
{ Marcel van Brakel }
|
|
{ Andreas Hausladen (ahuser) }
|
|
{ Mike Lischke }
|
|
{ Flier Lu (flier) }
|
|
{ Robert Marquardt (marquardt) }
|
|
{ Robert Rossmair (rrossmair) }
|
|
{ Olivier Sannier (obones) }
|
|
{ Matthias Thoma (mthoma) }
|
|
{ Petr Vones (pvones) }
|
|
{ Peter Schraut (http://www.console-dev.de) }
|
|
{ Florent Ouchet (outchy) }
|
|
{ }
|
|
{**************************************************************************************************}
|
|
{ }
|
|
{ String conversion routines }
|
|
{ }
|
|
{**************************************************************************************************}
|
|
{ }
|
|
{ Last modified: $Date:: 2009-08-09 15:08:29 +0200 (dim., 09 août 2009) $ }
|
|
{ Revision: $Rev:: 2921 $ }
|
|
{ Author: $Author:: outchy $ }
|
|
{ }
|
|
{**************************************************************************************************}
|
|
|
|
unit JclStringConversions;
|
|
|
|
{$I jcl.inc}
|
|
|
|
interface
|
|
|
|
uses
|
|
Classes,
|
|
{$IFDEF UNITVERSIONING}
|
|
JclUnitVersioning,
|
|
{$ENDIF UNITVERSIONING}
|
|
JclBase;
|
|
|
|
type
|
|
EJclStringConversionError = class(EJclError);
|
|
EJclUnexpectedEOSequenceError = class (EJclStringConversionError)
|
|
public
|
|
constructor Create;
|
|
end;
|
|
|
|
// conversion routines between Ansi, UTF-16, UCS-4 and UTF8 strings
|
|
|
|
// one shot conversion between PAnsiChar and PWideChar
|
|
procedure ExpandASCIIString(const Source: PAnsiChar; Target: PWideChar; Count: SizeInt);
|
|
|
|
// tpye of stream related functions
|
|
type
|
|
TJclStreamGetNextCharFunc = function(S: TStream; out Ch: UCS4): Boolean;
|
|
TJclStreamSkipCharsFunc = function(S: TStream; var NbSeq: SizeInt): Boolean;
|
|
TJclStreamSetNextCharFunc = function(S: TStream; Ch: UCS4): Boolean;
|
|
|
|
// iterative conversions
|
|
|
|
// UTF8GetNextChar = read next UTF8 sequence at StrPos
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF8 sequence)
|
|
// StrPos will be incremented by the number of chars that were read
|
|
function UTF8GetNextChar(const S: TUTF8String; var StrPos: SizeInt): UCS4;
|
|
function UTF8GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
|
|
|
|
// UTF8SkipChars = skip NbSeq UTF8 sequences starting from StrPos
|
|
// returns False if String is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF8 sequence)
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbSeq contains the number of UTF8 sequences that were skipped
|
|
function UTF8SkipChars(const S: TUTF8String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
|
|
function UTF8SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
|
|
|
|
// UTF8SetNextChar = append an UTF8 sequence at StrPos
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to an UTF-8 string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, callee did nothing, caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written
|
|
function UTF8SetNextChar(var S: TUTF8String; var StrPos: SizeInt; Ch: UCS4): Boolean;
|
|
function UTF8SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
|
|
|
|
// UTF16GetNextChar = read next UTF16 sequence at StrPos
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
|
|
// StrPos will be incremented by the number of chars that were read
|
|
function UTF16GetNextChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; overload;
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16GetNextChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; overload;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
function UTF16GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
|
|
|
|
// UTF16GetPreviousChar = read previous UTF16 sequence starting at StrPos-1
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
|
|
// StrPos will be decremented by the number of chars that were read
|
|
function UTF16GetPreviousChar(const S: TUTF16String; var StrPos: SizeInt): UCS4; overload;
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16GetPreviousChar(const S: UnicodeString; var StrPos: SizeInt): UCS4; overload;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
|
|
// UTF16SkipChars = skip NbSeq UTF16 sequences starting from StrPos
|
|
// returns False if String is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence)
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbChar contains the number of UTF16 sequences that were skipped
|
|
function UTF16SkipChars(const S: TUTF16String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; overload;
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16SkipChars(const S: UnicodeString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; overload;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
function UTF16SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
|
|
|
|
// UTF16SetNextChar = append an UTF16 sequence at StrPos
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to an UTF-16 string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written
|
|
function UTF16SetNextChar(var S: TUTF16String; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16SetNextChar(var S: UnicodeString; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
function UTF16SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
|
|
|
|
// AnsiGetNextChar = read next character at StrPos
|
|
// StrPos will be incremented by the number of chars that were read (1)
|
|
function AnsiGetNextChar(const S: AnsiString; var StrPos: SizeInt): UCS4; overload;
|
|
function AnsiGetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean; overload;
|
|
|
|
// same as AnsiGetNextChar* with custom codepage
|
|
function AnsiGetNextChar(const S: AnsiString; CodePage: Word; var StrPos: SizeInt): UCS4; overload;
|
|
function AnsiGetNextCharFromStream(S: TStream; CodePage: Word; out Ch: UCS4): Boolean; overload;
|
|
|
|
// AnsiSkipChars = skip NbSeq characters starting from StrPos
|
|
// returns False if String is too small
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbChar contains the number of UTF16 sequences that were skipped
|
|
function AnsiSkipChars(const S: AnsiString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
|
|
function AnsiSkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
|
|
|
|
// AnsiSetNextChar = append a character at StrPos
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to an ansi string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written (1)
|
|
function AnsiSetNextChar(var S: AnsiString; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
|
|
function AnsiSetNextCharToStream(S: TStream; Ch: UCS4): Boolean; overload;
|
|
|
|
// same as AnsiSetNextChar* with custom codepage
|
|
function AnsiSetNextChar(var S: AnsiString; CodePage: Word; var StrPos: SizeInt; Ch: UCS4): Boolean; overload;
|
|
function AnsiSetNextCharToStream(S: TStream; CodePage: Word; Ch: UCS4): Boolean; overload;
|
|
|
|
// StringGetNextChar = read next character/sequence at StrPos
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence for WideString)
|
|
// StrPos will be incremented by the number of chars that were read
|
|
function StringGetNextChar(const S: string; var StrPos: SizeInt): UCS4; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
|
|
|
|
// StringSkipChars = skip NbSeq characters/sequences starting from StrPos
|
|
// returns False if String is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence for WideString)
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbChar contains the number of UTF16 sequences that were skipped
|
|
function StringSkipChars(const S: string; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
|
|
|
|
// StringSetNextChar = append a character/sequence at StrPos
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to a string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written
|
|
function StringSetNextChar(var S: string; var StrPos: SizeInt; Ch: UCS4): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
|
|
|
|
// one shot conversions between WideString and others
|
|
function WideStringToUTF8(const S: WideString): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UTF8ToWideString(const S: TUTF8String): WideString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function WideStringToUCS4(const S: WideString): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UCS4ToWideString(const S: TUCS4Array): WideString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
|
|
// one shot conversions between AnsiString and others
|
|
function AnsiStringToUTF8(const S: AnsiString): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UTF8ToAnsiString(const S: TUTF8String): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function AnsiStringToUTF16(const S: AnsiString): TUTF16String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UTF16ToAnsiString(const S: TUTF16String): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function AnsiStringToUCS4(const S: AnsiString): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UCS4ToAnsiString(const S: TUCS4Array): AnsiString; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
|
|
// one shot conversions between string and others
|
|
function StringToUTF8(const S: string): TUTF8String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UTF8ToString(const S: TUTF8String): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function StringToUTF16(const S: string): TUTF16String; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UTF16ToString(const S: TUTF16String): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function StringToUCS4(const S: string): TUCS4Array; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function UCS4ToString(const S: TUCS4Array): string; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
|
|
function TryStringToUTF8(const S: string; out D: TUTF8String): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function TryUTF8ToString(const S: TUTF8String; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function TryStringToUTF16(const S: string; out D: TUTF16String): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function TryUTF16ToString(const S: TUTF16String; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function TryStringToUCS4(const S: string; out D: TUCS4Array): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
function TryUCS4ToString(const S: TUCS4Array; out D: string): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
|
|
function UTF8ToUTF16(const S: TUTF8String): TUTF16String;
|
|
function UTF16ToUTF8(const S: TUTF16String): TUTF8String;
|
|
function UTF8ToUCS4(const S: TUTF8String): TUCS4Array;
|
|
function UCS4ToUTF8(const S: TUCS4Array): TUTF8String;
|
|
function UTF16ToUCS4(const S: TUTF16String): TUCS4Array;
|
|
function UCS4ToUTF16(const S: TUCS4Array): TUTF16String;
|
|
|
|
function TryUTF8ToUTF16(const S: TUTF8String; out D: TUTF16String): Boolean;
|
|
function TryUTF16ToUTF8(const S: TUTF16String; out D: TUTF8String): Boolean;
|
|
function TryUTF8ToUCS4(const S: TUTF8String; out D: TUCS4Array): Boolean;
|
|
function TryUCS4ToUTF8(const S: TUCS4Array; out D: TUTF8String): Boolean;
|
|
function TryUTF16ToUCS4(const S: TUTF16String; out D: TUCS4Array): Boolean;
|
|
function TryUCS4ToUTF16(const S: TUCS4Array; out D: TUTF16String): Boolean;
|
|
|
|
// indexed conversions
|
|
function UTF8CharCount(const S: TUTF8String): SizeInt;
|
|
function UTF16CharCount(const S: TUTF16String): SizeInt;
|
|
function UCS2CharCount(const S: TUCS2String): SizeInt;
|
|
function UCS4CharCount(const S: TUCS4Array): SizeInt;
|
|
// returns False if string is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined and an invalid UTFX sequence is detected, an exception is raised
|
|
// returns True on success and Value contains UCS4 character that was read
|
|
function GetUCS4CharAt(const UTF8Str: TUTF8String; Index: SizeInt; out Value: UCS4): Boolean; overload;
|
|
function GetUCS4CharAt(const WideStr: TUTF16String; Index: SizeInt; out Value: UCS4; IsUTF16: Boolean = True): Boolean; overload;
|
|
function GetUCS4CharAt(const UCS4Str: TUCS4Array; Index: SizeInt; out Value: UCS4): Boolean; overload;
|
|
|
|
function UCS4ToAnsiChar(Value: UCS4): AnsiChar;
|
|
function UCS4ToWideChar(Value: UCS4): WideChar;
|
|
function UCS4ToChar(Value: UCS4): Char; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
|
|
function AnsiCharToUCS4(Value: AnsiChar): UCS4;
|
|
function WideCharToUCS4(Value: WideChar): UCS4;
|
|
function CharToUCS4(Value: Char): UCS4; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
|
|
|
|
{$IFDEF UNITVERSIONING}
|
|
const
|
|
UnitVersioning: TUnitVersionInfo = (
|
|
RCSfile: '$URL: https://jcl.svn.sourceforge.net/svnroot/jcl/tags/JCL-2.1-Build3536/jcl/source/common/JclStringConversions.pas $';
|
|
Revision: '$Revision: 2921 $';
|
|
Date: '$Date: 2009-08-09 15:08:29 +0200 (dim., 09 août 2009) $';
|
|
LogPath: 'JCL\source\common';
|
|
Extra: '';
|
|
Data: nil
|
|
);
|
|
{$ENDIF UNITVERSIONING}
|
|
|
|
implementation
|
|
|
|
uses
|
|
{$IFDEF MSWINDOWS}
|
|
Windows,
|
|
{$ENDIF MSWINDOWS}
|
|
JclResources;
|
|
|
|
const MB_ERR_INVALID_CHARS = 8;
|
|
|
|
constructor EJclUnexpectedEOSequenceError.Create;
|
|
begin
|
|
inherited CreateRes(@RsEUnexpectedEOSeq);
|
|
end;
|
|
|
|
function StreamReadByte(S: TStream; out B: Byte): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
|
|
begin
|
|
B := 0;
|
|
Result := S.Read(B, SizeOf(B)) = SizeOf(B);
|
|
end;
|
|
|
|
function StreamWriteByte(S: TStream; B: Byte): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
|
|
begin
|
|
Result := S.Write(B, SizeOf(B)) = SizeOf(B);
|
|
end;
|
|
|
|
function StreamReadWord(S: TStream; out W: Word): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
|
|
begin
|
|
W := 0;
|
|
Result := S.Read(W, SizeOf(W)) = SizeOf(W);
|
|
end;
|
|
|
|
function StreamWriteWord(S: TStream; W: Word): Boolean; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF}
|
|
begin
|
|
Result := S.Write(W, SizeOf(W)) = SizeOf(W);
|
|
end;
|
|
|
|
//----------------- conversion routines ------------------------------------------------------------
|
|
|
|
// Converts the given source ANSI string into a Unicode string by expanding each character
|
|
// from one byte to two bytes.
|
|
// EAX contains Source, EDX contains Target, ECX contains Count
|
|
|
|
procedure ExpandASCIIString(const Source: PAnsiChar; Target: PWideChar; Count: SizeInt);
|
|
asm
|
|
{$IFDEF CPU32}
|
|
// --> EAX Source
|
|
// EDX Target
|
|
// ECX Count
|
|
JECXZ @@Finish // go out if there is nothing to do (ECX = 0)
|
|
PUSH ESI
|
|
MOV ESI, EAX
|
|
XOR EAX, EAX
|
|
@@1:
|
|
MOV AL, [ESI]
|
|
INC ESI
|
|
MOV [EDX], AX
|
|
ADD EDX, 2
|
|
DEC ECX
|
|
JNZ @@1
|
|
POP ESI
|
|
{$ENDIF CPU32}
|
|
{$IFDEF CPU64}
|
|
// --> RCX Source
|
|
// RDX Target
|
|
// R8 Count
|
|
MOV RAX, RCX
|
|
MOV RCX, R8
|
|
JRCXZ @@Finish // go out if there is nothing to do (ECX = 0)
|
|
MOV RSI, RAX
|
|
XOR RAX, RAX
|
|
@@1:
|
|
MOV AL, [RSI]
|
|
INC RSI
|
|
MOV [RDX], AX
|
|
ADD RDX, 2
|
|
DEC RCX
|
|
JNZ @@1
|
|
{$ENDIF CPU64}
|
|
@@Finish:
|
|
end;
|
|
|
|
const
|
|
HalfShift: Integer = 10;
|
|
|
|
HalfBase: UCS4 = $0010000;
|
|
HalfMask: UCS4 = $3FF;
|
|
|
|
procedure FlagInvalidSequence(var StrPos: SizeInt; Increment: SizeInt; out Ch: UCS4); overload;
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
Ch := UCS4ReplacementCharacter;
|
|
Inc(StrPos, Increment);
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
StrPos := -1;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
|
|
procedure FlagInvalidSequence(var StrPos: SizeInt; Increment: SizeInt); overload;
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
Inc(StrPos, Increment);
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
StrPos := -1;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
|
|
procedure FlagInvalidSequence(out Ch: UCS4); overload;
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
Ch := UCS4ReplacementCharacter;
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
|
|
procedure FlagInvalidSequence; overload;
|
|
begin
|
|
{$IFNDEF UNICODE_SILENT_FAILURE}
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF8 sequence)
|
|
// StrPos will be incremented by the number of chars that were read
|
|
function UTF8GetNextChar(const S: TUTF8String; var StrPos: SizeInt): UCS4;
|
|
var
|
|
StrLength: SizeInt;
|
|
ChNext: UCS4;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if (StrPos <= StrLength) and (StrPos > 0) then
|
|
begin
|
|
Result := UCS4(S[StrPos]);
|
|
|
|
case Result of
|
|
$00..$7F:
|
|
// 1 byte to read
|
|
Inc(StrPos);
|
|
$C0..$DF:
|
|
begin
|
|
// 2 bytes to read
|
|
if StrPos >= StrLength then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((Result and $1F) shl 6) or (ChNext and $3F);
|
|
Inc(StrPos, 2);
|
|
end;
|
|
$E0..$EF:
|
|
begin
|
|
// 3 bytes to read
|
|
if (StrPos + 1) >= StrLength then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((Result and $0F) shl 12) or ((ChNext and $3F) shl 6);
|
|
ChNext := UCS4(S[StrPos + 2]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 2, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or (ChNext and $3F);
|
|
Inc(StrPos, 3);
|
|
end;
|
|
$F0..$F7:
|
|
begin
|
|
// 4 bytes to read
|
|
if (StrPos + 2) >= StrLength then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((Result and $07) shl 18) or ((ChNext and $3F) shl 12);
|
|
ChNext := UCS4(S[StrPos + 2]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 2, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or ((ChNext and $3F) shl 6);
|
|
ChNext := UCS4(S[StrPos + 3]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 3, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or (ChNext and $3F);
|
|
Inc(StrPos, 4);
|
|
end;
|
|
$F8..$FB:
|
|
begin
|
|
// 5 bytes to read
|
|
if (StrPos + 3) >= StrLength then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((Result and $03) shl 24) or ((ChNext and $3F) shl 18);
|
|
ChNext := UCS4(S[StrPos + 2]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 2, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or ((ChNext and $3F) shl 12);
|
|
ChNext := UCS4(S[StrPos + 3]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 3, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or ((ChNext and $3F) shl 6);
|
|
ChNext := UCS4(S[StrPos + 4]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 4, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or (ChNext and $3F);
|
|
Inc(StrPos, 5);
|
|
end;
|
|
$FC..$FD:
|
|
begin
|
|
// 6 bytes to read
|
|
if (StrPos + 4) >= StrLength then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((Result and $01) shl 30) or ((ChNext and $3F) shl 24);
|
|
ChNext := UCS4(S[StrPos + 2]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 2, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or ((ChNext and $3F) shl 18);
|
|
ChNext := UCS4(S[StrPos + 3]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 3, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or ((ChNext and $3F) shl 12);
|
|
ChNext := UCS4(S[StrPos + 4]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 4, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or ((ChNext and $3F) shl 6);
|
|
ChNext := UCS4(S[StrPos + 5]);
|
|
if (ChNext and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 5, Result);
|
|
Exit;
|
|
end;
|
|
Result := Result or (ChNext and $3F);
|
|
Inc(StrPos, 6);
|
|
end;
|
|
else
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
// StrPos > StrLength
|
|
Result := 0;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
end;
|
|
end;
|
|
|
|
function UTF8GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
|
|
var
|
|
B: Byte;
|
|
begin
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
Ch := UCS4(B);
|
|
|
|
case Ch of
|
|
$00..$7F: ;
|
|
// 1 byte to read
|
|
// nothing to do
|
|
$C0..$DF:
|
|
begin
|
|
// 2 bytes to read
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := ((Ch and $1F) shl 6) or (B and $3F);
|
|
end;
|
|
$E0..$EF:
|
|
begin
|
|
// 3 bytes to read
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := ((Ch and $0F) shl 12) or ((B and $3F) shl 6);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or (B and $3F);
|
|
end;
|
|
$F0..$F7:
|
|
begin
|
|
// 4 bytes to read
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := ((Ch and $07) shl 18) or ((B and $3F) shl 12);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or ((B and $3F) shl 6);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or (B and $3F);
|
|
end;
|
|
$F8..$FB:
|
|
begin
|
|
// 5 bytes to read
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := ((Ch and $03) shl 24) or ((B and $3F) shl 18);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or ((B and $3F) shl 12);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or ((B and $3F) shl 6);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or (B and $3F);
|
|
end;
|
|
$FC..$FD:
|
|
begin
|
|
// 6 bytes to read
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := ((Ch and $01) shl 30) or ((B and $3F) shl 24);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or ((B and $3F) shl 18);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or ((B and $3F) shl 12);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or ((B and $3F) shl 6);
|
|
Result := StreamReadByte(S,B);
|
|
if not Result then
|
|
Exit;
|
|
if (B and $C0) <> $80 then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := Ch or (B and $3F);
|
|
end;
|
|
else
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
end;
|
|
|
|
// returns False if String is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF8 sequence)
|
|
// StrPos will be incremented by the number of ansi chars that were skipped
|
|
// On return, NbSeq contains the number of UTF8 sequences that were skipped
|
|
function UTF8SkipChars(const S: TUTF8String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
|
|
var
|
|
StrLength: SizeInt;
|
|
Ch: UCS4;
|
|
Index: SizeInt;
|
|
begin
|
|
Result := True;
|
|
StrLength := Length(S);
|
|
|
|
Index := 0;
|
|
while (Index < NbSeq) and (StrPos > 0) do
|
|
begin
|
|
Ch := UCS4(S[StrPos]);
|
|
|
|
case Ch of
|
|
$00..$7F:
|
|
// 1 byte to skip
|
|
Inc(StrPos);
|
|
$C0..$DF:
|
|
// 2 bytes to skip
|
|
if (StrPos >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
Inc(StrPos, 2);
|
|
$E0..$EF:
|
|
// 3 bytes to skip
|
|
if ((StrPos + 1) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 2)
|
|
else
|
|
Inc(StrPos, 3);
|
|
$F0..$F7:
|
|
// 4 bytes to skip
|
|
if ((StrPos + 2) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 2)
|
|
else
|
|
if (UCS4(S[StrPos + 3]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 3)
|
|
else
|
|
Inc(StrPos, 4);
|
|
$F8..$FB:
|
|
// 5 bytes to skip
|
|
if ((StrPos + 3) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 2)
|
|
else
|
|
if (UCS4(S[StrPos + 3]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 3)
|
|
else
|
|
if (UCS4(S[StrPos + 4]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 4)
|
|
else
|
|
Inc(StrPos, 5);
|
|
$FC..$FD:
|
|
// 6 bytes to skip
|
|
if ((StrPos + 4) >= StrLength) or ((UCS4(S[StrPos + 1]) and $C0) <> $80) then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
if (UCS4(S[StrPos + 2]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 2)
|
|
else
|
|
if (UCS4(S[StrPos + 3]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 3)
|
|
else
|
|
if (UCS4(S[StrPos + 4]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 4)
|
|
else
|
|
if (UCS4(S[StrPos + 5]) and $C0) <> $80 then
|
|
FlagInvalidSequence(StrPos, 5)
|
|
else
|
|
Inc(StrPos, 6);
|
|
else
|
|
FlagInvalidSequence(StrPos, 1);
|
|
end;
|
|
|
|
if StrPos <> -1 then
|
|
Inc(Index);
|
|
if (StrPos > StrLength) and (Index < NbSeq) then
|
|
begin
|
|
Result := False;
|
|
Break;
|
|
end;
|
|
end;
|
|
NbSeq := Index;
|
|
end;
|
|
|
|
function UTF8SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
|
|
var
|
|
B: Byte;
|
|
Index: SizeInt;
|
|
begin
|
|
Index := 0;
|
|
while (Index < NbSeq) do
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
case B of
|
|
$00..$7F: ;
|
|
// 1 byte to skip
|
|
// nothing to do
|
|
$C0..$DF:
|
|
// 2 bytes to skip
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
end;
|
|
$E0..$EF:
|
|
// 3 bytes to skip
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
end;
|
|
$F0..$F7:
|
|
// 4 bytes to skip
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
end;
|
|
$F8..$FB:
|
|
// 5 bytes to skip
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
end;
|
|
$FC..$FD:
|
|
// 6 bytes to skip
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
if (B and $C0) <> $80 then
|
|
FlagInvalidSequence;
|
|
end;
|
|
else
|
|
FlagInvalidSequence;
|
|
end;
|
|
Inc(Index);
|
|
end;
|
|
Result := Index = NbSeq;
|
|
NbSeq := Index;
|
|
end;
|
|
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to an UTF-8 string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written
|
|
function UTF8SetNextChar(var S: TUTF8String; var StrPos: SizeInt; Ch: UCS4): Boolean;
|
|
var
|
|
StrLength: SizeInt;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if Ch <= $7F then
|
|
begin
|
|
// 7 bits to store
|
|
Result := (StrPos > 0) and (StrPos <= StrLength);
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiChar(Ch);
|
|
Inc(StrPos);
|
|
end;
|
|
end
|
|
else
|
|
if Ch <= $7FF then
|
|
begin
|
|
// 11 bits to store
|
|
Result := (StrPos > 0) and (StrPos < StrLength);
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiChar($C0 or (Ch shr 6)); // 5 bits
|
|
S[StrPos + 1] := AnsiChar((Ch and $3F) or $80); // 6 bits
|
|
Inc(StrPos, 2);
|
|
end;
|
|
end
|
|
else
|
|
if Ch <= $FFFF then
|
|
begin
|
|
// 16 bits to store
|
|
Result := (StrPos > 0) and (StrPos < (StrLength - 1));
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiChar($E0 or (Ch shr 12)); // 4 bits
|
|
S[StrPos + 1] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
|
|
S[StrPos + 2] := AnsiChar((Ch and $3F) or $80); // 6 bits
|
|
Inc(StrPos, 3);
|
|
end;
|
|
end
|
|
else
|
|
if Ch <= $1FFFFF then
|
|
begin
|
|
// 21 bits to store
|
|
Result := (StrPos > 0) and (StrPos < (StrLength - 2));
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiChar($F0 or (Ch shr 18)); // 3 bits
|
|
S[StrPos + 1] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits
|
|
S[StrPos + 2] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
|
|
S[StrPos + 3] := AnsiChar((Ch and $3F) or $80); // 6 bits
|
|
Inc(StrPos, 4);
|
|
end;
|
|
end
|
|
else
|
|
if Ch <= $3FFFFFF then
|
|
begin
|
|
// 26 bits to store
|
|
Result := (StrPos > 0) and (StrPos < (StrLength - 2));
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiChar($F8 or (Ch shr 24)); // 2 bits
|
|
S[StrPos + 1] := AnsiChar(((Ch shr 18) and $3F) or $80); // 6 bits
|
|
S[StrPos + 2] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits
|
|
S[StrPos + 3] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
|
|
S[StrPos + 4] := AnsiChar((Ch and $3F) or $80); // 6 bits
|
|
Inc(StrPos, 5);
|
|
end;
|
|
end
|
|
else
|
|
if Ch <= MaximumUCS4 then
|
|
begin
|
|
// 31 bits to store
|
|
Result := (StrPos > 0) and (StrPos < (StrLength - 3));
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiChar($FC or (Ch shr 30)); // 1 bits
|
|
S[StrPos + 1] := AnsiChar(((Ch shr 24) and $3F) or $80); // 6 bits
|
|
S[StrPos + 2] := AnsiChar(((Ch shr 18) and $3F) or $80); // 6 bits
|
|
S[StrPos + 3] := AnsiChar(((Ch shr 12) and $3F) or $80); // 6 bits
|
|
S[StrPos + 4] := AnsiChar(((Ch shr 6) and $3F) or $80); // 6 bits
|
|
S[StrPos + 5] := AnsiChar((Ch and $3F) or $80); // 6 bits
|
|
Inc(StrPos, 6);
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
Result := (StrPos > 0) and (StrPos < (StrLength - 1));
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiChar($E0 or (UCS4ReplacementCharacter shr 12)); // 4 bits
|
|
S[StrPos + 1] := AnsiChar(((UCS4ReplacementCharacter shr 6) and $3F) or $80); // 6 bits
|
|
S[StrPos + 2] := AnsiChar((UCS4ReplacementCharacter and $3F) or $80); // 6 bits
|
|
Inc(StrPos, 3);
|
|
end;
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
StrPos := -1;
|
|
Result := False;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
|
|
function UTF8SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
|
|
begin
|
|
if Ch <= $7F then
|
|
// 7 bits to store
|
|
Result := StreamWriteByte(S,Ch)
|
|
else
|
|
if Ch <= $7FF then
|
|
// 11 bits to store
|
|
Result := StreamWriteByte(S, $C0 or (Ch shr 6)) and // 5 bits
|
|
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
|
|
else
|
|
if Ch <= $FFFF then
|
|
// 16 bits to store
|
|
Result := StreamWriteByte(S, $E0 or (Ch shr 12)) and // 4 bits
|
|
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
|
|
else
|
|
if Ch <= $1FFFFF then
|
|
// 21 bits to store
|
|
Result := StreamWriteByte(S, $F0 or (Ch shr 18)) and // 3 bits
|
|
StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
|
|
else
|
|
if Ch <= $3FFFFFF then
|
|
// 26 bits to store
|
|
Result := StreamWriteByte(S, $F8 or (Ch shr 24)) and // 2 bits
|
|
StreamWriteByte(S, ((Ch shr 18) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
|
|
else
|
|
if Ch <= MaximumUCS4 then
|
|
// 31 bits to store
|
|
Result := StreamWriteByte(S, $FC or (Ch shr 30)) and // 1 bits
|
|
StreamWriteByte(S, ((Ch shr 24) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, ((Ch shr 18) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, ((Ch shr 12) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, ((Ch shr 6) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, (Ch and $3F) or $80) // 6 bits
|
|
else
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
Result := StreamWriteByte(S, $E0 or (UCS4ReplacementCharacter shr 12)) and // 4 bits
|
|
StreamWriteByte(S, ((UCS4ReplacementCharacter shr 6) and $3F) or $80) and // 6 bits
|
|
StreamWriteByte(S, (UCS4ReplacementCharacter and $3F) or $80); // 6 bits
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
Result := False;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
|
|
// StrPos will be incremented by the number of chars that were read
|
|
function UTF16GetNextChar(const S: TUTF16String; var StrPos: SizeInt): UCS4;
|
|
var
|
|
StrLength: SizeInt;
|
|
ChNext: UCS4;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if (StrPos <= StrLength) and (StrPos > 0) then
|
|
begin
|
|
Result := UCS4(S[StrPos]);
|
|
|
|
case Result of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
begin
|
|
// 2 bytes to read
|
|
if StrPos >= StrLength then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((Result - SurrogateHighStart) shl HalfShift) + (ChNext - SurrogateLowStart) + HalfBase;
|
|
Inc(StrPos, 2);
|
|
end;
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
else
|
|
// 1 byte to read
|
|
Inc(StrPos);
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
// StrPos > StrLength
|
|
Result := 0;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
end;
|
|
end;
|
|
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
|
|
// StrPos will be incremented by the number of chars that were read
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16GetNextChar(const S: UnicodeString; var StrPos: SizeInt): UCS4;
|
|
var
|
|
StrLength: SizeInt;
|
|
ChNext: UCS4;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if (StrPos <= StrLength) and (StrPos > 0) then
|
|
begin
|
|
Result := UCS4(S[StrPos]);
|
|
|
|
case Result of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
begin
|
|
// 2 bytes to read
|
|
if StrPos >= StrLength then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
|
|
begin
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((Result - SurrogateHighStart) shl HalfShift) + (ChNext - SurrogateLowStart) + HalfBase;
|
|
Inc(StrPos, 2);
|
|
end;
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
FlagInvalidSequence(StrPos, 1, Result);
|
|
else
|
|
// 1 byte to read
|
|
Inc(StrPos);
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
// StrPos > StrLength
|
|
Result := 0;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
end;
|
|
end;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
|
|
function UTF16GetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
|
|
var
|
|
W: Word;
|
|
begin
|
|
Result := StreamReadWord(S, W);
|
|
if not Result then
|
|
Exit;
|
|
Ch := UCS4(W);
|
|
|
|
case W of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
begin
|
|
// 2 bytes to read
|
|
Result := StreamReadWord(S, W);
|
|
if not Result then
|
|
Exit;
|
|
if (W < SurrogateLowStart) or (W > SurrogateLowEnd) then
|
|
begin
|
|
FlagInvalidSequence(Ch);
|
|
Exit;
|
|
end;
|
|
Ch := ((Ch - SurrogateHighStart) shl HalfShift) + (W - SurrogateLowStart) + HalfBase;
|
|
end;
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
FlagInvalidSequence(Ch);
|
|
else
|
|
// 1 byte to read
|
|
// nothing to do
|
|
end;
|
|
end;
|
|
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
|
|
// StrPos will be decremented by the number of chars that were read
|
|
function UTF16GetPreviousChar(const S: TUTF16String; var StrPos: SizeInt): UCS4;
|
|
var
|
|
StrLength: SizeInt;
|
|
ChPrev: UCS4;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if (StrPos <= (StrLength + 1)) and (StrPos > 1) then
|
|
begin
|
|
Result := UCS4(S[StrPos - 1]);
|
|
|
|
case Result of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
FlagInvalidSequence(StrPos, -1, Result);
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
begin
|
|
// 2 bytes to read
|
|
if StrPos <= 2 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, -1, Result);
|
|
Exit;
|
|
end;
|
|
ChPrev := UCS4(S[StrPos - 2]);
|
|
if (ChPrev < SurrogateHighStart) or (ChPrev > SurrogateHighEnd) then
|
|
begin
|
|
FlagInvalidSequence(StrPos, -1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((ChPrev - SurrogateHighStart) shl HalfShift) + (Result - SurrogateLowStart) + HalfBase;
|
|
Dec(StrPos, 2);
|
|
end;
|
|
else
|
|
// 1 byte to read
|
|
Dec(StrPos);
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
// StrPos > StrLength
|
|
Result := 0;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
end;
|
|
end;
|
|
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence)
|
|
// StrPos will be decremented by the number of chars that were read
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16GetPreviousChar(const S: UnicodeString; var StrPos: SizeInt): UCS4;
|
|
var
|
|
StrLength: SizeInt;
|
|
ChPrev: UCS4;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if (StrPos <= (StrLength + 1)) and (StrPos > 1) then
|
|
begin
|
|
Result := UCS4(S[StrPos - 1]);
|
|
|
|
case Result of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
FlagInvalidSequence(StrPos, -1, Result);
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
begin
|
|
// 2 bytes to read
|
|
if StrPos <= 2 then
|
|
begin
|
|
FlagInvalidSequence(StrPos, -1, Result);
|
|
Exit;
|
|
end;
|
|
ChPrev := UCS4(S[StrPos - 2]);
|
|
if (ChPrev < SurrogateHighStart) or (ChPrev > SurrogateHighEnd) then
|
|
begin
|
|
FlagInvalidSequence(StrPos, -1, Result);
|
|
Exit;
|
|
end;
|
|
Result := ((ChPrev - SurrogateHighStart) shl HalfShift) + (Result - SurrogateLowStart) + HalfBase;
|
|
Dec(StrPos, 2);
|
|
end;
|
|
else
|
|
// 1 byte to read
|
|
Dec(StrPos);
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
// StrPos > StrLength
|
|
Result := 0;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
end;
|
|
end;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
|
|
// returns False if String is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence)
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbSeq contains the number of UTF16 sequences that were skipped
|
|
function UTF16SkipChars(const S: TUTF16String; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
|
|
var
|
|
StrLength, Index: SizeInt;
|
|
Ch, ChNext: UCS4;
|
|
begin
|
|
Result := True;
|
|
StrLength := Length(S);
|
|
|
|
Index := 0;
|
|
if NbSeq >= 0 then
|
|
while (Index < NbSeq) and (StrPos > 0) do
|
|
begin
|
|
Ch := UCS4(S[StrPos]);
|
|
|
|
case Ch of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
// 2 bytes to skip
|
|
if StrPos >= StrLength then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
begin
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
Inc(StrPos, 2);
|
|
end;
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
// error
|
|
FlagInvalidSequence(StrPos, 1);
|
|
else
|
|
// 1 byte to skip
|
|
Inc(StrPos);
|
|
end;
|
|
|
|
if StrPos <> -1 then
|
|
Inc(Index);
|
|
|
|
if (StrPos > StrLength) and (Index < NbSeq) then
|
|
begin
|
|
Result := False;
|
|
Break;
|
|
end;
|
|
end
|
|
else
|
|
while (Index > NbSeq) and (StrPos > 1) do
|
|
begin
|
|
Ch := UCS4(S[StrPos - 1]);
|
|
|
|
case Ch of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
// error
|
|
FlagInvalidSequence(StrPos, -1);
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
// 2 bytes to skip
|
|
if StrPos <= 2 then
|
|
FlagInvalidSequence(StrPos, -1)
|
|
else
|
|
begin
|
|
ChNext := UCS4(S[StrPos - 2]);
|
|
if (ChNext < SurrogateHighStart) or (ChNext > SurrogateHighEnd) then
|
|
FlagInvalidSequence(StrPos, -1)
|
|
else
|
|
Dec(StrPos, 2);
|
|
end;
|
|
else
|
|
// 1 byte to skip
|
|
Dec(StrPos);
|
|
end;
|
|
|
|
if StrPos <> -1 then
|
|
Dec(Index);
|
|
|
|
if (StrPos = 1) and (Index > NbSeq) then
|
|
begin
|
|
Result := False;
|
|
Break;
|
|
end;
|
|
end;
|
|
NbSeq := Index;
|
|
end;
|
|
|
|
// returns False if String is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence)
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbSeq contains the number of UTF16 sequences that were skipped
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16SkipChars(const S: UnicodeString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
|
|
var
|
|
StrLength, Index: SizeInt;
|
|
Ch, ChNext: UCS4;
|
|
begin
|
|
Result := True;
|
|
StrLength := Length(S);
|
|
|
|
Index := 0;
|
|
if NbSeq >= 0 then
|
|
while (Index < NbSeq) and (StrPos > 0) do
|
|
begin
|
|
Ch := UCS4(S[StrPos]);
|
|
|
|
case Ch of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
// 2 bytes to skip
|
|
if StrPos >= StrLength then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
begin
|
|
ChNext := UCS4(S[StrPos + 1]);
|
|
if (ChNext < SurrogateLowStart) or (ChNext > SurrogateLowEnd) then
|
|
FlagInvalidSequence(StrPos, 1)
|
|
else
|
|
Inc(StrPos, 2);
|
|
end;
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
// error
|
|
FlagInvalidSequence(StrPos, 1);
|
|
else
|
|
// 1 byte to skip
|
|
Inc(StrPos);
|
|
end;
|
|
|
|
if StrPos <> -1 then
|
|
Inc(Index);
|
|
|
|
if (StrPos > StrLength) and (Index < NbSeq) then
|
|
begin
|
|
Result := False;
|
|
Break;
|
|
end;
|
|
end
|
|
else
|
|
while (Index > NbSeq) and (StrPos > 1) do
|
|
begin
|
|
Ch := UCS4(S[StrPos - 1]);
|
|
|
|
case Ch of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
// error
|
|
FlagInvalidSequence(StrPos, -1);
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
// 2 bytes to skip
|
|
if StrPos <= 2 then
|
|
FlagInvalidSequence(StrPos, -1)
|
|
else
|
|
begin
|
|
ChNext := UCS4(S[StrPos - 2]);
|
|
if (ChNext < SurrogateHighStart) or (ChNext > SurrogateHighEnd) then
|
|
FlagInvalidSequence(StrPos, -1)
|
|
else
|
|
Dec(StrPos, 2);
|
|
end;
|
|
else
|
|
// 1 byte to skip
|
|
Dec(StrPos);
|
|
end;
|
|
|
|
if StrPos <> -1 then
|
|
Dec(Index);
|
|
|
|
if (StrPos = 1) and (Index > NbSeq) then
|
|
begin
|
|
Result := False;
|
|
Break;
|
|
end;
|
|
end;
|
|
NbSeq := Index;
|
|
end;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
|
|
function UTF16SkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
|
|
var
|
|
Index: SizeInt;
|
|
W: Word;
|
|
begin
|
|
Index := 0;
|
|
while Index < NbSeq do
|
|
begin
|
|
Result := StreamReadWord(S, W);
|
|
if not Result then
|
|
Break;
|
|
case W of
|
|
SurrogateHighStart..SurrogateHighEnd:
|
|
// 2 bytes to skip
|
|
begin
|
|
Result := StreamReadWord(S, W);
|
|
if not Result then
|
|
Break;
|
|
if (W < SurrogateLowStart) or (W > SurrogateLowEnd) then
|
|
FlagInvalidSequence;
|
|
end;
|
|
SurrogateLowStart..SurrogateLowEnd:
|
|
// error
|
|
FlagInvalidSequence;
|
|
else
|
|
// 1 byte to skip
|
|
// nothing to do
|
|
end;
|
|
Inc(Index);
|
|
end;
|
|
Result := Index = NbSeq;
|
|
NbSeq := Index;
|
|
end;
|
|
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to an UTF-8 string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written
|
|
function UTF16SetNextChar(var S: TUTF16String; var StrPos: SizeInt; Ch: UCS4): Boolean;
|
|
var
|
|
StrLength: SizeInt;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if Ch <= MaximumUCS2 then
|
|
begin
|
|
// 16 bits to store in place
|
|
Result := (StrPos > 0) and (StrPos <= StrLength);
|
|
if Result then
|
|
begin
|
|
S[StrPos] := WideChar(Ch);
|
|
Inc(StrPos);
|
|
end;
|
|
end
|
|
else
|
|
if Ch <= MaximumUTF16 then
|
|
begin
|
|
// stores a surrogate pair
|
|
Result := (StrPos > 0) and (StrPos < StrLength);
|
|
if Result then
|
|
begin
|
|
Ch := Ch - HalfBase;
|
|
S[StrPos] := WideChar((Ch shr HalfShift) or SurrogateHighStart);
|
|
S[StrPos + 1] := WideChar((Ch and HalfMask) or SurrogateLowStart);
|
|
Inc(StrPos, 2);
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
Result := (StrPos > 0) and (StrPos <= StrLength);
|
|
if Result then
|
|
begin
|
|
S[StrPos] := WideChar(UCS4ReplacementCharacter);
|
|
Inc(StrPos, 1);
|
|
end;
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
StrPos := -1;
|
|
Result := False;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
|
|
{$IFDEF SUPPORTS_UNICODE_STRING}
|
|
function UTF16SetNextChar(var S: UnicodeString; var StrPos: SizeInt; Ch: UCS4): Boolean;
|
|
var
|
|
StrLength: SizeInt;
|
|
begin
|
|
StrLength := Length(S);
|
|
|
|
if Ch <= MaximumUCS2 then
|
|
begin
|
|
// 16 bits to store in place
|
|
Result := (StrPos > 0) and (StrPos <= StrLength);
|
|
if Result then
|
|
begin
|
|
S[StrPos] := WideChar(Ch);
|
|
Inc(StrPos);
|
|
end;
|
|
end
|
|
else
|
|
if Ch <= MaximumUTF16 then
|
|
begin
|
|
// stores a surrogate pair
|
|
Result := (StrPos > 0) and (StrPos < StrLength);
|
|
if Result then
|
|
begin
|
|
Ch := Ch - HalfBase;
|
|
S[StrPos] := WideChar((Ch shr HalfShift) + SurrogateHighStart);
|
|
S[StrPos + 1] := WideChar((Ch and HalfMask) + SurrogateLowStart);
|
|
Inc(StrPos, 2);
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
Result := (StrPos > 0) and (StrPos <= StrLength);
|
|
if Result then
|
|
begin
|
|
S[StrPos] := WideChar(UCS4ReplacementCharacter);
|
|
Inc(StrPos, 1);
|
|
end;
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
StrPos := -1;
|
|
Result := False;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
{$ENDIF SUPPORTS_UNICODE_STRING}
|
|
|
|
function UTF16SetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
|
|
begin
|
|
if Ch <= MaximumUCS2 then
|
|
// 16 bits to store in place
|
|
Result := StreamWriteWord(S, Ch)
|
|
else
|
|
if Ch <= MaximumUTF16 then
|
|
// stores a surrogate pair
|
|
Result := StreamWriteWord(S, (Ch shr HalfShift) or SurrogateHighStart) and
|
|
StreamWriteWord(S, (Ch and HalfMask) or SurrogateLowStart)
|
|
else
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
Result := StreamWriteWord(S, UCS4ReplacementCharacter);
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
Result := False;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
|
|
// AnsiGetNextChar = read next character at StrPos
|
|
// StrPos will be incremented by the number of chars that were read (1)
|
|
function AnsiGetNextChar(const S: AnsiString; var StrPos: SizeInt): UCS4;
|
|
var
|
|
StrLen, TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
begin
|
|
StrLen := Length(S);
|
|
|
|
if (StrPos <= StrLen) and (StrPos > 0) then
|
|
begin
|
|
UTF16Buffer := WideString(S[StrPos]);
|
|
TmpPos := 1;
|
|
Result := UTF16GetNextChar(UTF16Buffer, TmpPos);
|
|
if TmpPos = -1 then
|
|
StrPos := -1
|
|
else
|
|
Inc(StrPos);
|
|
end
|
|
else
|
|
begin
|
|
// StrPos > StrLength
|
|
Result := 0;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
end;
|
|
end;
|
|
|
|
function AnsiGetNextCharFromStream(S: TStream; out Ch: UCS4): Boolean;
|
|
var
|
|
B: Byte;
|
|
TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Exit;
|
|
UTF16Buffer := WideString(AnsiString(Chr(B)));
|
|
TmpPos := 1;
|
|
Ch := UTF16GetNextChar(UTF16Buffer, TmpPos);
|
|
Result := TmpPos <> -1;
|
|
end;
|
|
|
|
// AnsiGetNextChar = read next character at StrPos
|
|
// StrPos will be incremented by the number of chars that were read (1)
|
|
function AnsiGetNextChar(const S: AnsiString; CodePage: Word; var StrPos: SizeInt): UCS4;
|
|
var
|
|
StrLen, TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
begin
|
|
StrLen := Length(S);
|
|
|
|
if (StrPos <= StrLen) and (StrPos > 0) then
|
|
begin
|
|
SetLength(UTF16Buffer, 2);
|
|
if MultiByteToWideChar(CodePage, MB_PRECOMPOSED or MB_ERR_INVALID_CHARS, @S[StrPos], 1, PWideChar(UTF16Buffer), 2) = 0 then
|
|
begin
|
|
Result := UCS4ReplacementCharacter;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
Exit;
|
|
end;
|
|
TmpPos := 1;
|
|
Result := UTF16GetNextChar(UTF16Buffer, TmpPos);
|
|
if TmpPos = -1 then
|
|
StrPos := -1
|
|
else
|
|
Inc(StrPos);
|
|
end
|
|
else
|
|
begin
|
|
// StrPos > StrLength
|
|
Result := 0;
|
|
FlagInvalidSequence(StrPos, 0, Result);
|
|
end;
|
|
end;
|
|
|
|
function AnsiGetNextCharFromStream(S: TStream; CodePage: Word; out Ch: UCS4): Boolean;
|
|
var
|
|
B: Byte;
|
|
TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Exit;
|
|
SetLength(UTF16Buffer, 2);
|
|
if MultiByteToWideChar(CodePage, MB_PRECOMPOSED or MB_ERR_INVALID_CHARS, @B, 1, PWideChar(UTF16Buffer), 2) = 0 then
|
|
begin
|
|
Result := False;
|
|
Ch := UCS4ReplacementCharacter;
|
|
Exit;
|
|
end;
|
|
TmpPos := 1;
|
|
Ch := UTF16GetNextChar(UTF16Buffer, TmpPos);
|
|
Result := TmpPos <> -1;
|
|
end;
|
|
|
|
// AnsiSkipChars = skip NbSeq characters starting from StrPos
|
|
// returns False if String is too small
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbChar contains the number of UTF16 sequences that were skipped
|
|
function AnsiSkipChars(const S: AnsiString; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
|
|
var
|
|
StrLen: SizeInt;
|
|
begin
|
|
StrLen := Length(S);
|
|
|
|
if StrPos > 0 then
|
|
begin
|
|
if StrPos + NbSeq > StrLen then
|
|
begin
|
|
NbSeq := StrLen + 1 - StrPos;
|
|
StrPos := StrLen + 1;
|
|
Result := False;
|
|
end
|
|
else
|
|
begin
|
|
// NbSeq := NbSeq;
|
|
StrPos := StrLen + NbSeq;
|
|
Result := True;
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
// previous error
|
|
NbSeq := 0;
|
|
// StrPos := -1;
|
|
Result := False;
|
|
end;
|
|
end;
|
|
|
|
function AnsiSkipCharsFromStream(S: TStream; var NbSeq: SizeInt): Boolean;
|
|
var
|
|
Index: SizeInt;
|
|
B: Byte;
|
|
begin
|
|
Index := 0;
|
|
while Index < NbSeq do
|
|
begin
|
|
Result := StreamReadByte(S, B);
|
|
if not Result then
|
|
Break;
|
|
Inc(Index);
|
|
end;
|
|
Result := Index = NbSeq;
|
|
NbSeq := Index;
|
|
end;
|
|
|
|
// AnsiSetNextChar = append a character at StrPos
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to an ansi string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written (1)
|
|
function AnsiSetNextChar(var S: AnsiString; var StrPos: SizeInt; Ch: UCS4): Boolean;
|
|
var
|
|
StrLen, TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
AnsiBuffer: AnsiString;
|
|
begin
|
|
StrLen := Length(S);
|
|
Result := (StrPos > 0) and (StrPos <= StrLen);
|
|
if Result then
|
|
begin
|
|
SetLength(UTF16Buffer, 2);
|
|
TmpPos := 1;
|
|
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
|
|
if Result and (TmpPos = 2) then
|
|
begin
|
|
// one wide character
|
|
AnsiBuffer := AnsiString(WideString(UTF16Buffer[1]));
|
|
S[StrPos] := AnsiBuffer[1];
|
|
Inc(StrPos);
|
|
end
|
|
else
|
|
if Result and (TmpPos = 3) then
|
|
begin
|
|
// one surrogate pair
|
|
AnsiBuffer := AnsiString(UTF16Buffer);
|
|
S[StrPos] := AnsiBuffer[1];
|
|
Inc(StrPos);
|
|
end
|
|
else
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
S[StrPos] := AnsiReplacementCharacter;
|
|
Inc(StrPos);
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
Result := False;
|
|
StrPos := -1;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
function AnsiSetNextCharToStream(S: TStream; Ch: UCS4): Boolean;
|
|
var
|
|
TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
AnsiBuffer: AnsiString;
|
|
begin
|
|
SetLength(UTF16Buffer, 2);
|
|
TmpPos := 1;
|
|
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
|
|
|
|
if Result and (TmpPos = 2) then
|
|
begin
|
|
// one wide character
|
|
AnsiBuffer := AnsiString(WideString(UTF16Buffer[1]));
|
|
Result := StreamWriteByte(S, Ord(AnsiBuffer[1]));
|
|
end
|
|
else
|
|
if Result and (TmpPos = 3) then
|
|
begin
|
|
// one surrogate pair
|
|
AnsiBuffer := AnsiString(UTF16Buffer);
|
|
Result := StreamWriteByte(S, Ord(AnsiBuffer[1]));
|
|
end
|
|
else
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
Result := StreamWriteByte(S, Ord(AnsiReplacementCharacter));
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
Result := False;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
|
|
function AnsiSetNextChar(var S: AnsiString; CodePage: Word; var StrPos: SizeInt; Ch: UCS4): Boolean;
|
|
var
|
|
StrLen, TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
AnsiCharacter: AnsiChar;
|
|
begin
|
|
StrLen := Length(S);
|
|
Result := (StrPos > 0) and (StrPos <= StrLen);
|
|
if Result then
|
|
begin
|
|
SetLength(UTF16Buffer, 2);
|
|
TmpPos := 1;
|
|
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
|
|
Result := Result and (WideCharToMultiByte(CodePage, 0, PWideChar(UTF16Buffer), TmpPos-1, @AnsiCharacter, 1, nil, nil) > 0);
|
|
if Result then
|
|
begin
|
|
S[StrPos] := AnsiCharacter;
|
|
Inc(StrPos);
|
|
end;
|
|
if not Result then
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
S[StrPos] := AnsiReplacementCharacter;
|
|
Inc(StrPos);
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
Result := False;
|
|
StrPos := -1;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
function AnsiSetNextCharToStream(S: TStream; CodePage: Word; Ch: UCS4): Boolean;
|
|
var
|
|
TmpPos: SizeInt;
|
|
UTF16Buffer: TUTF16String;
|
|
AnsiCharacter: AnsiChar;
|
|
begin
|
|
SetLength(UTF16Buffer, 2);
|
|
TmpPos := 1;
|
|
Result := UTF16SetNextChar(UTF16Buffer, TmpPos, Ch);
|
|
Result := Result and (WideCharToMultiByte(CodePage, 0, PWideChar(UTF16Buffer), TmpPos-1, @AnsiCharacter, 1, nil, nil) > 0);
|
|
if Result then
|
|
Result := StreamWriteByte(S, Ord(AnsiCharacter));
|
|
if not Result then
|
|
begin
|
|
{$IFDEF UNICODE_SILENT_FAILURE}
|
|
// add ReplacementCharacter
|
|
Result := StreamWriteByte(S, Ord(AnsiReplacementCharacter));
|
|
{$ELSE ~UNICODE_SILENT_FAILURE}
|
|
Result := False;
|
|
{$ENDIF ~UNICODE_SILENT_FAILURE}
|
|
end;
|
|
end;
|
|
|
|
// StringGetNextChar = read next character/sequence at StrPos
|
|
// if UNICODE_SILENT_FAILURE is defined, invalid sequences will be replaced by ReplacementCharacter
|
|
// otherwise StrPos is set to -1 on return to flag an error (invalid UTF16 sequence for WideString)
|
|
// StrPos will be incremented by the number of chars that were read
|
|
function StringGetNextChar(const S: string; var StrPos: SizeInt): UCS4;
|
|
begin
|
|
{$IFDEF SUPPORTS_UNICODE}
|
|
Result := UTF16GetNextChar(S, StrPos);
|
|
{$ELSE ~SUPPORTS_UNICODE}
|
|
Result := AnsiGetNextChar(S, StrPos);
|
|
{$ENDIF ~SUPPORTS_UNICODE}
|
|
end;
|
|
|
|
// StringSkipChars = skip NbSeq characters/sequences starting from StrPos
|
|
// returns False if String is too small
|
|
// if UNICODE_SILENT_FAILURE is not defined StrPos is set to -1 on error (invalid UTF16 sequence for WideString)
|
|
// StrPos will be incremented by the number of chars that were skipped
|
|
// On return, NbChar contains the number of UTF16 sequences that were skipped
|
|
function StringSkipChars(const S: string; var StrPos: SizeInt; var NbSeq: SizeInt): Boolean;
|
|
begin
|
|
{$IFDEF SUPPORTS_UNICODE}
|
|
Result := UTF16SkipChars(S, StrPos, NbSeq);
|
|
{$ELSE ~SUPPORTS_UNICODE}
|
|
Result := AnsiSkipChars(S, StrPos, NbSeq);
|
|
{$ENDIF ~SUPPORTS_UNICODE}
|
|
end;
|
|
|
|
// StringSetNextChar = append a character/sequence at StrPos
|
|
// returns False on error:
|
|
// - if an UCS4 character cannot be stored to a string:
|
|
// - if UNICODE_SILENT_FAILURE is defined, ReplacementCharacter is added
|
|
// - if UNICODE_SILENT_FAILURE is not defined, StrPos is set to -1
|
|
// - StrPos > -1 flags string being too small, callee did nothing and caller is responsible for allocating space
|
|
// StrPos will be incremented by the number of chars that were written
|
|
function StringSetNextChar(var S: string; var StrPos: SizeInt; Ch: UCS4): Boolean;
|
|
begin
|
|
{$IFDEF SUPPORTS_UNICODE}
|
|
Result := UTF16SetNextChar(S, StrPos, Ch);
|
|
{$ELSE ~SUPPORTS_UNICODE}
|
|
Result := AnsiSetNextChar(S, StrPos, Ch);
|
|
{$ENDIF ~SUPPORTS_UNICODE}
|
|
end;
|
|
|
|
function WideStringToUTF8(const S: WideString): TUTF8String;
|
|
begin
|
|
Result := UTF16ToUTF8(S);
|
|
end;
|
|
|
|
function UTF8ToWideString(const S: TUTF8String): WideString;
|
|
begin
|
|
Result := UTF8ToUTF16(S);
|
|
end;
|
|
|
|
function WideStringToUCS4(const S: WideString): TUCS4Array;
|
|
begin
|
|
Result := UTF16ToUCS4(S);
|
|
end;
|
|
|
|
function UCS4ToWideString(const S: TUCS4Array): WideString;
|
|
begin
|
|
Result := UCS4ToUTF16(S);
|
|
end;
|
|
|
|
function AnsiStringToUTF8(const S: AnsiString): TUTF8String;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := TUTF16String(S);
|
|
Result := UTF16ToUTF8(WS);
|
|
end;
|
|
|
|
function UTF8ToAnsiString(const S: TUTF8String): AnsiString;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := UTF8ToUTF16(S);
|
|
Result := AnsiString(WS);
|
|
end;
|
|
|
|
function AnsiStringToUTF16(const S: AnsiString): TUTF16String;
|
|
begin
|
|
Result := TUTF16String(S);
|
|
end;
|
|
|
|
function UTF16ToAnsiString(const S: TUTF16String): AnsiString;
|
|
begin
|
|
Result := AnsiString(S);
|
|
end;
|
|
|
|
function AnsiStringToUCS4(const S: AnsiString): TUCS4Array;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := TUTF16String(S);
|
|
Result := UTF16ToUCS4(WS);
|
|
end;
|
|
|
|
function UCS4ToAnsiString(const S: TUCS4Array): AnsiString;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := UCS4ToUTF16(S);
|
|
Result := AnsiString(WS);
|
|
end;
|
|
|
|
function StringToUTF8(const S: string): TUTF8String;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := TUTF16String(S);
|
|
Result := UTF16ToUTF8(WS);
|
|
end;
|
|
|
|
function TryStringToUTF8(const S: string; out D: TUTF8String): Boolean;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := TUTF16String(S);
|
|
Result := TryUTF16ToUTF8(WS, D);
|
|
end;
|
|
|
|
function UTF8ToString(const S: TUTF8String): string;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := UTF8ToUTF16(S);
|
|
Result := string(WS);
|
|
end;
|
|
|
|
function TryUTF8ToString(const S: TUTF8String; out D: string): Boolean;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
Result := TryUTF8ToUTF16(S, WS);
|
|
D := string(WS);
|
|
end;
|
|
|
|
function StringToUTF16(const S: string): TUTF16String;
|
|
begin
|
|
Result := TUTF16String(S);
|
|
end;
|
|
|
|
function TryStringToUTF16(const S: string; out D: TUTF16String): Boolean;
|
|
begin
|
|
D := TUTF16String(S);
|
|
Result := True;
|
|
end;
|
|
|
|
function UTF16ToString(const S: TUTF16String): string;
|
|
begin
|
|
Result := string(S);
|
|
end;
|
|
|
|
function TryUTF16ToString(const S: TUTF16String; out D: string): Boolean;
|
|
begin
|
|
D := string(S);
|
|
Result := True;
|
|
end;
|
|
|
|
function StringToUCS4(const S: string): TUCS4Array;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := TUTF16String(S);
|
|
Result := UTF16ToUCS4(WS);
|
|
end;
|
|
|
|
function TryStringToUCS4(const S: string; out D: TUCS4Array): Boolean;
|
|
var
|
|
WS: TUTF16String;
|
|
begin
|
|
WS := TUTF16String(S);
|
|
Result := TryUTF16ToUCS4(WS, D);
|
|
end;
|
|
|
|
function UCS4ToString(const S: TUCS4Array): string;
|
|
var
|
|
WS: WideString;
|
|
begin
|
|
WS := UCS4ToUTF16(S);
|
|
Result := string(WS);
|
|
end;
|
|
|
|
function TryUCS4ToString(const S: TUCS4Array; out D: string): Boolean;
|
|
var
|
|
WS: WideString;
|
|
begin
|
|
Result := TryUCS4ToUTF16(S, WS);
|
|
D := string(WS);
|
|
end;
|
|
|
|
function UTF8ToUTF16(const S: TUTF8String): TUTF16String;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
if S = '' then
|
|
Result := ''
|
|
else
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(Result, SrcLength); // create enough room
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 1;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF8GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
|
|
UTF16SetNextChar(Result, DestIndex, Ch);
|
|
end;
|
|
SetLength(Result, DestIndex - 1); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function TryUTF8ToUTF16(const S: TUTF8String; out D: TUTF16String): Boolean;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
Result := True;
|
|
if S = '' then
|
|
D := ''
|
|
else
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(D, SrcLength); // create enough room
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 1;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF8GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
begin
|
|
Result := False;
|
|
D := '';
|
|
Exit;
|
|
end;
|
|
|
|
UTF16SetNextChar(D, DestIndex, Ch);
|
|
end;
|
|
SetLength(D, DestIndex - 1); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function UTF16ToUTF8(const S: TUTF16String): TUTF8String;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
if S = '' then
|
|
Result := ''
|
|
else
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(Result, SrcLength * 3); // worste case
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 1;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF16GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
|
|
UTF8SetNextChar(Result, DestIndex, Ch);
|
|
end;
|
|
SetLength(Result, DestIndex - 1); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function TryUTF16ToUTF8(const S: TUTF16String; out D: TUTF8String): Boolean;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
Result := True;
|
|
if S = '' then
|
|
D := ''
|
|
else
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(D, SrcLength * 3); // worste case
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 1;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF16GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
begin
|
|
Result := False;
|
|
D := '';
|
|
Exit;
|
|
end;
|
|
|
|
UTF8SetNextChar(D, DestIndex, Ch);
|
|
end;
|
|
SetLength(D, DestIndex - 1); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function UTF8ToUCS4(const S: TUTF8String): TUCS4Array;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
if S <> '' then
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(Result, SrcLength); // create enough room
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 0;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF8GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
|
|
Result[DestIndex] := Ch;
|
|
Inc(DestIndex);
|
|
end;
|
|
SetLength(Result, DestIndex); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function TryUTF8ToUCS4(const S: TUTF8String; out D: TUCS4Array): Boolean;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
Result := True;
|
|
if S <> '' then
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(D, SrcLength); // create enough room
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 0;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF8GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
begin
|
|
Result := False;
|
|
SetLength(D, 0);
|
|
Exit;
|
|
end;
|
|
|
|
D[DestIndex] := Ch;
|
|
Inc(DestIndex);
|
|
end;
|
|
SetLength(D, DestIndex); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function UCS4ToUTF8(const S: TUCS4Array): TUTF8String;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
begin
|
|
SrcLength := Length(S);
|
|
if Length(S) = 0 then
|
|
Result := ''
|
|
else
|
|
begin
|
|
SetLength(Result, SrcLength * 3); // assume worst case
|
|
DestIndex := 1;
|
|
|
|
for SrcIndex := 0 to SrcLength - 1 do
|
|
begin
|
|
UTF8SetNextChar(Result, DestIndex, S[SrcIndex]);
|
|
if DestIndex = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
end;
|
|
|
|
SetLength(Result, DestIndex - 1); // set to actual length
|
|
end;
|
|
end;
|
|
|
|
function TryUCS4ToUTF8(const S: TUCS4Array; out D: TUTF8String): Boolean;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
begin
|
|
SrcLength := Length(S);
|
|
Result := True;
|
|
if Length(S) = 0 then
|
|
D := ''
|
|
else
|
|
begin
|
|
SetLength(D, SrcLength * 3); // assume worst case
|
|
DestIndex := 1;
|
|
|
|
for SrcIndex := 0 to SrcLength - 1 do
|
|
begin
|
|
UTF8SetNextChar(D, DestIndex, S[SrcIndex]);
|
|
if DestIndex = -1 then
|
|
begin
|
|
Result := False;
|
|
D := '';
|
|
Exit;
|
|
end;
|
|
end;
|
|
|
|
SetLength(D, DestIndex - 1); // set to actual length
|
|
end;
|
|
end;
|
|
|
|
function UTF16ToUCS4(const S: TUTF16String): TUCS4Array;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
if S <> '' then
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(Result, SrcLength); // create enough room
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 0;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF16GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
|
|
Result[DestIndex] := Ch;
|
|
Inc(DestIndex);
|
|
end;
|
|
SetLength(Result, DestIndex); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function TryUTF16ToUCS4(const S: TUTF16String; out D: TUCS4Array): Boolean;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
Ch: UCS4;
|
|
begin
|
|
Result := True;
|
|
if S <> '' then
|
|
begin
|
|
SrcLength := Length(S);
|
|
SetLength(D, SrcLength); // create enough room
|
|
|
|
SrcIndex := 1;
|
|
DestIndex := 0;
|
|
while SrcIndex <= SrcLength do
|
|
begin
|
|
Ch := UTF16GetNextChar(S, SrcIndex);
|
|
if SrcIndex = -1 then
|
|
begin
|
|
Result := False;
|
|
SetLength(D, 0);
|
|
Exit;
|
|
end;
|
|
|
|
D[DestIndex] := Ch;
|
|
Inc(DestIndex);
|
|
end;
|
|
SetLength(D, DestIndex); // now fix up length
|
|
end;
|
|
end;
|
|
|
|
function UCS4ToUTF16(const S: TUCS4Array): TUTF16String;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
begin
|
|
SrcLength := Length(S);
|
|
if SrcLength = 0 then
|
|
Result := ''
|
|
else
|
|
begin
|
|
SetLength(Result, SrcLength * 3); // assume worst case
|
|
DestIndex := 1;
|
|
|
|
for SrcIndex := 0 to SrcLength - 1 do
|
|
begin
|
|
UTF16SetNextChar(Result, DestIndex, S[SrcIndex]);
|
|
if DestIndex = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
end;
|
|
|
|
SetLength(Result, DestIndex - 1); // set to actual length
|
|
end;
|
|
end;
|
|
|
|
function TryUCS4ToUTF16(const S: TUCS4Array; out D:TUTF16String): Boolean;
|
|
var
|
|
SrcIndex, SrcLength, DestIndex: SizeInt;
|
|
begin
|
|
SrcLength := Length(S);
|
|
Result := True;
|
|
if SrcLength = 0 then
|
|
D := ''
|
|
else
|
|
begin
|
|
SetLength(D, SrcLength * 3); // assume worst case
|
|
DestIndex := 1;
|
|
|
|
for SrcIndex := 0 to SrcLength - 1 do
|
|
begin
|
|
UTF16SetNextChar(D, DestIndex, S[SrcIndex]);
|
|
if DestIndex = -1 then
|
|
begin
|
|
Result := False;
|
|
D := '';
|
|
Exit;
|
|
end;
|
|
end;
|
|
|
|
SetLength(D, DestIndex - 1); // set to actual length
|
|
end;
|
|
end;
|
|
|
|
function UTF8CharCount(const S: TUTF8String): SizeInt;
|
|
var
|
|
StrPos: SizeInt;
|
|
begin
|
|
StrPos := 1;
|
|
Result := Length(S);
|
|
UTF8SkipChars(S, StrPos, Result);
|
|
if StrPos = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
end;
|
|
|
|
function UTF16CharCount(const S: TUTF16String): SizeInt;
|
|
var
|
|
StrPos: SizeInt;
|
|
begin
|
|
StrPos := 1;
|
|
Result := Length(S);
|
|
UTF16SkipChars(S, StrPos, Result);
|
|
if StrPos = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
end;
|
|
|
|
function UCS2CharCount(const S: TUCS2String): SizeInt;
|
|
begin
|
|
Result := Length(S);
|
|
end;
|
|
|
|
function UCS4CharCount(const S: TUCS4Array): SizeInt;
|
|
begin
|
|
Result := Length(S);
|
|
end;
|
|
|
|
function GetUCS4CharAt(const UTF8Str: TUTF8String; Index: SizeInt; out Value: UCS4): Boolean; overload;
|
|
var
|
|
StrPos: SizeInt;
|
|
begin
|
|
StrPos := 1;
|
|
Result := Index >= 0;
|
|
if Result then
|
|
Result := UTF8SkipChars(UTF8Str, StrPos, Index);
|
|
if StrPos = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
Result := Result and (StrPos <= Length(UTF8Str));
|
|
if Result then
|
|
begin
|
|
Value := UTF8GetNextChar(UTF8Str, StrPos);
|
|
if StrPos = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
end;
|
|
end;
|
|
|
|
function GetUCS4CharAt(const WideStr: WideString; Index: SizeInt; out Value: UCS4; IsUTF16: Boolean): Boolean; overload;
|
|
var
|
|
StrPos: SizeInt;
|
|
begin
|
|
if IsUTF16 then
|
|
begin
|
|
StrPos := 1;
|
|
Result := Index >= 0;
|
|
if Result then
|
|
Result := UTF16SkipChars(WideStr, StrPos, Index);
|
|
if StrPos = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
Result := Result and (StrPos <= Length(WideStr));
|
|
if Result then
|
|
begin
|
|
Value := UTF16GetNextChar(WideStr, StrPos);
|
|
if StrPos = -1 then
|
|
raise EJclUnexpectedEOSequenceError.Create;
|
|
end;
|
|
end
|
|
else
|
|
begin
|
|
Result := (Index >= 1) and (Index <= Length(WideStr));
|
|
Value := UCS4(WideStr[Index]);
|
|
end;
|
|
end;
|
|
|
|
function GetUCS4CharAt(const UCS4Str: TUCS4Array; Index: SizeInt; out Value: UCS4): Boolean; overload;
|
|
begin
|
|
Result := (Index >= 0) and (Index < Length(UCS4Str));
|
|
if Result then
|
|
Value := UCS4Str[Index];
|
|
end;
|
|
|
|
function UCS4ToAnsiChar(Value: UCS4): AnsiChar;
|
|
var
|
|
Buf: WideString;
|
|
StrPos: SizeInt;
|
|
begin
|
|
StrPos := 1;
|
|
Buf := #0#0;
|
|
if UTF16SetNextChar(Buf, StrPos, Value) then
|
|
Result := AnsiString(Buf)[1]
|
|
else
|
|
Result := AnsiReplacementCharacter;
|
|
end;
|
|
|
|
function UCS4ToWideChar(Value: UCS4): WideChar;
|
|
begin
|
|
if Value <= MaximumUCS2 then
|
|
Result := WideChar(Value)
|
|
else
|
|
Result := WideChar(UCS4ReplacementCharacter);
|
|
end;
|
|
|
|
function UCS4ToChar(Value: UCS4): Char;
|
|
begin
|
|
{$IFDEF SUPPORTS_UNICODE}
|
|
Result := UCS4ToWideChar(Value);
|
|
{$ELSE ~SUPPORTS_UNICODE}
|
|
Result := UCS4ToAnsiChar(Value);
|
|
{$ENDIF ~SUPPORTS_UNICODE}
|
|
end;
|
|
|
|
function AnsiCharToUCS4(Value: AnsiChar): UCS4;
|
|
var
|
|
Buf: WideString;
|
|
StrPos: SizeInt;
|
|
begin
|
|
StrPos := 1;
|
|
Buf := WideString(AnsiString(Value));
|
|
Result := UTF16GetNextChar(Buf, StrPos);
|
|
end;
|
|
|
|
function WideCharToUCS4(Value: WideChar): UCS4;
|
|
begin
|
|
Result := UCS4(Value);
|
|
end;
|
|
|
|
function CharToUCS4(Value: Char): UCS4;
|
|
begin
|
|
{$IFDEF SUPPORTS_UNICODE}
|
|
Result := WideCharToUCS4(Value);
|
|
{$ELSE ~SUPPORTS_UNICODE}
|
|
Result := AnsiCharToUCS4(Value);
|
|
{$ENDIF ~SUPPORTS_UNICODE}
|
|
end;
|
|
|
|
{$IFDEF UNITVERSIONING}
|
|
initialization
|
|
RegisterUnitVersion(HInstance, UnitVersioning);
|
|
|
|
finalization
|
|
UnregisterUnitVersion(HInstance);
|
|
{$ENDIF UNITVERSIONING}
|
|
|
|
end.
|