8609 lines
254 KiB
ObjectPascal
8609 lines
254 KiB
ObjectPascal
|
|
{**************************************************************************************************}
|
|||
|
|
{ }
|
|||
|
|
{ Project JEDI Code Library (JCL) }
|
|||
|
|
{ }
|
|||
|
|
{ The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
|
|||
|
|
{ you may not use this file except in compliance with the License. You may obtain a copy of the }
|
|||
|
|
{ License at http://www.mozilla.org/MPL/ }
|
|||
|
|
{ }
|
|||
|
|
{ Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
|
|||
|
|
{ ANY KIND, either express or implied. See the License for the specific language governing rights }
|
|||
|
|
{ and limitations under the License. }
|
|||
|
|
{ }
|
|||
|
|
{ The Original Code is JclUnicode.pas. }
|
|||
|
|
{ }
|
|||
|
|
{ The Initial Developer of the Original Code is Mike Lischke (public att lischke-online dott de). }
|
|||
|
|
{ Portions created by Mike Lischke are Copyright (C) 1999-2000 Mike Lischke. All Rights Reserved. }
|
|||
|
|
{ }
|
|||
|
|
{ Contributor(s): }
|
|||
|
|
{ Marcel van Brakel }
|
|||
|
|
{ Andreas Hausladen (ahuser) }
|
|||
|
|
{ Mike Lischke }
|
|||
|
|
{ Flier Lu (flier) }
|
|||
|
|
{ Robert Marquardt (marquardt) }
|
|||
|
|
{ Robert Rossmair (rrossmair) }
|
|||
|
|
{ Olivier Sannier (obones) }
|
|||
|
|
{ Matthias Thoma (mthoma) }
|
|||
|
|
{ Petr Vones (pvones) }
|
|||
|
|
{ Peter Schraut (http://www.console-dev.de) }
|
|||
|
|
{ }
|
|||
|
|
{**************************************************************************************************}
|
|||
|
|
{ }
|
|||
|
|
{ Various Unicode related routines }
|
|||
|
|
{ }
|
|||
|
|
{**************************************************************************************************}
|
|||
|
|
|
|||
|
|
// Last modified: $Date: 2005/10/26 09:15:13 $
|
|||
|
|
// For history see end of file
|
|||
|
|
|
|||
|
|
unit JclUnicode;
|
|||
|
|
|
|||
|
|
{$I jcl.inc}
|
|||
|
|
|
|||
|
|
// Copyright (c) 1999-2000 Mike Lischke (public att lischke-online dott de)
|
|||
|
|
//
|
|||
|
|
|
|||
|
|
// 10-JUL-2005: (changes by Peter Schraut)
|
|||
|
|
// - added CodeBlockName, returns the blockname as string
|
|||
|
|
// - added CodeBlockRange, returns the range of the specified codeblock
|
|||
|
|
// - updated TUnicodeBlock to reflect changes in unicode 4.1
|
|||
|
|
// - updated CodeBlockFromChar to reflect changes in unicode 4.1
|
|||
|
|
// - Notes:
|
|||
|
|
// Here are a few suggestions to reflect latest namechanges in unicode 4.1,
|
|||
|
|
// but they were not done due to compatibility with old code:
|
|||
|
|
// ubGreek should be renamed to ubGreekandCoptic
|
|||
|
|
// ubCombiningMarksforSymbols should be renamed to ubCombiningDiacriticalMarksforSymbols
|
|||
|
|
// ubPrivateUse should be renamed to ubPrivateUseArea
|
|||
|
|
//
|
|||
|
|
//
|
|||
|
|
// 19-SEP-2003: (changes by Andreas Hausladen)
|
|||
|
|
// - added OWN_WIDESTRING_MEMMGR for faster memory managment in TWideStringList
|
|||
|
|
// under Windows
|
|||
|
|
// - fixed: TWideStringList.Destroy does not set OnChange and OnChanging to nil before calling Clear
|
|||
|
|
//
|
|||
|
|
//
|
|||
|
|
// 29-MAR-2002: MT
|
|||
|
|
// - WideNormalize now returns strings with normalization mode nfNone unchanged.
|
|||
|
|
// - Bug fix in WideCompose: Raised exception when Result of WideComposeHangul was an
|
|||
|
|
// empty string. (#0000044)
|
|||
|
|
// - Bug fix in WideAdjustLineBreaks
|
|||
|
|
// - Added Asserts were needed.
|
|||
|
|
// - TWideStrings.IndexOfName now takes care of NormalizeForm as well.
|
|||
|
|
// - TWideStrings.IndexOf now takes care of NormalizeForm as well.
|
|||
|
|
// - TWideString.List Find now uses the same NormalizationForm for the search string as it uses
|
|||
|
|
// within the list itself.
|
|||
|
|
//
|
|||
|
|
// 29-NOV-2001:
|
|||
|
|
// - bug fix
|
|||
|
|
// 06-JUN-2001:
|
|||
|
|
// - small changes
|
|||
|
|
// 28-APR-2001:
|
|||
|
|
// - bug fixes
|
|||
|
|
// 05-APR-2001:
|
|||
|
|
// - bug fixes
|
|||
|
|
// 23-MAR-2001:
|
|||
|
|
// - WideSameText
|
|||
|
|
// - small changes
|
|||
|
|
// 10-FEB-2001:
|
|||
|
|
// - bug fix in StringToWideStringEx and WideStringToStringEx
|
|||
|
|
// 05-FEB-2001:
|
|||
|
|
// - TWideStrings.GetSeparatedText changed (no separator anymore after the last line)
|
|||
|
|
// 29-JAN-2001:
|
|||
|
|
// - PrepareUnicodeData
|
|||
|
|
// - LoadInProgress critical section is now created at init time to avoid critical thread races
|
|||
|
|
// - bug fixes
|
|||
|
|
// 26-JAN-2001:
|
|||
|
|
// - ExpandANSIString
|
|||
|
|
// - TWideStrings.SaveUnicode is by default True now
|
|||
|
|
// 20..21-JAN-2001:
|
|||
|
|
// - StrUpperW, StrLowerW and StrTitleW removed because they potentially would need
|
|||
|
|
// a reallocation to work correctly (use the WideString versions instead)
|
|||
|
|
// - further improvements related to internal data
|
|||
|
|
// - introduced TUnicodeBlock
|
|||
|
|
// - CodeBlockFromChar improved
|
|||
|
|
// 07-JAN-2001:
|
|||
|
|
// optimized access to character properties, combining class etc.
|
|||
|
|
// 06-JAN-2001:
|
|||
|
|
// TWideStrings and TWideStringList improved
|
|||
|
|
// APR-DEC 2000: versions 2.1 - 2.6
|
|||
|
|
// - preparation for public rlease
|
|||
|
|
// - additional conversion routines
|
|||
|
|
// - JCL compliance
|
|||
|
|
// - character properties unified
|
|||
|
|
// - character properties data and lookup improvements
|
|||
|
|
// - reworked Unicode data resource file
|
|||
|
|
// - improved simple string comparation routines (StrCompW, StrLCompW etc., include surrogate fix)
|
|||
|
|
// - special case folding data for language neutral case insensitive comparations included
|
|||
|
|
// - optimized decomposition
|
|||
|
|
// - composition and normalization support
|
|||
|
|
// - normalization conformance tests applied
|
|||
|
|
// - bug fixes
|
|||
|
|
// FEB-MAR 2000: version 2.0
|
|||
|
|
// - Unicode regular expressions (URE) search class (TURESearch)
|
|||
|
|
// - generic search engine base class for both the Boyer-Moore and the RE search class
|
|||
|
|
// - whole word only search in UTBM, bug fixes in UTBM
|
|||
|
|
// - string decompositon (including hangul)
|
|||
|
|
// OCT/99 - JAN/2000: version 1.0
|
|||
|
|
// - basic Unicode implementation, more than 100 WideString/UCS2 and UCS4 core functions
|
|||
|
|
// - TWideStrings and TWideStringList classes
|
|||
|
|
// - Unicode Tuned Boyer-Moore search class (TUTBMSearch)
|
|||
|
|
// - low and high level Unicode/Wide* functions
|
|||
|
|
// - low level Unicode UCS4 data import and functions
|
|||
|
|
// - helper functions
|
|||
|
|
//
|
|||
|
|
// Version 2.9
|
|||
|
|
// This unit contains routines and classes to manage and work with Unicode/WideString strings.
|
|||
|
|
// You need Delphi 4 or higher to compile this code.
|
|||
|
|
//
|
|||
|
|
// Publicly available low level functions are all preceded by "Unicode..." (e.g.
|
|||
|
|
// in UnicodeToUpper) while the high level functions use the Str... or Wide...
|
|||
|
|
// naming scheme (e.g. StrLICompW and WideUpperCase).
|
|||
|
|
//
|
|||
|
|
// The normalization implementation in this unit has successfully and completely passed the
|
|||
|
|
// official normative conformance testing as of Annex 9 in Technical Report #15
|
|||
|
|
// (Unicode Standard Annex #15, http://www.unicode.org/unicode/reports/tr15, from 2000-08-31).
|
|||
|
|
//
|
|||
|
|
// Open issues:
|
|||
|
|
// - Yet to do things in the URE class are:
|
|||
|
|
// - check all character classes if they match correctly
|
|||
|
|
// - optimize rebuild of DFA (build only when pattern changes)
|
|||
|
|
// - set flag parameter of ExecuteURE
|
|||
|
|
// - add \d any decimal digit
|
|||
|
|
// \D any character that is not a decimal digit
|
|||
|
|
// \s any whitespace character
|
|||
|
|
// \S any character that is not a whitespace character
|
|||
|
|
// \w any "word" character
|
|||
|
|
// \W any "non-word" character
|
|||
|
|
// - The wide string classes still compare text with functions provided by the
|
|||
|
|
// particular system. This works usually fine under WinNT/W2K (although also
|
|||
|
|
// there are limitations like maximum text lengths). Under Win9x conversions
|
|||
|
|
// from and to MBCS are necessary which are bound to a particular locale and
|
|||
|
|
// so very limited in general use. These comparisons should be changed so that
|
|||
|
|
// the code in this unit is used.
|
|||
|
|
|
|||
|
|
interface
|
|||
|
|
|
|||
|
|
uses
|
|||
|
|
{$IFDEF MSWINDOWS}
|
|||
|
|
Windows,
|
|||
|
|
{$ENDIF MSWINDOWS}
|
|||
|
|
Classes,
|
|||
|
|
JclBase;
|
|||
|
|
|
|||
|
|
{$IFNDEF FPC}
|
|||
|
|
{$IFDEF MSWINDOWS}
|
|||
|
|
{$DEFINE OWN_WIDESTRING_MEMMGR}
|
|||
|
|
{$ENDIF MSWINDOWS}
|
|||
|
|
{$ENDIF ~FPC}
|
|||
|
|
|
|||
|
|
{$IFDEF SUPPORTS_WIDESTRING}
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
// definitions of often used characters:
|
|||
|
|
// Note: Use them only for tests of a certain character not to determine character
|
|||
|
|
// classes (like white spaces) as in Unicode are often many code points defined
|
|||
|
|
// being in a certain class. Hence your best option is to use the various
|
|||
|
|
// UnicodeIs* functions.
|
|||
|
|
WideNull = WideChar(#0);
|
|||
|
|
WideTabulator = WideChar(#9);
|
|||
|
|
WideSpace = WideChar(#32);
|
|||
|
|
|
|||
|
|
// logical line breaks
|
|||
|
|
WideLF = WideChar(#10);
|
|||
|
|
WideLineFeed = WideChar(#10);
|
|||
|
|
WideVerticalTab = WideChar(#11);
|
|||
|
|
WideFormFeed = WideChar(#12);
|
|||
|
|
WideCR = WideChar(#13);
|
|||
|
|
WideCarriageReturn = WideChar(#13);
|
|||
|
|
WideCRLF: WideString = #13#10;
|
|||
|
|
WideLineSeparator = WideChar($2028);
|
|||
|
|
WideParagraphSeparator = WideChar($2029);
|
|||
|
|
|
|||
|
|
// byte order marks for Unicode files
|
|||
|
|
// Unicode text files (in UTF-16 format) should contain $FFFE as first character to
|
|||
|
|
// identify such a file clearly. Depending on the system where the file was created
|
|||
|
|
// on this appears either in big endian or little endian style.
|
|||
|
|
BOM_LSB_FIRST = WideChar($FEFF);
|
|||
|
|
BOM_MSB_FIRST = WideChar($FFFE);
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
TSaveFormat = ( sfUTF16LSB, sfUTF16MSB, sfUTF8, sfAnsi );
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
sfUnicodeLSB = sfUTF16LSB;
|
|||
|
|
sfUnicodeMSB = sfUTF16MSB;
|
|||
|
|
|
|||
|
|
BOM_UTF16_LSB: array [0..1] of Byte = ($FF,$FE);
|
|||
|
|
BOM_UTF16_MSB: array [0..1] of Byte = ($FE,$FF);
|
|||
|
|
BOM_UTF8: array [0..2] of Byte = ($EF,$BB,$BF);
|
|||
|
|
BOM_UTF32_LSB: array [0..3] of Byte = ($FF,$FE,$00,$00);
|
|||
|
|
BOM_UTF32_MSB: array [0..3] of Byte = ($00,$00,$FE,$FF);
|
|||
|
|
// BOM_UTF7_1: array [0..3] of Byte = ($2B,$2F,$76,$38);
|
|||
|
|
// BOM_UTF7_2: array [0..3] of Byte = ($2B,$2F,$76,$39);
|
|||
|
|
// BOM_UTF7_3: array [0..3] of Byte = ($2B,$2F,$76,$2B);
|
|||
|
|
// BOM_UTF7_4: array [0..3] of Byte = ($2B,$2F,$76,$2F);
|
|||
|
|
// BOM_UTF7_5: array [0..3] of Byte = ($2B,$2F,$76,$38,$2D);
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
// Unicode transformation formats (UTF) data types
|
|||
|
|
PUTF7 = ^UTF7;
|
|||
|
|
UTF7 = Char;
|
|||
|
|
PUTF8 = ^UTF8;
|
|||
|
|
UTF8 = Char;
|
|||
|
|
PUTF16 = ^UTF16;
|
|||
|
|
UTF16 = WideChar;
|
|||
|
|
PUTF32 = ^UTF32;
|
|||
|
|
UTF32 = Cardinal;
|
|||
|
|
|
|||
|
|
// UTF conversion schemes (UCS) data types
|
|||
|
|
PUCS4 = ^UCS4;
|
|||
|
|
UCS4 = Cardinal;
|
|||
|
|
PUCS2 = PWideChar;
|
|||
|
|
UCS2 = WideChar;
|
|||
|
|
|
|||
|
|
TUCS2Array = array of UCS2;
|
|||
|
|
TUCS4Array = array of UCS4;
|
|||
|
|
|
|||
|
|
// various predefined or otherwise useful character property categories
|
|||
|
|
TCharacterCategory = (
|
|||
|
|
// normative categories
|
|||
|
|
ccLetterUppercase,
|
|||
|
|
ccLetterLowercase,
|
|||
|
|
ccLetterTitlecase,
|
|||
|
|
ccMarkNonSpacing,
|
|||
|
|
ccMarkSpacingCombining,
|
|||
|
|
ccMarkEnclosing,
|
|||
|
|
ccNumberDecimalDigit,
|
|||
|
|
ccNumberLetter,
|
|||
|
|
ccNumberOther,
|
|||
|
|
ccSeparatorSpace,
|
|||
|
|
ccSeparatorLine,
|
|||
|
|
ccSeparatorParagraph,
|
|||
|
|
ccOtherControl,
|
|||
|
|
ccOtherFormat,
|
|||
|
|
ccOtherSurrogate,
|
|||
|
|
ccOtherPrivate,
|
|||
|
|
ccOtherUnassigned,
|
|||
|
|
// informative categories
|
|||
|
|
ccLetterModifier,
|
|||
|
|
ccLetterOther,
|
|||
|
|
ccPunctuationConnector,
|
|||
|
|
ccPunctuationDash,
|
|||
|
|
ccPunctuationOpen,
|
|||
|
|
ccPunctuationClose,
|
|||
|
|
ccPunctuationInitialQuote,
|
|||
|
|
ccPunctuationFinalQuote,
|
|||
|
|
ccPunctuationOther,
|
|||
|
|
ccSymbolMath,
|
|||
|
|
ccSymbolCurrency,
|
|||
|
|
ccSymbolModifier,
|
|||
|
|
ccSymbolOther,
|
|||
|
|
// bidirectional categories
|
|||
|
|
ccLeftToRight,
|
|||
|
|
ccLeftToRightEmbedding,
|
|||
|
|
ccLeftToRightOverride,
|
|||
|
|
ccRightToLeft,
|
|||
|
|
ccRightToLeftArabic,
|
|||
|
|
ccRightToLeftEmbedding,
|
|||
|
|
ccRightToLeftoverride,
|
|||
|
|
ccPopDirectionalFormat,
|
|||
|
|
ccEuropeanNumber,
|
|||
|
|
ccEuropeanNumberSeparator,
|
|||
|
|
ccEuropeanNumberTerminator,
|
|||
|
|
ccArabicNumber,
|
|||
|
|
ccCommonNumberSeparator,
|
|||
|
|
ccBoundaryNeutral,
|
|||
|
|
ccSegmentSeparator, // this includes tab and vertical tab
|
|||
|
|
ccWhiteSpace,
|
|||
|
|
ccOtherNeutrals,
|
|||
|
|
// self defined categories, they do not appear in the Unicode data file
|
|||
|
|
ccComposed, // can be decomposed
|
|||
|
|
ccNonBreaking,
|
|||
|
|
ccSymmetric, // has left and right forms
|
|||
|
|
ccHexDigit,
|
|||
|
|
ccQuotationMark,
|
|||
|
|
ccMirroring,
|
|||
|
|
ccSpaceOther,
|
|||
|
|
ccAssigned // means there is a definition in the Unicode standard
|
|||
|
|
);
|
|||
|
|
TCharacterCategories = set of TCharacterCategory;
|
|||
|
|
|
|||
|
|
// four forms of normalization are defined:
|
|||
|
|
TNormalizationForm = (
|
|||
|
|
nfNone, // do not normalize
|
|||
|
|
nfC, // canonical decomposition followed by canonical composition (this is most often used)
|
|||
|
|
nfD, // canonical decomposition
|
|||
|
|
nfKC, // compatibility decomposition followed by a canonical composition
|
|||
|
|
nfKD // compatibility decomposition
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
// used to hold information about the start and end
|
|||
|
|
// position of a unicodeblock.
|
|||
|
|
TUnicodeBlockRange = record
|
|||
|
|
RangeStart,
|
|||
|
|
RangeEnd: Cardinal;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// An Unicode block usually corresponds to a particular language script but
|
|||
|
|
// can also represent special characters, musical symbols and the like.
|
|||
|
|
TUnicodeBlock = (
|
|||
|
|
ubUndefined,
|
|||
|
|
ubBasicLatin,
|
|||
|
|
ubLatin1Supplement,
|
|||
|
|
ubLatinExtendedA,
|
|||
|
|
ubLatinExtendedB,
|
|||
|
|
ubIPAExtensions,
|
|||
|
|
ubSpacingModifierLetters,
|
|||
|
|
ubCombiningDiacriticalMarks,
|
|||
|
|
//ubGreekandCoptic,
|
|||
|
|
ubGreek,
|
|||
|
|
ubCyrillic,
|
|||
|
|
ubCyrillicSupplement,
|
|||
|
|
ubArmenian,
|
|||
|
|
ubHebrew,
|
|||
|
|
ubArabic,
|
|||
|
|
ubSyriac,
|
|||
|
|
ubArabicSupplement,
|
|||
|
|
ubThaana,
|
|||
|
|
ubDevanagari,
|
|||
|
|
ubBengali,
|
|||
|
|
ubGurmukhi,
|
|||
|
|
ubGujarati,
|
|||
|
|
ubOriya,
|
|||
|
|
ubTamil,
|
|||
|
|
ubTelugu,
|
|||
|
|
ubKannada,
|
|||
|
|
ubMalayalam,
|
|||
|
|
ubSinhala,
|
|||
|
|
ubThai,
|
|||
|
|
ubLao,
|
|||
|
|
ubTibetan,
|
|||
|
|
ubMyanmar,
|
|||
|
|
ubGeorgian,
|
|||
|
|
ubHangulJamo,
|
|||
|
|
ubEthiopic,
|
|||
|
|
ubEthiopicSupplement,
|
|||
|
|
ubCherokee,
|
|||
|
|
ubUnifiedCanadianAboriginalSyllabics,
|
|||
|
|
ubOgham,
|
|||
|
|
ubRunic,
|
|||
|
|
ubTagalog,
|
|||
|
|
ubHanunoo,
|
|||
|
|
ubBuhid,
|
|||
|
|
ubTagbanwa,
|
|||
|
|
ubKhmer,
|
|||
|
|
ubMongolian,
|
|||
|
|
ubLimbu,
|
|||
|
|
ubTaiLe,
|
|||
|
|
ubNewTaiLue,
|
|||
|
|
ubKhmerSymbols,
|
|||
|
|
ubBuginese,
|
|||
|
|
ubPhoneticExtensions,
|
|||
|
|
ubPhoneticExtensionsSupplement,
|
|||
|
|
ubCombiningDiacriticalMarksSupplement,
|
|||
|
|
ubLatinExtendedAdditional,
|
|||
|
|
ubGreekExtended,
|
|||
|
|
ubGeneralPunctuation,
|
|||
|
|
ubSuperscriptsandSubscripts,
|
|||
|
|
ubCurrencySymbols,
|
|||
|
|
//ubCombiningDiacriticalMarksforSymbols,
|
|||
|
|
ubCombiningMarksforSymbols,
|
|||
|
|
ubLetterlikeSymbols,
|
|||
|
|
ubNumberForms,
|
|||
|
|
ubArrows,
|
|||
|
|
ubMathematicalOperators,
|
|||
|
|
ubMiscellaneousTechnical,
|
|||
|
|
ubControlPictures,
|
|||
|
|
ubOpticalCharacterRecognition,
|
|||
|
|
ubEnclosedAlphanumerics,
|
|||
|
|
ubBoxDrawing,
|
|||
|
|
ubBlockElements,
|
|||
|
|
ubGeometricShapes,
|
|||
|
|
ubMiscellaneousSymbols,
|
|||
|
|
ubDingbats,
|
|||
|
|
ubMiscellaneousMathematicalSymbolsA,
|
|||
|
|
ubSupplementalArrowsA,
|
|||
|
|
ubBraillePatterns,
|
|||
|
|
ubSupplementalArrowsB,
|
|||
|
|
ubMiscellaneousMathematicalSymbolsB,
|
|||
|
|
ubSupplementalMathematicalOperators,
|
|||
|
|
ubMiscellaneousSymbolsandArrows,
|
|||
|
|
ubGlagolitic,
|
|||
|
|
ubCoptic,
|
|||
|
|
ubGeorgianSupplement,
|
|||
|
|
ubTifinagh,
|
|||
|
|
ubEthiopicExtended,
|
|||
|
|
ubSupplementalPunctuation,
|
|||
|
|
ubCJKRadicalsSupplement,
|
|||
|
|
ubKangxiRadicals,
|
|||
|
|
ubIdeographicDescriptionCharacters,
|
|||
|
|
ubCJKSymbolsandPunctuation,
|
|||
|
|
ubHiragana,
|
|||
|
|
ubKatakana,
|
|||
|
|
ubBopomofo,
|
|||
|
|
ubHangulCompatibilityJamo,
|
|||
|
|
ubKanbun,
|
|||
|
|
ubBopomofoExtended,
|
|||
|
|
ubCJKStrokes,
|
|||
|
|
ubKatakanaPhoneticExtensions,
|
|||
|
|
ubEnclosedCJKLettersandMonths,
|
|||
|
|
ubCJKCompatibility,
|
|||
|
|
ubCJKUnifiedIdeographsExtensionA,
|
|||
|
|
ubYijingHexagramSymbols,
|
|||
|
|
ubCJKUnifiedIdeographs,
|
|||
|
|
ubYiSyllables,
|
|||
|
|
ubYiRadicals,
|
|||
|
|
ubModifierToneLetters,
|
|||
|
|
ubSylotiNagri,
|
|||
|
|
ubHangulSyllables,
|
|||
|
|
ubHighSurrogates,
|
|||
|
|
ubHighPrivateUseSurrogates,
|
|||
|
|
ubLowSurrogates,
|
|||
|
|
//ubPrivateUseArea,
|
|||
|
|
ubPrivateUse,
|
|||
|
|
ubCJKCompatibilityIdeographs,
|
|||
|
|
ubAlphabeticPresentationForms,
|
|||
|
|
ubArabicPresentationFormsA,
|
|||
|
|
ubVariationSelectors,
|
|||
|
|
ubVerticalForms,
|
|||
|
|
ubCombiningHalfMarks,
|
|||
|
|
ubCJKCompatibilityForms,
|
|||
|
|
ubSmallFormVariants,
|
|||
|
|
ubArabicPresentationFormsB,
|
|||
|
|
ubHalfwidthandFullwidthForms,
|
|||
|
|
ubSpecials,
|
|||
|
|
ubLinearBSyllabary,
|
|||
|
|
ubLinearBIdeograms,
|
|||
|
|
ubAegeanNumbers,
|
|||
|
|
ubAncientGreekNumbers,
|
|||
|
|
ubOldItalic,
|
|||
|
|
ubGothic,
|
|||
|
|
ubUgaritic,
|
|||
|
|
ubOldPersian,
|
|||
|
|
ubDeseret,
|
|||
|
|
ubShavian,
|
|||
|
|
ubOsmanya,
|
|||
|
|
ubCypriotSyllabary,
|
|||
|
|
ubKharoshthi,
|
|||
|
|
ubByzantineMusicalSymbols,
|
|||
|
|
ubMusicalSymbols,
|
|||
|
|
ubAncientGreekMusicalNotation,
|
|||
|
|
ubTaiXuanJingSymbols,
|
|||
|
|
ubMathematicalAlphanumericSymbols,
|
|||
|
|
ubCJKUnifiedIdeographsExtensionB,
|
|||
|
|
ubCJKCompatibilityIdeographsSupplement,
|
|||
|
|
ubTags,
|
|||
|
|
ubVariationSelectorsSupplement,
|
|||
|
|
ubSupplementaryPrivateUseAreaA,
|
|||
|
|
ubSupplementaryPrivateUseAreaB
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
|
|||
|
|
TWideStrings = class;
|
|||
|
|
|
|||
|
|
TSearchFlag = (
|
|||
|
|
sfCaseSensitive, // match letter case
|
|||
|
|
sfIgnoreNonSpacing, // ignore non-spacing characters in search
|
|||
|
|
sfSpaceCompress, // handle several consecutive white spaces as one white space
|
|||
|
|
// (this applies to the pattern as well as the search text)
|
|||
|
|
sfWholeWordOnly // match only text at end/start and/or surrounded by white spaces
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
TSearchFlags = set of TSearchFlag;
|
|||
|
|
|
|||
|
|
// a generic search class defininition used for tuned Boyer-Moore and Unicode
|
|||
|
|
// regular expression searches
|
|||
|
|
TSearchEngine = class(TObject)
|
|||
|
|
private
|
|||
|
|
FResults: TList; // 2 entries for each result (start and stop position)
|
|||
|
|
FOwner: TWideStrings; // at the moment unused, perhaps later to access strings faster
|
|||
|
|
protected
|
|||
|
|
function GetCount: Integer; virtual;
|
|||
|
|
public
|
|||
|
|
constructor Create(AOwner: TWideStrings); virtual;
|
|||
|
|
destructor Destroy; override;
|
|||
|
|
|
|||
|
|
procedure AddResult(Start, Stop: Cardinal); virtual;
|
|||
|
|
procedure Clear; virtual;
|
|||
|
|
procedure ClearResults; virtual;
|
|||
|
|
procedure DeleteResult(Index: Cardinal); virtual;
|
|||
|
|
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; virtual; abstract;
|
|||
|
|
procedure FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; virtual; abstract;
|
|||
|
|
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
|
|||
|
|
function FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; virtual; abstract;
|
|||
|
|
function FindAll(const Text: WideString): Boolean; overload; virtual; abstract;
|
|||
|
|
function FindAll(Text: PWideChar; TextLen: Cardinal): Boolean; overload; virtual; abstract;
|
|||
|
|
procedure GetResult(Index: Cardinal; var Start, Stop: Integer); virtual;
|
|||
|
|
|
|||
|
|
property Count: Integer read GetCount;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// The Unicode Tuned Boyer-Moore (UTBM) search implementation is an extended
|
|||
|
|
// translation created from a free package written by Mark Leisher (mleisher att crl dott nmsu dott edu).
|
|||
|
|
//
|
|||
|
|
// The code handles high and low surrogates as well as case (in)dependency,
|
|||
|
|
// can ignore non-spacing characters and allows optionally to return whole
|
|||
|
|
// words only.
|
|||
|
|
|
|||
|
|
// single pattern character
|
|||
|
|
PUTBMChar = ^TUTBMChar;
|
|||
|
|
TUTBMChar = record
|
|||
|
|
LoCase,
|
|||
|
|
UpCase,
|
|||
|
|
TitleCase: UCS4;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
PUTBMSkip = ^TUTBMSkip;
|
|||
|
|
TUTBMSkip = record
|
|||
|
|
BMChar: PUTBMChar;
|
|||
|
|
SkipValues: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TUTBMSearch = class(TSearchEngine)
|
|||
|
|
private
|
|||
|
|
FFlags: TSearchFlags;
|
|||
|
|
FPattern: PUTBMChar;
|
|||
|
|
FPatternUsed: Cardinal;
|
|||
|
|
FPatternSize: Cardinal;
|
|||
|
|
FPatternLength: Cardinal;
|
|||
|
|
FSkipValues: PUTBMSkip;
|
|||
|
|
FSkipsUsed: Integer;
|
|||
|
|
FMD4: Cardinal;
|
|||
|
|
protected
|
|||
|
|
procedure ClearPattern;
|
|||
|
|
procedure Compile(Pattern: PUCS2; PatternLength: Integer; Flags: TSearchFlags);
|
|||
|
|
function Find(Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean;
|
|||
|
|
function GetSkipValue(TextStart, TextEnd: PUCS2): Cardinal;
|
|||
|
|
function Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: Cardinal): Boolean;
|
|||
|
|
public
|
|||
|
|
procedure Clear; override;
|
|||
|
|
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override;
|
|||
|
|
procedure FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; override;
|
|||
|
|
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; override;
|
|||
|
|
function FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; override;
|
|||
|
|
function FindAll(const Text: WideString): Boolean; overload; override;
|
|||
|
|
function FindAll(Text: PWideChar; TextLen: Cardinal): Boolean; overload; override;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Regular expression search engine for text in UCS2 form taking surrogates
|
|||
|
|
// into account. This implementation is an improved translation from the URE
|
|||
|
|
// package written by Mark Leisher (mleisher att crl dott nmsu dott edu) who used a variation
|
|||
|
|
// of the RE->DFA algorithm done by Mark Hopkins (markh att csd4 dott csd dott uwm dott edu).
|
|||
|
|
// Assumptions:
|
|||
|
|
// o Regular expression and text already normalized.
|
|||
|
|
// o Conversion to lower case assumes a 1-1 mapping.
|
|||
|
|
//
|
|||
|
|
// Definitions:
|
|||
|
|
// Separator - any one of U+2028, U+2029, NL, CR.
|
|||
|
|
//
|
|||
|
|
// Operators:
|
|||
|
|
// . - match any character
|
|||
|
|
// * - match zero or more of the last subexpression
|
|||
|
|
// + - match one or more of the last subexpression
|
|||
|
|
// ? - match zero or one of the last subexpression
|
|||
|
|
// () - subexpression grouping
|
|||
|
|
// {m, n} - match at least m occurences and up to n occurences
|
|||
|
|
// Note: both values can be 0 or ommitted which denotes then a unlimiting bound
|
|||
|
|
// {,} and {0,} and {0, 0} correspond to *
|
|||
|
|
// {, 1} and {0, 1} correspond to ?
|
|||
|
|
// {1,} and {1, 0} correspond to +
|
|||
|
|
// {m} - match exactly m occurences
|
|||
|
|
//
|
|||
|
|
// Notes:
|
|||
|
|
// o The "." operator normally does not match separators, but a flag is
|
|||
|
|
// available that will allow this operator to match a separator.
|
|||
|
|
//
|
|||
|
|
// Literals and Constants:
|
|||
|
|
// c - literal UCS2 character
|
|||
|
|
// \x.... - hexadecimal number of up to 4 digits
|
|||
|
|
// \X.... - hexadecimal number of up to 4 digits
|
|||
|
|
// \u.... - hexadecimal number of up to 4 digits
|
|||
|
|
// \U.... - hexadecimal number of up to 4 digits
|
|||
|
|
//
|
|||
|
|
// Character classes:
|
|||
|
|
// [...] - Character class
|
|||
|
|
// [^...] - Negated character class
|
|||
|
|
// \pN1,N2,...,Nn - Character properties class
|
|||
|
|
// \PN1,N2,...,Nn - Negated character properties class
|
|||
|
|
//
|
|||
|
|
// POSIX character classes recognized:
|
|||
|
|
// :alnum:
|
|||
|
|
// :alpha:
|
|||
|
|
// :cntrl:
|
|||
|
|
// :digit:
|
|||
|
|
// :graph:
|
|||
|
|
// :lower:
|
|||
|
|
// :print:
|
|||
|
|
// :punct:
|
|||
|
|
// :space:
|
|||
|
|
// :upper:
|
|||
|
|
// :xdigit:
|
|||
|
|
//
|
|||
|
|
// Notes:
|
|||
|
|
// o Character property classes are \p or \P followed by a comma separated
|
|||
|
|
// list of integers between 0 and the maximum entry index in TCharacterCategory.
|
|||
|
|
// These integers directly correspond to the TCharacterCategory enumeration entries.
|
|||
|
|
// Note: upper, lower and title case classes need to have case sensitive search
|
|||
|
|
// be enabled to match correctly!
|
|||
|
|
//
|
|||
|
|
// o Character classes can contain literals, constants and character
|
|||
|
|
// property classes. Example:
|
|||
|
|
//
|
|||
|
|
// [abc\U10A\p0,13,4]
|
|||
|
|
|
|||
|
|
// structure used to handle a compacted range of characters
|
|||
|
|
PUcRange = ^TUcRange;
|
|||
|
|
TUcRange = record
|
|||
|
|
MinCode,
|
|||
|
|
MaxCode: UCS4;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TUcCClass = record
|
|||
|
|
Ranges: array of TUcRange;
|
|||
|
|
RangesUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// either a single character or a list of character classes
|
|||
|
|
TUcSymbol = record
|
|||
|
|
Chr: UCS4;
|
|||
|
|
CCL: TUcCClass;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// this is a general element structure used for expressions and stack elements
|
|||
|
|
TUcElement = record
|
|||
|
|
OnStack: Boolean;
|
|||
|
|
AType,
|
|||
|
|
LHS,
|
|||
|
|
RHS: Cardinal;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// this is a structure used to track a list or a stack of states
|
|||
|
|
PUcStateList = ^TUcStateList;
|
|||
|
|
TUcStateList = record
|
|||
|
|
List: array of Cardinal;
|
|||
|
|
ListUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// structure to track the list of unique states for a symbol during reduction
|
|||
|
|
PUcSymbolTableEntry = ^TUcSymbolTableEntry;
|
|||
|
|
TUcSymbolTableEntry = record
|
|||
|
|
ID,
|
|||
|
|
AType: Cardinal;
|
|||
|
|
Mods,
|
|||
|
|
Categories: TCharacterCategories;
|
|||
|
|
Symbol: TUcSymbol;
|
|||
|
|
States: TUcStateList;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// structure to hold a single State
|
|||
|
|
PUcState = ^TUcState;
|
|||
|
|
TUcState = record
|
|||
|
|
ID: Cardinal;
|
|||
|
|
Accepting: Boolean;
|
|||
|
|
StateList: TUcStateList;
|
|||
|
|
Transitions: array of TUcElement;
|
|||
|
|
TransitionsUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// structure used for keeping lists of states
|
|||
|
|
TUcStateTable = record
|
|||
|
|
States: array of TUcState;
|
|||
|
|
StatesUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// structure to track pairs of DFA states when equivalent states are merged
|
|||
|
|
TUcEquivalent = record
|
|||
|
|
Left,
|
|||
|
|
Right: Cardinal;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TUcExpressionList = record
|
|||
|
|
Expressions: array of TUcElement;
|
|||
|
|
ExpressionsUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TUcSymbolTable = record
|
|||
|
|
Symbols: array of TUcSymbolTableEntry;
|
|||
|
|
SymbolsUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TUcEquivalentList = record
|
|||
|
|
Equivalents: array of TUcEquivalent;
|
|||
|
|
EquivalentsUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// structure used for constructing the NFA and reducing to a minimal DFA
|
|||
|
|
PUREBuffer = ^TUREBuffer;
|
|||
|
|
TUREBuffer = record
|
|||
|
|
Reducing: Boolean;
|
|||
|
|
Error: Integer;
|
|||
|
|
Flags: Cardinal;
|
|||
|
|
Stack: TUcStateList;
|
|||
|
|
SymbolTable: TUcSymbolTable; // table of unique symbols encountered
|
|||
|
|
ExpressionList: TUcExpressionList; // tracks the unique expressions generated
|
|||
|
|
// for the NFA and when the NFA is reduced
|
|||
|
|
States: TUcStateTable; // the reduced table of unique groups of NFA states
|
|||
|
|
EquivalentList: TUcEquivalentList; // tracks states when equivalent states are merged
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TUcTransition = record
|
|||
|
|
Symbol,
|
|||
|
|
NextState: Cardinal;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
PDFAState = ^TDFAState;
|
|||
|
|
TDFAState = record
|
|||
|
|
Accepting: Boolean;
|
|||
|
|
NumberTransitions: Integer;
|
|||
|
|
StartTransition: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TDFAStates = record
|
|||
|
|
States: array of TDFAState;
|
|||
|
|
StatesUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TUcTransitions = record
|
|||
|
|
Transitions: array of TUcTransition;
|
|||
|
|
TransitionsUsed: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TDFA = record
|
|||
|
|
Flags: Cardinal;
|
|||
|
|
SymbolTable: TUcSymbolTable;
|
|||
|
|
StateList: TDFAStates;
|
|||
|
|
TransitionList: TUcTransitions;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TURESearch = class(TSearchEngine)
|
|||
|
|
private
|
|||
|
|
FUREBuffer: TUREBuffer;
|
|||
|
|
FDFA: TDFA;
|
|||
|
|
protected
|
|||
|
|
procedure AddEquivalentPair(L, R: Cardinal);
|
|||
|
|
procedure AddRange(var CCL: TUcCClass; Range: TUcRange);
|
|||
|
|
function AddState(NewStates: array of Cardinal): Cardinal;
|
|||
|
|
procedure AddSymbolState(Symbol, State: Cardinal);
|
|||
|
|
function BuildCharacterClass(CP: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
|
|||
|
|
procedure ClearUREBuffer;
|
|||
|
|
function CompileSymbol(S: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
|
|||
|
|
procedure CompileURE(RE: PWideChar; RELength: Cardinal; Casefold: Boolean);
|
|||
|
|
procedure CollectPendingOperations(var State: Cardinal);
|
|||
|
|
function ConvertRegExpToNFA(RE: PWideChar; RELength: Cardinal): Cardinal;
|
|||
|
|
function ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean;
|
|||
|
|
procedure ClearDFA;
|
|||
|
|
procedure HexDigitSetup(Symbol: PUcSymbolTableEntry);
|
|||
|
|
function MakeExpression(AType, LHS, RHS: Cardinal): Cardinal;
|
|||
|
|
function MakeHexNumber(NP: PUCS2; Limit: Cardinal; var Number: Cardinal): Cardinal;
|
|||
|
|
function MakeSymbol(S: PUCS2; Limit: Cardinal; var Consumed: Cardinal): Cardinal;
|
|||
|
|
procedure MergeEquivalents;
|
|||
|
|
function ParsePropertyList(Properties: PUCS2; Limit: Cardinal; var Categories: TCharacterCategories): Cardinal;
|
|||
|
|
function Peek: Cardinal;
|
|||
|
|
function Pop: Cardinal;
|
|||
|
|
function PosixCCL(CP: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
|
|||
|
|
function ProbeLowSurrogate(LeftState: PUCS2; Limit: Cardinal; var Code: UCS4): Cardinal;
|
|||
|
|
procedure Push(V: Cardinal);
|
|||
|
|
procedure Reduce(Start: Cardinal);
|
|||
|
|
procedure SpaceSetup(Symbol: PUcSymbolTableEntry; Categories: TCharacterCategories);
|
|||
|
|
function SymbolsAreDifferent(A, B: PUcSymbolTableEntry): Boolean;
|
|||
|
|
public
|
|||
|
|
procedure Clear; override;
|
|||
|
|
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override;
|
|||
|
|
procedure FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags); overload; override;
|
|||
|
|
function FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean; overload; override;
|
|||
|
|
function FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean; overload; override;
|
|||
|
|
function FindAll(const Text: WideString): Boolean; overload; override;
|
|||
|
|
function FindAll(Text: PWideChar; TextLen: Cardinal): Boolean; overload; override;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Event used to give the application a chance to switch the way of how to save
|
|||
|
|
// the text in TWideStrings if the text contains characters not only from the
|
|||
|
|
// ANSI block but the save type is ANSI. On triggering the event the application
|
|||
|
|
// can change the property SaveUnicode as needed. This property is again checked
|
|||
|
|
// after the callback returns.
|
|||
|
|
TConfirmConversionEvent = procedure (Sender: TWideStrings; var Allowed: Boolean) of object;
|
|||
|
|
|
|||
|
|
TWideStrings = class(TPersistent)
|
|||
|
|
private
|
|||
|
|
FUpdateCount: Integer;
|
|||
|
|
FLanguage: LCID; // language can usually left alone, the system's default is used
|
|||
|
|
FSaved: Boolean; // set in SaveToStream, True in case saving was successfull otherwise False
|
|||
|
|
FNormalizationForm: TNormalizationForm; // determines in which form Unicode strings should be stored
|
|||
|
|
FOnConfirmConversion: TConfirmConversionEvent;
|
|||
|
|
FSaveFormat: TSaveFormat; // overrides the FSaveUnicode flag, initialized when a file is loaded,
|
|||
|
|
// expect losses if it is set to sfAnsi before saving
|
|||
|
|
function GetCommaText: WideString;
|
|||
|
|
function GetName(Index: Integer): WideString;
|
|||
|
|
function GetValue(const Name: WideString): WideString;
|
|||
|
|
procedure ReadData(Reader: TReader);
|
|||
|
|
procedure SetCommaText(const Value: WideString);
|
|||
|
|
procedure SetNormalizationForm(const Value: TNormalizationForm);
|
|||
|
|
procedure SetValue(const Name, Value: WideString);
|
|||
|
|
procedure WriteData(Writer: TWriter);
|
|||
|
|
function GetSaveUnicode: Boolean;
|
|||
|
|
procedure SetSaveUnicode(const Value: Boolean);
|
|||
|
|
protected
|
|||
|
|
procedure DefineProperties(Filer: TFiler); override;
|
|||
|
|
procedure DoConfirmConversion(var Allowed: Boolean); virtual;
|
|||
|
|
procedure Error(const Msg: string; Data: Integer);
|
|||
|
|
function Get(Index: Integer): WideString; virtual; abstract;
|
|||
|
|
function GetCapacity: Integer; virtual;
|
|||
|
|
function GetCount: Integer; virtual; abstract;
|
|||
|
|
function GetObject(Index: Integer): TObject; virtual;
|
|||
|
|
function GetTextStr: WideString; virtual;
|
|||
|
|
procedure Put(Index: Integer; const S: WideString); virtual; abstract;
|
|||
|
|
procedure PutObject(Index: Integer; AObject: TObject); virtual; abstract;
|
|||
|
|
procedure SetCapacity(NewCapacity: Integer); virtual;
|
|||
|
|
procedure SetUpdateState(Updating: Boolean); virtual;
|
|||
|
|
procedure SetLanguage(Value: LCID); virtual;
|
|||
|
|
public
|
|||
|
|
constructor Create;
|
|||
|
|
|
|||
|
|
function Add(const S: WideString): Integer; virtual;
|
|||
|
|
function AddObject(const S: WideString; AObject: TObject): Integer; virtual;
|
|||
|
|
procedure Append(const S: WideString);
|
|||
|
|
procedure AddStrings(Strings: TStrings); overload; virtual;
|
|||
|
|
procedure AddStrings(Strings: TWideStrings); overload; virtual;
|
|||
|
|
procedure Assign(Source: TPersistent); override;
|
|||
|
|
procedure AssignTo(Dest: TPersistent); override;
|
|||
|
|
procedure BeginUpdate;
|
|||
|
|
procedure Clear; virtual; abstract;
|
|||
|
|
procedure Delete(Index: Integer); virtual; abstract;
|
|||
|
|
procedure EndUpdate;
|
|||
|
|
function Equals(Strings: TWideStrings): Boolean;
|
|||
|
|
procedure Exchange(Index1, Index2: Integer); virtual;
|
|||
|
|
function GetSeparatedText(Separators: WideString): WideString; virtual;
|
|||
|
|
function GetText: PWideChar; virtual;
|
|||
|
|
function IndexOf(const S: WideString): Integer; virtual;
|
|||
|
|
function IndexOfName(const Name: WideString): Integer;
|
|||
|
|
function IndexOfObject(AObject: TObject): Integer;
|
|||
|
|
procedure Insert(Index: Integer; const S: WideString); virtual; abstract;
|
|||
|
|
procedure InsertObject(Index: Integer; const S: WideString; AObject: TObject);
|
|||
|
|
procedure LoadFromFile(const FileName: string); virtual;
|
|||
|
|
procedure LoadFromStream(Stream: TStream); virtual;
|
|||
|
|
procedure Move(CurIndex, NewIndex: Integer); virtual;
|
|||
|
|
procedure SaveToFile(const FileName: string); virtual;
|
|||
|
|
procedure SaveToStream(Stream: TStream; WithBOM: Boolean = True); virtual;
|
|||
|
|
procedure SetText(const Value: WideString); virtual;
|
|||
|
|
|
|||
|
|
property Capacity: Integer read GetCapacity write SetCapacity;
|
|||
|
|
property CommaText: WideString read GetCommaText write SetCommaText;
|
|||
|
|
property Count: Integer read GetCount;
|
|||
|
|
property Language: LCID read FLanguage write SetLanguage;
|
|||
|
|
property Names[Index: Integer]: WideString read GetName;
|
|||
|
|
property NormalizationForm: TNormalizationForm read FNormalizationForm write SetNormalizationForm default nfC;
|
|||
|
|
property Objects[Index: Integer]: TObject read GetObject write PutObject;
|
|||
|
|
property Values[const Name: WideString]: WideString read GetValue write SetValue;
|
|||
|
|
property Saved: Boolean read FSaved;
|
|||
|
|
property SaveUnicode: Boolean read GetSaveUnicode write SetSaveUnicode default True;
|
|||
|
|
property SaveFormat: TSaveFormat read FSaveFormat write FSaveFormat default sfUnicodeLSB;
|
|||
|
|
property Strings[Index: Integer]: WideString read Get write Put; default;
|
|||
|
|
property Text: WideString read GetTextStr write SetText;
|
|||
|
|
|
|||
|
|
property OnConfirmConversion: TConfirmConversionEvent read FOnConfirmConversion write FOnConfirmConversion;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----- TWideStringList class
|
|||
|
|
TDynWideCharArray = array of WideChar;
|
|||
|
|
TWideStringItem = record
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
FString: PWideChar; // "array of WideChar";
|
|||
|
|
{$ELSE}
|
|||
|
|
FString: WideString;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
FObject: TObject;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TWideStringItemList = array of TWideStringItem;
|
|||
|
|
|
|||
|
|
TWideStringList = class(TWideStrings)
|
|||
|
|
private
|
|||
|
|
FList: TWideStringItemList;
|
|||
|
|
FCount: Integer;
|
|||
|
|
FSorted: Boolean;
|
|||
|
|
FDuplicates: TDuplicates;
|
|||
|
|
FOnChange: TNotifyEvent;
|
|||
|
|
FOnChanging: TNotifyEvent;
|
|||
|
|
procedure ExchangeItems(Index1, Index2: Integer);
|
|||
|
|
procedure Grow;
|
|||
|
|
procedure QuickSort(L, R: Integer);
|
|||
|
|
procedure InsertItem(Index: Integer; const S: WideString);
|
|||
|
|
procedure SetSorted(Value: Boolean);
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
procedure SetListString(Index: Integer; const S: WideString);
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
protected
|
|||
|
|
procedure Changed; virtual;
|
|||
|
|
procedure Changing; virtual;
|
|||
|
|
function Get(Index: Integer): WideString; override;
|
|||
|
|
function GetCapacity: Integer; override;
|
|||
|
|
function GetCount: Integer; override;
|
|||
|
|
function GetObject(Index: Integer): TObject; override;
|
|||
|
|
procedure Put(Index: Integer; const S: WideString); override;
|
|||
|
|
procedure PutObject(Index: Integer; AObject: TObject); override;
|
|||
|
|
procedure SetCapacity(NewCapacity: Integer); override;
|
|||
|
|
procedure SetUpdateState(Updating: Boolean); override;
|
|||
|
|
procedure SetLanguage(Value: LCID); override;
|
|||
|
|
public
|
|||
|
|
destructor Destroy; override;
|
|||
|
|
|
|||
|
|
function Add(const S: WideString): Integer; override;
|
|||
|
|
procedure Clear; override;
|
|||
|
|
procedure Delete(Index: Integer); override;
|
|||
|
|
procedure Exchange(Index1, Index2: Integer); override;
|
|||
|
|
function Find(const S: WideString; var Index: Integer): Boolean; virtual;
|
|||
|
|
function IndexOf(const S: WideString): Integer; override;
|
|||
|
|
procedure Insert(Index: Integer; const S: WideString); override;
|
|||
|
|
procedure Sort; virtual;
|
|||
|
|
|
|||
|
|
property Duplicates: TDuplicates read FDuplicates write FDuplicates;
|
|||
|
|
property Sorted: Boolean read FSorted write SetSorted;
|
|||
|
|
property OnChange: TNotifyEvent read FOnChange write FOnChange;
|
|||
|
|
property OnChanging: TNotifyEvent read FOnChanging write FOnChanging;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// result type for number retrieval functions
|
|||
|
|
TUcNumber = record
|
|||
|
|
Numerator,
|
|||
|
|
Denominator: Integer;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TFontCharSet = 0..255;
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
ReplacementCharacter: UCS4 = $0000FFFD;
|
|||
|
|
MaximumUCS2: UCS4 = $0000FFFF;
|
|||
|
|
MaximumUTF16: UCS4 = $0010FFFF;
|
|||
|
|
MaximumUCS4: UCS4 = $7FFFFFFF;
|
|||
|
|
|
|||
|
|
SurrogateHighStart: UCS4 = $D800;
|
|||
|
|
SurrogateHighEnd: UCS4 = $DBFF;
|
|||
|
|
SurrogateLowStart: UCS4 = $DC00;
|
|||
|
|
SurrogateLowEnd: UCS4 = $DFFF;
|
|||
|
|
|
|||
|
|
// functions involving null-terminated strings
|
|||
|
|
// NOTE: PWideChars as well as WideStrings are NOT managed by reference counting under Win32.
|
|||
|
|
// In Kylix this is different. WideStrings are reference counted there, just like ANSI strings.
|
|||
|
|
function StrLenW(Str: PWideChar): Cardinal;
|
|||
|
|
function StrEndW(Str: PWideChar): PWideChar;
|
|||
|
|
function StrMoveW(Dest, Source: PWideChar; Count: Cardinal): PWideChar;
|
|||
|
|
function StrCopyW(Dest, Source: PWideChar): PWideChar;
|
|||
|
|
function StrECopyW(Dest, Source: PWideChar): PWideChar;
|
|||
|
|
function StrLCopyW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar;
|
|||
|
|
function StrPCopyWW(Dest: PWideChar; const Source: WideString): PWideChar; overload;
|
|||
|
|
function StrPCopyW(Dest: PWideChar; const Source: string): PWideChar;
|
|||
|
|
function StrPLCopyWW(Dest: PWideChar; const Source: WideString; MaxLen: Cardinal): PWideChar;
|
|||
|
|
function StrPLCopyW(Dest: PWideChar; const Source: string; MaxLen: Cardinal): PWideChar;
|
|||
|
|
function StrCatW(Dest: PWideChar; const Source: PWideChar): PWideChar;
|
|||
|
|
function StrLCatW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar;
|
|||
|
|
function StrCompW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
function StrICompW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
function StrLCompW(const Str1, Str2: PWideChar; MaxLen: Cardinal): Integer;
|
|||
|
|
function StrLICompW(const Str1, Str2: PWideChar; MaxLen: Cardinal): Integer;
|
|||
|
|
function StrNScanW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
function StrRNScanW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
function StrScanW(Str: PWideChar; Chr: WideChar): PWideChar; overload;
|
|||
|
|
function StrScanW(Str: PWideChar; Chr: WideChar; StrLen: Cardinal): PWideChar; overload;
|
|||
|
|
function StrRScanW(Str: PWideChar; Chr: WideChar): PWideChar;
|
|||
|
|
function StrPosW(Str, SubStr: PWideChar): PWideChar;
|
|||
|
|
function StrAllocW(WideSize: Cardinal): PWideChar;
|
|||
|
|
function StrBufSizeW(const Str: PWideChar): Cardinal;
|
|||
|
|
function StrNewW(const Str: PWideChar): PWideChar; overload;
|
|||
|
|
function StrNewW(const Str: WideString): PWideChar; overload;
|
|||
|
|
procedure StrDisposeW(Str: PWideChar);
|
|||
|
|
procedure StrDisposeAndNilW(var Str: PWideChar);
|
|||
|
|
procedure StrSwapByteOrder(Str: PWideChar);
|
|||
|
|
|
|||
|
|
// functions involving Delphi wide strings
|
|||
|
|
function WideAdjustLineBreaks(const S: WideString): WideString;
|
|||
|
|
function WideCharPos(const S: WideString; const Ch: WideChar; const Index: Integer): Integer; //az
|
|||
|
|
function WideCompose(const S: WideString): WideString;
|
|||
|
|
function WideDecompose(const S: WideString; Compatible: Boolean): WideString;
|
|||
|
|
function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString;
|
|||
|
|
function WideQuotedStr(const S: WideString; Quote: WideChar): WideString;
|
|||
|
|
function WideStringOfChar(C: WideChar; Count: Cardinal): WideString;
|
|||
|
|
function WideCaseFolding(C: WideChar): WideString; overload;
|
|||
|
|
function WideCaseFolding(const S: WideString): WideString; overload;
|
|||
|
|
function WideLowerCase(C: WideChar): WideString; overload;
|
|||
|
|
function WideLowerCase(const S: WideString): WideString; overload;
|
|||
|
|
function WideNormalize(const S: WideString; Form: TNormalizationForm): WideString;
|
|||
|
|
function WideSameText(const Str1, Str2: WideString): Boolean;
|
|||
|
|
function WideTitleCase(C: WideChar): WideString; overload;
|
|||
|
|
function WideTitleCase(const S: WideString): WideString; overload;
|
|||
|
|
function WideTrim(const S: WideString): WideString;
|
|||
|
|
function WideTrimLeft(const S: WideString): WideString;
|
|||
|
|
function WideTrimRight(const S: WideString): WideString;
|
|||
|
|
function WideUpperCase(C: WideChar): WideString; overload;
|
|||
|
|
function WideUpperCase(const S: WideString): WideString; overload;
|
|||
|
|
|
|||
|
|
// Low level character routines
|
|||
|
|
function UnicodeNumberLookup(Code: UCS4; var Number: TUcNumber): Boolean;
|
|||
|
|
function UnicodeComposePair(First, Second: UCS4; var Composite: UCS4): Boolean;
|
|||
|
|
function UnicodeCaseFold(Code: UCS4): TUCS4Array;
|
|||
|
|
function UnicodeToUpper(Code: UCS4): TUCS4Array;
|
|||
|
|
function UnicodeToLower(Code: UCS4): TUCS4Array;
|
|||
|
|
function UnicodeToTitle(Code: UCS4): TUCS4Array;
|
|||
|
|
|
|||
|
|
// Character test routines
|
|||
|
|
function UnicodeIsAlpha(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsDigit(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsAlphaNum(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsCased(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsControl(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsSpace(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsWhiteSpace(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsBlank(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsPunctuation(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsGraph(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsPrintable(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsUpper(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsLower(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsTitle(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsHexDigit(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsIsoControl(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsFormatControl(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsSymbol(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsNumber(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsNonSpacing(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsOpenPunctuation(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsClosePunctuation(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsInitialPunctuation(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsFinalPunctuation(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsComposed(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsQuotationMark(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsSymmetric(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsMirroring(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsNonBreaking(C: UCS4): Boolean;
|
|||
|
|
|
|||
|
|
// Directionality functions
|
|||
|
|
function UnicodeIsRightToLeft(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsLeftToRight(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsStrong(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsWeak(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsNeutral(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsSeparator(C: UCS4): Boolean;
|
|||
|
|
|
|||
|
|
// Other character test functions
|
|||
|
|
function UnicodeIsMark(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsModifier(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsLetterNumber(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsConnectionPunctuation(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsDash(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsMath(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsCurrency(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsModifierSymbol(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsNonSpacingMark(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsSpacingMark(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsEnclosing(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsPrivate(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsSurrogate(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsLineSeparator(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsParagraphSeparator(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsIdentifierStart(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsIdentifierPart(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsDefined(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsUndefined(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsHan(C: UCS4): Boolean;
|
|||
|
|
function UnicodeIsHangul(C: UCS4): Boolean;
|
|||
|
|
|
|||
|
|
// Utility functions
|
|||
|
|
function CharSetFromLocale(Language: LCID): TFontCharSet;
|
|||
|
|
function GetCharSetFromLocale(Language: LCID; out FontCharSet: TFontCharSet): Boolean;
|
|||
|
|
function CodePageFromLocale(Language: LCID): Integer;
|
|||
|
|
function CodeBlockName(const CB: TUnicodeBlock): string;
|
|||
|
|
function CodeBlockRange(const CB: TUnicodeBlock): TUnicodeBlockRange;
|
|||
|
|
function CodeBlockFromChar(const C: UCS4): TUnicodeBlock;
|
|||
|
|
function KeyboardCodePage: Word;
|
|||
|
|
function KeyUnicode(C: Char): WideChar;
|
|||
|
|
function StringToWideStringEx(const S: string; CodePage: Word): WideString;
|
|||
|
|
function TranslateString(const S: string; CP1, CP2: Word): string;
|
|||
|
|
function WideStringToStringEx(const WS: WideString; CodePage: Word): string;
|
|||
|
|
|
|||
|
|
// WideString conversion routines
|
|||
|
|
procedure ExpandANSIString(const Source: PChar; Target: PWideChar; Count: Cardinal);
|
|||
|
|
function WideStringToUTF8(S: WideString): AnsiString;
|
|||
|
|
function UTF8ToWideString(S: AnsiString): WideString;
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
TCompareFunc = function (const W1, W2: WideString; Locale: LCID): Integer;
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
WideCompareText: TCompareFunc;
|
|||
|
|
|
|||
|
|
{$ENDIF SUPPORTS_WIDESTRING}
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
EJclUnicodeError = class(EJclError);
|
|||
|
|
|
|||
|
|
implementation
|
|||
|
|
|
|||
|
|
{$IFDEF SUPPORTS_WIDESTRING}
|
|||
|
|
|
|||
|
|
// Unicode data for case mapping, decomposition, numbers etc. This data is
|
|||
|
|
// loaded on demand which means only those parts will be put in memory which are
|
|||
|
|
// needed by one of the lookup functions.
|
|||
|
|
// Note: There is a little tool called UDExtract which creates a resouce script from
|
|||
|
|
// the Unicode database file which can be compiled to the needed res file.
|
|||
|
|
// This tool, including its source code, can be downloaded from www.lischke-online.de/Unicode.html.
|
|||
|
|
|
|||
|
|
{$R JclUnicode.res}
|
|||
|
|
|
|||
|
|
uses
|
|||
|
|
{$IFDEF HAS_UNIT_RTLCONSTS}
|
|||
|
|
RtlConsts,
|
|||
|
|
{$ELSE}
|
|||
|
|
{$IFNDEF FPC}
|
|||
|
|
Consts,
|
|||
|
|
{$ENDIF ~FPC}
|
|||
|
|
{$ENDIF HAS_UNIT_RTLCONSTS}
|
|||
|
|
SysUtils,
|
|||
|
|
JclResources, JclSynch;
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
{$IFDEF FPC} // declarations from unit [Rtl]Consts
|
|||
|
|
SDuplicateString = 'String list does not allow duplicates';
|
|||
|
|
SListIndexError = 'List index out of bounds (%d)';
|
|||
|
|
SSortedListError = 'Operation not allowed on sorted string list';
|
|||
|
|
{$ENDIF FPC}
|
|||
|
|
// some predefined sets to shorten parameter lists below and ease repeative usage
|
|||
|
|
ClassLetter = [ccLetterUppercase, ccLetterLowercase, ccLetterTitlecase, ccLetterModifier, ccLetterOther];
|
|||
|
|
ClassSpace = [ccSeparatorSpace, ccSpaceOther];
|
|||
|
|
ClassPunctuation = [ccPunctuationConnector, ccPunctuationDash, ccPunctuationOpen, ccPunctuationClose,
|
|||
|
|
ccPunctuationOther, ccPunctuationInitialQuote, ccPunctuationFinalQuote];
|
|||
|
|
ClassMark = [ccMarkNonSpacing, ccMarkSpacingCombining, ccMarkEnclosing];
|
|||
|
|
ClassNumber = [ccNumberDecimalDigit, ccNumberLetter, ccNumberOther];
|
|||
|
|
ClassSymbol = [ccSymbolMath, ccSymbolCurrency, ccSymbolModifier, ccSymbolOther];
|
|||
|
|
ClassEuropeanNumber = [ccEuropeanNumber, ccEuropeanNumberSeparator, ccEuropeanNumberTerminator];
|
|||
|
|
|
|||
|
|
// used to negate a set of categories
|
|||
|
|
ClassAll = [Low(TCharacterCategory)..High(TCharacterCategory)];
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
// As the global data can be accessed by several threads it should be guarded
|
|||
|
|
// while the data is loaded.
|
|||
|
|
LoadInProgress: TJclCriticalSection;
|
|||
|
|
|
|||
|
|
//----------------- support for character categories -----------------------------------------------
|
|||
|
|
|
|||
|
|
// Character category data is quite a large block since every defined character in Unicode is assigned at least
|
|||
|
|
// one category. Because of this we cannot use a sparse matrix to provide quick access as implemented for
|
|||
|
|
// e.g. composition data.
|
|||
|
|
// The approach used here is based on the fact that an application seldomly uses all characters defined in Unicode
|
|||
|
|
// simultanously. In fact the opposite is true. Most application will use either Western Europe or Arabic or
|
|||
|
|
// Far East character data, but very rarely all together. Based on this fact is the implementation of virtual
|
|||
|
|
// memory using the systems paging file (aka file mapping) to load only into virtual memory what is used currently.
|
|||
|
|
// The implementation is not yet finished and needs a lot of improvements yet.
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
// start and stop of a range of code points
|
|||
|
|
TRange = record
|
|||
|
|
Start,
|
|||
|
|
Stop: Cardinal;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
TRangeArray = array of TRange;
|
|||
|
|
TCategoriesArray = array of TCharacterCategories;
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
// character categories, stored in the system's swap file and mapped on demand
|
|||
|
|
CategoriesLoaded: Boolean;
|
|||
|
|
Categories: array [Byte] of TCategoriesArray;
|
|||
|
|
|
|||
|
|
procedure LoadCharacterCategories;
|
|||
|
|
// Loads the character categories data (as saved by the Unicode database extractor, see also
|
|||
|
|
// the comments about JclUnicode.res above).
|
|||
|
|
var
|
|||
|
|
Size: Integer;
|
|||
|
|
Stream: TResourceStream;
|
|||
|
|
Category: TCharacterCategory;
|
|||
|
|
Buffer: TRangeArray;
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
J, K: Integer;
|
|||
|
|
begin
|
|||
|
|
// Data already loaded?
|
|||
|
|
if not CategoriesLoaded then
|
|||
|
|
begin
|
|||
|
|
// make sure no other code is currently modifying the global data area
|
|||
|
|
LoadInProgress.Enter;
|
|||
|
|
try
|
|||
|
|
CategoriesLoaded := True;
|
|||
|
|
Stream := TResourceStream.Create(HInstance, 'CATEGORIES', 'UNICODEDATA');
|
|||
|
|
try
|
|||
|
|
while Stream.Position < Stream.Size do
|
|||
|
|
begin
|
|||
|
|
// a) read which category is current in the stream
|
|||
|
|
Stream.ReadBuffer(Category, 1);
|
|||
|
|
// b) read the size of the ranges and the ranges themself
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(Buffer, Size);
|
|||
|
|
Stream.ReadBuffer(Buffer[0], Size * SizeOf(TRange));
|
|||
|
|
|
|||
|
|
// c) go through every range and add the current category to each code point
|
|||
|
|
for J := 0 to Size - 1 do
|
|||
|
|
for K := Buffer[J].Start to Buffer[J].Stop do
|
|||
|
|
begin
|
|||
|
|
if K > $FFFF then
|
|||
|
|
Break;
|
|||
|
|
|
|||
|
|
First := (K shr 8) and $FF;
|
|||
|
|
Second := K and $FF;
|
|||
|
|
// add second step array if not yet done
|
|||
|
|
if Categories[First] = nil then
|
|||
|
|
SetLength(Categories[First], 256);
|
|||
|
|
Include(Categories[First, Second], Category);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
LoadInProgress.Leave;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function CategoryLookup(Code: Cardinal; Cats: TCharacterCategories): Boolean; overload;
|
|||
|
|
// determines whether the Code is in the given category
|
|||
|
|
var
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
begin
|
|||
|
|
// load property data if not already done
|
|||
|
|
if not CategoriesLoaded then
|
|||
|
|
LoadCharacterCategories;
|
|||
|
|
|
|||
|
|
First := (Code shr 8) and $FF;
|
|||
|
|
Second := Code and $FF;
|
|||
|
|
if Categories[First] <> nil then
|
|||
|
|
Result := Categories[First, Second] * Cats <> []
|
|||
|
|
else
|
|||
|
|
Result := False;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- support for case mapping -------------------------------------------------------
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
TCase = array [0..3] of TUCS4Array; // mapping for case fold, lower, title and upper in this order
|
|||
|
|
TCaseArray = array of TCase;
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
// An array for all case mappings (including 1 to many casing if saved by the extraction program).
|
|||
|
|
// The organization is a sparse, two stage matrix.
|
|||
|
|
// SingletonMapping is to quickly return a single default mapping.
|
|||
|
|
CaseDataLoaded: Boolean;
|
|||
|
|
CaseMapping: array [Byte] of TCaseArray;
|
|||
|
|
SingletonMapping: TUCS4Array;
|
|||
|
|
|
|||
|
|
procedure LoadCaseMappingData;
|
|||
|
|
var
|
|||
|
|
Stream: TResourceStream;
|
|||
|
|
I, Code,
|
|||
|
|
Size: Cardinal;
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
begin
|
|||
|
|
if not CaseDataLoaded then
|
|||
|
|
begin
|
|||
|
|
// make sure no other code is currently modifying the global data area
|
|||
|
|
LoadInProgress.Enter;
|
|||
|
|
|
|||
|
|
try
|
|||
|
|
SetLength(SingletonMapping, 1);
|
|||
|
|
CaseDataLoaded := True;
|
|||
|
|
Stream := TResourceStream.Create(HInstance, 'CASE', 'UNICODEDATA');
|
|||
|
|
try
|
|||
|
|
// the first entry in the stream is the number of entries in the case mapping table
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
for I := 0 to Size - 1 do
|
|||
|
|
begin
|
|||
|
|
// a) read actual code point
|
|||
|
|
Stream.ReadBuffer(Code, 4);
|
|||
|
|
|
|||
|
|
Assert(Code < $10000, LoadResString(@RsCasedUnicodeChar));
|
|||
|
|
// if there is no high byte entry in the first stage table then create one
|
|||
|
|
First := (Code shr 8) and $FF;
|
|||
|
|
Second := Code and $FF;
|
|||
|
|
if CaseMapping[First] = nil then
|
|||
|
|
SetLength(CaseMapping[First], 256);
|
|||
|
|
|
|||
|
|
// b) read fold case array
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(CaseMapping[First, Second, 0], Size);
|
|||
|
|
Stream.ReadBuffer(CaseMapping[First, Second, 0, 0], Size * SizeOf(UCS4));
|
|||
|
|
end;
|
|||
|
|
// c) read lower case array
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(CaseMapping[First, Second, 1], Size);
|
|||
|
|
Stream.ReadBuffer(CaseMapping[First, Second, 1, 0], Size * SizeOf(UCS4));
|
|||
|
|
end;
|
|||
|
|
// d) read title case array
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(CaseMapping[First, Second, 2], Size);
|
|||
|
|
Stream.ReadBuffer(CaseMapping[First, Second, 2, 0], Size * SizeOf(UCS4));
|
|||
|
|
end;
|
|||
|
|
// e) read upper case array
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(CaseMapping[First, Second, 3], Size);
|
|||
|
|
Stream.ReadBuffer(CaseMapping[First, Second, 3, 0], Size * SizeOf(UCS4));
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
finally
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
LoadInProgress.Leave;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function CaseLookup(Code: Cardinal; CaseType: Cardinal): TUCS4Array;
|
|||
|
|
// Performs a lookup of the given code and returns its case mapping if found.
|
|||
|
|
// CaseType must be 0 for case folding, 1 for lower case, 2 for title case and 3 for upper case, respectively.
|
|||
|
|
// If Code could not be found (or there is no case mapping) then the result is a mapping of length 1 with the
|
|||
|
|
// code itself. Otherwise an array of code points is returned which represent the mapping.
|
|||
|
|
var
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
begin
|
|||
|
|
// load case mapping data if not already done
|
|||
|
|
if not CaseDataLoaded then
|
|||
|
|
LoadCaseMappingData;
|
|||
|
|
|
|||
|
|
First := (Code shr 8) and $FF;
|
|||
|
|
Second := Code and $FF;
|
|||
|
|
// Check first stage table whether there is a mapping for a particular block and
|
|||
|
|
// (if so) then whether there is a mapping or not.
|
|||
|
|
if (CaseMapping[First] = nil) or (CaseMapping[First, Second, CaseType] = nil) then
|
|||
|
|
begin
|
|||
|
|
SingletonMapping[0] := Code;
|
|||
|
|
Result := SingletonMapping;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Result := CaseMapping[First, Second, CaseType];
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeCaseFold(Code: UCS4): TUCS4Array;
|
|||
|
|
// This function returnes an array of special case fold mappings if there is one defined for the given
|
|||
|
|
// code, otherwise the lower case will be returned. This all applies only to cased code points.
|
|||
|
|
// Uncased code points are returned unchanged.
|
|||
|
|
begin
|
|||
|
|
Result := CaseLookup(Code, 0);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeToUpper(Code: UCS4): TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Result := CaseLookup(Code, 3);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeToLower(Code: UCS4): TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Result := CaseLookup(Code, 1);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeToTitle(Code: UCS4): TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Result := CaseLookup(Code, 2);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- support for decomposition ------------------------------------------------------
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
// constants for hangul composition and hangul-to-jamo decomposition
|
|||
|
|
SBase = $AC00; // hangul syllables start code point
|
|||
|
|
LBase = $1100; // leading syllable
|
|||
|
|
VBase = $1161;
|
|||
|
|
TBase = $11A7; // trailing syllable
|
|||
|
|
LCount = 19;
|
|||
|
|
VCount = 21;
|
|||
|
|
TCount = 28;
|
|||
|
|
NCount = VCount * TCount; // 588
|
|||
|
|
SCount = LCount * NCount; // 11172
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
TDecompositions = array of TUCS4Array;
|
|||
|
|
TDecompositionsArray = array [Byte] of TDecompositions;
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
// list of decompositions, organized (again) as two stage matrix
|
|||
|
|
// Note: there are two tables, one for canonical decompositions and the other one
|
|||
|
|
// for compatibility decompositions.
|
|||
|
|
DecompositionsLoaded: Boolean;
|
|||
|
|
CanonicalDecompositions,
|
|||
|
|
CompatibleDecompositions: TDecompositionsArray;
|
|||
|
|
|
|||
|
|
procedure LoadDecompositionData;
|
|||
|
|
var
|
|||
|
|
Stream: TResourceStream;
|
|||
|
|
I, Code,
|
|||
|
|
Size: Cardinal;
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
begin
|
|||
|
|
if not DecompositionsLoaded then
|
|||
|
|
begin
|
|||
|
|
// make sure no other code is currently modifying the global data area
|
|||
|
|
LoadInProgress.Enter;
|
|||
|
|
|
|||
|
|
try
|
|||
|
|
DecompositionsLoaded := True;
|
|||
|
|
Stream := TResourceStream.Create(HInstance, 'DECOMPOSITION', 'UNICODEDATA');
|
|||
|
|
try
|
|||
|
|
// determine how many decomposition entries we have
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
for I := 0 to Size - 1 do
|
|||
|
|
begin
|
|||
|
|
Stream.ReadBuffer(Code, 4);
|
|||
|
|
|
|||
|
|
Assert((Code and not $40000000) < $10000, LoadResString(@RsDecomposedUnicodeChar));
|
|||
|
|
|
|||
|
|
// if there is no high byte entry in the first stage table then create one
|
|||
|
|
First := (Code shr 8) and $FF;
|
|||
|
|
Second := Code and $FF;
|
|||
|
|
|
|||
|
|
// insert into the correct table depending on bit 30
|
|||
|
|
// (if set then it is a compatibility decomposition)
|
|||
|
|
if Code and $40000000 <> 0 then
|
|||
|
|
begin
|
|||
|
|
if CompatibleDecompositions[First] = nil then
|
|||
|
|
SetLength(CompatibleDecompositions[First], 256);
|
|||
|
|
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(CompatibleDecompositions[First, Second], Size);
|
|||
|
|
Stream.ReadBuffer(CompatibleDecompositions[First, Second, 0], Size * SizeOf(UCS4));
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if CanonicalDecompositions[First] = nil then
|
|||
|
|
SetLength(CanonicalDecompositions[First], 256);
|
|||
|
|
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(CanonicalDecompositions[First, Second], Size);
|
|||
|
|
Stream.ReadBuffer(CanonicalDecompositions[First, Second, 0], Size * SizeOf(UCS4));
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
LoadInProgress.Leave;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeDecomposeHangul(Code: UCS4): TUCS4Array;
|
|||
|
|
// algorithmically decomposes hangul character
|
|||
|
|
var
|
|||
|
|
Rest: Integer;
|
|||
|
|
begin
|
|||
|
|
Dec(Code, SBase);
|
|||
|
|
Rest := Code mod TCount;
|
|||
|
|
if Rest = 0 then
|
|||
|
|
SetLength(Result, 2)
|
|||
|
|
else
|
|||
|
|
SetLength(Result, 3);
|
|||
|
|
Result[0] := LBase + (Code div NCount);
|
|||
|
|
Result[1] := VBase + ((Code mod NCount) div TCount);
|
|||
|
|
if Rest <> 0 then
|
|||
|
|
Result[2] := TBase + Rest;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeDecompose(Code: UCS4; Compatible: Boolean): TUCS4Array;
|
|||
|
|
var
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
begin
|
|||
|
|
// load decomposition data if not already done
|
|||
|
|
if not DecompositionsLoaded then
|
|||
|
|
LoadDecompositionData;
|
|||
|
|
|
|||
|
|
Result := nil;
|
|||
|
|
|
|||
|
|
// if the code is hangul then decomposition is algorithmically
|
|||
|
|
if UnicodeIsHangul(Code) then
|
|||
|
|
Result := UnicodeDecomposeHangul(Code)
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
First := (Code shr 8) and $FF;
|
|||
|
|
Second := Code and $FF;
|
|||
|
|
if Compatible then
|
|||
|
|
begin
|
|||
|
|
// Check first stage table whether there is a particular block and
|
|||
|
|
// (if so) then whether there is a decomposition or not.
|
|||
|
|
if (CompatibleDecompositions[First] = nil) or (CompatibleDecompositions[First, Second] = nil) then
|
|||
|
|
begin
|
|||
|
|
// if there is no compatibility decompositions try canonical
|
|||
|
|
if (CanonicalDecompositions[First] = nil) or (CanonicalDecompositions[First, Second] = nil) then
|
|||
|
|
Result := nil
|
|||
|
|
else
|
|||
|
|
Result := CanonicalDecompositions[First, Second];
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Result := CompatibleDecompositions[First, Second];
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (CanonicalDecompositions[First] = nil) or (CanonicalDecompositions[First, Second] = nil) then
|
|||
|
|
Result := nil
|
|||
|
|
else
|
|||
|
|
Result := CanonicalDecompositions[First, Second];
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- support for combining classes --------------------------------------------------
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
TClassArray = array of Byte;
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
// canonical combining classes, again as two stage matrix
|
|||
|
|
CCCsLoaded: Boolean;
|
|||
|
|
CCCs: array [Byte] of TClassArray;
|
|||
|
|
|
|||
|
|
procedure LoadCombiningClassData;
|
|||
|
|
var
|
|||
|
|
Stream: TResourceStream;
|
|||
|
|
I, J, K,
|
|||
|
|
Size: Cardinal;
|
|||
|
|
Buffer: TRangeArray;
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
begin
|
|||
|
|
// make sure no other code is currently modifying the global data area
|
|||
|
|
LoadInProgress.Enter;
|
|||
|
|
|
|||
|
|
try
|
|||
|
|
if not CCCsLoaded then
|
|||
|
|
begin
|
|||
|
|
CCCsLoaded := True;
|
|||
|
|
Stream := TResourceStream.Create(HInstance, 'COMBINING', 'UNICODEDATA');
|
|||
|
|
try
|
|||
|
|
while Stream.Position < Stream.Size do
|
|||
|
|
begin
|
|||
|
|
// a) determine which class is stored here
|
|||
|
|
Stream.ReadBuffer(I, 4);
|
|||
|
|
// b) determine how many ranges are assigned to this class
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
// c) read start and stop code of each range
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(Buffer, Size);
|
|||
|
|
Stream.ReadBuffer(Buffer[0], Size * SizeOf(TRange));
|
|||
|
|
|
|||
|
|
// d) put this class in every of the code points just loaded
|
|||
|
|
for J := 0 to Size - 1 do
|
|||
|
|
for K := Buffer[J].Start to Buffer[J].Stop do
|
|||
|
|
begin
|
|||
|
|
Assert(K < $10000, LoadResString(@RsCombiningClassUnicodeChar));
|
|||
|
|
|
|||
|
|
First := (K shr 8) and $FF;
|
|||
|
|
Second := K and $FF;
|
|||
|
|
// add second step array if not yet done
|
|||
|
|
if CCCs[First] = nil then
|
|||
|
|
SetLength(CCCs[First], 256);
|
|||
|
|
CCCs[First, Second] := I;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
LoadInProgress.Leave;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function CanonicalCombiningClass(Code: Cardinal): Cardinal;
|
|||
|
|
var
|
|||
|
|
First,
|
|||
|
|
Second: Byte;
|
|||
|
|
begin
|
|||
|
|
// load combining class data if not already done
|
|||
|
|
if not CCCsLoaded then
|
|||
|
|
LoadCombiningClassData;
|
|||
|
|
|
|||
|
|
First := (Code shr 8) and $FF;
|
|||
|
|
Second := Code and $FF;
|
|||
|
|
if CCCs[First] <> nil then
|
|||
|
|
Result := CCCs[First, Second]
|
|||
|
|
else
|
|||
|
|
Result := 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- support for numeric values -----------------------------------------------------
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
// structures for handling numbers
|
|||
|
|
TCodeIndex = record
|
|||
|
|
Code,
|
|||
|
|
Index: Cardinal;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
// array to hold the number equivalents for specific codes
|
|||
|
|
NumberCodes: array of TCodeIndex;
|
|||
|
|
// array of numbers used in NumberCodes
|
|||
|
|
Numbers: array of TUcNumber;
|
|||
|
|
|
|||
|
|
procedure LoadNumberData;
|
|||
|
|
var
|
|||
|
|
Stream: TResourceStream;
|
|||
|
|
Size: Cardinal;
|
|||
|
|
begin
|
|||
|
|
// make sure no other code is currently modifying the global data area
|
|||
|
|
LoadInProgress.Enter;
|
|||
|
|
|
|||
|
|
try
|
|||
|
|
if NumberCodes = nil then
|
|||
|
|
begin
|
|||
|
|
Stream := TResourceStream.Create(HInstance, 'NUMBERS', 'UNICODEDATA');
|
|||
|
|
// Numbers are special (compared to other Unicode data) as they utilize two
|
|||
|
|
// arrays, one containing all used numbers (in nominator-denominator format) and
|
|||
|
|
// another one which maps a code point to one of the numbers in the first array.
|
|||
|
|
|
|||
|
|
// a) determine size of numbers array
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
SetLength(Numbers, Size);
|
|||
|
|
// b) read numbers data
|
|||
|
|
Stream.ReadBuffer(Numbers[0], Size * SizeOf(TUcNumber));
|
|||
|
|
// c) determine size of index array
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
SetLength(NumberCodes, Size);
|
|||
|
|
// d) read index data
|
|||
|
|
Stream.ReadBuffer(NumberCodes[0], Size * SizeOf(TCodeIndex));
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
LoadInProgress.Leave;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeNumberLookup(Code: UCS4; var Number: TUcNumber): Boolean;
|
|||
|
|
// Searches for the given code and returns its number equivalent (if there is one).
|
|||
|
|
// Typical cases are: '1/6' (U+2159), '3/8' (U+215C), 'XII' (U+216B) etc.
|
|||
|
|
// Result is set to True if the code could be found.
|
|||
|
|
var
|
|||
|
|
L, R, M: Integer;
|
|||
|
|
begin
|
|||
|
|
// load number data if not already done
|
|||
|
|
if NumberCodes = nil then
|
|||
|
|
LoadNumberData;
|
|||
|
|
|
|||
|
|
Result := False;
|
|||
|
|
L := 0;
|
|||
|
|
R := High(NumberCodes);
|
|||
|
|
while L <= R do
|
|||
|
|
begin
|
|||
|
|
M := (L + R) shr 1;
|
|||
|
|
if Code > NumberCodes[M].Code then
|
|||
|
|
L := M + 1
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if Code < NumberCodes[M].Code then
|
|||
|
|
R := M - 1
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Number := Numbers[NumberCodes[M].Index];
|
|||
|
|
Result := True;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- support for composition --------------------------------------------------------
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
// maps between a pair of code points to a composite code point
|
|||
|
|
// Note: the source pair is packed into one 4 byte value to speed up search.
|
|||
|
|
TCompositionPair = record
|
|||
|
|
Code: Cardinal;
|
|||
|
|
Composition: UCS4;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
var
|
|||
|
|
// list of composition mappings
|
|||
|
|
Compositions: array of TCompositionPair;
|
|||
|
|
|
|||
|
|
procedure LoadCompositionData;
|
|||
|
|
var
|
|||
|
|
Stream: TResourceStream;
|
|||
|
|
Size: Cardinal;
|
|||
|
|
begin
|
|||
|
|
// make sure no other code is currently modifying the global data area
|
|||
|
|
LoadInProgress.Enter;
|
|||
|
|
|
|||
|
|
try
|
|||
|
|
if Compositions = nil then
|
|||
|
|
begin
|
|||
|
|
Stream := TResourceStream.Create(HInstance, 'COMPOSITION', 'UNICODEDATA');
|
|||
|
|
// a) determine size of compositions array
|
|||
|
|
Stream.ReadBuffer(Size, 4);
|
|||
|
|
SetLength(Compositions, Size);
|
|||
|
|
// b) read data
|
|||
|
|
Stream.ReadBuffer(Compositions[0], Size * SizeOf(TCompositionPair));
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
LoadInProgress.Leave;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeComposePair(First, Second: UCS4; var Composite: UCS4): Boolean;
|
|||
|
|
// Maps the sequence of First and Second to a composite.
|
|||
|
|
// Result is True if there was a mapping otherwise it is False.
|
|||
|
|
var
|
|||
|
|
L, R, M, C: Integer;
|
|||
|
|
Pair: Integer;
|
|||
|
|
begin
|
|||
|
|
if Compositions = nil then
|
|||
|
|
LoadCompositionData;
|
|||
|
|
|
|||
|
|
Result := False;
|
|||
|
|
L := 0;
|
|||
|
|
R := High(Compositions);
|
|||
|
|
Pair := Integer((First shl 16) or Word(Second));
|
|||
|
|
while L <= R do
|
|||
|
|
begin
|
|||
|
|
M := (L + R) shr 1;
|
|||
|
|
C := Integer(Compositions[M].Code) - Pair;
|
|||
|
|
if C < 0 then
|
|||
|
|
L := M + 1
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
R := M - 1;
|
|||
|
|
if C = 0 then
|
|||
|
|
begin
|
|||
|
|
Result := True;
|
|||
|
|
L := M;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
if Result then
|
|||
|
|
Composite := Compositions[L].Composition;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//=== { TSearchEngine } ======================================================
|
|||
|
|
|
|||
|
|
constructor TSearchEngine.Create(AOwner: TWideStrings);
|
|||
|
|
begin
|
|||
|
|
FOwner := AOwner;
|
|||
|
|
FResults := TList.Create;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
destructor TSearchEngine.Destroy;
|
|||
|
|
begin
|
|||
|
|
Clear;
|
|||
|
|
FResults.Free;
|
|||
|
|
inherited Destroy;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TSearchEngine.AddResult(Start, Stop: Cardinal);
|
|||
|
|
begin
|
|||
|
|
FResults.Add(Pointer(Start));
|
|||
|
|
FResults.Add(Pointer(Stop));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TSearchEngine.Clear;
|
|||
|
|
begin
|
|||
|
|
ClearResults;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TSearchEngine.ClearResults;
|
|||
|
|
begin
|
|||
|
|
FResults.Clear;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TSearchEngine.DeleteResult(Index: Cardinal);
|
|||
|
|
// explicitly deletes a search result
|
|||
|
|
begin
|
|||
|
|
with FResults do
|
|||
|
|
begin
|
|||
|
|
// start index
|
|||
|
|
Delete(2 * Index);
|
|||
|
|
// stop index
|
|||
|
|
Delete(2 * Index);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TSearchEngine.GetCount: Integer;
|
|||
|
|
// returns the number of matches found
|
|||
|
|
begin
|
|||
|
|
Result := FResults.Count div 2;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TSearchEngine.GetResult(Index: Cardinal; var Start, Stop: Integer);
|
|||
|
|
// returns the start position of a match (end position can be determined by
|
|||
|
|
// adding the length of the pattern to the start position)
|
|||
|
|
begin
|
|||
|
|
Start := Cardinal(FResults[2 * Index]);
|
|||
|
|
Stop := Cardinal(FResults[2 * Index + 1]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- TUTBSearch ---------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
procedure TUTBMSearch.ClearPattern;
|
|||
|
|
begin
|
|||
|
|
FreeMem(FPattern);
|
|||
|
|
FPattern := nil;
|
|||
|
|
FFlags := [];
|
|||
|
|
FPatternUsed := 0;
|
|||
|
|
FPatternSize := 0;
|
|||
|
|
FPatternLength := 0;
|
|||
|
|
FreeMem(FSkipValues);
|
|||
|
|
FSkipValues := nil;
|
|||
|
|
FSkipsUsed := 0;
|
|||
|
|
FMD4 := 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TUTBMSearch.GetSkipValue(TextStart, TextEnd: PUCS2): Cardinal;
|
|||
|
|
// looks up the SkipValues value for a character
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
C1,
|
|||
|
|
C2: UCS4;
|
|||
|
|
Sp: PUTBMSkip;
|
|||
|
|
begin
|
|||
|
|
Result := 0;
|
|||
|
|
if Cardinal(TextStart) < Cardinal(TextEnd) then
|
|||
|
|
begin
|
|||
|
|
C1 := UCS4(TextStart^);
|
|||
|
|
if (TextStart + 1) < TextEnd then
|
|||
|
|
C2 := UCS4((TextStart + 1)^)
|
|||
|
|
else
|
|||
|
|
C2 := $FFFFFFFF;
|
|||
|
|
if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and
|
|||
|
|
(SurrogateLowStart <= C2) and (C2 <= $DDDD) then
|
|||
|
|
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
|
|||
|
|
|
|||
|
|
Sp := FSkipValues;
|
|||
|
|
for I := 0 to FSkipsUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if not (Boolean(C1 xor Sp.BMChar.UpCase) and
|
|||
|
|
Boolean(C1 xor Sp.BMChar.LoCase) and
|
|||
|
|
Boolean(C1 xor Sp.BMChar.TitleCase)) then
|
|||
|
|
begin
|
|||
|
|
if (TextEnd - TextStart) < Sp.SkipValues then
|
|||
|
|
Result := TextEnd - TextStart
|
|||
|
|
else
|
|||
|
|
Result := Sp.SkipValues;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
Inc(Sp);
|
|||
|
|
end;
|
|||
|
|
Result := FPatternLength;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TUTBMSearch.Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: Cardinal): Boolean;
|
|||
|
|
// Checks once whether the text at position Start (which points to the end of the
|
|||
|
|
// current text part to be matched) matches.
|
|||
|
|
// Note: If whole words only are allowed then the left and right border tests are
|
|||
|
|
// done here too. The keypoint for the right border is that the next character
|
|||
|
|
// after the search string is either the text end or a space character.
|
|||
|
|
// For the left side this is similar, but there is nothing like a string
|
|||
|
|
// start marker (like the string end marker #0).
|
|||
|
|
//
|
|||
|
|
// It seems not obvious, but we still can use the passed Text pointer to do
|
|||
|
|
// the left check. Although this pointer might not point to the real string
|
|||
|
|
// start (e.g. in TUTBMSearch.FindAll Text is incremented as needed) it is
|
|||
|
|
// still a valid check mark. The reason is that Text either points to the
|
|||
|
|
// real string start or a previous match (happend already, keep in mind the
|
|||
|
|
// search options do not change in the FindAll loop) and the character just
|
|||
|
|
// before Text is a space character.
|
|||
|
|
// This fact implies, though, that strings passed to Find (or FindFirst,
|
|||
|
|
// FindAll in TUTBMSearch) always really start at the given address. Although
|
|||
|
|
// this might not be the case in some circumstances (e.g. if you pass only
|
|||
|
|
// the selection from an editor) it is still assumed that a pattern matching
|
|||
|
|
// from the first position on (from the search string start) also matches
|
|||
|
|
// when whole words only are allowed.
|
|||
|
|
var
|
|||
|
|
CheckSpace: Boolean;
|
|||
|
|
C1, C2: UCS4;
|
|||
|
|
Count: Integer;
|
|||
|
|
Cp: PUTBMChar;
|
|||
|
|
begin
|
|||
|
|
// be pessimistic
|
|||
|
|
Result := False;
|
|||
|
|
|
|||
|
|
// set the potential match endpoint first
|
|||
|
|
MatchEnd := (Start - Text) + 1;
|
|||
|
|
|
|||
|
|
C1 := UCS4(Start^);
|
|||
|
|
if (Start + 1) < Stop then
|
|||
|
|
C2 := UCS4((Start + 1)^)
|
|||
|
|
else
|
|||
|
|
C2 := $FFFFFFFF;
|
|||
|
|
if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and
|
|||
|
|
(SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then
|
|||
|
|
begin
|
|||
|
|
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
|
|||
|
|
// Adjust the match end point to occur after the UTF-16 character.
|
|||
|
|
Inc(MatchEnd);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// check special cases
|
|||
|
|
if FPatternUsed = 1 then
|
|||
|
|
begin
|
|||
|
|
MatchStart := Start - Text;
|
|||
|
|
Result := True;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Early out if entire words need to be matched and the next character
|
|||
|
|
// in the search string is neither the string end nor a space character.
|
|||
|
|
if (sfWholeWordOnly in FFlags) and
|
|||
|
|
not ((Start + 1)^ = WideNull) and
|
|||
|
|
not UnicodeIsWhiteSpace(UCS4((Start + 1)^)) then
|
|||
|
|
Exit;
|
|||
|
|
|
|||
|
|
// compare backward
|
|||
|
|
Cp := FPattern;
|
|||
|
|
Inc(Cp, FPatternUsed - 1);
|
|||
|
|
|
|||
|
|
Count := FPatternLength;
|
|||
|
|
while (Start >= Text) and (Count > 0) do
|
|||
|
|
begin
|
|||
|
|
// ignore non-spacing characters if indicated
|
|||
|
|
if sfIgnoreNonSpacing in FFlags then
|
|||
|
|
begin
|
|||
|
|
while (Start > Text) and UnicodeIsNonSpacing(C1) do
|
|||
|
|
begin
|
|||
|
|
Dec(Start);
|
|||
|
|
C2 := UCS4(Start^);
|
|||
|
|
if (Start - 1) > Text then
|
|||
|
|
C1 := UCS4((Start - 1)^)
|
|||
|
|
else
|
|||
|
|
C1 := $FFFFFFFF;
|
|||
|
|
if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and
|
|||
|
|
(SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then
|
|||
|
|
begin
|
|||
|
|
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
|
|||
|
|
Dec(Start);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
C1 := C2;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// handle space compression if indicated
|
|||
|
|
if sfSpaceCompress in FFlags then
|
|||
|
|
begin
|
|||
|
|
CheckSpace := False;
|
|||
|
|
while (Start > Text) and (UnicodeIsWhiteSpace(C1) or UnicodeIsControl(C1)) do
|
|||
|
|
begin
|
|||
|
|
CheckSpace := UnicodeIsWhiteSpace(C1);
|
|||
|
|
Dec(Start);
|
|||
|
|
C2 := UCS4(Start^);
|
|||
|
|
if (Start - 1) > Text then
|
|||
|
|
C1 := UCS4((Start - 1)^)
|
|||
|
|
else
|
|||
|
|
C1 := $FFFFFFFF;
|
|||
|
|
if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and
|
|||
|
|
(SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then
|
|||
|
|
begin
|
|||
|
|
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
|
|||
|
|
Dec(Start);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
C1 := C2;
|
|||
|
|
end;
|
|||
|
|
// Handle things if space compression was indicated and one or
|
|||
|
|
// more member characters were found.
|
|||
|
|
if CheckSpace then
|
|||
|
|
begin
|
|||
|
|
if Cp.UpCase <> $20 then
|
|||
|
|
Exit;
|
|||
|
|
Dec(Cp);
|
|||
|
|
Dec(Count);
|
|||
|
|
// If Count is 0 at this place then the space character(s) was the first
|
|||
|
|
// in the pattern and we need to correct the start position.
|
|||
|
|
if Count = 0 then
|
|||
|
|
Inc(Start);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// handle the normal comparison cases
|
|||
|
|
if (Count > 0) and
|
|||
|
|
(Boolean(C1 xor Cp.UpCase) and
|
|||
|
|
Boolean(C1 xor Cp.LoCase) and
|
|||
|
|
Boolean(C1 xor Cp.TitleCase)) then
|
|||
|
|
Exit;
|
|||
|
|
|
|||
|
|
if C1 >= $10000 then
|
|||
|
|
Dec(Count, 2)
|
|||
|
|
else
|
|||
|
|
Dec(Count, 1);
|
|||
|
|
if Count > 0 then
|
|||
|
|
begin
|
|||
|
|
Dec(Cp);
|
|||
|
|
// get the next preceding character
|
|||
|
|
if Start > Text then
|
|||
|
|
begin
|
|||
|
|
Dec(Start);
|
|||
|
|
C2 := UCS4(Start^);
|
|||
|
|
if (Start - 1) > Text then
|
|||
|
|
C1 := UCS4((Start - 1)^)
|
|||
|
|
else
|
|||
|
|
C1 := $FFFFFFFF;
|
|||
|
|
if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and
|
|||
|
|
(SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then
|
|||
|
|
begin
|
|||
|
|
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
|
|||
|
|
Dec(Start);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
C1 := C2;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// So far the string matched. Now check its left border for a space character
|
|||
|
|
// if whole word only are allowed.
|
|||
|
|
if not (sfWholeWordOnly in FFlags) or
|
|||
|
|
(Start <= Text) or
|
|||
|
|
UnicodeIsWhiteSpace(UCS4((Start - 1)^)) then
|
|||
|
|
begin
|
|||
|
|
// set the match start position
|
|||
|
|
MatchStart := Start - Text;
|
|||
|
|
Result := True;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TUTBMSearch.Compile(Pattern: PUCS2; PatternLength: Integer; Flags: TSearchFlags);
|
|||
|
|
var
|
|||
|
|
HaveSpace: Boolean;
|
|||
|
|
I, J, K,
|
|||
|
|
SLen: Integer;
|
|||
|
|
Cp: PUTBMChar;
|
|||
|
|
Sp: PUTBMSkip;
|
|||
|
|
C1, C2,
|
|||
|
|
Sentinel: UCS4;
|
|||
|
|
begin
|
|||
|
|
if (Pattern <> nil) and (Pattern^ <> #0) and (PatternLength > 0) then
|
|||
|
|
begin
|
|||
|
|
// do some initialization
|
|||
|
|
FFlags := Flags;
|
|||
|
|
// extra skip flag
|
|||
|
|
FMD4 := 1;
|
|||
|
|
|
|||
|
|
Sentinel := 0;
|
|||
|
|
|
|||
|
|
// allocate more storage if necessary
|
|||
|
|
FPattern := AllocMem(SizeOf(TUTBMChar) * PatternLength);
|
|||
|
|
FSkipValues := AllocMem(SizeOf(TUTBMSkip) * PatternLength);
|
|||
|
|
FPatternSize := PatternLength;
|
|||
|
|
|
|||
|
|
// Preprocess the pattern to remove controls (if specified) and determine case.
|
|||
|
|
Cp := FPattern;
|
|||
|
|
I := 0;
|
|||
|
|
HaveSpace := False;
|
|||
|
|
while I < PatternLength do
|
|||
|
|
begin
|
|||
|
|
C1 := UCS4(Pattern[I]);
|
|||
|
|
if (I + 1) < PatternLength then
|
|||
|
|
C2 := UCS4(Pattern[I + 1])
|
|||
|
|
else
|
|||
|
|
C2 := $FFFFFFFF;
|
|||
|
|
if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and
|
|||
|
|
(SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then
|
|||
|
|
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
|
|||
|
|
|
|||
|
|
// Make sure the HaveSpace flag is turned off if the character is not an
|
|||
|
|
// appropriate one.
|
|||
|
|
if not UnicodeIsWhiteSpace(C1) then
|
|||
|
|
HaveSpace := False;
|
|||
|
|
|
|||
|
|
// If non-spacing characters should be ignored, do it here.
|
|||
|
|
if (sfIgnoreNonSpacing in Flags) and UnicodeIsNonSpacing(C1) then
|
|||
|
|
begin
|
|||
|
|
Inc(I);
|
|||
|
|
Continue;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// check if spaces and controls need to be compressed
|
|||
|
|
if sfSpaceCompress in Flags then
|
|||
|
|
begin
|
|||
|
|
if UnicodeIsWhiteSpace(C1) then
|
|||
|
|
begin
|
|||
|
|
if not HaveSpace then
|
|||
|
|
begin
|
|||
|
|
// Add a space and set the flag.
|
|||
|
|
Cp.UpCase := $20;
|
|||
|
|
Cp.LoCase := $20;
|
|||
|
|
Cp.TitleCase := $20;
|
|||
|
|
Inc(Cp);
|
|||
|
|
|
|||
|
|
// increase the real pattern length
|
|||
|
|
Inc(FPatternLength);
|
|||
|
|
Sentinel := $20;
|
|||
|
|
HaveSpace := True;
|
|||
|
|
end;
|
|||
|
|
Inc(I);
|
|||
|
|
Continue;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// ignore all control characters
|
|||
|
|
if UnicodeIsControl(C1) then
|
|||
|
|
begin
|
|||
|
|
Inc(I);
|
|||
|
|
Continue;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// add the character
|
|||
|
|
if not (sfCaseSensitive in Flags) then
|
|||
|
|
begin
|
|||
|
|
{ TODO : use the entire mapping, not only the first character }
|
|||
|
|
Cp.UpCase := UnicodeToUpper(C1)[0];
|
|||
|
|
Cp.LoCase := UnicodeToLower(C1)[0];
|
|||
|
|
Cp.TitleCase := UnicodeToTitle(C1)[0];
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Cp.UpCase := C1;
|
|||
|
|
Cp.LoCase := C1;
|
|||
|
|
Cp.TitleCase := C1;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Sentinel := Cp.UpCase;
|
|||
|
|
|
|||
|
|
// move to the next character
|
|||
|
|
Inc(Cp);
|
|||
|
|
|
|||
|
|
// increase the real pattern length appropriately
|
|||
|
|
if C1 >= $10000 then
|
|||
|
|
Inc(FPatternLength, 2)
|
|||
|
|
else
|
|||
|
|
Inc(FPatternLength);
|
|||
|
|
|
|||
|
|
// increment the loop index for UTF-16 characters
|
|||
|
|
if C1 > $10000 then
|
|||
|
|
Inc(I, 2)
|
|||
|
|
else
|
|||
|
|
Inc(I);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// set the number of characters actually used
|
|||
|
|
FPatternUsed := (PChar(Cp) - PChar(FPattern)) div SizeOf(TUTBMChar);
|
|||
|
|
|
|||
|
|
// Go through and construct the skip array and determine the actual length
|
|||
|
|
// of the pattern in UCS2 terms.
|
|||
|
|
SLen := FPatternLength - 1;
|
|||
|
|
Cp := FPattern;
|
|||
|
|
K := 0;
|
|||
|
|
for I := 0 to FPatternUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
// locate the character in the FSkipValues array
|
|||
|
|
Sp := FSkipValues;
|
|||
|
|
J := 0;
|
|||
|
|
while (J < FSkipsUsed) and (Sp.BMChar.UpCase <> Cp.UpCase) do
|
|||
|
|
begin
|
|||
|
|
Inc(J);
|
|||
|
|
Inc(Sp);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// If the character is not found, set the new FSkipValues element and
|
|||
|
|
// increase the number of FSkipValues elements.
|
|||
|
|
if J = FSkipsUsed then
|
|||
|
|
begin
|
|||
|
|
Sp.BMChar := Cp;
|
|||
|
|
Inc(FSkipsUsed);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Set the updated FSkipValues value. If the character is UTF-16 and is
|
|||
|
|
// not the last one in the pattern, add one to its FSkipValues value.
|
|||
|
|
Sp.SkipValues := SLen - K;
|
|||
|
|
if (Cp.UpCase >= $10000) and ((K + 2) < SLen) then
|
|||
|
|
Inc(Sp.SkipValues);
|
|||
|
|
|
|||
|
|
// set the new extra FSkipValues for the sentinel character
|
|||
|
|
if ((Cp.UpCase >= $10000) and
|
|||
|
|
((K + 2) <= SLen) or ((K + 1) <= SLen) and
|
|||
|
|
(Cp.UpCase = Sentinel)) then
|
|||
|
|
FMD4 := SLen - K;
|
|||
|
|
|
|||
|
|
// increase the actual index
|
|||
|
|
if Cp.UpCase >= $10000 then
|
|||
|
|
Inc(K, 2)
|
|||
|
|
else
|
|||
|
|
Inc(K);
|
|||
|
|
Inc(Cp);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TUTBMSearch.Find(Text: PUCS2; TextLen: Cardinal; var MatchStart, MatchEnd: Cardinal): Boolean;
|
|||
|
|
// this is the main matching routine using a tuned Boyer-Moore algorithm
|
|||
|
|
var
|
|||
|
|
K: Cardinal;
|
|||
|
|
Start,
|
|||
|
|
Stop: PUCS2;
|
|||
|
|
begin
|
|||
|
|
Result := False;
|
|||
|
|
if (FPattern <> nil) and (FPatternUsed > 0) and (Text <> nil) and
|
|||
|
|
(TextLen > 0) and (TextLen >= FPatternLength) then
|
|||
|
|
begin
|
|||
|
|
Start := Text + FPatternLength - 1;
|
|||
|
|
Stop := Text + TextLen;
|
|||
|
|
|
|||
|
|
// adjust the start point if it points to a low surrogate
|
|||
|
|
if (SurrogateLowStart <= UCS4(Start^)) and
|
|||
|
|
(UCS4(Start^) <= SurrogateLowEnd) and
|
|||
|
|
(SurrogateHighStart <= UCS4((Start - 1)^)) and
|
|||
|
|
(UCS4((Start - 1)^) <= SurrogateHighEnd) then
|
|||
|
|
Dec(Start);
|
|||
|
|
|
|||
|
|
while Start < Stop do
|
|||
|
|
begin
|
|||
|
|
repeat
|
|||
|
|
K := GetSkipValue(Start, Stop);
|
|||
|
|
if K = 0 then
|
|||
|
|
Break;
|
|||
|
|
Inc(Start, K);
|
|||
|
|
if (Start < Stop) and
|
|||
|
|
(SurrogateLowStart <= UCS4(Start^)) and
|
|||
|
|
(UCS4(Start^) <= SurrogateLowEnd) and
|
|||
|
|
(SurrogateHighStart <= UCS4((Start - 1)^)) and
|
|||
|
|
(UCS4((Start - 1)^) <= SurrogateHighEnd) then
|
|||
|
|
Dec(Start);
|
|||
|
|
until False;
|
|||
|
|
|
|||
|
|
if (Start < Stop) and Match(Text, Start, Stop, MatchStart, MatchEnd) then
|
|||
|
|
begin
|
|||
|
|
Result := True;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
Inc(Start, FMD4);
|
|||
|
|
if (Start < Stop) and
|
|||
|
|
(SurrogateLowStart <= UCS4(Start^)) and
|
|||
|
|
(UCS4(Start^) <= SurrogateLowEnd) and
|
|||
|
|
(SurrogateHighStart <= UCS4((Start - 1)^)) and
|
|||
|
|
(UCS4((Start - 1)^) <= SurrogateHighEnd) then
|
|||
|
|
Dec(Start);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TUTBMSearch.Clear;
|
|||
|
|
begin
|
|||
|
|
ClearPattern;
|
|||
|
|
inherited Clear;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TUTBMSearch.FindAll(const Text: WideString): Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := FindAll(PWideChar(Text), Length(Text));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TUTBMSearch.FindAll(Text: PWideChar; TextLen: Cardinal): Boolean;
|
|||
|
|
// Looks for all occurences of the pattern passed to FindPrepare and creates an
|
|||
|
|
// internal list of their positions.
|
|||
|
|
var
|
|||
|
|
Start, Stop: Cardinal;
|
|||
|
|
Run: PWideChar;
|
|||
|
|
RunLen: Cardinal;
|
|||
|
|
begin
|
|||
|
|
ClearResults;
|
|||
|
|
Run := Text;
|
|||
|
|
RunLen := TextLen;
|
|||
|
|
// repeat to find all occurences of the pattern
|
|||
|
|
while Find(Run, RunLen, Start, Stop) do
|
|||
|
|
begin
|
|||
|
|
// store this result (consider text pointer movement)...
|
|||
|
|
AddResult(Start + Run - Text, Stop + Run - Text);
|
|||
|
|
// ... and advance text position and length
|
|||
|
|
Inc(Run, Stop);
|
|||
|
|
Dec(RunLen, Stop);
|
|||
|
|
end;
|
|||
|
|
Result := Count > 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TUTBMSearch.FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean;
|
|||
|
|
// Looks for the first occurence of the pattern passed to FindPrepare in Text and
|
|||
|
|
// returns True if one could be found (in which case Start and Stop are set to
|
|||
|
|
// the according indices) otherwise False. This function is in particular of
|
|||
|
|
// interest if only one occurence needs to be found.
|
|||
|
|
begin
|
|||
|
|
ClearResults;
|
|||
|
|
Result := Find(PWideChar(Text), Length(Text), Start, Stop);
|
|||
|
|
if Result then
|
|||
|
|
AddResult(Start, Stop);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TUTBMSearch.FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean;
|
|||
|
|
// Same as the WideString version of this method.
|
|||
|
|
begin
|
|||
|
|
ClearResults;
|
|||
|
|
Result := Find(Text, TextLen, Start, Stop);
|
|||
|
|
if Result then
|
|||
|
|
AddResult(Start, Stop);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TUTBMSearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags);
|
|||
|
|
begin
|
|||
|
|
FindPrepare(PWideChar(Pattern), Length(Pattern), Options);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TUTBMSearch.FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags);
|
|||
|
|
// prepares following search by compiling the given pattern into an internal structure
|
|||
|
|
begin
|
|||
|
|
Compile(Pattern, PatternLength, Options);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- Unicode RE search core ---------------------------------------------------------
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
// error codes
|
|||
|
|
_URE_OK = 0;
|
|||
|
|
_URE_UNEXPECTED_EOS = -1;
|
|||
|
|
_URE_CCLASS_OPEN = -2;
|
|||
|
|
_URE_UNBALANCED_GROUP = -3;
|
|||
|
|
_URE_INVALID_PROPERTY = -4;
|
|||
|
|
_URE_INVALID_RANGE = -5;
|
|||
|
|
_URE_RANGE_OPEN = -6;
|
|||
|
|
|
|||
|
|
// options that can be combined for searching
|
|||
|
|
URE_IGNORE_NONSPACING = $01;
|
|||
|
|
URE_DONT_MATCHES_SEPARATORS = $02;
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
// Flags used internally in the DFA
|
|||
|
|
_URE_DFA_CASEFOLD = $01;
|
|||
|
|
_URE_DFA_BLANKLINE = $02;
|
|||
|
|
|
|||
|
|
// symbol types for the DFA
|
|||
|
|
_URE_ANY_CHAR = 1;
|
|||
|
|
_URE_CHAR = 2;
|
|||
|
|
_URE_CCLASS = 3;
|
|||
|
|
_URE_NCCLASS = 4;
|
|||
|
|
_URE_BOL_ANCHOR = 5;
|
|||
|
|
_URE_EOL_ANCHOR = 6;
|
|||
|
|
|
|||
|
|
// op codes for converting the NFA to a DFA
|
|||
|
|
_URE_SYMBOL = 10;
|
|||
|
|
_URE_PAREN = 11;
|
|||
|
|
_URE_QUEST = 12;
|
|||
|
|
_URE_STAR = 13;
|
|||
|
|
_URE_PLUS = 14;
|
|||
|
|
_URE_ONE = 15;
|
|||
|
|
_URE_AND = 16;
|
|||
|
|
_URE_OR = 17;
|
|||
|
|
|
|||
|
|
_URE_NOOP = $FFFF;
|
|||
|
|
|
|||
|
|
//----------------- TURESearch ---------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
procedure TURESearch.Clear;
|
|||
|
|
begin
|
|||
|
|
inherited Clear;
|
|||
|
|
ClearUREBuffer;
|
|||
|
|
ClearDFA;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.Push(V: Cardinal);
|
|||
|
|
begin
|
|||
|
|
with FUREBuffer do
|
|||
|
|
begin
|
|||
|
|
// If the 'Reducing' parameter is True, check to see if the value passed is
|
|||
|
|
// already on the stack.
|
|||
|
|
if Reducing and ExpressionList.Expressions[Word(V)].OnStack then
|
|||
|
|
Exit;
|
|||
|
|
|
|||
|
|
if Stack.ListUsed = Length(Stack.List) then
|
|||
|
|
SetLength(Stack.List, Length(Stack.List) + 8);
|
|||
|
|
Stack.List[Stack.ListUsed] := V;
|
|||
|
|
Inc(Stack.ListUsed);
|
|||
|
|
|
|||
|
|
// If the 'reducing' parameter is True, flag the element as being on the Stack.
|
|||
|
|
if Reducing then
|
|||
|
|
ExpressionList.Expressions[Word(V)].OnStack := True;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.Peek: Cardinal;
|
|||
|
|
begin
|
|||
|
|
if FUREBuffer.Stack.ListUsed = 0 then
|
|||
|
|
Result := _URE_NOOP
|
|||
|
|
else
|
|||
|
|
Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed - 1];
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.Pop: Cardinal;
|
|||
|
|
begin
|
|||
|
|
if FUREBuffer.Stack.ListUsed = 0 then
|
|||
|
|
Result := _URE_NOOP
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Dec(FUREBuffer.Stack.ListUsed);
|
|||
|
|
Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed];
|
|||
|
|
if FUREBuffer.Reducing then
|
|||
|
|
FUREBuffer.ExpressionList.Expressions[Word(Result)].OnStack := False;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.ParsePropertyList(Properties: PUCS2; Limit: Cardinal;
|
|||
|
|
var Categories: TCharacterCategories): Cardinal;
|
|||
|
|
// Parse a comma-separated list of integers that represent character properties.
|
|||
|
|
// Combine them into a set of categories and return the number of characters consumed.
|
|||
|
|
var
|
|||
|
|
N: Cardinal;
|
|||
|
|
Run,
|
|||
|
|
ListEnd: PUCS2;
|
|||
|
|
begin
|
|||
|
|
Run := Properties;
|
|||
|
|
ListEnd := Run + Limit;
|
|||
|
|
|
|||
|
|
N := 0;
|
|||
|
|
Categories := [];
|
|||
|
|
while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do
|
|||
|
|
begin
|
|||
|
|
if Run^ = ',' then
|
|||
|
|
begin
|
|||
|
|
// Encountered a comma, so take the number parsed so far as category and
|
|||
|
|
// reset the number.
|
|||
|
|
Include(Categories, TCharacterCategory(N));
|
|||
|
|
N := 0;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (Run^ >= '0') and (Run^ <= '9') then
|
|||
|
|
begin
|
|||
|
|
// Encountered a digit, so start or continue building the cardinal that
|
|||
|
|
// represents the character category.
|
|||
|
|
N := (N * 10) + Cardinal(Word(Run^) - Ord('0'));
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// Encountered something that is not part of the property list.
|
|||
|
|
// Indicate that we are done.
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// If the number is to large then there is a problem.
|
|||
|
|
// Most likely a missing comma separator.
|
|||
|
|
if Integer(N) > Ord(High(TCharacterCategory)) then
|
|||
|
|
FUREBuffer.Error := _URE_INVALID_PROPERTY;
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Return the number of characters consumed.
|
|||
|
|
Result := Run - Properties;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.MakeHexNumber(NP: PUCS2; Limit: Cardinal; var Number: Cardinal): Cardinal;
|
|||
|
|
// Collect a hex number with 1 to 4 digits and return the number of characters used.
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Run,
|
|||
|
|
ListEnd: PUCS2;
|
|||
|
|
begin
|
|||
|
|
Run := np;
|
|||
|
|
ListEnd := Run + Limit;
|
|||
|
|
|
|||
|
|
Number := 0;
|
|||
|
|
I := 0;
|
|||
|
|
while (I < 4) and (Run < ListEnd) do
|
|||
|
|
begin
|
|||
|
|
if (Run^ >= '0') and (Run^ <= '9') then
|
|||
|
|
Number := (Number shl 4) + Cardinal(Word(Run^) - Ord('0'))
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (Run^ >= 'A') and (Run^ <= 'F') then
|
|||
|
|
Number := (Number shl 4) + Cardinal(Word(Run^) - Ord('A')) + 10
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (Run^ >= 'a') and (Run^ <= 'f') then
|
|||
|
|
Number := (Number shl 4) + Cardinal(Word(Run^) - Ord('a')) + 10
|
|||
|
|
else
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
Inc(I);
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Result := Run - NP;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.AddRange(var CCL: TUcCClass; Range: TUcRange);
|
|||
|
|
// Insert a Range into a character class, removing duplicates and ordering them
|
|||
|
|
// in increasing Range-start order.
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Temp: UCS4;
|
|||
|
|
begin
|
|||
|
|
// If the `Casefold' flag is set, then make sure both endpoints of the Range
|
|||
|
|
// are converted to lower.
|
|||
|
|
if (FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0 then
|
|||
|
|
begin
|
|||
|
|
{ TODO : use the entire mapping, not only the first character }
|
|||
|
|
Range.MinCode := UnicodeToLower(Range.MinCode)[0];
|
|||
|
|
Range.MaxCode := UnicodeToLower(Range.MaxCode)[0];
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Swap the Range endpoints if they are not in increasing order.
|
|||
|
|
if Range.MinCode > Range.MaxCode then
|
|||
|
|
begin
|
|||
|
|
Temp := Range.MinCode;
|
|||
|
|
Range.MinCode := Range.MaxCode;
|
|||
|
|
Range.MaxCode := Temp;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
I := 0;
|
|||
|
|
while (I < CCL.RangesUsed) and (Range.MinCode < CCL.Ranges[I].MinCode) do
|
|||
|
|
Inc(I);
|
|||
|
|
|
|||
|
|
// check for a duplicate
|
|||
|
|
if (I < CCL.RangesUsed) and (Range.MinCode = CCL.Ranges[I].MinCode) and
|
|||
|
|
(Range.MaxCode = CCL.Ranges[I].MaxCode) then
|
|||
|
|
Exit;
|
|||
|
|
|
|||
|
|
if CCL.RangesUsed = Length(CCL.Ranges) then
|
|||
|
|
SetLength(CCL.Ranges, Length(CCL.Ranges) + 8);
|
|||
|
|
|
|||
|
|
if I < CCL.RangesUsed then
|
|||
|
|
Move(CCL.Ranges[I], CCL.Ranges[I + 1], SizeOf(TUcRange) * (CCL.RangesUsed - I));
|
|||
|
|
|
|||
|
|
CCL.Ranges[I].MinCode := Range.MinCode;
|
|||
|
|
CCL.Ranges[I].MaxCode := Range.MaxCode;
|
|||
|
|
Inc(CCL.RangesUsed);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
type
|
|||
|
|
PTrie = ^TTrie;
|
|||
|
|
TTrie = record
|
|||
|
|
Key: UCS2;
|
|||
|
|
Len,
|
|||
|
|
Next: Cardinal;
|
|||
|
|
Setup: Integer;
|
|||
|
|
Categories: TCharacterCategories;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.SpaceSetup(Symbol: PUcSymbolTableEntry; Categories: TCharacterCategories);
|
|||
|
|
var
|
|||
|
|
Range: TUcRange;
|
|||
|
|
begin
|
|||
|
|
Symbol.Categories := Symbol.Categories + Categories;
|
|||
|
|
|
|||
|
|
Range.MinCode := UCS4(WideTabulator);
|
|||
|
|
Range.MaxCode := UCS4(WideTabulator);
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
Range.MinCode := UCS4(WideCarriageReturn);
|
|||
|
|
Range.MaxCode := UCS4(WideCarriageReturn);
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
Range.MinCode := UCS4(WideLineFeed);
|
|||
|
|
Range.MaxCode := UCS4(WideLineFeed);
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
Range.MinCode := UCS4(WideFormFeed);
|
|||
|
|
Range.MaxCode := UCS4(WideFormFeed);
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
Range.MinCode := $FEFF;
|
|||
|
|
Range.MaxCode := $FEFF;
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.HexDigitSetup(Symbol: PUcSymbolTableEntry);
|
|||
|
|
var
|
|||
|
|
Range: TUcRange;
|
|||
|
|
begin
|
|||
|
|
Range.MinCode := UCS4('0');
|
|||
|
|
Range.MaxCode := UCS4('9');
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
Range.MinCode := UCS4('A');
|
|||
|
|
Range.MaxCode := UCS4('F');
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
Range.MinCode := UCS4('a');
|
|||
|
|
Range.MaxCode := UCS4('f');
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
CClassTrie: array [0..64] of TTrie = (
|
|||
|
|
(Key: #$003A; Len: 1; Next: 1; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0061; Len: 9; Next: 10; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0063; Len: 8; Next: 19; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0064; Len: 7; Next: 24; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0067; Len: 6; Next: 29; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$006C; Len: 5; Next: 34; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0070; Len: 4; Next: 39; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0073; Len: 3; Next: 49; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0075; Len: 2; Next: 54; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0078; Len: 1; Next: 59; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$006C; Len: 1; Next: 11; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$006E; Len: 2; Next: 13; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0070; Len: 1; Next: 16; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0075; Len: 1; Next: 14; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$006D; Len: 1; Next: 15; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 16; Setup: 1; Categories: ClassLetter + ClassNumber),
|
|||
|
|
(Key: #$0068; Len: 1; Next: 17; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0061; Len: 1; Next: 18; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 19; Setup: 1; Categories: ClassLetter),
|
|||
|
|
(Key: #$006E; Len: 1; Next: 20; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0074; Len: 1; Next: 21; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0072; Len: 1; Next: 22; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$006C; Len: 1; Next: 23; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 24; Setup: 1; Categories: [ccOtherControl, ccOtherFormat]),
|
|||
|
|
(Key: #$0069; Len: 1; Next: 25; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0067; Len: 1; Next: 26; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0069; Len: 1; Next: 27; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0074; Len: 1; Next: 28; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 29; Setup: 1; Categories: ClassNumber),
|
|||
|
|
(Key: #$0072; Len: 1; Next: 30; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0061; Len: 1; Next: 31; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0070; Len: 1; Next: 32; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0068; Len: 1; Next: 33; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 34; Setup: 1; Categories: ClassMark + ClassNumber + ClassLetter + ClassPunctuation +
|
|||
|
|
ClassSymbol),
|
|||
|
|
(Key: #$006F; Len: 1; Next: 35; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0077; Len: 1; Next: 36; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0065; Len: 1; Next: 37; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0072; Len: 1; Next: 38; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 39; Setup: 1; Categories: [ccLetterLowercase]),
|
|||
|
|
(Key: #$0072; Len: 2; Next: 41; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0075; Len: 1; Next: 45; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0069; Len: 1; Next: 42; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$006E; Len: 1; Next: 43; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0074; Len: 1; Next: 44; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 45; Setup: 1; Categories: ClassMark + ClassNumber + ClassLetter + ClassPunctuation +
|
|||
|
|
ClassSymbol + [ccSeparatorSpace]),
|
|||
|
|
(Key: #$006E; Len: 1; Next: 46; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0063; Len: 1; Next: 47; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0074; Len: 1; Next: 48; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 49; Setup: 1; Categories: ClassPunctuation),
|
|||
|
|
(Key: #$0070; Len: 1; Next: 50; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0061; Len: 1; Next: 51; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0063; Len: 1; Next: 52; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0065; Len: 1; Next: 53; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 54; Setup: 2; Categories: ClassSpace),
|
|||
|
|
(Key: #$0070; Len: 1; Next: 55; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0070; Len: 1; Next: 56; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0065; Len: 1; Next: 57; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0072; Len: 1; Next: 58; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 59; Setup: 1; Categories: [ccLetterUppercase]),
|
|||
|
|
(Key: #$0064; Len: 1; Next: 60; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0069; Len: 1; Next: 61; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0067; Len: 1; Next: 62; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0069; Len: 1; Next: 63; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$0074; Len: 1; Next: 64; Setup: 0; Categories: []),
|
|||
|
|
(Key: #$003A; Len: 1; Next: 65; Setup: 3; Categories: [])
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
function TURESearch.PosixCCL(CP: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
|
|||
|
|
// Probe for one of the POSIX colon delimited character classes in the static trie.
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
N: Cardinal;
|
|||
|
|
TP: PTrie;
|
|||
|
|
Run,
|
|||
|
|
ListEnd: PUCS2;
|
|||
|
|
begin
|
|||
|
|
Result := 0;
|
|||
|
|
// If the number of characters left is less than 7,
|
|||
|
|
// then this cannot be interpreted as one of the colon delimited classes.
|
|||
|
|
if Limit >= 7 then
|
|||
|
|
begin
|
|||
|
|
Run := cp;
|
|||
|
|
ListEnd := Run + Limit;
|
|||
|
|
TP := @CClassTrie[0];
|
|||
|
|
I := 0;
|
|||
|
|
while (Run < ListEnd) and (I < 8) do
|
|||
|
|
begin
|
|||
|
|
N := TP.Len;
|
|||
|
|
while (N > 0) and (TP.Key <> Run^) do
|
|||
|
|
begin
|
|||
|
|
Inc(TP);
|
|||
|
|
Dec(N);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if N = 0 then
|
|||
|
|
begin
|
|||
|
|
Result := 0;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if (Run^ = ':') and ((I = 6) or (I = 7)) then
|
|||
|
|
begin
|
|||
|
|
Inc(Run);
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
if (Run + 1) < ListEnd then
|
|||
|
|
TP := @CClassTrie[TP.Next];
|
|||
|
|
Inc(I);
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Result := Run - CP;
|
|||
|
|
case TP.Setup of
|
|||
|
|
1:
|
|||
|
|
Symbol.Categories := Symbol.Categories + TP.Categories;
|
|||
|
|
2:
|
|||
|
|
SpaceSetup(Symbol, TP.Categories);
|
|||
|
|
3:
|
|||
|
|
HexDigitSetup(Symbol);
|
|||
|
|
else
|
|||
|
|
Result := 0;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.BuildCharacterClass(CP: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
|
|||
|
|
// Construct a list of ranges and return the number of characters consumed.
|
|||
|
|
var
|
|||
|
|
RangeEnd: Integer;
|
|||
|
|
N: Cardinal;
|
|||
|
|
Run,
|
|||
|
|
ListEnd: PUCS2;
|
|||
|
|
C, Last: UCS4;
|
|||
|
|
Range: TUcRange;
|
|||
|
|
begin
|
|||
|
|
Run := cp;
|
|||
|
|
ListEnd := Run + Limit;
|
|||
|
|
|
|||
|
|
if Run^ = '^' then
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_NCCLASS;
|
|||
|
|
Inc(Run);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Symbol.AType := _URE_CCLASS;
|
|||
|
|
|
|||
|
|
Last := 0;
|
|||
|
|
RangeEnd := 0;
|
|||
|
|
while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do
|
|||
|
|
begin
|
|||
|
|
// Allow for the special case []abc], where the first closing bracket would end an empty
|
|||
|
|
// character class, which makes no sense. Hence this bracket is treaded literally.
|
|||
|
|
if (Run^ = ']') and (Symbol.Symbol.CCL.RangesUsed > 0) then
|
|||
|
|
Break;
|
|||
|
|
|
|||
|
|
C := UCS4(Run^);
|
|||
|
|
Inc(Run);
|
|||
|
|
|
|||
|
|
// escape character
|
|||
|
|
if C = Ord('\') then
|
|||
|
|
begin
|
|||
|
|
if Run = ListEnd then
|
|||
|
|
begin
|
|||
|
|
// The EOS was encountered when expecting the reverse solidus to be followed by the character it is escaping.
|
|||
|
|
// Set an Error code and return the number of characters consumed up to this point.
|
|||
|
|
FUREBuffer.Error := _URE_UNEXPECTED_EOS;
|
|||
|
|
Result := Run - CP;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
C := UCS4(Run^);
|
|||
|
|
Inc(Run);
|
|||
|
|
case UCS2(C) of
|
|||
|
|
'a':
|
|||
|
|
C := $07;
|
|||
|
|
'b':
|
|||
|
|
C := $08;
|
|||
|
|
'f':
|
|||
|
|
C := $0C;
|
|||
|
|
'n':
|
|||
|
|
C := $0A;
|
|||
|
|
'R':
|
|||
|
|
C := $0D;
|
|||
|
|
't':
|
|||
|
|
C := $09;
|
|||
|
|
'v':
|
|||
|
|
C := $0B;
|
|||
|
|
'p', 'P':
|
|||
|
|
begin
|
|||
|
|
Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Categories));
|
|||
|
|
// Invert the bit mask of the properties if this is a negated character class or if 'P' is used to specify
|
|||
|
|
// a list of character properties that should *not* match in a character class.
|
|||
|
|
if C = Ord('P') then
|
|||
|
|
Symbol.Categories := ClassAll - Symbol.Categories;
|
|||
|
|
Continue;
|
|||
|
|
end;
|
|||
|
|
'x', 'X', 'u', 'U':
|
|||
|
|
begin
|
|||
|
|
if (Run < ListEnd) and
|
|||
|
|
((Run^ >= '0') and (Run^ <= '9') or
|
|||
|
|
(Run^ >= 'A') and (Run^ <= 'F') or
|
|||
|
|
(Run^ >= 'a') and (Run^ <= 'f')) then
|
|||
|
|
Inc(Run, MakeHexNumber(Run, ListEnd - Run, C));
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if C = Ord(':') then
|
|||
|
|
begin
|
|||
|
|
// Probe for a POSIX colon delimited character class.
|
|||
|
|
Dec(Run);
|
|||
|
|
N := PosixCCL(Run, ListEnd - Run, Symbol);
|
|||
|
|
if N = 0 then
|
|||
|
|
Inc(Run)
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Inc(Run, N);
|
|||
|
|
Continue;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Check to see if the current character is a low surrogate that needs
|
|||
|
|
// to be combined with a preceding high surrogate.
|
|||
|
|
if Last <> 0 then
|
|||
|
|
begin
|
|||
|
|
if (C >= SurrogateLowStart) and (C <= SurrogateLowEnd) then
|
|||
|
|
begin
|
|||
|
|
// Construct the UTF16 character code.
|
|||
|
|
C := $10000 + (((Last and $03FF) shl 10) or (C and $03FF))
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// Add the isolated high surrogate to the range.
|
|||
|
|
if RangeEnd = 1 then
|
|||
|
|
Range.MaxCode := Last and $FFFF
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Range.MinCode := Last and $FFFF;
|
|||
|
|
Range.MaxCode := Last and $FFFF;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
RangeEnd := 0;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Clear the Last character code.
|
|||
|
|
Last := 0;
|
|||
|
|
|
|||
|
|
// This slightly awkward code handles the different cases needed to construct a range.
|
|||
|
|
if (C >= SurrogateHighStart) and (C <= SurrogateHighEnd) then
|
|||
|
|
begin
|
|||
|
|
// If the high surrogate is followed by a Range indicator, simply add it as the Range start. Otherwise,
|
|||
|
|
// save it in the next character is a low surrogate.
|
|||
|
|
if Run^ = '-' then
|
|||
|
|
begin
|
|||
|
|
Inc(Run);
|
|||
|
|
Range.MinCode := C;
|
|||
|
|
RangeEnd := 1;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Last := C;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if RangeEnd = 1 then
|
|||
|
|
begin
|
|||
|
|
Range.MaxCode := C;
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
RangeEnd := 0;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Range.MinCode := C;
|
|||
|
|
Range.MaxCode := C;
|
|||
|
|
if Run^ = '-' then
|
|||
|
|
begin
|
|||
|
|
Inc(Run);
|
|||
|
|
RangeEnd := 1;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
AddRange(Symbol.Symbol.CCL, Range);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if (Run < ListEnd) and (Run^ = ']') then
|
|||
|
|
Inc(Run)
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// The parse was not terminated by the character class close symbol (']'), so set an error code.
|
|||
|
|
FUREBuffer.Error := _URE_CCLASS_OPEN;
|
|||
|
|
end;
|
|||
|
|
Result := Run - CP;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.ProbeLowSurrogate(LeftState: PUCS2; Limit: Cardinal; var Code: UCS4): Cardinal;
|
|||
|
|
// probes for a low surrogate hex code
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Run,
|
|||
|
|
ListEnd: PUCS2;
|
|||
|
|
begin
|
|||
|
|
I := 0;
|
|||
|
|
Code := 0;
|
|||
|
|
Run := LeftState;
|
|||
|
|
ListEnd := Run + Limit;
|
|||
|
|
|
|||
|
|
while (I < 4) and (Run < ListEnd) do
|
|||
|
|
begin
|
|||
|
|
if (Run^ >= '0') and (Run^ <= '9') then
|
|||
|
|
Code := (Code shl 4) + Cardinal(Word(Run^) - Ord('0'))
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (Run^ >= 'A') and (Run^ <= 'F') then
|
|||
|
|
Code := (Code shl 4) + Cardinal(Word(Run^) - Ord('A')) + 10
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (Run^ >= 'a') and (Run^ <= 'f') then
|
|||
|
|
Code := (Code shl 4) + Cardinal(Word(Run^) - Ord('a')) + 10
|
|||
|
|
else
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if (SurrogateLowStart <= Code) and (Code <= SurrogateLowEnd) then
|
|||
|
|
Result := Run - LeftState
|
|||
|
|
else
|
|||
|
|
Result := 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.CompileSymbol(S: PUCS2; Limit: Cardinal; Symbol: PUcSymbolTableEntry): Cardinal;
|
|||
|
|
var
|
|||
|
|
C: UCS4;
|
|||
|
|
Run,
|
|||
|
|
ListEnd: PUCS2;
|
|||
|
|
begin
|
|||
|
|
Run := S;
|
|||
|
|
ListEnd := S + Limit;
|
|||
|
|
|
|||
|
|
C := UCS4(Run^);
|
|||
|
|
Inc(Run);
|
|||
|
|
if C = Ord('\') then
|
|||
|
|
begin
|
|||
|
|
if Run = ListEnd then
|
|||
|
|
begin
|
|||
|
|
// The EOS was encountered when expecting the reverse solidus to be followed
|
|||
|
|
// by the character it is escaping. Set an Error code and return the number
|
|||
|
|
// of characters consumed up to this point.
|
|||
|
|
FUREBuffer.Error := _URE_UNEXPECTED_EOS;
|
|||
|
|
Result := Run - S;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
C := UCS4(Run^);
|
|||
|
|
Inc(Run);
|
|||
|
|
case UCS2(C) of
|
|||
|
|
'p', 'P':
|
|||
|
|
begin
|
|||
|
|
if UCS2(C) = 'p' then
|
|||
|
|
Symbol.AType :=_URE_CCLASS
|
|||
|
|
else
|
|||
|
|
Symbol.AType :=_URE_NCCLASS;
|
|||
|
|
Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Categories));
|
|||
|
|
end;
|
|||
|
|
'a':
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := $07;
|
|||
|
|
end;
|
|||
|
|
'b':
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := $08;
|
|||
|
|
end;
|
|||
|
|
'f':
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := $0C;
|
|||
|
|
end;
|
|||
|
|
'n':
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := $0A;
|
|||
|
|
end;
|
|||
|
|
'r':
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := $0D;
|
|||
|
|
end;
|
|||
|
|
't':
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := $09;
|
|||
|
|
end;
|
|||
|
|
'v':
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := $0B;
|
|||
|
|
end;
|
|||
|
|
else
|
|||
|
|
case UCS2(C) of
|
|||
|
|
'x', 'X', 'u', 'U':
|
|||
|
|
begin
|
|||
|
|
// Collect between 1 and 4 digits representing an UCS2 code.
|
|||
|
|
if (Run < ListEnd) and
|
|||
|
|
((Run^ >= '0') and (Run^ <= '9') or
|
|||
|
|
(Run^ >= 'A') and (Run^ <= 'F') or
|
|||
|
|
(Run^ >= 'a') and (Run^ <= 'f')) then
|
|||
|
|
Inc(Run, MakeHexNumber(Run, ListEnd - Run, C));
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Simply add an escaped character here.
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := C;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (UCS2(C) = '^') or (UCS2(C) = '$') then
|
|||
|
|
begin
|
|||
|
|
// Handle the BOL and EOL anchors. This actually consists simply of setting
|
|||
|
|
// a flag that indicates that the user supplied anchor match function should
|
|||
|
|
// be called. This needs to be done instead of simply matching line/paragraph
|
|||
|
|
// separators because beginning-of-text and end-of-text tests are needed as well.
|
|||
|
|
if UCS2(C) = '^' then
|
|||
|
|
Symbol.AType := _URE_BOL_ANCHOR
|
|||
|
|
else
|
|||
|
|
Symbol.AType := _URE_EOL_ANCHOR;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if UCS2(C) = '[' then
|
|||
|
|
begin
|
|||
|
|
// construct a character class
|
|||
|
|
Inc(Run, BuildCharacterClass(Run, ListEnd - Run, Symbol));
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if UCS2(C) = '.' then
|
|||
|
|
Symbol.AType := _URE_ANY_CHAR
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Symbol.AType := _URE_CHAR;
|
|||
|
|
Symbol.Symbol.Chr := C;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// If the symbol type happens to be a character and is a high surrogate, then
|
|||
|
|
// probe forward to see if it is followed by a low surrogate that needs to be added.
|
|||
|
|
if (Run < ListEnd) and
|
|||
|
|
(Symbol.AType = _URE_CHAR) and
|
|||
|
|
(SurrogateHighStart <= Symbol.Symbol.Chr) and
|
|||
|
|
(Symbol.Symbol.Chr <= SurrogateHighEnd) then
|
|||
|
|
begin
|
|||
|
|
if (SurrogateLowStart <= UCS4(Run^)) and
|
|||
|
|
(UCS4(Run^) <= SurrogateLowEnd) then
|
|||
|
|
begin
|
|||
|
|
Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (UCS4(Run^) and $03FF));
|
|||
|
|
Inc(Run);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (Run^ = '\') and (((Run + 1)^ = 'x') or ((Run + 1)^ = 'X') or
|
|||
|
|
((Run + 1)^ = 'u') or ((Run + 1)^ = 'U')) then
|
|||
|
|
begin
|
|||
|
|
Inc(Run, ProbeLowSurrogate(Run + 2, ListEnd - (Run + 2), C));
|
|||
|
|
if (SurrogateLowStart <= C) and (C <= SurrogateLowEnd) then
|
|||
|
|
begin
|
|||
|
|
// Take into account the \[xu] in front of the hex code.
|
|||
|
|
Inc(Run, 2);
|
|||
|
|
Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (C and $03FF));
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Last, make sure any _URE_CHAR type symbols are changed to lower if the
|
|||
|
|
// 'Casefold' flag is set.
|
|||
|
|
{ TODO : use the entire mapping, not only the first character and use the
|
|||
|
|
case fold abilities of the unit. }
|
|||
|
|
if ((FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0) and (Symbol.AType = _URE_CHAR) then
|
|||
|
|
Symbol.Symbol.Chr := UnicodeToLower(Symbol.Symbol.Chr)[0];
|
|||
|
|
|
|||
|
|
// If the symbol constructed is anything other than one of the anchors,
|
|||
|
|
// make sure the _URE_DFA_BLANKLINE flag is removed.
|
|||
|
|
if (Symbol.AType <> _URE_BOL_ANCHOR) and (Symbol.AType <> _URE_EOL_ANCHOR) then
|
|||
|
|
FUREBuffer.Flags := FUREBuffer.Flags and not _URE_DFA_BLANKLINE;
|
|||
|
|
|
|||
|
|
// Return the number of characters consumed.
|
|||
|
|
Result := Run - S;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.SymbolsAreDifferent(A, B: PUcSymbolTableEntry): Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := False;
|
|||
|
|
if (A.AType <> B.AType) or (A.Mods <> B.Mods) or (A.Categories <> B.Categories) then
|
|||
|
|
Result := True
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (A.AType = _URE_CCLASS) or (A.AType = _URE_NCCLASS) then
|
|||
|
|
begin
|
|||
|
|
if A.Symbol.CCL.RangesUsed <> B.Symbol.CCL.RangesUsed then
|
|||
|
|
Result := True
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (A.Symbol.CCL.RangesUsed > 0) and
|
|||
|
|
not CompareMem(@A.Symbol.CCL.Ranges[0], @B.Symbol.CCL.Ranges[0],
|
|||
|
|
SizeOf(TUcRange) * A.Symbol.CCL.RangesUsed) then
|
|||
|
|
Result := True;;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if (A.AType = _URE_CHAR) and (A.Symbol.Chr <> B.Symbol.Chr) then
|
|||
|
|
Result := True;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.MakeSymbol(S: PUCS2; Limit: Cardinal; var Consumed: Cardinal): Cardinal;
|
|||
|
|
// constructs a symbol, but only keep unique symbols
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Start: PUcSymbolTableEntry;
|
|||
|
|
Symbol: TUcSymbolTableEntry;
|
|||
|
|
begin
|
|||
|
|
// Build the next symbol so we can test to see if it is already in the symbol table.
|
|||
|
|
FillChar(Symbol, SizeOf(TUcSymbolTableEntry), 0);
|
|||
|
|
Consumed := CompileSymbol(S, Limit, @Symbol);
|
|||
|
|
|
|||
|
|
// Check to see if the symbol exists.
|
|||
|
|
I := 0;
|
|||
|
|
Start := @FUREBuffer.SymbolTable.Symbols[0];
|
|||
|
|
while (I < FUREBuffer.SymbolTable.SymbolsUsed) and SymbolsAreDifferent(@Symbol, Start) do
|
|||
|
|
begin
|
|||
|
|
Inc(I);
|
|||
|
|
Inc(Start);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if I < FUREBuffer.SymbolTable.SymbolsUsed then
|
|||
|
|
begin
|
|||
|
|
// Free up any ranges used for the symbol.
|
|||
|
|
if (Symbol.AType = _URE_CCLASS) or (Symbol.AType = _URE_NCCLASS) then
|
|||
|
|
Symbol.Symbol.CCL.Ranges := nil;
|
|||
|
|
Result := FUREBuffer.SymbolTable.Symbols[I].ID;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Need to add the new symbol.
|
|||
|
|
if FUREBuffer.SymbolTable.SymbolsUsed = Length(FUREBuffer.SymbolTable.Symbols) then
|
|||
|
|
begin
|
|||
|
|
SetLength(FUREBuffer.SymbolTable.Symbols, Length(FUREBuffer.SymbolTable.Symbols) + 8);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Symbol.ID := FUREBuffer.SymbolTable.SymbolsUsed;
|
|||
|
|
Inc(FUREBuffer.SymbolTable.SymbolsUsed);
|
|||
|
|
FUREBuffer.SymbolTable.Symbols[Symbol.ID] := Symbol;
|
|||
|
|
Result := Symbol.ID;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.MakeExpression(AType, LHS, RHS: Cardinal): Cardinal;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
// Determine if the expression already exists or not.
|
|||
|
|
with FUREBuffer.ExpressionList do
|
|||
|
|
begin
|
|||
|
|
for I := 0 to ExpressionsUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if (Expressions[I].AType = AType) and
|
|||
|
|
(Expressions[I].LHS = LHS) and
|
|||
|
|
(Expressions[I].RHS = RHS) then
|
|||
|
|
begin
|
|||
|
|
Result := I;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Need to add a new expression.
|
|||
|
|
if ExpressionsUsed = Length(Expressions) then
|
|||
|
|
SetLength(Expressions, Length(Expressions) + 8);
|
|||
|
|
|
|||
|
|
Expressions[ExpressionsUsed].OnStack := False;
|
|||
|
|
Expressions[ExpressionsUsed].AType := AType;
|
|||
|
|
Expressions[ExpressionsUsed].LHS := LHS;
|
|||
|
|
Expressions[ExpressionsUsed].RHS := RHS;
|
|||
|
|
|
|||
|
|
Result := ExpressionsUsed;
|
|||
|
|
Inc(ExpressionsUsed);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function IsSpecial(C: Word): Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := C in [Word('+'), Word('*'), Word('?'), Word('{'), Word('|'), Word(')')];
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.CollectPendingOperations(var State: Cardinal);
|
|||
|
|
// collects all pending AND and OR operations and make corresponding expressions
|
|||
|
|
var
|
|||
|
|
Operation: Cardinal;
|
|||
|
|
begin
|
|||
|
|
repeat
|
|||
|
|
Operation := Peek;
|
|||
|
|
if (Operation <> _URE_AND) and (Operation <> _URE_OR) then
|
|||
|
|
Break;
|
|||
|
|
// make an expression with the AND or OR operator and its right hand side
|
|||
|
|
Operation := Pop;
|
|||
|
|
State := MakeExpression(Operation, Pop, State);
|
|||
|
|
until False;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.ConvertRegExpToNFA(RE: PWideChar; RELength: Cardinal): Cardinal;
|
|||
|
|
// Converts the regular expression into an NFA in a form that will be easy to
|
|||
|
|
// reduce to a DFA. The starting state for the reduction will be returned.
|
|||
|
|
var
|
|||
|
|
C: UCS2;
|
|||
|
|
Head, Tail: PUCS2;
|
|||
|
|
S: WideString;
|
|||
|
|
Symbol,
|
|||
|
|
State,
|
|||
|
|
LastState,
|
|||
|
|
Used,
|
|||
|
|
M, N: Cardinal;
|
|||
|
|
I: Integer;
|
|||
|
|
|
|||
|
|
begin
|
|||
|
|
State := _URE_NOOP;
|
|||
|
|
|
|||
|
|
Head := RE;
|
|||
|
|
Tail := Head + RELength;
|
|||
|
|
while (FUREBuffer.Error = _URE_OK) and (Head < Tail) do
|
|||
|
|
begin
|
|||
|
|
C := Head^;
|
|||
|
|
Inc(Head);
|
|||
|
|
case C of
|
|||
|
|
'(':
|
|||
|
|
Push(_URE_PAREN);
|
|||
|
|
')': // check for the case of too many close parentheses
|
|||
|
|
begin
|
|||
|
|
if Peek = _URE_NOOP then
|
|||
|
|
begin
|
|||
|
|
FUREBuffer.Error := _URE_UNBALANCED_GROUP;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
CollectPendingOperations(State);
|
|||
|
|
// remove the _URE_PAREN off the stack
|
|||
|
|
Pop;
|
|||
|
|
end;
|
|||
|
|
'*':
|
|||
|
|
State := MakeExpression(_URE_STAR, State, _URE_NOOP);
|
|||
|
|
'+':
|
|||
|
|
State := MakeExpression(_URE_PLUS, State, _URE_NOOP);
|
|||
|
|
'?':
|
|||
|
|
State := MakeExpression(_URE_QUEST, State, _URE_NOOP);
|
|||
|
|
'|':
|
|||
|
|
begin
|
|||
|
|
CollectPendingOperations(State);
|
|||
|
|
Push(State);
|
|||
|
|
Push(_URE_OR);
|
|||
|
|
end;
|
|||
|
|
'{': // expressions of the form {m, n}
|
|||
|
|
begin
|
|||
|
|
C := #0;
|
|||
|
|
M := 0;
|
|||
|
|
N := 0;
|
|||
|
|
// get first number
|
|||
|
|
while UnicodeIsWhiteSpace(UCS4(Head^)) do
|
|||
|
|
Inc(Head);
|
|||
|
|
S := '';
|
|||
|
|
while Head^ in [WideChar('0')..WideChar('9')] do
|
|||
|
|
begin
|
|||
|
|
S := S + Head^;
|
|||
|
|
Inc(Head);
|
|||
|
|
end;
|
|||
|
|
if S <> '' then
|
|||
|
|
M := StrToInt(S);
|
|||
|
|
|
|||
|
|
while UnicodeIsWhiteSpace(UCS4(Head^)) do
|
|||
|
|
Inc(Head);
|
|||
|
|
if (Head^ <> ',') and (Head^ <> '}') then
|
|||
|
|
begin
|
|||
|
|
FUREBuffer.Error := _URE_INVALID_RANGE;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// check for an upper limit
|
|||
|
|
if Head^ <> '}' then
|
|||
|
|
begin
|
|||
|
|
Inc(Head);
|
|||
|
|
// get second number
|
|||
|
|
while UnicodeIsWhiteSpace(UCS4(Head^)) do
|
|||
|
|
Inc(Head);
|
|||
|
|
S := '';
|
|||
|
|
while Head^ in [WideChar('0')..WideChar('9')] do
|
|||
|
|
begin
|
|||
|
|
S := S + Head^;
|
|||
|
|
Inc(Head);
|
|||
|
|
end;
|
|||
|
|
if S <> '' then
|
|||
|
|
N := StrToInt(S);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
N := M;
|
|||
|
|
|
|||
|
|
if Head^ <> '}' then
|
|||
|
|
begin
|
|||
|
|
FUREBuffer.Error := _URE_RANGE_OPEN;
|
|||
|
|
Break;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Inc(Head);
|
|||
|
|
|
|||
|
|
// N = 0 means unlimited number of occurences
|
|||
|
|
if N = 0 then
|
|||
|
|
begin
|
|||
|
|
case M of
|
|||
|
|
0: // {,} {0,} {0, 0} mean the same as the star operator
|
|||
|
|
State := MakeExpression(_URE_STAR, State, _URE_NOOP);
|
|||
|
|
1: // {1,} {1, 0} mean the same as the plus operator
|
|||
|
|
State := MakeExpression(_URE_PLUS, State, _URE_NOOP);
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// encapsulate the expanded branches as would they be in parenthesis
|
|||
|
|
// in order to avoid unwanted concatenation with pending operations/symbols
|
|||
|
|
Push(_URE_PAREN);
|
|||
|
|
// {m,} {m, 0} mean M fixed occurences plus star operator
|
|||
|
|
// make E^m...
|
|||
|
|
for I := 1 to M - 1 do
|
|||
|
|
begin
|
|||
|
|
Push(State);
|
|||
|
|
Push(_URE_AND);
|
|||
|
|
end;
|
|||
|
|
// ...and repeat the last symbol one or more times
|
|||
|
|
State := MakeExpression(_URE_PLUS, State, _URE_NOOP);
|
|||
|
|
CollectPendingOperations(State);
|
|||
|
|
Pop;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// check proper range limits
|
|||
|
|
if M > N then
|
|||
|
|
begin
|
|||
|
|
FUREBuffer.Error := _URE_INVALID_RANGE;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// check special case {0, 1} (which corresponds to the ? operator)
|
|||
|
|
if (M = 0) and (N = 1) then
|
|||
|
|
State := MakeExpression(_URE_QUEST, State, _URE_NOOP)
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// handle the general case by expanding {m, n} into the equivalent
|
|||
|
|
// expression E^m | E^(m + 1) | ... | E^n
|
|||
|
|
|
|||
|
|
// encapsulate the expanded branches as would they be in parenthesis
|
|||
|
|
// in order to avoid unwanted concatenation with pending operations/symbols
|
|||
|
|
Push(_URE_PAREN);
|
|||
|
|
// keep initial state as this is the one all alternatives start from
|
|||
|
|
LastState := State;
|
|||
|
|
|
|||
|
|
// Consider the special case M = 0 first. Because there's no construct
|
|||
|
|
// to enter a pure epsilon-transition into the expression array I
|
|||
|
|
// work around with the question mark operator to describe the first
|
|||
|
|
// and second branch alternative.
|
|||
|
|
if M = 0 then
|
|||
|
|
begin
|
|||
|
|
State := MakeExpression(_URE_QUEST, State, _URE_NOOP);
|
|||
|
|
Inc(M, 2);
|
|||
|
|
// Mark the pending OR operation (there must always follow at
|
|||
|
|
// least on more alternative because the special case {0, 1} has
|
|||
|
|
// already been handled).
|
|||
|
|
Push(State);
|
|||
|
|
Push(_URE_OR);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
while M <= N do
|
|||
|
|
begin
|
|||
|
|
State := LastState;
|
|||
|
|
// create E^M
|
|||
|
|
for I := 1 to Integer(M) - 1 do
|
|||
|
|
begin
|
|||
|
|
Push(State);
|
|||
|
|
Push(_URE_AND);
|
|||
|
|
end;
|
|||
|
|
// finish the branch and mark it as pending OR operation if it
|
|||
|
|
// isn't the last one
|
|||
|
|
CollectPendingOperations(State);
|
|||
|
|
if M < N then
|
|||
|
|
begin
|
|||
|
|
Push(State);
|
|||
|
|
Push(_URE_OR);
|
|||
|
|
end;
|
|||
|
|
Inc(M);
|
|||
|
|
end;
|
|||
|
|
// remove the _URE_PAREN off the stack
|
|||
|
|
Pop;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
else
|
|||
|
|
Dec(Head);
|
|||
|
|
Symbol := MakeSymbol(Head, Tail - Head, Used);
|
|||
|
|
Inc(Head, Used);
|
|||
|
|
State := MakeExpression(_URE_SYMBOL, Symbol, _URE_NOOP);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if (C <> '(') and (C <> '|') and (C <> '{') and (Head < Tail) and
|
|||
|
|
(not IsSpecial(Word(Head^)) or (Head^ = '(')) then
|
|||
|
|
begin
|
|||
|
|
Push(State);
|
|||
|
|
Push(_URE_AND);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
CollectPendingOperations(State);
|
|||
|
|
if FUREBuffer.Stack.ListUsed > 0 then
|
|||
|
|
FUREBuffer.Error := _URE_UNBALANCED_GROUP;
|
|||
|
|
|
|||
|
|
if FUREBuffer.Error = _URE_OK then
|
|||
|
|
Result := State
|
|||
|
|
else
|
|||
|
|
Result := _URE_NOOP;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.AddSymbolState(Symbol, State: Cardinal);
|
|||
|
|
var
|
|||
|
|
I, J: Integer;
|
|||
|
|
Found: Boolean;
|
|||
|
|
begin
|
|||
|
|
// Locate the symbol in the symbol table so the state can be added.
|
|||
|
|
// If the symbol doesn't exist, then we are in serious trouble.
|
|||
|
|
with FUREBuffer.SymbolTable do
|
|||
|
|
begin
|
|||
|
|
I := 0;
|
|||
|
|
while (I < SymbolsUsed) and (Symbol <> Symbols[I].ID) do
|
|||
|
|
Inc(I);
|
|||
|
|
|
|||
|
|
Assert(I < SymbolsUsed);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Now find out if the state exists in the symbol's state list.
|
|||
|
|
with FUREBuffer.SymbolTable.Symbols[I].States do
|
|||
|
|
begin
|
|||
|
|
Found := False;
|
|||
|
|
for J := 0 to ListUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if State <= List[J] then
|
|||
|
|
begin
|
|||
|
|
Found := True;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if not Found then
|
|||
|
|
J := ListUsed;
|
|||
|
|
if not Found or (State < List[J]) then
|
|||
|
|
begin
|
|||
|
|
// Need to add the state in order.
|
|||
|
|
if ListUsed = Length(List) then
|
|||
|
|
SetLength(List, Length(List) + 8);
|
|||
|
|
if J < ListUsed then
|
|||
|
|
Move(List[J], List[J + 1], SizeOf(Cardinal) * (ListUsed - J));
|
|||
|
|
List[J] := State;
|
|||
|
|
Inc(ListUsed);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.AddState(NewStates: array of Cardinal): Cardinal;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Found: Boolean;
|
|||
|
|
begin
|
|||
|
|
Found := False;
|
|||
|
|
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if (FUREBuffer.States.States[I].StateList.ListUsed = Length(NewStates)) and
|
|||
|
|
CompareMem(@NewStates[0], @FUREBuffer.States.States[I].StateList.List[0],
|
|||
|
|
SizeOf(Cardinal) * Length(NewStates)) then
|
|||
|
|
begin
|
|||
|
|
Found := True;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if not Found then
|
|||
|
|
begin
|
|||
|
|
// Need to add a new DFA State (set of NFA states).
|
|||
|
|
if FUREBuffer.States.StatesUsed = Length(FUREBuffer.States.States) then
|
|||
|
|
SetLength(FUREBuffer.States.States, Length(FUREBuffer.States.States) + 8);
|
|||
|
|
|
|||
|
|
with FUREBuffer.States.States[FUREBuffer.States.StatesUsed] do
|
|||
|
|
begin
|
|||
|
|
ID := FUREBuffer.States.StatesUsed;
|
|||
|
|
if (StateList.ListUsed + Length(NewStates)) >= Length(StateList.List) then
|
|||
|
|
SetLength(StateList.List, Length(StateList.List) + Length(NewStates) + 8);
|
|||
|
|
Move(NewStates[0], StateList.List[StateList.ListUsed], SizeOf(Cardinal) * Length(NewStates));
|
|||
|
|
Inc(StateList.ListUsed, Length(NewStates));
|
|||
|
|
end;
|
|||
|
|
Inc(FUREBuffer.States.StatesUsed);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Return the ID of the DFA state representing a group of NFA States.
|
|||
|
|
if Found then
|
|||
|
|
Result := I
|
|||
|
|
else
|
|||
|
|
Result := FUREBuffer.States.StatesUsed - 1;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.Reduce(Start: Cardinal);
|
|||
|
|
var
|
|||
|
|
I, J,
|
|||
|
|
Symbols: Integer;
|
|||
|
|
State,
|
|||
|
|
RHS,
|
|||
|
|
s1, s2,
|
|||
|
|
ns1, ns2: Cardinal;
|
|||
|
|
Evaluating: Boolean;
|
|||
|
|
begin
|
|||
|
|
FUREBuffer.Reducing := True;
|
|||
|
|
|
|||
|
|
// Add the starting state for the reduction.
|
|||
|
|
AddState([Start]);
|
|||
|
|
|
|||
|
|
// Process each set of NFA states that get created.
|
|||
|
|
I := 0;
|
|||
|
|
// further states are added in the loop
|
|||
|
|
while I < FUREBuffer.States.StatesUsed do
|
|||
|
|
begin
|
|||
|
|
with FUREBuffer, States.States[I], ExpressionList do
|
|||
|
|
begin
|
|||
|
|
// Push the current states on the stack.
|
|||
|
|
for J := 0 to StateList.ListUsed - 1 do
|
|||
|
|
Push(StateList.List[J]);
|
|||
|
|
|
|||
|
|
// Reduce the NFA states.
|
|||
|
|
Accepting := False;
|
|||
|
|
Symbols := 0;
|
|||
|
|
J := 0;
|
|||
|
|
// need a while loop here as the stack will be modified within the loop and
|
|||
|
|
// so also its usage count used to terminate the loop
|
|||
|
|
while J < FUREBuffer.Stack.ListUsed do
|
|||
|
|
begin
|
|||
|
|
State := FUREBuffer.Stack.List[J];
|
|||
|
|
Evaluating := True;
|
|||
|
|
|
|||
|
|
// This inner loop is the iterative equivalent of recursively
|
|||
|
|
// reducing subexpressions generated as a result of a reduction.
|
|||
|
|
while Evaluating do
|
|||
|
|
begin
|
|||
|
|
case Expressions[State].AType of
|
|||
|
|
_URE_SYMBOL:
|
|||
|
|
begin
|
|||
|
|
ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP);
|
|||
|
|
AddSymbolState(Expressions[State].LHS, ns1);
|
|||
|
|
Inc(Symbols);
|
|||
|
|
Evaluating := False;
|
|||
|
|
end;
|
|||
|
|
_URE_ONE:
|
|||
|
|
begin
|
|||
|
|
Accepting := True;
|
|||
|
|
Evaluating := False;
|
|||
|
|
end;
|
|||
|
|
_URE_QUEST:
|
|||
|
|
begin
|
|||
|
|
s1 := Expressions[State].LHS;
|
|||
|
|
ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP);
|
|||
|
|
State := MakeExpression(_URE_OR, ns1, s1);
|
|||
|
|
end;
|
|||
|
|
_URE_PLUS:
|
|||
|
|
begin
|
|||
|
|
s1 := Expressions[State].LHS;
|
|||
|
|
ns1 := MakeExpression(_URE_STAR, s1, _URE_NOOP);
|
|||
|
|
State := MakeExpression(_URE_AND, s1, ns1);
|
|||
|
|
end;
|
|||
|
|
_URE_STAR:
|
|||
|
|
begin
|
|||
|
|
s1 := Expressions[State].LHS;
|
|||
|
|
ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP);
|
|||
|
|
ns2 := MakeExpression(_URE_PLUS, s1, _URE_NOOP);
|
|||
|
|
State := MakeExpression(_URE_OR, ns1, ns2);
|
|||
|
|
end;
|
|||
|
|
_URE_OR:
|
|||
|
|
begin
|
|||
|
|
s1 := Expressions[State].LHS;
|
|||
|
|
s2 := Expressions[State].RHS;
|
|||
|
|
Push(s1);
|
|||
|
|
Push(s2);
|
|||
|
|
Evaluating := False;
|
|||
|
|
end;
|
|||
|
|
_URE_AND:
|
|||
|
|
begin
|
|||
|
|
s1 := Expressions[State].LHS;
|
|||
|
|
s2 := Expressions[State].RHS;
|
|||
|
|
case Expressions[s1].AType of
|
|||
|
|
_URE_SYMBOL:
|
|||
|
|
begin
|
|||
|
|
AddSymbolState(Expressions[s1].LHS, s2);
|
|||
|
|
Inc(Symbols);
|
|||
|
|
Evaluating := False;
|
|||
|
|
end;
|
|||
|
|
_URE_ONE:
|
|||
|
|
State := s2;
|
|||
|
|
_URE_QUEST:
|
|||
|
|
begin
|
|||
|
|
ns1 := Expressions[s1].LHS;
|
|||
|
|
ns2 := MakeExpression(_URE_AND, ns1, s2);
|
|||
|
|
State := MakeExpression(_URE_OR, s2, ns2);
|
|||
|
|
end;
|
|||
|
|
_URE_PLUS:
|
|||
|
|
begin
|
|||
|
|
ns1 := Expressions[s1].LHS;
|
|||
|
|
ns2 := MakeExpression(_URE_OR, s2, State);
|
|||
|
|
State := MakeExpression(_URE_AND, ns1, ns2);
|
|||
|
|
end;
|
|||
|
|
_URE_STAR:
|
|||
|
|
begin
|
|||
|
|
ns1 := Expressions[s1].LHS;
|
|||
|
|
ns2 := MakeExpression(_URE_AND, ns1, State);
|
|||
|
|
State := MakeExpression(_URE_OR, s2, ns2);
|
|||
|
|
end;
|
|||
|
|
_URE_OR:
|
|||
|
|
begin
|
|||
|
|
ns1 := Expressions[s1].LHS;
|
|||
|
|
ns2 := Expressions[s1].RHS;
|
|||
|
|
ns1 := MakeExpression(_URE_AND, ns1, s2);
|
|||
|
|
ns2 := MakeExpression(_URE_AND, ns2, s2);
|
|||
|
|
State := MakeExpression(_URE_OR, ns1, ns2);
|
|||
|
|
end;
|
|||
|
|
_URE_AND:
|
|||
|
|
begin
|
|||
|
|
ns1 := Expressions[s1].LHS;
|
|||
|
|
ns2 := Expressions[s1].RHS;
|
|||
|
|
ns2 := MakeExpression(_URE_AND, ns2, s2);
|
|||
|
|
State := MakeExpression(_URE_AND, ns1, ns2);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
Inc(J);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// clear the state stack
|
|||
|
|
while Pop <> _URE_NOOP do
|
|||
|
|
{ nothing };
|
|||
|
|
|
|||
|
|
// generate the DFA states for the symbols collected during the current reduction
|
|||
|
|
if (TransitionsUsed + Symbols) > Length(Transitions) then
|
|||
|
|
SetLength(Transitions, Length(Transitions) + Symbols);
|
|||
|
|
|
|||
|
|
// go through the symbol table and generate the DFA state transitions for
|
|||
|
|
// each symbol that has collected NFA states
|
|||
|
|
Symbols := 0;
|
|||
|
|
J := 0;
|
|||
|
|
while J < FUREBuffer.SymbolTable.SymbolsUsed do
|
|||
|
|
begin
|
|||
|
|
begin
|
|||
|
|
if FUREBuffer.SymbolTable.Symbols[J].States.ListUsed > 0 then
|
|||
|
|
begin
|
|||
|
|
Transitions[Symbols].LHS := FUREBuffer.SymbolTable.Symbols[J].ID;
|
|||
|
|
with FUREBuffer.SymbolTable.Symbols[J] do
|
|||
|
|
begin
|
|||
|
|
RHS := AddState(Copy(States.List, 0, States.ListUsed));
|
|||
|
|
States.ListUsed := 0;
|
|||
|
|
end;
|
|||
|
|
Transitions[Symbols].RHS := RHS;
|
|||
|
|
Inc(Symbols);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
Inc(J);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// set the number of transitions actually used
|
|||
|
|
// Note: we need again to qualify a part of the TransistionsUsed path since the
|
|||
|
|
// state array could be reallocated in the AddState call above and the
|
|||
|
|
// with ... do will then be invalid.
|
|||
|
|
States.States[I].TransitionsUsed := Symbols;
|
|||
|
|
end;
|
|||
|
|
Inc(I);
|
|||
|
|
end;
|
|||
|
|
FUREBuffer.Reducing := False;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.AddEquivalentPair(L, R: Cardinal);
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
L := FUREBuffer.States.States[L].ID;
|
|||
|
|
R := FUREBuffer.States.States[R].ID;
|
|||
|
|
|
|||
|
|
if L <> R then
|
|||
|
|
begin
|
|||
|
|
if L > R then
|
|||
|
|
begin
|
|||
|
|
I := L;
|
|||
|
|
L := R;
|
|||
|
|
R := I;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Check to see if the equivalence pair already exists.
|
|||
|
|
I := 0;
|
|||
|
|
with FUREBuffer.EquivalentList do
|
|||
|
|
begin
|
|||
|
|
while (I < EquivalentsUsed) and
|
|||
|
|
((Equivalents[I].Left <> L) or (Equivalents[I].Right <> R)) do
|
|||
|
|
Inc(I);
|
|||
|
|
|
|||
|
|
if I >= EquivalentsUsed then
|
|||
|
|
begin
|
|||
|
|
if EquivalentsUsed = Length(Equivalents) then
|
|||
|
|
SetLength(Equivalents, Length(Equivalents) + 8);
|
|||
|
|
|
|||
|
|
Equivalents[EquivalentsUsed].Left := L;
|
|||
|
|
Equivalents[EquivalentsUsed].Right := R;
|
|||
|
|
Inc(EquivalentsUsed);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.MergeEquivalents;
|
|||
|
|
// merges the DFA states that are equivalent
|
|||
|
|
var
|
|||
|
|
I, J, K,
|
|||
|
|
Equal: Integer;
|
|||
|
|
Done: Boolean;
|
|||
|
|
State1, State2,
|
|||
|
|
LeftState,
|
|||
|
|
RightState: PUcState;
|
|||
|
|
begin
|
|||
|
|
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
State1 := @FUREBuffer.States.States[I];
|
|||
|
|
if State1.ID = Cardinal(I) then
|
|||
|
|
begin
|
|||
|
|
J := 0;
|
|||
|
|
while J < I do
|
|||
|
|
begin
|
|||
|
|
State2 := @FUREBuffer.States.States[J];
|
|||
|
|
if State2.ID = Cardinal(J) then
|
|||
|
|
begin
|
|||
|
|
FUREBuffer.EquivalentList.EquivalentsUsed := 0;
|
|||
|
|
AddEquivalentPair(I, J);
|
|||
|
|
|
|||
|
|
Done := False;
|
|||
|
|
Equal := 0;
|
|||
|
|
while Equal < FUREBuffer.EquivalentList.EquivalentsUsed do
|
|||
|
|
begin
|
|||
|
|
LeftState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Left];
|
|||
|
|
RightState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Right];
|
|||
|
|
|
|||
|
|
if (LeftState.Accepting <> RightState.Accepting) or
|
|||
|
|
(LeftState.TransitionsUsed <> RightState.TransitionsUsed) then
|
|||
|
|
begin
|
|||
|
|
Done := True;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
K := 0;
|
|||
|
|
while (K < LeftState.TransitionsUsed) and
|
|||
|
|
(LeftState.Transitions[K].LHS = RightState.Transitions[K].LHS) do
|
|||
|
|
Inc(K);
|
|||
|
|
|
|||
|
|
if K < LeftState.TransitionsUsed then
|
|||
|
|
begin
|
|||
|
|
Done := True;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
for K := 0 to LeftState.TransitionsUsed - 1 do
|
|||
|
|
AddEquivalentPair(LeftState.Transitions[K].RHS, RightState.Transitions[K].RHS);
|
|||
|
|
|
|||
|
|
Inc(Equal);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if not Done then
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
Inc(J);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if J < I then
|
|||
|
|
begin
|
|||
|
|
with FUREBuffer do
|
|||
|
|
begin
|
|||
|
|
for Equal := 0 to EquivalentList.EquivalentsUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
States.States[EquivalentList.Equivalents[Equal].Right].ID :=
|
|||
|
|
States.States[EquivalentList.Equivalents[Equal].Left].ID;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Renumber the states appropriately
|
|||
|
|
State1 := @FUREBuffer.States.States[0];
|
|||
|
|
Equal := 0;
|
|||
|
|
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if State1.ID = Cardinal(I) then
|
|||
|
|
begin
|
|||
|
|
State1.ID := Equal;
|
|||
|
|
Inc(Equal);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
State1.ID := FUREBuffer.States.States[State1.ID].ID;
|
|||
|
|
Inc(State1);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.ClearUREBuffer;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
with FUREBuffer do
|
|||
|
|
begin
|
|||
|
|
// quite a few dynamic arrays to free
|
|||
|
|
Stack.List := nil;
|
|||
|
|
ExpressionList.Expressions := nil;
|
|||
|
|
|
|||
|
|
// the symbol table has been handed over to the DFA and will be freed on
|
|||
|
|
// release of the DFA
|
|||
|
|
SymbolTable.SymbolsUsed := 0;
|
|||
|
|
|
|||
|
|
for I := 0 to States.StatesUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
States.States[I].Transitions := nil;
|
|||
|
|
States.States[I].StateList.List := nil;
|
|||
|
|
States.States[I].StateList.ListUsed := 0;
|
|||
|
|
States.States[I].TransitionsUsed := 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
States.StatesUsed := 0;
|
|||
|
|
States.States := nil;
|
|||
|
|
EquivalentList.Equivalents := nil;
|
|||
|
|
end;
|
|||
|
|
FillChar(FUREBuffer, SizeOf(FUREBuffer), 0);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.CompileURE(RE: PWideChar; RELength: Cardinal; Casefold: Boolean);
|
|||
|
|
var
|
|||
|
|
I, J: Integer;
|
|||
|
|
State: Cardinal;
|
|||
|
|
Run: PUcState;
|
|||
|
|
TP: Integer;
|
|||
|
|
|
|||
|
|
procedure UREError(Text: string; RE: PWideChar);
|
|||
|
|
var
|
|||
|
|
S: string;
|
|||
|
|
begin
|
|||
|
|
S := RE;
|
|||
|
|
raise EJclUnicodeError.CreateResFmt(@RsUREErrorFmt, [LoadResString(@RsUREBaseString), Text, S]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
begin
|
|||
|
|
// be paranoid
|
|||
|
|
if (RE <> nil) and (RE^ <> WideNull) and (RELength > 0) then
|
|||
|
|
begin
|
|||
|
|
// Reset the various fields of the compilation buffer. Default the Flags
|
|||
|
|
// to indicate the presense of the "^$" pattern. If any other pattern
|
|||
|
|
// occurs, then this flag will be removed. This is done to catch this
|
|||
|
|
// special pattern and handle it specially when matching.
|
|||
|
|
ClearUREBuffer;
|
|||
|
|
ClearDFA;
|
|||
|
|
FUREBuffer.Flags := _URE_DFA_BLANKLINE;
|
|||
|
|
if Casefold then
|
|||
|
|
FUREBuffer.Flags := FUREBuffer.Flags or _URE_DFA_CASEFOLD;
|
|||
|
|
|
|||
|
|
// Construct the NFA. If this stage returns a 0, then an error occured or an
|
|||
|
|
// empty expression was passed.
|
|||
|
|
State := ConvertRegExpToNFA(RE, RELength);
|
|||
|
|
if State <> _URE_NOOP then
|
|||
|
|
begin
|
|||
|
|
// Do the expression reduction to get the initial DFA.
|
|||
|
|
Reduce(State);
|
|||
|
|
|
|||
|
|
// Merge all the equivalent DFA States.
|
|||
|
|
MergeEquivalents;
|
|||
|
|
|
|||
|
|
// Construct the minimal DFA.
|
|||
|
|
FDFA.Flags := FUREBuffer.Flags and (_URE_DFA_CASEFOLD or _URE_DFA_BLANKLINE);
|
|||
|
|
|
|||
|
|
// Free up the NFA state groups and transfer the symbols from the buffer
|
|||
|
|
// to the DFA.
|
|||
|
|
FDFA.SymbolTable := FUREBuffer.SymbolTable;
|
|||
|
|
FUREBuffer.SymbolTable.Symbols := nil;
|
|||
|
|
|
|||
|
|
// Collect the total number of states and transitions needed for the DFA.
|
|||
|
|
State := 0;
|
|||
|
|
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if FUREBuffer.States.States[I].ID = State then
|
|||
|
|
begin
|
|||
|
|
Inc(FDFA.StateList.StatesUsed);
|
|||
|
|
Inc(FDFA.TransitionList.TransitionsUsed, FUREBuffer.States.States[I].TransitionsUsed);
|
|||
|
|
Inc(State);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Allocate enough space for the states and transitions.
|
|||
|
|
SetLength(FDFA.StateList.States, FDFA.StateList.StatesUsed);
|
|||
|
|
SetLength(FDFA.TransitionList.Transitions, FDFA.TransitionList.TransitionsUsed);
|
|||
|
|
|
|||
|
|
// Actually transfer the DFA States from the buffer.
|
|||
|
|
State := 0;
|
|||
|
|
TP := 0;
|
|||
|
|
Run := @FUREBuffer.States.States[0];
|
|||
|
|
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if Run.ID = State then
|
|||
|
|
begin
|
|||
|
|
FDFA.StateList.States[I].StartTransition := TP;
|
|||
|
|
FDFA.StateList.States[I].NumberTransitions := Run.TransitionsUsed;
|
|||
|
|
FDFA.StateList.States[I].Accepting := Run.Accepting;
|
|||
|
|
|
|||
|
|
// Add the transitions for the state
|
|||
|
|
for J := 0 to FDFA.StateList.States[I].NumberTransitions - 1 do
|
|||
|
|
begin
|
|||
|
|
FDFA.TransitionList.Transitions[TP].Symbol := Run.Transitions[J].LHS;
|
|||
|
|
FDFA.TransitionList.Transitions[TP].NextState :=
|
|||
|
|
FUREBuffer.States.States[Run.Transitions[J].RHS].ID;
|
|||
|
|
Inc(TP);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Inc(State);
|
|||
|
|
end;
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// there might be an error while parsing the pattern, show it if so
|
|||
|
|
case FUREBuffer.Error of
|
|||
|
|
_URE_UNEXPECTED_EOS:
|
|||
|
|
UREError(LoadResString(@RsUREUnexpectedEOS), RE);
|
|||
|
|
_URE_CCLASS_OPEN:
|
|||
|
|
UREError(LoadResString(@RsURECharacterClassOpen), RE);
|
|||
|
|
_URE_UNBALANCED_GROUP:
|
|||
|
|
UREError(LoadResString(@RsUREUnbalancedGroup), RE);
|
|||
|
|
_URE_INVALID_PROPERTY:
|
|||
|
|
UREError(LoadResString(@RsUREInvalidCharProperty), RE);
|
|||
|
|
_URE_INVALID_RANGE:
|
|||
|
|
UREError(LoadResString(@RsUREInvalidRepeatRange), RE);
|
|||
|
|
_URE_RANGE_OPEN:
|
|||
|
|
UREError(LoadResString(@RsURERepeatRangeOpen), RE);
|
|||
|
|
else
|
|||
|
|
// expression was empty
|
|||
|
|
raise EJclUnicodeError.CreateRes(@RsUREExpressionEmpty);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.ClearDFA;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
with FDFA do
|
|||
|
|
begin
|
|||
|
|
for I := 0 to SymbolTable.SymbolsUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if (SymbolTable.Symbols[I].AType = _URE_CCLASS) or
|
|||
|
|
(SymbolTable.Symbols[I].AType = _URE_NCCLASS) then
|
|||
|
|
SymbolTable.Symbols[I].Symbol.CCL.Ranges := nil;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
for I := 0 to SymbolTable.SymbolsUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
FDFA.SymbolTable.Symbols[I].States.List := nil;
|
|||
|
|
FDFA.SymbolTable.Symbols[I].States.ListUsed := 0;
|
|||
|
|
end;
|
|||
|
|
SymbolTable.SymbolsUsed := 0;
|
|||
|
|
|
|||
|
|
SymbolTable.Symbols := nil;
|
|||
|
|
StateList.States := nil;
|
|||
|
|
TransitionList.Transitions := nil;
|
|||
|
|
end;
|
|||
|
|
FillChar(FDFA, SizeOf(FDFA), 0);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function IsSeparator(C: UCS4): Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := (C = $D) or (C = $A) or (C = $2028) or (C = $2029);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: Cardinal; var MatchStart,
|
|||
|
|
MatchEnd: Cardinal): Boolean;
|
|||
|
|
var
|
|||
|
|
I, J: Integer;
|
|||
|
|
Matched,
|
|||
|
|
Found: Boolean;
|
|||
|
|
Start, Stop: Integer;
|
|||
|
|
C: UCS4;
|
|||
|
|
Run, Tail, lp: PUCS2;
|
|||
|
|
LastState: PDFAState;
|
|||
|
|
Symbol: PUcSymbolTableEntry;
|
|||
|
|
Rp: PUcRange;
|
|||
|
|
begin
|
|||
|
|
Result := False;
|
|||
|
|
if Text <> nil then
|
|||
|
|
begin
|
|||
|
|
// Handle the special case of an empty string matching the "^$" pattern.
|
|||
|
|
if (Textlen = 0) and ((FDFA.Flags and _URE_DFA_BLANKLINE) <> 0) then
|
|||
|
|
begin
|
|||
|
|
MatchStart := 0;
|
|||
|
|
MatchEnd := 0;
|
|||
|
|
Result := True;
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Run := Text;
|
|||
|
|
Tail := Run + TextLen;
|
|||
|
|
Start := -1;
|
|||
|
|
Stop := -1;
|
|||
|
|
LastState := @FDFA.StateList.States[0];
|
|||
|
|
|
|||
|
|
Found := False;
|
|||
|
|
while not Found and (Run < Tail) do
|
|||
|
|
begin
|
|||
|
|
lp := Run;
|
|||
|
|
C := UCS4(Run^);
|
|||
|
|
Inc(Run);
|
|||
|
|
|
|||
|
|
// Check to see if this is a high surrogate that should be combined with a
|
|||
|
|
// following low surrogate.
|
|||
|
|
if (Run < Tail) and
|
|||
|
|
(SurrogateHighStart <= C) and (C <= SurrogateHighEnd) and
|
|||
|
|
(SurrogateLowStart <= UCS4(Run^)) and (UCS4(Run^) <= SurrogateLowEnd) then
|
|||
|
|
begin
|
|||
|
|
C := $10000 + (((C and $03FF) shl 10) or (UCS4(Run^) and $03FF));
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Determine if the character is non-spacing and should be skipped.
|
|||
|
|
if ((Flags and URE_IGNORE_NONSPACING) <> 0) and UnicodeIsNonSpacingMark(C) then
|
|||
|
|
begin
|
|||
|
|
Inc(Run);
|
|||
|
|
Continue;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if (FDFA.Flags and _URE_DFA_CASEFOLD) <> 0 then
|
|||
|
|
{ TODO : use the entire mapping, not only the first character }
|
|||
|
|
C := UnicodeToLower(C)[0];
|
|||
|
|
|
|||
|
|
// See if one of the transitions matches.
|
|||
|
|
I := LastState.NumberTransitions - 1;
|
|||
|
|
Matched := False;
|
|||
|
|
|
|||
|
|
while not Matched and (I >= 0) do
|
|||
|
|
begin
|
|||
|
|
Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol];
|
|||
|
|
case Symbol.AType of
|
|||
|
|
_URE_ANY_CHAR:
|
|||
|
|
if ((Flags and URE_DONT_MATCHES_SEPARATORS) <> 0) or
|
|||
|
|
not IsSeparator(C) then
|
|||
|
|
Matched := True;
|
|||
|
|
_URE_CHAR:
|
|||
|
|
if C = Symbol.Symbol.Chr then
|
|||
|
|
Matched := True;
|
|||
|
|
_URE_BOL_ANCHOR:
|
|||
|
|
if Lp = Text then
|
|||
|
|
begin
|
|||
|
|
Run := lp;
|
|||
|
|
Matched := True;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if IsSeparator(C) then
|
|||
|
|
begin
|
|||
|
|
if (C = $D) and (Run < Tail) and (Run^ = #$A) then
|
|||
|
|
Inc(Run);
|
|||
|
|
Lp := Run;
|
|||
|
|
Matched := True;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
_URE_EOL_ANCHOR:
|
|||
|
|
if IsSeparator(C) then
|
|||
|
|
begin
|
|||
|
|
// Put the pointer back before the separator so the match end
|
|||
|
|
// position will be correct. This will also cause the `Run'
|
|||
|
|
// pointer to be advanced over the current separator once the
|
|||
|
|
// match end point has been recorded.
|
|||
|
|
Run := Lp;
|
|||
|
|
Matched := True;
|
|||
|
|
end;
|
|||
|
|
_URE_CCLASS,
|
|||
|
|
_URE_NCCLASS:
|
|||
|
|
with Symbol^ do
|
|||
|
|
begin
|
|||
|
|
if Categories <> [] then
|
|||
|
|
Matched := CategoryLookup(C, Categories);
|
|||
|
|
if Symbol.CCL.RangesUsed > 0 then
|
|||
|
|
begin
|
|||
|
|
Rp := @Symbol.CCL.Ranges[0];
|
|||
|
|
for J := 0 to Symbol.CCL.RangesUsed - 1 do
|
|||
|
|
begin
|
|||
|
|
if (Rp.MinCode <= C) and (C <= Rp.MaxCode) then
|
|||
|
|
begin
|
|||
|
|
Matched := True;
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
Inc(Rp);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if AType = _URE_NCCLASS then
|
|||
|
|
Matched := not Matched;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if Matched then
|
|||
|
|
begin
|
|||
|
|
if Start = -1 then
|
|||
|
|
Start := Lp - Text
|
|||
|
|
else
|
|||
|
|
Stop := Run - Text;
|
|||
|
|
|
|||
|
|
LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState];
|
|||
|
|
|
|||
|
|
// If the match was an EOL anchor, adjust the pointer past the separator
|
|||
|
|
// that caused the match. The correct match position has been recorded
|
|||
|
|
// already.
|
|||
|
|
if Symbol.AType = _URE_EOL_ANCHOR then
|
|||
|
|
begin
|
|||
|
|
// skip the character that caused the match
|
|||
|
|
Inc(Run);
|
|||
|
|
// handle the infamous CRLF situation
|
|||
|
|
if (Run < Tail) and (C = $D) and (Run^ = #$A) then
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
Dec(I);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if not Matched then
|
|||
|
|
begin
|
|||
|
|
Found := LastState.Accepting;
|
|||
|
|
if not Found then
|
|||
|
|
begin
|
|||
|
|
// If the last state was not accepting, then reset and start over.
|
|||
|
|
LastState := @FDFA.StateList.States[0];
|
|||
|
|
Start := -1;
|
|||
|
|
Stop := -1;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// set start and stop pointer if not yet done
|
|||
|
|
if Start = -1 then
|
|||
|
|
begin
|
|||
|
|
Start := Lp - Text;
|
|||
|
|
Stop := Run - Text;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if Stop = -1 then
|
|||
|
|
Stop := Lp - Text;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if Run = Tail then
|
|||
|
|
begin
|
|||
|
|
if not LastState.Accepting then
|
|||
|
|
begin
|
|||
|
|
// This ugly hack is to make sure the end-of-line anchors match
|
|||
|
|
// when the source text hits the end. This is only done if the last
|
|||
|
|
// subexpression matches.
|
|||
|
|
for I := 0 to LastState.NumberTransitions - 1 do
|
|||
|
|
begin
|
|||
|
|
if Found then
|
|||
|
|
Break;
|
|||
|
|
Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol];
|
|||
|
|
if Symbol.AType =_URE_EOL_ANCHOR then
|
|||
|
|
begin
|
|||
|
|
LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState];
|
|||
|
|
if LastState.Accepting then
|
|||
|
|
begin
|
|||
|
|
Stop := Run - Text;
|
|||
|
|
Found := True;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Break;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
// Make sure any conditions that match all the way to the end of
|
|||
|
|
// the string match.
|
|||
|
|
Found := True;
|
|||
|
|
Stop := Run - Text;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if Found then
|
|||
|
|
begin
|
|||
|
|
MatchStart := Start;
|
|||
|
|
MatchEnd := Stop;
|
|||
|
|
end;
|
|||
|
|
Result := Found;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.FindAll(const Text: WideString): Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := FindAll(PWideChar(Text), Length(Text));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.FindAll(Text: PWideChar; TextLen: Cardinal): Boolean;
|
|||
|
|
// Looks for all occurences of the pattern passed to FindPrepare and creates an
|
|||
|
|
// internal list of their positions.
|
|||
|
|
var
|
|||
|
|
Start, Stop: Cardinal;
|
|||
|
|
Run: PWideChar;
|
|||
|
|
RunLen: Cardinal;
|
|||
|
|
begin
|
|||
|
|
ClearResults;
|
|||
|
|
Run := Text;
|
|||
|
|
RunLen := TextLen;
|
|||
|
|
// repeat to find all occurences of the pattern
|
|||
|
|
while ExecuteURE(0, Run, RunLen, Start, Stop) do
|
|||
|
|
begin
|
|||
|
|
// store this result (consider text pointer movement)...
|
|||
|
|
AddResult(Start + Run - Text, Stop + Run - Text);
|
|||
|
|
// ... and advance text position and length
|
|||
|
|
Inc(Run, Stop);
|
|||
|
|
Dec(RunLen, Stop);
|
|||
|
|
end;
|
|||
|
|
Result := FResults.Count > 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.FindFirst(const Text: WideString; var Start, Stop: Cardinal): Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := FindFirst(PWideChar(Text), Length(Text), Start, Stop);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TURESearch.FindFirst(Text: PWideChar; TextLen: Cardinal; var Start, Stop: Cardinal): Boolean;
|
|||
|
|
// Looks for the first occurence of the pattern passed to FindPrepare in Text and
|
|||
|
|
// returns True if one could be found (in which case Start and Stop are set to
|
|||
|
|
// the according indices) otherwise False. This function is in particular of
|
|||
|
|
// interest if only one occurence needs to be found.
|
|||
|
|
begin
|
|||
|
|
ClearResults;
|
|||
|
|
Result := ExecuteURE(0, PWideChar(Text), Length(Text), Start, Stop);
|
|||
|
|
if Result then
|
|||
|
|
AddResult(Start, Stop);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.FindPrepare(Pattern: PWideChar; PatternLength: Cardinal; Options: TSearchFlags);
|
|||
|
|
begin
|
|||
|
|
CompileURE(Pattern, PatternLength, not (sfCaseSensitive in Options));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TURESearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags);
|
|||
|
|
begin
|
|||
|
|
CompileURE(PWideChar(Pattern), Length(Pattern), not (sfCaseSensitive in Options));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//=== { TWideStrings } =======================================================
|
|||
|
|
|
|||
|
|
constructor TWideStrings.Create;
|
|||
|
|
begin
|
|||
|
|
inherited Create;
|
|||
|
|
FLanguage := GetUserDefaultLCID;
|
|||
|
|
FNormalizationForm := nfC;
|
|||
|
|
FSaveFormat := sfUnicodeLSB;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetLanguage(Value: LCID);
|
|||
|
|
begin
|
|||
|
|
FLanguage := Value;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetSaveUnicode: Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := SaveFormat = sfUnicodeLSB;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetSaveUnicode(const Value: Boolean);
|
|||
|
|
begin
|
|||
|
|
if Value then
|
|||
|
|
SaveFormat := sfUnicodeLSB
|
|||
|
|
else
|
|||
|
|
SaveFormat := sfAnsi;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.Add(const S: WideString): Integer;
|
|||
|
|
begin
|
|||
|
|
Result := GetCount;
|
|||
|
|
Insert(Result, S);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.AddObject(const S: WideString; AObject: TObject): Integer;
|
|||
|
|
begin
|
|||
|
|
Result := Add(S);
|
|||
|
|
PutObject(Result, AObject);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.Append(const S: WideString);
|
|||
|
|
begin
|
|||
|
|
Add(S);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.AddStrings(Strings: TStrings);
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
S: WideString;
|
|||
|
|
CP: Integer;
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
CP := CodePageFromLocale(FLanguage);
|
|||
|
|
for I := 0 to Strings.Count - 1 do
|
|||
|
|
begin
|
|||
|
|
S := StringToWideStringEx(Strings[I], CP);
|
|||
|
|
AddObject(S, Strings.Objects[I]);
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.AddStrings(Strings: TWideStrings);
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
SourceCP,
|
|||
|
|
TargetCP: Integer;
|
|||
|
|
S: WideString;
|
|||
|
|
begin
|
|||
|
|
Assert(Strings <> nil);
|
|||
|
|
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
if Strings.FLanguage <> FLanguage then
|
|||
|
|
begin
|
|||
|
|
SourceCP := CodePageFromLocale(Strings.FLanguage);
|
|||
|
|
TargetCP := CodePageFromLocale(FLanguage);
|
|||
|
|
for I := 0 to Strings.Count - 1 do
|
|||
|
|
begin
|
|||
|
|
S := TranslateString(Strings[I], SourceCP, TargetCP);
|
|||
|
|
AddObject(S, Strings.Objects[I]);
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
for I := 0 to Strings.Count - 1 do
|
|||
|
|
AddObject(Strings[I], Strings.Objects[I]);
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.Assign(Source: TPersistent);
|
|||
|
|
// usual assignment routine, but able to assign wide and small strings
|
|||
|
|
begin
|
|||
|
|
if Source is TWideStrings then
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
Clear;
|
|||
|
|
AddStrings(TWideStrings(Source));
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if Source is TStrings then
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
Clear;
|
|||
|
|
AddStrings(TStrings(Source));
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
inherited Assign(Source);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.AssignTo(Dest: TPersistent);
|
|||
|
|
// need to do also assignment to old style TStrings, but this class doesn't know
|
|||
|
|
// TWideStrings, so we need to do it from here
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
S: string;
|
|||
|
|
CP: Integer;
|
|||
|
|
begin
|
|||
|
|
if Dest is TStrings then
|
|||
|
|
begin
|
|||
|
|
with Dest as TStrings do
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
CP := CodePageFromLocale(FLanguage);
|
|||
|
|
Clear;
|
|||
|
|
for I := 0 to Self.Count - 1 do
|
|||
|
|
begin
|
|||
|
|
S := WideStringToStringEx(Self[I], CP);
|
|||
|
|
AddObject(S, Self.Objects[I]);
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if Dest is TWideStrings then
|
|||
|
|
begin
|
|||
|
|
with Dest as TWideStrings do
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
Clear;
|
|||
|
|
AddStrings(Self);
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
inherited AssignTo(Dest);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.BeginUpdate;
|
|||
|
|
begin
|
|||
|
|
if FUpdateCount = 0 then
|
|||
|
|
SetUpdateState(True);
|
|||
|
|
Inc(FUpdateCount);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.DefineProperties(Filer: TFiler);
|
|||
|
|
|
|||
|
|
// Defines a private property for the content of the list.
|
|||
|
|
// There's a bug in the handling of text DFMs in Classes.pas which prevents
|
|||
|
|
// WideStrings from loading under some circumstances. Zbysek Hlinka
|
|||
|
|
// (zhlinka att login dott cz) brought this to my attention and supplied also a solution.
|
|||
|
|
// See ReadData and WriteData methods for implementation details.
|
|||
|
|
|
|||
|
|
//--------------- local function --------------------------------------------
|
|||
|
|
|
|||
|
|
function DoWrite: Boolean;
|
|||
|
|
begin
|
|||
|
|
if Filer.Ancestor <> nil then
|
|||
|
|
begin
|
|||
|
|
Result := True;
|
|||
|
|
if Filer.Ancestor is TWideStrings then
|
|||
|
|
Result := not Equals(TWideStrings(Filer.Ancestor))
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Result := Count > 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//--------------- end local function ----------------------------------------
|
|||
|
|
|
|||
|
|
begin
|
|||
|
|
Filer.DefineProperty('WideStrings', ReadData, WriteData, DoWrite);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.DoConfirmConversion(var Allowed: Boolean);
|
|||
|
|
begin
|
|||
|
|
if Assigned(FOnConfirmConversion) then
|
|||
|
|
FOnConfirmConversion(Self, Allowed);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.EndUpdate;
|
|||
|
|
begin
|
|||
|
|
Dec(FUpdateCount);
|
|||
|
|
if FUpdateCount = 0 then
|
|||
|
|
SetUpdateState(False);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.Equals(Strings: TWideStrings): Boolean;
|
|||
|
|
var
|
|||
|
|
I, Count: Integer;
|
|||
|
|
begin
|
|||
|
|
Assert(Strings <> nil);
|
|||
|
|
|
|||
|
|
Result := False;
|
|||
|
|
Count := GetCount;
|
|||
|
|
if Count <> Strings.GetCount then
|
|||
|
|
Exit;
|
|||
|
|
{ TODO : use internal comparation routine as soon as composition is implemented }
|
|||
|
|
for I := 0 to Count - 1 do
|
|||
|
|
if Get(I) <> Strings.Get(I) then
|
|||
|
|
Exit;
|
|||
|
|
Result := True;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.Error(const Msg: string; Data: Integer);
|
|||
|
|
|
|||
|
|
function ReturnAddr: Pointer;
|
|||
|
|
asm
|
|||
|
|
MOV EAX, [EBP + 4]
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
begin
|
|||
|
|
raise EStringListError.CreateFmt(Msg, [Data]) at ReturnAddr;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.Exchange(Index1, Index2: Integer);
|
|||
|
|
var
|
|||
|
|
TempObject: TObject;
|
|||
|
|
TempString: WideString;
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
TempString := Strings[Index1];
|
|||
|
|
TempObject := Objects[Index1];
|
|||
|
|
Strings[Index1] := Strings[Index2];
|
|||
|
|
Objects[Index1] := Objects[Index2];
|
|||
|
|
Strings[Index2] := TempString;
|
|||
|
|
Objects[Index2] := TempObject;
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetCapacity: Integer;
|
|||
|
|
// Descendants may optionally override/replace this default implementation.
|
|||
|
|
begin
|
|||
|
|
Result := Count;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetCommaText: WideString;
|
|||
|
|
var
|
|||
|
|
S: WideString;
|
|||
|
|
P: PWideChar;
|
|||
|
|
I, Count: Integer;
|
|||
|
|
begin
|
|||
|
|
Count := GetCount;
|
|||
|
|
if (Count = 1) and (Get(0) = '') then
|
|||
|
|
Result := '""'
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
for I := 0 to Count - 1 do
|
|||
|
|
begin
|
|||
|
|
S := Get(I);
|
|||
|
|
P := PWideChar(S);
|
|||
|
|
while not (P^ in [WideNull..WideSpace, WideChar('"'), WideChar(',')]) do
|
|||
|
|
Inc(P);
|
|||
|
|
if P^ <> WideNull then
|
|||
|
|
S := WideQuotedStr(S, '"');
|
|||
|
|
Result := Result + S + ',';
|
|||
|
|
end;
|
|||
|
|
System.Delete(Result, Length(Result), 1);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetName(Index: Integer): WideString;
|
|||
|
|
var
|
|||
|
|
P: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := Get(Index);
|
|||
|
|
P := Pos('=', Result);
|
|||
|
|
if P > 0 then
|
|||
|
|
SetLength(Result, P - 1)
|
|||
|
|
else
|
|||
|
|
Result := '';
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetObject(Index: Integer): TObject;
|
|||
|
|
begin
|
|||
|
|
Result := nil;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetSeparatedText(Separators: WideString): WideString;
|
|||
|
|
// Same as GetText but with customizable separator characters.
|
|||
|
|
var
|
|||
|
|
I, L,
|
|||
|
|
Size,
|
|||
|
|
Count,
|
|||
|
|
SepSize: Integer;
|
|||
|
|
P: PWideChar;
|
|||
|
|
S: WideString;
|
|||
|
|
begin
|
|||
|
|
Count := GetCount;
|
|||
|
|
SepSize := Length(Separators);
|
|||
|
|
Size := 0;
|
|||
|
|
for I := 0 to Count - 1 do
|
|||
|
|
Inc(Size, Length(Get(I)) + SepSize);
|
|||
|
|
|
|||
|
|
// set one separator less, the last line does not need a trailing separator
|
|||
|
|
SetLength(Result, Size - SepSize);
|
|||
|
|
if Size > 0 then
|
|||
|
|
begin
|
|||
|
|
P := Pointer(Result);
|
|||
|
|
I := 0;
|
|||
|
|
while True do
|
|||
|
|
begin
|
|||
|
|
S := Get(I);
|
|||
|
|
L := Length(S);
|
|||
|
|
if L <> 0 then
|
|||
|
|
begin
|
|||
|
|
// add current string
|
|||
|
|
System.Move(Pointer(S)^, P^, 2 * L);
|
|||
|
|
Inc(P, L);
|
|||
|
|
end;
|
|||
|
|
Inc(I);
|
|||
|
|
if I = Count then
|
|||
|
|
Break;
|
|||
|
|
|
|||
|
|
// add separators
|
|||
|
|
System.Move(Pointer(Separators)^, P^, SizeOf(WideChar) * SepSize);
|
|||
|
|
Inc(P, SepSize);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetTextStr: WideString;
|
|||
|
|
begin
|
|||
|
|
Result := GetSeparatedText(WideCRLF);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetText: PWideChar;
|
|||
|
|
begin
|
|||
|
|
Result := StrNewW(GetTextStr);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.GetValue(const Name: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
I := IndexOfName(Name);
|
|||
|
|
if I >= 0 then
|
|||
|
|
Result := Copy(Get(I), Length(Name) + 2, MaxInt)
|
|||
|
|
else
|
|||
|
|
Result := '';
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.IndexOf(const S: WideString): Integer;
|
|||
|
|
var
|
|||
|
|
NormString: WideString;
|
|||
|
|
begin
|
|||
|
|
NormString := WideNormalize(S, FNormalizationForm);
|
|||
|
|
|
|||
|
|
for Result := 0 to GetCount - 1 do
|
|||
|
|
if WideCompareText(Get(Result), NormString, FLanguage) = 0 then
|
|||
|
|
Exit;
|
|||
|
|
Result := -1;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.IndexOfName(const Name: WideString): Integer;
|
|||
|
|
var
|
|||
|
|
P: Integer;
|
|||
|
|
S: WideString;
|
|||
|
|
NormName: WideString;
|
|||
|
|
begin
|
|||
|
|
NormName := WideNormalize(Name, FNormalizationForm);
|
|||
|
|
|
|||
|
|
for Result := 0 to GetCount - 1 do
|
|||
|
|
begin
|
|||
|
|
S := Get(Result);
|
|||
|
|
P := Pos('=', S);
|
|||
|
|
if (P > 0) and (WideCompareText(Copy(S, 1, P - 1), NormName, FLanguage) = 0) then
|
|||
|
|
Exit;
|
|||
|
|
end;
|
|||
|
|
Result := -1;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStrings.IndexOfObject(AObject: TObject): Integer;
|
|||
|
|
begin
|
|||
|
|
for Result := 0 to GetCount - 1 do
|
|||
|
|
if GetObject(Result) = AObject then
|
|||
|
|
Exit;
|
|||
|
|
Result := -1;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.InsertObject(Index: Integer; const S: WideString; AObject: TObject);
|
|||
|
|
begin
|
|||
|
|
Insert(Index, S);
|
|||
|
|
PutObject(Index, AObject);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.LoadFromFile(const FileName: string);
|
|||
|
|
var
|
|||
|
|
Stream: TStream;
|
|||
|
|
begin
|
|||
|
|
try
|
|||
|
|
Stream := TFileStream.Create(FileName, fmOpenRead or fmShareDenyNone);
|
|||
|
|
try
|
|||
|
|
LoadFromStream(Stream);
|
|||
|
|
finally
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
except
|
|||
|
|
RaiseLastOSError;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.LoadFromStream(Stream: TStream);
|
|||
|
|
// usual loader routine, but enhanced to handle byte order marks in stream
|
|||
|
|
var
|
|||
|
|
Size,
|
|||
|
|
BytesRead: Integer;
|
|||
|
|
ByteOrderMask: array [0..5] of Byte; // BOM size is max 5 bytes (cf: wikipedia)
|
|||
|
|
// but it is easier to implement with a multiple of 2
|
|||
|
|
Loaded: Boolean;
|
|||
|
|
SW: WideString;
|
|||
|
|
SA: string;
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
Loaded := False;
|
|||
|
|
|
|||
|
|
Size := Stream.Size - Stream.Position;
|
|||
|
|
BytesRead := Stream.Read(ByteOrderMask[0],SizeOf(ByteOrderMask));
|
|||
|
|
|
|||
|
|
// UTF16 LSB = Unicode LSB
|
|||
|
|
if (BytesRead >= 2) and (ByteOrderMask[0] = BOM_UTF16_LSB[0])
|
|||
|
|
and (ByteOrderMask[1] = BOM_UTF16_LSB[1]) then
|
|||
|
|
begin
|
|||
|
|
FSaveFormat := sfUTF16LSB;
|
|||
|
|
SetLength(SW, (Size - 2) div SizeOf(WideChar));
|
|||
|
|
Assert((Size and 1) <> 1,'Number of chars must be a multiple of 2');
|
|||
|
|
System.Move(ByteOrderMask[2],SW[1],BytesRead-2); // max 4 bytes = 2 widechars
|
|||
|
|
Stream.Read(SW[3], Size-BytesRead); // first 2 chars were copied by System.Move
|
|||
|
|
SetText(SW);
|
|||
|
|
Loaded := True;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// UTF16 MSB = Unicode MSB
|
|||
|
|
if (BytesRead >= 2) and (ByteOrderMask[0] = BOM_UTF16_MSB[0])
|
|||
|
|
and (ByteOrderMask[1] = BOM_UTF16_MSB[1]) then
|
|||
|
|
begin
|
|||
|
|
FSaveFormat := sfUTF16MSB;
|
|||
|
|
SetLength(SW, (Size - 2) div SizeOf(WideChar));
|
|||
|
|
Assert((Size and 1) <> 1,'Number of chars must be a multiple of 2');
|
|||
|
|
System.Move(ByteOrderMask[2],SW[1],BytesRead-2); // max 4 bytes = 2 widechars
|
|||
|
|
Stream.Read(SW[3], Size-BytesRead); // first 2 chars were copied by System.Move
|
|||
|
|
StrSwapByteOrder(PWideChar(SW));
|
|||
|
|
SetText(SW);
|
|||
|
|
Loaded := True;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// UTF8
|
|||
|
|
if (BytesRead >= 3) and (ByteOrderMask[0] = BOM_UTF8[0])
|
|||
|
|
and (ByteOrderMask[1] = BOM_UTF8[1]) and (ByteOrderMask[2] = BOM_UTF8[2]) then
|
|||
|
|
begin
|
|||
|
|
FSaveFormat := sfUTF8;
|
|||
|
|
SetLength(SA, (Size-3) div SizeOf(Char));
|
|||
|
|
System.Move(ByteOrderMask[3],SA[1],BytesRead-3); // max 3 bytes = 3 chars
|
|||
|
|
Stream.Read(SA[4], Size-BytesRead); // first 3 chars were copied by System.Move
|
|||
|
|
SW := UTF8ToWideString(SA);
|
|||
|
|
SetText(SW);
|
|||
|
|
Loaded := True;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// default case (Ansi)
|
|||
|
|
if not Loaded then
|
|||
|
|
begin
|
|||
|
|
FSaveFormat := sfAnsi;
|
|||
|
|
SetLength(SA, Size div SizeOf(Char));
|
|||
|
|
System.Move(ByteOrderMask[0],SA[1],BytesRead); // max 6 bytes = 6 chars
|
|||
|
|
Stream.Read(SA[7], Size-BytesRead); // first 6 chars were copied by System.Move
|
|||
|
|
SetText(SA);
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.Move(CurIndex, NewIndex: Integer);
|
|||
|
|
var
|
|||
|
|
TempObject: TObject;
|
|||
|
|
TempString: WideString;
|
|||
|
|
begin
|
|||
|
|
if CurIndex <> NewIndex then
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
TempString := Get(CurIndex);
|
|||
|
|
TempObject := GetObject(CurIndex);
|
|||
|
|
Delete(CurIndex);
|
|||
|
|
InsertObject(NewIndex, TempString, TempObject);
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.ReadData(Reader: TReader);
|
|||
|
|
begin
|
|||
|
|
case Reader.NextValue of
|
|||
|
|
vaLString, vaString:
|
|||
|
|
SetText(Reader.ReadString);
|
|||
|
|
else
|
|||
|
|
SetText(Reader.ReadWideString);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SaveToFile(const FileName: string);
|
|||
|
|
var
|
|||
|
|
Stream: TStream;
|
|||
|
|
begin
|
|||
|
|
Stream := TFileStream.Create(FileName, fmCreate);
|
|||
|
|
try
|
|||
|
|
SaveToStream(Stream);
|
|||
|
|
finally
|
|||
|
|
Stream.Free;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SaveToStream(Stream: TStream; WithBOM: Boolean = True);
|
|||
|
|
// Saves the currently loaded text into the given stream. WithBOM determines whether to write a
|
|||
|
|
// byte order mark or not. Note: when saved as ANSI text there will never be a BOM.
|
|||
|
|
var
|
|||
|
|
SW: WideString;
|
|||
|
|
SA: string;
|
|||
|
|
Allowed: Boolean;
|
|||
|
|
Run: PWideChar;
|
|||
|
|
begin
|
|||
|
|
// The application can decide in which format to save the content.
|
|||
|
|
// If FSaveUnicode is False then all strings are saved in standard ANSI format
|
|||
|
|
// which is also loadable by TStrings but you should be aware that all Unicode
|
|||
|
|
// strings are then converted to ANSI based on the current system locale.
|
|||
|
|
// An extra event is supplied to ask the user about the potential loss of
|
|||
|
|
// information when converting Unicode to ANSI strings.
|
|||
|
|
SW := GetTextStr;
|
|||
|
|
Allowed := True;
|
|||
|
|
FSaved := False; // be pessimistic
|
|||
|
|
// A check for potential information loss makes only sense if the application has
|
|||
|
|
// set an event to be used as call back to ask about the conversion.
|
|||
|
|
if (FSaveFormat = sfAnsi) and Assigned(FOnConfirmConversion) then
|
|||
|
|
begin
|
|||
|
|
// application requests to save only ANSI characters, so check the text and
|
|||
|
|
// call back in case information could be lost
|
|||
|
|
Run := PWideChar(SW);
|
|||
|
|
// only ask if there's at least one Unicode character in the text
|
|||
|
|
while Run^ in [WideChar(#1)..WideChar(#255)] do
|
|||
|
|
Inc(Run);
|
|||
|
|
// Note: The application can still set FSaveUnicode to True in the callback.
|
|||
|
|
if Run^ <> WideNull then
|
|||
|
|
DoConfirmConversion(Allowed);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if Allowed then
|
|||
|
|
begin
|
|||
|
|
// only save if allowed
|
|||
|
|
case SaveFormat of
|
|||
|
|
sfUTF16LSB :
|
|||
|
|
begin
|
|||
|
|
Stream.WriteBuffer(BOM_UTF16_LSB[0],SizeOf(BOM_UTF16_LSB));
|
|||
|
|
Stream.WriteBuffer(SW[1],Length(SW)*SizeOf(UTF16));
|
|||
|
|
FSaved := True;
|
|||
|
|
end;
|
|||
|
|
sfUTF16MSB :
|
|||
|
|
begin
|
|||
|
|
Stream.WriteBuffer(BOM_UTF16_MSB[0],SizeOf(BOM_UTF16_MSB));
|
|||
|
|
StrSwapByteOrder(PWideChar(SW));
|
|||
|
|
Stream.WriteBuffer(SW[1],Length(SW)*SizeOf(UTF16));
|
|||
|
|
FSaved := True;
|
|||
|
|
end;
|
|||
|
|
sfUTF8 :
|
|||
|
|
begin
|
|||
|
|
Stream.WriteBuffer(BOM_UTF8[0],SizeOf(BOM_UTF8));
|
|||
|
|
SA := WideStringToUTF8(SW);
|
|||
|
|
Stream.WriteBuffer(SA[1],Length(SA)*SizeOf(UTF8));
|
|||
|
|
FSaved := True;
|
|||
|
|
end;
|
|||
|
|
sfAnsi :
|
|||
|
|
begin
|
|||
|
|
SA := WideStringToStringEx(SW,CodePageFromLocale(FLanguage));
|
|||
|
|
Stream.WriteBuffer(SA[1],Length(SA)*SizeOf(Char));
|
|||
|
|
FSaved := True;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetCapacity(NewCapacity: Integer);
|
|||
|
|
begin
|
|||
|
|
// do nothing - descendants may optionally implement this method
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetCommaText(const Value: WideString);
|
|||
|
|
var
|
|||
|
|
P, P1: PWideChar;
|
|||
|
|
S: WideString;
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
Clear;
|
|||
|
|
P := PWideChar(Value);
|
|||
|
|
while P^ in [WideChar(#1)..WideSpace] do
|
|||
|
|
Inc(P);
|
|||
|
|
while P^ <> WideNull do
|
|||
|
|
begin
|
|||
|
|
if P^ = '"' then
|
|||
|
|
S := WideExtractQuotedStr(P, '"')
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
P1 := P;
|
|||
|
|
while (P^ > WideSpace) and (P^ <> ',') do
|
|||
|
|
Inc(P);
|
|||
|
|
SetString(S, P1, P - P1);
|
|||
|
|
end;
|
|||
|
|
Add(S);
|
|||
|
|
|
|||
|
|
while P^ in [WideChar(#1)..WideSpace] do
|
|||
|
|
Inc(P);
|
|||
|
|
if P^ = ',' then
|
|||
|
|
begin
|
|||
|
|
repeat
|
|||
|
|
Inc(P);
|
|||
|
|
until not (P^ in [WideChar(#1)..WideSpace]);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetText(const Value: WideString);
|
|||
|
|
var
|
|||
|
|
Head,
|
|||
|
|
Tail: PWideChar;
|
|||
|
|
S: WideString;
|
|||
|
|
begin
|
|||
|
|
BeginUpdate;
|
|||
|
|
try
|
|||
|
|
Clear;
|
|||
|
|
Head := PWideChar(Value);
|
|||
|
|
while Head^ <> WideNull do
|
|||
|
|
begin
|
|||
|
|
Tail := Head;
|
|||
|
|
while not (Tail^ in [WideNull, WideLineFeed, WideCarriageReturn, WideVerticalTab, WideFormFeed]) and
|
|||
|
|
(Tail^ <> WideLineSeparator) and (Tail^ <> WideParagraphSeparator) do
|
|||
|
|
Inc(Tail);
|
|||
|
|
SetString(S, Head, Tail - Head);
|
|||
|
|
Add(S);
|
|||
|
|
Head := Tail;
|
|||
|
|
if Head^ <> WideNull then
|
|||
|
|
begin
|
|||
|
|
Inc(Head);
|
|||
|
|
if (Tail^ = WideCarriageReturn) and (Head^ = WideLineFeed) then
|
|||
|
|
Inc(Head);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
finally
|
|||
|
|
EndUpdate;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetUpdateState(Updating: Boolean);
|
|||
|
|
begin
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetNormalizationForm(const Value: TNormalizationForm);
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
if FNormalizationForm <> Value then
|
|||
|
|
begin
|
|||
|
|
FNormalizationForm := Value;
|
|||
|
|
if FNormalizationForm <> nfNone then
|
|||
|
|
begin
|
|||
|
|
// renormalize all strings according to the new form
|
|||
|
|
for I := 0 to GetCount - 1 do
|
|||
|
|
Put(I, WideNormalize(Get(I), FNormalizationForm));
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.SetValue(const Name, Value: WideString);
|
|||
|
|
var
|
|||
|
|
I : Integer;
|
|||
|
|
begin
|
|||
|
|
I := IndexOfName(Name);
|
|||
|
|
if Value <> '' then
|
|||
|
|
begin
|
|||
|
|
if I < 0 then
|
|||
|
|
I := Add('');
|
|||
|
|
Put(I, Name + '=' + Value);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if I >= 0 then
|
|||
|
|
Delete(I);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStrings.WriteData(Writer: TWriter);
|
|||
|
|
begin
|
|||
|
|
Writer.WriteWideString(GetTextStr);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//=== { TWideStringList } ====================================================
|
|||
|
|
|
|||
|
|
destructor TWideStringList.Destroy;
|
|||
|
|
begin
|
|||
|
|
FOnChange := nil;
|
|||
|
|
FOnChanging := nil;
|
|||
|
|
Clear;
|
|||
|
|
inherited Destroy;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStringList.Add(const S: WideString): Integer;
|
|||
|
|
begin
|
|||
|
|
if not Sorted then
|
|||
|
|
Result := FCount
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if Find(S, Result) then
|
|||
|
|
begin
|
|||
|
|
case Duplicates of
|
|||
|
|
dupIgnore:
|
|||
|
|
Exit;
|
|||
|
|
dupError:
|
|||
|
|
Error(SDuplicateString, 0);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
InsertItem(Result, S);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Changed;
|
|||
|
|
begin
|
|||
|
|
if (FUpdateCount = 0) and Assigned(FOnChange) then
|
|||
|
|
FOnChange(Self);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Changing;
|
|||
|
|
begin
|
|||
|
|
if (FUpdateCount = 0) and Assigned(FOnChanging) then
|
|||
|
|
FOnChanging(Self);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Clear;
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
begin
|
|||
|
|
if FCount <> 0 then
|
|||
|
|
begin
|
|||
|
|
Changing;
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
for I := 0 to FCount - 1 do
|
|||
|
|
with FList[I] do
|
|||
|
|
if TDynWideCharArray(FString) <> nil then
|
|||
|
|
TDynWideCharArray(FString) := nil;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
// this will automatically finalize the array
|
|||
|
|
FList := nil;
|
|||
|
|
FCount := 0;
|
|||
|
|
SetCapacity(0);
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Delete(Index: Integer);
|
|||
|
|
begin
|
|||
|
|
if Cardinal(Index) >= Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index);
|
|||
|
|
Changing;
|
|||
|
|
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
SetListString(Index, '');
|
|||
|
|
{$ELSE}
|
|||
|
|
FList[Index].FString := '';
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
Dec(FCount);
|
|||
|
|
if Index < FCount then
|
|||
|
|
begin
|
|||
|
|
System.Move(FList[Index + 1], FList[Index], (FCount - Index) * SizeOf(TWideStringItem));
|
|||
|
|
Pointer(FList[FCount].FString) := nil; // avoid freeing the string, the address is now used in another element
|
|||
|
|
end;
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Exchange(Index1, Index2: Integer);
|
|||
|
|
begin
|
|||
|
|
if Cardinal(Index1) >= Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index1);
|
|||
|
|
if Cardinal(Index2) >= Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index2);
|
|||
|
|
Changing;
|
|||
|
|
ExchangeItems(Index1, Index2);
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.ExchangeItems(Index1, Index2: Integer);
|
|||
|
|
var
|
|||
|
|
Temp: TWideStringItem;
|
|||
|
|
begin
|
|||
|
|
Temp := FList[Index1];
|
|||
|
|
FList[Index1] := FList[Index2];
|
|||
|
|
FList[Index2] := Temp;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStringList.Find(const S: WideString; var Index: Integer): Boolean;
|
|||
|
|
var
|
|||
|
|
L, H, I, C: Integer;
|
|||
|
|
NormString: WideString;
|
|||
|
|
begin
|
|||
|
|
Result := False;
|
|||
|
|
NormString := WideNormalize(S, FNormalizationForm);
|
|||
|
|
L := 0;
|
|||
|
|
H := FCount - 1;
|
|||
|
|
while L <= H do
|
|||
|
|
begin
|
|||
|
|
I := (L + H) shr 1;
|
|||
|
|
C := WideCompareText(FList[I].FString, NormString, FLanguage);
|
|||
|
|
if C < 0 then
|
|||
|
|
L := I+1
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
H := I - 1;
|
|||
|
|
if C = 0 then
|
|||
|
|
begin
|
|||
|
|
Result := True;
|
|||
|
|
if Duplicates <> dupAccept then
|
|||
|
|
L := I;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
Index := L;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStringList.Get(Index: Integer): WideString;
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
var
|
|||
|
|
Len: Integer;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
begin
|
|||
|
|
if Cardinal(Index) >= Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index);
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
with FList[Index] do
|
|||
|
|
begin
|
|||
|
|
Len := Length(TDynWideCharArray(FString));
|
|||
|
|
if Len > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(Result, Len - 1); // exclude #0
|
|||
|
|
if Result <> '' then
|
|||
|
|
System.Move(FString^, Result[1], Len * SizeOf(WideChar));
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Result := '';
|
|||
|
|
end;
|
|||
|
|
{$ELSE}
|
|||
|
|
Result := FList[Index].FString;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStringList.GetCapacity: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := Length(FList);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStringList.GetCount: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := FCount;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStringList.GetObject(Index: Integer): TObject;
|
|||
|
|
begin
|
|||
|
|
if Cardinal(Index) >= Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index);
|
|||
|
|
Result := FList[Index].FObject;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Grow;
|
|||
|
|
var
|
|||
|
|
Delta,
|
|||
|
|
Len: Integer;
|
|||
|
|
begin
|
|||
|
|
Len := Length(FList);
|
|||
|
|
if Len > 64 then
|
|||
|
|
Delta := Len div 4
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if Len > 8 then
|
|||
|
|
Delta := 16
|
|||
|
|
else
|
|||
|
|
Delta := 4;
|
|||
|
|
end;
|
|||
|
|
SetCapacity(Len + Delta);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TWideStringList.IndexOf(const S: WideString): Integer;
|
|||
|
|
begin
|
|||
|
|
if not Sorted then
|
|||
|
|
Result := inherited IndexOf(S)
|
|||
|
|
else
|
|||
|
|
if not Find(S, Result) then
|
|||
|
|
Result := -1;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Insert(Index: Integer; const S: WideString);
|
|||
|
|
begin
|
|||
|
|
if Sorted then
|
|||
|
|
Error(SSortedListError, 0);
|
|||
|
|
if Cardinal(Index) > Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index);
|
|||
|
|
InsertItem(Index, S);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
procedure TWideStringList.SetListString(Index: Integer; const S: WideString);
|
|||
|
|
var
|
|||
|
|
Len: Integer;
|
|||
|
|
A: TDynWideCharArray;
|
|||
|
|
begin
|
|||
|
|
with FList[Index] do
|
|||
|
|
begin
|
|||
|
|
A := TDynWideCharArray(FString);
|
|||
|
|
if A <> nil then
|
|||
|
|
A := nil; // free memory
|
|||
|
|
|
|||
|
|
Len := Length(S);
|
|||
|
|
if Len > 0 then
|
|||
|
|
begin
|
|||
|
|
SetLength(A, Len + 1); // include #0
|
|||
|
|
System.Move(S[1], A[0], Len * SizeOf(WideChar));
|
|||
|
|
A[Len] := #0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
FString := PWideChar(A);
|
|||
|
|
Pointer(A) := nil; // do not release the array on procedure exit
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
|
|||
|
|
procedure TWideStringList.InsertItem(Index: Integer; const S: WideString);
|
|||
|
|
begin
|
|||
|
|
Changing;
|
|||
|
|
if FCount = Length(FList) then
|
|||
|
|
Grow;
|
|||
|
|
if Index < FCount then
|
|||
|
|
System.Move(FList[Index], FList[Index + 1], (FCount - Index) * SizeOf(TWideStringItem));
|
|||
|
|
with FList[Index] do
|
|||
|
|
begin
|
|||
|
|
Pointer(FString) := nil; // avoid freeing the string, the address is now used in another element
|
|||
|
|
FObject := nil;
|
|||
|
|
if (FNormalizationForm <> nfNone) and (Length(S) > 0) then
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
SetListString(Index, WideNormalize(S, FNormalizationForm))
|
|||
|
|
else
|
|||
|
|
SetListString(Index, S);
|
|||
|
|
{$ELSE}
|
|||
|
|
FString := WideNormalize(S, FNormalizationForm)
|
|||
|
|
else
|
|||
|
|
FString := S;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
end;
|
|||
|
|
Inc(FCount);
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Put(Index: Integer; const S: WideString);
|
|||
|
|
begin
|
|||
|
|
if Sorted then
|
|||
|
|
Error(SSortedListError, 0);
|
|||
|
|
if Cardinal(Index) >= Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index);
|
|||
|
|
Changing;
|
|||
|
|
|
|||
|
|
if (FNormalizationForm <> nfNone) and (Length(S) > 0) then
|
|||
|
|
{$IFDEF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
SetListString(Index, WideNormalize(S, FNormalizationForm))
|
|||
|
|
else
|
|||
|
|
SetListString(Index, S);
|
|||
|
|
{$ELSE}
|
|||
|
|
FList[Index].FString := WideNormalize(S, FNormalizationForm)
|
|||
|
|
else
|
|||
|
|
FList[Index].FString := S;
|
|||
|
|
{$ENDIF OWN_WIDESTRING_MEMMGR}
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.PutObject(Index: Integer; AObject: TObject);
|
|||
|
|
begin
|
|||
|
|
if Cardinal(Index) >= Cardinal(FCount) then
|
|||
|
|
Error(SListIndexError, Index);
|
|||
|
|
Changing;
|
|||
|
|
FList[Index].FObject := AObject;
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.QuickSort(L, R: Integer);
|
|||
|
|
var
|
|||
|
|
I, J: Integer;
|
|||
|
|
P: WideString;
|
|||
|
|
begin
|
|||
|
|
repeat
|
|||
|
|
I := L;
|
|||
|
|
J := R;
|
|||
|
|
P := FList[(L + R) shr 1].FString;
|
|||
|
|
repeat
|
|||
|
|
while WideCompareText(FList[I].FString, P, FLanguage) < 0 do
|
|||
|
|
Inc(I);
|
|||
|
|
while WideCompareText(FList[J].FString, P, FLanguage) > 0 do
|
|||
|
|
Dec(J);
|
|||
|
|
if I <= J then
|
|||
|
|
begin
|
|||
|
|
ExchangeItems(I, J);
|
|||
|
|
Inc(I);
|
|||
|
|
Dec(J);
|
|||
|
|
end;
|
|||
|
|
until I > J;
|
|||
|
|
if L < J then
|
|||
|
|
QuickSort(L, J);
|
|||
|
|
L := I;
|
|||
|
|
until I >= R;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.SetCapacity(NewCapacity: Integer);
|
|||
|
|
begin
|
|||
|
|
SetLength(FList, NewCapacity);
|
|||
|
|
if NewCapacity < FCount then
|
|||
|
|
FCount := NewCapacity;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.SetSorted(Value: Boolean);
|
|||
|
|
begin
|
|||
|
|
if FSorted <> Value then
|
|||
|
|
begin
|
|||
|
|
if Value then
|
|||
|
|
Sort;
|
|||
|
|
FSorted := Value;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.SetUpdateState(Updating: Boolean);
|
|||
|
|
begin
|
|||
|
|
if Updating then
|
|||
|
|
Changing
|
|||
|
|
else
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.Sort;
|
|||
|
|
begin
|
|||
|
|
if not Sorted and (FCount > 1) then
|
|||
|
|
begin
|
|||
|
|
Changing;
|
|||
|
|
QuickSort(0, FCount - 1);
|
|||
|
|
Changed;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure TWideStringList.SetLanguage(Value: LCID);
|
|||
|
|
begin
|
|||
|
|
inherited SetLanguage(Value);
|
|||
|
|
if Sorted then
|
|||
|
|
Sort;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- functions for null terminated strings ------------------------------------------
|
|||
|
|
|
|||
|
|
function StrLenW(Str: PWideChar): Cardinal;
|
|||
|
|
// returns number of characters in a string excluding the null terminator
|
|||
|
|
asm
|
|||
|
|
MOV EDX, EDI
|
|||
|
|
MOV EDI, EAX
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
XOR AX, AX
|
|||
|
|
REPNE SCASW
|
|||
|
|
MOV EAX, 0FFFFFFFEH
|
|||
|
|
SUB EAX, ECX
|
|||
|
|
MOV EDI, EDX
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrEndW(Str: PWideChar): PWideChar;
|
|||
|
|
// returns a pointer to the end of a null terminated string
|
|||
|
|
asm
|
|||
|
|
MOV EDX, EDI
|
|||
|
|
MOV EDI, EAX
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
XOR AX, AX
|
|||
|
|
REPNE SCASW
|
|||
|
|
LEA EAX, [EDI - 2]
|
|||
|
|
MOV EDI, EDX
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrMoveW(Dest, Source: PWideChar; Count: Cardinal): PWideChar;
|
|||
|
|
// Copies the specified number of characters to the destination string and returns Dest
|
|||
|
|
// also as result. Dest must have enough room to store at least Count characters.
|
|||
|
|
asm
|
|||
|
|
PUSH ESI
|
|||
|
|
PUSH EDI
|
|||
|
|
MOV ESI, EDX
|
|||
|
|
MOV EDI, EAX
|
|||
|
|
MOV EDX, ECX
|
|||
|
|
CMP EDI, ESI
|
|||
|
|
JG @@1
|
|||
|
|
JE @@2
|
|||
|
|
SHR ECX, 1
|
|||
|
|
REP MOVSD
|
|||
|
|
MOV ECX, EDX
|
|||
|
|
AND ECX, 1
|
|||
|
|
REP MOVSW
|
|||
|
|
JMP @@2
|
|||
|
|
@@1:
|
|||
|
|
LEA ESI, [ESI + 2 * ECX - 2]
|
|||
|
|
LEA EDI, [EDI + 2 * ECX - 2]
|
|||
|
|
STD
|
|||
|
|
AND ECX, 1
|
|||
|
|
REP MOVSW
|
|||
|
|
SUB EDI, 2
|
|||
|
|
SUB ESI, 2
|
|||
|
|
MOV ECX, EDX
|
|||
|
|
SHR ECX, 1
|
|||
|
|
REP MOVSD
|
|||
|
|
CLD
|
|||
|
|
@@2:
|
|||
|
|
POP EDI
|
|||
|
|
POP ESI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrCopyW(Dest, Source: PWideChar): PWideChar;
|
|||
|
|
// copies Source to Dest and returns Dest
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
PUSH ESI
|
|||
|
|
MOV ESI, EAX
|
|||
|
|
MOV EDI, EDX
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
XOR AX, AX
|
|||
|
|
REPNE SCASW
|
|||
|
|
NOT ECX
|
|||
|
|
MOV EDI, ESI
|
|||
|
|
MOV ESI, EDX
|
|||
|
|
MOV EDX, ECX
|
|||
|
|
MOV EAX, EDI
|
|||
|
|
SHR ECX, 1
|
|||
|
|
REP MOVSD
|
|||
|
|
MOV ECX, EDX
|
|||
|
|
AND ECX, 1
|
|||
|
|
REP MOVSW
|
|||
|
|
POP ESI
|
|||
|
|
POP EDI
|
|||
|
|
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrECopyW(Dest, Source: PWideChar): PWideChar;
|
|||
|
|
// copies Source to Dest and returns a pointer to the null character ending the string
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
PUSH ESI
|
|||
|
|
MOV ESI, EAX
|
|||
|
|
MOV EDI, EDX
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
XOR AX, AX
|
|||
|
|
REPNE SCASW
|
|||
|
|
NOT ECX
|
|||
|
|
MOV EDI, ESI
|
|||
|
|
MOV ESI, EDX
|
|||
|
|
MOV EDX, ECX
|
|||
|
|
SHR ECX, 1
|
|||
|
|
REP MOVSD
|
|||
|
|
MOV ECX, EDX
|
|||
|
|
AND ECX, 1
|
|||
|
|
REP MOVSW
|
|||
|
|
LEA EAX, [EDI - 2]
|
|||
|
|
POP ESI
|
|||
|
|
POP EDI
|
|||
|
|
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrLCopyW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar;
|
|||
|
|
// copies a specified maximum number of characters from Source to Dest
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
PUSH ESI
|
|||
|
|
PUSH EBX
|
|||
|
|
MOV ESI, EAX
|
|||
|
|
MOV EDI, EDX
|
|||
|
|
MOV EBX, ECX
|
|||
|
|
XOR AX, AX
|
|||
|
|
TEST ECX, ECX
|
|||
|
|
JZ @@1
|
|||
|
|
REPNE SCASW
|
|||
|
|
JNE @@1
|
|||
|
|
INC ECX
|
|||
|
|
@@1:
|
|||
|
|
SUB EBX, ECX
|
|||
|
|
MOV EDI, ESI
|
|||
|
|
MOV ESI, EDX
|
|||
|
|
MOV EDX, EDI
|
|||
|
|
MOV ECX, EBX
|
|||
|
|
SHR ECX, 1
|
|||
|
|
REP MOVSD
|
|||
|
|
MOV ECX, EBX
|
|||
|
|
AND ECX, 1
|
|||
|
|
REP MOVSW
|
|||
|
|
STOSW
|
|||
|
|
MOV EAX, EDX
|
|||
|
|
POP EBX
|
|||
|
|
POP ESI
|
|||
|
|
POP EDI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrPCopyWW(Dest: PWideChar; const Source: WideString): PWideChar;
|
|||
|
|
// copies a Pascal-style WideString to a null-terminated wide string
|
|||
|
|
begin
|
|||
|
|
Result := StrLCopyW(Dest, PWideChar(Source), Length(Source));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrPCopyW(Dest: PWideChar; const Source: string): PWideChar;
|
|||
|
|
// copies a Pascal-style string to a null-terminated wide string
|
|||
|
|
begin
|
|||
|
|
Result := StrPLCopyW(Dest, Source, Cardinal(Length(Source)));
|
|||
|
|
Result[Length(Source)] := WideNull;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrPLCopyWW(Dest: PWideChar; const Source: WideString; MaxLen: Cardinal): PWideChar;
|
|||
|
|
// copies characters from a Pascal-style WideString into a null-terminated wide string
|
|||
|
|
begin
|
|||
|
|
Result := StrLCopyW(Dest, PWideChar(Source), MaxLen);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrPLCopyW(Dest: PWideChar; const Source: string; MaxLen: Cardinal): PWideChar;
|
|||
|
|
// copies characters from a Pascal-style string into a null-terminated wide string
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
PUSH ESI
|
|||
|
|
MOV EDI, EAX
|
|||
|
|
MOV ESI, EDX
|
|||
|
|
MOV EDX, EAX
|
|||
|
|
XOR AX, AX
|
|||
|
|
@@1: LODSB
|
|||
|
|
STOSW
|
|||
|
|
DEC ECX
|
|||
|
|
JNZ @@1
|
|||
|
|
MOV EAX, EDX
|
|||
|
|
POP ESI
|
|||
|
|
POP EDI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrCatW(Dest: PWideChar; const Source: PWideChar): PWideChar;
|
|||
|
|
// appends a copy of Source to the end of Dest and returns the concatenated string
|
|||
|
|
begin
|
|||
|
|
StrCopyW(StrEndW(Dest), Source);
|
|||
|
|
Result := Dest;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// appends a specified maximum number of WideCharacters to string
|
|||
|
|
|
|||
|
|
function StrLCatW(Dest, Source: PWideChar; MaxLen: Cardinal): PWideChar;
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
PUSH ESI
|
|||
|
|
PUSH EBX
|
|||
|
|
MOV EDI, Dest
|
|||
|
|
MOV ESI, Source
|
|||
|
|
MOV EBX, MaxLen
|
|||
|
|
SHL EBX, 1
|
|||
|
|
CALL StrEndW
|
|||
|
|
MOV ECX, EDI
|
|||
|
|
ADD ECX, EBX
|
|||
|
|
SUB ECX, EAX
|
|||
|
|
JBE @@1
|
|||
|
|
MOV EDX, ESI
|
|||
|
|
SHR ECX, 1
|
|||
|
|
CALL StrLCopyW
|
|||
|
|
@@1:
|
|||
|
|
MOV EAX, EDI
|
|||
|
|
POP EBX
|
|||
|
|
POP ESI
|
|||
|
|
POP EDI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
// data used to bring UTF-16 coded strings into correct UTF-32 order for correct comparation
|
|||
|
|
UTF16Fixup: array [0..31] of Word = (
|
|||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|||
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|||
|
|
$2000, $F800, $F800, $F800, $F800
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
function StrCompW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
// Binary comparation of Str1 and Str2 with surrogate fix-up.
|
|||
|
|
// Returns < 0 if Str1 is smaller in binary order than Str2, = 0 if both strings are
|
|||
|
|
// equal and > 0 if Str1 is larger than Str2.
|
|||
|
|
//
|
|||
|
|
// This code is based on an idea of Markus W. Scherer (IBM).
|
|||
|
|
// Note: The surrogate fix-up is necessary because some single value code points have
|
|||
|
|
// larger values than surrogates which are in UTF-32 actually larger.
|
|||
|
|
var
|
|||
|
|
C1, C2: Word;
|
|||
|
|
Run1, Run2: PWideChar;
|
|||
|
|
begin
|
|||
|
|
Run1 := Str1;
|
|||
|
|
Run2 := Str2;
|
|||
|
|
repeat
|
|||
|
|
C1 := Word(Run1^);
|
|||
|
|
C1 := Word(C1 + UTF16Fixup[C1 shr 11]);
|
|||
|
|
C2 := Word(Run2^);
|
|||
|
|
C2 := Word(C2 + UTF16Fixup[C2 shr 11]);
|
|||
|
|
|
|||
|
|
// now C1 and C2 are in UTF-32-compatible order
|
|||
|
|
Result := Integer(C1) - Integer(C2);
|
|||
|
|
if(Result <> 0) or (C1 = 0) or (C2 = 0) then
|
|||
|
|
Break;
|
|||
|
|
Inc(Run1);
|
|||
|
|
Inc(Run2);
|
|||
|
|
until False;
|
|||
|
|
|
|||
|
|
// If the strings have different lengths but the comparation returned equity so far
|
|||
|
|
// then adjust the result so that the longer string is marked as the larger one.
|
|||
|
|
if Result = 0 then
|
|||
|
|
Result := (Run1 - Str1) - (Run2 - Str2);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrICompW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
// Compares Str1 to Str2 without case sensitivity.
|
|||
|
|
// See also comments in StrCompW, but keep in mind that case folding might result in
|
|||
|
|
// one-to-many mappings which must be considered here.
|
|||
|
|
var
|
|||
|
|
C1, C2: Word;
|
|||
|
|
S1, S2: PWideChar;
|
|||
|
|
Run1, Run2: PWideChar;
|
|||
|
|
Folded1, Folded2: WideString;
|
|||
|
|
begin
|
|||
|
|
// Because of size changes of the string when doing case folding
|
|||
|
|
// it is unavoidable to convert both strings completely in advance.
|
|||
|
|
S1 := Str1;
|
|||
|
|
S2 := Str2;
|
|||
|
|
Folded1 := '';
|
|||
|
|
while S1^ <> #0 do
|
|||
|
|
begin
|
|||
|
|
Folded1 := Folded1 + WideCaseFolding(S1^);
|
|||
|
|
Inc(S1);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Folded2 := '';
|
|||
|
|
while S2^ <> #0 do
|
|||
|
|
begin
|
|||
|
|
Folded2 := Folded2 + WideCaseFolding(S2^);
|
|||
|
|
Inc(S2);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Run1 := PWideChar(Folded1);
|
|||
|
|
Run2 := PWideChar(Folded2);
|
|||
|
|
repeat
|
|||
|
|
C1 := Word(Run1^);
|
|||
|
|
C1 := Word(C1 + UTF16Fixup[C1 shr 11]);
|
|||
|
|
C2 := Word(Run2^);
|
|||
|
|
C2 := Word(C2 + UTF16Fixup[C2 shr 11]);
|
|||
|
|
|
|||
|
|
// now C1 and C2 are in UTF-32-compatible order
|
|||
|
|
Result := Integer(C1) - Integer(C2);
|
|||
|
|
if(Result <> 0) or (C1 = 0) or (C2 = 0) then
|
|||
|
|
Break;
|
|||
|
|
Inc(Run1);
|
|||
|
|
Inc(Run2);
|
|||
|
|
until False;
|
|||
|
|
|
|||
|
|
// If the strings have different lengths but the comparation returned equity so far
|
|||
|
|
// then adjust the result so that the longer string is marked as the larger one.
|
|||
|
|
if Result = 0 then
|
|||
|
|
Result := (Run1 - PWideChar(Folded1)) - (Run2 - PWideChar(Folded2));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrLICompW(const Str1, Str2: PWideChar; MaxLen: Cardinal): Integer;
|
|||
|
|
// compares strings up to MaxLen code points
|
|||
|
|
// see also StrICompW
|
|||
|
|
var
|
|||
|
|
S1, S2: PWideChar;
|
|||
|
|
C1, C2: Word;
|
|||
|
|
Run1, Run2: PWideChar;
|
|||
|
|
Folded1, Folded2: WideString;
|
|||
|
|
begin
|
|||
|
|
if MaxLen > 0 then
|
|||
|
|
begin
|
|||
|
|
// Because of size changes of the string when doing case folding
|
|||
|
|
// it is unavoidable to convert both strings completely in advance.
|
|||
|
|
S1 := Str1;
|
|||
|
|
S2 := Str2;
|
|||
|
|
Folded1 := '';
|
|||
|
|
while S1^ <> #0 do
|
|||
|
|
begin
|
|||
|
|
Folded1 := Folded1 + WideCaseFolding(S1^);
|
|||
|
|
Inc(S1);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Folded2 := '';
|
|||
|
|
while S2^ <> #0 do
|
|||
|
|
begin
|
|||
|
|
Folded2 := Folded2 + WideCaseFolding(S2^);
|
|||
|
|
Inc(S2);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
Run1 := PWideChar(Folded1);
|
|||
|
|
Run2 := PWideChar(Folded2);
|
|||
|
|
|
|||
|
|
repeat
|
|||
|
|
C1 := Word(Run1^);
|
|||
|
|
C1 := Word(C1 + UTF16Fixup[C1 shr 11]);
|
|||
|
|
C2 := Word(Run2^);
|
|||
|
|
C2 := Word(C2 + UTF16Fixup[C2 shr 11]);
|
|||
|
|
|
|||
|
|
// now C1 and C2 are in UTF-32-compatible order
|
|||
|
|
{ TODO : surrogates take up 2 words and are counted twice here, count them only once }
|
|||
|
|
Result := Integer(C1) - Integer(C2);
|
|||
|
|
Dec(MaxLen);
|
|||
|
|
if(Result <> 0) or (C1 = 0) or (C2 = 0) or (MaxLen = 0) then
|
|||
|
|
Break;
|
|||
|
|
Inc(Run1);
|
|||
|
|
Inc(Run2);
|
|||
|
|
until False;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Result := 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrLCompW(const Str1, Str2: PWideChar; MaxLen: Cardinal): Integer;
|
|||
|
|
// compares strings up to MaxLen code points
|
|||
|
|
// see also StrCompW
|
|||
|
|
var
|
|||
|
|
S1, S2: PWideChar;
|
|||
|
|
C1, C2: Word;
|
|||
|
|
begin
|
|||
|
|
if MaxLen > 0 then
|
|||
|
|
begin
|
|||
|
|
S1 := Str1;
|
|||
|
|
S2 := Str2;
|
|||
|
|
repeat
|
|||
|
|
C1 := Word(S1^);
|
|||
|
|
C1 := Word(C1 + UTF16Fixup[C1 shr 11]);
|
|||
|
|
C2 := Word(S2^);
|
|||
|
|
C2 := Word(C2 + UTF16Fixup[C2 shr 11]);
|
|||
|
|
|
|||
|
|
// now C1 and C2 are in UTF-32-compatible order
|
|||
|
|
{ TODO : surrogates take up 2 words and are counted twice here, count them only once }
|
|||
|
|
Result := Integer(C1) - Integer(C2);
|
|||
|
|
Dec(MaxLen);
|
|||
|
|
if(Result <> 0) or (C1 = 0) or (C2 = 0) or (MaxLen = 0) then
|
|||
|
|
Break;
|
|||
|
|
Inc(S1);
|
|||
|
|
Inc(S2);
|
|||
|
|
until False;
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Result := 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrNScanW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
// Determines where (in Str1) the first time one of the characters of Str2 appear.
|
|||
|
|
// The result is the length of a string part of Str1 where none of the characters of
|
|||
|
|
// Str2 do appear (not counting the trailing #0 and starting with position 0 in Str1).
|
|||
|
|
var
|
|||
|
|
Run: PWideChar;
|
|||
|
|
begin
|
|||
|
|
Result := -1;
|
|||
|
|
if (Str1 <> nil) and (Str2 <> nil) then
|
|||
|
|
begin
|
|||
|
|
Run := Str1;
|
|||
|
|
while Run^ <> #0 do
|
|||
|
|
begin
|
|||
|
|
if StrScanW(Str2, Run^) <> nil then
|
|||
|
|
Break;
|
|||
|
|
Inc(Run);
|
|||
|
|
end;
|
|||
|
|
Result := Run - Str1;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrRNScanW(const Str1, Str2: PWideChar): Integer;
|
|||
|
|
// This function does the same as StrRNScanW but uses Str1 in reverse order. This
|
|||
|
|
// means Str1 points to the last character of a string, is traversed reversely
|
|||
|
|
// and terminates with a starting #0. This is useful for parsing strings stored
|
|||
|
|
// in reversed macro buffers etc.
|
|||
|
|
var
|
|||
|
|
Run: PWideChar;
|
|||
|
|
begin
|
|||
|
|
Result := -1;
|
|||
|
|
if (Str1 <> nil) and (Str2 <> nil) then
|
|||
|
|
begin
|
|||
|
|
Run := Str1;
|
|||
|
|
while Run^ <> #0 do
|
|||
|
|
begin
|
|||
|
|
if StrScanW(Str2, Run^) <> nil then
|
|||
|
|
Break;
|
|||
|
|
Dec(Run);
|
|||
|
|
end;
|
|||
|
|
Result := Str1 - Run;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// returns a pointer to first occurrence of a specified character in a string
|
|||
|
|
|
|||
|
|
function StrScanW(Str: PWideChar; Chr: WideChar): PWideChar;
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
PUSH EAX
|
|||
|
|
MOV EDI, Str
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
XOR AX, AX
|
|||
|
|
REPNE SCASW
|
|||
|
|
NOT ECX
|
|||
|
|
POP EDI
|
|||
|
|
MOV AX, Chr
|
|||
|
|
REPNE SCASW
|
|||
|
|
MOV EAX, 0
|
|||
|
|
JNE @@1
|
|||
|
|
MOV EAX, EDI
|
|||
|
|
SUB EAX, 2
|
|||
|
|
@@1:
|
|||
|
|
POP EDI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Returns a pointer to first occurrence of a specified character in a string
|
|||
|
|
// or nil if not found.
|
|||
|
|
// Note: this is just a binary search for the specified character and there's no
|
|||
|
|
// check for a terminating null. Instead at most StrLen characters are
|
|||
|
|
// searched. This makes this function extremly fast.
|
|||
|
|
//
|
|||
|
|
// on enter EAX contains Str, EDX contains Chr and ECX StrLen
|
|||
|
|
// on exit EAX contains result pointer or nil
|
|||
|
|
|
|||
|
|
function StrScanW(Str: PWideChar; Chr: WideChar; StrLen: Cardinal): PWideChar;
|
|||
|
|
asm
|
|||
|
|
TEST EAX, EAX
|
|||
|
|
JZ @@Exit // get out if the string is nil or StrLen is 0
|
|||
|
|
JCXZ @@Exit
|
|||
|
|
@@Loop:
|
|||
|
|
CMP [EAX], DX // this unrolled loop is actually faster on modern processors
|
|||
|
|
JE @@Exit // than REP SCASW
|
|||
|
|
ADD EAX, 2
|
|||
|
|
DEC ECX
|
|||
|
|
JNZ @@Loop
|
|||
|
|
XOR EAX, EAX
|
|||
|
|
@@Exit:
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// returns a pointer to the last occurance of Chr in Str
|
|||
|
|
|
|||
|
|
function StrRScanW(Str: PWideChar; Chr: WideChar): PWideChar;
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
MOV EDI, Str
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
XOR AX, AX
|
|||
|
|
REPNE SCASW
|
|||
|
|
NOT ECX
|
|||
|
|
STD
|
|||
|
|
SUB EDI, 2
|
|||
|
|
MOV AX, Chr
|
|||
|
|
REPNE SCASW
|
|||
|
|
MOV EAX, 0
|
|||
|
|
JNE @@1
|
|||
|
|
MOV EAX, EDI
|
|||
|
|
ADD EAX, 2
|
|||
|
|
@@1:
|
|||
|
|
CLD
|
|||
|
|
POP EDI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// returns a pointer to the first occurance of SubStr in Str
|
|||
|
|
|
|||
|
|
function StrPosW(Str, SubStr: PWideChar): PWideChar;
|
|||
|
|
asm
|
|||
|
|
PUSH EDI
|
|||
|
|
PUSH ESI
|
|||
|
|
PUSH EBX
|
|||
|
|
OR EAX, EAX
|
|||
|
|
JZ @@2
|
|||
|
|
OR EDX, EDX
|
|||
|
|
JZ @@2
|
|||
|
|
MOV EBX, EAX
|
|||
|
|
MOV EDI, EDX
|
|||
|
|
XOR AX, AX
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
REPNE SCASW
|
|||
|
|
NOT ECX
|
|||
|
|
DEC ECX
|
|||
|
|
JZ @@2
|
|||
|
|
MOV ESI, ECX
|
|||
|
|
MOV EDI, EBX
|
|||
|
|
MOV ECX, 0FFFFFFFFH
|
|||
|
|
REPNE SCASW
|
|||
|
|
NOT ECX
|
|||
|
|
SUB ECX, ESI
|
|||
|
|
JBE @@2
|
|||
|
|
MOV EDI, EBX
|
|||
|
|
LEA EBX, [ESI - 1]
|
|||
|
|
@@1:
|
|||
|
|
MOV ESI, EDX
|
|||
|
|
LODSW
|
|||
|
|
REPNE SCASW
|
|||
|
|
JNE @@2
|
|||
|
|
MOV EAX, ECX
|
|||
|
|
PUSH EDI
|
|||
|
|
MOV ECX, EBX
|
|||
|
|
REPE CMPSW
|
|||
|
|
POP EDI
|
|||
|
|
MOV ECX, EAX
|
|||
|
|
JNE @@1
|
|||
|
|
LEA EAX, [EDI - 2]
|
|||
|
|
JMP @@3
|
|||
|
|
@@2:
|
|||
|
|
XOR EAX, EAX
|
|||
|
|
@@3:
|
|||
|
|
POP EBX
|
|||
|
|
POP ESI
|
|||
|
|
POP EDI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrAllocW(WideSize: Cardinal): PWideChar;
|
|||
|
|
// Allocates a buffer for a null-terminated wide string and returns a pointer
|
|||
|
|
// to the first character of the string.
|
|||
|
|
begin
|
|||
|
|
WideSize := SizeOf(WideChar) * WideSize + SizeOf(Cardinal);
|
|||
|
|
Result := AllocMem(WideSize);
|
|||
|
|
Cardinal(Pointer(Result)^) := WideSize;
|
|||
|
|
Inc(Result, SizeOf(Cardinal) div SizeOf(WideChar));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrBufSizeW(const Str: PWideChar): Cardinal;
|
|||
|
|
// Returns max number of wide characters that can be stored in a buffer
|
|||
|
|
// allocated by StrAllocW.
|
|||
|
|
var
|
|||
|
|
P: PWideChar;
|
|||
|
|
begin
|
|||
|
|
if Str <> nil then
|
|||
|
|
begin
|
|||
|
|
P := Str;
|
|||
|
|
Dec(P, SizeOf(Cardinal) div SizeOf(WideChar));
|
|||
|
|
Result := (Cardinal(PInteger(P)^) - SizeOf(Cardinal)) div SizeOf(WideChar);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
Result := 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrNewW(const Str: PWideChar): PWideChar;
|
|||
|
|
// Duplicates the given string (if not nil) and returns the address of the new string.
|
|||
|
|
var
|
|||
|
|
Size: Cardinal;
|
|||
|
|
begin
|
|||
|
|
if Str = nil then
|
|||
|
|
Result := nil
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Size := StrLenW(Str) + 1;
|
|||
|
|
Result := StrMoveW(StrAllocW(Size), Str, Size);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StrNewW(const Str: WideString): PWideChar;
|
|||
|
|
begin
|
|||
|
|
Result := StrNewW(PWideChar(Str));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure StrDisposeW(Str: PWideChar);
|
|||
|
|
// releases a string allocated with StrNewW or StrAllocW
|
|||
|
|
begin
|
|||
|
|
if Str <> nil then
|
|||
|
|
begin
|
|||
|
|
Dec(Str, SizeOf(Cardinal) div SizeOf(WideChar));
|
|||
|
|
FreeMem(Str);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure StrDisposeAndNilW(var Str: PWideChar);
|
|||
|
|
begin
|
|||
|
|
StrDisposeW(Str);
|
|||
|
|
Str := nil;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// exchanges in each character of the given string the low order and high order
|
|||
|
|
// byte to go from LSB to MSB and vice versa.
|
|||
|
|
// EAX contains address of string
|
|||
|
|
|
|||
|
|
procedure StrSwapByteOrder(Str: PWideChar);
|
|||
|
|
asm
|
|||
|
|
PUSH ESI
|
|||
|
|
PUSH EDI
|
|||
|
|
MOV ESI, EAX
|
|||
|
|
MOV EDI, ESI
|
|||
|
|
XOR EAX, EAX // clear high order byte to be able to use 32bit operand below
|
|||
|
|
@@1:
|
|||
|
|
LODSW
|
|||
|
|
OR EAX, EAX
|
|||
|
|
JZ @@2
|
|||
|
|
XCHG AL, AH
|
|||
|
|
STOSW
|
|||
|
|
JMP @@1
|
|||
|
|
@@2:
|
|||
|
|
POP EDI
|
|||
|
|
POP ESI
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideAdjustLineBreaks(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
Source,
|
|||
|
|
SourceEnd,
|
|||
|
|
Dest: PWideChar;
|
|||
|
|
begin
|
|||
|
|
Source := Pointer(S);
|
|||
|
|
SourceEnd := Source + Length(S);
|
|||
|
|
|
|||
|
|
Source := Pointer(S);
|
|||
|
|
SetString(Result, nil, SourceEnd - Source);
|
|||
|
|
Dest := Pointer(Result);
|
|||
|
|
|
|||
|
|
while Source < SourceEnd do
|
|||
|
|
begin
|
|||
|
|
case Source^ of
|
|||
|
|
WideLineFeed:
|
|||
|
|
begin
|
|||
|
|
Dest^ := WideLineSeparator;
|
|||
|
|
Inc(Dest);
|
|||
|
|
Inc(Source);
|
|||
|
|
end;
|
|||
|
|
WideCarriageReturn:
|
|||
|
|
begin
|
|||
|
|
Dest^ := WideLineSeparator;
|
|||
|
|
Inc(Dest);
|
|||
|
|
Inc(Source);
|
|||
|
|
if Source^ = WideLineFeed then
|
|||
|
|
Inc(Source);
|
|||
|
|
end;
|
|||
|
|
else
|
|||
|
|
Dest^ := Source^;
|
|||
|
|
Inc(Dest);
|
|||
|
|
Inc(Source);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
SetLength(Result, (Integer(Dest) - Integer(Result)) div 2);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideQuotedStr(const S: WideString; Quote: WideChar): WideString;
|
|||
|
|
// works like QuotedStr from SysUtils.pas but can insert any quotation character
|
|||
|
|
var
|
|||
|
|
P, Src,
|
|||
|
|
Dest: PWideChar;
|
|||
|
|
AddCount: Integer;
|
|||
|
|
begin
|
|||
|
|
AddCount := 0;
|
|||
|
|
P := StrScanW(PWideChar(S), Quote);
|
|||
|
|
while (P <> nil) do
|
|||
|
|
begin
|
|||
|
|
Inc(P);
|
|||
|
|
Inc(AddCount);
|
|||
|
|
P := StrScanW(P, Quote);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if AddCount = 0 then
|
|||
|
|
Result := Quote + S + Quote
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
SetLength(Result, Length(S) + AddCount + 2);
|
|||
|
|
Dest := PWideChar(Result);
|
|||
|
|
Dest^ := Quote;
|
|||
|
|
Inc(Dest);
|
|||
|
|
Src := PWideChar(S);
|
|||
|
|
P := StrScanW(Src, Quote);
|
|||
|
|
repeat
|
|||
|
|
Inc(P);
|
|||
|
|
Move(Src^, Dest^, 2 * (P - Src));
|
|||
|
|
Inc(Dest, P - Src);
|
|||
|
|
Dest^ := Quote;
|
|||
|
|
Inc(Dest);
|
|||
|
|
Src := P;
|
|||
|
|
P := StrScanW(Src, Quote);
|
|||
|
|
until P = nil;
|
|||
|
|
P := StrEndW(Src);
|
|||
|
|
Move(Src^, Dest^, 2 * (P - Src));
|
|||
|
|
Inc(Dest, P - Src);
|
|||
|
|
Dest^ := Quote;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString;
|
|||
|
|
// extracts a string enclosed in quote characters given by Quote
|
|||
|
|
var
|
|||
|
|
P, Dest: PWideChar;
|
|||
|
|
DropCount: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
if (Src = nil) or (Src^ <> Quote) then
|
|||
|
|
Exit;
|
|||
|
|
|
|||
|
|
Inc(Src);
|
|||
|
|
DropCount := 1;
|
|||
|
|
P := Src;
|
|||
|
|
Src := StrScanW(Src, Quote);
|
|||
|
|
|
|||
|
|
while Src <> nil do // count adjacent pairs of quote chars
|
|||
|
|
begin
|
|||
|
|
Inc(Src);
|
|||
|
|
if Src^ <> Quote then
|
|||
|
|
Break;
|
|||
|
|
Inc(Src);
|
|||
|
|
Inc(DropCount);
|
|||
|
|
Src := StrScanW(Src, Quote);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
if Src = nil then
|
|||
|
|
Src := StrEndW(P);
|
|||
|
|
if (Src - P) <= 1 then
|
|||
|
|
Exit;
|
|||
|
|
|
|||
|
|
if DropCount = 1 then
|
|||
|
|
SetString(Result, P, Src - P - 1)
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
SetLength(Result, Src - P - DropCount);
|
|||
|
|
Dest := PWideChar(Result);
|
|||
|
|
Src := StrScanW(P, Quote);
|
|||
|
|
while Src <> nil do
|
|||
|
|
begin
|
|||
|
|
Inc(Src);
|
|||
|
|
if Src^ <> Quote then
|
|||
|
|
Break;
|
|||
|
|
Move(P^, Dest^, 2 * (Src - P));
|
|||
|
|
Inc(Dest, Src - P);
|
|||
|
|
Inc(Src);
|
|||
|
|
P := Src;
|
|||
|
|
Src := StrScanW(Src, Quote);
|
|||
|
|
end;
|
|||
|
|
if Src = nil then
|
|||
|
|
Src := StrEndW(P);
|
|||
|
|
Move(P^, Dest^, 2 * (Src - P - 1));
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideStringOfChar(C: WideChar; Count: Cardinal): WideString;
|
|||
|
|
// returns a string of Count characters filled with C
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
SetLength(Result, Count);
|
|||
|
|
for I := 1 to Count do
|
|||
|
|
Result[I] := C;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideTrim(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I, L: Integer;
|
|||
|
|
begin
|
|||
|
|
L := Length(S);
|
|||
|
|
I := 1;
|
|||
|
|
while (I <= L) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do
|
|||
|
|
Inc(I);
|
|||
|
|
if I > L then
|
|||
|
|
Result := ''
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
while UnicodeIsWhiteSpace(UCS4(S[L])) or UnicodeIsControl(UCS4(S[L])) do
|
|||
|
|
Dec(L);
|
|||
|
|
Result := Copy(S, I, L - I + 1);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideTrimLeft(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I, L: Integer;
|
|||
|
|
begin
|
|||
|
|
L := Length(S);
|
|||
|
|
I := 1;
|
|||
|
|
while (I <= L) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do
|
|||
|
|
Inc(I);
|
|||
|
|
Result := Copy(S, I, Maxint);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideTrimRight(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
I := Length(S);
|
|||
|
|
while (I > 0) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do
|
|||
|
|
Dec(I);
|
|||
|
|
Result := Copy(S, 1, I);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// returns the index of character Ch in S, starts searching at index Index
|
|||
|
|
// Note: This is a quick memory search. No attempt is made to interpret either
|
|||
|
|
// the given charcter nor the string (ligatures, modifiers, surrogates etc.)
|
|||
|
|
// Code from Azret Botash.
|
|||
|
|
|
|||
|
|
function WideCharPos(const S: WideString; const Ch: WideChar; const Index: Integer): Integer;
|
|||
|
|
asm
|
|||
|
|
TEST EAX,EAX // make sure we are not null
|
|||
|
|
JZ @@StrIsNil
|
|||
|
|
DEC ECX // make index zero based
|
|||
|
|
JL @@IdxIsSmall
|
|||
|
|
PUSH EBX
|
|||
|
|
PUSH EDI
|
|||
|
|
MOV EDI, EAX // EDI := S
|
|||
|
|
XOR EAX, EAX
|
|||
|
|
MOV AX, DX // AX := Ch
|
|||
|
|
MOV EDX, [EDI - 4] // EDX := Length(S) * 2
|
|||
|
|
SHR EDX, 1 // EDX := EDX div 2
|
|||
|
|
MOV EBX, EDX // save the length to calc. result
|
|||
|
|
SUB EDX, ECX // EDX = EDX - Index = # of chars to scan
|
|||
|
|
JLE @@IdxIsBig
|
|||
|
|
SHL ECX, 1 // two bytes per char
|
|||
|
|
ADD EDI, ECX // point to index'th char
|
|||
|
|
MOV ECX, EDX // loop counter
|
|||
|
|
REPNE SCASW
|
|||
|
|
JNE @@NoMatch
|
|||
|
|
MOV EAX, EBX // result := saved length -
|
|||
|
|
SUB EAX, ECX // loop counter value
|
|||
|
|
POP EDI
|
|||
|
|
POP EBX
|
|||
|
|
RET
|
|||
|
|
@@IdxIsBig:
|
|||
|
|
@@NoMatch:
|
|||
|
|
XOR EAX,EAX
|
|||
|
|
POP EDI
|
|||
|
|
POP EBX
|
|||
|
|
RET
|
|||
|
|
@@IdxIsSmall:
|
|||
|
|
XOR EAX, EAX
|
|||
|
|
@@StrIsNil:
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideComposeHangul(const Source: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
Len: Integer;
|
|||
|
|
Ch, Last: WideChar;
|
|||
|
|
I: Integer;
|
|||
|
|
LIndex, VIndex,
|
|||
|
|
SIndex, TIndex: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
Len := Length(Source);
|
|||
|
|
if Len > 0 then
|
|||
|
|
begin
|
|||
|
|
Last := Source[1];
|
|||
|
|
Result := Last;
|
|||
|
|
|
|||
|
|
for I := 2 to Len do
|
|||
|
|
begin
|
|||
|
|
Ch := Source[I];
|
|||
|
|
|
|||
|
|
// 1. check to see if two current characters are L and V
|
|||
|
|
LIndex := Word(Last) - LBase;
|
|||
|
|
if (0 <= LIndex) and (LIndex < LCount) then
|
|||
|
|
begin
|
|||
|
|
VIndex := Word(Ch) - VBase;
|
|||
|
|
if (0 <= VIndex) and (VIndex < VCount) then
|
|||
|
|
begin
|
|||
|
|
// make syllable of form LV
|
|||
|
|
Last := WideChar((SBase + (LIndex * VCount + VIndex) * TCount));
|
|||
|
|
Result[Length(Result)] := Last; // reset last
|
|||
|
|
Continue; // discard Ch
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// 2. check to see if two current characters are LV and T
|
|||
|
|
SIndex := Word(Last) - SBase;
|
|||
|
|
if (0 <= SIndex) and (SIndex < SCount) and ((SIndex mod TCount) = 0) then
|
|||
|
|
begin
|
|||
|
|
TIndex := Word(Ch) - TBase;
|
|||
|
|
if (0 <= TIndex) and (TIndex <= TCount) then
|
|||
|
|
begin
|
|||
|
|
// make syllable of form LVT
|
|||
|
|
Inc(Word(Last), TIndex);
|
|||
|
|
Result[Length(Result)] := Last; // reset last
|
|||
|
|
Continue; // discard Ch
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// if neither case was true, just add the character
|
|||
|
|
Last := Ch;
|
|||
|
|
Result := Result + Ch;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Returns canonical composition of characters in S.
|
|||
|
|
|
|||
|
|
function WideCompose(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
StarterPos,
|
|||
|
|
CompPos,
|
|||
|
|
DecompPos: Integer;
|
|||
|
|
Composite: UCS4;
|
|||
|
|
Ch,
|
|||
|
|
StarterChar: WideChar;
|
|||
|
|
LastClass,
|
|||
|
|
CurrentClass: Cardinal;
|
|||
|
|
begin
|
|||
|
|
// Set an arbitrary length for the result. This is automatically done when checking
|
|||
|
|
// for hangul composition.
|
|||
|
|
Result := WideComposeHangul(S);
|
|||
|
|
|
|||
|
|
if Result = '' then
|
|||
|
|
Exit;
|
|||
|
|
|
|||
|
|
StarterPos := 1;
|
|||
|
|
CompPos := 2;
|
|||
|
|
|
|||
|
|
StarterChar := Result[StarterPos];
|
|||
|
|
LastClass := CanonicalCombiningClass(UCS4(StarterChar));
|
|||
|
|
if LastClass <> 0 then
|
|||
|
|
LastClass := 256; // fix for irregular combining sequence
|
|||
|
|
|
|||
|
|
// Loop on the (decomposed) characters, combining where possible.
|
|||
|
|
for DecompPos := 2 to Length(Result) do
|
|||
|
|
begin
|
|||
|
|
Ch := Result[DecompPos];
|
|||
|
|
CurrentClass := CanonicalCombiningClass(UCS4(Ch));
|
|||
|
|
if UnicodeComposePair(UCS4(StarterChar), UCS4(Ch), Composite) and
|
|||
|
|
((LastClass < CurrentClass) or (LastClass = 0)) then
|
|||
|
|
begin
|
|||
|
|
Result[StarterPos] := UCS2(Composite);
|
|||
|
|
StarterChar := UCS2(Composite);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
if CurrentClass = 0 then
|
|||
|
|
begin
|
|||
|
|
StarterPos := CompPos;
|
|||
|
|
StarterChar := Ch;
|
|||
|
|
end;
|
|||
|
|
LastClass := CurrentClass;
|
|||
|
|
Result[CompPos] := Ch;
|
|||
|
|
Inc(CompPos);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
// since we have likely shortened the source string we have to set the correct length on exit
|
|||
|
|
SetLength(Result, CompPos - 1);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure FixCanonical(var S: WideString);
|
|||
|
|
// Examines S and reorders all combining marks in the string so that they are in canonical order.
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Temp: WideChar;
|
|||
|
|
CurrentClass,
|
|||
|
|
LastClass: Cardinal;
|
|||
|
|
begin
|
|||
|
|
I := Length(S);
|
|||
|
|
if I > 1 then
|
|||
|
|
begin
|
|||
|
|
CurrentClass := CanonicalCombiningClass(UCS4(S[I]));
|
|||
|
|
repeat
|
|||
|
|
Dec(I);
|
|||
|
|
LastClass := CurrentClass;
|
|||
|
|
CurrentClass := CanonicalCombiningClass(UCS4(S[I]));
|
|||
|
|
|
|||
|
|
// A swap is presumed to be rare (and a double-swap very rare),
|
|||
|
|
// so don't worry about efficiency here.
|
|||
|
|
if (CurrentClass > LastClass) and (LastClass > 0) then
|
|||
|
|
begin
|
|||
|
|
// swap characters
|
|||
|
|
Temp := S[I];
|
|||
|
|
S[I] := S[I + 1];
|
|||
|
|
S[I + 1] := Temp;
|
|||
|
|
|
|||
|
|
// if not at end, backup (one further, to compensate for loop)
|
|||
|
|
if I < Length(S) - 1 then
|
|||
|
|
Inc(I, 2);
|
|||
|
|
// reset type, since we swapped.
|
|||
|
|
CurrentClass := CanonicalCombiningClass(UCS4(S[I]));
|
|||
|
|
end;
|
|||
|
|
until I = 1;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure GetDecompositions(Compatible: Boolean; Code: UCS4; var Buffer: TUCS4Array);
|
|||
|
|
// helper function to recursively decompose a code point
|
|||
|
|
var
|
|||
|
|
Decomp: TUCS4Array;
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
Decomp := UnicodeDecompose(Code, Compatible);
|
|||
|
|
if Assigned(Decomp) then
|
|||
|
|
begin
|
|||
|
|
for I := 0 to High(Decomp) do
|
|||
|
|
GetDecompositions(Compatible, Decomp[I], Buffer);
|
|||
|
|
end
|
|||
|
|
else // if no decomp, append
|
|||
|
|
begin
|
|||
|
|
I := Length(Buffer);
|
|||
|
|
SetLength(Buffer, I + 1);
|
|||
|
|
Buffer[I] := Code;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideDecompose(const S: WideString; Compatible: Boolean): WideString;
|
|||
|
|
// returns a string with all characters of S but decomposed, e.g. <20> is returned as E^ etc.
|
|||
|
|
var
|
|||
|
|
I, J: Integer;
|
|||
|
|
Decomp: TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
Decomp := nil;
|
|||
|
|
|
|||
|
|
// iterate through each source code point
|
|||
|
|
for I := 1 to Length(S) do
|
|||
|
|
begin
|
|||
|
|
Decomp := nil;
|
|||
|
|
GetDecompositions(Compatible, UCS4(S[I]), Decomp);
|
|||
|
|
if Decomp = nil then
|
|||
|
|
Result := Result + S[I]
|
|||
|
|
else
|
|||
|
|
for J := 0 to High(Decomp) do
|
|||
|
|
Result := Result + WideChar(Decomp[J]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// combining marks must be sorted according to their canonical combining class
|
|||
|
|
FixCanonical(Result);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- general purpose case mapping ---------------------------------------------------
|
|||
|
|
|
|||
|
|
// Note that most of the assigned code points don't have a case mapping and are therefore
|
|||
|
|
// returned as they are. Other code points, however, might be converted into several characters
|
|||
|
|
// like the german <20> (eszett) whose upper case mapping is SS.
|
|||
|
|
|
|||
|
|
function WideCaseFolding(C: WideChar): WideString;
|
|||
|
|
// Special case folding function to map a string to either its lower case or
|
|||
|
|
// to special cases. This can be used for case-insensitive comparation.
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Mapping: TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Mapping := UnicodeCaseFold(UCS4(C));
|
|||
|
|
SetLength(Result, Length(Mapping));
|
|||
|
|
for I := 0 to High(Mapping) do
|
|||
|
|
Result[I + 1] := WideChar(Mapping[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideCaseFolding(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
for I := 1 to Length(S) do
|
|||
|
|
Result := Result + WideCaseFolding(S[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideLowerCase(C: WideChar): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Mapping: TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Mapping := UnicodeToLower(UCS4(C));
|
|||
|
|
SetLength(Result, Length(Mapping));
|
|||
|
|
for I := 0 to High(Mapping) do
|
|||
|
|
Result[I + 1] := WideChar(Mapping[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideLowerCase(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
for I := 1 to Length(S) do
|
|||
|
|
Result := Result + WideLowerCase(S[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideNormalize(const S: WideString; Form: TNormalizationForm): WideString;
|
|||
|
|
var
|
|||
|
|
Temp: WideString;
|
|||
|
|
Compatible: Boolean;
|
|||
|
|
begin
|
|||
|
|
Result := S;
|
|||
|
|
|
|||
|
|
if Form = nfNone then
|
|||
|
|
Exit; // No normalization needed.
|
|||
|
|
|
|||
|
|
Compatible := Form in [nfKC, nfKD];
|
|||
|
|
if Form in [nfD, nfKD] then
|
|||
|
|
Result := WideDecompose(S, Compatible)
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Temp := WideDecompose(S, Compatible);
|
|||
|
|
Result := WideCompose(Temp);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideSameText(const Str1, Str2: WideString): Boolean;
|
|||
|
|
// Compares both strings case-insensitively and returns True if both are equal, otherwise False is returned.
|
|||
|
|
begin
|
|||
|
|
Result := Length(Str1) = Length(Str2);
|
|||
|
|
if Result then
|
|||
|
|
Result := StrICompW(PWideChar(Str1), PWideChar(Str2)) = 0;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideTitleCase(C: WideChar): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Mapping: TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Mapping := UnicodeToTitle(UCS4(C));
|
|||
|
|
SetLength(Result, Length(Mapping));
|
|||
|
|
for I := 0 to High(Mapping) do
|
|||
|
|
Result[I + 1] := WideChar(Mapping[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideTitleCase(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
for I := 1 to Length(S) do
|
|||
|
|
Result := Result + WideTitleCase(S[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideUpperCase(C: WideChar): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
Mapping: TUCS4Array;
|
|||
|
|
begin
|
|||
|
|
Mapping := UnicodeToUpper(UCS4(C));
|
|||
|
|
SetLength(Result, Length(Mapping));
|
|||
|
|
for I := 0 to High(Mapping) do
|
|||
|
|
Result[I + 1] := WideChar(Mapping[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideUpperCase(const S: WideString): WideString;
|
|||
|
|
var
|
|||
|
|
I: Integer;
|
|||
|
|
begin
|
|||
|
|
Result := '';
|
|||
|
|
for I := 1 to Length(S) do
|
|||
|
|
Result := Result + WideUpperCase(S[I]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- character test routines --------------------------------------------------------
|
|||
|
|
|
|||
|
|
function UnicodeIsAlpha(C: UCS4): Boolean; // Is the character alphabetic?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassLetter);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsDigit(C: UCS4): Boolean; // Is the character a digit?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccNumberDecimalDigit]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsAlphaNum(C: UCS4): Boolean; // Is the character alphabetic or a number?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassLetter + [ccNumberDecimalDigit]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsCased(C: UCS4): Boolean;
|
|||
|
|
// Is the character a "cased" character, i.e. either lower case, title case or upper case
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccLetterLowercase, ccLetterTitleCase, ccLetterUppercase]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsControl(C: UCS4): Boolean;
|
|||
|
|
// Is the character a control character?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccOtherControl, ccOtherFormat]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsSpace(C: UCS4): Boolean;
|
|||
|
|
// Is the character a spacing character?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassSpace);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsWhiteSpace(C: UCS4): Boolean;
|
|||
|
|
// Is the character a white space character (same as UnicodeIsSpace plus
|
|||
|
|
// tabulator, new line etc.)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassSpace + [ccWhiteSpace, ccSegmentSeparator]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsBlank(C: UCS4): Boolean;
|
|||
|
|
// Is the character a space separator?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSeparatorSpace]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsPunctuation(C: UCS4): Boolean;
|
|||
|
|
// Is the character a punctuation mark?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassPunctuation);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsGraph(C: UCS4): Boolean;
|
|||
|
|
// Is the character graphical?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsPrintable(C: UCS4): Boolean;
|
|||
|
|
// Is the character printable?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol +
|
|||
|
|
[ccSeparatorSpace]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsUpper(C: UCS4): Boolean;
|
|||
|
|
// Is the character already upper case?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccLetterUppercase]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsLower(C: UCS4): Boolean;
|
|||
|
|
// Is the character already lower case?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccLetterLowercase]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsTitle(C: UCS4): Boolean;
|
|||
|
|
// Is the character already title case?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccLetterTitlecase]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsHexDigit(C: UCS4): Boolean;
|
|||
|
|
// Is the character a hex digit?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccHexDigit]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsIsoControl(C: UCS4): Boolean;
|
|||
|
|
// Is the character a C0 control character (< 32)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccOtherControl]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsFormatControl(C: UCS4): Boolean;
|
|||
|
|
// Is the character a format control character?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccOtherFormat]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsSymbol(C: UCS4): Boolean;
|
|||
|
|
// Is the character a symbol?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassSymbol);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsNumber(C: UCS4): Boolean;
|
|||
|
|
// Is the character a number or digit?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassNumber);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsNonSpacing(C: UCS4): Boolean;
|
|||
|
|
// Is the character non-spacing?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccMarkNonSpacing]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsOpenPunctuation(C: UCS4): Boolean;
|
|||
|
|
// Is the character an open/left punctuation (e.g. '[')?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccPunctuationOpen]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsClosePunctuation(C: UCS4): Boolean;
|
|||
|
|
// Is the character an close/right punctuation (e.g. ']')?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccPunctuationClose]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsInitialPunctuation(C: UCS4): Boolean;
|
|||
|
|
// Is the character an initial punctuation (e.g. U+2018 LEFT SINGLE QUOTATION MARK)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccPunctuationInitialQuote]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsFinalPunctuation(C: UCS4): Boolean;
|
|||
|
|
// Is the character a final punctuation (e.g. U+2019 RIGHT SINGLE QUOTATION MARK)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccPunctuationFinalQuote]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsComposed(C: UCS4): Boolean;
|
|||
|
|
// Can the character be decomposed into a set of other characters?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccComposed]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsQuotationMark(C: UCS4): Boolean;
|
|||
|
|
// Is the character one of the many quotation marks?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccQuotationMark]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsSymmetric(C: UCS4): Boolean;
|
|||
|
|
// Is the character one that has an opposite form (i.e. <>)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSymmetric]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsMirroring(C: UCS4): Boolean;
|
|||
|
|
// Is the character mirroring (superset of symmetric)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccMirroring]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsNonBreaking(C: UCS4): Boolean;
|
|||
|
|
// Is the character non-breaking (i.e. non-breaking space)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccNonBreaking]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsRightToLeft(C: UCS4): Boolean;
|
|||
|
|
// Does the character have strong right-to-left directionality (i.e. Arabic letters)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccRightToLeft]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsLeftToRight(C: UCS4): Boolean;
|
|||
|
|
// Does the character have strong left-to-right directionality (i.e. Latin letters)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccLeftToRight]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsStrong(C: UCS4): Boolean;
|
|||
|
|
// Does the character have strong directionality?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccLeftToRight, ccRightToLeft]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsWeak(C: UCS4): Boolean;
|
|||
|
|
// Does the character have weak directionality (i.e. numbers)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassEuropeanNumber + [ccArabicNumber, ccCommonNumberSeparator]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsNeutral(C: UCS4): Boolean;
|
|||
|
|
// Does the character have neutral directionality (i.e. whitespace)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSeparatorParagraph, ccSegmentSeparator, ccWhiteSpace, ccOtherNeutrals]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsSeparator(C: UCS4): Boolean;
|
|||
|
|
// Is the character a block or segment separator?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSeparatorParagraph, ccSegmentSeparator]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsMark(C: UCS4): Boolean;
|
|||
|
|
// Is the character a mark of some kind?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassMark);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsModifier(C: UCS4): Boolean;
|
|||
|
|
// Is the character a letter modifier?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccLetterModifier]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsLetterNumber(C: UCS4): Boolean;
|
|||
|
|
// Is the character a number represented by a letter?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccNumberLetter]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsConnectionPunctuation(C: UCS4): Boolean;
|
|||
|
|
// Is the character connecting punctuation?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccPunctuationConnector]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsDash(C: UCS4): Boolean;
|
|||
|
|
// Is the character a dash punctuation?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccPunctuationDash]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsMath(C: UCS4): Boolean;
|
|||
|
|
// Is the character a math character?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSymbolMath]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsCurrency(C: UCS4): Boolean;
|
|||
|
|
// Is the character a currency character?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSymbolCurrency]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsModifierSymbol(C: UCS4): Boolean;
|
|||
|
|
// Is the character a modifier symbol?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSymbolModifier]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsNonSpacingMark(C: UCS4): Boolean;
|
|||
|
|
// Is the character a non-spacing mark?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccMarkNonSpacing]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsSpacingMark(C: UCS4): Boolean;
|
|||
|
|
// Is the character a spacing mark?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccMarkSpacingCombining]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsEnclosing(C: UCS4): Boolean;
|
|||
|
|
// Is the character enclosing (i.e. enclosing box)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccMarkEnclosing]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsPrivate(C: UCS4): Boolean;
|
|||
|
|
// Is the character from the Private Use Area?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccOtherPrivate]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsSurrogate(C: UCS4): Boolean;
|
|||
|
|
// Is the character one of the surrogate codes?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccOtherSurrogate]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsLineSeparator(C: UCS4): Boolean;
|
|||
|
|
// Is the character a line separator?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSeparatorLine]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsParagraphSeparator(C: UCS4): Boolean;
|
|||
|
|
// Is th character a paragraph separator;
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccSeparatorParagraph]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsIdentifierStart(C: UCS4): Boolean;
|
|||
|
|
// Can the character begin an identifier?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassLetter + [ccNumberLetter]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsIdentifierPart(C: UCS4): Boolean;
|
|||
|
|
// Can the character appear in an identifier?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, ClassLetter + [ccNumberLetter, ccMarkNonSpacing, ccMarkSpacingCombining,
|
|||
|
|
ccNumberDecimalDigit, ccPunctuationConnector, ccOtherFormat]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsDefined(C: UCS4): Boolean;
|
|||
|
|
// Is the character defined (appears in one of the data files)?
|
|||
|
|
begin
|
|||
|
|
Result := CategoryLookup(C, [ccAssigned]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsUndefined(C: UCS4): Boolean;
|
|||
|
|
// Is the character undefined (not assigned in the Unicode database)?
|
|||
|
|
begin
|
|||
|
|
Result := not CategoryLookup(C, [ccAssigned]);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsHan(C: UCS4): Boolean;
|
|||
|
|
// Is the character a Han ideograph?
|
|||
|
|
begin
|
|||
|
|
Result := ((C >= $4E00) and (C <= $9FFF)) or ((C >= $F900) and (C <= $FAFF));
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UnicodeIsHangul(C: UCS4): Boolean;
|
|||
|
|
// Is the character a pre-composed Hangul syllable?
|
|||
|
|
begin
|
|||
|
|
Result := (C >= $AC00) and (C <= $D7FF);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// I need to fix a problem (introduced by MS) here. The first parameter can be a pointer
|
|||
|
|
// (and is so defined) or can be a normal DWORD, depending on the dwFlags parameter.
|
|||
|
|
// As usual, lpSrc has been translated to a var parameter. But this does not work in
|
|||
|
|
// our case, hence the redeclaration of the function with a pointer as first parameter.
|
|||
|
|
|
|||
|
|
function TranslateCharsetInfoEx(lpSrc: PDWORD; var lpCs: TCharsetInfo; dwFlags: DWORD): BOOL; stdcall;
|
|||
|
|
external 'gdi32.dll' name 'TranslateCharsetInfo';
|
|||
|
|
|
|||
|
|
function GetCharSetFromLocale(Language: LCID; out FontCharSet: TFontCharSet): Boolean;
|
|||
|
|
var
|
|||
|
|
CP: Cardinal;
|
|||
|
|
CSI: TCharsetInfo;
|
|||
|
|
begin
|
|||
|
|
CP:= CodePageFromLocale(Language);
|
|||
|
|
Result := TranslateCharsetInfoEx(Pointer(CP), CSI, TCI_SRCCODEPAGE);
|
|||
|
|
if Result then
|
|||
|
|
FontCharset := CSI.ciCharset;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function CharSetFromLocale(Language: LCID): TFontCharSet;
|
|||
|
|
begin
|
|||
|
|
if not GetCharSetFromLocale(Language, Result) then
|
|||
|
|
RaiseLastOSError;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function CodePageFromLocale(Language: LCID): Integer;
|
|||
|
|
// determines the code page for a given locale
|
|||
|
|
var
|
|||
|
|
Buf: array [0..6] of Char;
|
|||
|
|
begin
|
|||
|
|
GetLocaleInfo(Language, LOCALE_IDefaultAnsiCodePage, Buf, 6);
|
|||
|
|
Result := StrToIntDef(Buf, GetACP);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function KeyboardCodePage: Word;
|
|||
|
|
begin
|
|||
|
|
Result := CodePageFromLocale(GetKeyboardLayout(0) and $FFFF);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function KeyUnicode(C: Char): WideChar;
|
|||
|
|
// converts the given character (as it comes with a WM_CHAR message) into its
|
|||
|
|
// corresponding Unicode character depending on the active keyboard layout
|
|||
|
|
begin
|
|||
|
|
MultiByteToWideChar(KeyboardCodePage, MB_USEGLYPHCHARS, @C, 1, @Result, 1);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function CodeBlockRange(const CB: TUnicodeBlock): TUnicodeBlockRange;
|
|||
|
|
// http://www.unicode.org/Public/4.1.0/ucd/Blocks.txt
|
|||
|
|
begin
|
|||
|
|
case CB of
|
|||
|
|
ubBasicLatin:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0000;
|
|||
|
|
Result.RangeEnd := $007F;
|
|||
|
|
end;
|
|||
|
|
ubLatin1Supplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0080;
|
|||
|
|
Result.RangeEnd := $00FF;
|
|||
|
|
end;
|
|||
|
|
ubLatinExtendedA:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0100;
|
|||
|
|
Result.RangeEnd := $017F;
|
|||
|
|
end;
|
|||
|
|
ubLatinExtendedB:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0180;
|
|||
|
|
Result.RangeEnd := $024F;
|
|||
|
|
end;
|
|||
|
|
ubIPAExtensions:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0250;
|
|||
|
|
Result.RangeEnd := $02AF;
|
|||
|
|
end;
|
|||
|
|
ubSpacingModifierLetters:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $02B0;
|
|||
|
|
Result.RangeEnd := $02FF;
|
|||
|
|
end;
|
|||
|
|
ubCombiningDiacriticalMarks:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0300;
|
|||
|
|
Result.RangeEnd := $036F;
|
|||
|
|
end;
|
|||
|
|
ubGreek:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0370;
|
|||
|
|
Result.RangeEnd := $03FF;
|
|||
|
|
end;
|
|||
|
|
ubCyrillic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0400;
|
|||
|
|
Result.RangeEnd := $04FF;
|
|||
|
|
end;
|
|||
|
|
ubCyrillicSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0500;
|
|||
|
|
Result.RangeEnd := $052F;
|
|||
|
|
end;
|
|||
|
|
ubArmenian:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0530;
|
|||
|
|
Result.RangeEnd := $058F;
|
|||
|
|
end;
|
|||
|
|
ubHebrew:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0590;
|
|||
|
|
Result.RangeEnd := $05FF;
|
|||
|
|
end;
|
|||
|
|
ubArabic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0600;
|
|||
|
|
Result.RangeEnd := $06FF;
|
|||
|
|
end;
|
|||
|
|
ubSyriac:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0700;
|
|||
|
|
Result.RangeEnd := $074F;
|
|||
|
|
end;
|
|||
|
|
ubArabicSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0750;
|
|||
|
|
Result.RangeEnd := $077F;
|
|||
|
|
end;
|
|||
|
|
ubThaana:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0780;
|
|||
|
|
Result.RangeEnd := $07BF;
|
|||
|
|
end;
|
|||
|
|
ubDevanagari:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0900;
|
|||
|
|
Result.RangeEnd := $097F;
|
|||
|
|
end;
|
|||
|
|
ubBengali:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0980;
|
|||
|
|
Result.RangeEnd := $09FF;
|
|||
|
|
end;
|
|||
|
|
ubGurmukhi:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0A00;
|
|||
|
|
Result.RangeEnd := $0A7F;
|
|||
|
|
end;
|
|||
|
|
ubGujarati:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0A80;
|
|||
|
|
Result.RangeEnd := $0AFF;
|
|||
|
|
end;
|
|||
|
|
ubOriya:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0B00;
|
|||
|
|
Result.RangeEnd := $0B7F;
|
|||
|
|
end;
|
|||
|
|
ubTamil:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0B80;
|
|||
|
|
Result.RangeEnd := $0BFF;
|
|||
|
|
end;
|
|||
|
|
ubTelugu:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0C00;
|
|||
|
|
Result.RangeEnd := $0C7F;
|
|||
|
|
end;
|
|||
|
|
ubKannada:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0C80;
|
|||
|
|
Result.RangeEnd := $0CFF;
|
|||
|
|
end;
|
|||
|
|
ubMalayalam:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0D00;
|
|||
|
|
Result.RangeEnd := $0D7F;
|
|||
|
|
end;
|
|||
|
|
ubSinhala:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0D80;
|
|||
|
|
Result.RangeEnd := $0DFF;
|
|||
|
|
end;
|
|||
|
|
ubThai:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0E00;
|
|||
|
|
Result.RangeEnd := $0E7F;
|
|||
|
|
end;
|
|||
|
|
ubLao:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0E80;
|
|||
|
|
Result.RangeEnd := $0EFF;
|
|||
|
|
end;
|
|||
|
|
ubTibetan:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $0F00;
|
|||
|
|
Result.RangeEnd := $0FFF;
|
|||
|
|
end;
|
|||
|
|
ubMyanmar:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1000;
|
|||
|
|
Result.RangeEnd := $109F;
|
|||
|
|
end;
|
|||
|
|
ubGeorgian:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10A0;
|
|||
|
|
Result.RangeEnd := $10FF;
|
|||
|
|
end;
|
|||
|
|
ubHangulJamo:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1100;
|
|||
|
|
Result.RangeEnd := $11FF;
|
|||
|
|
end;
|
|||
|
|
ubEthiopic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1200;
|
|||
|
|
Result.RangeEnd := $137F;
|
|||
|
|
end;
|
|||
|
|
ubEthiopicSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1380;
|
|||
|
|
Result.RangeEnd := $139F;
|
|||
|
|
end;
|
|||
|
|
ubCherokee:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $13A0;
|
|||
|
|
Result.RangeEnd := $13FF;
|
|||
|
|
end;
|
|||
|
|
ubUnifiedCanadianAboriginalSyllabics:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1400;
|
|||
|
|
Result.RangeEnd := $167F;
|
|||
|
|
end;
|
|||
|
|
ubOgham:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1680;
|
|||
|
|
Result.RangeEnd := $169F;
|
|||
|
|
end;
|
|||
|
|
ubRunic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $16A0;
|
|||
|
|
Result.RangeEnd := $16FF;
|
|||
|
|
end;
|
|||
|
|
ubTagalog:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1700;
|
|||
|
|
Result.RangeEnd := $171F;
|
|||
|
|
end;
|
|||
|
|
ubHanunoo:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1720;
|
|||
|
|
Result.RangeEnd := $173F;
|
|||
|
|
end;
|
|||
|
|
ubBuhid:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1740;
|
|||
|
|
Result.RangeEnd := $175F;
|
|||
|
|
end;
|
|||
|
|
ubTagbanwa:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1760;
|
|||
|
|
Result.RangeEnd := $177F;
|
|||
|
|
end;
|
|||
|
|
ubKhmer:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1780;
|
|||
|
|
Result.RangeEnd := $17FF;
|
|||
|
|
end;
|
|||
|
|
ubMongolian:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1800;
|
|||
|
|
Result.RangeEnd := $18AF;
|
|||
|
|
end;
|
|||
|
|
ubLimbu:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1900;
|
|||
|
|
Result.RangeEnd := $194F;
|
|||
|
|
end;
|
|||
|
|
ubTaiLe:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1950;
|
|||
|
|
Result.RangeEnd := $197F;
|
|||
|
|
end;
|
|||
|
|
ubNewTaiLue:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1980;
|
|||
|
|
Result.RangeEnd := $19DF;
|
|||
|
|
end;
|
|||
|
|
ubKhmerSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $19E0;
|
|||
|
|
Result.RangeEnd := $19FF;
|
|||
|
|
end;
|
|||
|
|
ubBuginese:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1A00;
|
|||
|
|
Result.RangeEnd := $1A1F;
|
|||
|
|
end;
|
|||
|
|
ubPhoneticExtensions:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1D00;
|
|||
|
|
Result.RangeEnd := $1D7F;
|
|||
|
|
end;
|
|||
|
|
ubPhoneticExtensionsSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1D80;
|
|||
|
|
Result.RangeEnd := $1DBF;
|
|||
|
|
end;
|
|||
|
|
ubCombiningDiacriticalMarksSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1DC0;
|
|||
|
|
Result.RangeEnd := $1DFF;
|
|||
|
|
end;
|
|||
|
|
ubLatinExtendedAdditional:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1E00;
|
|||
|
|
Result.RangeEnd := $1EFF;
|
|||
|
|
end;
|
|||
|
|
ubGreekExtended:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1F00;
|
|||
|
|
Result.RangeEnd := $1FFF;
|
|||
|
|
end;
|
|||
|
|
ubGeneralPunctuation:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2000;
|
|||
|
|
Result.RangeEnd := $206F;
|
|||
|
|
end;
|
|||
|
|
ubSuperscriptsandSubscripts:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2070;
|
|||
|
|
Result.RangeEnd := $209F;
|
|||
|
|
end;
|
|||
|
|
ubCurrencySymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $20A0;
|
|||
|
|
Result.RangeEnd := $20CF;
|
|||
|
|
end;
|
|||
|
|
ubCombiningMarksforSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $20D0;
|
|||
|
|
Result.RangeEnd := $20FF;
|
|||
|
|
end;
|
|||
|
|
ubLetterlikeSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2100;
|
|||
|
|
Result.RangeEnd := $214F;
|
|||
|
|
end;
|
|||
|
|
ubNumberForms:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2150;
|
|||
|
|
Result.RangeEnd := $218F;
|
|||
|
|
end;
|
|||
|
|
ubArrows:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2190;
|
|||
|
|
Result.RangeEnd := $21FF;
|
|||
|
|
end;
|
|||
|
|
ubMathematicalOperators:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2200;
|
|||
|
|
Result.RangeEnd := $22FF;
|
|||
|
|
end;
|
|||
|
|
ubMiscellaneousTechnical:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2300;
|
|||
|
|
Result.RangeEnd := $23FF;
|
|||
|
|
end;
|
|||
|
|
ubControlPictures:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2400;
|
|||
|
|
Result.RangeEnd := $243F;
|
|||
|
|
end;
|
|||
|
|
ubOpticalCharacterRecognition:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2440;
|
|||
|
|
Result.RangeEnd := $245F;
|
|||
|
|
end;
|
|||
|
|
ubEnclosedAlphanumerics:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2460;
|
|||
|
|
Result.RangeEnd := $24FF;
|
|||
|
|
end;
|
|||
|
|
ubBoxDrawing:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2500;
|
|||
|
|
Result.RangeEnd := $257F;
|
|||
|
|
end;
|
|||
|
|
ubBlockElements:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2580;
|
|||
|
|
Result.RangeEnd := $259F;
|
|||
|
|
end;
|
|||
|
|
ubGeometricShapes:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $25A0;
|
|||
|
|
Result.RangeEnd := $25FF;
|
|||
|
|
end;
|
|||
|
|
ubMiscellaneousSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2600;
|
|||
|
|
Result.RangeEnd := $26FF;
|
|||
|
|
end;
|
|||
|
|
ubDingbats:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2700;
|
|||
|
|
Result.RangeEnd := $27BF;
|
|||
|
|
end;
|
|||
|
|
ubMiscellaneousMathematicalSymbolsA:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $27C0;
|
|||
|
|
Result.RangeEnd := $27EF;
|
|||
|
|
end;
|
|||
|
|
ubSupplementalArrowsA:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $27F0;
|
|||
|
|
Result.RangeEnd := $27FF;
|
|||
|
|
end;
|
|||
|
|
ubBraillePatterns:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2800;
|
|||
|
|
Result.RangeEnd := $28FF;
|
|||
|
|
end;
|
|||
|
|
ubSupplementalArrowsB:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2900;
|
|||
|
|
Result.RangeEnd := $297F;
|
|||
|
|
end;
|
|||
|
|
ubMiscellaneousMathematicalSymbolsB:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2980;
|
|||
|
|
Result.RangeEnd := $29FF;
|
|||
|
|
end;
|
|||
|
|
ubSupplementalMathematicalOperators:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2A00;
|
|||
|
|
Result.RangeEnd := $2AFF;
|
|||
|
|
end;
|
|||
|
|
ubMiscellaneousSymbolsandArrows:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2B00;
|
|||
|
|
Result.RangeEnd := $2BFF;
|
|||
|
|
end;
|
|||
|
|
ubGlagolitic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2C00;
|
|||
|
|
Result.RangeEnd := $2C5F;
|
|||
|
|
end;
|
|||
|
|
ubCoptic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2C80;
|
|||
|
|
Result.RangeEnd := $2CFF;
|
|||
|
|
end;
|
|||
|
|
ubGeorgianSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2D00;
|
|||
|
|
Result.RangeEnd := $2D2F;
|
|||
|
|
end;
|
|||
|
|
ubTifinagh:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2D30;
|
|||
|
|
Result.RangeEnd := $2D7F;
|
|||
|
|
end;
|
|||
|
|
ubEthiopicExtended:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2D80;
|
|||
|
|
Result.RangeEnd := $2DDF;
|
|||
|
|
end;
|
|||
|
|
ubSupplementalPunctuation:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2E00;
|
|||
|
|
Result.RangeEnd := $2E7F;
|
|||
|
|
end;
|
|||
|
|
ubCJKRadicalsSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2E80;
|
|||
|
|
Result.RangeEnd := $2EFF;
|
|||
|
|
end;
|
|||
|
|
ubKangxiRadicals:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2F00;
|
|||
|
|
Result.RangeEnd := $2FDF;
|
|||
|
|
end;
|
|||
|
|
ubIdeographicDescriptionCharacters:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2FF0;
|
|||
|
|
Result.RangeEnd := $2FFF;
|
|||
|
|
end;
|
|||
|
|
ubCJKSymbolsandPunctuation:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3000;
|
|||
|
|
Result.RangeEnd := $303F;
|
|||
|
|
end;
|
|||
|
|
ubHiragana:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3040;
|
|||
|
|
Result.RangeEnd := $309F;
|
|||
|
|
end;
|
|||
|
|
ubKatakana:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $30A0;
|
|||
|
|
Result.RangeEnd := $30FF;
|
|||
|
|
end;
|
|||
|
|
ubBopomofo:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3100;
|
|||
|
|
Result.RangeEnd := $312F;
|
|||
|
|
end;
|
|||
|
|
ubHangulCompatibilityJamo:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3130;
|
|||
|
|
Result.RangeEnd := $318F;
|
|||
|
|
end;
|
|||
|
|
ubKanbun:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3190;
|
|||
|
|
Result.RangeEnd := $319F;
|
|||
|
|
end;
|
|||
|
|
ubBopomofoExtended:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $31A0;
|
|||
|
|
Result.RangeEnd := $31BF;
|
|||
|
|
end;
|
|||
|
|
ubCJKStrokes:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $31C0;
|
|||
|
|
Result.RangeEnd := $31EF;
|
|||
|
|
end;
|
|||
|
|
ubKatakanaPhoneticExtensions:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $31F0;
|
|||
|
|
Result.RangeEnd := $31FF;
|
|||
|
|
end;
|
|||
|
|
ubEnclosedCJKLettersandMonths:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3200;
|
|||
|
|
Result.RangeEnd := $32FF;
|
|||
|
|
end;
|
|||
|
|
ubCJKCompatibility:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3300;
|
|||
|
|
Result.RangeEnd := $33FF;
|
|||
|
|
end;
|
|||
|
|
ubCJKUnifiedIdeographsExtensionA:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $3400;
|
|||
|
|
Result.RangeEnd := $4DBF;
|
|||
|
|
end;
|
|||
|
|
ubYijingHexagramSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $4DC0;
|
|||
|
|
Result.RangeEnd := $4DFF;
|
|||
|
|
end;
|
|||
|
|
ubCJKUnifiedIdeographs:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $4E00;
|
|||
|
|
Result.RangeEnd := $9FFF;
|
|||
|
|
end;
|
|||
|
|
ubYiSyllables:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $A000;
|
|||
|
|
Result.RangeEnd := $A48F;
|
|||
|
|
end;
|
|||
|
|
ubYiRadicals:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $A490;
|
|||
|
|
Result.RangeEnd := $A4CF;
|
|||
|
|
end;
|
|||
|
|
ubModifierToneLetters:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $A700;
|
|||
|
|
Result.RangeEnd := $A71F;
|
|||
|
|
end;
|
|||
|
|
ubSylotiNagri:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $A800;
|
|||
|
|
Result.RangeEnd := $A82F;
|
|||
|
|
end;
|
|||
|
|
ubHangulSyllables:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $AC00;
|
|||
|
|
Result.RangeEnd := $D7AF;
|
|||
|
|
end;
|
|||
|
|
ubHighSurrogates:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $D800;
|
|||
|
|
Result.RangeEnd := $DB7F;
|
|||
|
|
end;
|
|||
|
|
ubHighPrivateUseSurrogates:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $DB80;
|
|||
|
|
Result.RangeEnd := $DBFF;
|
|||
|
|
end;
|
|||
|
|
ubLowSurrogates:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $DC00;
|
|||
|
|
Result.RangeEnd := $DFFF;
|
|||
|
|
end;
|
|||
|
|
ubPrivateUse:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $E000;
|
|||
|
|
Result.RangeEnd := $F8FF;
|
|||
|
|
end;
|
|||
|
|
ubCJKCompatibilityIdeographs:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $F900;
|
|||
|
|
Result.RangeEnd := $FAFF;
|
|||
|
|
end;
|
|||
|
|
ubAlphabeticPresentationForms:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FB00;
|
|||
|
|
Result.RangeEnd := $FB4F;
|
|||
|
|
end;
|
|||
|
|
ubArabicPresentationFormsA:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FB50;
|
|||
|
|
Result.RangeEnd := $FDFF;
|
|||
|
|
end;
|
|||
|
|
ubVariationSelectors:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FE00;
|
|||
|
|
Result.RangeEnd := $FE0F;
|
|||
|
|
end;
|
|||
|
|
ubVerticalForms:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FE10;
|
|||
|
|
Result.RangeEnd := $FE1F;
|
|||
|
|
end;
|
|||
|
|
ubCombiningHalfMarks:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FE20;
|
|||
|
|
Result.RangeEnd := $FE2F;
|
|||
|
|
end;
|
|||
|
|
ubCJKCompatibilityForms:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FE30;
|
|||
|
|
Result.RangeEnd := $FE4F;
|
|||
|
|
end;
|
|||
|
|
ubSmallFormVariants:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FE50;
|
|||
|
|
Result.RangeEnd := $FE6F;
|
|||
|
|
end;
|
|||
|
|
ubArabicPresentationFormsB:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FE70;
|
|||
|
|
Result.RangeEnd := $FEFF;
|
|||
|
|
end;
|
|||
|
|
ubHalfwidthandFullwidthForms:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FF00;
|
|||
|
|
Result.RangeEnd := $FFEF;
|
|||
|
|
end;
|
|||
|
|
ubSpecials:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $FFF0;
|
|||
|
|
Result.RangeEnd := $FFFF;
|
|||
|
|
end;
|
|||
|
|
ubLinearBSyllabary:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10000;
|
|||
|
|
Result.RangeEnd := $1007F;
|
|||
|
|
end;
|
|||
|
|
ubLinearBIdeograms:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10080;
|
|||
|
|
Result.RangeEnd := $100FF;
|
|||
|
|
end;
|
|||
|
|
ubAegeanNumbers:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10100;
|
|||
|
|
Result.RangeEnd := $1013F;
|
|||
|
|
end;
|
|||
|
|
ubAncientGreekNumbers:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10140;
|
|||
|
|
Result.RangeEnd := $1018F;
|
|||
|
|
end;
|
|||
|
|
ubOldItalic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10300;
|
|||
|
|
Result.RangeEnd := $1032F;
|
|||
|
|
end;
|
|||
|
|
ubGothic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10330;
|
|||
|
|
Result.RangeEnd := $1034F;
|
|||
|
|
end;
|
|||
|
|
ubUgaritic:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10380;
|
|||
|
|
Result.RangeEnd := $1039F;
|
|||
|
|
end;
|
|||
|
|
ubOldPersian:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $103A0;
|
|||
|
|
Result.RangeEnd := $103DF;
|
|||
|
|
end;
|
|||
|
|
ubDeseret:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10400;
|
|||
|
|
Result.RangeEnd := $1044F;
|
|||
|
|
end;
|
|||
|
|
ubShavian:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10450;
|
|||
|
|
Result.RangeEnd := $1047F;
|
|||
|
|
end;
|
|||
|
|
ubOsmanya:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10480;
|
|||
|
|
Result.RangeEnd := $104AF;
|
|||
|
|
end;
|
|||
|
|
ubCypriotSyllabary:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10800;
|
|||
|
|
Result.RangeEnd := $1083F;
|
|||
|
|
end;
|
|||
|
|
ubKharoshthi:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $10A00;
|
|||
|
|
Result.RangeEnd := $10A5F;
|
|||
|
|
end;
|
|||
|
|
ubByzantineMusicalSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1D000;
|
|||
|
|
Result.RangeEnd := $1D0FF;
|
|||
|
|
end;
|
|||
|
|
ubMusicalSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1D100;
|
|||
|
|
Result.RangeEnd := $1D1FF;
|
|||
|
|
end;
|
|||
|
|
ubAncientGreekMusicalNotation:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1D200;
|
|||
|
|
Result.RangeEnd := $1D24F;
|
|||
|
|
end;
|
|||
|
|
ubTaiXuanJingSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1D300;
|
|||
|
|
Result.RangeEnd := $1D35F;
|
|||
|
|
end;
|
|||
|
|
ubMathematicalAlphanumericSymbols:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $1D400;
|
|||
|
|
Result.RangeEnd := $1D7FF;
|
|||
|
|
end;
|
|||
|
|
ubCJKUnifiedIdeographsExtensionB:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $20000;
|
|||
|
|
Result.RangeEnd := $2A6DF;
|
|||
|
|
end;
|
|||
|
|
ubCJKCompatibilityIdeographsSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $2F800;
|
|||
|
|
Result.RangeEnd := $2FA1F;
|
|||
|
|
end;
|
|||
|
|
ubTags:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $E0000;
|
|||
|
|
Result.RangeEnd := $E007F;
|
|||
|
|
end;
|
|||
|
|
ubVariationSelectorsSupplement:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $E0100;
|
|||
|
|
Result.RangeEnd := $E01EF;
|
|||
|
|
end;
|
|||
|
|
ubSupplementaryPrivateUseAreaA:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $F0000;
|
|||
|
|
Result.RangeEnd := $FFFFF;
|
|||
|
|
end;
|
|||
|
|
ubSupplementaryPrivateUseAreaB:
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := $100000;
|
|||
|
|
Result.RangeEnd := $10FFFF;
|
|||
|
|
end;
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Result.RangeStart := 0;
|
|||
|
|
Result.RangeEnd := 0;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
|
|||
|
|
// Returns the CodeBlockName of the Block specified by CB
|
|||
|
|
// Names taken from http://www.unicode.org/Public/4.1.0/ucd/Blocks.txt
|
|||
|
|
function CodeBlockName(const CB: TUnicodeBlock): string;
|
|||
|
|
begin
|
|||
|
|
case CB of
|
|||
|
|
ubBasicLatin:
|
|||
|
|
Result := 'Basic Latin';
|
|||
|
|
ubLatin1Supplement:
|
|||
|
|
Result := 'Latin-1 Supplement';
|
|||
|
|
ubLatinExtendedA:
|
|||
|
|
Result := 'Latin Extended-A';
|
|||
|
|
ubLatinExtendedB:
|
|||
|
|
Result := 'Latin Extended-B';
|
|||
|
|
ubIPAExtensions:
|
|||
|
|
Result := 'IPA Extensions';
|
|||
|
|
ubSpacingModifierLetters:
|
|||
|
|
Result := 'Spacing Modifier Letters';
|
|||
|
|
ubCombiningDiacriticalMarks:
|
|||
|
|
Result := 'Combining Diacritical Marks';
|
|||
|
|
//ubGreekandCoptic:
|
|||
|
|
ubGreek:
|
|||
|
|
Result := 'Greek and Coptic';
|
|||
|
|
ubCyrillic:
|
|||
|
|
Result := 'Cyrillic';
|
|||
|
|
ubCyrillicSupplement:
|
|||
|
|
Result := 'Cyrillic Supplement';
|
|||
|
|
ubArmenian:
|
|||
|
|
Result := 'Armenian';
|
|||
|
|
ubHebrew:
|
|||
|
|
Result := 'Hebrew';
|
|||
|
|
ubArabic:
|
|||
|
|
Result := 'Arabic';
|
|||
|
|
ubSyriac:
|
|||
|
|
Result := 'Syriac';
|
|||
|
|
ubArabicSupplement:
|
|||
|
|
Result := 'Arabic Supplement';
|
|||
|
|
ubThaana:
|
|||
|
|
Result := 'Thaana';
|
|||
|
|
ubDevanagari:
|
|||
|
|
Result := 'Devanagari';
|
|||
|
|
ubBengali:
|
|||
|
|
Result := 'Bengali';
|
|||
|
|
ubGurmukhi:
|
|||
|
|
Result := 'Gurmukhi';
|
|||
|
|
ubGujarati:
|
|||
|
|
Result := 'Gujarati';
|
|||
|
|
ubOriya:
|
|||
|
|
Result := 'Oriya';
|
|||
|
|
ubTamil:
|
|||
|
|
Result := 'Tamil';
|
|||
|
|
ubTelugu:
|
|||
|
|
Result := 'Telugu';
|
|||
|
|
ubKannada:
|
|||
|
|
Result := 'Kannada';
|
|||
|
|
ubMalayalam:
|
|||
|
|
Result := 'Malayalam';
|
|||
|
|
ubSinhala:
|
|||
|
|
Result := 'Sinhala';
|
|||
|
|
ubThai:
|
|||
|
|
Result := 'Thai';
|
|||
|
|
ubLao:
|
|||
|
|
Result := 'Lao';
|
|||
|
|
ubTibetan:
|
|||
|
|
Result := 'Tibetan';
|
|||
|
|
ubMyanmar:
|
|||
|
|
Result := 'Myanmar';
|
|||
|
|
ubGeorgian:
|
|||
|
|
Result := 'Georgian';
|
|||
|
|
ubHangulJamo:
|
|||
|
|
Result := 'Hangul Jamo';
|
|||
|
|
ubEthiopic:
|
|||
|
|
Result := 'Ethiopic';
|
|||
|
|
ubEthiopicSupplement:
|
|||
|
|
Result := 'Ethiopic Supplement';
|
|||
|
|
ubCherokee:
|
|||
|
|
Result := 'Cherokee';
|
|||
|
|
ubUnifiedCanadianAboriginalSyllabics:
|
|||
|
|
Result := 'Unified Canadian Aboriginal Syllabics';
|
|||
|
|
ubOgham:
|
|||
|
|
Result := 'Ogham';
|
|||
|
|
ubRunic:
|
|||
|
|
Result := 'Runic';
|
|||
|
|
ubTagalog:
|
|||
|
|
Result := 'Tagalog';
|
|||
|
|
ubHanunoo:
|
|||
|
|
Result := 'Hanunoo';
|
|||
|
|
ubBuhid:
|
|||
|
|
Result := 'Buhid';
|
|||
|
|
ubTagbanwa:
|
|||
|
|
Result := 'Tagbanwa';
|
|||
|
|
ubKhmer:
|
|||
|
|
Result := 'Khmer';
|
|||
|
|
ubMongolian:
|
|||
|
|
Result := 'Mongolian';
|
|||
|
|
ubLimbu:
|
|||
|
|
Result := 'Limbu';
|
|||
|
|
ubTaiLe:
|
|||
|
|
Result := 'Tai Le';
|
|||
|
|
ubNewTaiLue:
|
|||
|
|
Result := 'New Tai Lue';
|
|||
|
|
ubKhmerSymbols:
|
|||
|
|
Result := 'Khmer Symbols';
|
|||
|
|
ubBuginese:
|
|||
|
|
Result := 'Buginese';
|
|||
|
|
ubPhoneticExtensions:
|
|||
|
|
Result := 'Phonetic Extensions';
|
|||
|
|
ubPhoneticExtensionsSupplement:
|
|||
|
|
Result := 'Phonetic Extensions Supplement';
|
|||
|
|
ubCombiningDiacriticalMarksSupplement:
|
|||
|
|
Result := 'Combining Diacritical Marks Supplement';
|
|||
|
|
ubLatinExtendedAdditional:
|
|||
|
|
Result := 'Latin Extended Additional';
|
|||
|
|
ubGreekExtended:
|
|||
|
|
Result := 'Greek Extended';
|
|||
|
|
ubGeneralPunctuation:
|
|||
|
|
Result := 'General Punctuation';
|
|||
|
|
ubSuperscriptsandSubscripts:
|
|||
|
|
Result := 'Superscripts and Subscripts';
|
|||
|
|
ubCurrencySymbols:
|
|||
|
|
Result := 'Currency Symbols';
|
|||
|
|
//ubCombiningDiacriticalMarksforSymbols:
|
|||
|
|
ubCombiningMarksforSymbols:
|
|||
|
|
Result := 'Combining Diacritical Marks for Symbols';
|
|||
|
|
ubLetterlikeSymbols:
|
|||
|
|
Result := 'Letterlike Symbols';
|
|||
|
|
ubNumberForms:
|
|||
|
|
Result := 'Number Forms';
|
|||
|
|
ubArrows:
|
|||
|
|
Result := 'Arrows';
|
|||
|
|
ubMathematicalOperators:
|
|||
|
|
Result := 'Mathematical Operators';
|
|||
|
|
ubMiscellaneousTechnical:
|
|||
|
|
Result := 'Miscellaneous Technical';
|
|||
|
|
ubControlPictures:
|
|||
|
|
Result := 'Control Pictures';
|
|||
|
|
ubOpticalCharacterRecognition:
|
|||
|
|
Result := 'Optical Character Recognition';
|
|||
|
|
ubEnclosedAlphanumerics:
|
|||
|
|
Result := 'Enclosed Alphanumerics';
|
|||
|
|
ubBoxDrawing:
|
|||
|
|
Result := 'Box Drawing';
|
|||
|
|
ubBlockElements:
|
|||
|
|
Result := 'Block Elements';
|
|||
|
|
ubGeometricShapes:
|
|||
|
|
Result := 'Geometric Shapes';
|
|||
|
|
ubMiscellaneousSymbols:
|
|||
|
|
Result := 'Miscellaneous Symbols';
|
|||
|
|
ubDingbats:
|
|||
|
|
Result := 'Dingbats';
|
|||
|
|
ubMiscellaneousMathematicalSymbolsA:
|
|||
|
|
Result := 'Miscellaneous Mathematical Symbols-A';
|
|||
|
|
ubSupplementalArrowsA:
|
|||
|
|
Result := 'Supplemental Arrows-A';
|
|||
|
|
ubBraillePatterns:
|
|||
|
|
Result := 'Braille Patterns';
|
|||
|
|
ubSupplementalArrowsB:
|
|||
|
|
Result := 'Supplemental Arrows-B';
|
|||
|
|
ubMiscellaneousMathematicalSymbolsB:
|
|||
|
|
Result := 'Miscellaneous Mathematical Symbols-B';
|
|||
|
|
ubSupplementalMathematicalOperators:
|
|||
|
|
Result := 'Supplemental Mathematical Operators';
|
|||
|
|
ubMiscellaneousSymbolsandArrows:
|
|||
|
|
Result := 'Miscellaneous Symbols and Arrows';
|
|||
|
|
ubGlagolitic:
|
|||
|
|
Result := 'Glagolitic';
|
|||
|
|
ubCoptic:
|
|||
|
|
Result := 'Coptic';
|
|||
|
|
ubGeorgianSupplement:
|
|||
|
|
Result := 'Georgian Supplement';
|
|||
|
|
ubTifinagh:
|
|||
|
|
Result := 'Tifinagh';
|
|||
|
|
ubEthiopicExtended:
|
|||
|
|
Result := 'Ethiopic Extended';
|
|||
|
|
ubSupplementalPunctuation:
|
|||
|
|
Result := 'Supplemental Punctuation';
|
|||
|
|
ubCJKRadicalsSupplement:
|
|||
|
|
Result := 'CJK Radicals Supplement';
|
|||
|
|
ubKangxiRadicals:
|
|||
|
|
Result := 'Kangxi Radicals';
|
|||
|
|
ubIdeographicDescriptionCharacters:
|
|||
|
|
Result := 'Ideographic Description Characters';
|
|||
|
|
ubCJKSymbolsandPunctuation:
|
|||
|
|
Result := 'CJK Symbols and Punctuation';
|
|||
|
|
ubHiragana:
|
|||
|
|
Result := 'Hiragana';
|
|||
|
|
ubKatakana:
|
|||
|
|
Result := 'Katakana';
|
|||
|
|
ubBopomofo:
|
|||
|
|
Result := 'Bopomofo';
|
|||
|
|
ubHangulCompatibilityJamo:
|
|||
|
|
Result := 'Hangul Compatibility Jamo';
|
|||
|
|
ubKanbun:
|
|||
|
|
Result := 'Kanbun';
|
|||
|
|
ubBopomofoExtended:
|
|||
|
|
Result := 'Bopomofo Extended';
|
|||
|
|
ubCJKStrokes:
|
|||
|
|
Result := 'CJK Strokes';
|
|||
|
|
ubKatakanaPhoneticExtensions:
|
|||
|
|
Result := 'Katakana Phonetic Extensions';
|
|||
|
|
ubEnclosedCJKLettersandMonths:
|
|||
|
|
Result := 'Enclosed CJK Letters and Months';
|
|||
|
|
ubCJKCompatibility:
|
|||
|
|
Result := 'CJK Compatibility';
|
|||
|
|
ubCJKUnifiedIdeographsExtensionA:
|
|||
|
|
Result := 'CJK Unified Ideographs Extension A';
|
|||
|
|
ubYijingHexagramSymbols:
|
|||
|
|
Result := 'Yijing Hexagram Symbols';
|
|||
|
|
ubCJKUnifiedIdeographs:
|
|||
|
|
Result := 'CJK Unified Ideographs';
|
|||
|
|
ubYiSyllables:
|
|||
|
|
Result := 'Yi Syllables';
|
|||
|
|
ubYiRadicals:
|
|||
|
|
Result := 'Yi Radicals';
|
|||
|
|
ubModifierToneLetters:
|
|||
|
|
Result := 'Modifier Tone Letters';
|
|||
|
|
ubSylotiNagri:
|
|||
|
|
Result := 'Syloti Nagri';
|
|||
|
|
ubHangulSyllables:
|
|||
|
|
Result := 'Hangul Syllables';
|
|||
|
|
ubHighSurrogates:
|
|||
|
|
Result := 'High Surrogates';
|
|||
|
|
ubHighPrivateUseSurrogates:
|
|||
|
|
Result := 'High Private Use Surrogates';
|
|||
|
|
ubLowSurrogates:
|
|||
|
|
Result := 'Low Surrogates';
|
|||
|
|
//ubPrivateUseArea:
|
|||
|
|
ubPrivateUse:
|
|||
|
|
Result := 'Private Use Area';
|
|||
|
|
ubCJKCompatibilityIdeographs:
|
|||
|
|
Result := 'CJK Compatibility Ideographs';
|
|||
|
|
ubAlphabeticPresentationForms:
|
|||
|
|
Result := 'Alphabetic Presentation Forms';
|
|||
|
|
ubArabicPresentationFormsA:
|
|||
|
|
Result := 'Arabic Presentation Forms-A';
|
|||
|
|
ubVariationSelectors:
|
|||
|
|
Result := 'Variation Selectors';
|
|||
|
|
ubVerticalForms:
|
|||
|
|
Result := 'Vertical Forms';
|
|||
|
|
ubCombiningHalfMarks:
|
|||
|
|
Result := 'Combining Half Marks';
|
|||
|
|
ubCJKCompatibilityForms:
|
|||
|
|
Result := 'CJK Compatibility Forms';
|
|||
|
|
ubSmallFormVariants:
|
|||
|
|
Result := 'Small Form Variants';
|
|||
|
|
ubArabicPresentationFormsB:
|
|||
|
|
Result := 'Arabic Presentation Forms-B';
|
|||
|
|
ubHalfwidthandFullwidthForms:
|
|||
|
|
Result := 'Halfwidth and Fullwidth Forms';
|
|||
|
|
ubSpecials:
|
|||
|
|
Result := 'Specials';
|
|||
|
|
ubLinearBSyllabary:
|
|||
|
|
Result := 'Linear B Syllabary';
|
|||
|
|
ubLinearBIdeograms:
|
|||
|
|
Result := 'Linear B Ideograms';
|
|||
|
|
ubAegeanNumbers:
|
|||
|
|
Result := 'Aegean Numbers';
|
|||
|
|
ubAncientGreekNumbers:
|
|||
|
|
Result := 'Ancient Greek Numbers';
|
|||
|
|
ubOldItalic:
|
|||
|
|
Result := 'Old Italic';
|
|||
|
|
ubGothic:
|
|||
|
|
Result := 'Gothic';
|
|||
|
|
ubUgaritic:
|
|||
|
|
Result := 'Ugaritic';
|
|||
|
|
ubOldPersian:
|
|||
|
|
Result := 'Old Persian';
|
|||
|
|
ubDeseret:
|
|||
|
|
Result := 'Deseret';
|
|||
|
|
ubShavian:
|
|||
|
|
Result := 'Shavian';
|
|||
|
|
ubOsmanya:
|
|||
|
|
Result := 'Osmanya';
|
|||
|
|
ubCypriotSyllabary:
|
|||
|
|
Result := 'Cypriot Syllabary';
|
|||
|
|
ubKharoshthi:
|
|||
|
|
Result := 'Kharoshthi';
|
|||
|
|
ubByzantineMusicalSymbols:
|
|||
|
|
Result := 'Byzantine Musical Symbols';
|
|||
|
|
ubMusicalSymbols:
|
|||
|
|
Result := 'Musical Symbols';
|
|||
|
|
ubAncientGreekMusicalNotation:
|
|||
|
|
Result := 'Ancient Greek Musical Notation';
|
|||
|
|
ubTaiXuanJingSymbols:
|
|||
|
|
Result := 'Tai Xuan Jing Symbols';
|
|||
|
|
ubMathematicalAlphanumericSymbols:
|
|||
|
|
Result := 'Mathematical Alphanumeric Symbols';
|
|||
|
|
ubCJKUnifiedIdeographsExtensionB:
|
|||
|
|
Result := 'CJK Unified Ideographs Extension B';
|
|||
|
|
ubCJKCompatibilityIdeographsSupplement:
|
|||
|
|
Result := 'CJK Compatibility Ideographs Supplement';
|
|||
|
|
ubTags:
|
|||
|
|
Result := 'Tags';
|
|||
|
|
ubVariationSelectorsSupplement:
|
|||
|
|
Result := 'Variation Selectors Supplement';
|
|||
|
|
ubSupplementaryPrivateUseAreaA:
|
|||
|
|
Result := 'Supplementary Private Use Area-A';
|
|||
|
|
ubSupplementaryPrivateUseAreaB:
|
|||
|
|
Result := 'Supplementary Private Use Area-B';
|
|||
|
|
else
|
|||
|
|
Result := 'Undefined';
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
// Returns an ID for the Unicode code block to which C belongs.
|
|||
|
|
// If C does not belong to any of the defined blocks then ubUndefined is returned.
|
|||
|
|
// Note: the code blocks listed here are based on Unicode Version 3.1.
|
|||
|
|
function CodeBlockFromChar(const C: UCS4): TUnicodeBlock;
|
|||
|
|
// http://www.unicode.org/Public/4.1.0/ucd/Blocks.txt
|
|||
|
|
begin
|
|||
|
|
case C of
|
|||
|
|
$0000..$007F:
|
|||
|
|
Result := ubBasicLatin;
|
|||
|
|
$0080..$00FF:
|
|||
|
|
Result := ubLatin1Supplement;
|
|||
|
|
$0100..$017F:
|
|||
|
|
Result := ubLatinExtendedA;
|
|||
|
|
$0180..$024F:
|
|||
|
|
Result := ubLatinExtendedB;
|
|||
|
|
$0250..$02AF:
|
|||
|
|
Result := ubIPAExtensions;
|
|||
|
|
$02B0..$02FF:
|
|||
|
|
Result := ubSpacingModifierLetters;
|
|||
|
|
$0300..$036F:
|
|||
|
|
Result := ubCombiningDiacriticalMarks;
|
|||
|
|
$0370..$03FF:
|
|||
|
|
Result := ubGreek; //ubGreekandCoptic;
|
|||
|
|
$0400..$04FF:
|
|||
|
|
Result := ubCyrillic;
|
|||
|
|
$0500..$052F:
|
|||
|
|
Result := ubCyrillicSupplement;
|
|||
|
|
$0530..$058F:
|
|||
|
|
Result := ubArmenian;
|
|||
|
|
$0590..$05FF:
|
|||
|
|
Result := ubHebrew;
|
|||
|
|
$0600..$06FF:
|
|||
|
|
Result := ubArabic;
|
|||
|
|
$0700..$074F:
|
|||
|
|
Result := ubSyriac;
|
|||
|
|
$0750..$077F:
|
|||
|
|
Result := ubArabicSupplement;
|
|||
|
|
$0780..$07BF:
|
|||
|
|
Result := ubThaana;
|
|||
|
|
$0900..$097F:
|
|||
|
|
Result := ubDevanagari;
|
|||
|
|
$0980..$09FF:
|
|||
|
|
Result := ubBengali;
|
|||
|
|
$0A00..$0A7F:
|
|||
|
|
Result := ubGurmukhi;
|
|||
|
|
$0A80..$0AFF:
|
|||
|
|
Result := ubGujarati;
|
|||
|
|
$0B00..$0B7F:
|
|||
|
|
Result := ubOriya;
|
|||
|
|
$0B80..$0BFF:
|
|||
|
|
Result := ubTamil;
|
|||
|
|
$0C00..$0C7F:
|
|||
|
|
Result := ubTelugu;
|
|||
|
|
$0C80..$0CFF:
|
|||
|
|
Result := ubKannada;
|
|||
|
|
$0D00..$0D7F:
|
|||
|
|
Result := ubMalayalam;
|
|||
|
|
$0D80..$0DFF:
|
|||
|
|
Result := ubSinhala;
|
|||
|
|
$0E00..$0E7F:
|
|||
|
|
Result := ubThai;
|
|||
|
|
$0E80..$0EFF:
|
|||
|
|
Result := ubLao;
|
|||
|
|
$0F00..$0FFF:
|
|||
|
|
Result := ubTibetan;
|
|||
|
|
$1000..$109F:
|
|||
|
|
Result := ubMyanmar;
|
|||
|
|
$10A0..$10FF:
|
|||
|
|
Result := ubGeorgian;
|
|||
|
|
$1100..$11FF:
|
|||
|
|
Result := ubHangulJamo;
|
|||
|
|
$1200..$137F:
|
|||
|
|
Result := ubEthiopic;
|
|||
|
|
$1380..$139F:
|
|||
|
|
Result := ubEthiopicSupplement;
|
|||
|
|
$13A0..$13FF:
|
|||
|
|
Result := ubCherokee;
|
|||
|
|
$1400..$167F:
|
|||
|
|
Result := ubUnifiedCanadianAboriginalSyllabics;
|
|||
|
|
$1680..$169F:
|
|||
|
|
Result := ubOgham;
|
|||
|
|
$16A0..$16FF:
|
|||
|
|
Result := ubRunic;
|
|||
|
|
$1700..$171F:
|
|||
|
|
Result := ubTagalog;
|
|||
|
|
$1720..$173F:
|
|||
|
|
Result := ubHanunoo;
|
|||
|
|
$1740..$175F:
|
|||
|
|
Result := ubBuhid;
|
|||
|
|
$1760..$177F:
|
|||
|
|
Result := ubTagbanwa;
|
|||
|
|
$1780..$17FF:
|
|||
|
|
Result := ubKhmer;
|
|||
|
|
$1800..$18AF:
|
|||
|
|
Result := ubMongolian;
|
|||
|
|
$1900..$194F:
|
|||
|
|
Result := ubLimbu;
|
|||
|
|
$1950..$197F:
|
|||
|
|
Result := ubTaiLe;
|
|||
|
|
$1980..$19DF:
|
|||
|
|
Result := ubNewTaiLue;
|
|||
|
|
$19E0..$19FF:
|
|||
|
|
Result := ubKhmerSymbols;
|
|||
|
|
$1A00..$1A1F:
|
|||
|
|
Result := ubBuginese;
|
|||
|
|
$1D00..$1D7F:
|
|||
|
|
Result := ubPhoneticExtensions;
|
|||
|
|
$1D80..$1DBF:
|
|||
|
|
Result := ubPhoneticExtensionsSupplement;
|
|||
|
|
$1DC0..$1DFF:
|
|||
|
|
Result := ubCombiningDiacriticalMarksSupplement;
|
|||
|
|
$1E00..$1EFF:
|
|||
|
|
Result := ubLatinExtendedAdditional;
|
|||
|
|
$1F00..$1FFF:
|
|||
|
|
Result := ubGreekExtended;
|
|||
|
|
$2000..$206F:
|
|||
|
|
Result := ubGeneralPunctuation;
|
|||
|
|
$2070..$209F:
|
|||
|
|
Result := ubSuperscriptsandSubscripts;
|
|||
|
|
$20A0..$20CF:
|
|||
|
|
Result := ubCurrencySymbols;
|
|||
|
|
$20D0..$20FF:
|
|||
|
|
Result := ubCombiningMarksforSymbols; //ubCombiningDiacriticalMarksforSymbols;
|
|||
|
|
$2100..$214F:
|
|||
|
|
Result := ubLetterlikeSymbols;
|
|||
|
|
$2150..$218F:
|
|||
|
|
Result := ubNumberForms;
|
|||
|
|
$2190..$21FF:
|
|||
|
|
Result := ubArrows;
|
|||
|
|
$2200..$22FF:
|
|||
|
|
Result := ubMathematicalOperators;
|
|||
|
|
$2300..$23FF:
|
|||
|
|
Result := ubMiscellaneousTechnical;
|
|||
|
|
$2400..$243F:
|
|||
|
|
Result := ubControlPictures;
|
|||
|
|
$2440..$245F:
|
|||
|
|
Result := ubOpticalCharacterRecognition;
|
|||
|
|
$2460..$24FF:
|
|||
|
|
Result := ubEnclosedAlphanumerics;
|
|||
|
|
$2500..$257F:
|
|||
|
|
Result := ubBoxDrawing;
|
|||
|
|
$2580..$259F:
|
|||
|
|
Result := ubBlockElements;
|
|||
|
|
$25A0..$25FF:
|
|||
|
|
Result := ubGeometricShapes;
|
|||
|
|
$2600..$26FF:
|
|||
|
|
Result := ubMiscellaneousSymbols;
|
|||
|
|
$2700..$27BF:
|
|||
|
|
Result := ubDingbats;
|
|||
|
|
$27C0..$27EF:
|
|||
|
|
Result := ubMiscellaneousMathematicalSymbolsA;
|
|||
|
|
$27F0..$27FF:
|
|||
|
|
Result := ubSupplementalArrowsA;
|
|||
|
|
$2800..$28FF:
|
|||
|
|
Result := ubBraillePatterns;
|
|||
|
|
$2900..$297F:
|
|||
|
|
Result := ubSupplementalArrowsB;
|
|||
|
|
$2980..$29FF:
|
|||
|
|
Result := ubMiscellaneousMathematicalSymbolsB;
|
|||
|
|
$2A00..$2AFF:
|
|||
|
|
Result := ubSupplementalMathematicalOperators;
|
|||
|
|
$2B00..$2BFF:
|
|||
|
|
Result := ubMiscellaneousSymbolsandArrows;
|
|||
|
|
$2C00..$2C5F:
|
|||
|
|
Result := ubGlagolitic;
|
|||
|
|
$2C80..$2CFF:
|
|||
|
|
Result := ubCoptic;
|
|||
|
|
$2D00..$2D2F:
|
|||
|
|
Result := ubGeorgianSupplement;
|
|||
|
|
$2D30..$2D7F:
|
|||
|
|
Result := ubTifinagh;
|
|||
|
|
$2D80..$2DDF:
|
|||
|
|
Result := ubEthiopicExtended;
|
|||
|
|
$2E00..$2E7F:
|
|||
|
|
Result := ubSupplementalPunctuation;
|
|||
|
|
$2E80..$2EFF:
|
|||
|
|
Result := ubCJKRadicalsSupplement;
|
|||
|
|
$2F00..$2FDF:
|
|||
|
|
Result := ubKangxiRadicals;
|
|||
|
|
$2FF0..$2FFF:
|
|||
|
|
Result := ubIdeographicDescriptionCharacters;
|
|||
|
|
$3000..$303F:
|
|||
|
|
Result := ubCJKSymbolsandPunctuation;
|
|||
|
|
$3040..$309F:
|
|||
|
|
Result := ubHiragana;
|
|||
|
|
$30A0..$30FF:
|
|||
|
|
Result := ubKatakana;
|
|||
|
|
$3100..$312F:
|
|||
|
|
Result := ubBopomofo;
|
|||
|
|
$3130..$318F:
|
|||
|
|
Result := ubHangulCompatibilityJamo;
|
|||
|
|
$3190..$319F:
|
|||
|
|
Result := ubKanbun;
|
|||
|
|
$31A0..$31BF:
|
|||
|
|
Result := ubBopomofoExtended;
|
|||
|
|
$31C0..$31EF:
|
|||
|
|
Result := ubCJKStrokes;
|
|||
|
|
$31F0..$31FF:
|
|||
|
|
Result := ubKatakanaPhoneticExtensions;
|
|||
|
|
$3200..$32FF:
|
|||
|
|
Result := ubEnclosedCJKLettersandMonths;
|
|||
|
|
$3300..$33FF:
|
|||
|
|
Result := ubCJKCompatibility;
|
|||
|
|
$3400..$4DBF:
|
|||
|
|
Result := ubCJKUnifiedIdeographsExtensionA;
|
|||
|
|
$4DC0..$4DFF:
|
|||
|
|
Result := ubYijingHexagramSymbols;
|
|||
|
|
$4E00..$9FFF:
|
|||
|
|
Result := ubCJKUnifiedIdeographs;
|
|||
|
|
$A000..$A48F:
|
|||
|
|
Result := ubYiSyllables;
|
|||
|
|
$A490..$A4CF:
|
|||
|
|
Result := ubYiRadicals;
|
|||
|
|
$A700..$A71F:
|
|||
|
|
Result := ubModifierToneLetters;
|
|||
|
|
$A800..$A82F:
|
|||
|
|
Result := ubSylotiNagri;
|
|||
|
|
$AC00..$D7AF:
|
|||
|
|
Result := ubHangulSyllables;
|
|||
|
|
$D800..$DB7F:
|
|||
|
|
Result := ubHighSurrogates;
|
|||
|
|
$DB80..$DBFF:
|
|||
|
|
Result := ubHighPrivateUseSurrogates;
|
|||
|
|
$DC00..$DFFF:
|
|||
|
|
Result := ubLowSurrogates;
|
|||
|
|
$E000..$F8FF:
|
|||
|
|
Result := ubPrivateUse; //ubPrivateUseArea;
|
|||
|
|
$F900..$FAFF:
|
|||
|
|
Result := ubCJKCompatibilityIdeographs;
|
|||
|
|
$FB00..$FB4F:
|
|||
|
|
Result := ubAlphabeticPresentationForms;
|
|||
|
|
$FB50..$FDFF:
|
|||
|
|
Result := ubArabicPresentationFormsA;
|
|||
|
|
$FE00..$FE0F:
|
|||
|
|
Result := ubVariationSelectors;
|
|||
|
|
$FE10..$FE1F:
|
|||
|
|
Result := ubVerticalForms;
|
|||
|
|
$FE20..$FE2F:
|
|||
|
|
Result := ubCombiningHalfMarks;
|
|||
|
|
$FE30..$FE4F:
|
|||
|
|
Result := ubCJKCompatibilityForms;
|
|||
|
|
$FE50..$FE6F:
|
|||
|
|
Result := ubSmallFormVariants;
|
|||
|
|
$FE70..$FEFF:
|
|||
|
|
Result := ubArabicPresentationFormsB;
|
|||
|
|
$FF00..$FFEF:
|
|||
|
|
Result := ubHalfwidthandFullwidthForms;
|
|||
|
|
$FFF0..$FFFF:
|
|||
|
|
Result := ubSpecials;
|
|||
|
|
$10000..$1007F:
|
|||
|
|
Result := ubLinearBSyllabary;
|
|||
|
|
$10080..$100FF:
|
|||
|
|
Result := ubLinearBIdeograms;
|
|||
|
|
$10100..$1013F:
|
|||
|
|
Result := ubAegeanNumbers;
|
|||
|
|
$10140..$1018F:
|
|||
|
|
Result := ubAncientGreekNumbers;
|
|||
|
|
$10300..$1032F:
|
|||
|
|
Result := ubOldItalic;
|
|||
|
|
$10330..$1034F:
|
|||
|
|
Result := ubGothic;
|
|||
|
|
$10380..$1039F:
|
|||
|
|
Result := ubUgaritic;
|
|||
|
|
$103A0..$103DF:
|
|||
|
|
Result := ubOldPersian;
|
|||
|
|
$10400..$1044F:
|
|||
|
|
Result := ubDeseret;
|
|||
|
|
$10450..$1047F:
|
|||
|
|
Result := ubShavian;
|
|||
|
|
$10480..$104AF:
|
|||
|
|
Result := ubOsmanya;
|
|||
|
|
$10800..$1083F:
|
|||
|
|
Result := ubCypriotSyllabary;
|
|||
|
|
$10A00..$10A5F:
|
|||
|
|
Result := ubKharoshthi;
|
|||
|
|
$1D000..$1D0FF:
|
|||
|
|
Result := ubByzantineMusicalSymbols;
|
|||
|
|
$1D100..$1D1FF:
|
|||
|
|
Result := ubMusicalSymbols;
|
|||
|
|
$1D200..$1D24F:
|
|||
|
|
Result := ubAncientGreekMusicalNotation;
|
|||
|
|
$1D300..$1D35F:
|
|||
|
|
Result := ubTaiXuanJingSymbols;
|
|||
|
|
$1D400..$1D7FF:
|
|||
|
|
Result := ubMathematicalAlphanumericSymbols;
|
|||
|
|
$20000..$2A6DF:
|
|||
|
|
Result := ubCJKUnifiedIdeographsExtensionB;
|
|||
|
|
$2F800..$2FA1F:
|
|||
|
|
Result := ubCJKCompatibilityIdeographsSupplement;
|
|||
|
|
$E0000..$E007F:
|
|||
|
|
Result := ubTags;
|
|||
|
|
$E0100..$E01EF:
|
|||
|
|
Result := ubVariationSelectorsSupplement;
|
|||
|
|
$F0000..$FFFFF:
|
|||
|
|
Result := ubSupplementaryPrivateUseAreaA;
|
|||
|
|
$100000..$10FFFF:
|
|||
|
|
Result := ubSupplementaryPrivateUseAreaB;
|
|||
|
|
else
|
|||
|
|
Result := ubUndefined;
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
|
|||
|
|
function CompareTextWin95(const W1, W2: WideString; Locale: LCID): Integer;
|
|||
|
|
// special comparation function for Win9x since there's no system defined
|
|||
|
|
// comparation function, returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2
|
|||
|
|
var
|
|||
|
|
S1, S2: string;
|
|||
|
|
CP: Integer;
|
|||
|
|
L1, L2: Integer;
|
|||
|
|
begin
|
|||
|
|
L1 := Length(W1);
|
|||
|
|
L2 := Length(W2);
|
|||
|
|
SetLength(S1, L1);
|
|||
|
|
SetLength(S2, L2);
|
|||
|
|
CP := CodePageFromLocale(Locale);
|
|||
|
|
WideCharToMultiByte(CP, 0, PWideChar(W1), L1, PChar(S1), L1, nil, nil);
|
|||
|
|
WideCharToMultiByte(CP, 0, PWideChar(W2), L2, PChar(S2), L2, nil, nil);
|
|||
|
|
Result := CompareStringA(Locale, NORM_IGNORECASE, PChar(S1), Length(S1),
|
|||
|
|
PChar(S2), Length(S2)) - 2;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function CompareTextWinNT(const W1, W2: WideString; Locale: LCID): Integer;
|
|||
|
|
// Wrapper function for WinNT since there's no system defined comparation function
|
|||
|
|
// in Win9x and we need a central comparation function for TWideStringList.
|
|||
|
|
// Returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2
|
|||
|
|
begin
|
|||
|
|
Result := CompareStringW(Locale, NORM_IGNORECASE, PWideChar(W1), Length(W1),
|
|||
|
|
PWideChar(W2), Length(W2)) - 2;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function StringToWideStringEx(const S: string; CodePage: Word): WideString;
|
|||
|
|
var
|
|||
|
|
InputLength,
|
|||
|
|
OutputLength: Integer;
|
|||
|
|
begin
|
|||
|
|
InputLength := Length(S);
|
|||
|
|
OutputLength := MultiByteToWideChar(CodePage, 0, PChar(S), InputLength, nil, 0);
|
|||
|
|
SetLength(Result, OutputLength);
|
|||
|
|
MultiByteToWideChar(CodePage, 0, PChar(S), InputLength, PWideChar(Result), OutputLength);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function WideStringToStringEx(const WS: WideString; CodePage: Word): string;
|
|||
|
|
var
|
|||
|
|
InputLength,
|
|||
|
|
OutputLength: Integer;
|
|||
|
|
begin
|
|||
|
|
InputLength := Length(WS);
|
|||
|
|
OutputLength := WideCharToMultiByte(CodePage, 0, PWideChar(WS), InputLength, nil, 0, nil, nil);
|
|||
|
|
SetLength(Result, OutputLength);
|
|||
|
|
WideCharToMultiByte(CodePage, 0, PWideChar(WS), InputLength, PChar(Result), OutputLength, nil, nil);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function TranslateString(const S: string; CP1, CP2: Word): string;
|
|||
|
|
begin
|
|||
|
|
Result:= WideStringToStringEx(StringToWideStringEx(S, CP1), CP2);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
//----------------- conversion routines ------------------------------------------------------------
|
|||
|
|
|
|||
|
|
// Converts the given source ANSI string into a Unicode string by expanding each character
|
|||
|
|
// from one byte to two bytes.
|
|||
|
|
// EAX contains Source, EDX contains Target, ECX contains Count
|
|||
|
|
|
|||
|
|
procedure ExpandANSIString(const Source: PChar; Target: PWideChar; Count: Cardinal);
|
|||
|
|
asm
|
|||
|
|
JECXZ @@Finish // go out if there is nothing to do
|
|||
|
|
PUSH ESI
|
|||
|
|
MOV ESI, EAX
|
|||
|
|
XOR EAX, EAX
|
|||
|
|
@@1:
|
|||
|
|
MOV AL, [ESI]
|
|||
|
|
INC ESI
|
|||
|
|
MOV [EDX], AX
|
|||
|
|
ADD EDX, 2
|
|||
|
|
DEC ECX
|
|||
|
|
JNZ @@1
|
|||
|
|
POP ESI
|
|||
|
|
@@Finish:
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
const
|
|||
|
|
HalfShift: Integer = 10;
|
|||
|
|
|
|||
|
|
HalfBase: UCS4 = $0010000;
|
|||
|
|
HalfMask: UCS4 = $3FF;
|
|||
|
|
|
|||
|
|
OffsetsFromUTF8: array [0..5] of UCS4 =
|
|||
|
|
($00000000, $00003080, $000E2080,
|
|||
|
|
$03C82080, $FA082080, $82082080);
|
|||
|
|
|
|||
|
|
BytesFromUTF8: array [0..255] of Byte =
|
|||
|
|
(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|||
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|||
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|||
|
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5);
|
|||
|
|
|
|||
|
|
FirstByteMark: array [0..6] of Byte =
|
|||
|
|
($00, $00, $C0, $E0, $F0, $F8, $FC);
|
|||
|
|
|
|||
|
|
function WideStringToUTF8(S: WideString): AnsiString;
|
|||
|
|
var
|
|||
|
|
Ch: UCS4;
|
|||
|
|
L, J, T,
|
|||
|
|
BytesToWrite: Cardinal;
|
|||
|
|
ByteMask: UCS4;
|
|||
|
|
ByteMark: UCS4;
|
|||
|
|
begin
|
|||
|
|
if S = '' then
|
|||
|
|
Result := ''
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
SetLength(Result, Length(S) * 6); // assume worst case
|
|||
|
|
T := 1;
|
|||
|
|
ByteMask := $BF;
|
|||
|
|
ByteMark := $80;
|
|||
|
|
|
|||
|
|
for J := 1 to Length(S) do
|
|||
|
|
begin
|
|||
|
|
Ch := UCS4(S[J]);
|
|||
|
|
|
|||
|
|
if Ch < $80 then
|
|||
|
|
BytesToWrite := 1
|
|||
|
|
else
|
|||
|
|
if Ch < $800 then
|
|||
|
|
BytesToWrite := 2
|
|||
|
|
else
|
|||
|
|
if Ch < $10000 then
|
|||
|
|
BytesToWrite := 3
|
|||
|
|
else
|
|||
|
|
if Ch < $200000 then
|
|||
|
|
BytesToWrite := 4
|
|||
|
|
else
|
|||
|
|
if Ch < $4000000 then
|
|||
|
|
BytesToWrite := 5
|
|||
|
|
else
|
|||
|
|
if Ch <= MaximumUCS4 then
|
|||
|
|
BytesToWrite := 6
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
BytesToWrite := 2;
|
|||
|
|
Ch := ReplacementCharacter;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
for L := BytesToWrite downto 2 do
|
|||
|
|
begin
|
|||
|
|
Result[T + L - 1] := Char((Ch or ByteMark) and ByteMask);
|
|||
|
|
Ch := Ch shr 6;
|
|||
|
|
end;
|
|||
|
|
Result[T] := Char(Ch or FirstByteMark[BytesToWrite]);
|
|||
|
|
Inc(T, BytesToWrite);
|
|||
|
|
end;
|
|||
|
|
SetLength(Result, T - 1); // set to actual length
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
function UTF8ToWideString(S: AnsiString): WideString;
|
|||
|
|
var
|
|||
|
|
L, J, T: Cardinal;
|
|||
|
|
Ch: UCS4;
|
|||
|
|
ExtraBytesToWrite: Word;
|
|||
|
|
begin
|
|||
|
|
if S = '' then
|
|||
|
|
Result := ''
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
SetLength(Result, Length(S)); // create enough room
|
|||
|
|
|
|||
|
|
L := 1;
|
|||
|
|
T := 1;
|
|||
|
|
while L <= Cardinal(Length(S)) do
|
|||
|
|
begin
|
|||
|
|
Ch := 0;
|
|||
|
|
ExtraBytesToWrite := BytesFromUTF8[Ord(S[L])];
|
|||
|
|
|
|||
|
|
for J := ExtraBytesToWrite downto 1 do
|
|||
|
|
begin
|
|||
|
|
Ch := Ch + Ord(S[L]);
|
|||
|
|
Inc(L);
|
|||
|
|
Ch := Ch shl 6;
|
|||
|
|
end;
|
|||
|
|
Ch := Ch + Ord(S[L]);
|
|||
|
|
Inc(L);
|
|||
|
|
Ch := Ch - OffsetsFromUTF8[ExtraBytesToWrite];
|
|||
|
|
|
|||
|
|
if Ch <= MaximumUCS2 then
|
|||
|
|
begin
|
|||
|
|
Result[T] := WideChar(Ch);
|
|||
|
|
Inc(T);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
if Ch > MaximumUCS4 then
|
|||
|
|
begin
|
|||
|
|
Result[T] := WideChar(ReplacementCharacter);
|
|||
|
|
Inc(T);
|
|||
|
|
end
|
|||
|
|
else
|
|||
|
|
begin
|
|||
|
|
Ch := Ch - HalfBase;
|
|||
|
|
Result[T] := WideChar((Ch shr HalfShift) + SurrogateHighStart);
|
|||
|
|
Inc(T);
|
|||
|
|
Result[T] := WideChar((Ch and HalfMask) + SurrogateLowStart);
|
|||
|
|
Inc(T);
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
SetLength(Result, T - 1); // now fix up length
|
|||
|
|
end;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure PrepareUnicodeData;
|
|||
|
|
// Prepares structures which are globally needed.
|
|||
|
|
begin
|
|||
|
|
LoadInProgress := TJclCriticalSection.Create;
|
|||
|
|
|
|||
|
|
if (Win32Platform and VER_PLATFORM_WIN32_NT) <> 0 then
|
|||
|
|
@WideCompareText := @CompareTextWinNT
|
|||
|
|
else
|
|||
|
|
@WideCompareText := @CompareTextWin95;
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
procedure FreeUnicodeData;
|
|||
|
|
// Frees all data which has been allocated and which is not automatically freed by Delphi.
|
|||
|
|
begin
|
|||
|
|
FreeAndNil(LoadInProgress);
|
|||
|
|
end;
|
|||
|
|
|
|||
|
|
initialization
|
|||
|
|
PrepareUnicodeData;
|
|||
|
|
|
|||
|
|
finalization
|
|||
|
|
FreeUnicodeData;
|
|||
|
|
|
|||
|
|
{$ENDIF SUPPORTS_WIDESTRING}
|
|||
|
|
|
|||
|
|
// History:
|
|||
|
|
|
|||
|
|
// $Log: JclUnicode.pas,v $
|
|||
|
|
// Revision 1.31 2005/10/26 09:15:13 marquardt
|
|||
|
|
// most functions now have the same const parameters as their Ansi counterparts
|
|||
|
|
//
|
|||
|
|
// Revision 1.30 2005/10/26 08:36:29 marquardt
|
|||
|
|
// StrPCopyWW and StrPLCopyWW introduced to solve overloaded problem
|
|||
|
|
//
|
|||
|
|
// Revision 1.29 2005/10/25 18:20:10 outchy
|
|||
|
|
// IT3174: UTF8-file support in JclUnicode.pas
|
|||
|
|
//
|
|||
|
|
// Revision 1.28 2005/10/25 16:27:36 marquardt
|
|||
|
|
// StrPCopyW and StrPLCopyW overloaded versions deactivated because of Delphi5 compiler problems
|
|||
|
|
//
|
|||
|
|
// Revision 1.27 2005/10/25 10:33:40 marquardt
|
|||
|
|
// made StrPCopyW and StrPLCopyW compatible with the original Unicode.pas by adding overloaded versions
|
|||
|
|
//
|
|||
|
|
// Revision 1.26 2005/10/25 09:47:04 marquardt
|
|||
|
|
// minor fixes and cleanups
|
|||
|
|
//
|
|||
|
|
// Revision 1.25 2005/10/25 08:54:57 marquardt
|
|||
|
|
// make a union of the Str*W family of functions in JclUnicode and JclWideStrings
|
|||
|
|
//
|
|||
|
|
// Revision 1.24 2005/10/16 05:16:51 marquardt
|
|||
|
|
// TWideStrings now has GetText and GetTextStr like TStrings
|
|||
|
|
//
|
|||
|
|
// Revision 1.23 2005/07/19 21:28:26 outchy
|
|||
|
|
// IT 3066: JclUnicode.pas updated to Unicode 4.1
|
|||
|
|
//
|
|||
|
|
// Revision 1.22 2005/03/08 08:33:23 marquardt
|
|||
|
|
// overhaul of exceptions and resourcestrings, minor style cleaning
|
|||
|
|
//
|
|||
|
|
// Revision 1.21 2005/03/01 15:37:40 marquardt
|
|||
|
|
// addressing Mantis 0714, 0716, 0720, 0731, 0740 partly or completely
|
|||
|
|
//
|
|||
|
|
// Revision 1.20 2005/02/24 16:34:53 marquardt
|
|||
|
|
// remove divider lines, add section lines (unfinished)
|
|||
|
|
//
|
|||
|
|
// Revision 1.19 2005/02/24 07:36:25 marquardt
|
|||
|
|
// resolved the compiler warnings, style cleanup, removed code from JclContainerIntf.pas
|
|||
|
|
//
|
|||
|
|
// Revision 1.18 2005/02/14 03:20:59 rrossmair
|
|||
|
|
// - fixed issues #0000713 ( make CompareTextWin95/NT functions use const string parameters) and #0001909 (JclUnicode.CharSetFromLocale - result ignored)
|
|||
|
|
//
|
|||
|
|
// Revision 1.17 2004/11/22 19:17:18 ahuser
|
|||
|
|
// Fixed memory leak
|
|||
|
|
// Style cleaning
|
|||
|
|
//
|
|||
|
|
// Revision 1.16 2004/10/17 21:00:16 mthoma
|
|||
|
|
// cleaning
|
|||
|
|
//
|
|||
|
|
// Revision 1.15 2004/08/01 11:40:24 marquardt
|
|||
|
|
// move constructors/destructors
|
|||
|
|
//
|
|||
|
|
// Revision 1.14 2004/07/31 06:21:03 marquardt
|
|||
|
|
// fixing TStringLists, adding BeginUpdate/EndUpdate, finalization improved
|
|||
|
|
//
|
|||
|
|
// Revision 1.13 2004/07/28 18:00:54 marquardt
|
|||
|
|
// various style cleanings, some minor fixes
|
|||
|
|
//
|
|||
|
|
// Revision 1.12 2004/06/16 07:30:31 marquardt
|
|||
|
|
// added tilde to all IFNDEF ENDIFs, inherited qualified
|
|||
|
|
//
|
|||
|
|
// Revision 1.11 2004/06/14 13:05:22 marquardt
|
|||
|
|
// style cleaning ENDIF, Tabs
|
|||
|
|
//
|
|||
|
|
// Revision 1.10 2004/06/14 11:05:53 marquardt
|
|||
|
|
// symbols added to all ENDIFs and some other minor style changes like removing IFOPT
|
|||
|
|
//
|
|||
|
|
// Revision 1.9 2004/05/05 07:33:49 rrossmair
|
|||
|
|
// header updated according to new policy: initial developers & contributors listed
|
|||
|
|
//
|
|||
|
|
// Revision 1.8 2004/04/06 04:55:18
|
|||
|
|
// adapt compiler conditions, add log entry
|
|||
|
|
//
|
|||
|
|
|
|||
|
|
end.
|