Componentes.Terceros.jcl/official/2.1.1/source/common/JclUnicode.pas
2010-01-18 16:51:36 +00:00

6877 lines
218 KiB
ObjectPascal

{**************************************************************************************************}
{ }
{ Project JEDI Code Library (JCL) }
{ }
{ The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
{ you may not use this file except in compliance with the License. You may obtain a copy of the }
{ License at http://www.mozilla.org/MPL/ }
{ }
{ Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
{ ANY KIND, either express or implied. See the License for the specific language governing rights }
{ and limitations under the License. }
{ }
{ The Original Code is JclUnicode.pas. }
{ }
{ The Initial Developer of the Original Code is Mike Lischke (public att lischke-online dott de). }
{ Portions created by Mike Lischke are Copyright (C) 1999-2000 Mike Lischke. All Rights Reserved. }
{ }
{ Contributor(s): }
{ Marcel van Brakel }
{ Andreas Hausladen (ahuser) }
{ Mike Lischke }
{ Flier Lu (flier) }
{ Robert Marquardt (marquardt) }
{ Robert Rossmair (rrossmair) }
{ Olivier Sannier (obones) }
{ Matthias Thoma (mthoma) }
{ Petr Vones (pvones) }
{ Peter Schraut (http://www.console-dev.de) }
{ Florent Ouchet (outchy) }
{ glchapman }
{ }
{**************************************************************************************************}
{ }
{ Various Unicode related routines }
{ }
{**************************************************************************************************}
{ }
{ Last modified: $Date:: 2009-08-09 19:06:40 +0200 (dim., 09 août 2009) $ }
{ Revision: $Rev:: 2930 $ }
{ Author: $Author:: outchy $ }
{ }
{**************************************************************************************************}
unit JclUnicode;
{$I jcl.inc}
// Copyright (c) 1999-2000 Mike Lischke (public att lischke-online dott de)
//
// 10-JUL-2005: (changes by Peter Schraut)
// - added CodeBlockName, returns the blockname as string
// - added CodeBlockRange, returns the range of the specified codeblock
// - updated TUnicodeBlock to reflect changes in unicode 4.1
// - updated CodeBlockFromChar to reflect changes in unicode 4.1
// - Notes:
// Here are a few suggestions to reflect latest namechanges in unicode 4.1,
// but they were not done due to compatibility with old code:
// ubGreek should be renamed to ubGreekandCoptic
// ubCombiningMarksforSymbols should be renamed to ubCombiningDiacriticalMarksforSymbols
// ubPrivateUse should be renamed to ubPrivateUseArea
//
//
// 19-SEP-2003: (changes by Andreas Hausladen)
// - added OWN_WIDESTRING_MEMMGR for faster memory managment in TWideStringList
// under Windows
// - fixed: TWideStringList.Destroy does not set OnChange and OnChanging to nil before calling Clear
//
//
// 29-MAR-2002: MT
// - WideNormalize now returns strings with normalization mode nfNone unchanged.
// - Bug fix in WideCompose: Raised exception when Result of WideComposeHangul was an
// empty string. (#0000044)
// - Bug fix in WideAdjustLineBreaks
// - Added Asserts were needed.
// - TWideStrings.IndexOfName now takes care of NormalizeForm as well.
// - TWideStrings.IndexOf now takes care of NormalizeForm as well.
// - TWideString.List Find now uses the same NormalizationForm for the search string as it uses
// within the list itself.
//
// 29-NOV-2001:
// - bug fix
// 06-JUN-2001:
// - small changes
// 28-APR-2001:
// - bug fixes
// 05-APR-2001:
// - bug fixes
// 23-MAR-2001:
// - WideSameText
// - small changes
// 10-FEB-2001:
// - bug fix in StringToWideStringEx and WideStringToStringEx
// 05-FEB-2001:
// - TWideStrings.GetSeparatedText changed (no separator anymore after the last line)
// 29-JAN-2001:
// - PrepareUnicodeData
// - LoadInProgress critical section is now created at init time to avoid critical thread races
// - bug fixes
// 26-JAN-2001:
// - ExpandANSIString
// - TWideStrings.SaveUnicode is by default True now
// 20..21-JAN-2001:
// - StrUpperW, StrLowerW and StrTitleW removed because they potentially would need
// a reallocation to work correctly (use the WideString versions instead)
// - further improvements related to internal data
// - introduced TUnicodeBlock
// - CodeBlockFromChar improved
// 07-JAN-2001:
// optimized access to character properties, combining class etc.
// 06-JAN-2001:
// TWideStrings and TWideStringList improved
// APR-DEC 2000: versions 2.1 - 2.6
// - preparation for public rlease
// - additional conversion routines
// - JCL compliance
// - character properties unified
// - character properties data and lookup improvements
// - reworked Unicode data resource file
// - improved simple string comparation routines (StrCompW, StrLCompW etc., include surrogate fix)
// - special case folding data for language neutral case insensitive comparations included
// - optimized decomposition
// - composition and normalization support
// - normalization conformance tests applied
// - bug fixes
// FEB-MAR 2000: version 2.0
// - Unicode regular expressions (URE) search class (TURESearch)
// - generic search engine base class for both the Boyer-Moore and the RE search class
// - whole word only search in UTBM, bug fixes in UTBM
// - string decompositon (including hangul)
// OCT/99 - JAN/2000: version 1.0
// - basic Unicode implementation, more than 100 WideString/UCS2 and UCS4 core functions
// - TWideStrings and TWideStringList classes
// - Unicode Tuned Boyer-Moore search class (TUTBMSearch)
// - low and high level Unicode/Wide* functions
// - low level Unicode UCS4 data import and functions
// - helper functions
//
// Version 2.9
// This unit contains routines and classes to manage and work with Unicode/WideString strings.
// You need Delphi 4 or higher to compile this code.
//
// Publicly available low level functions are all preceded by "Unicode..." (e.g.
// in UnicodeToUpper) while the high level functions use the Str... or Wide...
// naming scheme (e.g. StrLICompW and WideUpperCase).
//
// The normalization implementation in this unit has successfully and completely passed the
// official normative conformance testing as of Annex 9 in Technical Report #15
// (Unicode Standard Annex #15, http://www.unicode.org/unicode/reports/tr15, from 2000-08-31).
//
// Open issues:
// - Yet to do things in the URE class are:
// - check all character classes if they match correctly
// - optimize rebuild of DFA (build only when pattern changes)
// - set flag parameter of ExecuteURE
// - add \d any decimal digit
// \D any character that is not a decimal digit
// \s any whitespace character
// \S any character that is not a whitespace character
// \w any "word" character
// \W any "non-word" character
// - The wide string classes still compare text with functions provided by the
// particular system. This works usually fine under WinNT/W2K (although also
// there are limitations like maximum text lengths). Under Win9x conversions
// from and to MBCS are necessary which are bound to a particular locale and
// so very limited in general use. These comparisons should be changed so that
// the code in this unit is used.
interface
uses
{$IFDEF UNITVERSIONING}
JclUnitVersioning,
{$ENDIF UNITVERSIONING}
{$IFDEF MSWINDOWS}
Windows,
{$ENDIF MSWINDOWS}
SysUtils,
Classes,
JclBase;
{$IFNDEF FPC}
{$IFDEF MSWINDOWS}
{$DEFINE OWN_WIDESTRING_MEMMGR}
{$ENDIF MSWINDOWS}
{$ENDIF ~FPC}
const
// definitions of often used characters:
// Note: Use them only for tests of a certain character not to determine character
// classes (like white spaces) as in Unicode are often many code points defined
// being in a certain class. Hence your best option is to use the various
// UnicodeIs* functions.
WideNull = WideChar(#0);
WideTabulator = WideChar(#9);
WideSpace = WideChar(#32);
// logical line breaks
WideLF = WideChar(#10);
WideLineFeed = WideChar(#10);
WideVerticalTab = WideChar(#11);
WideFormFeed = WideChar(#12);
WideCR = WideChar(#13);
WideCarriageReturn = WideChar(#13);
WideCRLF = WideString(#13#10);
WideLineSeparator = WideChar($2028);
WideParagraphSeparator = WideChar($2029);
// byte order marks for Unicode files
// Unicode text files (in UTF-16 format) should contain $FFFE as first character to
// identify such a file clearly. Depending on the system where the file was created
// on this appears either in big endian or little endian style.
BOM_LSB_FIRST = WideChar($FEFF);
BOM_MSB_FIRST = WideChar($FFFE);
type
TSaveFormat = ( sfUTF16LSB, sfUTF16MSB, sfUTF8, sfAnsi );
const
sfUnicodeLSB = sfUTF16LSB;
sfUnicodeMSB = sfUTF16MSB;
type
// various predefined or otherwise useful character property categories
TCharacterCategory = (
// normative categories
ccLetterUppercase,
ccLetterLowercase,
ccLetterTitlecase,
ccMarkNonSpacing,
ccMarkSpacingCombining,
ccMarkEnclosing,
ccNumberDecimalDigit,
ccNumberLetter,
ccNumberOther,
ccSeparatorSpace,
ccSeparatorLine,
ccSeparatorParagraph,
ccOtherControl,
ccOtherFormat,
ccOtherSurrogate,
ccOtherPrivate,
ccOtherUnassigned,
// informative categories
ccLetterModifier,
ccLetterOther,
ccPunctuationConnector,
ccPunctuationDash,
ccPunctuationOpen,
ccPunctuationClose,
ccPunctuationInitialQuote,
ccPunctuationFinalQuote,
ccPunctuationOther,
ccSymbolMath,
ccSymbolCurrency,
ccSymbolModifier,
ccSymbolOther,
// bidirectional categories
ccLeftToRight,
ccLeftToRightEmbedding,
ccLeftToRightOverride,
ccRightToLeft,
ccRightToLeftArabic,
ccRightToLeftEmbedding,
ccRightToLeftoverride,
ccPopDirectionalFormat,
ccEuropeanNumber,
ccEuropeanNumberSeparator,
ccEuropeanNumberTerminator,
ccArabicNumber,
ccCommonNumberSeparator,
ccBoundaryNeutral,
ccSegmentSeparator, // this includes tab and vertical tab
ccWhiteSpace,
ccOtherNeutrals,
// self defined categories, they do not appear in the Unicode data file
ccComposed, // can be decomposed
ccNonBreaking,
ccSymmetric, // has left and right forms
ccHexDigit,
ccQuotationMark,
ccMirroring,
ccSpaceOther,
ccAssigned // means there is a definition in the Unicode standard
);
TCharacterCategories = set of TCharacterCategory;
// four forms of normalization are defined:
TNormalizationForm = (
nfNone, // do not normalize
nfC, // canonical decomposition followed by canonical composition (this is most often used)
nfD, // canonical decomposition
nfKC, // compatibility decomposition followed by a canonical composition
nfKD // compatibility decomposition
);
// used to hold information about the start and end
// position of a unicodeblock.
TUnicodeBlockRange = record
RangeStart,
RangeEnd: Cardinal;
end;
// An Unicode block usually corresponds to a particular language script but
// can also represent special characters, musical symbols and the like.
// http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt
TUnicodeBlock = (
ubUndefined,
ubBasicLatin,
ubLatin1Supplement,
ubLatinExtendedA,
ubLatinExtendedB,
ubIPAExtensions,
ubSpacingModifierLetters,
ubCombiningDiacriticalMarks,
ubGreekandCoptic,
ubCyrillic,
ubCyrillicSupplement,
ubArmenian,
ubHebrew,
ubArabic,
ubSyriac,
ubArabicSupplement,
ubThaana,
ubNKo,
ubDevanagari,
ubBengali,
ubGurmukhi,
ubGujarati,
ubOriya,
ubTamil,
ubTelugu,
ubKannada,
ubMalayalam,
ubSinhala,
ubThai,
ubLao,
ubTibetan,
ubMyanmar,
ubGeorgian,
ubHangulJamo,
ubEthiopic,
ubEthiopicSupplement,
ubCherokee,
ubUnifiedCanadianAboriginalSyllabics,
ubOgham,
ubRunic,
ubTagalog,
ubHanunoo,
ubBuhid,
ubTagbanwa,
ubKhmer,
ubMongolian,
ubLimbu,
ubTaiLe,
ubNewTaiLue,
ubKhmerSymbols,
ubBuginese,
ubBalinese,
ubPhoneticExtensions,
ubPhoneticExtensionsSupplement,
ubCombiningDiacriticalMarksSupplement,
ubLatinExtendedAdditional,
ubGreekExtended,
ubGeneralPunctuation,
ubSuperscriptsandSubscripts,
ubCurrencySymbols,
ubCombiningDiacriticalMarksforSymbols,
ubLetterlikeSymbols,
ubNumberForms,
ubArrows,
ubMathematicalOperators,
ubMiscellaneousTechnical,
ubControlPictures,
ubOpticalCharacterRecognition,
ubEnclosedAlphanumerics,
ubBoxDrawing,
ubBlockElements,
ubGeometricShapes,
ubMiscellaneousSymbols,
ubDingbats,
ubMiscellaneousMathematicalSymbolsA,
ubSupplementalArrowsA,
ubBraillePatterns,
ubSupplementalArrowsB,
ubMiscellaneousMathematicalSymbolsB,
ubSupplementalMathematicalOperators,
ubMiscellaneousSymbolsandArrows,
ubGlagolitic,
ubLatinExtendedC,
ubCoptic,
ubGeorgianSupplement,
ubTifinagh,
ubEthiopicExtended,
ubSupplementalPunctuation,
ubCJKRadicalsSupplement,
ubKangxiRadicals,
ubIdeographicDescriptionCharacters,
ubCJKSymbolsandPunctuation,
ubHiragana,
ubKatakana,
ubBopomofo,
ubHangulCompatibilityJamo,
ubKanbun,
ubBopomofoExtended,
ubCJKStrokes,
ubKatakanaPhoneticExtensions,
ubEnclosedCJKLettersandMonths,
ubCJKCompatibility,
ubCJKUnifiedIdeographsExtensionA,
ubYijingHexagramSymbols,
ubCJKUnifiedIdeographs,
ubYiSyllables,
ubYiRadicals,
ubModifierToneLetters,
ubLatinExtendedD,
ubSylotiNagri,
ubPhagsPa,
ubHangulSyllables,
ubHighSurrogates,
ubHighPrivateUseSurrogates,
ubLowSurrogates,
ubPrivateUseArea,
ubCJKCompatibilityIdeographs,
ubAlphabeticPresentationForms,
ubArabicPresentationFormsA,
ubVariationSelectors,
ubVerticalForms,
ubCombiningHalfMarks,
ubCJKCompatibilityForms,
ubSmallFormVariants,
ubArabicPresentationFormsB,
ubHalfwidthandFullwidthForms,
ubSpecials,
ubLinearBSyllabary,
ubLinearBIdeograms,
ubAegeanNumbers,
ubAncientGreekNumbers,
ubOldItalic,
ubGothic,
ubUgaritic,
ubOldPersian,
ubDeseret,
ubShavian,
ubOsmanya,
ubCypriotSyllabary,
ubPhoenician,
ubKharoshthi,
ubCuneiform,
ubCuneiformNumbersAndPunctuation,
ubByzantineMusicalSymbols,
ubMusicalSymbols,
ubAncientGreekMusicalNotation,
ubTaiXuanJingSymbols,
ubCountingRodNumerals,
ubMathematicalAlphanumericSymbols,
ubCJKUnifiedIdeographsExtensionB,
ubCJKCompatibilityIdeographsSupplement,
ubTags,
ubVariationSelectorsSupplement,
ubSupplementaryPrivateUseAreaA,
ubSupplementaryPrivateUseAreaB
);
TUnicodeBlockData = record
Range: TUnicodeBlockRange;
Name: string;
end;
PUnicodeBlockData = ^TUnicodeBlockData;
const
UnicodeBlockData: array [TUnicodeBlock] of TUnicodeBlockData =
((Range:(RangeStart: $FFFFFFFF; RangeEnd: $0000); Name: 'No-block'),
(Range:(RangeStart: $0000; RangeEnd: $007F); Name: 'Basic Latin'),
(Range:(RangeStart: $0080; RangeEnd: $00FF); Name: 'Latin-1 Supplement'),
(Range:(RangeStart: $0100; RangeEnd: $017F); Name: 'Latin Extended-A'),
(Range:(RangeStart: $0180; RangeEnd: $024F); Name: 'Latin Extended-B'),
(Range:(RangeStart: $0250; RangeEnd: $02AF); Name: 'IPA Extensions'),
(Range:(RangeStart: $02B0; RangeEnd: $02FF); Name: 'Spacing Modifier Letters'),
(Range:(RangeStart: $0300; RangeEnd: $036F); Name: 'Combining Diacritical Marks'),
(Range:(RangeStart: $0370; RangeEnd: $03FF); Name: 'Greek and Coptic'),
(Range:(RangeStart: $0400; RangeEnd: $04FF); Name: 'Cyrillic'),
(Range:(RangeStart: $0500; RangeEnd: $052F); Name: 'Cyrillic Supplement'),
(Range:(RangeStart: $0530; RangeEnd: $058F); Name: 'Armenian'),
(Range:(RangeStart: $0590; RangeEnd: $05FF); Name: 'Hebrew'),
(Range:(RangeStart: $0600; RangeEnd: $06FF); Name: 'Arabic'),
(Range:(RangeStart: $0700; RangeEnd: $074F); Name: 'Syriac'),
(Range:(RangeStart: $0750; RangeEnd: $077F); Name: 'Arabic Supplement'),
(Range:(RangeStart: $0780; RangeEnd: $07BF); Name: 'Thaana'),
(Range:(RangeStart: $07C0; RangeEnd: $07FF); Name: 'NKo'),
(Range:(RangeStart: $0900; RangeEnd: $097F); Name: 'Devanagari'),
(Range:(RangeStart: $0980; RangeEnd: $09FF); Name: 'Bengali'),
(Range:(RangeStart: $0A00; RangeEnd: $0A7F); Name: 'Gurmukhi'),
(Range:(RangeStart: $0A80; RangeEnd: $0AFF); Name: 'Gujarati'),
(Range:(RangeStart: $0B00; RangeEnd: $0B7F); Name: 'Oriya'),
(Range:(RangeStart: $0B80; RangeEnd: $0BFF); Name: 'Tamil'),
(Range:(RangeStart: $0C00; RangeEnd: $0C7F); Name: 'Telugu'),
(Range:(RangeStart: $0C80; RangeEnd: $0CFF); Name: 'Kannada'),
(Range:(RangeStart: $0D00; RangeEnd: $0D7F); Name: 'Malayalam'),
(Range:(RangeStart: $0D80; RangeEnd: $0DFF); Name: 'Sinhala'),
(Range:(RangeStart: $0E00; RangeEnd: $0E7F); Name: 'Thai'),
(Range:(RangeStart: $0E80; RangeEnd: $0EFF); Name: 'Lao'),
(Range:(RangeStart: $0F00; RangeEnd: $0FFF); Name: 'Tibetan'),
(Range:(RangeStart: $1000; RangeEnd: $109F); Name: 'Myanmar'),
(Range:(RangeStart: $10A0; RangeEnd: $10FF); Name: 'Georgian'),
(Range:(RangeStart: $1100; RangeEnd: $11FF); Name: 'Hangul Jamo'),
(Range:(RangeStart: $1200; RangeEnd: $137F); Name: 'Ethiopic'),
(Range:(RangeStart: $1380; RangeEnd: $139F); Name: 'Ethiopic Supplement'),
(Range:(RangeStart: $13A0; RangeEnd: $13FF); Name: 'Cherokee'),
(Range:(RangeStart: $1400; RangeEnd: $167F); Name: 'Unified Canadian Aboriginal Syllabics'),
(Range:(RangeStart: $1680; RangeEnd: $169F); Name: 'Ogham'),
(Range:(RangeStart: $16A0; RangeEnd: $16FF); Name: 'Runic'),
(Range:(RangeStart: $1700; RangeEnd: $171F); Name: 'Tagalog'),
(Range:(RangeStart: $1720; RangeEnd: $173F); Name: 'Hanunoo'),
(Range:(RangeStart: $1740; RangeEnd: $175F); Name: 'Buhid'),
(Range:(RangeStart: $1760; RangeEnd: $177F); Name: 'Tagbanwa'),
(Range:(RangeStart: $1780; RangeEnd: $17FF); Name: 'Khmer'),
(Range:(RangeStart: $1800; RangeEnd: $18AF); Name: 'Mongolian'),
(Range:(RangeStart: $1900; RangeEnd: $194F); Name: 'Limbu'),
(Range:(RangeStart: $1950; RangeEnd: $197F); Name: 'Tai Le'),
(Range:(RangeStart: $1980; RangeEnd: $19DF); Name: 'New Tai Lue'),
(Range:(RangeStart: $19E0; RangeEnd: $19FF); Name: 'Khmer Symbols'),
(Range:(RangeStart: $1A00; RangeEnd: $1A1F); Name: 'Buginese'),
(Range:(RangeStart: $1B00; RangeEnd: $1B7F); Name: 'Balinese'),
(Range:(RangeStart: $1D00; RangeEnd: $1D7F); Name: 'Phonetic Extensions'),
(Range:(RangeStart: $1D80; RangeEnd: $1DBF); Name: 'Phonetic Extensions Supplement'),
(Range:(RangeStart: $1DC0; RangeEnd: $1DFF); Name: 'Combining Diacritical Marks Supplement'),
(Range:(RangeStart: $1E00; RangeEnd: $1EFF); Name: 'Latin Extended Additional'),
(Range:(RangeStart: $1F00; RangeEnd: $1FFF); Name: 'Greek Extended'),
(Range:(RangeStart: $2000; RangeEnd: $206F); Name: 'General Punctuation'),
(Range:(RangeStart: $2070; RangeEnd: $209F); Name: 'Superscripts and Subscripts'),
(Range:(RangeStart: $20A0; RangeEnd: $20CF); Name: 'Currency Symbols'),
(Range:(RangeStart: $20D0; RangeEnd: $20FF); Name: 'Combining Diacritical Marks for Symbols'),
(Range:(RangeStart: $2100; RangeEnd: $214F); Name: 'Letterlike Symbols'),
(Range:(RangeStart: $2150; RangeEnd: $218F); Name: 'Number Forms'),
(Range:(RangeStart: $2190; RangeEnd: $21FF); Name: 'Arrows'),
(Range:(RangeStart: $2200; RangeEnd: $22FF); Name: 'Mathematical Operators'),
(Range:(RangeStart: $2300; RangeEnd: $23FF); Name: 'Miscellaneous Technical'),
(Range:(RangeStart: $2400; RangeEnd: $243F); Name: 'Control Pictures'),
(Range:(RangeStart: $2440; RangeEnd: $245F); Name: 'Optical Character Recognition'),
(Range:(RangeStart: $2460; RangeEnd: $24FF); Name: 'Enclosed Alphanumerics'),
(Range:(RangeStart: $2500; RangeEnd: $257F); Name: 'Box Drawing'),
(Range:(RangeStart: $2580; RangeEnd: $259F); Name: 'Block Elements'),
(Range:(RangeStart: $25A0; RangeEnd: $25FF); Name: 'Geometric Shapes'),
(Range:(RangeStart: $2600; RangeEnd: $26FF); Name: 'Miscellaneous Symbols'),
(Range:(RangeStart: $2700; RangeEnd: $27BF); Name: 'Dingbats'),
(Range:(RangeStart: $27C0; RangeEnd: $27EF); Name: 'Miscellaneous Mathematical Symbols-A'),
(Range:(RangeStart: $27F0; RangeEnd: $27FF); Name: 'Supplemental Arrows-A'),
(Range:(RangeStart: $2800; RangeEnd: $28FF); Name: 'Braille Patterns'),
(Range:(RangeStart: $2900; RangeEnd: $297F); Name: 'Supplemental Arrows-B'),
(Range:(RangeStart: $2980; RangeEnd: $29FF); Name: 'Miscellaneous Mathematical Symbols-B'),
(Range:(RangeStart: $2A00; RangeEnd: $2AFF); Name: 'Supplemental Mathematical Operators'),
(Range:(RangeStart: $2B00; RangeEnd: $2BFF); Name: 'Miscellaneous Symbols and Arrows'),
(Range:(RangeStart: $2C00; RangeEnd: $2C5F); Name: 'Glagolitic'),
(Range:(RangeStart: $2C60; RangeEnd: $2C7F); Name: 'Latin Extended-C'),
(Range:(RangeStart: $2C80; RangeEnd: $2CFF); Name: 'Coptic'),
(Range:(RangeStart: $2D00; RangeEnd: $2D2F); Name: 'Georgian Supplement'),
(Range:(RangeStart: $2D30; RangeEnd: $2D7F); Name: 'Tifinagh'),
(Range:(RangeStart: $2D80; RangeEnd: $2DDF); Name: 'Ethiopic Extended'),
(Range:(RangeStart: $2E00; RangeEnd: $2E7F); Name: 'Supplemental Punctuation'),
(Range:(RangeStart: $2E80; RangeEnd: $2EFF); Name: 'CJK Radicals Supplement'),
(Range:(RangeStart: $2F00; RangeEnd: $2FDF); Name: 'Kangxi Radicals'),
(Range:(RangeStart: $2FF0; RangeEnd: $2FFF); Name: 'Ideographic Description Characters'),
(Range:(RangeStart: $3000; RangeEnd: $303F); Name: 'CJK Symbols and Punctuation'),
(Range:(RangeStart: $3040; RangeEnd: $309F); Name: 'Hiragana'),
(Range:(RangeStart: $30A0; RangeEnd: $30FF); Name: 'Katakana'),
(Range:(RangeStart: $3100; RangeEnd: $312F); Name: 'Bopomofo'),
(Range:(RangeStart: $3130; RangeEnd: $318F); Name: 'Hangul Compatibility Jamo'),
(Range:(RangeStart: $3190; RangeEnd: $319F); Name: 'Kanbun'),
(Range:(RangeStart: $31A0; RangeEnd: $31BF); Name: 'Bopomofo Extended'),
(Range:(RangeStart: $31C0; RangeEnd: $31EF); Name: 'CJK Strokes'),
(Range:(RangeStart: $31F0; RangeEnd: $31FF); Name: 'Katakana Phonetic Extensions'),
(Range:(RangeStart: $3200; RangeEnd: $32FF); Name: 'Enclosed CJK Letters and Months'),
(Range:(RangeStart: $3300; RangeEnd: $33FF); Name: 'CJK Compatibility'),
(Range:(RangeStart: $3400; RangeEnd: $4DBF); Name: 'CJK Unified Ideographs Extension A'),
(Range:(RangeStart: $4DC0; RangeEnd: $4DFF); Name: 'Yijing Hexagram Symbols'),
(Range:(RangeStart: $4E00; RangeEnd: $9FFF); Name: 'CJK Unified Ideographs'),
(Range:(RangeStart: $A000; RangeEnd: $A48F); Name: 'Yi Syllables'),
(Range:(RangeStart: $A490; RangeEnd: $A4CF); Name: 'Yi Radicals'),
(Range:(RangeStart: $A700; RangeEnd: $A71F); Name: 'Modifier Tone Letters'),
(Range:(RangeStart: $A720; RangeEnd: $A7FF); Name: 'Latin Extended-D'),
(Range:(RangeStart: $A800; RangeEnd: $A82F); Name: 'Syloti Nagri'),
(Range:(RangeStart: $A840; RangeEnd: $A87F); Name: 'Phags-pa'),
(Range:(RangeStart: $AC00; RangeEnd: $D7AF); Name: 'Hangul Syllables'),
(Range:(RangeStart: $D800; RangeEnd: $DB7F); Name: 'High Surrogates'),
(Range:(RangeStart: $DB80; RangeEnd: $DBFF); Name: 'High Private Use Surrogates'),
(Range:(RangeStart: $DC00; RangeEnd: $DFFF); Name: 'Low Surrogates'),
(Range:(RangeStart: $E000; RangeEnd: $F8FF); Name: 'Private Use Area'),
(Range:(RangeStart: $F900; RangeEnd: $FAFF); Name: 'CJK Compatibility Ideographs'),
(Range:(RangeStart: $FB00; RangeEnd: $FB4F); Name: 'Alphabetic Presentation Forms'),
(Range:(RangeStart: $FB50; RangeEnd: $FDFF); Name: 'Arabic Presentation Forms-A'),
(Range:(RangeStart: $FE00; RangeEnd: $FE0F); Name: 'Variation Selectors'),
(Range:(RangeStart: $FE10; RangeEnd: $FE1F); Name: 'Vertical Forms'),
(Range:(RangeStart: $FE20; RangeEnd: $FE2F); Name: 'Combining Half Marks'),
(Range:(RangeStart: $FE30; RangeEnd: $FE4F); Name: 'CJK Compatibility Forms'),
(Range:(RangeStart: $FE50; RangeEnd: $FE6F); Name: 'Small Form Variants'),
(Range:(RangeStart: $FE70; RangeEnd: $FEFF); Name: 'Arabic Presentation Forms-B'),
(Range:(RangeStart: $FF00; RangeEnd: $FFEF); Name: 'Halfwidth and Fullwidth Forms'),
(Range:(RangeStart: $FFF0; RangeEnd: $FFFF); Name: 'Specials'),
(Range:(RangeStart: $10000; RangeEnd: $1007F); Name: 'Linear B Syllabary'),
(Range:(RangeStart: $10080; RangeEnd: $100FF); Name: 'Linear B Ideograms'),
(Range:(RangeStart: $10100; RangeEnd: $1013F); Name: 'Aegean Numbers'),
(Range:(RangeStart: $10140; RangeEnd: $1018F); Name: 'Ancient Greek Numbers'),
(Range:(RangeStart: $10300; RangeEnd: $1032F); Name: 'Old Italic'),
(Range:(RangeStart: $10330; RangeEnd: $1034F); Name: 'Gothic'),
(Range:(RangeStart: $10380; RangeEnd: $1039F); Name: 'Ugaritic'),
(Range:(RangeStart: $103A0; RangeEnd: $103DF); Name: 'Old Persian'),
(Range:(RangeStart: $10400; RangeEnd: $1044F); Name: 'Deseret'),
(Range:(RangeStart: $10450; RangeEnd: $1047F); Name: 'Shavian'),
(Range:(RangeStart: $10480; RangeEnd: $104AF); Name: 'Osmanya'),
(Range:(RangeStart: $10800; RangeEnd: $1083F); Name: 'Cypriot Syllabary'),
(Range:(RangeStart: $10900; RangeEnd: $1091F); Name: 'Phoenician'),
(Range:(RangeStart: $10A00; RangeEnd: $10A5F); Name: 'Kharoshthi'),
(Range:(RangeStart: $12000; RangeEnd: $123FF); Name: 'Cuneiform'),
(Range:(RangeStart: $12400; RangeEnd: $1247F); Name: 'Cuneiform Numbers and Punctuation'),
(Range:(RangeStart: $1D000; RangeEnd: $1D0FF); Name: 'Byzantine Musical Symbols'),
(Range:(RangeStart: $1D100; RangeEnd: $1D1FF); Name: 'Musical Symbols'),
(Range:(RangeStart: $1D200; RangeEnd: $1D24F); Name: 'Ancient Greek Musical Notation'),
(Range:(RangeStart: $1D300; RangeEnd: $1D35F); Name: 'Tai Xuan Jing Symbols'),
(Range:(RangeStart: $1D360; RangeEnd: $1D37F); Name: 'Counting Rod Numerals'),
(Range:(RangeStart: $1D400; RangeEnd: $1D7FF); Name: 'Mathematical Alphanumeric Symbols'),
(Range:(RangeStart: $20000; RangeEnd: $2A6DF); Name: 'CJK Unified Ideographs Extension B'),
(Range:(RangeStart: $2F800; RangeEnd: $2FA1F); Name: 'CJK Compatibility Ideographs Supplement'),
(Range:(RangeStart: $E0000; RangeEnd: $E007F); Name: 'Tags'),
(Range:(RangeStart: $E0100; RangeEnd: $E01EF); Name: 'Variation Selectors Supplement'),
(Range:(RangeStart: $F0000; RangeEnd: $FFFFF); Name: 'Supplementary Private Use Area-A'),
(Range:(RangeStart: $100000; RangeEnd: $10FFFF); Name: 'Supplementary Private Use Area-B'));
type
TWideStrings = class;
TSearchFlag = (
sfCaseSensitive, // match letter case
sfIgnoreNonSpacing, // ignore non-spacing characters in search
sfSpaceCompress, // handle several consecutive white spaces as one white space
// (this applies to the pattern as well as the search text)
sfWholeWordOnly // match only text at end/start and/or surrounded by white spaces
);
TSearchFlags = set of TSearchFlag;
// a generic search class defininition used for tuned Boyer-Moore and Unicode
// regular expression searches
TSearchEngine = class(TObject)
private
FResults: TList; // 2 entries for each result (start and stop position)
FOwner: TWideStrings; // at the moment unused, perhaps later to access strings faster
protected
function GetCount: SizeInt; virtual;
public
constructor Create(AOwner: TWideStrings); virtual;
destructor Destroy; override;
procedure AddResult(Start, Stop: SizeInt); virtual;
procedure Clear; virtual;
procedure ClearResults; virtual;
procedure DeleteResult(Index: SizeInt); virtual;
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; virtual; abstract;
procedure FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); overload; virtual; abstract;
function FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; overload; virtual; abstract;
function FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; overload; virtual; abstract;
function FindAll(const Text: WideString): Boolean; overload; virtual; abstract;
function FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; overload; virtual; abstract;
procedure GetResult(Index: SizeInt; var Start, Stop: SizeInt); virtual;
property Count: SizeInt read GetCount;
end;
// The Unicode Tuned Boyer-Moore (UTBM) search implementation is an extended
// translation created from a free package written by Mark Leisher (mleisher att crl dott nmsu dott edu).
//
// The code handles high and low surrogates as well as case (in)dependency,
// can ignore non-spacing characters and allows optionally to return whole
// words only.
// single pattern character
PUTBMChar = ^TUTBMChar;
TUTBMChar = record
LoCase,
UpCase,
TitleCase: UCS4;
end;
PUTBMSkip = ^TUTBMSkip;
TUTBMSkip = record
BMChar: PUTBMChar;
SkipValues: Integer;
end;
TUTBMSearch = class(TSearchEngine)
private
FFlags: TSearchFlags;
FPattern: PUTBMChar;
FPatternUsed: SizeInt;
FPatternSize: SizeInt;
FPatternLength: SizeInt;
FSkipValues: PUTBMSkip;
FSkipsUsed: SizeInt;
FMD4: SizeInt;
protected
procedure ClearPattern;
procedure Compile(Pattern: PUCS2; PatternLength: SizeInt; Flags: TSearchFlags);
function Find(Text: PUCS2; TextLen: SizeInt; var MatchStart, MatchEnd: SizeInt): Boolean;
function GetSkipValue(TextStart, TextEnd: PUCS2): SizeInt;
function Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: SizeInt): Boolean;
public
procedure Clear; override;
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override;
procedure FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); overload; override;
function FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; overload; override;
function FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; overload; override;
function FindAll(const Text: WideString): Boolean; overload; override;
function FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; overload; override;
end;
// Regular expression search engine for text in UCS2 form taking surrogates
// into account. This implementation is an improved translation from the URE
// package written by Mark Leisher (mleisher att crl dott nmsu dott edu) who used a variation
// of the RE->DFA algorithm done by Mark Hopkins (markh att csd4 dott csd dott uwm dott edu).
// Assumptions:
// o Regular expression and text already normalized.
// o Conversion to lower case assumes a 1-1 mapping.
//
// Definitions:
// Separator - any one of U+2028, U+2029, NL, CR.
//
// Operators:
// . - match any character
// * - match zero or more of the last subexpression
// + - match one or more of the last subexpression
// ? - match zero or one of the last subexpression
// () - subexpression grouping
// {m, n} - match at least m occurences and up to n occurences
// Note: both values can be 0 or ommitted which denotes then a unlimiting bound
// {,} and {0,} and {0, 0} correspond to *
// {, 1} and {0, 1} correspond to ?
// {1,} and {1, 0} correspond to +
// {m} - match exactly m occurences
//
// Notes:
// o The "." operator normally does not match separators, but a flag is
// available that will allow this operator to match a separator.
//
// Literals and Constants:
// c - literal UCS2 character
// \x.... - hexadecimal number of up to 4 digits
// \X.... - hexadecimal number of up to 4 digits
// \u.... - hexadecimal number of up to 4 digits
// \U.... - hexadecimal number of up to 4 digits
//
// Character classes:
// [...] - Character class
// [^...] - Negated character class
// \pN1,N2,...,Nn - Character properties class
// \PN1,N2,...,Nn - Negated character properties class
//
// POSIX character classes recognized:
// :alnum:
// :alpha:
// :cntrl:
// :digit:
// :graph:
// :lower:
// :print:
// :punct:
// :space:
// :upper:
// :xdigit:
//
// Notes:
// o Character property classes are \p or \P followed by a comma separated
// list of integers between 0 and the maximum entry index in TCharacterCategory.
// These integers directly correspond to the TCharacterCategory enumeration entries.
// Note: upper, lower and title case classes need to have case sensitive search
// be enabled to match correctly!
//
// o Character classes can contain literals, constants and character
// property classes. Example:
//
// [abc\U10A\p0,13,4]
// structure used to handle a compacted range of characters
PUcRange = ^TUcRange;
TUcRange = record
MinCode,
MaxCode: UCS4;
end;
TUcCClass = record
Ranges: array of TUcRange;
RangesUsed: SizeInt;
end;
// either a single character or a list of character classes
TUcSymbol = record
Chr: UCS4;
CCL: TUcCClass;
end;
// this is a general element structure used for expressions and stack elements
TUcElement = record
OnStack: Boolean;
AType,
LHS,
RHS: SizeInt;
end;
// this is a structure used to track a list or a stack of states
PUcStateList = ^TUcStateList;
TUcStateList = record
List: array of SizeInt;
ListUsed: SizeInt;
end;
// structure to track the list of unique states for a symbol during reduction
PUcSymbolTableEntry = ^TUcSymbolTableEntry;
TUcSymbolTableEntry = record
ID,
AType: SizeInt;
Mods,
Categories: TCharacterCategories;
Symbol: TUcSymbol;
States: TUcStateList;
end;
// structure to hold a single State
PUcState = ^TUcState;
TUcState = record
ID: SizeInt;
Accepting: Boolean;
StateList: TUcStateList;
Transitions: array of TUcElement;
TransitionsUsed: SizeInt;
end;
// structure used for keeping lists of states
TUcStateTable = record
States: array of TUcState;
StatesUsed: SizeInt;
end;
// structure to track pairs of DFA states when equivalent states are merged
TUcEquivalent = record
Left,
Right: SizeInt;
end;
TUcExpressionList = record
Expressions: array of TUcElement;
ExpressionsUsed: SizeInt;
end;
TUcSymbolTable = record
Symbols: array of TUcSymbolTableEntry;
SymbolsUsed: SizeInt;
end;
TUcEquivalentList = record
Equivalents: array of TUcEquivalent;
EquivalentsUsed: SizeInt;
end;
// structure used for constructing the NFA and reducing to a minimal DFA
PUREBuffer = ^TUREBuffer;
TUREBuffer = record
Reducing: Boolean;
Error: Integer;
Flags: Cardinal;
Stack: TUcStateList;
SymbolTable: TUcSymbolTable; // table of unique symbols encountered
ExpressionList: TUcExpressionList; // tracks the unique expressions generated
// for the NFA and when the NFA is reduced
States: TUcStateTable; // the reduced table of unique groups of NFA states
EquivalentList: TUcEquivalentList; // tracks states when equivalent states are merged
end;
TUcTransition = record
Symbol,
NextState: SizeInt;
end;
PDFAState = ^TDFAState;
TDFAState = record
Accepting: Boolean;
NumberTransitions: SizeInt;
StartTransition: SizeInt;
end;
TDFAStates = record
States: array of TDFAState;
StatesUsed: SizeInt;
end;
TUcTransitions = record
Transitions: array of TUcTransition;
TransitionsUsed: SizeInt;
end;
TDFA = record
Flags: Cardinal;
SymbolTable: TUcSymbolTable;
StateList: TDFAStates;
TransitionList: TUcTransitions;
end;
TURESearch = class(TSearchEngine)
private
FUREBuffer: TUREBuffer;
FDFA: TDFA;
protected
procedure AddEquivalentPair(L, R: SizeInt);
procedure AddRange(var CCL: TUcCClass; Range: TUcRange);
function AddState(NewStates: array of SizeInt): SizeInt;
procedure AddSymbolState(Symbol, State: SizeInt);
function BuildCharacterClass(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt;
procedure ClearUREBuffer;
function CompileSymbol(S: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt;
procedure CompileURE(RE: PWideChar; RELength: SizeInt; Casefold: Boolean);
procedure CollectPendingOperations(var State: SizeInt);
function ConvertRegExpToNFA(RE: PWideChar; RELength: SizeInt): SizeInt;
function ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: SizeInt; var MatchStart, MatchEnd: SizeInt): Boolean;
procedure ClearDFA;
procedure HexDigitSetup(Symbol: PUcSymbolTableEntry);
function MakeExpression(AType, LHS, RHS: SizeInt): SizeInt;
function MakeHexNumber(NP: PUCS2; Limit: SizeInt; var Number: UCS4): SizeInt;
function MakeSymbol(S: PUCS2; Limit: SizeInt; out Consumed: SizeInt): SizeInt;
procedure MergeEquivalents;
function ParsePropertyList(Properties: PUCS2; Limit: SizeInt; var Categories: TCharacterCategories): SizeInt;
function Peek: SizeInt;
function Pop: SizeInt;
function PosixCCL(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt;
function ProbeLowSurrogate(LeftState: PUCS2; Limit: SizeInt; var Code: UCS4): SizeInt;
procedure Push(V: SizeInt);
procedure Reduce(Start: SizeInt);
procedure SpaceSetup(Symbol: PUcSymbolTableEntry; Categories: TCharacterCategories);
function SymbolsAreDifferent(A, B: PUcSymbolTableEntry): Boolean;
public
procedure Clear; override;
procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override;
procedure FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); overload; override;
function FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; overload; override;
function FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; overload; override;
function FindAll(const Text: WideString): Boolean; overload; override;
function FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; overload; override;
end;
// Event used to give the application a chance to switch the way of how to save
// the text in TWideStrings if the text contains characters not only from the
// ANSI block but the save type is ANSI. On triggering the event the application
// can change the property SaveUnicode as needed. This property is again checked
// after the callback returns.
TConfirmConversionEvent = procedure (Sender: TWideStrings; var Allowed: Boolean) of object;
TWideStrings = class(TPersistent)
private
FUpdateCount: Integer;
FLanguage: LCID; // language can usually left alone, the system's default is used
FSaved: Boolean; // set in SaveToStream, True in case saving was successfull otherwise False
FNormalizationForm: TNormalizationForm; // determines in which form Unicode strings should be stored
FOnConfirmConversion: TConfirmConversionEvent;
FSaveFormat: TSaveFormat; // overrides the FSaveUnicode flag, initialized when a file is loaded,
// expect losses if it is set to sfAnsi before saving
function GetCommaText: WideString;
function GetName(Index: Integer): WideString;
function GetValue(const Name: WideString): WideString;
procedure ReadData(Reader: TReader);
procedure SetCommaText(const Value: WideString);
procedure SetNormalizationForm(const Value: TNormalizationForm);
procedure SetValue(const Name, Value: WideString);
procedure WriteData(Writer: TWriter);
function GetSaveUnicode: Boolean;
procedure SetSaveUnicode(const Value: Boolean);
protected
procedure DefineProperties(Filer: TFiler); override;
procedure DoConfirmConversion(var Allowed: Boolean); virtual;
procedure Error(const Msg: string; Data: Integer);
function Get(Index: Integer): WideString; virtual; abstract;
function GetCapacity: Integer; virtual;
function GetCount: Integer; virtual; abstract;
function GetObject(Index: Integer): TObject; virtual;
function GetTextStr: WideString; virtual;
procedure Put(Index: Integer; const S: WideString); virtual; abstract;
procedure PutObject(Index: Integer; AObject: TObject); virtual; abstract;
procedure SetCapacity(NewCapacity: Integer); virtual;
procedure SetUpdateState(Updating: Boolean); virtual;
procedure SetLanguage(Value: LCID); virtual;
public
constructor Create;
function Add(const S: WideString): Integer; virtual;
function AddObject(const S: WideString; AObject: TObject): Integer; virtual;
procedure Append(const S: WideString);
procedure AddStrings(Strings: TStrings); overload; virtual;
procedure AddStrings(Strings: TWideStrings); overload; virtual;
procedure Assign(Source: TPersistent); override;
procedure AssignTo(Dest: TPersistent); override;
procedure BeginUpdate;
procedure Clear; virtual; abstract;
procedure Delete(Index: Integer); virtual; abstract;
procedure EndUpdate;
function Equals(Strings: TWideStrings): Boolean; {$IFDEF RTL200_UP} reintroduce; {$ENDIF RTL200_UP}
procedure Exchange(Index1, Index2: Integer); virtual;
function GetSeparatedText(Separators: WideString): WideString; virtual;
function GetText: PWideChar; virtual;
function IndexOf(const S: WideString): Integer; virtual;
function IndexOfName(const Name: WideString): Integer;
function IndexOfObject(AObject: TObject): Integer;
procedure Insert(Index: Integer; const S: WideString); virtual; abstract;
procedure InsertObject(Index: Integer; const S: WideString; AObject: TObject);
procedure LoadFromFile(const FileName: TFileName); virtual;
procedure LoadFromStream(Stream: TStream); virtual;
procedure Move(CurIndex, NewIndex: Integer); virtual;
procedure SaveToFile(const FileName: TFileName); virtual;
procedure SaveToStream(Stream: TStream; WithBOM: Boolean = True); virtual;
procedure SetText(const Value: WideString); virtual;
property Capacity: Integer read GetCapacity write SetCapacity;
property CommaText: WideString read GetCommaText write SetCommaText;
property Count: Integer read GetCount;
property Language: LCID read FLanguage write SetLanguage;
property Names[Index: Integer]: WideString read GetName;
property NormalizationForm: TNormalizationForm read FNormalizationForm write SetNormalizationForm default nfC;
property Objects[Index: Integer]: TObject read GetObject write PutObject;
property Values[const Name: WideString]: WideString read GetValue write SetValue;
property Saved: Boolean read FSaved;
property SaveUnicode: Boolean read GetSaveUnicode write SetSaveUnicode default True;
property SaveFormat: TSaveFormat read FSaveFormat write FSaveFormat default sfUnicodeLSB;
property Strings[Index: Integer]: WideString read Get write Put; default;
property Text: WideString read GetTextStr write SetText;
property OnConfirmConversion: TConfirmConversionEvent read FOnConfirmConversion write FOnConfirmConversion;
end;
//----- TWideStringList class
TWideStringItem = record
{$IFDEF OWN_WIDESTRING_MEMMGR}
FString: PWideChar; // "array of WideChar";
{$ELSE ~OWN_WIDESTRING_MEMMGR}
FString: WideString;
{$ENDIF ~OWN_WIDESTRING_MEMMGR}
FObject: TObject;
end;
TWideStringItemList = array of TWideStringItem;
TWideStringList = class(TWideStrings)
private
FList: TWideStringItemList;
FCount: Integer;
FSorted: Boolean;
FDuplicates: TDuplicates;
FOnChange: TNotifyEvent;
FOnChanging: TNotifyEvent;
procedure ExchangeItems(Index1, Index2: Integer);
procedure Grow;
procedure QuickSort(L, R: Integer);
procedure InsertItem(Index: Integer; const S: WideString);
procedure SetSorted(Value: Boolean);
{$IFDEF OWN_WIDESTRING_MEMMGR}
procedure SetListString(Index: Integer; const S: WideString);
{$ENDIF OWN_WIDESTRING_MEMMGR}
protected
procedure Changed; virtual;
procedure Changing; virtual;
function Get(Index: Integer): WideString; override;
function GetCapacity: Integer; override;
function GetCount: Integer; override;
function GetObject(Index: Integer): TObject; override;
procedure Put(Index: Integer; const S: WideString); override;
procedure PutObject(Index: Integer; AObject: TObject); override;
procedure SetCapacity(NewCapacity: Integer); override;
procedure SetUpdateState(Updating: Boolean); override;
procedure SetLanguage(Value: LCID); override;
public
destructor Destroy; override;
function Add(const S: WideString): Integer; override;
procedure Clear; override;
procedure Delete(Index: Integer); override;
procedure Exchange(Index1, Index2: Integer); override;
function Find(const S: WideString; var Index: Integer): Boolean; virtual;
function IndexOf(const S: WideString): Integer; override;
procedure Insert(Index: Integer; const S: WideString); override;
procedure Sort; virtual;
property Duplicates: TDuplicates read FDuplicates write FDuplicates;
property Sorted: Boolean read FSorted write SetSorted;
property OnChange: TNotifyEvent read FOnChange write FOnChange;
property OnChanging: TNotifyEvent read FOnChanging write FOnChanging;
end;
{
// all these functions are now in JclWideStrings.pas
function StrLenW(Str: PWideChar): SizeInt;
function StrEndW(Str: PWideChar): PWideChar;
function StrMoveW(Dest, Source: PWideChar; Count: SizeInt): PWideChar;
function StrCopyW(Dest, Source: PWideChar): PWideChar;
function StrECopyW(Dest, Source: PWideChar): PWideChar;
function StrLCopyW(Dest, Source: PWideChar; MaxLen: SizeInt): PWideChar;
function StrPCopyWW(Dest: PWideChar; const Source: WideString): PWideChar; overload;
function StrPCopyW(Dest: PWideChar; const Source: AnsiString): PWideChar;
function StrPLCopyWW(Dest: PWideChar; const Source: WideString; MaxLen: SizeInt): PWideChar;
function StrPLCopyW(Dest: PWideChar; const Source: AnsiString; MaxLen: SizeInt): PWideChar;
function StrCatW(Dest: PWideChar; const Source: PWideChar): PWideChar;
function StrLCatW(Dest, Source: PWideChar; MaxLen: SizeInt): PWideChar;
function StrCompW(const Str1, Str2: PWideChar): Integer;
function StrICompW(const Str1, Str2: PWideChar): Integer;
function StrLCompW(const Str1, Str2: PWideChar; MaxLen: SizeInt): Integer;
function StrLICompW(const Str1, Str2: PWideChar; MaxLen: SizeInt): Integer;
function StrNScanW(const Str1, Str2: PWideChar): SizeInt;
function StrRNScanW(const Str1, Str2: PWideChar): SizeInt;
function StrScanW(Str: PWideChar; Chr: WideChar): PWideChar; overload;
function StrScanW(Str: PWideChar; Chr: WideChar; StrLen: SizeInt): PWideChar; overload;
function StrRScanW(Str: PWideChar; Chr: WideChar): PWideChar;
function StrPosW(Str, SubStr: PWideChar): PWideChar;
function StrAllocW(WideSize: SizeInt): PWideChar;
function StrBufSizeW(const Str: PWideChar): SizeInt;
function StrNewW(const Str: PWideChar): PWideChar; overload;
function StrNewW(const Str: WideString): PWideChar; overload;
procedure StrDisposeW(Str: PWideChar);
procedure StrDisposeAndNilW(var Str: PWideChar);
procedure StrSwapByteOrder(Str: PWideChar);
}
// functions involving Delphi wide strings
function WideAdjustLineBreaks(const S: WideString): WideString;
function WideCharPos(const S: WideString; const Ch: WideChar; const Index: SizeInt): SizeInt; //az
function WideCompose(const S: WideString): WideString;
function WideDecompose(const S: WideString; Compatible: Boolean): WideString;
function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString;
function WideQuotedStr(const S: WideString; Quote: WideChar): WideString;
function WideStringOfChar(C: WideChar; Count: SizeInt): WideString;
// case conversion function
type
TCaseType = (ctFold, ctLower, ctTitle, ctUpper);
function WideCaseConvert(C: WideChar; CaseType: TCaseType): WideString; overload;
function WideCaseConvert(const S: WideString; CaseType: TCaseType): WideString; overload;
function WideCaseFolding(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideCaseFolding(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideLowerCase(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideLowerCase(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideTitleCase(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideTitleCase(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideUpperCase(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideUpperCase(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE}
function WideNormalize(const S: WideString; Form: TNormalizationForm): WideString;
function WideSameText(const Str1, Str2: WideString): Boolean;
function WideTrim(const S: WideString): WideString;
function WideTrimLeft(const S: WideString): WideString;
function WideTrimRight(const S: WideString): WideString;
type
// result type for number retrieval functions
TUcNumber = record
Numerator,
Denominator: Integer;
end;
// Low level character routines
function UnicodeNumberLookup(Code: UCS4; var Number: TUcNumber): Boolean;
function UnicodeCompose(const Codes: array of UCS4; out Composite: UCS4): Integer;
function UnicodeCaseFold(Code: UCS4): TUCS4Array;
function UnicodeToUpper(Code: UCS4): TUCS4Array;
function UnicodeToLower(Code: UCS4): TUCS4Array;
function UnicodeToTitle(Code: UCS4): TUCS4Array;
// Character test routines
function UnicodeIsAlpha(C: UCS4): Boolean;
function UnicodeIsDigit(C: UCS4): Boolean;
function UnicodeIsAlphaNum(C: UCS4): Boolean;
function UnicodeIsCased(C: UCS4): Boolean;
function UnicodeIsControl(C: UCS4): Boolean;
function UnicodeIsSpace(C: UCS4): Boolean;
function UnicodeIsWhiteSpace(C: UCS4): Boolean;
function UnicodeIsBlank(C: UCS4): Boolean;
function UnicodeIsPunctuation(C: UCS4): Boolean;
function UnicodeIsGraph(C: UCS4): Boolean;
function UnicodeIsPrintable(C: UCS4): Boolean;
function UnicodeIsUpper(C: UCS4): Boolean;
function UnicodeIsLower(C: UCS4): Boolean;
function UnicodeIsTitle(C: UCS4): Boolean;
function UnicodeIsHexDigit(C: UCS4): Boolean;
function UnicodeIsIsoControl(C: UCS4): Boolean;
function UnicodeIsFormatControl(C: UCS4): Boolean;
function UnicodeIsSymbol(C: UCS4): Boolean;
function UnicodeIsNumber(C: UCS4): Boolean;
function UnicodeIsNonSpacing(C: UCS4): Boolean;
function UnicodeIsOpenPunctuation(C: UCS4): Boolean;
function UnicodeIsClosePunctuation(C: UCS4): Boolean;
function UnicodeIsInitialPunctuation(C: UCS4): Boolean;
function UnicodeIsFinalPunctuation(C: UCS4): Boolean;
function UnicodeIsComposed(C: UCS4): Boolean;
function UnicodeIsQuotationMark(C: UCS4): Boolean;
function UnicodeIsSymmetric(C: UCS4): Boolean;
function UnicodeIsMirroring(C: UCS4): Boolean;
function UnicodeIsNonBreaking(C: UCS4): Boolean;
// Directionality functions
function UnicodeIsRightToLeft(C: UCS4): Boolean;
function UnicodeIsLeftToRight(C: UCS4): Boolean;
function UnicodeIsStrong(C: UCS4): Boolean;
function UnicodeIsWeak(C: UCS4): Boolean;
function UnicodeIsNeutral(C: UCS4): Boolean;
function UnicodeIsSeparator(C: UCS4): Boolean;
// Other character test functions
function UnicodeIsMark(C: UCS4): Boolean;
function UnicodeIsModifier(C: UCS4): Boolean;
function UnicodeIsLetterNumber(C: UCS4): Boolean;
function UnicodeIsConnectionPunctuation(C: UCS4): Boolean;
function UnicodeIsDash(C: UCS4): Boolean;
function UnicodeIsMath(C: UCS4): Boolean;
function UnicodeIsCurrency(C: UCS4): Boolean;
function UnicodeIsModifierSymbol(C: UCS4): Boolean;
function UnicodeIsNonSpacingMark(C: UCS4): Boolean;
function UnicodeIsSpacingMark(C: UCS4): Boolean;
function UnicodeIsEnclosing(C: UCS4): Boolean;
function UnicodeIsPrivate(C: UCS4): Boolean;
function UnicodeIsSurrogate(C: UCS4): Boolean;
function UnicodeIsLineSeparator(C: UCS4): Boolean;
function UnicodeIsParagraphSeparator(C: UCS4): Boolean;
function UnicodeIsIdentifierStart(C: UCS4): Boolean;
function UnicodeIsIdentifierPart(C: UCS4): Boolean;
function UnicodeIsDefined(C: UCS4): Boolean;
function UnicodeIsUndefined(C: UCS4): Boolean;
function UnicodeIsHan(C: UCS4): Boolean;
function UnicodeIsHangul(C: UCS4): Boolean;
// Utility functions
function CharSetFromLocale(Language: LCID): Byte;
function GetCharSetFromLocale(Language: LCID; out FontCharSet: Byte): Boolean;
function CodePageFromLocale(Language: LCID): Word;
function CodeBlockName(const CB: TUnicodeBlock): string;
function CodeBlockRange(const CB: TUnicodeBlock): TUnicodeBlockRange;
function CodeBlockFromChar(const C: UCS4): TUnicodeBlock;
function KeyboardCodePage: Word;
function KeyUnicode(C: Char): WideChar;
function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString;
function TranslateString(const S: AnsiString; CP1, CP2: Word): AnsiString;
function WideStringToStringEx(const WS: WideString; CodePage: Word): AnsiString;
type
TCompareFunc = function (const W1, W2: WideString; Locale: LCID): Integer;
var
WideCompareText: TCompareFunc;
type
EJclUnicodeError = class(EJclError);
// functions to load Unicode data from resource
procedure LoadCharacterCategories;
procedure LoadCaseMappingData;
procedure LoadDecompositionData;
procedure LoadCombiningClassData;
procedure LoadNumberData;
procedure LoadCompositionData;
{$IFDEF UNITVERSIONING}
const
UnitVersioning: TUnitVersionInfo = (
RCSfile: '$URL: https://jcl.svn.sourceforge.net/svnroot/jcl/tags/JCL-2.1-Build3536/jcl/source/common/JclUnicode.pas $';
Revision: '$Revision: 2930 $';
Date: '$Date: 2009-08-09 19:06:40 +0200 (dim., 09 août 2009) $';
LogPath: 'JCL\source\common';
Extra: '';
Data: nil
);
{$ENDIF UNITVERSIONING}
implementation
// Unicode data for case mapping, decomposition, numbers etc. This data is
// loaded on demand which means only those parts will be put in memory which are
// needed by one of the lookup functions.
// Note: There is a little tool called UDExtract which creates a resouce script from
// the Unicode database file which can be compiled to the needed res file.
// This tool, including its source code, can be downloaded from www.lischke-online.de/Unicode.html.
{$IFDEF UNICODE_RAW_DATA}
{$R JclUnicode.res}
{$ENDIF UNICODE_RAW_DATA}
{$IFDEF UNICODE_BZIP2_DATA}
{$R JclUnicodeBZip2.res}
{$ENDIF UNICODE_BZIP2_DATA}
{$IFDEF UNICODE_ZLIB_DATA}
{$R JclUnicodeZLib.res}
{$ENDIF UNICODE_ZLIB_DATA}
uses
{$IFDEF HAS_UNIT_RTLCONSTS}
{$IFDEF BORLAND}
RtlConsts,
{$ENDIF BORLAND}
{$ELSE ~HAS_UNIT_RTLCONSTS}
{$IFNDEF FPC}
Consts,
{$ENDIF ~FPC}
{$ENDIF ~HAS_UNIT_RTLCONSTS}
{$IFDEF UNICODE_BZIP2_DATA}
BZip2,
{$ENDIF UNICODE_BZIP2_DATA}
{$IFDEF UNICODE_ZLIB_DATA}
ZLibh,
{$ENDIF UNICODE_ZLIB_DATA}
JclStreams,
{$IFNDEF UNICODE_RAW_DATA}
JclCompression,
{$ENDIF ~UNICODE_RAW_DATA}
JclResources, JclSynch, JclSysUtils, JclSysInfo, JclStringConversions, JclWideStrings;
const
{$IFDEF FPC} // declarations from unit [Rtl]Consts
SDuplicateString = 'String list does not allow duplicates';
SListIndexError = 'List index out of bounds (%d)';
SSortedListError = 'Operation not allowed on sorted string list';
{$ENDIF FPC}
// some predefined sets to shorten parameter lists below and ease repeative usage
ClassLetter = [ccLetterUppercase, ccLetterLowercase, ccLetterTitlecase, ccLetterModifier, ccLetterOther];
ClassSpace = [ccSeparatorSpace, ccSpaceOther];
ClassPunctuation = [ccPunctuationConnector, ccPunctuationDash, ccPunctuationOpen, ccPunctuationClose,
ccPunctuationOther, ccPunctuationInitialQuote, ccPunctuationFinalQuote];
ClassMark = [ccMarkNonSpacing, ccMarkSpacingCombining, ccMarkEnclosing];
ClassNumber = [ccNumberDecimalDigit, ccNumberLetter, ccNumberOther];
ClassSymbol = [ccSymbolMath, ccSymbolCurrency, ccSymbolModifier, ccSymbolOther];
ClassEuropeanNumber = [ccEuropeanNumber, ccEuropeanNumberSeparator, ccEuropeanNumberTerminator];
// used to negate a set of categories
ClassAll = [Low(TCharacterCategory)..High(TCharacterCategory)];
var
// As the global data can be accessed by several threads it should be guarded
// while the data is loaded.
LoadInProgress: TJclCriticalSection;
function OpenResourceStream(const ResName: string): TJclEasyStream;
var
ResourceStream: TStream;
{$IFNDEF UNICODE_RAW_DATA}
DecompressionStream: TStream;
RawStream: TMemoryStream;
{$ENDIF ~UNICODE_RAW_DATA}
begin
ResourceStream := TResourceStream.Create(HInstance, ResName, 'UNICODEDATA');
{$IFDEF UNICODE_RAW_DATA}
Result := TJclEasyStream.Create(ResourceStream, True);
{$ENDIF UNICODE_RAW_DATA}
{$IFDEF UNICODE_BZIP2_DATA}
try
LoadBZip2;
DecompressionStream := TJclBZIP2DecompressionStream.Create(ResourceStream);
try
RawStream := TMemoryStream.Create;
StreamCopy(DecompressionStream, RawStream);
RawStream.Seek(0, soBeginning);
Result := TJclEasyStream.Create(RawStream, True);
finally
DecompressionStream.Free;
end;
finally
ResourceStream.Free;
end;
{$ENDIF UNICODE_BZIP2_DATA}
{$IFDEF UNICODE_ZLIB_DATA}
try
LoadZLib;
DecompressionStream := TJclZLibDecompressStream.Create(ResourceStream);
try
RawStream := TMemoryStream.Create;
StreamCopy(DecompressionStream, RawStream);
RawStream.Seek(0, soBeginning);
Result := TJclEasyStream.Create(RawStream, True);
finally
DecompressionStream.Free;
end;
finally
ResourceStream.Free;
end;
{$ENDIF UNICODE_ZLIB_DATA}
end;
//----------------- support for character categories -----------------------------------------------
// Character category data is quite a large block since every defined character in Unicode is assigned at least
// one category. Because of this we cannot use a sparse matrix to provide quick access as implemented for
// e.g. composition data.
// The approach used here is based on the fact that an application seldomly uses all characters defined in Unicode
// simultanously. In fact the opposite is true. Most application will use either Western Europe or Arabic or
// Far East character data, but very rarely all together. Based on this fact is the implementation of virtual
// memory using the systems paging file (aka file mapping) to load only into virtual memory what is used currently.
// The implementation is not yet finished and needs a lot of improvements yet.
type
// start and stop of a range of code points
TRange = record
Start,
Stop: Cardinal;
end;
TRangeArray = array of TRange;
TCategoriesArray = array of array of TCharacterCategories;
var
// character categories, stored in the system's swap file and mapped on demand
CategoriesLoaded: Boolean;
Categories: array [Byte] of TCategoriesArray;
procedure LoadCharacterCategories;
// Loads the character categories data (as saved by the Unicode database extractor, see also
// the comments about JclUnicode.res above).
var
Size: Integer;
Stream: TJclEasyStream;
Category: TCharacterCategory;
Buffer: TRangeArray;
First, Second, Third: Byte;
J, K: Integer;
begin
// Data already loaded?
if not CategoriesLoaded then
begin
// make sure no other code is currently modifying the global data area
LoadInProgress.Enter;
try
CategoriesLoaded := True;
Stream := OpenResourceStream('CATEGORIES');
try
while Stream.Position < Stream.Size do
begin
// a) read which category is current in the stream
Category := TCharacterCategory(Stream.ReadByte);
// b) read the size of the ranges and the ranges themself
Size := Stream.ReadInteger;
if Size > 0 then
begin
SetLength(Buffer, Size);
for J := 0 to Size - 1 do
begin
Buffer[J].Start := Stream.ReadInteger;
Buffer[J].Stop := Stream.ReadInteger;
end;
// c) go through every range and add the current category to each code point
for J := 0 to Size - 1 do
for K := Buffer[J].Start to Buffer[J].Stop do
begin
Assert(K < $1000000, LoadResString(@RsCategoryUnicodeChar));
First := (K shr 16) and $FF;
Second := (K shr 8) and $FF;
Third := K and $FF;
// add second step array if not yet done
if Categories[First] = nil then
SetLength(Categories[First], 256);
if Categories[First, Second] = nil then
SetLength(Categories[First, Second], 256);
Include(Categories[First, Second, Third], Category);
end;
end;
end;
finally
Stream.Free;
end;
finally
LoadInProgress.Leave;
end;
end;
end;
function CategoryLookup(Code: Cardinal; Cats: TCharacterCategories): Boolean; overload;
// determines whether the Code is in the given category
var
First, Second, Third: Byte;
begin
Assert(Code < $1000000, LoadResString(@RsCategoryUnicodeChar));
// load property data if not already done
if not CategoriesLoaded then
LoadCharacterCategories;
First := (Code shr 16) and $FF;
Second := (Code shr 8) and $FF;
Third := Code and $FF;
if (Categories[First] <> nil) and (Categories[First, Second] <> nil) then
Result := Categories[First, Second, Third] * Cats <> []
else
Result := False;
end;
//----------------- support for case mapping -------------------------------------------------------
type
TCase = array [TCaseType] of TUCS4Array; // mapping for case fold, lower, title and upper in this order
TCaseArray = array of array of TCase;
var
// An array for all case mappings (including 1 to many casing if saved by the extraction program).
// The organization is a sparse, two stage matrix.
// SingletonMapping is to quickly return a single default mapping.
CaseDataLoaded: Boolean;
CaseMapping: array [Byte] of TCaseArray;
procedure LoadCaseMappingData;
var
Stream: TJclEasyStream;
I, J, Code, Size: Integer;
First, Second, Third: Byte;
begin
if not CaseDataLoaded then
begin
// make sure no other code is currently modifying the global data area
LoadInProgress.Enter;
try
CaseDataLoaded := True;
Stream := OpenResourceStream('CASE');
try
// the first entry in the stream is the number of entries in the case mapping table
Size := Stream.ReadInteger;
for I := 0 to Size - 1 do
begin
// a) read actual code point
Code := Stream.ReadInteger;
Assert(Code < $1000000, LoadResString(@RsCasedUnicodeChar));
// if there is no high byte entry in the first stage table then create one
First := (Code shr 16) and $FF;
Second := (Code shr 8) and $FF;
Third := Code and $FF;
if CaseMapping[First] = nil then
SetLength(CaseMapping[First], 256);
if CaseMapping[First, Second] = nil then
SetLength(CaseMapping[First, Second], 256);
// b) read fold case array
Size := Stream.ReadInteger;
if Size > 0 then
begin
SetLength(CaseMapping[First, Second, Third, ctFold], Size);
for J := 0 to Size - 1 do
CaseMapping[First, Second, Third, ctFold, J] := Stream.ReadInteger;
end;
// c) read lower case array
Size := Stream.ReadInteger;
if Size > 0 then
begin
SetLength(CaseMapping[First, Second, Third, ctLower], Size);
for J := 0 to Size - 1 do
CaseMapping[First, Second, Third, ctLower, J] := Stream.ReadInteger;
end;
// d) read title case array
Size := Stream.ReadInteger;
if Size > 0 then
begin
SetLength(CaseMapping[First, Second, Third, ctTitle], Size);
for J := 0 to Size - 1 do
CaseMapping[First, Second, Third, ctTitle, J] := Stream.ReadInteger;
end;
// e) read upper case array
Size := Stream.ReadInteger;
if Size > 0 then
begin
SetLength(CaseMapping[First, Second, Third, ctUpper], Size);
for J := 0 to Size - 1 do
CaseMapping[First, Second, Third, ctUpper, J] := Stream.ReadInteger;
end;
end;
finally
Stream.Free;
end;
finally
LoadInProgress.Leave;
end;
end;
end;
function CaseLookup(Code: Cardinal; CaseType: TCaseType; var Mapping: TUCS4Array): Boolean;
// Performs a lookup of the given code; returns True if Found, with Mapping referring to the mapping.
// ctFold is handled specially: if no mapping is found then result of looking up ctLower
// is returned
var
First, Second, Third: Byte;
begin
Assert(Code < $1000000, LoadResString(@RsCasedUnicodeChar));
// load case mapping data if not already done
if not CaseDataLoaded then
LoadCaseMappingData;
First := (Code shr 16) and $FF;
Second := (Code shr 8) and $FF;
Third := Code and $FF;
// Check first stage table whether there is a mapping for a particular block and
// (if so) then whether there is a mapping or not.
if (CaseMapping[First] <> nil) and (CaseMapping[First, Second] <> nil) and
(CaseMapping[First, Second, Third, CaseType] <> nil) then
Mapping := CaseMapping[First, Second, Third, CaseType]
else
Mapping := nil;
Result := Assigned(Mapping);
// defer to lower case if no fold case exists
if not Result and (CaseType = ctFold) and (CaseMapping[First] <> nil) and
(CaseMapping[First, Second] <> nil) and (CaseMapping[First, Second, Third, ctLower] <> nil) then
begin
Mapping := CaseMapping[First, Second, Third, ctLower];
Result := Assigned(Mapping);
end;
end;
function UnicodeCaseFold(Code: UCS4): TUCS4Array;
// This function returnes an array of special case fold mappings if there is one defined for the given
// code, otherwise the lower case will be returned. This all applies only to cased code points.
// Uncased code points are returned unchanged.
begin
SetLength(Result, 0);
if not CaseLookup(Code, ctFold, Result) then
begin
SetLength(Result, 1);
Result[0] := Code;
end;
end;
function UnicodeToUpper(Code: UCS4): TUCS4Array;
begin
SetLength(Result, 0);
if not CaseLookup(Code, ctUpper, Result) then
begin
SetLength(Result, 1);
Result[0] := Code;
end;
end;
function UnicodeToLower(Code: UCS4): TUCS4Array;
begin
SetLength(Result, 0);
if not CaseLookup(Code, ctLower, Result) then
begin
SetLength(Result, 1);
Result[0] := Code;
end;
end;
function UnicodeToTitle(Code: UCS4): TUCS4Array;
begin
SetLength(Result, 0);
if not CaseLookup(Code, ctTitle, Result) then
begin
SetLength(Result, 1);
Result[0] := Code;
end;
end;
//----------------- support for decomposition ------------------------------------------------------
const
// constants for hangul composition and hangul-to-jamo decomposition
SBase = $AC00; // hangul syllables start code point
LBase = $1100; // leading syllable
VBase = $1161;
TBase = $11A7; // trailing syllable
LCount = 19;
VCount = 21;
TCount = 28;
NCount = VCount * TCount; // 588
SCount = LCount * NCount; // 11172
type
TDecompositions = array of array of TUCS4Array;
TDecompositionsArray = array [Byte] of TDecompositions;
var
// list of decompositions, organized (again) as three stage matrix
// Note: there are two tables, one for canonical decompositions and the other one
// for compatibility decompositions.
DecompositionsLoaded: Boolean;
CanonicalDecompositions,
CompatibleDecompositions: TDecompositionsArray;
procedure LoadDecompositionData;
var
Stream: TJclEasyStream;
I, J, Code, Size: Integer;
First, Second, Third: Byte;
begin
if not DecompositionsLoaded then
begin
// make sure no other code is currently modifying the global data area
LoadInProgress.Enter;
try
DecompositionsLoaded := True;
Stream := OpenResourceStream('DECOMPOSITION');
try
// determine how many decomposition entries we have
Size := Stream.ReadInteger;
for I := 0 to Size - 1 do
begin
Code := Stream.ReadInteger;
Assert((Code and not $40000000) < $1000000, LoadResString(@RsDecomposedUnicodeChar));
// if there is no high byte entry in the first stage table then create one
First := (Code shr 16) and $FF;
Second := (Code shr 8) and $FF;
Third := Code and $FF;
// insert into the correct table depending on bit 30
// (if set then it is a compatibility decomposition)
if Code and $40000000 <> 0 then
begin
if CompatibleDecompositions[First] = nil then
SetLength(CompatibleDecompositions[First], 256);
if CompatibleDecompositions[First, Second] = nil then
SetLength(CompatibleDecompositions[First, Second], 256);
Size := Stream.ReadInteger;
if Size > 0 then
begin
SetLength(CompatibleDecompositions[First, Second, Third], Size);
for J := 0 to Size - 1 do
CompatibleDecompositions[First, Second, Third, J] := Stream.ReadInteger;
end;
end
else
begin
if CanonicalDecompositions[First] = nil then
SetLength(CanonicalDecompositions[First], 256);
if CanonicalDecompositions[First, Second] = nil then
SetLength(CanonicalDecompositions[First, Second], 256);
Size := Stream.ReadInteger;
if Size > 0 then
begin
SetLength(CanonicalDecompositions[First, Second, Third], Size);
for J := 0 to Size - 1 do
CanonicalDecompositions[First, Second, Third, J] := Stream.ReadInteger;
end;
end;
end;
finally
Stream.Free;
end;
finally
LoadInProgress.Leave;
end;
end;
end;
function UnicodeDecomposeHangul(Code: UCS4): TUCS4Array;
// algorithmically decomposes hangul character
var
Rest: Integer;
begin
Dec(Code, SBase);
Rest := Code mod TCount;
if Rest = 0 then
SetLength(Result, 2)
else
SetLength(Result, 3);
Result[0] := LBase + (Code div NCount);
Result[1] := VBase + ((Code mod NCount) div TCount);
if Rest <> 0 then
Result[2] := TBase + Rest;
end;
function UnicodeDecompose(Code: UCS4; Compatible: Boolean): TUCS4Array;
var
First, Second, Third: Byte;
begin
Assert((Code and not $40000000) < $1000000, LoadResString(@RsDecomposedUnicodeChar));
// load decomposition data if not already done
if not DecompositionsLoaded then
LoadDecompositionData;
Result := nil;
// if the code is hangul then decomposition is algorithmically
if UnicodeIsHangul(Code) then
Result := UnicodeDecomposeHangul(Code)
else
begin
First := (Code shr 16) and $FF;
Second := (Code shr 8) and $FF;
Third := Code and $FF;
if Compatible then
begin
// Check first stage table whether there is a particular block and
// (if so) then whether there is a decomposition or not.
if (CompatibleDecompositions[First] = nil) or (CompatibleDecompositions[First, Second] = nil)
or (CompatibleDecompositions[First, Second, Third] = nil) then
begin
// if there is no compatibility decompositions try canonical
if (CanonicalDecompositions[First] = nil) or (CanonicalDecompositions[First, Second] = nil)
or (CanonicalDecompositions[First, Second, Third] = nil) then
Result := nil
else
Result := CanonicalDecompositions[First, Second, Third];
end
else
Result := CompatibleDecompositions[First, Second, Third];
end
else
begin
if (CanonicalDecompositions[First] = nil) or (CanonicalDecompositions[First, Second] = nil)
or (CanonicalDecompositions[First, Second, Third] = nil) then
Result := nil
else
Result := CanonicalDecompositions[First, Second, Third];
end;
end;
end;
//----------------- support for combining classes --------------------------------------------------
type
TClassArray = array of array of Byte;
var
// canonical combining classes, again as two stage matrix
CCCsLoaded: Boolean;
CCCs: array [Byte] of TClassArray;
procedure LoadCombiningClassData;
var
Stream: TJclEasyStream;
I, J, K, Size: Integer;
Buffer: TRangeArray;
First, Second, Third: Byte;
begin
// make sure no other code is currently modifying the global data area
LoadInProgress.Enter;
try
if not CCCsLoaded then
begin
CCCsLoaded := True;
Stream := OpenResourceStream('COMBINING');
try
while Stream.Position < Stream.Size do
begin
// a) determine which class is stored here
I := Stream.ReadInteger;
// b) determine how many ranges are assigned to this class
Size := Stream.ReadInteger;
// c) read start and stop code of each range
if Size > 0 then
begin
SetLength(Buffer, Size);
for J := 0 to Size - 1 do
begin
Buffer[J].Start := Stream.ReadInteger;
Buffer[J].Stop := Stream.ReadInteger;
end;
// d) put this class in every of the code points just loaded
for J := 0 to Size - 1 do
for K := Buffer[J].Start to Buffer[J].Stop do
begin
// (outchy) TODO: handle in a cleaner way
Assert(K < $1000000, LoadResString(@RsCombiningClassUnicodeChar));
First := (K shr 16) and $FF;
Second := (K shr 8) and $FF;
Third := K and $FF;
// add second step array if not yet done
if CCCs[First] = nil then
SetLength(CCCs[First], 256);
if CCCs[First, Second] = nil then
SetLength(CCCs[First, Second], 256);
CCCs[First, Second, Third] := I;
end;
end;
end;
finally
Stream.Free;
end;
end;
finally
LoadInProgress.Leave;
end;
end;
function CanonicalCombiningClass(Code: Cardinal): Cardinal;
var
First, Second, Third: Byte;
begin
Assert(Code < $1000000, LoadResString(@RsCombiningClassUnicodeChar));
// load combining class data if not already done
if not CCCsLoaded then
LoadCombiningClassData;
First := (Code shr 16) and $FF;
Second := (Code shr 8) and $FF;
Third := Code and $FF;
if (CCCs[First] <> nil) and (CCCs[First, Second] <> nil) then
Result := CCCs[First, Second, Third]
else
Result := 0;
end;
//----------------- support for numeric values -----------------------------------------------------
type
// structures for handling numbers
TCodeIndex = record
Code,
Index: Cardinal;
end;
var
// array to hold the number equivalents for specific codes
NumberCodes: array of TCodeIndex;
// array of numbers used in NumberCodes
Numbers: array of TUcNumber;
procedure LoadNumberData;
var
Stream: TJclEasyStream;
Size, I: Integer;
begin
// make sure no other code is currently modifying the global data area
LoadInProgress.Enter;
try
if NumberCodes = nil then
begin
Stream := OpenResourceStream('NUMBERS');
try
// Numbers are special (compared to other Unicode data) as they utilize two
// arrays, one containing all used numbers (in nominator-denominator format) and
// another one which maps a code point to one of the numbers in the first array.
// a) determine size of numbers array
Size := Stream.ReadInteger;
SetLength(Numbers, Size);
// b) read numbers data
for I := 0 to Size - 1 do
begin
Numbers[I].Numerator := Stream.ReadInteger;
Numbers[I].Denominator := Stream.ReadInteger;
end;
// c) determine size of index array
Size := Stream.ReadInteger;
SetLength(NumberCodes, Size);
// d) read index data
for I := 0 to Size - 1 do
begin
NumberCodes[I].Code := Stream.ReadInteger;
NumberCodes[I].Index := Stream.ReadInteger;
end;
finally
Stream.Free;
end;
end;
finally
LoadInProgress.Leave;
end;
end;
function UnicodeNumberLookup(Code: UCS4; var Number: TUcNumber): Boolean;
// Searches for the given code and returns its number equivalent (if there is one).
// Typical cases are: '1/6' (U+2159), '3/8' (U+215C), 'XII' (U+216B) etc.
// Result is set to True if the code could be found.
var
L, R, M: Integer;
begin
// load number data if not already done
if NumberCodes = nil then
LoadNumberData;
Result := False;
L := 0;
R := High(NumberCodes);
while L <= R do
begin
M := (L + R) shr 1;
if Code > NumberCodes[M].Code then
L := M + 1
else
begin
if Code < NumberCodes[M].Code then
R := M - 1
else
begin
Number := Numbers[NumberCodes[M].Index];
Result := True;
Break;
end;
end;
end;
end;
//----------------- support for composition --------------------------------------------------------
type
// maps between a pair of code points to a composite code point
// Note: the source pair is packed into one 4 byte value to speed up search.
TComposition = record
Code: Cardinal;
First: Cardinal;
Next: array of Cardinal;
end;
var
// list of composition mappings
Compositions: array of TComposition;
MaxCompositionSize: Integer;
procedure LoadCompositionData;
var
Stream: TJclEasyStream;
I, J, Size: Integer;
begin
// make sure no other code is currently modifying the global data area
LoadInProgress.Enter;
try
if Compositions = nil then
begin
Stream := OpenResourceStream('COMPOSITION');
try
// a) determine size of compositions array
Size := Stream.ReadInteger;
SetLength(Compositions, Size);
// b) read data
for I := 0 to Size - 1 do
begin
Compositions[I].Code := Stream.ReadInteger;
Size := Stream.ReadInteger;
if Size > MaxCompositionSize then
MaxCompositionSize := Size;
SetLength(Compositions[I].Next, Size - 1);
Compositions[I].First := Stream.ReadInteger;
for J := 0 to Size - 2 do
Compositions[I].Next[J] := Stream.ReadInteger;
end;
finally
Stream.Free;
end;
end;
finally
LoadInProgress.Leave;
end;
end;
function UnicodeCompose(const Codes: array of UCS4; out Composite: UCS4): Integer;
// Maps the sequence of Codes (up to MaxCompositionSize codes) to a composite
// Result is the number of Codes that were composed (at least 1 if Codes is not empty)
var
L, R, M, I, HighCodes, HighNext: Integer;
begin
if Compositions = nil then
LoadCompositionData;
Result := 0;
HighCodes := High(Codes);
if HighCodes = -1 then
Exit;
if HighCodes = 0 then
begin
Result := 1;
Composite := Codes[0];
Exit;
end;
L := 0;
R := High(Compositions);
while L <= R do
begin
M := (L + R) shr 1;
if Compositions[M].First > Codes[0] then
R := M - 1
else
if Compositions[M].First < Codes[0] then
L := M + 1
else
begin
// back to the first element where Codes[0] = First
while (M > 0) and (Compositions[M-1].First = Codes[0]) do
Dec(M);
while (M <= High(Compositions)) and (Compositions[M].First = Codes[0]) do
begin
HighNext := High(Compositions[M].Next);
Result := 0;
if HighNext < HighCodes then // enough characters in buffer to be tested
begin
for I := 0 to HighNext do
if Compositions[M].Next[I] = Codes[I + 1] then
Result := I + 2 { +1 for first, +1 because of 0-based array }
else
Break;
if Result = HighNext + 2 then // all codes matched
begin
Composite := Compositions[M].Code;
Exit;
end;
end;
Inc(M);
end;
Break;
end;
end;
Result := 1;
Composite := Codes[0];
end;
//=== { TSearchEngine } ======================================================
constructor TSearchEngine.Create(AOwner: TWideStrings);
begin
inherited Create;
FOwner := AOwner;
FResults := TList.Create;
end;
destructor TSearchEngine.Destroy;
begin
Clear;
FResults.Free;
inherited Destroy;
end;
procedure TSearchEngine.AddResult(Start, Stop: SizeInt);
begin
FResults.Add(Pointer(Start));
FResults.Add(Pointer(Stop));
end;
procedure TSearchEngine.Clear;
begin
ClearResults;
end;
procedure TSearchEngine.ClearResults;
begin
FResults.Clear;
end;
procedure TSearchEngine.DeleteResult(Index: SizeInt);
// explicitly deletes a search result
begin
with FResults do
begin
// start index
Delete(2 * Index);
// stop index
Delete(2 * Index);
end;
end;
function TSearchEngine.GetCount: SizeInt;
// returns the number of matches found
begin
Result := FResults.Count div 2;
end;
procedure TSearchEngine.GetResult(Index: SizeInt; var Start, Stop: SizeInt);
// returns the start position of a match (end position can be determined by
// adding the length of the pattern to the start position)
begin
Start := SizeInt(FResults[2 * Index]);
Stop := SizeInt(FResults[2 * Index + 1]);
end;
//----------------- TUTBSearch ---------------------------------------------------------------------
procedure TUTBMSearch.ClearPattern;
begin
FreeMem(FPattern);
FPattern := nil;
FFlags := [];
FPatternUsed := 0;
FPatternSize := 0;
FPatternLength := 0;
FreeMem(FSkipValues);
FSkipValues := nil;
FSkipsUsed := 0;
FMD4 := 0;
end;
function TUTBMSearch.GetSkipValue(TextStart, TextEnd: PUCS2): SizeInt;
// looks up the SkipValues value for a character
var
I: SizeInt;
C1,
C2: UCS4;
Sp: PUTBMSkip;
begin
Result := 0;
if TJclAddr(TextStart) < TJclAddr(TextEnd) then
begin
C1 := UCS4(TextStart^);
if (TextStart + 1) < TextEnd then
C2 := UCS4((TextStart + 1)^)
else
C2 := $FFFFFFFF;
if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and
(SurrogateLowStart <= C2) and (C2 <= $DDDD) then
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
Sp := FSkipValues;
for I := 0 to FSkipsUsed - 1 do
begin
if not (Boolean(C1 xor Sp.BMChar.UpCase) and
Boolean(C1 xor Sp.BMChar.LoCase) and
Boolean(C1 xor Sp.BMChar.TitleCase)) then
begin
if (TextEnd - TextStart) < Sp.SkipValues then
Result := TextEnd - TextStart
else
Result := Sp.SkipValues;
Exit;
end;
Inc(Sp);
end;
Result := FPatternLength;
end;
end;
function TUTBMSearch.Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: SizeInt): Boolean;
// Checks once whether the text at position Start (which points to the end of the
// current text part to be matched) matches.
// Note: If whole words only are allowed then the left and right border tests are
// done here too. The keypoint for the right border is that the next character
// after the search string is either the text end or a space character.
// For the left side this is similar, but there is nothing like a string
// start marker (like the string end marker #0).
//
// It seems not obvious, but we still can use the passed Text pointer to do
// the left check. Although this pointer might not point to the real string
// start (e.g. in TUTBMSearch.FindAll Text is incremented as needed) it is
// still a valid check mark. The reason is that Text either points to the
// real string start or a previous match (happend already, keep in mind the
// search options do not change in the FindAll loop) and the character just
// before Text is a space character.
// This fact implies, though, that strings passed to Find (or FindFirst,
// FindAll in TUTBMSearch) always really start at the given address. Although
// this might not be the case in some circumstances (e.g. if you pass only
// the selection from an editor) it is still assumed that a pattern matching
// from the first position on (from the search string start) also matches
// when whole words only are allowed.
var
CheckSpace: Boolean;
C1, C2: UCS4;
Count: SizeInt;
Cp: PUTBMChar;
begin
// be pessimistic
Result := False;
// set the potential match endpoint first
MatchEnd := (Start - Text) + 1;
C1 := UCS4(Start^);
if (Start + 1) < Stop then
C2 := UCS4((Start + 1)^)
else
C2 := $FFFFFFFF;
if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and
(SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then
begin
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
// Adjust the match end point to occur after the UTF-16 character.
Inc(MatchEnd);
end;
// check special cases
if FPatternUsed = 1 then
begin
MatchStart := Start - Text;
Result := True;
Exit;
end;
// Early out if entire words need to be matched and the next character
// in the search string is neither the string end nor a space character.
if (sfWholeWordOnly in FFlags) and
not ((Start + 1)^ = WideNull) and
not UnicodeIsWhiteSpace(UCS4((Start + 1)^)) then
Exit;
// compare backward
Cp := FPattern;
Inc(Cp, FPatternUsed - 1);
Count := FPatternLength;
while (Start >= Text) and (Count > 0) do
begin
// ignore non-spacing characters if indicated
if sfIgnoreNonSpacing in FFlags then
begin
while (Start > Text) and UnicodeIsNonSpacing(C1) do
begin
Dec(Start);
C2 := UCS4(Start^);
if (Start - 1) > Text then
C1 := UCS4((Start - 1)^)
else
C1 := $FFFFFFFF;
if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and
(SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then
begin
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
Dec(Start);
end
else
C1 := C2;
end;
end;
// handle space compression if indicated
if sfSpaceCompress in FFlags then
begin
CheckSpace := False;
while (Start > Text) and (UnicodeIsWhiteSpace(C1) or UnicodeIsControl(C1)) do
begin
CheckSpace := UnicodeIsWhiteSpace(C1);
Dec(Start);
C2 := UCS4(Start^);
if (Start - 1) > Text then
C1 := UCS4((Start - 1)^)
else
C1 := $FFFFFFFF;
if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and
(SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then
begin
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
Dec(Start);
end
else
C1 := C2;
end;
// Handle things if space compression was indicated and one or
// more member characters were found.
if CheckSpace then
begin
if Cp.UpCase <> $20 then
Exit;
Dec(Cp);
Dec(Count);
// If Count is 0 at this place then the space character(s) was the first
// in the pattern and we need to correct the start position.
if Count = 0 then
Inc(Start);
end;
end;
// handle the normal comparison cases
if (Count > 0) and
(Boolean(C1 xor Cp.UpCase) and
Boolean(C1 xor Cp.LoCase) and
Boolean(C1 xor Cp.TitleCase)) then
Exit;
if C1 >= $10000 then
Dec(Count, 2)
else
Dec(Count, 1);
if Count > 0 then
begin
Dec(Cp);
// get the next preceding character
if Start > Text then
begin
Dec(Start);
C2 := UCS4(Start^);
if (Start - 1) > Text then
C1 := UCS4((Start - 1)^)
else
C1 := $FFFFFFFF;
if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and
(SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then
begin
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
Dec(Start);
end
else
C1 := C2;
end;
end;
end;
// So far the string matched. Now check its left border for a space character
// if whole word only are allowed.
if not (sfWholeWordOnly in FFlags) or
(Start <= Text) or
UnicodeIsWhiteSpace(UCS4((Start - 1)^)) then
begin
// set the match start position
MatchStart := Start - Text;
Result := True;
end;
end;
procedure TUTBMSearch.Compile(Pattern: PUCS2; PatternLength: SizeInt; Flags: TSearchFlags);
var
HaveSpace: Boolean;
I, J, K,
SLen: SizeInt;
Cp: PUTBMChar;
Sp: PUTBMSkip;
C1, C2,
Sentinel: UCS4;
begin
if (Pattern <> nil) and (Pattern^ <> #0) and (PatternLength > 0) then
begin
// do some initialization
FFlags := Flags;
// extra skip flag
FMD4 := 1;
Sentinel := 0;
// allocate more storage if necessary
FPattern := AllocMem(SizeOf(TUTBMChar) * PatternLength);
FSkipValues := AllocMem(SizeOf(TUTBMSkip) * PatternLength);
FPatternSize := PatternLength;
// Preprocess the pattern to remove controls (if specified) and determine case.
Cp := FPattern;
I := 0;
HaveSpace := False;
while I < PatternLength do
begin
C1 := UCS4(Pattern[I]);
if (I + 1) < PatternLength then
C2 := UCS4(Pattern[I + 1])
else
C2 := $FFFFFFFF;
if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and
(SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then
C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF));
// Make sure the HaveSpace flag is turned off if the character is not an
// appropriate one.
if not UnicodeIsWhiteSpace(C1) then
HaveSpace := False;
// If non-spacing characters should be ignored, do it here.
if (sfIgnoreNonSpacing in Flags) and UnicodeIsNonSpacing(C1) then
begin
Inc(I);
Continue;
end;
// check if spaces and controls need to be compressed
if sfSpaceCompress in Flags then
begin
if UnicodeIsWhiteSpace(C1) then
begin
if not HaveSpace then
begin
// Add a space and set the flag.
Cp.UpCase := $20;
Cp.LoCase := $20;
Cp.TitleCase := $20;
Inc(Cp);
// increase the real pattern length
Inc(FPatternLength);
Sentinel := $20;
HaveSpace := True;
end;
Inc(I);
Continue;
end;
// ignore all control characters
if UnicodeIsControl(C1) then
begin
Inc(I);
Continue;
end;
end;
// add the character
if not (sfCaseSensitive in Flags) then
begin
{ TODO : use the entire mapping, not only the first character }
Cp.UpCase := UnicodeToUpper(C1)[0];
Cp.LoCase := UnicodeToLower(C1)[0];
Cp.TitleCase := UnicodeToTitle(C1)[0];
end
else
begin
Cp.UpCase := C1;
Cp.LoCase := C1;
Cp.TitleCase := C1;
end;
Sentinel := Cp.UpCase;
// move to the next character
Inc(Cp);
// increase the real pattern length appropriately
if C1 >= $10000 then
Inc(FPatternLength, 2)
else
Inc(FPatternLength);
// increment the loop index for UTF-16 characters
if C1 > $10000 then
Inc(I, 2)
else
Inc(I);
end;
// set the number of characters actually used
FPatternUsed := (TJclAddr(Cp) - TJclAddr(FPattern)) div SizeOf(TUTBMChar);
// Go through and construct the skip array and determine the actual length
// of the pattern in UCS2 terms.
SLen := FPatternLength - 1;
Cp := FPattern;
K := 0;
for I := 0 to FPatternUsed - 1 do
begin
// locate the character in the FSkipValues array
Sp := FSkipValues;
J := 0;
while (J < FSkipsUsed) and (Sp.BMChar.UpCase <> Cp.UpCase) do
begin
Inc(J);
Inc(Sp);
end;
// If the character is not found, set the new FSkipValues element and
// increase the number of FSkipValues elements.
if J = FSkipsUsed then
begin
Sp.BMChar := Cp;
Inc(FSkipsUsed);
end;
// Set the updated FSkipValues value. If the character is UTF-16 and is
// not the last one in the pattern, add one to its FSkipValues value.
Sp.SkipValues := SLen - K;
if (Cp.UpCase >= $10000) and ((K + 2) < SLen) then
Inc(Sp.SkipValues);
// set the new extra FSkipValues for the sentinel character
if ((Cp.UpCase >= $10000) and
((K + 2) <= SLen) or ((K + 1) <= SLen) and
(Cp.UpCase = Sentinel)) then
FMD4 := SLen - K;
// increase the actual index
if Cp.UpCase >= $10000 then
Inc(K, 2)
else
Inc(K);
Inc(Cp);
end;
end;
end;
function TUTBMSearch.Find(Text: PUCS2; TextLen: SizeInt; var MatchStart, MatchEnd: SizeInt): Boolean;
// this is the main matching routine using a tuned Boyer-Moore algorithm
var
K: SizeInt;
Start,
Stop: PUCS2;
begin
Result := False;
if (FPattern <> nil) and (FPatternUsed > 0) and (Text <> nil) and
(TextLen > 0) and (TextLen >= FPatternLength) then
begin
Start := Text + FPatternLength - 1;
Stop := Text + TextLen;
// adjust the start point if it points to a low surrogate
if (SurrogateLowStart <= UCS4(Start^)) and
(UCS4(Start^) <= SurrogateLowEnd) and
(SurrogateHighStart <= UCS4((Start - 1)^)) and
(UCS4((Start - 1)^) <= SurrogateHighEnd) then
Dec(Start);
while Start < Stop do
begin
repeat
K := GetSkipValue(Start, Stop);
if K = 0 then
Break;
Inc(Start, K);
if (Start < Stop) and
(SurrogateLowStart <= UCS4(Start^)) and
(UCS4(Start^) <= SurrogateLowEnd) and
(SurrogateHighStart <= UCS4((Start - 1)^)) and
(UCS4((Start - 1)^) <= SurrogateHighEnd) then
Dec(Start);
until False;
if (Start < Stop) and Match(Text, Start, Stop, MatchStart, MatchEnd) then
begin
Result := True;
Break;
end;
Inc(Start, FMD4);
if (Start < Stop) and
(SurrogateLowStart <= UCS4(Start^)) and
(UCS4(Start^) <= SurrogateLowEnd) and
(SurrogateHighStart <= UCS4((Start - 1)^)) and
(UCS4((Start - 1)^) <= SurrogateHighEnd) then
Dec(Start);
end;
end;
end;
procedure TUTBMSearch.Clear;
begin
ClearPattern;
inherited Clear;
end;
function TUTBMSearch.FindAll(const Text: WideString): Boolean;
begin
Result := FindAll(PWideChar(Text), Length(Text));
end;
function TUTBMSearch.FindAll(Text: PWideChar; TextLen: SizeInt): Boolean;
// Looks for all occurences of the pattern passed to FindPrepare and creates an
// internal list of their positions.
var
Start, Stop: SizeInt;
Run: PWideChar;
RunLen: SizeInt;
begin
ClearResults;
Run := Text;
RunLen := TextLen;
Start := 0;
Stop := 0;
// repeat to find all occurences of the pattern
while Find(Run, RunLen, Start, Stop) do
begin
// store this result (consider text pointer movement)...
AddResult(Start + Run - Text, Stop + Run - Text);
// ... and advance text position and length
Inc(Run, Stop);
Dec(RunLen, Stop);
end;
Result := Count > 0;
end;
function TUTBMSearch.FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean;
// Looks for the first occurence of the pattern passed to FindPrepare in Text and
// returns True if one could be found (in which case Start and Stop are set to
// the according indices) otherwise False. This function is in particular of
// interest if only one occurence needs to be found.
begin
ClearResults;
Result := Find(PWideChar(Text), Length(Text), Start, Stop);
if Result then
AddResult(Start, Stop);
end;
function TUTBMSearch.FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean;
// Same as the WideString version of this method.
begin
ClearResults;
Result := Find(Text, TextLen, Start, Stop);
if Result then
AddResult(Start, Stop);
end;
procedure TUTBMSearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags);
begin
FindPrepare(PWideChar(Pattern), Length(Pattern), Options);
end;
procedure TUTBMSearch.FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags);
// prepares following search by compiling the given pattern into an internal structure
begin
Compile(Pattern, PatternLength, Options);
end;
//----------------- Unicode RE search core ---------------------------------------------------------
const
// error codes
_URE_OK = 0;
_URE_UNEXPECTED_EOS = -1;
_URE_CCLASS_OPEN = -2;
_URE_UNBALANCED_GROUP = -3;
_URE_INVALID_PROPERTY = -4;
_URE_INVALID_RANGE = -5;
_URE_RANGE_OPEN = -6;
// options that can be combined for searching
URE_IGNORE_NONSPACING = $01;
URE_DONT_MATCHES_SEPARATORS = $02;
const
// Flags used internally in the DFA
_URE_DFA_CASEFOLD = $01;
_URE_DFA_BLANKLINE = $02;
// symbol types for the DFA
_URE_ANY_CHAR = 1;
_URE_CHAR = 2;
_URE_CCLASS = 3;
_URE_NCCLASS = 4;
_URE_BOL_ANCHOR = 5;
_URE_EOL_ANCHOR = 6;
// op codes for converting the NFA to a DFA
_URE_SYMBOL = 10;
_URE_PAREN = 11;
_URE_QUEST = 12;
_URE_STAR = 13;
_URE_PLUS = 14;
_URE_ONE = 15;
_URE_AND = 16;
_URE_OR = 17;
_URE_NOOP = $FFFF;
//----------------- TURESearch ---------------------------------------------------------------------
procedure TURESearch.Clear;
begin
inherited Clear;
ClearUREBuffer;
ClearDFA;
end;
procedure TURESearch.Push(V: SizeInt);
begin
with FUREBuffer do
begin
// If the 'Reducing' parameter is True, check to see if the value passed is
// already on the stack.
if Reducing and ExpressionList.Expressions[Word(V)].OnStack then
Exit;
if Stack.ListUsed = Length(Stack.List) then
SetLength(Stack.List, Length(Stack.List) + 8);
Stack.List[Stack.ListUsed] := V;
Inc(Stack.ListUsed);
// If the 'reducing' parameter is True, flag the element as being on the Stack.
if Reducing then
ExpressionList.Expressions[Word(V)].OnStack := True;
end;
end;
function TURESearch.Peek: SizeInt;
begin
if FUREBuffer.Stack.ListUsed = 0 then
Result := _URE_NOOP
else
Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed - 1];
end;
function TURESearch.Pop: SizeInt;
begin
if FUREBuffer.Stack.ListUsed = 0 then
Result := _URE_NOOP
else
begin
Dec(FUREBuffer.Stack.ListUsed);
Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed];
if FUREBuffer.Reducing then
FUREBuffer.ExpressionList.Expressions[Word(Result)].OnStack := False;
end;
end;
function TURESearch.ParsePropertyList(Properties: PUCS2; Limit: SizeInt;
var Categories: TCharacterCategories): SizeInt;
// Parse a comma-separated list of integers that represent character properties.
// Combine them into a set of categories and return the number of characters consumed.
var
N: SizeInt;
Run,
ListEnd: PUCS2;
begin
Run := Properties;
ListEnd := Run + Limit;
N := 0;
Categories := [];
while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do
begin
if Run^ = ',' then
begin
// Encountered a comma, so take the number parsed so far as category and
// reset the number.
Include(Categories, TCharacterCategory(N));
N := 0;
end
else
begin
if (Run^ >= '0') and (Run^ <= '9') then
begin
// Encountered a digit, so start or continue building the SizeInt that
// represents the character category.
N := (N * 10) + SizeInt(Word(Run^) - Ord('0'));
end
else
begin
// Encountered something that is not part of the property list.
// Indicate that we are done.
Break;
end;
end;
// If the number is to large then there is a problem.
// Most likely a missing comma separator.
if SizeInt(N) > Ord(High(TCharacterCategory)) then
FUREBuffer.Error := _URE_INVALID_PROPERTY;
Inc(Run);
end;
// Return the number of characters consumed.
Result := Run - Properties;
end;
function TURESearch.MakeHexNumber(NP: PUCS2; Limit: SizeInt; var Number: UCS4): SizeInt;
// Collect a hex number with 1 to 4 digits and return the number of characters used.
var
I: SizeInt;
Run,
ListEnd: PUCS2;
begin
Run := np;
ListEnd := Run + Limit;
Number := 0;
I := 0;
while (I < 4) and (Run < ListEnd) do
begin
if (Run^ >= '0') and (Run^ <= '9') then
Number := (Number shl 4) or (Cardinal(Word(Run^) - Ord('0')))
else
begin
if (Run^ >= 'A') and (Run^ <= 'F') then
Number := (Number shl 4) or (Cardinal(Word(Run^) - Ord('A')) + 10)
else
begin
if (Run^ >= 'a') and (Run^ <= 'f') then
Number := (Number shl 4) or (Cardinal(Word(Run^) - Ord('a')) + 10)
else
Break;
end;
end;
Inc(I);
Inc(Run);
end;
Result := Run - NP;
end;
procedure TURESearch.AddRange(var CCL: TUcCClass; Range: TUcRange);
// Insert a Range into a character class, removing duplicates and ordering them
// in increasing Range-start order.
var
I: SizeInt;
Temp: UCS4;
begin
// If the `Casefold' flag is set, then make sure both endpoints of the Range
// are converted to lower.
if (FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0 then
begin
{ TODO : use the entire mapping, not only the first character }
Range.MinCode := UnicodeToLower(Range.MinCode)[0];
Range.MaxCode := UnicodeToLower(Range.MaxCode)[0];
end;
// Swap the Range endpoints if they are not in increasing order.
if Range.MinCode > Range.MaxCode then
begin
Temp := Range.MinCode;
Range.MinCode := Range.MaxCode;
Range.MaxCode := Temp;
end;
I := 0;
while (I < CCL.RangesUsed) and (Range.MinCode < CCL.Ranges[I].MinCode) do
Inc(I);
// check for a duplicate
if (I < CCL.RangesUsed) and (Range.MinCode = CCL.Ranges[I].MinCode) and
(Range.MaxCode = CCL.Ranges[I].MaxCode) then
Exit;
if CCL.RangesUsed = Length(CCL.Ranges) then
SetLength(CCL.Ranges, Length(CCL.Ranges) + 8);
if I < CCL.RangesUsed then
Move(CCL.Ranges[I], CCL.Ranges[I + 1], SizeOf(TUcRange) * (CCL.RangesUsed - I));
CCL.Ranges[I].MinCode := Range.MinCode;
CCL.Ranges[I].MaxCode := Range.MaxCode;
Inc(CCL.RangesUsed);
end;
type
PTrie = ^TTrie;
TTrie = record
Key: UCS2;
Len,
Next: SizeInt;
Setup: SizeInt;
Categories: TCharacterCategories;
end;
procedure TURESearch.SpaceSetup(Symbol: PUcSymbolTableEntry; Categories: TCharacterCategories);
var
Range: TUcRange;
begin
Symbol.Categories := Symbol.Categories + Categories;
Range.MinCode := UCS4(WideTabulator);
Range.MaxCode := UCS4(WideTabulator);
AddRange(Symbol.Symbol.CCL, Range);
Range.MinCode := UCS4(WideCarriageReturn);
Range.MaxCode := UCS4(WideCarriageReturn);
AddRange(Symbol.Symbol.CCL, Range);
Range.MinCode := UCS4(WideLineFeed);
Range.MaxCode := UCS4(WideLineFeed);
AddRange(Symbol.Symbol.CCL, Range);
Range.MinCode := UCS4(WideFormFeed);
Range.MaxCode := UCS4(WideFormFeed);
AddRange(Symbol.Symbol.CCL, Range);
Range.MinCode := $FEFF;
Range.MaxCode := $FEFF;
AddRange(Symbol.Symbol.CCL, Range);
end;
procedure TURESearch.HexDigitSetup(Symbol: PUcSymbolTableEntry);
var
Range: TUcRange;
begin
Range.MinCode := UCS4('0');
Range.MaxCode := UCS4('9');
AddRange(Symbol.Symbol.CCL, Range);
Range.MinCode := UCS4('A');
Range.MaxCode := UCS4('F');
AddRange(Symbol.Symbol.CCL, Range);
Range.MinCode := UCS4('a');
Range.MaxCode := UCS4('f');
AddRange(Symbol.Symbol.CCL, Range);
end;
const
CClassTrie: array [0..64] of TTrie = (
(Key: #$003A; Len: 1; Next: 1; Setup: 0; Categories: []),
(Key: #$0061; Len: 9; Next: 10; Setup: 0; Categories: []),
(Key: #$0063; Len: 8; Next: 19; Setup: 0; Categories: []),
(Key: #$0064; Len: 7; Next: 24; Setup: 0; Categories: []),
(Key: #$0067; Len: 6; Next: 29; Setup: 0; Categories: []),
(Key: #$006C; Len: 5; Next: 34; Setup: 0; Categories: []),
(Key: #$0070; Len: 4; Next: 39; Setup: 0; Categories: []),
(Key: #$0073; Len: 3; Next: 49; Setup: 0; Categories: []),
(Key: #$0075; Len: 2; Next: 54; Setup: 0; Categories: []),
(Key: #$0078; Len: 1; Next: 59; Setup: 0; Categories: []),
(Key: #$006C; Len: 1; Next: 11; Setup: 0; Categories: []),
(Key: #$006E; Len: 2; Next: 13; Setup: 0; Categories: []),
(Key: #$0070; Len: 1; Next: 16; Setup: 0; Categories: []),
(Key: #$0075; Len: 1; Next: 14; Setup: 0; Categories: []),
(Key: #$006D; Len: 1; Next: 15; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 16; Setup: 1; Categories: ClassLetter + ClassNumber),
(Key: #$0068; Len: 1; Next: 17; Setup: 0; Categories: []),
(Key: #$0061; Len: 1; Next: 18; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 19; Setup: 1; Categories: ClassLetter),
(Key: #$006E; Len: 1; Next: 20; Setup: 0; Categories: []),
(Key: #$0074; Len: 1; Next: 21; Setup: 0; Categories: []),
(Key: #$0072; Len: 1; Next: 22; Setup: 0; Categories: []),
(Key: #$006C; Len: 1; Next: 23; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 24; Setup: 1; Categories: [ccOtherControl, ccOtherFormat]),
(Key: #$0069; Len: 1; Next: 25; Setup: 0; Categories: []),
(Key: #$0067; Len: 1; Next: 26; Setup: 0; Categories: []),
(Key: #$0069; Len: 1; Next: 27; Setup: 0; Categories: []),
(Key: #$0074; Len: 1; Next: 28; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 29; Setup: 1; Categories: ClassNumber),
(Key: #$0072; Len: 1; Next: 30; Setup: 0; Categories: []),
(Key: #$0061; Len: 1; Next: 31; Setup: 0; Categories: []),
(Key: #$0070; Len: 1; Next: 32; Setup: 0; Categories: []),
(Key: #$0068; Len: 1; Next: 33; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 34; Setup: 1; Categories: ClassMark + ClassNumber + ClassLetter + ClassPunctuation +
ClassSymbol),
(Key: #$006F; Len: 1; Next: 35; Setup: 0; Categories: []),
(Key: #$0077; Len: 1; Next: 36; Setup: 0; Categories: []),
(Key: #$0065; Len: 1; Next: 37; Setup: 0; Categories: []),
(Key: #$0072; Len: 1; Next: 38; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 39; Setup: 1; Categories: [ccLetterLowercase]),
(Key: #$0072; Len: 2; Next: 41; Setup: 0; Categories: []),
(Key: #$0075; Len: 1; Next: 45; Setup: 0; Categories: []),
(Key: #$0069; Len: 1; Next: 42; Setup: 0; Categories: []),
(Key: #$006E; Len: 1; Next: 43; Setup: 0; Categories: []),
(Key: #$0074; Len: 1; Next: 44; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 45; Setup: 1; Categories: ClassMark + ClassNumber + ClassLetter + ClassPunctuation +
ClassSymbol + [ccSeparatorSpace]),
(Key: #$006E; Len: 1; Next: 46; Setup: 0; Categories: []),
(Key: #$0063; Len: 1; Next: 47; Setup: 0; Categories: []),
(Key: #$0074; Len: 1; Next: 48; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 49; Setup: 1; Categories: ClassPunctuation),
(Key: #$0070; Len: 1; Next: 50; Setup: 0; Categories: []),
(Key: #$0061; Len: 1; Next: 51; Setup: 0; Categories: []),
(Key: #$0063; Len: 1; Next: 52; Setup: 0; Categories: []),
(Key: #$0065; Len: 1; Next: 53; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 54; Setup: 2; Categories: ClassSpace),
(Key: #$0070; Len: 1; Next: 55; Setup: 0; Categories: []),
(Key: #$0070; Len: 1; Next: 56; Setup: 0; Categories: []),
(Key: #$0065; Len: 1; Next: 57; Setup: 0; Categories: []),
(Key: #$0072; Len: 1; Next: 58; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 59; Setup: 1; Categories: [ccLetterUppercase]),
(Key: #$0064; Len: 1; Next: 60; Setup: 0; Categories: []),
(Key: #$0069; Len: 1; Next: 61; Setup: 0; Categories: []),
(Key: #$0067; Len: 1; Next: 62; Setup: 0; Categories: []),
(Key: #$0069; Len: 1; Next: 63; Setup: 0; Categories: []),
(Key: #$0074; Len: 1; Next: 64; Setup: 0; Categories: []),
(Key: #$003A; Len: 1; Next: 65; Setup: 3; Categories: [])
);
function TURESearch.PosixCCL(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt;
// Probe for one of the POSIX colon delimited character classes in the static trie.
var
I: SizeInt;
N: SizeInt;
TP: PTrie;
Run,
ListEnd: PUCS2;
begin
Result := 0;
// If the number of characters left is less than 7,
// then this cannot be interpreted as one of the colon delimited classes.
if Limit >= 7 then
begin
Run := cp;
ListEnd := Run + Limit;
TP := @CClassTrie[0];
I := 0;
while (Run < ListEnd) and (I < 8) do
begin
N := TP.Len;
while (N > 0) and (TP.Key <> Run^) do
begin
Inc(TP);
Dec(N);
end;
if N = 0 then
begin
Result := 0;
Exit;
end;
if (Run^ = ':') and ((I = 6) or (I = 7)) then
begin
Inc(Run);
Break;
end;
if (Run + 1) < ListEnd then
TP := @CClassTrie[TP.Next];
Inc(I);
Inc(Run);
end;
Result := Run - CP;
case TP.Setup of
1:
Symbol.Categories := Symbol.Categories + TP.Categories;
2:
SpaceSetup(Symbol, TP.Categories);
3:
HexDigitSetup(Symbol);
else
Result := 0;
end;
end;
end;
function TURESearch.BuildCharacterClass(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt;
// Construct a list of ranges and return the number of characters consumed.
var
RangeEnd: SizeInt;
N: SizeInt;
Run,
ListEnd: PUCS2;
C, Last: UCS4;
Range: TUcRange;
begin
Run := cp;
ListEnd := Run + Limit;
if Run^ = '^' then
begin
Symbol.AType := _URE_NCCLASS;
Inc(Run);
end
else
Symbol.AType := _URE_CCLASS;
Last := 0;
RangeEnd := 0;
while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do
begin
// Allow for the special case []abc], where the first closing bracket would end an empty
// character class, which makes no sense. Hence this bracket is treaded literally.
if (Run^ = ']') and (Symbol.Symbol.CCL.RangesUsed > 0) then
Break;
C := UCS4(Run^);
Inc(Run);
// escape character
if C = Ord('\') then
begin
if Run = ListEnd then
begin
// The EOS was encountered when expecting the reverse solidus to be followed by the character it is escaping.
// Set an Error code and return the number of characters consumed up to this point.
FUREBuffer.Error := _URE_UNEXPECTED_EOS;
Result := Run - CP;
Exit;
end;
C := UCS4(Run^);
Inc(Run);
case UCS2(C) of
'a':
C := $07;
'b':
C := $08;
'f':
C := $0C;
'n':
C := $0A;
'R':
C := $0D;
't':
C := $09;
'v':
C := $0B;
'p', 'P':
begin
Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Categories));
// Invert the bit mask of the properties if this is a negated character class or if 'P' is used to specify
// a list of character properties that should *not* match in a character class.
if C = Ord('P') then
Symbol.Categories := ClassAll - Symbol.Categories;
Continue;
end;
'x', 'X', 'u', 'U':
begin
if (Run < ListEnd) and
((Run^ >= '0') and (Run^ <= '9') or
(Run^ >= 'A') and (Run^ <= 'F') or
(Run^ >= 'a') and (Run^ <= 'f')) then
Inc(Run, MakeHexNumber(Run, ListEnd - Run, C));
end;
end;
end
else
begin
if C = Ord(':') then
begin
// Probe for a POSIX colon delimited character class.
Dec(Run);
N := PosixCCL(Run, ListEnd - Run, Symbol);
if N = 0 then
Inc(Run)
else
begin
Inc(Run, N);
Continue;
end;
end;
end;
// Check to see if the current character is a low surrogate that needs
// to be combined with a preceding high surrogate.
if Last <> 0 then
begin
if (C >= SurrogateLowStart) and (C <= SurrogateLowEnd) then
begin
// Construct the UTF16 character code.
C := $10000 + (((Last and $03FF) shl 10) or (C and $03FF))
end
else
begin
// Add the isolated high surrogate to the range.
if RangeEnd = 1 then
Range.MaxCode := Last and $FFFF
else
begin
Range.MinCode := Last and $FFFF;
Range.MaxCode := Last and $FFFF;
end;
AddRange(Symbol.Symbol.CCL, Range);
RangeEnd := 0;
end;
end;
// Clear the Last character code.
Last := 0;
// This slightly awkward code handles the different cases needed to construct a range.
if (C >= SurrogateHighStart) and (C <= SurrogateHighEnd) then
begin
// If the high surrogate is followed by a Range indicator, simply add it as the Range start. Otherwise,
// save it in the next character is a low surrogate.
if Run^ = '-' then
begin
Inc(Run);
Range.MinCode := C;
RangeEnd := 1;
end
else
Last := C;
end
else
begin
if RangeEnd = 1 then
begin
Range.MaxCode := C;
AddRange(Symbol.Symbol.CCL, Range);
RangeEnd := 0;
end
else
begin
Range.MinCode := C;
Range.MaxCode := C;
if Run^ = '-' then
begin
Inc(Run);
RangeEnd := 1;
end
else
AddRange(Symbol.Symbol.CCL, Range);
end;
end;
end;
if (Run < ListEnd) and (Run^ = ']') then
Inc(Run)
else
begin
// The parse was not terminated by the character class close symbol (']'), so set an error code.
FUREBuffer.Error := _URE_CCLASS_OPEN;
end;
Result := Run - CP;
end;
function TURESearch.ProbeLowSurrogate(LeftState: PUCS2; Limit: SizeInt; var Code: UCS4): SizeInt;
// probes for a low surrogate hex code
var
I: SizeInt;
Run,
ListEnd: PUCS2;
begin
I := 0;
Code := 0;
Run := LeftState;
ListEnd := Run + Limit;
while (I < 4) and (Run < ListEnd) do
begin
if (Run^ >= '0') and (Run^ <= '9') then
Code := (Code shl 4) or (Cardinal(Word(Run^) - Ord('0')))
else
begin
if (Run^ >= 'A') and (Run^ <= 'F') then
Code := (Code shl 4) or (Cardinal(Word(Run^) - Ord('A')) + 10)
else
begin
if (Run^ >= 'a') and (Run^ <= 'f') then
Code := (Code shl 4) or (Cardinal(Word(Run^) - Ord('a')) + 10)
else
Break;
end;
end;
Inc(Run);
end;
if (SurrogateLowStart <= Code) and (Code <= SurrogateLowEnd) then
Result := Run - LeftState
else
Result := 0;
end;
function TURESearch.CompileSymbol(S: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt;
var
C: UCS4;
Run,
ListEnd: PUCS2;
begin
Run := S;
ListEnd := S + Limit;
C := UCS4(Run^);
Inc(Run);
if C = Ord('\') then
begin
if Run = ListEnd then
begin
// The EOS was encountered when expecting the reverse solidus to be followed
// by the character it is escaping. Set an Error code and return the number
// of characters consumed up to this point.
FUREBuffer.Error := _URE_UNEXPECTED_EOS;
Result := Run - S;
Exit;
end;
C := UCS4(Run^);
Inc(Run);
case UCS2(C) of
'p', 'P':
begin
if UCS2(C) = 'p' then
Symbol.AType :=_URE_CCLASS
else
Symbol.AType :=_URE_NCCLASS;
Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Categories));
end;
'a':
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := $07;
end;
'b':
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := $08;
end;
'f':
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := $0C;
end;
'n':
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := $0A;
end;
'r':
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := $0D;
end;
't':
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := $09;
end;
'v':
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := $0B;
end;
else
case UCS2(C) of
'x', 'X', 'u', 'U':
begin
// Collect between 1 and 4 digits representing an UCS2 code.
if (Run < ListEnd) and
((Run^ >= '0') and (Run^ <= '9') or
(Run^ >= 'A') and (Run^ <= 'F') or
(Run^ >= 'a') and (Run^ <= 'f')) then
Inc(Run, MakeHexNumber(Run, ListEnd - Run, C));
end;
end;
// Simply add an escaped character here.
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := C;
end;
end
else
begin
if (UCS2(C) = '^') or (UCS2(C) = '$') then
begin
// Handle the BOL and EOL anchors. This actually consists simply of setting
// a flag that indicates that the user supplied anchor match function should
// be called. This needs to be done instead of simply matching line/paragraph
// separators because beginning-of-text and end-of-text tests are needed as well.
if UCS2(C) = '^' then
Symbol.AType := _URE_BOL_ANCHOR
else
Symbol.AType := _URE_EOL_ANCHOR;
end
else
begin
if UCS2(C) = '[' then
begin
// construct a character class
Inc(Run, BuildCharacterClass(Run, ListEnd - Run, Symbol));
end
else
begin
if UCS2(C) = '.' then
Symbol.AType := _URE_ANY_CHAR
else
begin
Symbol.AType := _URE_CHAR;
Symbol.Symbol.Chr := C;
end;
end;
end;
end;
// If the symbol type happens to be a character and is a high surrogate, then
// probe forward to see if it is followed by a low surrogate that needs to be added.
if (Run < ListEnd) and
(Symbol.AType = _URE_CHAR) and
(SurrogateHighStart <= Symbol.Symbol.Chr) and
(Symbol.Symbol.Chr <= SurrogateHighEnd) then
begin
if (SurrogateLowStart <= UCS4(Run^)) and
(UCS4(Run^) <= SurrogateLowEnd) then
begin
Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (UCS4(Run^) and $03FF));
Inc(Run);
end
else
begin
if (Run^ = '\') and (((Run + 1)^ = 'x') or ((Run + 1)^ = 'X') or
((Run + 1)^ = 'u') or ((Run + 1)^ = 'U')) then
begin
Inc(Run, ProbeLowSurrogate(Run + 2, ListEnd - (Run + 2), C));
if (SurrogateLowStart <= C) and (C <= SurrogateLowEnd) then
begin
// Take into account the \[xu] in front of the hex code.
Inc(Run, 2);
Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (C and $03FF));
end;
end;
end;
end;
// Last, make sure any _URE_CHAR type symbols are changed to lower if the
// 'Casefold' flag is set.
{ TODO : use the entire mapping, not only the first character and use the
case fold abilities of the unit. }
if ((FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0) and (Symbol.AType = _URE_CHAR) then
Symbol.Symbol.Chr := UnicodeToLower(Symbol.Symbol.Chr)[0];
// If the symbol constructed is anything other than one of the anchors,
// make sure the _URE_DFA_BLANKLINE flag is removed.
if (Symbol.AType <> _URE_BOL_ANCHOR) and (Symbol.AType <> _URE_EOL_ANCHOR) then
FUREBuffer.Flags := FUREBuffer.Flags and not _URE_DFA_BLANKLINE;
// Return the number of characters consumed.
Result := Run - S;
end;
function TURESearch.SymbolsAreDifferent(A, B: PUcSymbolTableEntry): Boolean;
begin
Result := False;
if (A.AType <> B.AType) or (A.Mods <> B.Mods) or (A.Categories <> B.Categories) then
Result := True
else
begin
if (A.AType = _URE_CCLASS) or (A.AType = _URE_NCCLASS) then
begin
if A.Symbol.CCL.RangesUsed <> B.Symbol.CCL.RangesUsed then
Result := True
else
begin
if (A.Symbol.CCL.RangesUsed > 0) and
not CompareMem(@A.Symbol.CCL.Ranges[0], @B.Symbol.CCL.Ranges[0],
SizeOf(TUcRange) * A.Symbol.CCL.RangesUsed) then
Result := True;;
end;
end
else
begin
if (A.AType = _URE_CHAR) and (A.Symbol.Chr <> B.Symbol.Chr) then
Result := True;
end;
end;
end;
function TURESearch.MakeSymbol(S: PUCS2; Limit: SizeInt; out Consumed: SizeInt): SizeInt;
// constructs a symbol, but only keep unique symbols
var
I: SizeInt;
Start: PUcSymbolTableEntry;
Symbol: TUcSymbolTableEntry;
begin
// Build the next symbol so we can test to see if it is already in the symbol table.
ResetMemory(Symbol, SizeOf(TUcSymbolTableEntry));
Consumed := CompileSymbol(S, Limit, @Symbol);
// Check to see if the symbol exists.
I := 0;
Start := @FUREBuffer.SymbolTable.Symbols[0];
while (I < FUREBuffer.SymbolTable.SymbolsUsed) and SymbolsAreDifferent(@Symbol, Start) do
begin
Inc(I);
Inc(Start);
end;
if I < FUREBuffer.SymbolTable.SymbolsUsed then
begin
// Free up any ranges used for the symbol.
if (Symbol.AType = _URE_CCLASS) or (Symbol.AType = _URE_NCCLASS) then
Symbol.Symbol.CCL.Ranges := nil;
Result := FUREBuffer.SymbolTable.Symbols[I].ID;
Exit;
end;
// Need to add the new symbol.
if FUREBuffer.SymbolTable.SymbolsUsed = Length(FUREBuffer.SymbolTable.Symbols) then
begin
SetLength(FUREBuffer.SymbolTable.Symbols, Length(FUREBuffer.SymbolTable.Symbols) + 8);
end;
Symbol.ID := FUREBuffer.SymbolTable.SymbolsUsed;
Inc(FUREBuffer.SymbolTable.SymbolsUsed);
FUREBuffer.SymbolTable.Symbols[Symbol.ID] := Symbol;
Result := Symbol.ID;
end;
function TURESearch.MakeExpression(AType, LHS, RHS: SizeInt): SizeInt;
var
I: SizeInt;
begin
// Determine if the expression already exists or not.
with FUREBuffer.ExpressionList do
begin
for I := 0 to ExpressionsUsed - 1 do
begin
if (Expressions[I].AType = AType) and
(Expressions[I].LHS = LHS) and
(Expressions[I].RHS = RHS) then
begin
Result := I;
Exit;
end;
end;
// Need to add a new expression.
if ExpressionsUsed = Length(Expressions) then
SetLength(Expressions, Length(Expressions) + 8);
Expressions[ExpressionsUsed].OnStack := False;
Expressions[ExpressionsUsed].AType := AType;
Expressions[ExpressionsUsed].LHS := LHS;
Expressions[ExpressionsUsed].RHS := RHS;
Result := ExpressionsUsed;
Inc(ExpressionsUsed);
end;
end;
function IsSpecial(C: Word): Boolean;
begin
case C of
Word('+'),
Word('*'),
Word('?'),
Word('{'),
Word('|'),
Word(')'):
Result := True;
else
Result := False;
end;
end;
procedure TURESearch.CollectPendingOperations(var State: SizeInt);
// collects all pending AND and OR operations and make corresponding expressions
var
Operation: SizeInt;
begin
repeat
Operation := Peek;
if (Operation <> _URE_AND) and (Operation <> _URE_OR) then
Break;
// make an expression with the AND or OR operator and its right hand side
Operation := Pop;
State := MakeExpression(Operation, Pop, State);
until False;
end;
function TURESearch.ConvertRegExpToNFA(RE: PWideChar; RELength: SizeInt): SizeInt;
// Converts the regular expression into an NFA in a form that will be easy to
// reduce to a DFA. The starting state for the reduction will be returned.
var
C: UCS2;
Head, Tail: PUCS2;
S: WideString;
Symbol,
State,
LastState,
Used,
M, N: SizeInt;
I: SizeInt;
begin
State := _URE_NOOP;
Head := RE;
Tail := Head + RELength;
while (FUREBuffer.Error = _URE_OK) and (Head < Tail) do
begin
C := Head^;
Inc(Head);
case C of
'(':
Push(_URE_PAREN);
')': // check for the case of too many close parentheses
begin
if Peek = _URE_NOOP then
begin
FUREBuffer.Error := _URE_UNBALANCED_GROUP;
Break;
end;
CollectPendingOperations(State);
// remove the _URE_PAREN off the stack
Pop;
end;
'*':
State := MakeExpression(_URE_STAR, State, _URE_NOOP);
'+':
State := MakeExpression(_URE_PLUS, State, _URE_NOOP);
'?':
State := MakeExpression(_URE_QUEST, State, _URE_NOOP);
'|':
begin
CollectPendingOperations(State);
Push(State);
Push(_URE_OR);
end;
'{': // expressions of the form {m, n}
begin
C := #0;
M := 0;
N := 0;
// get first number
while UnicodeIsWhiteSpace(UCS4(Head^)) do
Inc(Head);
// very slow implementation
S := '';
while (Head^ >= WideChar('0')) and (Head^ <= WideChar('9')) do
begin
S := S + Head^;
Inc(Head);
end;
if S <> '' then
M := StrToInt(S);
while UnicodeIsWhiteSpace(UCS4(Head^)) do
Inc(Head);
if (Head^ <> ',') and (Head^ <> '}') then
begin
FUREBuffer.Error := _URE_INVALID_RANGE;
Break;
end;
// check for an upper limit
if Head^ <> '}' then
begin
Inc(Head);
// get second number
while UnicodeIsWhiteSpace(UCS4(Head^)) do
Inc(Head);
// very slow implementation
S := '';
while (Head^ >= WideChar('0')) and (Head^ <= WideChar('9')) do
begin
S := S + Head^;
Inc(Head);
end;
if S <> '' then
N := StrToInt(S);
end
else
N := M;
if Head^ <> '}' then
begin
FUREBuffer.Error := _URE_RANGE_OPEN;
Break;
end
else
Inc(Head);
// N = 0 means unlimited number of occurences
if N = 0 then
begin
case M of
0: // {,} {0,} {0, 0} mean the same as the star operator
State := MakeExpression(_URE_STAR, State, _URE_NOOP);
1: // {1,} {1, 0} mean the same as the plus operator
State := MakeExpression(_URE_PLUS, State, _URE_NOOP);
else
begin
// encapsulate the expanded branches as would they be in parenthesis
// in order to avoid unwanted concatenation with pending operations/symbols
Push(_URE_PAREN);
// {m,} {m, 0} mean M fixed occurences plus star operator
// make E^m...
for I := 1 to M - 1 do
begin
Push(State);
Push(_URE_AND);
end;
// ...and repeat the last symbol one or more times
State := MakeExpression(_URE_PLUS, State, _URE_NOOP);
CollectPendingOperations(State);
Pop;
end;
end;
end
else
begin
// check proper range limits
if M > N then
begin
FUREBuffer.Error := _URE_INVALID_RANGE;
Break;
end;
// check special case {0, 1} (which corresponds to the ? operator)
if (M = 0) and (N = 1) then
State := MakeExpression(_URE_QUEST, State, _URE_NOOP)
else
begin
// handle the general case by expanding {m, n} into the equivalent
// expression E^m | E^(m + 1) | ... | E^n
// encapsulate the expanded branches as would they be in parenthesis
// in order to avoid unwanted concatenation with pending operations/symbols
Push(_URE_PAREN);
// keep initial state as this is the one all alternatives start from
LastState := State;
// Consider the special case M = 0 first. Because there's no construct
// to enter a pure epsilon-transition into the expression array I
// work around with the question mark operator to describe the first
// and second branch alternative.
if M = 0 then
begin
State := MakeExpression(_URE_QUEST, State, _URE_NOOP);
Inc(M, 2);
// Mark the pending OR operation (there must always follow at
// least on more alternative because the special case {0, 1} has
// already been handled).
Push(State);
Push(_URE_OR);
end;
while M <= N do
begin
State := LastState;
// create E^M
for I := 1 to SizeInt(M) - 1 do
begin
Push(State);
Push(_URE_AND);
end;
// finish the branch and mark it as pending OR operation if it
// isn't the last one
CollectPendingOperations(State);
if M < N then
begin
Push(State);
Push(_URE_OR);
end;
Inc(M);
end;
// remove the _URE_PAREN off the stack
Pop;
end;
end;
end;
else
Dec(Head);
Symbol := MakeSymbol(Head, Tail - Head, Used);
Inc(Head, Used);
State := MakeExpression(_URE_SYMBOL, Symbol, _URE_NOOP);
end;
if (C <> '(') and (C <> '|') and (C <> '{') and (Head < Tail) and
(not IsSpecial(Word(Head^)) or (Head^ = '(')) then
begin
Push(State);
Push(_URE_AND);
end;
end;
CollectPendingOperations(State);
if FUREBuffer.Stack.ListUsed > 0 then
FUREBuffer.Error := _URE_UNBALANCED_GROUP;
if FUREBuffer.Error = _URE_OK then
Result := State
else
Result := _URE_NOOP;
end;
procedure TURESearch.AddSymbolState(Symbol, State: SizeInt);
var
I, J: SizeInt;
Found: Boolean;
begin
// Locate the symbol in the symbol table so the state can be added.
// If the symbol doesn't exist, then we are in serious trouble.
with FUREBuffer.SymbolTable do
begin
I := 0;
while (I < SymbolsUsed) and (Symbol <> Symbols[I].ID) do
Inc(I);
Assert(I < SymbolsUsed);
end;
// Now find out if the state exists in the symbol's state list.
with FUREBuffer.SymbolTable.Symbols[I].States do
begin
Found := False;
for J := 0 to ListUsed - 1 do
begin
if State <= List[J] then
begin
Found := True;
Break;
end;
end;
if not Found then
J := ListUsed;
if not Found or (State < List[J]) then
begin
// Need to add the state in order.
if ListUsed = Length(List) then
SetLength(List, Length(List) + 8);
if J < ListUsed then
Move(List[J], List[J + 1], SizeOf(SizeInt) * (ListUsed - J));
List[J] := State;
Inc(ListUsed);
end;
end;
end;
function TURESearch.AddState(NewStates: array of SizeInt): SizeInt;
var
I: SizeInt;
Found: Boolean;
begin
Found := False;
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
begin
if (FUREBuffer.States.States[I].StateList.ListUsed = Length(NewStates)) and
CompareMem(@NewStates[0], @FUREBuffer.States.States[I].StateList.List[0],
SizeOf(SizeInt) * Length(NewStates)) then
begin
Found := True;
Break;
end;
end;
if not Found then
begin
// Need to add a new DFA State (set of NFA states).
if FUREBuffer.States.StatesUsed = Length(FUREBuffer.States.States) then
SetLength(FUREBuffer.States.States, Length(FUREBuffer.States.States) + 8);
with FUREBuffer.States.States[FUREBuffer.States.StatesUsed] do
begin
ID := FUREBuffer.States.StatesUsed;
if (StateList.ListUsed + Length(NewStates)) >= Length(StateList.List) then
SetLength(StateList.List, Length(StateList.List) + Length(NewStates) + 8);
Move(NewStates[0], StateList.List[StateList.ListUsed], SizeOf(SizeInt) * Length(NewStates));
Inc(StateList.ListUsed, Length(NewStates));
end;
Inc(FUREBuffer.States.StatesUsed);
end;
// Return the ID of the DFA state representing a group of NFA States.
if Found then
Result := I
else
Result := FUREBuffer.States.StatesUsed - 1;
end;
procedure TURESearch.Reduce(Start: SizeInt);
var
I, J,
Symbols: SizeInt;
State,
RHS,
s1, s2,
ns1, ns2: SizeInt;
Evaluating: Boolean;
begin
FUREBuffer.Reducing := True;
// Add the starting state for the reduction.
AddState([Start]);
// Process each set of NFA states that get created.
I := 0;
// further states are added in the loop
while I < FUREBuffer.States.StatesUsed do
begin
with FUREBuffer, States.States[I], ExpressionList do
begin
// Push the current states on the stack.
for J := 0 to StateList.ListUsed - 1 do
Push(StateList.List[J]);
// Reduce the NFA states.
Accepting := False;
Symbols := 0;
J := 0;
// need a while loop here as the stack will be modified within the loop and
// so also its usage count used to terminate the loop
while J < FUREBuffer.Stack.ListUsed do
begin
State := FUREBuffer.Stack.List[J];
Evaluating := True;
// This inner loop is the iterative equivalent of recursively
// reducing subexpressions generated as a result of a reduction.
while Evaluating do
begin
case Expressions[State].AType of
_URE_SYMBOL:
begin
ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP);
AddSymbolState(Expressions[State].LHS, ns1);
Inc(Symbols);
Evaluating := False;
end;
_URE_ONE:
begin
Accepting := True;
Evaluating := False;
end;
_URE_QUEST:
begin
s1 := Expressions[State].LHS;
ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP);
State := MakeExpression(_URE_OR, ns1, s1);
end;
_URE_PLUS:
begin
s1 := Expressions[State].LHS;
ns1 := MakeExpression(_URE_STAR, s1, _URE_NOOP);
State := MakeExpression(_URE_AND, s1, ns1);
end;
_URE_STAR:
begin
s1 := Expressions[State].LHS;
ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP);
ns2 := MakeExpression(_URE_PLUS, s1, _URE_NOOP);
State := MakeExpression(_URE_OR, ns1, ns2);
end;
_URE_OR:
begin
s1 := Expressions[State].LHS;
s2 := Expressions[State].RHS;
Push(s1);
Push(s2);
Evaluating := False;
end;
_URE_AND:
begin
s1 := Expressions[State].LHS;
s2 := Expressions[State].RHS;
case Expressions[s1].AType of
_URE_SYMBOL:
begin
AddSymbolState(Expressions[s1].LHS, s2);
Inc(Symbols);
Evaluating := False;
end;
_URE_ONE:
State := s2;
_URE_QUEST:
begin
ns1 := Expressions[s1].LHS;
ns2 := MakeExpression(_URE_AND, ns1, s2);
State := MakeExpression(_URE_OR, s2, ns2);
end;
_URE_PLUS:
begin
ns1 := Expressions[s1].LHS;
ns2 := MakeExpression(_URE_OR, s2, State);
State := MakeExpression(_URE_AND, ns1, ns2);
end;
_URE_STAR:
begin
ns1 := Expressions[s1].LHS;
ns2 := MakeExpression(_URE_AND, ns1, State);
State := MakeExpression(_URE_OR, s2, ns2);
end;
_URE_OR:
begin
ns1 := Expressions[s1].LHS;
ns2 := Expressions[s1].RHS;
ns1 := MakeExpression(_URE_AND, ns1, s2);
ns2 := MakeExpression(_URE_AND, ns2, s2);
State := MakeExpression(_URE_OR, ns1, ns2);
end;
_URE_AND:
begin
ns1 := Expressions[s1].LHS;
ns2 := Expressions[s1].RHS;
ns2 := MakeExpression(_URE_AND, ns2, s2);
State := MakeExpression(_URE_AND, ns1, ns2);
end;
end;
end;
end;
end;
Inc(J);
end;
// clear the state stack
while Pop <> _URE_NOOP do
{ nothing };
// generate the DFA states for the symbols collected during the current reduction
if (TransitionsUsed + Symbols) > Length(Transitions) then
SetLength(Transitions, Length(Transitions) + Symbols);
// go through the symbol table and generate the DFA state transitions for
// each symbol that has collected NFA states
Symbols := 0;
J := 0;
while J < FUREBuffer.SymbolTable.SymbolsUsed do
begin
begin
if FUREBuffer.SymbolTable.Symbols[J].States.ListUsed > 0 then
begin
Transitions[Symbols].LHS := FUREBuffer.SymbolTable.Symbols[J].ID;
with FUREBuffer.SymbolTable.Symbols[J] do
begin
RHS := AddState(Copy(States.List, 0, States.ListUsed));
States.ListUsed := 0;
end;
Transitions[Symbols].RHS := RHS;
Inc(Symbols);
end;
end;
Inc(J);
end;
// set the number of transitions actually used
// Note: we need again to qualify a part of the TransistionsUsed path since the
// state array could be reallocated in the AddState call above and the
// with ... do will then be invalid.
States.States[I].TransitionsUsed := Symbols;
end;
Inc(I);
end;
FUREBuffer.Reducing := False;
end;
procedure TURESearch.AddEquivalentPair(L, R: SizeInt);
var
I: SizeInt;
begin
L := FUREBuffer.States.States[L].ID;
R := FUREBuffer.States.States[R].ID;
if L <> R then
begin
if L > R then
begin
I := L;
L := R;
R := I;
end;
// Check to see if the equivalence pair already exists.
I := 0;
with FUREBuffer.EquivalentList do
begin
while (I < EquivalentsUsed) and
((Equivalents[I].Left <> L) or (Equivalents[I].Right <> R)) do
Inc(I);
if I >= EquivalentsUsed then
begin
if EquivalentsUsed = Length(Equivalents) then
SetLength(Equivalents, Length(Equivalents) + 8);
Equivalents[EquivalentsUsed].Left := L;
Equivalents[EquivalentsUsed].Right := R;
Inc(EquivalentsUsed);
end;
end;
end;
end;
procedure TURESearch.MergeEquivalents;
// merges the DFA states that are equivalent
var
I, J, K,
Equal: SizeInt;
Done: Boolean;
State1, State2,
LeftState,
RightState: PUcState;
begin
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
begin
State1 := @FUREBuffer.States.States[I];
if State1.ID = SizeInt(I) then
begin
J := 0;
while J < I do
begin
State2 := @FUREBuffer.States.States[J];
if State2.ID = SizeInt(J) then
begin
FUREBuffer.EquivalentList.EquivalentsUsed := 0;
AddEquivalentPair(I, J);
Done := False;
Equal := 0;
while Equal < FUREBuffer.EquivalentList.EquivalentsUsed do
begin
LeftState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Left];
RightState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Right];
if (LeftState.Accepting <> RightState.Accepting) or
(LeftState.TransitionsUsed <> RightState.TransitionsUsed) then
begin
Done := True;
Break;
end;
K := 0;
while (K < LeftState.TransitionsUsed) and
(LeftState.Transitions[K].LHS = RightState.Transitions[K].LHS) do
Inc(K);
if K < LeftState.TransitionsUsed then
begin
Done := True;
Break;
end;
for K := 0 to LeftState.TransitionsUsed - 1 do
AddEquivalentPair(LeftState.Transitions[K].RHS, RightState.Transitions[K].RHS);
Inc(Equal);
end;
if not Done then
Break;
end;
Inc(J);
end;
if J < I then
begin
with FUREBuffer do
begin
for Equal := 0 to EquivalentList.EquivalentsUsed - 1 do
begin
States.States[EquivalentList.Equivalents[Equal].Right].ID :=
States.States[EquivalentList.Equivalents[Equal].Left].ID;
end;
end;
end;
end;
end;
// Renumber the states appropriately
State1 := @FUREBuffer.States.States[0];
Equal := 0;
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
begin
if State1.ID = SizeInt(I) then
begin
State1.ID := Equal;
Inc(Equal);
end
else
State1.ID := FUREBuffer.States.States[State1.ID].ID;
Inc(State1);
end;
end;
procedure TURESearch.ClearUREBuffer;
var
I: SizeInt;
begin
with FUREBuffer do
begin
// quite a few dynamic arrays to free
Stack.List := nil;
ExpressionList.Expressions := nil;
// the symbol table has been handed over to the DFA and will be freed on
// release of the DFA
SymbolTable.SymbolsUsed := 0;
for I := 0 to States.StatesUsed - 1 do
begin
States.States[I].Transitions := nil;
States.States[I].StateList.List := nil;
States.States[I].StateList.ListUsed := 0;
States.States[I].TransitionsUsed := 0;
end;
States.StatesUsed := 0;
States.States := nil;
EquivalentList.Equivalents := nil;
end;
ResetMemory(FUREBuffer, SizeOf(FUREBuffer));
end;
procedure TURESearch.CompileURE(RE: PWideChar; RELength: SizeInt; Casefold: Boolean);
var
I, J: SizeInt;
State: SizeInt;
Run: PUcState;
TP: SizeInt;
procedure UREError(Text: string; RE: PWideChar);
var
S: string;
begin
S := RE;
raise EJclUnicodeError.CreateResFmt(@RsUREErrorFmt, [LoadResString(@RsUREBaseString), Text, S]);
end;
begin
// be paranoid
if (RE <> nil) and (RE^ <> WideNull) and (RELength > 0) then
begin
// Reset the various fields of the compilation buffer. Default the Flags
// to indicate the presense of the "^$" pattern. If any other pattern
// occurs, then this flag will be removed. This is done to catch this
// special pattern and handle it specially when matching.
ClearUREBuffer;
ClearDFA;
FUREBuffer.Flags := _URE_DFA_BLANKLINE;
if Casefold then
FUREBuffer.Flags := FUREBuffer.Flags or _URE_DFA_CASEFOLD;
// Construct the NFA. If this stage returns a 0, then an error occured or an
// empty expression was passed.
State := ConvertRegExpToNFA(RE, RELength);
if State <> _URE_NOOP then
begin
// Do the expression reduction to get the initial DFA.
Reduce(State);
// Merge all the equivalent DFA States.
MergeEquivalents;
// Construct the minimal DFA.
FDFA.Flags := FUREBuffer.Flags and (_URE_DFA_CASEFOLD or _URE_DFA_BLANKLINE);
// Free up the NFA state groups and transfer the symbols from the buffer
// to the DFA.
FDFA.SymbolTable := FUREBuffer.SymbolTable;
FUREBuffer.SymbolTable.Symbols := nil;
// Collect the total number of states and transitions needed for the DFA.
State := 0;
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
begin
if FUREBuffer.States.States[I].ID = State then
begin
Inc(FDFA.StateList.StatesUsed);
Inc(FDFA.TransitionList.TransitionsUsed, FUREBuffer.States.States[I].TransitionsUsed);
Inc(State);
end;
end;
// Allocate enough space for the states and transitions.
SetLength(FDFA.StateList.States, FDFA.StateList.StatesUsed);
SetLength(FDFA.TransitionList.Transitions, FDFA.TransitionList.TransitionsUsed);
// Actually transfer the DFA States from the buffer.
State := 0;
TP := 0;
Run := @FUREBuffer.States.States[0];
for I := 0 to FUREBuffer.States.StatesUsed - 1 do
begin
if Run.ID = State then
begin
FDFA.StateList.States[I].StartTransition := TP;
FDFA.StateList.States[I].NumberTransitions := Run.TransitionsUsed;
FDFA.StateList.States[I].Accepting := Run.Accepting;
// Add the transitions for the state
for J := 0 to FDFA.StateList.States[I].NumberTransitions - 1 do
begin
FDFA.TransitionList.Transitions[TP].Symbol := Run.Transitions[J].LHS;
FDFA.TransitionList.Transitions[TP].NextState :=
FUREBuffer.States.States[Run.Transitions[J].RHS].ID;
Inc(TP);
end;
Inc(State);
end;
Inc(Run);
end;
end
else
begin
// there might be an error while parsing the pattern, show it if so
case FUREBuffer.Error of
_URE_UNEXPECTED_EOS:
UREError(LoadResString(@RsUREUnexpectedEOS), RE);
_URE_CCLASS_OPEN:
UREError(LoadResString(@RsURECharacterClassOpen), RE);
_URE_UNBALANCED_GROUP:
UREError(LoadResString(@RsUREUnbalancedGroup), RE);
_URE_INVALID_PROPERTY:
UREError(LoadResString(@RsUREInvalidCharProperty), RE);
_URE_INVALID_RANGE:
UREError(LoadResString(@RsUREInvalidRepeatRange), RE);
_URE_RANGE_OPEN:
UREError(LoadResString(@RsURERepeatRangeOpen), RE);
else
// expression was empty
raise EJclUnicodeError.CreateRes(@RsUREExpressionEmpty);
end;
end;
end;
end;
procedure TURESearch.ClearDFA;
var
I: SizeInt;
begin
with FDFA do
begin
for I := 0 to SymbolTable.SymbolsUsed - 1 do
begin
if (SymbolTable.Symbols[I].AType = _URE_CCLASS) or
(SymbolTable.Symbols[I].AType = _URE_NCCLASS) then
SymbolTable.Symbols[I].Symbol.CCL.Ranges := nil;
end;
for I := 0 to SymbolTable.SymbolsUsed - 1 do
begin
FDFA.SymbolTable.Symbols[I].States.List := nil;
FDFA.SymbolTable.Symbols[I].States.ListUsed := 0;
end;
SymbolTable.SymbolsUsed := 0;
SymbolTable.Symbols := nil;
StateList.States := nil;
TransitionList.Transitions := nil;
end;
ResetMemory(FDFA, SizeOf(FDFA));
end;
function IsSeparator(C: UCS4): Boolean;
begin
Result := (C = $D) or (C = $A) or (C = $2028) or (C = $2029);
end;
function TURESearch.ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: SizeInt; var MatchStart,
MatchEnd: SizeInt): Boolean;
var
I, J: SizeInt;
Matched,
Found: Boolean;
Start, Stop: SizeInt;
C: UCS4;
Run, Tail, lp: PUCS2;
LastState: PDFAState;
Symbol: PUcSymbolTableEntry;
Rp: PUcRange;
LCMapping: TUCS4Array;
begin
Result := False;
if Text <> nil then
begin
// Handle the special case of an empty string matching the "^$" pattern.
if (Textlen = 0) and ((FDFA.Flags and _URE_DFA_BLANKLINE) <> 0) then
begin
MatchStart := 0;
MatchEnd := 0;
Result := True;
Exit;
end;
Run := Text;
Tail := Run + TextLen;
Start := -1;
Stop := -1;
LastState := @FDFA.StateList.States[0];
Found := False;
while not Found and (Run < Tail) do
begin
lp := Run;
C := UCS4(Run^);
Inc(Run);
// Check to see if this is a high surrogate that should be combined with a
// following low surrogate.
if (Run < Tail) and
(SurrogateHighStart <= C) and (C <= SurrogateHighEnd) and
(SurrogateLowStart <= UCS4(Run^)) and (UCS4(Run^) <= SurrogateLowEnd) then
begin
C := $10000 + (((C and $03FF) shl 10) or (UCS4(Run^) and $03FF));
Inc(Run);
end;
// Determine if the character is non-spacing and should be skipped.
if ((Flags and URE_IGNORE_NONSPACING) <> 0) and UnicodeIsNonSpacingMark(C) then
begin
Inc(Run);
Continue;
end;
if (FDFA.Flags and _URE_DFA_CASEFOLD) <> 0 then
begin
SetLength(LCMapping, 0);
{ TODO : use the entire mapping, not only the first character }
// (CaseLookup used for a little extra speed: avoids dynamic array allocation)
if CaseLookup(C, ctLower, LCMapping) then
C := LCMapping[0];
end;
// See if one of the transitions matches.
I := LastState.NumberTransitions - 1;
Matched := False;
while not Matched and (I >= 0) do
begin
Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol];
case Symbol.AType of
_URE_ANY_CHAR:
if ((Flags and URE_DONT_MATCHES_SEPARATORS) <> 0) or
not IsSeparator(C) then
Matched := True;
_URE_CHAR:
if C = Symbol.Symbol.Chr then
Matched := True;
_URE_BOL_ANCHOR:
if Lp = Text then
begin
Run := lp;
Matched := True;
end
else
begin
if IsSeparator(C) then
begin
if (C = $D) and (Run < Tail) and (Run^ = #$A) then
Inc(Run);
Lp := Run;
Matched := True;
end;
end;
_URE_EOL_ANCHOR:
if IsSeparator(C) then
begin
// Put the pointer back before the separator so the match end
// position will be correct. This will also cause the `Run'
// pointer to be advanced over the current separator once the
// match end point has been recorded.
Run := Lp;
Matched := True;
end;
_URE_CCLASS,
_URE_NCCLASS:
with Symbol^ do
begin
if Categories <> [] then
Matched := CategoryLookup(C, Categories);
if Symbol.CCL.RangesUsed > 0 then
begin
Rp := @Symbol.CCL.Ranges[0];
for J := 0 to Symbol.CCL.RangesUsed - 1 do
begin
if (Rp.MinCode <= C) and (C <= Rp.MaxCode) then
begin
Matched := True;
Break;
end;
Inc(Rp);
end;
end;
if AType = _URE_NCCLASS then
Matched := not Matched;
end;
end;
if Matched then
begin
if Start = -1 then
Start := Lp - Text
else
Stop := Run - Text;
LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState];
// If the match was an EOL anchor, adjust the pointer past the separator
// that caused the match. The correct match position has been recorded
// already.
if Symbol.AType = _URE_EOL_ANCHOR then
begin
// skip the character that caused the match
Inc(Run);
// handle the infamous CRLF situation
if (Run < Tail) and (C = $D) and (Run^ = #$A) then
Inc(Run);
end;
end;
Dec(I);
end;
if not Matched then
begin
Found := LastState.Accepting;
if not Found then
begin
// If the last state was not accepting, then reset and start over.
LastState := @FDFA.StateList.States[0];
Start := -1;
Stop := -1;
end
else
begin
// set start and stop pointer if not yet done
if Start = -1 then
begin
Start := Lp - Text;
Stop := Run - Text;
end
else
begin
if Stop = -1 then
Stop := Lp - Text;
end;
end;
end
else
begin
if Run = Tail then
begin
if not LastState.Accepting then
begin
// This ugly hack is to make sure the end-of-line anchors match
// when the source text hits the end. This is only done if the last
// subexpression matches.
for I := 0 to LastState.NumberTransitions - 1 do
begin
if Found then
Break;
Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol];
if Symbol.AType =_URE_EOL_ANCHOR then
begin
LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState];
if LastState.Accepting then
begin
Stop := Run - Text;
Found := True;
end
else
Break;
end;
end;
end
else
begin
// Make sure any conditions that match all the way to the end of
// the string match.
Found := True;
Stop := Run - Text;
end;
end;
end;
end;
if Found then
begin
MatchStart := Start;
MatchEnd := Stop;
end;
Result := Found;
end;
end;
function TURESearch.FindAll(const Text: WideString): Boolean;
begin
Result := FindAll(PWideChar(Text), Length(Text));
end;
function TURESearch.FindAll(Text: PWideChar; TextLen: SizeInt): Boolean;
// Looks for all occurences of the pattern passed to FindPrepare and creates an
// internal list of their positions.
var
Start, Stop: SizeInt;
Run: PWideChar;
RunLen: SizeInt;
begin
ClearResults;
Run := Text;
RunLen := TextLen;
// repeat to find all occurences of the pattern
Start := 0;
Stop := 0;
while ExecuteURE(0, Run, RunLen, Start, Stop) do
begin
// store this result (consider text pointer movement)...
AddResult(Start + Run - Text, Stop + Run - Text);
// ... and advance text position and length
Inc(Run, Stop);
Dec(RunLen, Stop);
end;
Result := FResults.Count > 0;
end;
function TURESearch.FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean;
begin
Result := FindFirst(PWideChar(Text), Length(Text), Start, Stop);
end;
function TURESearch.FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean;
// Looks for the first occurence of the pattern passed to FindPrepare in Text and
// returns True if one could be found (in which case Start and Stop are set to
// the according indices) otherwise False. This function is in particular of
// interest if only one occurence needs to be found.
begin
ClearResults;
Result := ExecuteURE(0, Text, TextLen, Start, Stop);
if Result then
AddResult(Start, Stop);
end;
procedure TURESearch.FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags);
begin
CompileURE(Pattern, PatternLength, not (sfCaseSensitive in Options));
end;
procedure TURESearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags);
begin
CompileURE(PWideChar(Pattern), Length(Pattern), not (sfCaseSensitive in Options));
end;
//=== { TWideStrings } =======================================================
constructor TWideStrings.Create;
begin
inherited Create;
FLanguage := GetUserDefaultLCID;
FNormalizationForm := nfC;
FSaveFormat := sfUnicodeLSB;
end;
procedure TWideStrings.SetLanguage(Value: LCID);
begin
FLanguage := Value;
end;
function TWideStrings.GetSaveUnicode: Boolean;
begin
Result := SaveFormat = sfUnicodeLSB;
end;
procedure TWideStrings.SetSaveUnicode(const Value: Boolean);
begin
if Value then
SaveFormat := sfUnicodeLSB
else
SaveFormat := sfAnsi;
end;
function TWideStrings.Add(const S: WideString): Integer;
begin
Result := GetCount;
Insert(Result, S);
end;
function TWideStrings.AddObject(const S: WideString; AObject: TObject): Integer;
begin
Result := Add(S);
PutObject(Result, AObject);
end;
procedure TWideStrings.Append(const S: WideString);
begin
Add(S);
end;
procedure TWideStrings.AddStrings(Strings: TStrings);
var
I: Integer;
{$IFNDEF SUPPORTS_UNICODE}
CP: Word;
{$ENDIF ~SUPPORTS_UNICODE}
begin
BeginUpdate;
try
{$IFNDEF SUPPORTS_UNICODE}
CP := CodePageFromLocale(FLanguage);
{$ENDIF ~SUPPORTS_UNICODE}
for I := 0 to Strings.Count - 1 do
begin
{$IFDEF SUPPORTS_UNICODE}
AddObject(Strings[I], Strings.Objects[I])
{$ELSE ~SUPPORTS_UNICODE}
AddObject(StringToWideStringEx(Strings[I], CP), Strings.Objects[I])
{$ENDIF ~SUPPORTS_UNICODE}
end;
finally
EndUpdate;
end;
end;
procedure TWideStrings.AddStrings(Strings: TWideStrings);
var
I: Integer;
begin
Assert(Strings <> nil);
BeginUpdate;
try
for I := 0 to Strings.Count - 1 do
AddObject(Strings[I], Strings.Objects[I]);
finally
EndUpdate;
end;
end;
procedure TWideStrings.Assign(Source: TPersistent);
// usual assignment routine, but able to assign wide and small strings
begin
if Source is TWideStrings then
begin
BeginUpdate;
try
Clear;
AddStrings(TWideStrings(Source));
finally
EndUpdate;
end;
end
else
begin
if Source is TStrings then
begin
BeginUpdate;
try
Clear;
AddStrings(TStrings(Source));
finally
EndUpdate;
end;
end
else
inherited Assign(Source);
end;
end;
procedure TWideStrings.AssignTo(Dest: TPersistent);
// need to do also assignment to old style TStrings, but this class doesn't know
// TWideStrings, so we need to do it from here
var
I: Integer;
{$IFNDEF SUPPORTS_UNICODE}
CP: Word;
{$ENDIF ~SUPPORTS_UNICODE}
begin
if Dest is TStrings then
begin
with Dest as TStrings do
begin
BeginUpdate;
try
{$IFNDEF SUPPORTS_UNICODE}
CP := CodePageFromLocale(FLanguage);
{$ENDIF SUPPORTS_UNICODE}
Clear;
for I := 0 to Self.Count - 1 do
begin
{$IFDEF SUPPORTS_UNICODE}
AddObject(Self[I], Self.Objects[I]);
{$ELSE ~SUPPORTS_UNICODE}
AddObject(WideStringToStringEx(Self[I], CP), Self.Objects[I]);
{$ENDIF ~SUPPORTS_UNICODE}
end;
finally
EndUpdate;
end;
end;
end
else
begin
if Dest is TWideStrings then
begin
with Dest as TWideStrings do
begin
BeginUpdate;
try
Clear;
AddStrings(Self);
finally
EndUpdate;
end;
end;
end
else
inherited AssignTo(Dest);
end;
end;
procedure TWideStrings.BeginUpdate;
begin
if FUpdateCount = 0 then
SetUpdateState(True);
Inc(FUpdateCount);
end;
procedure TWideStrings.DefineProperties(Filer: TFiler);
// Defines a private property for the content of the list.
// There's a bug in the handling of text DFMs in Classes.pas which prevents
// WideStrings from loading under some circumstances. Zbysek Hlinka
// (zhlinka att login dott cz) brought this to my attention and supplied also a solution.
// See ReadData and WriteData methods for implementation details.
//--------------- local function --------------------------------------------
function DoWrite: Boolean;
begin
if Filer.Ancestor <> nil then
begin
Result := True;
if Filer.Ancestor is TWideStrings then
Result := not Equals(TWideStrings(Filer.Ancestor))
end
else
Result := Count > 0;
end;
//--------------- end local function ----------------------------------------
begin
Filer.DefineProperty('WideStrings', ReadData, WriteData, DoWrite);
end;
procedure TWideStrings.DoConfirmConversion(var Allowed: Boolean);
begin
if Assigned(FOnConfirmConversion) then
FOnConfirmConversion(Self, Allowed);
end;
procedure TWideStrings.EndUpdate;
begin
Dec(FUpdateCount);
if FUpdateCount = 0 then
SetUpdateState(False);
end;
function TWideStrings.Equals(Strings: TWideStrings): Boolean;
var
I, Count: Integer;
begin
Assert(Strings <> nil);
Result := False;
Count := GetCount;
if Count <> Strings.GetCount then
Exit;
{ TODO : use internal comparation routine as soon as composition is implemented }
for I := 0 to Count - 1 do
if Get(I) <> Strings.Get(I) then
Exit;
Result := True;
end;
procedure TWideStrings.Error(const Msg: string; Data: Integer);
function ReturnAddr: Pointer;
asm
{$IFDEF CPU32}
MOV EAX, EBP
MOV EAX, [EAX + 4]
{$ENDIF CPU32}
{$IFDEF CPU64}
MOV RAX, RBP
MOV RAX, [RAX + 8]
{$ENDIF CPU64}
end;
begin
raise EStringListError.CreateFmt(Msg, [Data]) at ReturnAddr;
end;
procedure TWideStrings.Exchange(Index1, Index2: Integer);
var
TempObject: TObject;
TempString: WideString;
begin
BeginUpdate;
try
TempString := Strings[Index1];
TempObject := Objects[Index1];
Strings[Index1] := Strings[Index2];
Objects[Index1] := Objects[Index2];
Strings[Index2] := TempString;
Objects[Index2] := TempObject;
finally
EndUpdate;
end;
end;
function TWideStrings.GetCapacity: Integer;
// Descendants may optionally override/replace this default implementation.
begin
Result := Count;
end;
function TWideStrings.GetCommaText: WideString;
var
S: WideString;
P: PWideChar;
I, Count: Integer;
begin
Count := GetCount;
if (Count = 1) and (Get(0) = '') then
Result := '""'
else
begin
Result := '';
for I := 0 to Count - 1 do
begin
S := Get(I);
P := PWideChar(S);
while (P^ > WideSpace) and (P^ <> '"') and (P^ <> ',') do
Inc(P);
if P^ <> WideNull then
S := WideQuotedStr(S, '"');
Result := Result + S + ',';
end;
System.Delete(Result, Length(Result), 1);
end;
end;
function TWideStrings.GetName(Index: Integer): WideString;
var
P: Integer;
begin
Result := Get(Index);
P := Pos('=', Result);
if P > 0 then
SetLength(Result, P - 1)
else
Result := '';
end;
function TWideStrings.GetObject(Index: Integer): TObject;
begin
Result := nil;
end;
function TWideStrings.GetSeparatedText(Separators: WideString): WideString;
// Same as GetText but with customizable separator characters.
var
I, L,
Size,
Count,
SepSize: Integer;
P: PWideChar;
S: WideString;
begin
Count := GetCount;
SepSize := Length(Separators);
Size := 0;
for I := 0 to Count - 1 do
Inc(Size, Length(Get(I)) + SepSize);
// set one separator less, the last line does not need a trailing separator
SetLength(Result, Size - SepSize);
if Size > 0 then
begin
P := Pointer(Result);
I := 0;
while True do
begin
S := Get(I);
L := Length(S);
if L <> 0 then
begin
// add current string
System.Move(Pointer(S)^, P^, 2 * L);
Inc(P, L);
end;
Inc(I);
if I = Count then
Break;
// add separators
System.Move(Pointer(Separators)^, P^, SizeOf(WideChar) * SepSize);
Inc(P, SepSize);
end;
end;
end;
function TWideStrings.GetTextStr: WideString;
begin
Result := GetSeparatedText(WideCRLF);
end;
function TWideStrings.GetText: PWideChar;
begin
Result := StrNewW(GetTextStr);
end;
function TWideStrings.GetValue(const Name: WideString): WideString;
var
I: Integer;
begin
I := IndexOfName(Name);
if I >= 0 then
Result := Copy(Get(I), Length(Name) + 2, MaxInt)
else
Result := '';
end;
function TWideStrings.IndexOf(const S: WideString): Integer;
var
NormString: WideString;
begin
NormString := WideNormalize(S, FNormalizationForm);
for Result := 0 to GetCount - 1 do
if WideCompareText(Get(Result), NormString, FLanguage) = 0 then
Exit;
Result := -1;
end;
function TWideStrings.IndexOfName(const Name: WideString): Integer;
var
P: Integer;
S: WideString;
NormName: WideString;
begin
NormName := WideNormalize(Name, FNormalizationForm);
for Result := 0 to GetCount - 1 do
begin
S := Get(Result);
P := Pos('=', S);
if (P > 0) and (WideCompareText(Copy(S, 1, P - 1), NormName, FLanguage) = 0) then
Exit;
end;
Result := -1;
end;
function TWideStrings.IndexOfObject(AObject: TObject): Integer;
begin
for Result := 0 to GetCount - 1 do
if GetObject(Result) = AObject then
Exit;
Result := -1;
end;
procedure TWideStrings.InsertObject(Index: Integer; const S: WideString; AObject: TObject);
begin
Insert(Index, S);
PutObject(Index, AObject);
end;
procedure TWideStrings.LoadFromFile(const FileName: TFileName);
var
Stream: TStream;
begin
try
Stream := TFileStream.Create(FileName, fmOpenRead or fmShareDenyNone);
try
LoadFromStream(Stream);
finally
Stream.Free;
end;
except
RaiseLastOSError;
end;
end;
procedure TWideStrings.LoadFromStream(Stream: TStream);
// usual loader routine, but enhanced to handle byte order marks in stream
var
Size,
BytesRead: Integer;
ByteOrderMask: array [0..5] of Byte; // BOM size is max 5 bytes (cf: wikipedia)
// but it is easier to implement with a multiple of 2
Loaded: Boolean;
SW: WideString;
SA: AnsiString;
begin
BeginUpdate;
try
Loaded := False;
Size := Stream.Size - Stream.Position;
ByteOrderMask[0] := 0;
BytesRead := Stream.Read(ByteOrderMask[0],SizeOf(ByteOrderMask));
// UTF16 LSB = Unicode LSB
if (BytesRead >= 2) and (ByteOrderMask[0] = BOM_UTF16_LSB[0])
and (ByteOrderMask[1] = BOM_UTF16_LSB[1]) then
begin
FSaveFormat := sfUTF16LSB;
SetLength(SW, (Size - 2) div SizeOf(WideChar));
Assert((Size and 1) <> 1,'Number of chars must be a multiple of 2');
if BytesRead > 2 then
begin
System.Move(ByteOrderMask[2], SW[1], BytesRead-2); // max 4 bytes = 2 widechars
if Size > BytesRead then
// first 2 chars (maximum) were copied by System.Move
Stream.Read(SW[3], Size-BytesRead);
end;
SetText(SW);
Loaded := True;
end;
// UTF16 MSB = Unicode MSB
if (BytesRead >= 2) and (ByteOrderMask[0] = BOM_UTF16_MSB[0])
and (ByteOrderMask[1] = BOM_UTF16_MSB[1]) then
begin
FSaveFormat := sfUTF16MSB;
SetLength(SW, (Size - 2) div SizeOf(WideChar));
Assert((Size and 1) <> 1,'Number of chars must be a multiple of 2');
if BytesRead > 2 then
begin
System.Move(ByteOrderMask[2],SW[1],BytesRead-2); // max 4 bytes = 2 widechars
if Size > BytesRead then
// first 2 chars (maximum) were copied by System.Move
Stream.Read(SW[3], Size-BytesRead);
StrSwapByteOrder(PWideChar(SW));
end;
SetText(SW);
Loaded := True;
end;
// UTF8
if (BytesRead >= 3) and (ByteOrderMask[0] = BOM_UTF8[0])
and (ByteOrderMask[1] = BOM_UTF8[1]) and (ByteOrderMask[2] = BOM_UTF8[2]) then
begin
FSaveFormat := sfUTF8;
SetLength(SA, (Size-3) div SizeOf(AnsiChar));
if BytesRead > 3 then
begin
System.Move(ByteOrderMask[3],SA[1],BytesRead-3); // max 3 bytes = 3 chars
if Size > BytesRead then
// first 3 chars were copied by System.Move
Stream.Read(SA[4], Size-BytesRead);
SW := UTF8ToWideString(SA);
end;
SetText(SW);
Loaded := True;
end;
// default case (Ansi)
if not Loaded then
begin
FSaveFormat := sfAnsi;
SetLength(SA, Size div SizeOf(AnsiChar));
if BytesRead > 0 then
begin
System.Move(ByteOrderMask[0], SA[1], BytesRead); // max 6 bytes = 6 chars
if Size > BytesRead then
Stream.Read(SA[7], Size-BytesRead); // first 6 chars were copied by System.Move
end;
SetText(StringToWideStringEx(SA, CodePageFromLocale(FLanguage)));
end;
finally
EndUpdate;
end;
end;
procedure TWideStrings.Move(CurIndex, NewIndex: Integer);
var
TempObject: TObject;
TempString: WideString;
begin
if CurIndex <> NewIndex then
begin
BeginUpdate;
try
TempString := Get(CurIndex);
TempObject := GetObject(CurIndex);
Delete(CurIndex);
InsertObject(NewIndex, TempString, TempObject);
finally
EndUpdate;
end;
end;
end;
procedure TWideStrings.ReadData(Reader: TReader);
begin
case Reader.NextValue of
vaLString, vaString:
SetText(Reader.ReadString);
else
SetText(Reader.ReadWideString);
end;
end;
procedure TWideStrings.SaveToFile(const FileName: TFileName);
var
Stream: TStream;
begin
Stream := TFileStream.Create(FileName, fmCreate);
try
SaveToStream(Stream);
finally
Stream.Free;
end;
end;
procedure TWideStrings.SaveToStream(Stream: TStream; WithBOM: Boolean = True);
// Saves the currently loaded text into the given stream. WithBOM determines whether to write a
// byte order mark or not. Note: when saved as ANSI text there will never be a BOM.
var
SW: WideString;
SA: AnsiString;
Allowed: Boolean;
Run: PWideChar;
begin
// The application can decide in which format to save the content.
// If FSaveUnicode is False then all strings are saved in standard ANSI format
// which is also loadable by TStrings but you should be aware that all Unicode
// strings are then converted to ANSI based on the current system locale.
// An extra event is supplied to ask the user about the potential loss of
// information when converting Unicode to ANSI strings.
SW := GetTextStr;
Allowed := True;
FSaved := False; // be pessimistic
// A check for potential information loss makes only sense if the application has
// set an event to be used as call back to ask about the conversion.
if (FSaveFormat = sfAnsi) and Assigned(FOnConfirmConversion) then
begin
// application requests to save only ANSI characters, so check the text and
// call back in case information could be lost
Run := PWideChar(SW);
// only ask if there's at least one Unicode character in the text
while (Run^>#0) and (run^<=#255) do
Inc(Run);
// Note: The application can still set FSaveUnicode to True in the callback.
if Run^ <> WideNull then
DoConfirmConversion(Allowed);
end;
if Allowed then
begin
// only save if allowed
case SaveFormat of
sfUTF16LSB :
begin
if WithBOM then
Stream.WriteBuffer(BOM_UTF16_LSB[0],SizeOf(BOM_UTF16_LSB));
if Length(SW) > 0 then
Stream.WriteBuffer(SW[1],Length(SW)*SizeOf(UTF16));
FSaved := True;
end;
sfUTF16MSB :
begin
if WithBOM then
Stream.WriteBuffer(BOM_UTF16_MSB[0],SizeOf(BOM_UTF16_MSB));
if Length(SW) > 0 then
begin
StrSwapByteOrder(PWideChar(SW));
Stream.WriteBuffer(SW[1],Length(SW)*SizeOf(UTF16));
end;
FSaved := True;
end;
sfUTF8 :
begin
if WithBOM then
Stream.WriteBuffer(BOM_UTF8[0],SizeOf(BOM_UTF8));
if Length(SW) > 0 then
begin
SA := WideStringToUTF8(SW);
Stream.WriteBuffer(SA[1],Length(SA)*SizeOf(UTF8));
end;
FSaved := True;
end;
sfAnsi :
begin
if Length(SW) > 0 then
begin
SA := WideStringToStringEx(SW,CodePageFromLocale(FLanguage));
Stream.WriteBuffer(SA[1],Length(SA)*SizeOf(AnsiChar));
end;
FSaved := True;
end;
end;
end;
end;
procedure TWideStrings.SetCapacity(NewCapacity: Integer);
begin
// do nothing - descendants may optionally implement this method
end;
procedure TWideStrings.SetCommaText(const Value: WideString);
var
P, P1: PWideChar;
S: WideString;
begin
BeginUpdate;
try
Clear;
P := PWideChar(Value);
while (P^ >= #1) and (P^ <= WideSpace) do
Inc(P);
while P^ <> WideNull do
begin
if P^ = '"' then
S := WideExtractQuotedStr(P, '"')
else
begin
P1 := P;
while (P^ > WideSpace) and (P^ <> ',') do
Inc(P);
SetString(S, P1, P - P1);
end;
Add(S);
while (P^ >= #1) and (P^ <= WideSpace) do
Inc(P);
if P^ = ',' then
begin
repeat
Inc(P);
until not ((P^ >= #1) and (P^ <= WideSpace));
end;
end;
finally
EndUpdate;
end;
end;
procedure TWideStrings.SetText(const Value: WideString);
var
Head,
Tail: PWideChar;
S: WideString;
begin
BeginUpdate;
try
Clear;
Head := PWideChar(Value);
while Head^ <> WideNull do
begin
Tail := Head;
while (Tail^ <> WideNull) and (Tail^ <> WideLineFeed) and (Tail^ <> WideCarriageReturn) and
(Tail^ <> WideVerticalTab) and (Tail^ <> WideFormFeed) and (Tail^ <> WideLineSeparator) and
(Tail^ <> WideParagraphSeparator) do
Inc(Tail);
SetString(S, Head, Tail - Head);
Add(S);
Head := Tail;
if Head^ <> WideNull then
begin
Inc(Head);
if (Tail^ = WideCarriageReturn) and (Head^ = WideLineFeed) then
Inc(Head);
end;
end;
finally
EndUpdate;
end;
end;
procedure TWideStrings.SetUpdateState(Updating: Boolean);
begin
end;
procedure TWideStrings.SetNormalizationForm(const Value: TNormalizationForm);
var
I: Integer;
begin
if FNormalizationForm <> Value then
begin
FNormalizationForm := Value;
if FNormalizationForm <> nfNone then
begin
// renormalize all strings according to the new form
for I := 0 to GetCount - 1 do
Put(I, WideNormalize(Get(I), FNormalizationForm));
end;
end;
end;
procedure TWideStrings.SetValue(const Name, Value: WideString);
var
I : Integer;
begin
I := IndexOfName(Name);
if Value <> '' then
begin
if I < 0 then
I := Add('');
Put(I, Name + '=' + Value);
end
else
begin
if I >= 0 then
Delete(I);
end;
end;
procedure TWideStrings.WriteData(Writer: TWriter);
begin
Writer.WriteWideString(GetTextStr);
end;
//=== { TWideStringList } ====================================================
destructor TWideStringList.Destroy;
begin
FOnChange := nil;
FOnChanging := nil;
Clear;
inherited Destroy;
end;
function TWideStringList.Add(const S: WideString): Integer;
begin
if not Sorted then
Result := FCount
else
begin
if Find(S, Result) then
begin
case Duplicates of
dupIgnore:
Exit;
dupError:
Error(SDuplicateString, 0);
end;
end;
end;
InsertItem(Result, S);
end;
procedure TWideStringList.Changed;
begin
if (FUpdateCount = 0) and Assigned(FOnChange) then
FOnChange(Self);
end;
procedure TWideStringList.Changing;
begin
if (FUpdateCount = 0) and Assigned(FOnChanging) then
FOnChanging(Self);
end;
procedure TWideStringList.Clear;
{$IFDEF OWN_WIDESTRING_MEMMGR}
var
I: Integer;
{$ENDIF OWN_WIDESTRING_MEMMGR}
begin
if FCount <> 0 then
begin
Changing;
{$IFDEF OWN_WIDESTRING_MEMMGR}
for I := 0 to FCount - 1 do
with FList[I] do
if TDynWideCharArray(FString) <> nil then
TDynWideCharArray(FString) := nil;
{$ENDIF OWN_WIDESTRING_MEMMGR}
// this will automatically finalize the array
FList := nil;
FCount := 0;
SetCapacity(0);
Changed;
end;
end;
procedure TWideStringList.Delete(Index: Integer);
begin
if Cardinal(Index) >= Cardinal(FCount) then
Error(SListIndexError, Index);
Changing;
{$IFDEF OWN_WIDESTRING_MEMMGR}
SetListString(Index, '');
{$ELSE ~OWN_WIDESTRING_MEMMGR}
FList[Index].FString := '';
{$ENDIF ~OWN_WIDESTRING_MEMMGR}
Dec(FCount);
if Index < FCount then
begin
System.Move(FList[Index + 1], FList[Index], (FCount - Index) * SizeOf(TWideStringItem));
Pointer(FList[FCount].FString) := nil; // avoid freeing the string, the address is now used in another element
end;
Changed;
end;
procedure TWideStringList.Exchange(Index1, Index2: Integer);
begin
if Cardinal(Index1) >= Cardinal(FCount) then
Error(SListIndexError, Index1);
if Cardinal(Index2) >= Cardinal(FCount) then
Error(SListIndexError, Index2);
Changing;
ExchangeItems(Index1, Index2);
Changed;
end;
procedure TWideStringList.ExchangeItems(Index1, Index2: Integer);
var
Temp: TWideStringItem;
begin
Temp := FList[Index1];
FList[Index1] := FList[Index2];
FList[Index2] := Temp;
end;
function TWideStringList.Find(const S: WideString; var Index: Integer): Boolean;
var
L, H, I, C: Integer;
NormString: WideString;
begin
Result := False;
NormString := WideNormalize(S, FNormalizationForm);
L := 0;
H := FCount - 1;
while L <= H do
begin
I := (L + H) shr 1;
C := WideCompareText(FList[I].FString, NormString, FLanguage);
if C < 0 then
L := I+1
else
begin
H := I - 1;
if C = 0 then
begin
Result := True;
if Duplicates <> dupAccept then
L := I;
end;
end;
end;
Index := L;
end;
function TWideStringList.Get(Index: Integer): WideString;
{$IFDEF OWN_WIDESTRING_MEMMGR}
var
Len: Integer;
{$ENDIF OWN_WIDESTRING_MEMMGR}
begin
if Cardinal(Index) >= Cardinal(FCount) then
Error(SListIndexError, Index);
{$IFDEF OWN_WIDESTRING_MEMMGR}
with FList[Index] do
begin
Len := Length(TDynWideCharArray(FString));
if Len > 0 then
begin
SetLength(Result, Len - 1); // exclude #0
if Result <> '' then
System.Move(FString^, Result[1], Len * SizeOf(WideChar));
end
else
Result := '';
end;
{$ELSE ~OWN_WIDESTRING_MEMMGR}
Result := FList[Index].FString;
{$ENDIF ~OWN_WIDESTRING_MEMMGR}
end;
function TWideStringList.GetCapacity: Integer;
begin
Result := Length(FList);
end;
function TWideStringList.GetCount: Integer;
begin
Result := FCount;
end;
function TWideStringList.GetObject(Index: Integer): TObject;
begin
if Cardinal(Index) >= Cardinal(FCount) then
Error(SListIndexError, Index);
Result := FList[Index].FObject;
end;
procedure TWideStringList.Grow;
var
Delta,
Len: Integer;
begin
Len := Length(FList);
if Len > 64 then
Delta := Len div 4
else
begin
if Len > 8 then
Delta := 16
else
Delta := 4;
end;
SetCapacity(Len + Delta);
end;
function TWideStringList.IndexOf(const S: WideString): Integer;
begin
if not Sorted then
Result := inherited IndexOf(S)
else
if not Find(S, Result) then
Result := -1;
end;
procedure TWideStringList.Insert(Index: Integer; const S: WideString);
begin
if Sorted then
Error(SSortedListError, 0);
if Cardinal(Index) > Cardinal(FCount) then
Error(SListIndexError, Index);
InsertItem(Index, S);
end;
{$IFDEF OWN_WIDESTRING_MEMMGR}
procedure TWideStringList.SetListString(Index: Integer; const S: WideString);
var
Len: Integer;
A: TDynWideCharArray;
begin
with FList[Index] do
begin
Pointer(A) := TDynWideCharArray(FString);
if A <> nil then
A := nil; // free memory
Len := Length(S);
if Len > 0 then
begin
SetLength(A, Len + 1); // include #0
System.Move(S[1], A[0], Len * SizeOf(WideChar));
A[Len] := #0;
end;
FString := PWideChar(A);
Pointer(A) := nil; // do not release the array on procedure exit
end;
end;
{$ENDIF OWN_WIDESTRING_MEMMGR}
procedure TWideStringList.InsertItem(Index: Integer; const S: WideString);
begin
Changing;
if FCount = Length(FList) then
Grow;
if Index < FCount then
System.Move(FList[Index], FList[Index + 1], (FCount - Index) * SizeOf(TWideStringItem));
with FList[Index] do
begin
Pointer(FString) := nil; // avoid freeing the string, the address is now used in another element
FObject := nil;
if (FNormalizationForm <> nfNone) and (Length(S) > 0) then
{$IFDEF OWN_WIDESTRING_MEMMGR}
SetListString(Index, WideNormalize(S, FNormalizationForm))
else
SetListString(Index, S);
{$ELSE ~OWN_WIDESTRING_MEMMGR}
FString := WideNormalize(S, FNormalizationForm)
else
FString := S;
{$ENDIF ~OWN_WIDESTRING_MEMMGR}
end;
Inc(FCount);
Changed;
end;
procedure TWideStringList.Put(Index: Integer; const S: WideString);
begin
if Sorted then
Error(SSortedListError, 0);
if Cardinal(Index) >= Cardinal(FCount) then
Error(SListIndexError, Index);
Changing;
if (FNormalizationForm <> nfNone) and (Length(S) > 0) then
{$IFDEF OWN_WIDESTRING_MEMMGR}
SetListString(Index, WideNormalize(S, FNormalizationForm))
else
SetListString(Index, S);
{$ELSE ~OWN_WIDESTRING_MEMMGR}
FList[Index].FString := WideNormalize(S, FNormalizationForm)
else
FList[Index].FString := S;
{$ENDIF ~OWN_WIDESTRING_MEMMGR}
Changed;
end;
procedure TWideStringList.PutObject(Index: Integer; AObject: TObject);
begin
if Cardinal(Index) >= Cardinal(FCount) then
Error(SListIndexError, Index);
Changing;
FList[Index].FObject := AObject;
Changed;
end;
procedure TWideStringList.QuickSort(L, R: Integer);
var
I, J: Integer;
P: WideString;
begin
repeat
I := L;
J := R;
P := FList[(L + R) shr 1].FString;
repeat
while WideCompareText(FList[I].FString, P, FLanguage) < 0 do
Inc(I);
while WideCompareText(FList[J].FString, P, FLanguage) > 0 do
Dec(J);
if I <= J then
begin
ExchangeItems(I, J);
Inc(I);
Dec(J);
end;
until I > J;
if L < J then
QuickSort(L, J);
L := I;
until I >= R;
end;
procedure TWideStringList.SetCapacity(NewCapacity: Integer);
begin
SetLength(FList, NewCapacity);
if NewCapacity < FCount then
FCount := NewCapacity;
end;
procedure TWideStringList.SetSorted(Value: Boolean);
begin
if FSorted <> Value then
begin
if Value then
Sort;
FSorted := Value;
end;
end;
procedure TWideStringList.SetUpdateState(Updating: Boolean);
begin
if Updating then
Changing
else
Changed;
end;
procedure TWideStringList.Sort;
begin
if not Sorted and (FCount > 1) then
begin
Changing;
QuickSort(0, FCount - 1);
Changed;
end;
end;
procedure TWideStringList.SetLanguage(Value: LCID);
begin
inherited SetLanguage(Value);
if Sorted then
Sort;
end;
// exchanges in each character of the given string the low order and high order
// byte to go from LSB to MSB and vice versa.
// EAX contains address of string
function WideAdjustLineBreaks(const S: WideString): WideString;
var
Source,
SourceEnd,
Dest: PWideChar;
begin
Source := Pointer(S);
SourceEnd := Source + Length(S);
Source := Pointer(S);
SetString(Result, nil, SourceEnd - Source);
Dest := Pointer(Result);
while Source < SourceEnd do
begin
case Source^ of
WideLineFeed:
begin
Dest^ := WideLineSeparator;
Inc(Dest);
Inc(Source);
end;
WideCarriageReturn:
begin
Dest^ := WideLineSeparator;
Inc(Dest);
Inc(Source);
if Source^ = WideLineFeed then
Inc(Source);
end;
else
Dest^ := Source^;
Inc(Dest);
Inc(Source);
end;
end;
SetLength(Result, (TJclAddr(Dest) - TJclAddr(Result)) div 2);
end;
function WideQuotedStr(const S: WideString; Quote: WideChar): WideString;
// works like QuotedStr from SysUtils.pas but can insert any quotation character
var
P, Src,
Dest: PWideChar;
AddCount: SizeInt;
begin
AddCount := 0;
P := StrScanW(PWideChar(S), Quote);
while (P <> nil) do
begin
Inc(P);
Inc(AddCount);
P := StrScanW(P, Quote);
end;
if AddCount = 0 then
Result := Quote + S + Quote
else
begin
SetLength(Result, Length(S) + AddCount + 2);
Dest := PWideChar(Result);
Dest^ := Quote;
Inc(Dest);
Src := PWideChar(S);
P := StrScanW(Src, Quote);
repeat
Inc(P);
Move(Src^, Dest^, 2 * (P - Src));
Inc(Dest, P - Src);
Dest^ := Quote;
Inc(Dest);
Src := P;
P := StrScanW(Src, Quote);
until P = nil;
P := StrEndW(Src);
Move(Src^, Dest^, 2 * (P - Src));
Inc(Dest, P - Src);
Dest^ := Quote;
end;
end;
function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString;
// extracts a string enclosed in quote characters given by Quote
var
P, Dest: PWideChar;
DropCount: SizeInt;
begin
Result := '';
if (Src = nil) or (Src^ <> Quote) then
Exit;
Inc(Src);
DropCount := 1;
P := Src;
Src := StrScanW(Src, Quote);
while Src <> nil do // count adjacent pairs of quote chars
begin
Inc(Src);
if Src^ <> Quote then
Break;
Inc(Src);
Inc(DropCount);
Src := StrScanW(Src, Quote);
end;
if Src = nil then
Src := StrEndW(P);
if (Src - P) <= 1 then
Exit;
if DropCount = 1 then
SetString(Result, P, Src - P - 1)
else
begin
SetLength(Result, Src - P - DropCount);
Dest := PWideChar(Result);
Src := StrScanW(P, Quote);
while Src <> nil do
begin
Inc(Src);
if Src^ <> Quote then
Break;
Move(P^, Dest^, 2 * (Src - P));
Inc(Dest, Src - P);
Inc(Src);
P := Src;
Src := StrScanW(Src, Quote);
end;
if Src = nil then
Src := StrEndW(P);
Move(P^, Dest^, 2 * (Src - P - 1));
end;
end;
function WideStringOfChar(C: WideChar; Count: SizeInt): WideString;
// returns a string of Count characters filled with C
var
I: SizeInt;
begin
SetLength(Result, Count);
for I := 1 to Count do
Result[I] := C;
end;
function WideTrim(const S: WideString): WideString;
var
I, L: SizeInt;
begin
L := Length(S);
I := 1;
while (I <= L) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do
Inc(I);
if I > L then
Result := ''
else
begin
while UnicodeIsWhiteSpace(UCS4(S[L])) or UnicodeIsControl(UCS4(S[L])) do
Dec(L);
Result := Copy(S, I, L - I + 1);
end;
end;
function WideTrimLeft(const S: WideString): WideString;
var
I, L: SizeInt;
begin
L := Length(S);
I := 1;
while (I <= L) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do
Inc(I);
Result := Copy(S, I, Maxint);
end;
function WideTrimRight(const S: WideString): WideString;
var
I: SizeInt;
begin
I := Length(S);
while (I > 0) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do
Dec(I);
Result := Copy(S, 1, I);
end;
// returns the index of character Ch in S, starts searching at index Index
// Note: This is a quick memory search. No attempt is made to interpret either
// the given charcter nor the string (ligatures, modifiers, surrogates etc.)
// Code from Azret Botash.
function WideCharPos(const S: WideString; const Ch: WideChar; const Index: SizeInt): SizeInt;
var
P, R: PWideChar;
begin
if (Index > 0) and (Index <= Length(S)) then
begin
P := PWideChar(@S[Index]);
R := StrScanW(P, Ch);
if R <> nil then
Result := R - P + Index
else
Result := 0;
end
else
Result := 0;
end;
function WideComposeHangul(const Source: WideString): WideString;
var
Len: SizeInt;
Ch, Last: WideChar;
I: SizeInt;
LIndex, VIndex,
SIndex, TIndex: SizeInt;
begin
Result := '';
Len := Length(Source);
if Len > 0 then
begin
Last := Source[1];
Result := Last;
for I := 2 to Len do
begin
Ch := Source[I];
// 1. check to see if two current characters are L and V
LIndex := Word(Last) - LBase;
if (0 <= LIndex) and (LIndex < LCount) then
begin
VIndex := Word(Ch) - VBase;
if (0 <= VIndex) and (VIndex < VCount) then
begin
// make syllable of form LV
Last := WideChar((SBase + (LIndex * VCount + VIndex) * TCount));
Result[Length(Result)] := Last; // reset last
Continue; // discard Ch
end;
end;
// 2. check to see if two current characters are LV and T
SIndex := Word(Last) - SBase;
if (0 <= SIndex) and (SIndex < SCount) and ((SIndex mod TCount) = 0) then
begin
TIndex := Word(Ch) - TBase;
if (0 <= TIndex) and (TIndex <= TCount) then
begin
// make syllable of form LVT
Inc(Word(Last), TIndex);
Result[Length(Result)] := Last; // reset last
Continue; // discard Ch
end;
end;
// if neither case was true, just add the character
Last := Ch;
Result := Result + Ch;
end;
end;
end;
// Returns canonical composition of characters in S.
function WideCompose(const S: WideString): WideString;
var
Buffer: array of UCS4;
LastInPos, InPos, OutPos, BufferSize, NbProcessed: SizeInt;
Composite: UCS4;
begin
// Set an arbitrary length for the result. This is automatically done when checking
// for hangul composition.
Result := WideComposeHangul(S);
if Result = '' then
Exit;
if Compositions = nil then
LoadCompositionData;
LastInPos := Length(Result);
if LastInPos > MaxCompositionSize then
SetLength(Buffer, MaxCompositionSize)
else
SetLength(Buffer, LastInPos);
BufferSize := 0;
InPos := 0;
OutPos := 0;
while (InPos < LastInPos) or (BufferSize > 0) do
begin
// fill buffer from input
while BufferSize < Length(Buffer) do
begin
if InPos < LastInPos then
begin
Inc(InPos);
Buffer[BufferSize] := UCS4(Result[InPos]);
Inc(BufferSize);
end
else
SetLength(Buffer, BufferSize);
end;
if Length(Buffer) = 0 then
Break;
NbProcessed := UnicodeCompose(Buffer, Composite);
if NbProcessed = 0 then
Break;
if BufferSize > NbProcessed then
Move(Buffer[NbProcessed], Buffer[0], (BufferSize - NbProcessed) * SizeOf(UCS4));
Dec(BufferSize, NbProcessed);
Inc(OutPos);
Result[OutPos] := UCS2(Composite);
end;
// since we have likely shortened the source string we have to set the correct length on exit
SetLength(Result, OutPos);
end;
procedure FixCanonical(var S: WideString);
// Examines S and reorders all combining marks in the string so that they are in canonical order.
var
I: SizeInt;
Temp: WideChar;
CurrentClass,
LastClass: Cardinal;
begin
I := Length(S);
if I > 1 then
begin
CurrentClass := CanonicalCombiningClass(UCS4(S[I]));
repeat
Dec(I);
LastClass := CurrentClass;
CurrentClass := CanonicalCombiningClass(UCS4(S[I]));
// A swap is presumed to be rare (and a double-swap very rare),
// so don't worry about efficiency here.
if (CurrentClass > LastClass) and (LastClass > 0) then
begin
// swap characters
Temp := S[I];
S[I] := S[I + 1];
S[I + 1] := Temp;
// if not at end, backup (one further, to compensate for loop)
if I < Length(S) - 1 then
Inc(I, 2);
// reset type, since we swapped.
CurrentClass := CanonicalCombiningClass(UCS4(S[I]));
end;
until I = 1;
end;
end;
procedure GetDecompositions(Compatible: Boolean; Code: UCS4; var Buffer: TUCS4Array);
// helper function to recursively decompose a code point
var
Decomp: TUCS4Array;
I: SizeInt;
begin
Decomp := UnicodeDecompose(Code, Compatible);
if Assigned(Decomp) then
begin
for I := 0 to High(Decomp) do
GetDecompositions(Compatible, Decomp[I], Buffer);
end
else // if no decomp, append
begin
I := Length(Buffer);
SetLength(Buffer, I + 1);
Buffer[I] := Code;
end;
end;
function WideDecompose(const S: WideString; Compatible: Boolean): WideString;
// returns a string with all characters of S but decomposed, e.g. is returned as E^ etc.
var
I, J: SizeInt;
Decomp: TUCS4Array;
begin
Result := '';
Decomp := nil;
// iterate through each source code point
for I := 1 to Length(S) do
begin
Decomp := nil;
GetDecompositions(Compatible, UCS4(S[I]), Decomp);
if Decomp = nil then
Result := Result + S[I]
else
for J := 0 to High(Decomp) do
Result := Result + WideChar(Decomp[J]);
end;
// combining marks must be sorted according to their canonical combining class
FixCanonical(Result);
end;
function WideNormalize(const S: WideString; Form: TNormalizationForm): WideString;
var
Temp: WideString;
Compatible: Boolean;
begin
Result := S;
if Form = nfNone then
Exit; // No normalization needed.
Compatible := Form in [nfKC, nfKD];
if Form in [nfD, nfKD] then
Result := WideDecompose(S, Compatible)
else
begin
Temp := WideDecompose(S, Compatible);
Result := WideCompose(Temp);
end;
end;
function WideSameText(const Str1, Str2: WideString): Boolean;
// Compares both strings case-insensitively and returns True if both are equal, otherwise False is returned.
begin
Result := Length(Str1) = Length(Str2);
if Result then
Result := StrICompW(PWideChar(Str1), PWideChar(Str2)) = 0;
end;
//----------------- general purpose case mapping ---------------------------------------------------
function WideCaseConvert(C: WideChar; CaseType: TCaseType): WideString;
var
I, RPos: SizeInt;
Mapping: TUCS4Array;
begin
SetLength(Mapping, 0);
if not CaseLookup(UCS4(C), CaseType, Mapping) then
Result := C
else
begin
SetLength(Result, 2 * Length(Mapping));
RPos := 1;
for I := Low(Mapping) to High(Mapping) do
UTF16SetNextChar(Result, RPos, Mapping[I]);
if RPos > 0 then
SetLength(Result, RPos - 1)
else
raise EJclUnexpectedEOSequenceError.Create;
end;
end;
function WideCaseConvert(const S: WideString; CaseType: TCaseType): WideString;
var
SLen, RLen, SPos, RPos, K, MapLen: SizeInt;
Code: UCS4;
Mapping: TUCS4Array;
begin
SetLength(Mapping, 0);
SLen := Length(S);
RLen := SLen;
SetLength(Result, RLen);
SPos := 1;
RPos := 1;
while (SPos > 0) and (SPos <= SLen) do
begin
Code := UTF16GetNextChar(S, SPos);
if SPos = -1 then
raise EJclUnexpectedEOSequenceError.Create;
if CaseLookup(Code, CaseType, Mapping) then
begin
MapLen:= Length(Mapping);
if MapLen = 1 then
Code := Mapping[0];
end
else
MapLen := 1;
if MapLen = 1 then
begin
if not UTF16SetNextChar(Result, RPos, Code) then
begin
Inc(RLen, SLen);
SetLength(Result, RLen);
UTF16SetNextChar(Result, RPos, Code);
end;
end
else
begin
for K := Low(Mapping) to High(Mapping) do
if not UTF16SetNextChar(Result, RPos, Code) then
begin
Inc(RLen, SLen);
SetLength(Result, RLen);
UTF16SetNextChar(Result, RPos, Code);
end;
end;
end;
if RPos > 0 then
SetLength(Result, RPos - 1)
else
raise EJclUnexpectedEOSequenceError.Create;
end;
// Note that most of the assigned code points don't have a case mapping and are therefore
// returned as they are. Other code points, however, might be converted into several characters
// like the german (eszett) whose upper case mapping is SS.
function WideCaseFolding(C: WideChar): WideString;
// Special case folding function to map a string to either its lower case or
// to special cases. This can be used for case-insensitive comparation.
begin
Result:= WideCaseConvert(C, ctFold);
end;
function WideCaseFolding(const S: WideString): WideString;
begin
Result:= WideCaseConvert(S, ctFold);
end;
function WideLowerCase(C: WideChar): WideString;
begin
Result:= WideCaseConvert(C, ctLower);
end;
function WideLowerCase(const S: WideString): WideString;
begin
Result:= WideCaseConvert(S, ctLower);
end;
function WideTitleCase(C: WideChar): WideString;
begin
Result:= WideCaseConvert(C, ctTitle);
end;
function WideTitleCase(const S: WideString): WideString;
begin
Result:= WideCaseConvert(S, ctTitle);
end;
function WideUpperCase(C: WideChar): WideString;
begin
Result:= WideCaseConvert(C, ctUpper);
end;
function WideUpperCase(const S: WideString): WideString;
begin
Result:= WideCaseConvert(S, ctUpper);
end;
//----------------- character test routines --------------------------------------------------------
function UnicodeIsAlpha(C: UCS4): Boolean; // Is the character alphabetic?
begin
Result := CategoryLookup(C, ClassLetter);
end;
function UnicodeIsDigit(C: UCS4): Boolean; // Is the character a digit?
begin
Result := CategoryLookup(C, [ccNumberDecimalDigit]);
end;
function UnicodeIsAlphaNum(C: UCS4): Boolean; // Is the character alphabetic or a number?
begin
Result := CategoryLookup(C, ClassLetter + [ccNumberDecimalDigit]);
end;
function UnicodeIsCased(C: UCS4): Boolean;
// Is the character a "cased" character, i.e. either lower case, title case or upper case
begin
Result := CategoryLookup(C, [ccLetterLowercase, ccLetterTitleCase, ccLetterUppercase]);
end;
function UnicodeIsControl(C: UCS4): Boolean;
// Is the character a control character?
begin
Result := CategoryLookup(C, [ccOtherControl, ccOtherFormat]);
end;
function UnicodeIsSpace(C: UCS4): Boolean;
// Is the character a spacing character?
begin
Result := CategoryLookup(C, ClassSpace);
end;
function UnicodeIsWhiteSpace(C: UCS4): Boolean;
// Is the character a white space character (same as UnicodeIsSpace plus
// tabulator, new line etc.)?
begin
Result := CategoryLookup(C, ClassSpace + [ccWhiteSpace, ccSegmentSeparator]);
end;
function UnicodeIsBlank(C: UCS4): Boolean;
// Is the character a space separator?
begin
Result := CategoryLookup(C, [ccSeparatorSpace]);
end;
function UnicodeIsPunctuation(C: UCS4): Boolean;
// Is the character a punctuation mark?
begin
Result := CategoryLookup(C, ClassPunctuation);
end;
function UnicodeIsGraph(C: UCS4): Boolean;
// Is the character graphical?
begin
Result := CategoryLookup(C, ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol);
end;
function UnicodeIsPrintable(C: UCS4): Boolean;
// Is the character printable?
begin
Result := CategoryLookup(C, ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol +
[ccSeparatorSpace]);
end;
function UnicodeIsUpper(C: UCS4): Boolean;
// Is the character already upper case?
begin
Result := CategoryLookup(C, [ccLetterUppercase]);
end;
function UnicodeIsLower(C: UCS4): Boolean;
// Is the character already lower case?
begin
Result := CategoryLookup(C, [ccLetterLowercase]);
end;
function UnicodeIsTitle(C: UCS4): Boolean;
// Is the character already title case?
begin
Result := CategoryLookup(C, [ccLetterTitlecase]);
end;
function UnicodeIsHexDigit(C: UCS4): Boolean;
// Is the character a hex digit?
begin
Result := CategoryLookup(C, [ccHexDigit]);
end;
function UnicodeIsIsoControl(C: UCS4): Boolean;
// Is the character a C0 control character (< 32)?
begin
Result := CategoryLookup(C, [ccOtherControl]);
end;
function UnicodeIsFormatControl(C: UCS4): Boolean;
// Is the character a format control character?
begin
Result := CategoryLookup(C, [ccOtherFormat]);
end;
function UnicodeIsSymbol(C: UCS4): Boolean;
// Is the character a symbol?
begin
Result := CategoryLookup(C, ClassSymbol);
end;
function UnicodeIsNumber(C: UCS4): Boolean;
// Is the character a number or digit?
begin
Result := CategoryLookup(C, ClassNumber);
end;
function UnicodeIsNonSpacing(C: UCS4): Boolean;
// Is the character non-spacing?
begin
Result := CategoryLookup(C, [ccMarkNonSpacing]);
end;
function UnicodeIsOpenPunctuation(C: UCS4): Boolean;
// Is the character an open/left punctuation (e.g. '[')?
begin
Result := CategoryLookup(C, [ccPunctuationOpen]);
end;
function UnicodeIsClosePunctuation(C: UCS4): Boolean;
// Is the character an close/right punctuation (e.g. ']')?
begin
Result := CategoryLookup(C, [ccPunctuationClose]);
end;
function UnicodeIsInitialPunctuation(C: UCS4): Boolean;
// Is the character an initial punctuation (e.g. U+2018 LEFT SINGLE QUOTATION MARK)?
begin
Result := CategoryLookup(C, [ccPunctuationInitialQuote]);
end;
function UnicodeIsFinalPunctuation(C: UCS4): Boolean;
// Is the character a final punctuation (e.g. U+2019 RIGHT SINGLE QUOTATION MARK)?
begin
Result := CategoryLookup(C, [ccPunctuationFinalQuote]);
end;
function UnicodeIsComposed(C: UCS4): Boolean;
// Can the character be decomposed into a set of other characters?
begin
Result := CategoryLookup(C, [ccComposed]);
end;
function UnicodeIsQuotationMark(C: UCS4): Boolean;
// Is the character one of the many quotation marks?
begin
Result := CategoryLookup(C, [ccQuotationMark]);
end;
function UnicodeIsSymmetric(C: UCS4): Boolean;
// Is the character one that has an opposite form (i.e. <>)?
begin
Result := CategoryLookup(C, [ccSymmetric]);
end;
function UnicodeIsMirroring(C: UCS4): Boolean;
// Is the character mirroring (superset of symmetric)?
begin
Result := CategoryLookup(C, [ccMirroring]);
end;
function UnicodeIsNonBreaking(C: UCS4): Boolean;
// Is the character non-breaking (i.e. non-breaking space)?
begin
Result := CategoryLookup(C, [ccNonBreaking]);
end;
function UnicodeIsRightToLeft(C: UCS4): Boolean;
// Does the character have strong right-to-left directionality (i.e. Arabic letters)?
begin
Result := CategoryLookup(C, [ccRightToLeft]);
end;
function UnicodeIsLeftToRight(C: UCS4): Boolean;
// Does the character have strong left-to-right directionality (i.e. Latin letters)?
begin
Result := CategoryLookup(C, [ccLeftToRight]);
end;
function UnicodeIsStrong(C: UCS4): Boolean;
// Does the character have strong directionality?
begin
Result := CategoryLookup(C, [ccLeftToRight, ccRightToLeft]);
end;
function UnicodeIsWeak(C: UCS4): Boolean;
// Does the character have weak directionality (i.e. numbers)?
begin
Result := CategoryLookup(C, ClassEuropeanNumber + [ccArabicNumber, ccCommonNumberSeparator]);
end;
function UnicodeIsNeutral(C: UCS4): Boolean;
// Does the character have neutral directionality (i.e. whitespace)?
begin
Result := CategoryLookup(C, [ccSeparatorParagraph, ccSegmentSeparator, ccWhiteSpace, ccOtherNeutrals]);
end;
function UnicodeIsSeparator(C: UCS4): Boolean;
// Is the character a block or segment separator?
begin
Result := CategoryLookup(C, [ccSeparatorParagraph, ccSegmentSeparator]);
end;
function UnicodeIsMark(C: UCS4): Boolean;
// Is the character a mark of some kind?
begin
Result := CategoryLookup(C, ClassMark);
end;
function UnicodeIsModifier(C: UCS4): Boolean;
// Is the character a letter modifier?
begin
Result := CategoryLookup(C, [ccLetterModifier]);
end;
function UnicodeIsLetterNumber(C: UCS4): Boolean;
// Is the character a number represented by a letter?
begin
Result := CategoryLookup(C, [ccNumberLetter]);
end;
function UnicodeIsConnectionPunctuation(C: UCS4): Boolean;
// Is the character connecting punctuation?
begin
Result := CategoryLookup(C, [ccPunctuationConnector]);
end;
function UnicodeIsDash(C: UCS4): Boolean;
// Is the character a dash punctuation?
begin
Result := CategoryLookup(C, [ccPunctuationDash]);
end;
function UnicodeIsMath(C: UCS4): Boolean;
// Is the character a math character?
begin
Result := CategoryLookup(C, [ccSymbolMath]);
end;
function UnicodeIsCurrency(C: UCS4): Boolean;
// Is the character a currency character?
begin
Result := CategoryLookup(C, [ccSymbolCurrency]);
end;
function UnicodeIsModifierSymbol(C: UCS4): Boolean;
// Is the character a modifier symbol?
begin
Result := CategoryLookup(C, [ccSymbolModifier]);
end;
function UnicodeIsNonSpacingMark(C: UCS4): Boolean;
// Is the character a non-spacing mark?
begin
Result := CategoryLookup(C, [ccMarkNonSpacing]);
end;
function UnicodeIsSpacingMark(C: UCS4): Boolean;
// Is the character a spacing mark?
begin
Result := CategoryLookup(C, [ccMarkSpacingCombining]);
end;
function UnicodeIsEnclosing(C: UCS4): Boolean;
// Is the character enclosing (i.e. enclosing box)?
begin
Result := CategoryLookup(C, [ccMarkEnclosing]);
end;
function UnicodeIsPrivate(C: UCS4): Boolean;
// Is the character from the Private Use Area?
begin
Result := CategoryLookup(C, [ccOtherPrivate]);
end;
function UnicodeIsSurrogate(C: UCS4): Boolean;
// Is the character one of the surrogate codes?
begin
Result := CategoryLookup(C, [ccOtherSurrogate]);
end;
function UnicodeIsLineSeparator(C: UCS4): Boolean;
// Is the character a line separator?
begin
Result := CategoryLookup(C, [ccSeparatorLine]);
end;
function UnicodeIsParagraphSeparator(C: UCS4): Boolean;
// Is th character a paragraph separator;
begin
Result := CategoryLookup(C, [ccSeparatorParagraph]);
end;
function UnicodeIsIdentifierStart(C: UCS4): Boolean;
// Can the character begin an identifier?
begin
Result := CategoryLookup(C, ClassLetter + [ccNumberLetter]);
end;
function UnicodeIsIdentifierPart(C: UCS4): Boolean;
// Can the character appear in an identifier?
begin
Result := CategoryLookup(C, ClassLetter + [ccNumberLetter, ccMarkNonSpacing, ccMarkSpacingCombining,
ccNumberDecimalDigit, ccPunctuationConnector, ccOtherFormat]);
end;
function UnicodeIsDefined(C: UCS4): Boolean;
// Is the character defined (appears in one of the data files)?
begin
Result := CategoryLookup(C, [ccAssigned]);
end;
function UnicodeIsUndefined(C: UCS4): Boolean;
// Is the character undefined (not assigned in the Unicode database)?
begin
Result := not CategoryLookup(C, [ccAssigned]);
end;
function UnicodeIsHan(C: UCS4): Boolean;
// Is the character a Han ideograph?
begin
Result := ((C >= $4E00) and (C <= $9FFF)) or ((C >= $F900) and (C <= $FAFF));
end;
function UnicodeIsHangul(C: UCS4): Boolean;
// Is the character a pre-composed Hangul syllable?
begin
Result := (C >= $AC00) and (C <= $D7FF);
end;
// I need to fix a problem (introduced by MS) here. The first parameter can be a pointer
// (and is so defined) or can be a normal DWORD, depending on the dwFlags parameter.
// As usual, lpSrc has been translated to a var parameter. But this does not work in
// our case, hence the redeclaration of the function with a pointer as first parameter.
function TranslateCharsetInfoEx(lpSrc: SizeInt; out lpCs: TCharsetInfo; dwFlags: DWORD): BOOL; stdcall;
external 'gdi32.dll' name 'TranslateCharsetInfo';
function GetCharSetFromLocale(Language: LCID; out FontCharSet: Byte): Boolean;
const
TCI_SRCLOCALE = $1000;
var
CP: Word;
CSI: TCharsetInfo;
begin
if GetWindowsVersion in [wvUnknown, wvWin95, wvWin95OSR2, wvWin98, wvWin98SE,
wvWinME, wvWinNT31, wvWinNT35, wvWinNT351, wvWinNT4] then
begin
// these versions of Windows don't support TCI_SRCLOCALE
CP := CodePageFromLocale(Language);
if CP = 0 then
RaiseLastOSError;
Result := TranslateCharsetInfoEx(CP, CSI, TCI_SRCCODEPAGE);
end
else
Result := TranslateCharsetInfoEx(Language, CSI, TCI_SRCLOCALE);
if Result then
FontCharset := CSI.ciCharset;
end;
function CharSetFromLocale(Language: LCID): Byte;
begin
if not GetCharSetFromLocale(Language, Result) then
RaiseLastOSError;
end;
function CodePageFromLocale(Language: LCID): Word;
// determines the code page for a given locale
var
Buf: array [0..6] of Char;
begin
GetLocaleInfo(Language, LOCALE_IDefaultAnsiCodePage, Buf, 6);
Result := StrToIntDef(Buf, GetACP);
end;
function KeyboardCodePage: Word;
begin
Result := CodePageFromLocale(GetKeyboardLayout(0) and $FFFF);
end;
function KeyUnicode(C: Char): WideChar;
// converts the given character (as it comes with a WM_CHAR message) into its
// corresponding Unicode character depending on the active keyboard layout
begin
MultiByteToWideChar(KeyboardCodePage, MB_USEGLYPHCHARS, @C, 1, @Result, 1);
end;
function CodeBlockRange(const CB: TUnicodeBlock): TUnicodeBlockRange;
// http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt
begin
Result := UnicodeBlockData[CB].Range;
end;
// Names taken from http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt
function CodeBlockName(const CB: TUnicodeBlock): string;
begin
Result := UnicodeBlockData[CB].Name;
end;
// Returns an ID for the Unicode code block to which C belongs.
// If C does not belong to any of the defined blocks then ubUndefined is returned.
// Note: the code blocks listed here are based on Unicode Version 5.0.0
function CodeBlockFromChar(const C: UCS4): TUnicodeBlock;
// http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt
var
L, H, I: TUnicodeBlock;
begin
Result := ubUndefined;
L := ubBasicLatin;
H := High(TUnicodeBlock);
while L <= H do
begin
I := TUnicodeBlock((Cardinal(L) + Cardinal(H)) shr 1);
if (C >= UnicodeBlockData[I].Range.RangeStart) and (C <= UnicodeBlockData[I].Range.RangeEnd) then
begin
Result := I;
Break;
end
else
if C < UnicodeBlockData[I].Range.RangeStart then
begin
Dec(I);
H := I;
end
else
begin
Inc(I);
L := I;
end;
end;
end;
function CompareTextWin95(const W1, W2: WideString; Locale: LCID): SizeInt;
// special comparation function for Win9x since there's no system defined
// comparation function, returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2
var
S1, S2: AnsiString;
CP: Word;
L1, L2: SizeInt;
begin
L1 := Length(W1);
L2 := Length(W2);
SetLength(S1, L1);
SetLength(S2, L2);
CP := CodePageFromLocale(Locale);
WideCharToMultiByte(CP, 0, PWideChar(W1), L1, PAnsiChar(S1), L1, nil, nil);
WideCharToMultiByte(CP, 0, PWideChar(W2), L2, PAnsiChar(S2), L2, nil, nil);
Result := CompareStringA(Locale, NORM_IGNORECASE, PAnsiChar(S1), Length(S1),
PAnsiChar(S2), Length(S2)) - 2;
end;
function CompareTextWinNT(const W1, W2: WideString; Locale: LCID): SizeInt;
// Wrapper function for WinNT since there's no system defined comparation function
// in Win9x and we need a central comparation function for TWideStringList.
// Returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2
begin
Result := CompareStringW(Locale, NORM_IGNORECASE, PWideChar(W1), Length(W1),
PWideChar(W2), Length(W2)) - 2;
end;
function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString;
var
InputLength,
OutputLength: SizeInt;
begin
InputLength := Length(S);
OutputLength := MultiByteToWideChar(CodePage, 0, PAnsiChar(S), InputLength, nil, 0);
SetLength(Result, OutputLength);
MultiByteToWideChar(CodePage, 0, PAnsiChar(S), InputLength, PWideChar(Result), OutputLength);
end;
function WideStringToStringEx(const WS: WideString; CodePage: Word): AnsiString;
var
InputLength,
OutputLength: SizeInt;
begin
InputLength := Length(WS);
OutputLength := WideCharToMultiByte(CodePage, 0, PWideChar(WS), InputLength, nil, 0, nil, nil);
SetLength(Result, OutputLength);
WideCharToMultiByte(CodePage, 0, PWideChar(WS), InputLength, PAnsiChar(Result), OutputLength, nil, nil);
end;
function TranslateString(const S: AnsiString; CP1, CP2: Word): AnsiString;
begin
Result:= WideStringToStringEx(StringToWideStringEx(S, CP1), CP2);
end;
procedure PrepareUnicodeData;
// Prepares structures which are globally needed.
begin
LoadInProgress := TJclCriticalSection.Create;
if (Win32Platform and VER_PLATFORM_WIN32_NT) <> 0 then
@WideCompareText := @CompareTextWinNT
else
@WideCompareText := @CompareTextWin95;
end;
procedure FreeUnicodeData;
// Frees all data which has been allocated and which is not automatically freed by Delphi.
begin
FreeAndNil(LoadInProgress);
end;
initialization
PrepareUnicodeData;
{$IFDEF UNITVERSIONING}
RegisterUnitVersion(HInstance, UnitVersioning);
{$ENDIF UNITVERSIONING}
finalization
{$IFDEF UNITVERSIONING}
UnregisterUnitVersion(HInstance);
{$ENDIF UNITVERSIONING}
FreeUnicodeData;
end.