{**************************************************************************************************} { } { Project JEDI Code Library (JCL) } { } { The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); } { you may not use this file except in compliance with the License. You may obtain a copy of the } { License at http://www.mozilla.org/MPL/ } { } { Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF } { ANY KIND, either express or implied. See the License for the specific language governing rights } { and limitations under the License. } { } { The Original Code is JclUnicode.pas. } { } { The Initial Developer of the Original Code is Mike Lischke (public att lischke-online dott de). } { Portions created by Mike Lischke are Copyright (C) 1999-2000 Mike Lischke. All Rights Reserved. } { } { Contributor(s): } { Marcel van Brakel } { Andreas Hausladen (ahuser) } { Mike Lischke } { Flier Lu (flier) } { Robert Marquardt (marquardt) } { Robert Rossmair (rrossmair) } { Olivier Sannier (obones) } { Matthias Thoma (mthoma) } { Petr Vones (pvones) } { Peter Schraut (http://www.console-dev.de) } { Florent Ouchet (outchy) } { glchapman } { } {**************************************************************************************************} { } { Various Unicode related routines } { } {**************************************************************************************************} { } { Last modified: $Date:: 2009-08-09 19:06:40 +0200 (dim., 09 août 2009) $ } { Revision: $Rev:: 2930 $ } { Author: $Author:: outchy $ } { } {**************************************************************************************************} unit JclUnicode; {$I jcl.inc} // Copyright (c) 1999-2000 Mike Lischke (public att lischke-online dott de) // // 10-JUL-2005: (changes by Peter Schraut) // - added CodeBlockName, returns the blockname as string // - added CodeBlockRange, returns the range of the specified codeblock // - updated TUnicodeBlock to reflect changes in unicode 4.1 // - updated CodeBlockFromChar to reflect changes in unicode 4.1 // - Notes: // Here are a few suggestions to reflect latest namechanges in unicode 4.1, // but they were not done due to compatibility with old code: // ubGreek should be renamed to ubGreekandCoptic // ubCombiningMarksforSymbols should be renamed to ubCombiningDiacriticalMarksforSymbols // ubPrivateUse should be renamed to ubPrivateUseArea // // // 19-SEP-2003: (changes by Andreas Hausladen) // - added OWN_WIDESTRING_MEMMGR for faster memory managment in TWideStringList // under Windows // - fixed: TWideStringList.Destroy does not set OnChange and OnChanging to nil before calling Clear // // // 29-MAR-2002: MT // - WideNormalize now returns strings with normalization mode nfNone unchanged. // - Bug fix in WideCompose: Raised exception when Result of WideComposeHangul was an // empty string. (#0000044) // - Bug fix in WideAdjustLineBreaks // - Added Asserts were needed. // - TWideStrings.IndexOfName now takes care of NormalizeForm as well. // - TWideStrings.IndexOf now takes care of NormalizeForm as well. // - TWideString.List Find now uses the same NormalizationForm for the search string as it uses // within the list itself. // // 29-NOV-2001: // - bug fix // 06-JUN-2001: // - small changes // 28-APR-2001: // - bug fixes // 05-APR-2001: // - bug fixes // 23-MAR-2001: // - WideSameText // - small changes // 10-FEB-2001: // - bug fix in StringToWideStringEx and WideStringToStringEx // 05-FEB-2001: // - TWideStrings.GetSeparatedText changed (no separator anymore after the last line) // 29-JAN-2001: // - PrepareUnicodeData // - LoadInProgress critical section is now created at init time to avoid critical thread races // - bug fixes // 26-JAN-2001: // - ExpandANSIString // - TWideStrings.SaveUnicode is by default True now // 20..21-JAN-2001: // - StrUpperW, StrLowerW and StrTitleW removed because they potentially would need // a reallocation to work correctly (use the WideString versions instead) // - further improvements related to internal data // - introduced TUnicodeBlock // - CodeBlockFromChar improved // 07-JAN-2001: // optimized access to character properties, combining class etc. // 06-JAN-2001: // TWideStrings and TWideStringList improved // APR-DEC 2000: versions 2.1 - 2.6 // - preparation for public rlease // - additional conversion routines // - JCL compliance // - character properties unified // - character properties data and lookup improvements // - reworked Unicode data resource file // - improved simple string comparation routines (StrCompW, StrLCompW etc., include surrogate fix) // - special case folding data for language neutral case insensitive comparations included // - optimized decomposition // - composition and normalization support // - normalization conformance tests applied // - bug fixes // FEB-MAR 2000: version 2.0 // - Unicode regular expressions (URE) search class (TURESearch) // - generic search engine base class for both the Boyer-Moore and the RE search class // - whole word only search in UTBM, bug fixes in UTBM // - string decompositon (including hangul) // OCT/99 - JAN/2000: version 1.0 // - basic Unicode implementation, more than 100 WideString/UCS2 and UCS4 core functions // - TWideStrings and TWideStringList classes // - Unicode Tuned Boyer-Moore search class (TUTBMSearch) // - low and high level Unicode/Wide* functions // - low level Unicode UCS4 data import and functions // - helper functions // // Version 2.9 // This unit contains routines and classes to manage and work with Unicode/WideString strings. // You need Delphi 4 or higher to compile this code. // // Publicly available low level functions are all preceded by "Unicode..." (e.g. // in UnicodeToUpper) while the high level functions use the Str... or Wide... // naming scheme (e.g. StrLICompW and WideUpperCase). // // The normalization implementation in this unit has successfully and completely passed the // official normative conformance testing as of Annex 9 in Technical Report #15 // (Unicode Standard Annex #15, http://www.unicode.org/unicode/reports/tr15, from 2000-08-31). // // Open issues: // - Yet to do things in the URE class are: // - check all character classes if they match correctly // - optimize rebuild of DFA (build only when pattern changes) // - set flag parameter of ExecuteURE // - add \d any decimal digit // \D any character that is not a decimal digit // \s any whitespace character // \S any character that is not a whitespace character // \w any "word" character // \W any "non-word" character // - The wide string classes still compare text with functions provided by the // particular system. This works usually fine under WinNT/W2K (although also // there are limitations like maximum text lengths). Under Win9x conversions // from and to MBCS are necessary which are bound to a particular locale and // so very limited in general use. These comparisons should be changed so that // the code in this unit is used. interface uses {$IFDEF UNITVERSIONING} JclUnitVersioning, {$ENDIF UNITVERSIONING} {$IFDEF MSWINDOWS} Windows, {$ENDIF MSWINDOWS} SysUtils, Classes, JclBase; {$IFNDEF FPC} {$IFDEF MSWINDOWS} {$DEFINE OWN_WIDESTRING_MEMMGR} {$ENDIF MSWINDOWS} {$ENDIF ~FPC} const // definitions of often used characters: // Note: Use them only for tests of a certain character not to determine character // classes (like white spaces) as in Unicode are often many code points defined // being in a certain class. Hence your best option is to use the various // UnicodeIs* functions. WideNull = WideChar(#0); WideTabulator = WideChar(#9); WideSpace = WideChar(#32); // logical line breaks WideLF = WideChar(#10); WideLineFeed = WideChar(#10); WideVerticalTab = WideChar(#11); WideFormFeed = WideChar(#12); WideCR = WideChar(#13); WideCarriageReturn = WideChar(#13); WideCRLF = WideString(#13#10); WideLineSeparator = WideChar($2028); WideParagraphSeparator = WideChar($2029); // byte order marks for Unicode files // Unicode text files (in UTF-16 format) should contain $FFFE as first character to // identify such a file clearly. Depending on the system where the file was created // on this appears either in big endian or little endian style. BOM_LSB_FIRST = WideChar($FEFF); BOM_MSB_FIRST = WideChar($FFFE); type TSaveFormat = ( sfUTF16LSB, sfUTF16MSB, sfUTF8, sfAnsi ); const sfUnicodeLSB = sfUTF16LSB; sfUnicodeMSB = sfUTF16MSB; type // various predefined or otherwise useful character property categories TCharacterCategory = ( // normative categories ccLetterUppercase, ccLetterLowercase, ccLetterTitlecase, ccMarkNonSpacing, ccMarkSpacingCombining, ccMarkEnclosing, ccNumberDecimalDigit, ccNumberLetter, ccNumberOther, ccSeparatorSpace, ccSeparatorLine, ccSeparatorParagraph, ccOtherControl, ccOtherFormat, ccOtherSurrogate, ccOtherPrivate, ccOtherUnassigned, // informative categories ccLetterModifier, ccLetterOther, ccPunctuationConnector, ccPunctuationDash, ccPunctuationOpen, ccPunctuationClose, ccPunctuationInitialQuote, ccPunctuationFinalQuote, ccPunctuationOther, ccSymbolMath, ccSymbolCurrency, ccSymbolModifier, ccSymbolOther, // bidirectional categories ccLeftToRight, ccLeftToRightEmbedding, ccLeftToRightOverride, ccRightToLeft, ccRightToLeftArabic, ccRightToLeftEmbedding, ccRightToLeftoverride, ccPopDirectionalFormat, ccEuropeanNumber, ccEuropeanNumberSeparator, ccEuropeanNumberTerminator, ccArabicNumber, ccCommonNumberSeparator, ccBoundaryNeutral, ccSegmentSeparator, // this includes tab and vertical tab ccWhiteSpace, ccOtherNeutrals, // self defined categories, they do not appear in the Unicode data file ccComposed, // can be decomposed ccNonBreaking, ccSymmetric, // has left and right forms ccHexDigit, ccQuotationMark, ccMirroring, ccSpaceOther, ccAssigned // means there is a definition in the Unicode standard ); TCharacterCategories = set of TCharacterCategory; // four forms of normalization are defined: TNormalizationForm = ( nfNone, // do not normalize nfC, // canonical decomposition followed by canonical composition (this is most often used) nfD, // canonical decomposition nfKC, // compatibility decomposition followed by a canonical composition nfKD // compatibility decomposition ); // used to hold information about the start and end // position of a unicodeblock. TUnicodeBlockRange = record RangeStart, RangeEnd: Cardinal; end; // An Unicode block usually corresponds to a particular language script but // can also represent special characters, musical symbols and the like. // http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt TUnicodeBlock = ( ubUndefined, ubBasicLatin, ubLatin1Supplement, ubLatinExtendedA, ubLatinExtendedB, ubIPAExtensions, ubSpacingModifierLetters, ubCombiningDiacriticalMarks, ubGreekandCoptic, ubCyrillic, ubCyrillicSupplement, ubArmenian, ubHebrew, ubArabic, ubSyriac, ubArabicSupplement, ubThaana, ubNKo, ubDevanagari, ubBengali, ubGurmukhi, ubGujarati, ubOriya, ubTamil, ubTelugu, ubKannada, ubMalayalam, ubSinhala, ubThai, ubLao, ubTibetan, ubMyanmar, ubGeorgian, ubHangulJamo, ubEthiopic, ubEthiopicSupplement, ubCherokee, ubUnifiedCanadianAboriginalSyllabics, ubOgham, ubRunic, ubTagalog, ubHanunoo, ubBuhid, ubTagbanwa, ubKhmer, ubMongolian, ubLimbu, ubTaiLe, ubNewTaiLue, ubKhmerSymbols, ubBuginese, ubBalinese, ubPhoneticExtensions, ubPhoneticExtensionsSupplement, ubCombiningDiacriticalMarksSupplement, ubLatinExtendedAdditional, ubGreekExtended, ubGeneralPunctuation, ubSuperscriptsandSubscripts, ubCurrencySymbols, ubCombiningDiacriticalMarksforSymbols, ubLetterlikeSymbols, ubNumberForms, ubArrows, ubMathematicalOperators, ubMiscellaneousTechnical, ubControlPictures, ubOpticalCharacterRecognition, ubEnclosedAlphanumerics, ubBoxDrawing, ubBlockElements, ubGeometricShapes, ubMiscellaneousSymbols, ubDingbats, ubMiscellaneousMathematicalSymbolsA, ubSupplementalArrowsA, ubBraillePatterns, ubSupplementalArrowsB, ubMiscellaneousMathematicalSymbolsB, ubSupplementalMathematicalOperators, ubMiscellaneousSymbolsandArrows, ubGlagolitic, ubLatinExtendedC, ubCoptic, ubGeorgianSupplement, ubTifinagh, ubEthiopicExtended, ubSupplementalPunctuation, ubCJKRadicalsSupplement, ubKangxiRadicals, ubIdeographicDescriptionCharacters, ubCJKSymbolsandPunctuation, ubHiragana, ubKatakana, ubBopomofo, ubHangulCompatibilityJamo, ubKanbun, ubBopomofoExtended, ubCJKStrokes, ubKatakanaPhoneticExtensions, ubEnclosedCJKLettersandMonths, ubCJKCompatibility, ubCJKUnifiedIdeographsExtensionA, ubYijingHexagramSymbols, ubCJKUnifiedIdeographs, ubYiSyllables, ubYiRadicals, ubModifierToneLetters, ubLatinExtendedD, ubSylotiNagri, ubPhagsPa, ubHangulSyllables, ubHighSurrogates, ubHighPrivateUseSurrogates, ubLowSurrogates, ubPrivateUseArea, ubCJKCompatibilityIdeographs, ubAlphabeticPresentationForms, ubArabicPresentationFormsA, ubVariationSelectors, ubVerticalForms, ubCombiningHalfMarks, ubCJKCompatibilityForms, ubSmallFormVariants, ubArabicPresentationFormsB, ubHalfwidthandFullwidthForms, ubSpecials, ubLinearBSyllabary, ubLinearBIdeograms, ubAegeanNumbers, ubAncientGreekNumbers, ubOldItalic, ubGothic, ubUgaritic, ubOldPersian, ubDeseret, ubShavian, ubOsmanya, ubCypriotSyllabary, ubPhoenician, ubKharoshthi, ubCuneiform, ubCuneiformNumbersAndPunctuation, ubByzantineMusicalSymbols, ubMusicalSymbols, ubAncientGreekMusicalNotation, ubTaiXuanJingSymbols, ubCountingRodNumerals, ubMathematicalAlphanumericSymbols, ubCJKUnifiedIdeographsExtensionB, ubCJKCompatibilityIdeographsSupplement, ubTags, ubVariationSelectorsSupplement, ubSupplementaryPrivateUseAreaA, ubSupplementaryPrivateUseAreaB ); TUnicodeBlockData = record Range: TUnicodeBlockRange; Name: string; end; PUnicodeBlockData = ^TUnicodeBlockData; const UnicodeBlockData: array [TUnicodeBlock] of TUnicodeBlockData = ((Range:(RangeStart: $FFFFFFFF; RangeEnd: $0000); Name: 'No-block'), (Range:(RangeStart: $0000; RangeEnd: $007F); Name: 'Basic Latin'), (Range:(RangeStart: $0080; RangeEnd: $00FF); Name: 'Latin-1 Supplement'), (Range:(RangeStart: $0100; RangeEnd: $017F); Name: 'Latin Extended-A'), (Range:(RangeStart: $0180; RangeEnd: $024F); Name: 'Latin Extended-B'), (Range:(RangeStart: $0250; RangeEnd: $02AF); Name: 'IPA Extensions'), (Range:(RangeStart: $02B0; RangeEnd: $02FF); Name: 'Spacing Modifier Letters'), (Range:(RangeStart: $0300; RangeEnd: $036F); Name: 'Combining Diacritical Marks'), (Range:(RangeStart: $0370; RangeEnd: $03FF); Name: 'Greek and Coptic'), (Range:(RangeStart: $0400; RangeEnd: $04FF); Name: 'Cyrillic'), (Range:(RangeStart: $0500; RangeEnd: $052F); Name: 'Cyrillic Supplement'), (Range:(RangeStart: $0530; RangeEnd: $058F); Name: 'Armenian'), (Range:(RangeStart: $0590; RangeEnd: $05FF); Name: 'Hebrew'), (Range:(RangeStart: $0600; RangeEnd: $06FF); Name: 'Arabic'), (Range:(RangeStart: $0700; RangeEnd: $074F); Name: 'Syriac'), (Range:(RangeStart: $0750; RangeEnd: $077F); Name: 'Arabic Supplement'), (Range:(RangeStart: $0780; RangeEnd: $07BF); Name: 'Thaana'), (Range:(RangeStart: $07C0; RangeEnd: $07FF); Name: 'NKo'), (Range:(RangeStart: $0900; RangeEnd: $097F); Name: 'Devanagari'), (Range:(RangeStart: $0980; RangeEnd: $09FF); Name: 'Bengali'), (Range:(RangeStart: $0A00; RangeEnd: $0A7F); Name: 'Gurmukhi'), (Range:(RangeStart: $0A80; RangeEnd: $0AFF); Name: 'Gujarati'), (Range:(RangeStart: $0B00; RangeEnd: $0B7F); Name: 'Oriya'), (Range:(RangeStart: $0B80; RangeEnd: $0BFF); Name: 'Tamil'), (Range:(RangeStart: $0C00; RangeEnd: $0C7F); Name: 'Telugu'), (Range:(RangeStart: $0C80; RangeEnd: $0CFF); Name: 'Kannada'), (Range:(RangeStart: $0D00; RangeEnd: $0D7F); Name: 'Malayalam'), (Range:(RangeStart: $0D80; RangeEnd: $0DFF); Name: 'Sinhala'), (Range:(RangeStart: $0E00; RangeEnd: $0E7F); Name: 'Thai'), (Range:(RangeStart: $0E80; RangeEnd: $0EFF); Name: 'Lao'), (Range:(RangeStart: $0F00; RangeEnd: $0FFF); Name: 'Tibetan'), (Range:(RangeStart: $1000; RangeEnd: $109F); Name: 'Myanmar'), (Range:(RangeStart: $10A0; RangeEnd: $10FF); Name: 'Georgian'), (Range:(RangeStart: $1100; RangeEnd: $11FF); Name: 'Hangul Jamo'), (Range:(RangeStart: $1200; RangeEnd: $137F); Name: 'Ethiopic'), (Range:(RangeStart: $1380; RangeEnd: $139F); Name: 'Ethiopic Supplement'), (Range:(RangeStart: $13A0; RangeEnd: $13FF); Name: 'Cherokee'), (Range:(RangeStart: $1400; RangeEnd: $167F); Name: 'Unified Canadian Aboriginal Syllabics'), (Range:(RangeStart: $1680; RangeEnd: $169F); Name: 'Ogham'), (Range:(RangeStart: $16A0; RangeEnd: $16FF); Name: 'Runic'), (Range:(RangeStart: $1700; RangeEnd: $171F); Name: 'Tagalog'), (Range:(RangeStart: $1720; RangeEnd: $173F); Name: 'Hanunoo'), (Range:(RangeStart: $1740; RangeEnd: $175F); Name: 'Buhid'), (Range:(RangeStart: $1760; RangeEnd: $177F); Name: 'Tagbanwa'), (Range:(RangeStart: $1780; RangeEnd: $17FF); Name: 'Khmer'), (Range:(RangeStart: $1800; RangeEnd: $18AF); Name: 'Mongolian'), (Range:(RangeStart: $1900; RangeEnd: $194F); Name: 'Limbu'), (Range:(RangeStart: $1950; RangeEnd: $197F); Name: 'Tai Le'), (Range:(RangeStart: $1980; RangeEnd: $19DF); Name: 'New Tai Lue'), (Range:(RangeStart: $19E0; RangeEnd: $19FF); Name: 'Khmer Symbols'), (Range:(RangeStart: $1A00; RangeEnd: $1A1F); Name: 'Buginese'), (Range:(RangeStart: $1B00; RangeEnd: $1B7F); Name: 'Balinese'), (Range:(RangeStart: $1D00; RangeEnd: $1D7F); Name: 'Phonetic Extensions'), (Range:(RangeStart: $1D80; RangeEnd: $1DBF); Name: 'Phonetic Extensions Supplement'), (Range:(RangeStart: $1DC0; RangeEnd: $1DFF); Name: 'Combining Diacritical Marks Supplement'), (Range:(RangeStart: $1E00; RangeEnd: $1EFF); Name: 'Latin Extended Additional'), (Range:(RangeStart: $1F00; RangeEnd: $1FFF); Name: 'Greek Extended'), (Range:(RangeStart: $2000; RangeEnd: $206F); Name: 'General Punctuation'), (Range:(RangeStart: $2070; RangeEnd: $209F); Name: 'Superscripts and Subscripts'), (Range:(RangeStart: $20A0; RangeEnd: $20CF); Name: 'Currency Symbols'), (Range:(RangeStart: $20D0; RangeEnd: $20FF); Name: 'Combining Diacritical Marks for Symbols'), (Range:(RangeStart: $2100; RangeEnd: $214F); Name: 'Letterlike Symbols'), (Range:(RangeStart: $2150; RangeEnd: $218F); Name: 'Number Forms'), (Range:(RangeStart: $2190; RangeEnd: $21FF); Name: 'Arrows'), (Range:(RangeStart: $2200; RangeEnd: $22FF); Name: 'Mathematical Operators'), (Range:(RangeStart: $2300; RangeEnd: $23FF); Name: 'Miscellaneous Technical'), (Range:(RangeStart: $2400; RangeEnd: $243F); Name: 'Control Pictures'), (Range:(RangeStart: $2440; RangeEnd: $245F); Name: 'Optical Character Recognition'), (Range:(RangeStart: $2460; RangeEnd: $24FF); Name: 'Enclosed Alphanumerics'), (Range:(RangeStart: $2500; RangeEnd: $257F); Name: 'Box Drawing'), (Range:(RangeStart: $2580; RangeEnd: $259F); Name: 'Block Elements'), (Range:(RangeStart: $25A0; RangeEnd: $25FF); Name: 'Geometric Shapes'), (Range:(RangeStart: $2600; RangeEnd: $26FF); Name: 'Miscellaneous Symbols'), (Range:(RangeStart: $2700; RangeEnd: $27BF); Name: 'Dingbats'), (Range:(RangeStart: $27C0; RangeEnd: $27EF); Name: 'Miscellaneous Mathematical Symbols-A'), (Range:(RangeStart: $27F0; RangeEnd: $27FF); Name: 'Supplemental Arrows-A'), (Range:(RangeStart: $2800; RangeEnd: $28FF); Name: 'Braille Patterns'), (Range:(RangeStart: $2900; RangeEnd: $297F); Name: 'Supplemental Arrows-B'), (Range:(RangeStart: $2980; RangeEnd: $29FF); Name: 'Miscellaneous Mathematical Symbols-B'), (Range:(RangeStart: $2A00; RangeEnd: $2AFF); Name: 'Supplemental Mathematical Operators'), (Range:(RangeStart: $2B00; RangeEnd: $2BFF); Name: 'Miscellaneous Symbols and Arrows'), (Range:(RangeStart: $2C00; RangeEnd: $2C5F); Name: 'Glagolitic'), (Range:(RangeStart: $2C60; RangeEnd: $2C7F); Name: 'Latin Extended-C'), (Range:(RangeStart: $2C80; RangeEnd: $2CFF); Name: 'Coptic'), (Range:(RangeStart: $2D00; RangeEnd: $2D2F); Name: 'Georgian Supplement'), (Range:(RangeStart: $2D30; RangeEnd: $2D7F); Name: 'Tifinagh'), (Range:(RangeStart: $2D80; RangeEnd: $2DDF); Name: 'Ethiopic Extended'), (Range:(RangeStart: $2E00; RangeEnd: $2E7F); Name: 'Supplemental Punctuation'), (Range:(RangeStart: $2E80; RangeEnd: $2EFF); Name: 'CJK Radicals Supplement'), (Range:(RangeStart: $2F00; RangeEnd: $2FDF); Name: 'Kangxi Radicals'), (Range:(RangeStart: $2FF0; RangeEnd: $2FFF); Name: 'Ideographic Description Characters'), (Range:(RangeStart: $3000; RangeEnd: $303F); Name: 'CJK Symbols and Punctuation'), (Range:(RangeStart: $3040; RangeEnd: $309F); Name: 'Hiragana'), (Range:(RangeStart: $30A0; RangeEnd: $30FF); Name: 'Katakana'), (Range:(RangeStart: $3100; RangeEnd: $312F); Name: 'Bopomofo'), (Range:(RangeStart: $3130; RangeEnd: $318F); Name: 'Hangul Compatibility Jamo'), (Range:(RangeStart: $3190; RangeEnd: $319F); Name: 'Kanbun'), (Range:(RangeStart: $31A0; RangeEnd: $31BF); Name: 'Bopomofo Extended'), (Range:(RangeStart: $31C0; RangeEnd: $31EF); Name: 'CJK Strokes'), (Range:(RangeStart: $31F0; RangeEnd: $31FF); Name: 'Katakana Phonetic Extensions'), (Range:(RangeStart: $3200; RangeEnd: $32FF); Name: 'Enclosed CJK Letters and Months'), (Range:(RangeStart: $3300; RangeEnd: $33FF); Name: 'CJK Compatibility'), (Range:(RangeStart: $3400; RangeEnd: $4DBF); Name: 'CJK Unified Ideographs Extension A'), (Range:(RangeStart: $4DC0; RangeEnd: $4DFF); Name: 'Yijing Hexagram Symbols'), (Range:(RangeStart: $4E00; RangeEnd: $9FFF); Name: 'CJK Unified Ideographs'), (Range:(RangeStart: $A000; RangeEnd: $A48F); Name: 'Yi Syllables'), (Range:(RangeStart: $A490; RangeEnd: $A4CF); Name: 'Yi Radicals'), (Range:(RangeStart: $A700; RangeEnd: $A71F); Name: 'Modifier Tone Letters'), (Range:(RangeStart: $A720; RangeEnd: $A7FF); Name: 'Latin Extended-D'), (Range:(RangeStart: $A800; RangeEnd: $A82F); Name: 'Syloti Nagri'), (Range:(RangeStart: $A840; RangeEnd: $A87F); Name: 'Phags-pa'), (Range:(RangeStart: $AC00; RangeEnd: $D7AF); Name: 'Hangul Syllables'), (Range:(RangeStart: $D800; RangeEnd: $DB7F); Name: 'High Surrogates'), (Range:(RangeStart: $DB80; RangeEnd: $DBFF); Name: 'High Private Use Surrogates'), (Range:(RangeStart: $DC00; RangeEnd: $DFFF); Name: 'Low Surrogates'), (Range:(RangeStart: $E000; RangeEnd: $F8FF); Name: 'Private Use Area'), (Range:(RangeStart: $F900; RangeEnd: $FAFF); Name: 'CJK Compatibility Ideographs'), (Range:(RangeStart: $FB00; RangeEnd: $FB4F); Name: 'Alphabetic Presentation Forms'), (Range:(RangeStart: $FB50; RangeEnd: $FDFF); Name: 'Arabic Presentation Forms-A'), (Range:(RangeStart: $FE00; RangeEnd: $FE0F); Name: 'Variation Selectors'), (Range:(RangeStart: $FE10; RangeEnd: $FE1F); Name: 'Vertical Forms'), (Range:(RangeStart: $FE20; RangeEnd: $FE2F); Name: 'Combining Half Marks'), (Range:(RangeStart: $FE30; RangeEnd: $FE4F); Name: 'CJK Compatibility Forms'), (Range:(RangeStart: $FE50; RangeEnd: $FE6F); Name: 'Small Form Variants'), (Range:(RangeStart: $FE70; RangeEnd: $FEFF); Name: 'Arabic Presentation Forms-B'), (Range:(RangeStart: $FF00; RangeEnd: $FFEF); Name: 'Halfwidth and Fullwidth Forms'), (Range:(RangeStart: $FFF0; RangeEnd: $FFFF); Name: 'Specials'), (Range:(RangeStart: $10000; RangeEnd: $1007F); Name: 'Linear B Syllabary'), (Range:(RangeStart: $10080; RangeEnd: $100FF); Name: 'Linear B Ideograms'), (Range:(RangeStart: $10100; RangeEnd: $1013F); Name: 'Aegean Numbers'), (Range:(RangeStart: $10140; RangeEnd: $1018F); Name: 'Ancient Greek Numbers'), (Range:(RangeStart: $10300; RangeEnd: $1032F); Name: 'Old Italic'), (Range:(RangeStart: $10330; RangeEnd: $1034F); Name: 'Gothic'), (Range:(RangeStart: $10380; RangeEnd: $1039F); Name: 'Ugaritic'), (Range:(RangeStart: $103A0; RangeEnd: $103DF); Name: 'Old Persian'), (Range:(RangeStart: $10400; RangeEnd: $1044F); Name: 'Deseret'), (Range:(RangeStart: $10450; RangeEnd: $1047F); Name: 'Shavian'), (Range:(RangeStart: $10480; RangeEnd: $104AF); Name: 'Osmanya'), (Range:(RangeStart: $10800; RangeEnd: $1083F); Name: 'Cypriot Syllabary'), (Range:(RangeStart: $10900; RangeEnd: $1091F); Name: 'Phoenician'), (Range:(RangeStart: $10A00; RangeEnd: $10A5F); Name: 'Kharoshthi'), (Range:(RangeStart: $12000; RangeEnd: $123FF); Name: 'Cuneiform'), (Range:(RangeStart: $12400; RangeEnd: $1247F); Name: 'Cuneiform Numbers and Punctuation'), (Range:(RangeStart: $1D000; RangeEnd: $1D0FF); Name: 'Byzantine Musical Symbols'), (Range:(RangeStart: $1D100; RangeEnd: $1D1FF); Name: 'Musical Symbols'), (Range:(RangeStart: $1D200; RangeEnd: $1D24F); Name: 'Ancient Greek Musical Notation'), (Range:(RangeStart: $1D300; RangeEnd: $1D35F); Name: 'Tai Xuan Jing Symbols'), (Range:(RangeStart: $1D360; RangeEnd: $1D37F); Name: 'Counting Rod Numerals'), (Range:(RangeStart: $1D400; RangeEnd: $1D7FF); Name: 'Mathematical Alphanumeric Symbols'), (Range:(RangeStart: $20000; RangeEnd: $2A6DF); Name: 'CJK Unified Ideographs Extension B'), (Range:(RangeStart: $2F800; RangeEnd: $2FA1F); Name: 'CJK Compatibility Ideographs Supplement'), (Range:(RangeStart: $E0000; RangeEnd: $E007F); Name: 'Tags'), (Range:(RangeStart: $E0100; RangeEnd: $E01EF); Name: 'Variation Selectors Supplement'), (Range:(RangeStart: $F0000; RangeEnd: $FFFFF); Name: 'Supplementary Private Use Area-A'), (Range:(RangeStart: $100000; RangeEnd: $10FFFF); Name: 'Supplementary Private Use Area-B')); type TWideStrings = class; TSearchFlag = ( sfCaseSensitive, // match letter case sfIgnoreNonSpacing, // ignore non-spacing characters in search sfSpaceCompress, // handle several consecutive white spaces as one white space // (this applies to the pattern as well as the search text) sfWholeWordOnly // match only text at end/start and/or surrounded by white spaces ); TSearchFlags = set of TSearchFlag; // a generic search class defininition used for tuned Boyer-Moore and Unicode // regular expression searches TSearchEngine = class(TObject) private FResults: TList; // 2 entries for each result (start and stop position) FOwner: TWideStrings; // at the moment unused, perhaps later to access strings faster protected function GetCount: SizeInt; virtual; public constructor Create(AOwner: TWideStrings); virtual; destructor Destroy; override; procedure AddResult(Start, Stop: SizeInt); virtual; procedure Clear; virtual; procedure ClearResults; virtual; procedure DeleteResult(Index: SizeInt); virtual; procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; virtual; abstract; procedure FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); overload; virtual; abstract; function FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; overload; virtual; abstract; function FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; overload; virtual; abstract; function FindAll(const Text: WideString): Boolean; overload; virtual; abstract; function FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; overload; virtual; abstract; procedure GetResult(Index: SizeInt; var Start, Stop: SizeInt); virtual; property Count: SizeInt read GetCount; end; // The Unicode Tuned Boyer-Moore (UTBM) search implementation is an extended // translation created from a free package written by Mark Leisher (mleisher att crl dott nmsu dott edu). // // The code handles high and low surrogates as well as case (in)dependency, // can ignore non-spacing characters and allows optionally to return whole // words only. // single pattern character PUTBMChar = ^TUTBMChar; TUTBMChar = record LoCase, UpCase, TitleCase: UCS4; end; PUTBMSkip = ^TUTBMSkip; TUTBMSkip = record BMChar: PUTBMChar; SkipValues: Integer; end; TUTBMSearch = class(TSearchEngine) private FFlags: TSearchFlags; FPattern: PUTBMChar; FPatternUsed: SizeInt; FPatternSize: SizeInt; FPatternLength: SizeInt; FSkipValues: PUTBMSkip; FSkipsUsed: SizeInt; FMD4: SizeInt; protected procedure ClearPattern; procedure Compile(Pattern: PUCS2; PatternLength: SizeInt; Flags: TSearchFlags); function Find(Text: PUCS2; TextLen: SizeInt; var MatchStart, MatchEnd: SizeInt): Boolean; function GetSkipValue(TextStart, TextEnd: PUCS2): SizeInt; function Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: SizeInt): Boolean; public procedure Clear; override; procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override; procedure FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); overload; override; function FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; overload; override; function FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; overload; override; function FindAll(const Text: WideString): Boolean; overload; override; function FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; overload; override; end; // Regular expression search engine for text in UCS2 form taking surrogates // into account. This implementation is an improved translation from the URE // package written by Mark Leisher (mleisher att crl dott nmsu dott edu) who used a variation // of the RE->DFA algorithm done by Mark Hopkins (markh att csd4 dott csd dott uwm dott edu). // Assumptions: // o Regular expression and text already normalized. // o Conversion to lower case assumes a 1-1 mapping. // // Definitions: // Separator - any one of U+2028, U+2029, NL, CR. // // Operators: // . - match any character // * - match zero or more of the last subexpression // + - match one or more of the last subexpression // ? - match zero or one of the last subexpression // () - subexpression grouping // {m, n} - match at least m occurences and up to n occurences // Note: both values can be 0 or ommitted which denotes then a unlimiting bound // {,} and {0,} and {0, 0} correspond to * // {, 1} and {0, 1} correspond to ? // {1,} and {1, 0} correspond to + // {m} - match exactly m occurences // // Notes: // o The "." operator normally does not match separators, but a flag is // available that will allow this operator to match a separator. // // Literals and Constants: // c - literal UCS2 character // \x.... - hexadecimal number of up to 4 digits // \X.... - hexadecimal number of up to 4 digits // \u.... - hexadecimal number of up to 4 digits // \U.... - hexadecimal number of up to 4 digits // // Character classes: // [...] - Character class // [^...] - Negated character class // \pN1,N2,...,Nn - Character properties class // \PN1,N2,...,Nn - Negated character properties class // // POSIX character classes recognized: // :alnum: // :alpha: // :cntrl: // :digit: // :graph: // :lower: // :print: // :punct: // :space: // :upper: // :xdigit: // // Notes: // o Character property classes are \p or \P followed by a comma separated // list of integers between 0 and the maximum entry index in TCharacterCategory. // These integers directly correspond to the TCharacterCategory enumeration entries. // Note: upper, lower and title case classes need to have case sensitive search // be enabled to match correctly! // // o Character classes can contain literals, constants and character // property classes. Example: // // [abc\U10A\p0,13,4] // structure used to handle a compacted range of characters PUcRange = ^TUcRange; TUcRange = record MinCode, MaxCode: UCS4; end; TUcCClass = record Ranges: array of TUcRange; RangesUsed: SizeInt; end; // either a single character or a list of character classes TUcSymbol = record Chr: UCS4; CCL: TUcCClass; end; // this is a general element structure used for expressions and stack elements TUcElement = record OnStack: Boolean; AType, LHS, RHS: SizeInt; end; // this is a structure used to track a list or a stack of states PUcStateList = ^TUcStateList; TUcStateList = record List: array of SizeInt; ListUsed: SizeInt; end; // structure to track the list of unique states for a symbol during reduction PUcSymbolTableEntry = ^TUcSymbolTableEntry; TUcSymbolTableEntry = record ID, AType: SizeInt; Mods, Categories: TCharacterCategories; Symbol: TUcSymbol; States: TUcStateList; end; // structure to hold a single State PUcState = ^TUcState; TUcState = record ID: SizeInt; Accepting: Boolean; StateList: TUcStateList; Transitions: array of TUcElement; TransitionsUsed: SizeInt; end; // structure used for keeping lists of states TUcStateTable = record States: array of TUcState; StatesUsed: SizeInt; end; // structure to track pairs of DFA states when equivalent states are merged TUcEquivalent = record Left, Right: SizeInt; end; TUcExpressionList = record Expressions: array of TUcElement; ExpressionsUsed: SizeInt; end; TUcSymbolTable = record Symbols: array of TUcSymbolTableEntry; SymbolsUsed: SizeInt; end; TUcEquivalentList = record Equivalents: array of TUcEquivalent; EquivalentsUsed: SizeInt; end; // structure used for constructing the NFA and reducing to a minimal DFA PUREBuffer = ^TUREBuffer; TUREBuffer = record Reducing: Boolean; Error: Integer; Flags: Cardinal; Stack: TUcStateList; SymbolTable: TUcSymbolTable; // table of unique symbols encountered ExpressionList: TUcExpressionList; // tracks the unique expressions generated // for the NFA and when the NFA is reduced States: TUcStateTable; // the reduced table of unique groups of NFA states EquivalentList: TUcEquivalentList; // tracks states when equivalent states are merged end; TUcTransition = record Symbol, NextState: SizeInt; end; PDFAState = ^TDFAState; TDFAState = record Accepting: Boolean; NumberTransitions: SizeInt; StartTransition: SizeInt; end; TDFAStates = record States: array of TDFAState; StatesUsed: SizeInt; end; TUcTransitions = record Transitions: array of TUcTransition; TransitionsUsed: SizeInt; end; TDFA = record Flags: Cardinal; SymbolTable: TUcSymbolTable; StateList: TDFAStates; TransitionList: TUcTransitions; end; TURESearch = class(TSearchEngine) private FUREBuffer: TUREBuffer; FDFA: TDFA; protected procedure AddEquivalentPair(L, R: SizeInt); procedure AddRange(var CCL: TUcCClass; Range: TUcRange); function AddState(NewStates: array of SizeInt): SizeInt; procedure AddSymbolState(Symbol, State: SizeInt); function BuildCharacterClass(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt; procedure ClearUREBuffer; function CompileSymbol(S: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt; procedure CompileURE(RE: PWideChar; RELength: SizeInt; Casefold: Boolean); procedure CollectPendingOperations(var State: SizeInt); function ConvertRegExpToNFA(RE: PWideChar; RELength: SizeInt): SizeInt; function ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: SizeInt; var MatchStart, MatchEnd: SizeInt): Boolean; procedure ClearDFA; procedure HexDigitSetup(Symbol: PUcSymbolTableEntry); function MakeExpression(AType, LHS, RHS: SizeInt): SizeInt; function MakeHexNumber(NP: PUCS2; Limit: SizeInt; var Number: UCS4): SizeInt; function MakeSymbol(S: PUCS2; Limit: SizeInt; out Consumed: SizeInt): SizeInt; procedure MergeEquivalents; function ParsePropertyList(Properties: PUCS2; Limit: SizeInt; var Categories: TCharacterCategories): SizeInt; function Peek: SizeInt; function Pop: SizeInt; function PosixCCL(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt; function ProbeLowSurrogate(LeftState: PUCS2; Limit: SizeInt; var Code: UCS4): SizeInt; procedure Push(V: SizeInt); procedure Reduce(Start: SizeInt); procedure SpaceSetup(Symbol: PUcSymbolTableEntry; Categories: TCharacterCategories); function SymbolsAreDifferent(A, B: PUcSymbolTableEntry): Boolean; public procedure Clear; override; procedure FindPrepare(const Pattern: WideString; Options: TSearchFlags); overload; override; procedure FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); overload; override; function FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; overload; override; function FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; overload; override; function FindAll(const Text: WideString): Boolean; overload; override; function FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; overload; override; end; // Event used to give the application a chance to switch the way of how to save // the text in TWideStrings if the text contains characters not only from the // ANSI block but the save type is ANSI. On triggering the event the application // can change the property SaveUnicode as needed. This property is again checked // after the callback returns. TConfirmConversionEvent = procedure (Sender: TWideStrings; var Allowed: Boolean) of object; TWideStrings = class(TPersistent) private FUpdateCount: Integer; FLanguage: LCID; // language can usually left alone, the system's default is used FSaved: Boolean; // set in SaveToStream, True in case saving was successfull otherwise False FNormalizationForm: TNormalizationForm; // determines in which form Unicode strings should be stored FOnConfirmConversion: TConfirmConversionEvent; FSaveFormat: TSaveFormat; // overrides the FSaveUnicode flag, initialized when a file is loaded, // expect losses if it is set to sfAnsi before saving function GetCommaText: WideString; function GetName(Index: Integer): WideString; function GetValue(const Name: WideString): WideString; procedure ReadData(Reader: TReader); procedure SetCommaText(const Value: WideString); procedure SetNormalizationForm(const Value: TNormalizationForm); procedure SetValue(const Name, Value: WideString); procedure WriteData(Writer: TWriter); function GetSaveUnicode: Boolean; procedure SetSaveUnicode(const Value: Boolean); protected procedure DefineProperties(Filer: TFiler); override; procedure DoConfirmConversion(var Allowed: Boolean); virtual; procedure Error(const Msg: string; Data: Integer); function Get(Index: Integer): WideString; virtual; abstract; function GetCapacity: Integer; virtual; function GetCount: Integer; virtual; abstract; function GetObject(Index: Integer): TObject; virtual; function GetTextStr: WideString; virtual; procedure Put(Index: Integer; const S: WideString); virtual; abstract; procedure PutObject(Index: Integer; AObject: TObject); virtual; abstract; procedure SetCapacity(NewCapacity: Integer); virtual; procedure SetUpdateState(Updating: Boolean); virtual; procedure SetLanguage(Value: LCID); virtual; public constructor Create; function Add(const S: WideString): Integer; virtual; function AddObject(const S: WideString; AObject: TObject): Integer; virtual; procedure Append(const S: WideString); procedure AddStrings(Strings: TStrings); overload; virtual; procedure AddStrings(Strings: TWideStrings); overload; virtual; procedure Assign(Source: TPersistent); override; procedure AssignTo(Dest: TPersistent); override; procedure BeginUpdate; procedure Clear; virtual; abstract; procedure Delete(Index: Integer); virtual; abstract; procedure EndUpdate; function Equals(Strings: TWideStrings): Boolean; {$IFDEF RTL200_UP} reintroduce; {$ENDIF RTL200_UP} procedure Exchange(Index1, Index2: Integer); virtual; function GetSeparatedText(Separators: WideString): WideString; virtual; function GetText: PWideChar; virtual; function IndexOf(const S: WideString): Integer; virtual; function IndexOfName(const Name: WideString): Integer; function IndexOfObject(AObject: TObject): Integer; procedure Insert(Index: Integer; const S: WideString); virtual; abstract; procedure InsertObject(Index: Integer; const S: WideString; AObject: TObject); procedure LoadFromFile(const FileName: TFileName); virtual; procedure LoadFromStream(Stream: TStream); virtual; procedure Move(CurIndex, NewIndex: Integer); virtual; procedure SaveToFile(const FileName: TFileName); virtual; procedure SaveToStream(Stream: TStream; WithBOM: Boolean = True); virtual; procedure SetText(const Value: WideString); virtual; property Capacity: Integer read GetCapacity write SetCapacity; property CommaText: WideString read GetCommaText write SetCommaText; property Count: Integer read GetCount; property Language: LCID read FLanguage write SetLanguage; property Names[Index: Integer]: WideString read GetName; property NormalizationForm: TNormalizationForm read FNormalizationForm write SetNormalizationForm default nfC; property Objects[Index: Integer]: TObject read GetObject write PutObject; property Values[const Name: WideString]: WideString read GetValue write SetValue; property Saved: Boolean read FSaved; property SaveUnicode: Boolean read GetSaveUnicode write SetSaveUnicode default True; property SaveFormat: TSaveFormat read FSaveFormat write FSaveFormat default sfUnicodeLSB; property Strings[Index: Integer]: WideString read Get write Put; default; property Text: WideString read GetTextStr write SetText; property OnConfirmConversion: TConfirmConversionEvent read FOnConfirmConversion write FOnConfirmConversion; end; //----- TWideStringList class TWideStringItem = record {$IFDEF OWN_WIDESTRING_MEMMGR} FString: PWideChar; // "array of WideChar"; {$ELSE ~OWN_WIDESTRING_MEMMGR} FString: WideString; {$ENDIF ~OWN_WIDESTRING_MEMMGR} FObject: TObject; end; TWideStringItemList = array of TWideStringItem; TWideStringList = class(TWideStrings) private FList: TWideStringItemList; FCount: Integer; FSorted: Boolean; FDuplicates: TDuplicates; FOnChange: TNotifyEvent; FOnChanging: TNotifyEvent; procedure ExchangeItems(Index1, Index2: Integer); procedure Grow; procedure QuickSort(L, R: Integer); procedure InsertItem(Index: Integer; const S: WideString); procedure SetSorted(Value: Boolean); {$IFDEF OWN_WIDESTRING_MEMMGR} procedure SetListString(Index: Integer; const S: WideString); {$ENDIF OWN_WIDESTRING_MEMMGR} protected procedure Changed; virtual; procedure Changing; virtual; function Get(Index: Integer): WideString; override; function GetCapacity: Integer; override; function GetCount: Integer; override; function GetObject(Index: Integer): TObject; override; procedure Put(Index: Integer; const S: WideString); override; procedure PutObject(Index: Integer; AObject: TObject); override; procedure SetCapacity(NewCapacity: Integer); override; procedure SetUpdateState(Updating: Boolean); override; procedure SetLanguage(Value: LCID); override; public destructor Destroy; override; function Add(const S: WideString): Integer; override; procedure Clear; override; procedure Delete(Index: Integer); override; procedure Exchange(Index1, Index2: Integer); override; function Find(const S: WideString; var Index: Integer): Boolean; virtual; function IndexOf(const S: WideString): Integer; override; procedure Insert(Index: Integer; const S: WideString); override; procedure Sort; virtual; property Duplicates: TDuplicates read FDuplicates write FDuplicates; property Sorted: Boolean read FSorted write SetSorted; property OnChange: TNotifyEvent read FOnChange write FOnChange; property OnChanging: TNotifyEvent read FOnChanging write FOnChanging; end; { // all these functions are now in JclWideStrings.pas function StrLenW(Str: PWideChar): SizeInt; function StrEndW(Str: PWideChar): PWideChar; function StrMoveW(Dest, Source: PWideChar; Count: SizeInt): PWideChar; function StrCopyW(Dest, Source: PWideChar): PWideChar; function StrECopyW(Dest, Source: PWideChar): PWideChar; function StrLCopyW(Dest, Source: PWideChar; MaxLen: SizeInt): PWideChar; function StrPCopyWW(Dest: PWideChar; const Source: WideString): PWideChar; overload; function StrPCopyW(Dest: PWideChar; const Source: AnsiString): PWideChar; function StrPLCopyWW(Dest: PWideChar; const Source: WideString; MaxLen: SizeInt): PWideChar; function StrPLCopyW(Dest: PWideChar; const Source: AnsiString; MaxLen: SizeInt): PWideChar; function StrCatW(Dest: PWideChar; const Source: PWideChar): PWideChar; function StrLCatW(Dest, Source: PWideChar; MaxLen: SizeInt): PWideChar; function StrCompW(const Str1, Str2: PWideChar): Integer; function StrICompW(const Str1, Str2: PWideChar): Integer; function StrLCompW(const Str1, Str2: PWideChar; MaxLen: SizeInt): Integer; function StrLICompW(const Str1, Str2: PWideChar; MaxLen: SizeInt): Integer; function StrNScanW(const Str1, Str2: PWideChar): SizeInt; function StrRNScanW(const Str1, Str2: PWideChar): SizeInt; function StrScanW(Str: PWideChar; Chr: WideChar): PWideChar; overload; function StrScanW(Str: PWideChar; Chr: WideChar; StrLen: SizeInt): PWideChar; overload; function StrRScanW(Str: PWideChar; Chr: WideChar): PWideChar; function StrPosW(Str, SubStr: PWideChar): PWideChar; function StrAllocW(WideSize: SizeInt): PWideChar; function StrBufSizeW(const Str: PWideChar): SizeInt; function StrNewW(const Str: PWideChar): PWideChar; overload; function StrNewW(const Str: WideString): PWideChar; overload; procedure StrDisposeW(Str: PWideChar); procedure StrDisposeAndNilW(var Str: PWideChar); procedure StrSwapByteOrder(Str: PWideChar); } // functions involving Delphi wide strings function WideAdjustLineBreaks(const S: WideString): WideString; function WideCharPos(const S: WideString; const Ch: WideChar; const Index: SizeInt): SizeInt; //az function WideCompose(const S: WideString): WideString; function WideDecompose(const S: WideString; Compatible: Boolean): WideString; function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString; function WideQuotedStr(const S: WideString; Quote: WideChar): WideString; function WideStringOfChar(C: WideChar; Count: SizeInt): WideString; // case conversion function type TCaseType = (ctFold, ctLower, ctTitle, ctUpper); function WideCaseConvert(C: WideChar; CaseType: TCaseType): WideString; overload; function WideCaseConvert(const S: WideString; CaseType: TCaseType): WideString; overload; function WideCaseFolding(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideCaseFolding(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideLowerCase(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideLowerCase(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideTitleCase(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideTitleCase(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideUpperCase(C: WideChar): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideUpperCase(const S: WideString): WideString; overload; {$IFDEF SUPPORTS_INLINE} inline; {$ENDIF SUPPORTS_INLINE} function WideNormalize(const S: WideString; Form: TNormalizationForm): WideString; function WideSameText(const Str1, Str2: WideString): Boolean; function WideTrim(const S: WideString): WideString; function WideTrimLeft(const S: WideString): WideString; function WideTrimRight(const S: WideString): WideString; type // result type for number retrieval functions TUcNumber = record Numerator, Denominator: Integer; end; // Low level character routines function UnicodeNumberLookup(Code: UCS4; var Number: TUcNumber): Boolean; function UnicodeCompose(const Codes: array of UCS4; out Composite: UCS4): Integer; function UnicodeCaseFold(Code: UCS4): TUCS4Array; function UnicodeToUpper(Code: UCS4): TUCS4Array; function UnicodeToLower(Code: UCS4): TUCS4Array; function UnicodeToTitle(Code: UCS4): TUCS4Array; // Character test routines function UnicodeIsAlpha(C: UCS4): Boolean; function UnicodeIsDigit(C: UCS4): Boolean; function UnicodeIsAlphaNum(C: UCS4): Boolean; function UnicodeIsCased(C: UCS4): Boolean; function UnicodeIsControl(C: UCS4): Boolean; function UnicodeIsSpace(C: UCS4): Boolean; function UnicodeIsWhiteSpace(C: UCS4): Boolean; function UnicodeIsBlank(C: UCS4): Boolean; function UnicodeIsPunctuation(C: UCS4): Boolean; function UnicodeIsGraph(C: UCS4): Boolean; function UnicodeIsPrintable(C: UCS4): Boolean; function UnicodeIsUpper(C: UCS4): Boolean; function UnicodeIsLower(C: UCS4): Boolean; function UnicodeIsTitle(C: UCS4): Boolean; function UnicodeIsHexDigit(C: UCS4): Boolean; function UnicodeIsIsoControl(C: UCS4): Boolean; function UnicodeIsFormatControl(C: UCS4): Boolean; function UnicodeIsSymbol(C: UCS4): Boolean; function UnicodeIsNumber(C: UCS4): Boolean; function UnicodeIsNonSpacing(C: UCS4): Boolean; function UnicodeIsOpenPunctuation(C: UCS4): Boolean; function UnicodeIsClosePunctuation(C: UCS4): Boolean; function UnicodeIsInitialPunctuation(C: UCS4): Boolean; function UnicodeIsFinalPunctuation(C: UCS4): Boolean; function UnicodeIsComposed(C: UCS4): Boolean; function UnicodeIsQuotationMark(C: UCS4): Boolean; function UnicodeIsSymmetric(C: UCS4): Boolean; function UnicodeIsMirroring(C: UCS4): Boolean; function UnicodeIsNonBreaking(C: UCS4): Boolean; // Directionality functions function UnicodeIsRightToLeft(C: UCS4): Boolean; function UnicodeIsLeftToRight(C: UCS4): Boolean; function UnicodeIsStrong(C: UCS4): Boolean; function UnicodeIsWeak(C: UCS4): Boolean; function UnicodeIsNeutral(C: UCS4): Boolean; function UnicodeIsSeparator(C: UCS4): Boolean; // Other character test functions function UnicodeIsMark(C: UCS4): Boolean; function UnicodeIsModifier(C: UCS4): Boolean; function UnicodeIsLetterNumber(C: UCS4): Boolean; function UnicodeIsConnectionPunctuation(C: UCS4): Boolean; function UnicodeIsDash(C: UCS4): Boolean; function UnicodeIsMath(C: UCS4): Boolean; function UnicodeIsCurrency(C: UCS4): Boolean; function UnicodeIsModifierSymbol(C: UCS4): Boolean; function UnicodeIsNonSpacingMark(C: UCS4): Boolean; function UnicodeIsSpacingMark(C: UCS4): Boolean; function UnicodeIsEnclosing(C: UCS4): Boolean; function UnicodeIsPrivate(C: UCS4): Boolean; function UnicodeIsSurrogate(C: UCS4): Boolean; function UnicodeIsLineSeparator(C: UCS4): Boolean; function UnicodeIsParagraphSeparator(C: UCS4): Boolean; function UnicodeIsIdentifierStart(C: UCS4): Boolean; function UnicodeIsIdentifierPart(C: UCS4): Boolean; function UnicodeIsDefined(C: UCS4): Boolean; function UnicodeIsUndefined(C: UCS4): Boolean; function UnicodeIsHan(C: UCS4): Boolean; function UnicodeIsHangul(C: UCS4): Boolean; // Utility functions function CharSetFromLocale(Language: LCID): Byte; function GetCharSetFromLocale(Language: LCID; out FontCharSet: Byte): Boolean; function CodePageFromLocale(Language: LCID): Word; function CodeBlockName(const CB: TUnicodeBlock): string; function CodeBlockRange(const CB: TUnicodeBlock): TUnicodeBlockRange; function CodeBlockFromChar(const C: UCS4): TUnicodeBlock; function KeyboardCodePage: Word; function KeyUnicode(C: Char): WideChar; function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString; function TranslateString(const S: AnsiString; CP1, CP2: Word): AnsiString; function WideStringToStringEx(const WS: WideString; CodePage: Word): AnsiString; type TCompareFunc = function (const W1, W2: WideString; Locale: LCID): Integer; var WideCompareText: TCompareFunc; type EJclUnicodeError = class(EJclError); // functions to load Unicode data from resource procedure LoadCharacterCategories; procedure LoadCaseMappingData; procedure LoadDecompositionData; procedure LoadCombiningClassData; procedure LoadNumberData; procedure LoadCompositionData; {$IFDEF UNITVERSIONING} const UnitVersioning: TUnitVersionInfo = ( RCSfile: '$URL: https://jcl.svn.sourceforge.net/svnroot/jcl/tags/JCL-2.1-Build3536/jcl/source/common/JclUnicode.pas $'; Revision: '$Revision: 2930 $'; Date: '$Date: 2009-08-09 19:06:40 +0200 (dim., 09 août 2009) $'; LogPath: 'JCL\source\common'; Extra: ''; Data: nil ); {$ENDIF UNITVERSIONING} implementation // Unicode data for case mapping, decomposition, numbers etc. This data is // loaded on demand which means only those parts will be put in memory which are // needed by one of the lookup functions. // Note: There is a little tool called UDExtract which creates a resouce script from // the Unicode database file which can be compiled to the needed res file. // This tool, including its source code, can be downloaded from www.lischke-online.de/Unicode.html. {$IFDEF UNICODE_RAW_DATA} {$R JclUnicode.res} {$ENDIF UNICODE_RAW_DATA} {$IFDEF UNICODE_BZIP2_DATA} {$R JclUnicodeBZip2.res} {$ENDIF UNICODE_BZIP2_DATA} {$IFDEF UNICODE_ZLIB_DATA} {$R JclUnicodeZLib.res} {$ENDIF UNICODE_ZLIB_DATA} uses {$IFDEF HAS_UNIT_RTLCONSTS} {$IFDEF BORLAND} RtlConsts, {$ENDIF BORLAND} {$ELSE ~HAS_UNIT_RTLCONSTS} {$IFNDEF FPC} Consts, {$ENDIF ~FPC} {$ENDIF ~HAS_UNIT_RTLCONSTS} {$IFDEF UNICODE_BZIP2_DATA} BZip2, {$ENDIF UNICODE_BZIP2_DATA} {$IFDEF UNICODE_ZLIB_DATA} ZLibh, {$ENDIF UNICODE_ZLIB_DATA} JclStreams, {$IFNDEF UNICODE_RAW_DATA} JclCompression, {$ENDIF ~UNICODE_RAW_DATA} JclResources, JclSynch, JclSysUtils, JclSysInfo, JclStringConversions, JclWideStrings; const {$IFDEF FPC} // declarations from unit [Rtl]Consts SDuplicateString = 'String list does not allow duplicates'; SListIndexError = 'List index out of bounds (%d)'; SSortedListError = 'Operation not allowed on sorted string list'; {$ENDIF FPC} // some predefined sets to shorten parameter lists below and ease repeative usage ClassLetter = [ccLetterUppercase, ccLetterLowercase, ccLetterTitlecase, ccLetterModifier, ccLetterOther]; ClassSpace = [ccSeparatorSpace, ccSpaceOther]; ClassPunctuation = [ccPunctuationConnector, ccPunctuationDash, ccPunctuationOpen, ccPunctuationClose, ccPunctuationOther, ccPunctuationInitialQuote, ccPunctuationFinalQuote]; ClassMark = [ccMarkNonSpacing, ccMarkSpacingCombining, ccMarkEnclosing]; ClassNumber = [ccNumberDecimalDigit, ccNumberLetter, ccNumberOther]; ClassSymbol = [ccSymbolMath, ccSymbolCurrency, ccSymbolModifier, ccSymbolOther]; ClassEuropeanNumber = [ccEuropeanNumber, ccEuropeanNumberSeparator, ccEuropeanNumberTerminator]; // used to negate a set of categories ClassAll = [Low(TCharacterCategory)..High(TCharacterCategory)]; var // As the global data can be accessed by several threads it should be guarded // while the data is loaded. LoadInProgress: TJclCriticalSection; function OpenResourceStream(const ResName: string): TJclEasyStream; var ResourceStream: TStream; {$IFNDEF UNICODE_RAW_DATA} DecompressionStream: TStream; RawStream: TMemoryStream; {$ENDIF ~UNICODE_RAW_DATA} begin ResourceStream := TResourceStream.Create(HInstance, ResName, 'UNICODEDATA'); {$IFDEF UNICODE_RAW_DATA} Result := TJclEasyStream.Create(ResourceStream, True); {$ENDIF UNICODE_RAW_DATA} {$IFDEF UNICODE_BZIP2_DATA} try LoadBZip2; DecompressionStream := TJclBZIP2DecompressionStream.Create(ResourceStream); try RawStream := TMemoryStream.Create; StreamCopy(DecompressionStream, RawStream); RawStream.Seek(0, soBeginning); Result := TJclEasyStream.Create(RawStream, True); finally DecompressionStream.Free; end; finally ResourceStream.Free; end; {$ENDIF UNICODE_BZIP2_DATA} {$IFDEF UNICODE_ZLIB_DATA} try LoadZLib; DecompressionStream := TJclZLibDecompressStream.Create(ResourceStream); try RawStream := TMemoryStream.Create; StreamCopy(DecompressionStream, RawStream); RawStream.Seek(0, soBeginning); Result := TJclEasyStream.Create(RawStream, True); finally DecompressionStream.Free; end; finally ResourceStream.Free; end; {$ENDIF UNICODE_ZLIB_DATA} end; //----------------- support for character categories ----------------------------------------------- // Character category data is quite a large block since every defined character in Unicode is assigned at least // one category. Because of this we cannot use a sparse matrix to provide quick access as implemented for // e.g. composition data. // The approach used here is based on the fact that an application seldomly uses all characters defined in Unicode // simultanously. In fact the opposite is true. Most application will use either Western Europe or Arabic or // Far East character data, but very rarely all together. Based on this fact is the implementation of virtual // memory using the systems paging file (aka file mapping) to load only into virtual memory what is used currently. // The implementation is not yet finished and needs a lot of improvements yet. type // start and stop of a range of code points TRange = record Start, Stop: Cardinal; end; TRangeArray = array of TRange; TCategoriesArray = array of array of TCharacterCategories; var // character categories, stored in the system's swap file and mapped on demand CategoriesLoaded: Boolean; Categories: array [Byte] of TCategoriesArray; procedure LoadCharacterCategories; // Loads the character categories data (as saved by the Unicode database extractor, see also // the comments about JclUnicode.res above). var Size: Integer; Stream: TJclEasyStream; Category: TCharacterCategory; Buffer: TRangeArray; First, Second, Third: Byte; J, K: Integer; begin // Data already loaded? if not CategoriesLoaded then begin // make sure no other code is currently modifying the global data area LoadInProgress.Enter; try CategoriesLoaded := True; Stream := OpenResourceStream('CATEGORIES'); try while Stream.Position < Stream.Size do begin // a) read which category is current in the stream Category := TCharacterCategory(Stream.ReadByte); // b) read the size of the ranges and the ranges themself Size := Stream.ReadInteger; if Size > 0 then begin SetLength(Buffer, Size); for J := 0 to Size - 1 do begin Buffer[J].Start := Stream.ReadInteger; Buffer[J].Stop := Stream.ReadInteger; end; // c) go through every range and add the current category to each code point for J := 0 to Size - 1 do for K := Buffer[J].Start to Buffer[J].Stop do begin Assert(K < $1000000, LoadResString(@RsCategoryUnicodeChar)); First := (K shr 16) and $FF; Second := (K shr 8) and $FF; Third := K and $FF; // add second step array if not yet done if Categories[First] = nil then SetLength(Categories[First], 256); if Categories[First, Second] = nil then SetLength(Categories[First, Second], 256); Include(Categories[First, Second, Third], Category); end; end; end; finally Stream.Free; end; finally LoadInProgress.Leave; end; end; end; function CategoryLookup(Code: Cardinal; Cats: TCharacterCategories): Boolean; overload; // determines whether the Code is in the given category var First, Second, Third: Byte; begin Assert(Code < $1000000, LoadResString(@RsCategoryUnicodeChar)); // load property data if not already done if not CategoriesLoaded then LoadCharacterCategories; First := (Code shr 16) and $FF; Second := (Code shr 8) and $FF; Third := Code and $FF; if (Categories[First] <> nil) and (Categories[First, Second] <> nil) then Result := Categories[First, Second, Third] * Cats <> [] else Result := False; end; //----------------- support for case mapping ------------------------------------------------------- type TCase = array [TCaseType] of TUCS4Array; // mapping for case fold, lower, title and upper in this order TCaseArray = array of array of TCase; var // An array for all case mappings (including 1 to many casing if saved by the extraction program). // The organization is a sparse, two stage matrix. // SingletonMapping is to quickly return a single default mapping. CaseDataLoaded: Boolean; CaseMapping: array [Byte] of TCaseArray; procedure LoadCaseMappingData; var Stream: TJclEasyStream; I, J, Code, Size: Integer; First, Second, Third: Byte; begin if not CaseDataLoaded then begin // make sure no other code is currently modifying the global data area LoadInProgress.Enter; try CaseDataLoaded := True; Stream := OpenResourceStream('CASE'); try // the first entry in the stream is the number of entries in the case mapping table Size := Stream.ReadInteger; for I := 0 to Size - 1 do begin // a) read actual code point Code := Stream.ReadInteger; Assert(Code < $1000000, LoadResString(@RsCasedUnicodeChar)); // if there is no high byte entry in the first stage table then create one First := (Code shr 16) and $FF; Second := (Code shr 8) and $FF; Third := Code and $FF; if CaseMapping[First] = nil then SetLength(CaseMapping[First], 256); if CaseMapping[First, Second] = nil then SetLength(CaseMapping[First, Second], 256); // b) read fold case array Size := Stream.ReadInteger; if Size > 0 then begin SetLength(CaseMapping[First, Second, Third, ctFold], Size); for J := 0 to Size - 1 do CaseMapping[First, Second, Third, ctFold, J] := Stream.ReadInteger; end; // c) read lower case array Size := Stream.ReadInteger; if Size > 0 then begin SetLength(CaseMapping[First, Second, Third, ctLower], Size); for J := 0 to Size - 1 do CaseMapping[First, Second, Third, ctLower, J] := Stream.ReadInteger; end; // d) read title case array Size := Stream.ReadInteger; if Size > 0 then begin SetLength(CaseMapping[First, Second, Third, ctTitle], Size); for J := 0 to Size - 1 do CaseMapping[First, Second, Third, ctTitle, J] := Stream.ReadInteger; end; // e) read upper case array Size := Stream.ReadInteger; if Size > 0 then begin SetLength(CaseMapping[First, Second, Third, ctUpper], Size); for J := 0 to Size - 1 do CaseMapping[First, Second, Third, ctUpper, J] := Stream.ReadInteger; end; end; finally Stream.Free; end; finally LoadInProgress.Leave; end; end; end; function CaseLookup(Code: Cardinal; CaseType: TCaseType; var Mapping: TUCS4Array): Boolean; // Performs a lookup of the given code; returns True if Found, with Mapping referring to the mapping. // ctFold is handled specially: if no mapping is found then result of looking up ctLower // is returned var First, Second, Third: Byte; begin Assert(Code < $1000000, LoadResString(@RsCasedUnicodeChar)); // load case mapping data if not already done if not CaseDataLoaded then LoadCaseMappingData; First := (Code shr 16) and $FF; Second := (Code shr 8) and $FF; Third := Code and $FF; // Check first stage table whether there is a mapping for a particular block and // (if so) then whether there is a mapping or not. if (CaseMapping[First] <> nil) and (CaseMapping[First, Second] <> nil) and (CaseMapping[First, Second, Third, CaseType] <> nil) then Mapping := CaseMapping[First, Second, Third, CaseType] else Mapping := nil; Result := Assigned(Mapping); // defer to lower case if no fold case exists if not Result and (CaseType = ctFold) and (CaseMapping[First] <> nil) and (CaseMapping[First, Second] <> nil) and (CaseMapping[First, Second, Third, ctLower] <> nil) then begin Mapping := CaseMapping[First, Second, Third, ctLower]; Result := Assigned(Mapping); end; end; function UnicodeCaseFold(Code: UCS4): TUCS4Array; // This function returnes an array of special case fold mappings if there is one defined for the given // code, otherwise the lower case will be returned. This all applies only to cased code points. // Uncased code points are returned unchanged. begin SetLength(Result, 0); if not CaseLookup(Code, ctFold, Result) then begin SetLength(Result, 1); Result[0] := Code; end; end; function UnicodeToUpper(Code: UCS4): TUCS4Array; begin SetLength(Result, 0); if not CaseLookup(Code, ctUpper, Result) then begin SetLength(Result, 1); Result[0] := Code; end; end; function UnicodeToLower(Code: UCS4): TUCS4Array; begin SetLength(Result, 0); if not CaseLookup(Code, ctLower, Result) then begin SetLength(Result, 1); Result[0] := Code; end; end; function UnicodeToTitle(Code: UCS4): TUCS4Array; begin SetLength(Result, 0); if not CaseLookup(Code, ctTitle, Result) then begin SetLength(Result, 1); Result[0] := Code; end; end; //----------------- support for decomposition ------------------------------------------------------ const // constants for hangul composition and hangul-to-jamo decomposition SBase = $AC00; // hangul syllables start code point LBase = $1100; // leading syllable VBase = $1161; TBase = $11A7; // trailing syllable LCount = 19; VCount = 21; TCount = 28; NCount = VCount * TCount; // 588 SCount = LCount * NCount; // 11172 type TDecompositions = array of array of TUCS4Array; TDecompositionsArray = array [Byte] of TDecompositions; var // list of decompositions, organized (again) as three stage matrix // Note: there are two tables, one for canonical decompositions and the other one // for compatibility decompositions. DecompositionsLoaded: Boolean; CanonicalDecompositions, CompatibleDecompositions: TDecompositionsArray; procedure LoadDecompositionData; var Stream: TJclEasyStream; I, J, Code, Size: Integer; First, Second, Third: Byte; begin if not DecompositionsLoaded then begin // make sure no other code is currently modifying the global data area LoadInProgress.Enter; try DecompositionsLoaded := True; Stream := OpenResourceStream('DECOMPOSITION'); try // determine how many decomposition entries we have Size := Stream.ReadInteger; for I := 0 to Size - 1 do begin Code := Stream.ReadInteger; Assert((Code and not $40000000) < $1000000, LoadResString(@RsDecomposedUnicodeChar)); // if there is no high byte entry in the first stage table then create one First := (Code shr 16) and $FF; Second := (Code shr 8) and $FF; Third := Code and $FF; // insert into the correct table depending on bit 30 // (if set then it is a compatibility decomposition) if Code and $40000000 <> 0 then begin if CompatibleDecompositions[First] = nil then SetLength(CompatibleDecompositions[First], 256); if CompatibleDecompositions[First, Second] = nil then SetLength(CompatibleDecompositions[First, Second], 256); Size := Stream.ReadInteger; if Size > 0 then begin SetLength(CompatibleDecompositions[First, Second, Third], Size); for J := 0 to Size - 1 do CompatibleDecompositions[First, Second, Third, J] := Stream.ReadInteger; end; end else begin if CanonicalDecompositions[First] = nil then SetLength(CanonicalDecompositions[First], 256); if CanonicalDecompositions[First, Second] = nil then SetLength(CanonicalDecompositions[First, Second], 256); Size := Stream.ReadInteger; if Size > 0 then begin SetLength(CanonicalDecompositions[First, Second, Third], Size); for J := 0 to Size - 1 do CanonicalDecompositions[First, Second, Third, J] := Stream.ReadInteger; end; end; end; finally Stream.Free; end; finally LoadInProgress.Leave; end; end; end; function UnicodeDecomposeHangul(Code: UCS4): TUCS4Array; // algorithmically decomposes hangul character var Rest: Integer; begin Dec(Code, SBase); Rest := Code mod TCount; if Rest = 0 then SetLength(Result, 2) else SetLength(Result, 3); Result[0] := LBase + (Code div NCount); Result[1] := VBase + ((Code mod NCount) div TCount); if Rest <> 0 then Result[2] := TBase + Rest; end; function UnicodeDecompose(Code: UCS4; Compatible: Boolean): TUCS4Array; var First, Second, Third: Byte; begin Assert((Code and not $40000000) < $1000000, LoadResString(@RsDecomposedUnicodeChar)); // load decomposition data if not already done if not DecompositionsLoaded then LoadDecompositionData; Result := nil; // if the code is hangul then decomposition is algorithmically if UnicodeIsHangul(Code) then Result := UnicodeDecomposeHangul(Code) else begin First := (Code shr 16) and $FF; Second := (Code shr 8) and $FF; Third := Code and $FF; if Compatible then begin // Check first stage table whether there is a particular block and // (if so) then whether there is a decomposition or not. if (CompatibleDecompositions[First] = nil) or (CompatibleDecompositions[First, Second] = nil) or (CompatibleDecompositions[First, Second, Third] = nil) then begin // if there is no compatibility decompositions try canonical if (CanonicalDecompositions[First] = nil) or (CanonicalDecompositions[First, Second] = nil) or (CanonicalDecompositions[First, Second, Third] = nil) then Result := nil else Result := CanonicalDecompositions[First, Second, Third]; end else Result := CompatibleDecompositions[First, Second, Third]; end else begin if (CanonicalDecompositions[First] = nil) or (CanonicalDecompositions[First, Second] = nil) or (CanonicalDecompositions[First, Second, Third] = nil) then Result := nil else Result := CanonicalDecompositions[First, Second, Third]; end; end; end; //----------------- support for combining classes -------------------------------------------------- type TClassArray = array of array of Byte; var // canonical combining classes, again as two stage matrix CCCsLoaded: Boolean; CCCs: array [Byte] of TClassArray; procedure LoadCombiningClassData; var Stream: TJclEasyStream; I, J, K, Size: Integer; Buffer: TRangeArray; First, Second, Third: Byte; begin // make sure no other code is currently modifying the global data area LoadInProgress.Enter; try if not CCCsLoaded then begin CCCsLoaded := True; Stream := OpenResourceStream('COMBINING'); try while Stream.Position < Stream.Size do begin // a) determine which class is stored here I := Stream.ReadInteger; // b) determine how many ranges are assigned to this class Size := Stream.ReadInteger; // c) read start and stop code of each range if Size > 0 then begin SetLength(Buffer, Size); for J := 0 to Size - 1 do begin Buffer[J].Start := Stream.ReadInteger; Buffer[J].Stop := Stream.ReadInteger; end; // d) put this class in every of the code points just loaded for J := 0 to Size - 1 do for K := Buffer[J].Start to Buffer[J].Stop do begin // (outchy) TODO: handle in a cleaner way Assert(K < $1000000, LoadResString(@RsCombiningClassUnicodeChar)); First := (K shr 16) and $FF; Second := (K shr 8) and $FF; Third := K and $FF; // add second step array if not yet done if CCCs[First] = nil then SetLength(CCCs[First], 256); if CCCs[First, Second] = nil then SetLength(CCCs[First, Second], 256); CCCs[First, Second, Third] := I; end; end; end; finally Stream.Free; end; end; finally LoadInProgress.Leave; end; end; function CanonicalCombiningClass(Code: Cardinal): Cardinal; var First, Second, Third: Byte; begin Assert(Code < $1000000, LoadResString(@RsCombiningClassUnicodeChar)); // load combining class data if not already done if not CCCsLoaded then LoadCombiningClassData; First := (Code shr 16) and $FF; Second := (Code shr 8) and $FF; Third := Code and $FF; if (CCCs[First] <> nil) and (CCCs[First, Second] <> nil) then Result := CCCs[First, Second, Third] else Result := 0; end; //----------------- support for numeric values ----------------------------------------------------- type // structures for handling numbers TCodeIndex = record Code, Index: Cardinal; end; var // array to hold the number equivalents for specific codes NumberCodes: array of TCodeIndex; // array of numbers used in NumberCodes Numbers: array of TUcNumber; procedure LoadNumberData; var Stream: TJclEasyStream; Size, I: Integer; begin // make sure no other code is currently modifying the global data area LoadInProgress.Enter; try if NumberCodes = nil then begin Stream := OpenResourceStream('NUMBERS'); try // Numbers are special (compared to other Unicode data) as they utilize two // arrays, one containing all used numbers (in nominator-denominator format) and // another one which maps a code point to one of the numbers in the first array. // a) determine size of numbers array Size := Stream.ReadInteger; SetLength(Numbers, Size); // b) read numbers data for I := 0 to Size - 1 do begin Numbers[I].Numerator := Stream.ReadInteger; Numbers[I].Denominator := Stream.ReadInteger; end; // c) determine size of index array Size := Stream.ReadInteger; SetLength(NumberCodes, Size); // d) read index data for I := 0 to Size - 1 do begin NumberCodes[I].Code := Stream.ReadInteger; NumberCodes[I].Index := Stream.ReadInteger; end; finally Stream.Free; end; end; finally LoadInProgress.Leave; end; end; function UnicodeNumberLookup(Code: UCS4; var Number: TUcNumber): Boolean; // Searches for the given code and returns its number equivalent (if there is one). // Typical cases are: '1/6' (U+2159), '3/8' (U+215C), 'XII' (U+216B) etc. // Result is set to True if the code could be found. var L, R, M: Integer; begin // load number data if not already done if NumberCodes = nil then LoadNumberData; Result := False; L := 0; R := High(NumberCodes); while L <= R do begin M := (L + R) shr 1; if Code > NumberCodes[M].Code then L := M + 1 else begin if Code < NumberCodes[M].Code then R := M - 1 else begin Number := Numbers[NumberCodes[M].Index]; Result := True; Break; end; end; end; end; //----------------- support for composition -------------------------------------------------------- type // maps between a pair of code points to a composite code point // Note: the source pair is packed into one 4 byte value to speed up search. TComposition = record Code: Cardinal; First: Cardinal; Next: array of Cardinal; end; var // list of composition mappings Compositions: array of TComposition; MaxCompositionSize: Integer; procedure LoadCompositionData; var Stream: TJclEasyStream; I, J, Size: Integer; begin // make sure no other code is currently modifying the global data area LoadInProgress.Enter; try if Compositions = nil then begin Stream := OpenResourceStream('COMPOSITION'); try // a) determine size of compositions array Size := Stream.ReadInteger; SetLength(Compositions, Size); // b) read data for I := 0 to Size - 1 do begin Compositions[I].Code := Stream.ReadInteger; Size := Stream.ReadInteger; if Size > MaxCompositionSize then MaxCompositionSize := Size; SetLength(Compositions[I].Next, Size - 1); Compositions[I].First := Stream.ReadInteger; for J := 0 to Size - 2 do Compositions[I].Next[J] := Stream.ReadInteger; end; finally Stream.Free; end; end; finally LoadInProgress.Leave; end; end; function UnicodeCompose(const Codes: array of UCS4; out Composite: UCS4): Integer; // Maps the sequence of Codes (up to MaxCompositionSize codes) to a composite // Result is the number of Codes that were composed (at least 1 if Codes is not empty) var L, R, M, I, HighCodes, HighNext: Integer; begin if Compositions = nil then LoadCompositionData; Result := 0; HighCodes := High(Codes); if HighCodes = -1 then Exit; if HighCodes = 0 then begin Result := 1; Composite := Codes[0]; Exit; end; L := 0; R := High(Compositions); while L <= R do begin M := (L + R) shr 1; if Compositions[M].First > Codes[0] then R := M - 1 else if Compositions[M].First < Codes[0] then L := M + 1 else begin // back to the first element where Codes[0] = First while (M > 0) and (Compositions[M-1].First = Codes[0]) do Dec(M); while (M <= High(Compositions)) and (Compositions[M].First = Codes[0]) do begin HighNext := High(Compositions[M].Next); Result := 0; if HighNext < HighCodes then // enough characters in buffer to be tested begin for I := 0 to HighNext do if Compositions[M].Next[I] = Codes[I + 1] then Result := I + 2 { +1 for first, +1 because of 0-based array } else Break; if Result = HighNext + 2 then // all codes matched begin Composite := Compositions[M].Code; Exit; end; end; Inc(M); end; Break; end; end; Result := 1; Composite := Codes[0]; end; //=== { TSearchEngine } ====================================================== constructor TSearchEngine.Create(AOwner: TWideStrings); begin inherited Create; FOwner := AOwner; FResults := TList.Create; end; destructor TSearchEngine.Destroy; begin Clear; FResults.Free; inherited Destroy; end; procedure TSearchEngine.AddResult(Start, Stop: SizeInt); begin FResults.Add(Pointer(Start)); FResults.Add(Pointer(Stop)); end; procedure TSearchEngine.Clear; begin ClearResults; end; procedure TSearchEngine.ClearResults; begin FResults.Clear; end; procedure TSearchEngine.DeleteResult(Index: SizeInt); // explicitly deletes a search result begin with FResults do begin // start index Delete(2 * Index); // stop index Delete(2 * Index); end; end; function TSearchEngine.GetCount: SizeInt; // returns the number of matches found begin Result := FResults.Count div 2; end; procedure TSearchEngine.GetResult(Index: SizeInt; var Start, Stop: SizeInt); // returns the start position of a match (end position can be determined by // adding the length of the pattern to the start position) begin Start := SizeInt(FResults[2 * Index]); Stop := SizeInt(FResults[2 * Index + 1]); end; //----------------- TUTBSearch --------------------------------------------------------------------- procedure TUTBMSearch.ClearPattern; begin FreeMem(FPattern); FPattern := nil; FFlags := []; FPatternUsed := 0; FPatternSize := 0; FPatternLength := 0; FreeMem(FSkipValues); FSkipValues := nil; FSkipsUsed := 0; FMD4 := 0; end; function TUTBMSearch.GetSkipValue(TextStart, TextEnd: PUCS2): SizeInt; // looks up the SkipValues value for a character var I: SizeInt; C1, C2: UCS4; Sp: PUTBMSkip; begin Result := 0; if TJclAddr(TextStart) < TJclAddr(TextEnd) then begin C1 := UCS4(TextStart^); if (TextStart + 1) < TextEnd then C2 := UCS4((TextStart + 1)^) else C2 := $FFFFFFFF; if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and (SurrogateLowStart <= C2) and (C2 <= $DDDD) then C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Sp := FSkipValues; for I := 0 to FSkipsUsed - 1 do begin if not (Boolean(C1 xor Sp.BMChar.UpCase) and Boolean(C1 xor Sp.BMChar.LoCase) and Boolean(C1 xor Sp.BMChar.TitleCase)) then begin if (TextEnd - TextStart) < Sp.SkipValues then Result := TextEnd - TextStart else Result := Sp.SkipValues; Exit; end; Inc(Sp); end; Result := FPatternLength; end; end; function TUTBMSearch.Match(Text, Start, Stop: PUCS2; var MatchStart, MatchEnd: SizeInt): Boolean; // Checks once whether the text at position Start (which points to the end of the // current text part to be matched) matches. // Note: If whole words only are allowed then the left and right border tests are // done here too. The keypoint for the right border is that the next character // after the search string is either the text end or a space character. // For the left side this is similar, but there is nothing like a string // start marker (like the string end marker #0). // // It seems not obvious, but we still can use the passed Text pointer to do // the left check. Although this pointer might not point to the real string // start (e.g. in TUTBMSearch.FindAll Text is incremented as needed) it is // still a valid check mark. The reason is that Text either points to the // real string start or a previous match (happend already, keep in mind the // search options do not change in the FindAll loop) and the character just // before Text is a space character. // This fact implies, though, that strings passed to Find (or FindFirst, // FindAll in TUTBMSearch) always really start at the given address. Although // this might not be the case in some circumstances (e.g. if you pass only // the selection from an editor) it is still assumed that a pattern matching // from the first position on (from the search string start) also matches // when whole words only are allowed. var CheckSpace: Boolean; C1, C2: UCS4; Count: SizeInt; Cp: PUTBMChar; begin // be pessimistic Result := False; // set the potential match endpoint first MatchEnd := (Start - Text) + 1; C1 := UCS4(Start^); if (Start + 1) < Stop then C2 := UCS4((Start + 1)^) else C2 := $FFFFFFFF; if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); // Adjust the match end point to occur after the UTF-16 character. Inc(MatchEnd); end; // check special cases if FPatternUsed = 1 then begin MatchStart := Start - Text; Result := True; Exit; end; // Early out if entire words need to be matched and the next character // in the search string is neither the string end nor a space character. if (sfWholeWordOnly in FFlags) and not ((Start + 1)^ = WideNull) and not UnicodeIsWhiteSpace(UCS4((Start + 1)^)) then Exit; // compare backward Cp := FPattern; Inc(Cp, FPatternUsed - 1); Count := FPatternLength; while (Start >= Text) and (Count > 0) do begin // ignore non-spacing characters if indicated if sfIgnoreNonSpacing in FFlags then begin while (Start > Text) and UnicodeIsNonSpacing(C1) do begin Dec(Start); C2 := UCS4(Start^); if (Start - 1) > Text then C1 := UCS4((Start - 1)^) else C1 := $FFFFFFFF; if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Dec(Start); end else C1 := C2; end; end; // handle space compression if indicated if sfSpaceCompress in FFlags then begin CheckSpace := False; while (Start > Text) and (UnicodeIsWhiteSpace(C1) or UnicodeIsControl(C1)) do begin CheckSpace := UnicodeIsWhiteSpace(C1); Dec(Start); C2 := UCS4(Start^); if (Start - 1) > Text then C1 := UCS4((Start - 1)^) else C1 := $FFFFFFFF; if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Dec(Start); end else C1 := C2; end; // Handle things if space compression was indicated and one or // more member characters were found. if CheckSpace then begin if Cp.UpCase <> $20 then Exit; Dec(Cp); Dec(Count); // If Count is 0 at this place then the space character(s) was the first // in the pattern and we need to correct the start position. if Count = 0 then Inc(Start); end; end; // handle the normal comparison cases if (Count > 0) and (Boolean(C1 xor Cp.UpCase) and Boolean(C1 xor Cp.LoCase) and Boolean(C1 xor Cp.TitleCase)) then Exit; if C1 >= $10000 then Dec(Count, 2) else Dec(Count, 1); if Count > 0 then begin Dec(Cp); // get the next preceding character if Start > Text then begin Dec(Start); C2 := UCS4(Start^); if (Start - 1) > Text then C1 := UCS4((Start - 1)^) else C1 := $FFFFFFFF; if (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) and (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) then begin C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); Dec(Start); end else C1 := C2; end; end; end; // So far the string matched. Now check its left border for a space character // if whole word only are allowed. if not (sfWholeWordOnly in FFlags) or (Start <= Text) or UnicodeIsWhiteSpace(UCS4((Start - 1)^)) then begin // set the match start position MatchStart := Start - Text; Result := True; end; end; procedure TUTBMSearch.Compile(Pattern: PUCS2; PatternLength: SizeInt; Flags: TSearchFlags); var HaveSpace: Boolean; I, J, K, SLen: SizeInt; Cp: PUTBMChar; Sp: PUTBMSkip; C1, C2, Sentinel: UCS4; begin if (Pattern <> nil) and (Pattern^ <> #0) and (PatternLength > 0) then begin // do some initialization FFlags := Flags; // extra skip flag FMD4 := 1; Sentinel := 0; // allocate more storage if necessary FPattern := AllocMem(SizeOf(TUTBMChar) * PatternLength); FSkipValues := AllocMem(SizeOf(TUTBMSkip) * PatternLength); FPatternSize := PatternLength; // Preprocess the pattern to remove controls (if specified) and determine case. Cp := FPattern; I := 0; HaveSpace := False; while I < PatternLength do begin C1 := UCS4(Pattern[I]); if (I + 1) < PatternLength then C2 := UCS4(Pattern[I + 1]) else C2 := $FFFFFFFF; if (SurrogateHighStart <= C1) and (C1 <= SurrogateHighEnd) and (SurrogateLowStart <= C2) and (C2 <= SurrogateLowEnd) then C1 := $10000 + (((C1 and $03FF) shl 10) or (C2 and $03FF)); // Make sure the HaveSpace flag is turned off if the character is not an // appropriate one. if not UnicodeIsWhiteSpace(C1) then HaveSpace := False; // If non-spacing characters should be ignored, do it here. if (sfIgnoreNonSpacing in Flags) and UnicodeIsNonSpacing(C1) then begin Inc(I); Continue; end; // check if spaces and controls need to be compressed if sfSpaceCompress in Flags then begin if UnicodeIsWhiteSpace(C1) then begin if not HaveSpace then begin // Add a space and set the flag. Cp.UpCase := $20; Cp.LoCase := $20; Cp.TitleCase := $20; Inc(Cp); // increase the real pattern length Inc(FPatternLength); Sentinel := $20; HaveSpace := True; end; Inc(I); Continue; end; // ignore all control characters if UnicodeIsControl(C1) then begin Inc(I); Continue; end; end; // add the character if not (sfCaseSensitive in Flags) then begin { TODO : use the entire mapping, not only the first character } Cp.UpCase := UnicodeToUpper(C1)[0]; Cp.LoCase := UnicodeToLower(C1)[0]; Cp.TitleCase := UnicodeToTitle(C1)[0]; end else begin Cp.UpCase := C1; Cp.LoCase := C1; Cp.TitleCase := C1; end; Sentinel := Cp.UpCase; // move to the next character Inc(Cp); // increase the real pattern length appropriately if C1 >= $10000 then Inc(FPatternLength, 2) else Inc(FPatternLength); // increment the loop index for UTF-16 characters if C1 > $10000 then Inc(I, 2) else Inc(I); end; // set the number of characters actually used FPatternUsed := (TJclAddr(Cp) - TJclAddr(FPattern)) div SizeOf(TUTBMChar); // Go through and construct the skip array and determine the actual length // of the pattern in UCS2 terms. SLen := FPatternLength - 1; Cp := FPattern; K := 0; for I := 0 to FPatternUsed - 1 do begin // locate the character in the FSkipValues array Sp := FSkipValues; J := 0; while (J < FSkipsUsed) and (Sp.BMChar.UpCase <> Cp.UpCase) do begin Inc(J); Inc(Sp); end; // If the character is not found, set the new FSkipValues element and // increase the number of FSkipValues elements. if J = FSkipsUsed then begin Sp.BMChar := Cp; Inc(FSkipsUsed); end; // Set the updated FSkipValues value. If the character is UTF-16 and is // not the last one in the pattern, add one to its FSkipValues value. Sp.SkipValues := SLen - K; if (Cp.UpCase >= $10000) and ((K + 2) < SLen) then Inc(Sp.SkipValues); // set the new extra FSkipValues for the sentinel character if ((Cp.UpCase >= $10000) and ((K + 2) <= SLen) or ((K + 1) <= SLen) and (Cp.UpCase = Sentinel)) then FMD4 := SLen - K; // increase the actual index if Cp.UpCase >= $10000 then Inc(K, 2) else Inc(K); Inc(Cp); end; end; end; function TUTBMSearch.Find(Text: PUCS2; TextLen: SizeInt; var MatchStart, MatchEnd: SizeInt): Boolean; // this is the main matching routine using a tuned Boyer-Moore algorithm var K: SizeInt; Start, Stop: PUCS2; begin Result := False; if (FPattern <> nil) and (FPatternUsed > 0) and (Text <> nil) and (TextLen > 0) and (TextLen >= FPatternLength) then begin Start := Text + FPatternLength - 1; Stop := Text + TextLen; // adjust the start point if it points to a low surrogate if (SurrogateLowStart <= UCS4(Start^)) and (UCS4(Start^) <= SurrogateLowEnd) and (SurrogateHighStart <= UCS4((Start - 1)^)) and (UCS4((Start - 1)^) <= SurrogateHighEnd) then Dec(Start); while Start < Stop do begin repeat K := GetSkipValue(Start, Stop); if K = 0 then Break; Inc(Start, K); if (Start < Stop) and (SurrogateLowStart <= UCS4(Start^)) and (UCS4(Start^) <= SurrogateLowEnd) and (SurrogateHighStart <= UCS4((Start - 1)^)) and (UCS4((Start - 1)^) <= SurrogateHighEnd) then Dec(Start); until False; if (Start < Stop) and Match(Text, Start, Stop, MatchStart, MatchEnd) then begin Result := True; Break; end; Inc(Start, FMD4); if (Start < Stop) and (SurrogateLowStart <= UCS4(Start^)) and (UCS4(Start^) <= SurrogateLowEnd) and (SurrogateHighStart <= UCS4((Start - 1)^)) and (UCS4((Start - 1)^) <= SurrogateHighEnd) then Dec(Start); end; end; end; procedure TUTBMSearch.Clear; begin ClearPattern; inherited Clear; end; function TUTBMSearch.FindAll(const Text: WideString): Boolean; begin Result := FindAll(PWideChar(Text), Length(Text)); end; function TUTBMSearch.FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; // Looks for all occurences of the pattern passed to FindPrepare and creates an // internal list of their positions. var Start, Stop: SizeInt; Run: PWideChar; RunLen: SizeInt; begin ClearResults; Run := Text; RunLen := TextLen; Start := 0; Stop := 0; // repeat to find all occurences of the pattern while Find(Run, RunLen, Start, Stop) do begin // store this result (consider text pointer movement)... AddResult(Start + Run - Text, Stop + Run - Text); // ... and advance text position and length Inc(Run, Stop); Dec(RunLen, Stop); end; Result := Count > 0; end; function TUTBMSearch.FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; // Looks for the first occurence of the pattern passed to FindPrepare in Text and // returns True if one could be found (in which case Start and Stop are set to // the according indices) otherwise False. This function is in particular of // interest if only one occurence needs to be found. begin ClearResults; Result := Find(PWideChar(Text), Length(Text), Start, Stop); if Result then AddResult(Start, Stop); end; function TUTBMSearch.FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; // Same as the WideString version of this method. begin ClearResults; Result := Find(Text, TextLen, Start, Stop); if Result then AddResult(Start, Stop); end; procedure TUTBMSearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags); begin FindPrepare(PWideChar(Pattern), Length(Pattern), Options); end; procedure TUTBMSearch.FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); // prepares following search by compiling the given pattern into an internal structure begin Compile(Pattern, PatternLength, Options); end; //----------------- Unicode RE search core --------------------------------------------------------- const // error codes _URE_OK = 0; _URE_UNEXPECTED_EOS = -1; _URE_CCLASS_OPEN = -2; _URE_UNBALANCED_GROUP = -3; _URE_INVALID_PROPERTY = -4; _URE_INVALID_RANGE = -5; _URE_RANGE_OPEN = -6; // options that can be combined for searching URE_IGNORE_NONSPACING = $01; URE_DONT_MATCHES_SEPARATORS = $02; const // Flags used internally in the DFA _URE_DFA_CASEFOLD = $01; _URE_DFA_BLANKLINE = $02; // symbol types for the DFA _URE_ANY_CHAR = 1; _URE_CHAR = 2; _URE_CCLASS = 3; _URE_NCCLASS = 4; _URE_BOL_ANCHOR = 5; _URE_EOL_ANCHOR = 6; // op codes for converting the NFA to a DFA _URE_SYMBOL = 10; _URE_PAREN = 11; _URE_QUEST = 12; _URE_STAR = 13; _URE_PLUS = 14; _URE_ONE = 15; _URE_AND = 16; _URE_OR = 17; _URE_NOOP = $FFFF; //----------------- TURESearch --------------------------------------------------------------------- procedure TURESearch.Clear; begin inherited Clear; ClearUREBuffer; ClearDFA; end; procedure TURESearch.Push(V: SizeInt); begin with FUREBuffer do begin // If the 'Reducing' parameter is True, check to see if the value passed is // already on the stack. if Reducing and ExpressionList.Expressions[Word(V)].OnStack then Exit; if Stack.ListUsed = Length(Stack.List) then SetLength(Stack.List, Length(Stack.List) + 8); Stack.List[Stack.ListUsed] := V; Inc(Stack.ListUsed); // If the 'reducing' parameter is True, flag the element as being on the Stack. if Reducing then ExpressionList.Expressions[Word(V)].OnStack := True; end; end; function TURESearch.Peek: SizeInt; begin if FUREBuffer.Stack.ListUsed = 0 then Result := _URE_NOOP else Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed - 1]; end; function TURESearch.Pop: SizeInt; begin if FUREBuffer.Stack.ListUsed = 0 then Result := _URE_NOOP else begin Dec(FUREBuffer.Stack.ListUsed); Result := FUREBuffer.Stack.List[FUREBuffer.Stack.ListUsed]; if FUREBuffer.Reducing then FUREBuffer.ExpressionList.Expressions[Word(Result)].OnStack := False; end; end; function TURESearch.ParsePropertyList(Properties: PUCS2; Limit: SizeInt; var Categories: TCharacterCategories): SizeInt; // Parse a comma-separated list of integers that represent character properties. // Combine them into a set of categories and return the number of characters consumed. var N: SizeInt; Run, ListEnd: PUCS2; begin Run := Properties; ListEnd := Run + Limit; N := 0; Categories := []; while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do begin if Run^ = ',' then begin // Encountered a comma, so take the number parsed so far as category and // reset the number. Include(Categories, TCharacterCategory(N)); N := 0; end else begin if (Run^ >= '0') and (Run^ <= '9') then begin // Encountered a digit, so start or continue building the SizeInt that // represents the character category. N := (N * 10) + SizeInt(Word(Run^) - Ord('0')); end else begin // Encountered something that is not part of the property list. // Indicate that we are done. Break; end; end; // If the number is to large then there is a problem. // Most likely a missing comma separator. if SizeInt(N) > Ord(High(TCharacterCategory)) then FUREBuffer.Error := _URE_INVALID_PROPERTY; Inc(Run); end; // Return the number of characters consumed. Result := Run - Properties; end; function TURESearch.MakeHexNumber(NP: PUCS2; Limit: SizeInt; var Number: UCS4): SizeInt; // Collect a hex number with 1 to 4 digits and return the number of characters used. var I: SizeInt; Run, ListEnd: PUCS2; begin Run := np; ListEnd := Run + Limit; Number := 0; I := 0; while (I < 4) and (Run < ListEnd) do begin if (Run^ >= '0') and (Run^ <= '9') then Number := (Number shl 4) or (Cardinal(Word(Run^) - Ord('0'))) else begin if (Run^ >= 'A') and (Run^ <= 'F') then Number := (Number shl 4) or (Cardinal(Word(Run^) - Ord('A')) + 10) else begin if (Run^ >= 'a') and (Run^ <= 'f') then Number := (Number shl 4) or (Cardinal(Word(Run^) - Ord('a')) + 10) else Break; end; end; Inc(I); Inc(Run); end; Result := Run - NP; end; procedure TURESearch.AddRange(var CCL: TUcCClass; Range: TUcRange); // Insert a Range into a character class, removing duplicates and ordering them // in increasing Range-start order. var I: SizeInt; Temp: UCS4; begin // If the `Casefold' flag is set, then make sure both endpoints of the Range // are converted to lower. if (FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0 then begin { TODO : use the entire mapping, not only the first character } Range.MinCode := UnicodeToLower(Range.MinCode)[0]; Range.MaxCode := UnicodeToLower(Range.MaxCode)[0]; end; // Swap the Range endpoints if they are not in increasing order. if Range.MinCode > Range.MaxCode then begin Temp := Range.MinCode; Range.MinCode := Range.MaxCode; Range.MaxCode := Temp; end; I := 0; while (I < CCL.RangesUsed) and (Range.MinCode < CCL.Ranges[I].MinCode) do Inc(I); // check for a duplicate if (I < CCL.RangesUsed) and (Range.MinCode = CCL.Ranges[I].MinCode) and (Range.MaxCode = CCL.Ranges[I].MaxCode) then Exit; if CCL.RangesUsed = Length(CCL.Ranges) then SetLength(CCL.Ranges, Length(CCL.Ranges) + 8); if I < CCL.RangesUsed then Move(CCL.Ranges[I], CCL.Ranges[I + 1], SizeOf(TUcRange) * (CCL.RangesUsed - I)); CCL.Ranges[I].MinCode := Range.MinCode; CCL.Ranges[I].MaxCode := Range.MaxCode; Inc(CCL.RangesUsed); end; type PTrie = ^TTrie; TTrie = record Key: UCS2; Len, Next: SizeInt; Setup: SizeInt; Categories: TCharacterCategories; end; procedure TURESearch.SpaceSetup(Symbol: PUcSymbolTableEntry; Categories: TCharacterCategories); var Range: TUcRange; begin Symbol.Categories := Symbol.Categories + Categories; Range.MinCode := UCS4(WideTabulator); Range.MaxCode := UCS4(WideTabulator); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := UCS4(WideCarriageReturn); Range.MaxCode := UCS4(WideCarriageReturn); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := UCS4(WideLineFeed); Range.MaxCode := UCS4(WideLineFeed); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := UCS4(WideFormFeed); Range.MaxCode := UCS4(WideFormFeed); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := $FEFF; Range.MaxCode := $FEFF; AddRange(Symbol.Symbol.CCL, Range); end; procedure TURESearch.HexDigitSetup(Symbol: PUcSymbolTableEntry); var Range: TUcRange; begin Range.MinCode := UCS4('0'); Range.MaxCode := UCS4('9'); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := UCS4('A'); Range.MaxCode := UCS4('F'); AddRange(Symbol.Symbol.CCL, Range); Range.MinCode := UCS4('a'); Range.MaxCode := UCS4('f'); AddRange(Symbol.Symbol.CCL, Range); end; const CClassTrie: array [0..64] of TTrie = ( (Key: #$003A; Len: 1; Next: 1; Setup: 0; Categories: []), (Key: #$0061; Len: 9; Next: 10; Setup: 0; Categories: []), (Key: #$0063; Len: 8; Next: 19; Setup: 0; Categories: []), (Key: #$0064; Len: 7; Next: 24; Setup: 0; Categories: []), (Key: #$0067; Len: 6; Next: 29; Setup: 0; Categories: []), (Key: #$006C; Len: 5; Next: 34; Setup: 0; Categories: []), (Key: #$0070; Len: 4; Next: 39; Setup: 0; Categories: []), (Key: #$0073; Len: 3; Next: 49; Setup: 0; Categories: []), (Key: #$0075; Len: 2; Next: 54; Setup: 0; Categories: []), (Key: #$0078; Len: 1; Next: 59; Setup: 0; Categories: []), (Key: #$006C; Len: 1; Next: 11; Setup: 0; Categories: []), (Key: #$006E; Len: 2; Next: 13; Setup: 0; Categories: []), (Key: #$0070; Len: 1; Next: 16; Setup: 0; Categories: []), (Key: #$0075; Len: 1; Next: 14; Setup: 0; Categories: []), (Key: #$006D; Len: 1; Next: 15; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 16; Setup: 1; Categories: ClassLetter + ClassNumber), (Key: #$0068; Len: 1; Next: 17; Setup: 0; Categories: []), (Key: #$0061; Len: 1; Next: 18; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 19; Setup: 1; Categories: ClassLetter), (Key: #$006E; Len: 1; Next: 20; Setup: 0; Categories: []), (Key: #$0074; Len: 1; Next: 21; Setup: 0; Categories: []), (Key: #$0072; Len: 1; Next: 22; Setup: 0; Categories: []), (Key: #$006C; Len: 1; Next: 23; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 24; Setup: 1; Categories: [ccOtherControl, ccOtherFormat]), (Key: #$0069; Len: 1; Next: 25; Setup: 0; Categories: []), (Key: #$0067; Len: 1; Next: 26; Setup: 0; Categories: []), (Key: #$0069; Len: 1; Next: 27; Setup: 0; Categories: []), (Key: #$0074; Len: 1; Next: 28; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 29; Setup: 1; Categories: ClassNumber), (Key: #$0072; Len: 1; Next: 30; Setup: 0; Categories: []), (Key: #$0061; Len: 1; Next: 31; Setup: 0; Categories: []), (Key: #$0070; Len: 1; Next: 32; Setup: 0; Categories: []), (Key: #$0068; Len: 1; Next: 33; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 34; Setup: 1; Categories: ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol), (Key: #$006F; Len: 1; Next: 35; Setup: 0; Categories: []), (Key: #$0077; Len: 1; Next: 36; Setup: 0; Categories: []), (Key: #$0065; Len: 1; Next: 37; Setup: 0; Categories: []), (Key: #$0072; Len: 1; Next: 38; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 39; Setup: 1; Categories: [ccLetterLowercase]), (Key: #$0072; Len: 2; Next: 41; Setup: 0; Categories: []), (Key: #$0075; Len: 1; Next: 45; Setup: 0; Categories: []), (Key: #$0069; Len: 1; Next: 42; Setup: 0; Categories: []), (Key: #$006E; Len: 1; Next: 43; Setup: 0; Categories: []), (Key: #$0074; Len: 1; Next: 44; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 45; Setup: 1; Categories: ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol + [ccSeparatorSpace]), (Key: #$006E; Len: 1; Next: 46; Setup: 0; Categories: []), (Key: #$0063; Len: 1; Next: 47; Setup: 0; Categories: []), (Key: #$0074; Len: 1; Next: 48; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 49; Setup: 1; Categories: ClassPunctuation), (Key: #$0070; Len: 1; Next: 50; Setup: 0; Categories: []), (Key: #$0061; Len: 1; Next: 51; Setup: 0; Categories: []), (Key: #$0063; Len: 1; Next: 52; Setup: 0; Categories: []), (Key: #$0065; Len: 1; Next: 53; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 54; Setup: 2; Categories: ClassSpace), (Key: #$0070; Len: 1; Next: 55; Setup: 0; Categories: []), (Key: #$0070; Len: 1; Next: 56; Setup: 0; Categories: []), (Key: #$0065; Len: 1; Next: 57; Setup: 0; Categories: []), (Key: #$0072; Len: 1; Next: 58; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 59; Setup: 1; Categories: [ccLetterUppercase]), (Key: #$0064; Len: 1; Next: 60; Setup: 0; Categories: []), (Key: #$0069; Len: 1; Next: 61; Setup: 0; Categories: []), (Key: #$0067; Len: 1; Next: 62; Setup: 0; Categories: []), (Key: #$0069; Len: 1; Next: 63; Setup: 0; Categories: []), (Key: #$0074; Len: 1; Next: 64; Setup: 0; Categories: []), (Key: #$003A; Len: 1; Next: 65; Setup: 3; Categories: []) ); function TURESearch.PosixCCL(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt; // Probe for one of the POSIX colon delimited character classes in the static trie. var I: SizeInt; N: SizeInt; TP: PTrie; Run, ListEnd: PUCS2; begin Result := 0; // If the number of characters left is less than 7, // then this cannot be interpreted as one of the colon delimited classes. if Limit >= 7 then begin Run := cp; ListEnd := Run + Limit; TP := @CClassTrie[0]; I := 0; while (Run < ListEnd) and (I < 8) do begin N := TP.Len; while (N > 0) and (TP.Key <> Run^) do begin Inc(TP); Dec(N); end; if N = 0 then begin Result := 0; Exit; end; if (Run^ = ':') and ((I = 6) or (I = 7)) then begin Inc(Run); Break; end; if (Run + 1) < ListEnd then TP := @CClassTrie[TP.Next]; Inc(I); Inc(Run); end; Result := Run - CP; case TP.Setup of 1: Symbol.Categories := Symbol.Categories + TP.Categories; 2: SpaceSetup(Symbol, TP.Categories); 3: HexDigitSetup(Symbol); else Result := 0; end; end; end; function TURESearch.BuildCharacterClass(CP: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt; // Construct a list of ranges and return the number of characters consumed. var RangeEnd: SizeInt; N: SizeInt; Run, ListEnd: PUCS2; C, Last: UCS4; Range: TUcRange; begin Run := cp; ListEnd := Run + Limit; if Run^ = '^' then begin Symbol.AType := _URE_NCCLASS; Inc(Run); end else Symbol.AType := _URE_CCLASS; Last := 0; RangeEnd := 0; while (FUREBuffer.Error = _URE_OK) and (Run < ListEnd) do begin // Allow for the special case []abc], where the first closing bracket would end an empty // character class, which makes no sense. Hence this bracket is treaded literally. if (Run^ = ']') and (Symbol.Symbol.CCL.RangesUsed > 0) then Break; C := UCS4(Run^); Inc(Run); // escape character if C = Ord('\') then begin if Run = ListEnd then begin // The EOS was encountered when expecting the reverse solidus to be followed by the character it is escaping. // Set an Error code and return the number of characters consumed up to this point. FUREBuffer.Error := _URE_UNEXPECTED_EOS; Result := Run - CP; Exit; end; C := UCS4(Run^); Inc(Run); case UCS2(C) of 'a': C := $07; 'b': C := $08; 'f': C := $0C; 'n': C := $0A; 'R': C := $0D; 't': C := $09; 'v': C := $0B; 'p', 'P': begin Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Categories)); // Invert the bit mask of the properties if this is a negated character class or if 'P' is used to specify // a list of character properties that should *not* match in a character class. if C = Ord('P') then Symbol.Categories := ClassAll - Symbol.Categories; Continue; end; 'x', 'X', 'u', 'U': begin if (Run < ListEnd) and ((Run^ >= '0') and (Run^ <= '9') or (Run^ >= 'A') and (Run^ <= 'F') or (Run^ >= 'a') and (Run^ <= 'f')) then Inc(Run, MakeHexNumber(Run, ListEnd - Run, C)); end; end; end else begin if C = Ord(':') then begin // Probe for a POSIX colon delimited character class. Dec(Run); N := PosixCCL(Run, ListEnd - Run, Symbol); if N = 0 then Inc(Run) else begin Inc(Run, N); Continue; end; end; end; // Check to see if the current character is a low surrogate that needs // to be combined with a preceding high surrogate. if Last <> 0 then begin if (C >= SurrogateLowStart) and (C <= SurrogateLowEnd) then begin // Construct the UTF16 character code. C := $10000 + (((Last and $03FF) shl 10) or (C and $03FF)) end else begin // Add the isolated high surrogate to the range. if RangeEnd = 1 then Range.MaxCode := Last and $FFFF else begin Range.MinCode := Last and $FFFF; Range.MaxCode := Last and $FFFF; end; AddRange(Symbol.Symbol.CCL, Range); RangeEnd := 0; end; end; // Clear the Last character code. Last := 0; // This slightly awkward code handles the different cases needed to construct a range. if (C >= SurrogateHighStart) and (C <= SurrogateHighEnd) then begin // If the high surrogate is followed by a Range indicator, simply add it as the Range start. Otherwise, // save it in the next character is a low surrogate. if Run^ = '-' then begin Inc(Run); Range.MinCode := C; RangeEnd := 1; end else Last := C; end else begin if RangeEnd = 1 then begin Range.MaxCode := C; AddRange(Symbol.Symbol.CCL, Range); RangeEnd := 0; end else begin Range.MinCode := C; Range.MaxCode := C; if Run^ = '-' then begin Inc(Run); RangeEnd := 1; end else AddRange(Symbol.Symbol.CCL, Range); end; end; end; if (Run < ListEnd) and (Run^ = ']') then Inc(Run) else begin // The parse was not terminated by the character class close symbol (']'), so set an error code. FUREBuffer.Error := _URE_CCLASS_OPEN; end; Result := Run - CP; end; function TURESearch.ProbeLowSurrogate(LeftState: PUCS2; Limit: SizeInt; var Code: UCS4): SizeInt; // probes for a low surrogate hex code var I: SizeInt; Run, ListEnd: PUCS2; begin I := 0; Code := 0; Run := LeftState; ListEnd := Run + Limit; while (I < 4) and (Run < ListEnd) do begin if (Run^ >= '0') and (Run^ <= '9') then Code := (Code shl 4) or (Cardinal(Word(Run^) - Ord('0'))) else begin if (Run^ >= 'A') and (Run^ <= 'F') then Code := (Code shl 4) or (Cardinal(Word(Run^) - Ord('A')) + 10) else begin if (Run^ >= 'a') and (Run^ <= 'f') then Code := (Code shl 4) or (Cardinal(Word(Run^) - Ord('a')) + 10) else Break; end; end; Inc(Run); end; if (SurrogateLowStart <= Code) and (Code <= SurrogateLowEnd) then Result := Run - LeftState else Result := 0; end; function TURESearch.CompileSymbol(S: PUCS2; Limit: SizeInt; Symbol: PUcSymbolTableEntry): SizeInt; var C: UCS4; Run, ListEnd: PUCS2; begin Run := S; ListEnd := S + Limit; C := UCS4(Run^); Inc(Run); if C = Ord('\') then begin if Run = ListEnd then begin // The EOS was encountered when expecting the reverse solidus to be followed // by the character it is escaping. Set an Error code and return the number // of characters consumed up to this point. FUREBuffer.Error := _URE_UNEXPECTED_EOS; Result := Run - S; Exit; end; C := UCS4(Run^); Inc(Run); case UCS2(C) of 'p', 'P': begin if UCS2(C) = 'p' then Symbol.AType :=_URE_CCLASS else Symbol.AType :=_URE_NCCLASS; Inc(Run, ParsePropertyList(Run, ListEnd - Run, Symbol.Categories)); end; 'a': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $07; end; 'b': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $08; end; 'f': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0C; end; 'n': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0A; end; 'r': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0D; end; 't': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $09; end; 'v': begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := $0B; end; else case UCS2(C) of 'x', 'X', 'u', 'U': begin // Collect between 1 and 4 digits representing an UCS2 code. if (Run < ListEnd) and ((Run^ >= '0') and (Run^ <= '9') or (Run^ >= 'A') and (Run^ <= 'F') or (Run^ >= 'a') and (Run^ <= 'f')) then Inc(Run, MakeHexNumber(Run, ListEnd - Run, C)); end; end; // Simply add an escaped character here. Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := C; end; end else begin if (UCS2(C) = '^') or (UCS2(C) = '$') then begin // Handle the BOL and EOL anchors. This actually consists simply of setting // a flag that indicates that the user supplied anchor match function should // be called. This needs to be done instead of simply matching line/paragraph // separators because beginning-of-text and end-of-text tests are needed as well. if UCS2(C) = '^' then Symbol.AType := _URE_BOL_ANCHOR else Symbol.AType := _URE_EOL_ANCHOR; end else begin if UCS2(C) = '[' then begin // construct a character class Inc(Run, BuildCharacterClass(Run, ListEnd - Run, Symbol)); end else begin if UCS2(C) = '.' then Symbol.AType := _URE_ANY_CHAR else begin Symbol.AType := _URE_CHAR; Symbol.Symbol.Chr := C; end; end; end; end; // If the symbol type happens to be a character and is a high surrogate, then // probe forward to see if it is followed by a low surrogate that needs to be added. if (Run < ListEnd) and (Symbol.AType = _URE_CHAR) and (SurrogateHighStart <= Symbol.Symbol.Chr) and (Symbol.Symbol.Chr <= SurrogateHighEnd) then begin if (SurrogateLowStart <= UCS4(Run^)) and (UCS4(Run^) <= SurrogateLowEnd) then begin Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (UCS4(Run^) and $03FF)); Inc(Run); end else begin if (Run^ = '\') and (((Run + 1)^ = 'x') or ((Run + 1)^ = 'X') or ((Run + 1)^ = 'u') or ((Run + 1)^ = 'U')) then begin Inc(Run, ProbeLowSurrogate(Run + 2, ListEnd - (Run + 2), C)); if (SurrogateLowStart <= C) and (C <= SurrogateLowEnd) then begin // Take into account the \[xu] in front of the hex code. Inc(Run, 2); Symbol.Symbol.Chr := $10000 + (((Symbol.Symbol.Chr and $03FF) shl 10) or (C and $03FF)); end; end; end; end; // Last, make sure any _URE_CHAR type symbols are changed to lower if the // 'Casefold' flag is set. { TODO : use the entire mapping, not only the first character and use the case fold abilities of the unit. } if ((FUREBuffer.Flags and _URE_DFA_CASEFOLD) <> 0) and (Symbol.AType = _URE_CHAR) then Symbol.Symbol.Chr := UnicodeToLower(Symbol.Symbol.Chr)[0]; // If the symbol constructed is anything other than one of the anchors, // make sure the _URE_DFA_BLANKLINE flag is removed. if (Symbol.AType <> _URE_BOL_ANCHOR) and (Symbol.AType <> _URE_EOL_ANCHOR) then FUREBuffer.Flags := FUREBuffer.Flags and not _URE_DFA_BLANKLINE; // Return the number of characters consumed. Result := Run - S; end; function TURESearch.SymbolsAreDifferent(A, B: PUcSymbolTableEntry): Boolean; begin Result := False; if (A.AType <> B.AType) or (A.Mods <> B.Mods) or (A.Categories <> B.Categories) then Result := True else begin if (A.AType = _URE_CCLASS) or (A.AType = _URE_NCCLASS) then begin if A.Symbol.CCL.RangesUsed <> B.Symbol.CCL.RangesUsed then Result := True else begin if (A.Symbol.CCL.RangesUsed > 0) and not CompareMem(@A.Symbol.CCL.Ranges[0], @B.Symbol.CCL.Ranges[0], SizeOf(TUcRange) * A.Symbol.CCL.RangesUsed) then Result := True;; end; end else begin if (A.AType = _URE_CHAR) and (A.Symbol.Chr <> B.Symbol.Chr) then Result := True; end; end; end; function TURESearch.MakeSymbol(S: PUCS2; Limit: SizeInt; out Consumed: SizeInt): SizeInt; // constructs a symbol, but only keep unique symbols var I: SizeInt; Start: PUcSymbolTableEntry; Symbol: TUcSymbolTableEntry; begin // Build the next symbol so we can test to see if it is already in the symbol table. ResetMemory(Symbol, SizeOf(TUcSymbolTableEntry)); Consumed := CompileSymbol(S, Limit, @Symbol); // Check to see if the symbol exists. I := 0; Start := @FUREBuffer.SymbolTable.Symbols[0]; while (I < FUREBuffer.SymbolTable.SymbolsUsed) and SymbolsAreDifferent(@Symbol, Start) do begin Inc(I); Inc(Start); end; if I < FUREBuffer.SymbolTable.SymbolsUsed then begin // Free up any ranges used for the symbol. if (Symbol.AType = _URE_CCLASS) or (Symbol.AType = _URE_NCCLASS) then Symbol.Symbol.CCL.Ranges := nil; Result := FUREBuffer.SymbolTable.Symbols[I].ID; Exit; end; // Need to add the new symbol. if FUREBuffer.SymbolTable.SymbolsUsed = Length(FUREBuffer.SymbolTable.Symbols) then begin SetLength(FUREBuffer.SymbolTable.Symbols, Length(FUREBuffer.SymbolTable.Symbols) + 8); end; Symbol.ID := FUREBuffer.SymbolTable.SymbolsUsed; Inc(FUREBuffer.SymbolTable.SymbolsUsed); FUREBuffer.SymbolTable.Symbols[Symbol.ID] := Symbol; Result := Symbol.ID; end; function TURESearch.MakeExpression(AType, LHS, RHS: SizeInt): SizeInt; var I: SizeInt; begin // Determine if the expression already exists or not. with FUREBuffer.ExpressionList do begin for I := 0 to ExpressionsUsed - 1 do begin if (Expressions[I].AType = AType) and (Expressions[I].LHS = LHS) and (Expressions[I].RHS = RHS) then begin Result := I; Exit; end; end; // Need to add a new expression. if ExpressionsUsed = Length(Expressions) then SetLength(Expressions, Length(Expressions) + 8); Expressions[ExpressionsUsed].OnStack := False; Expressions[ExpressionsUsed].AType := AType; Expressions[ExpressionsUsed].LHS := LHS; Expressions[ExpressionsUsed].RHS := RHS; Result := ExpressionsUsed; Inc(ExpressionsUsed); end; end; function IsSpecial(C: Word): Boolean; begin case C of Word('+'), Word('*'), Word('?'), Word('{'), Word('|'), Word(')'): Result := True; else Result := False; end; end; procedure TURESearch.CollectPendingOperations(var State: SizeInt); // collects all pending AND and OR operations and make corresponding expressions var Operation: SizeInt; begin repeat Operation := Peek; if (Operation <> _URE_AND) and (Operation <> _URE_OR) then Break; // make an expression with the AND or OR operator and its right hand side Operation := Pop; State := MakeExpression(Operation, Pop, State); until False; end; function TURESearch.ConvertRegExpToNFA(RE: PWideChar; RELength: SizeInt): SizeInt; // Converts the regular expression into an NFA in a form that will be easy to // reduce to a DFA. The starting state for the reduction will be returned. var C: UCS2; Head, Tail: PUCS2; S: WideString; Symbol, State, LastState, Used, M, N: SizeInt; I: SizeInt; begin State := _URE_NOOP; Head := RE; Tail := Head + RELength; while (FUREBuffer.Error = _URE_OK) and (Head < Tail) do begin C := Head^; Inc(Head); case C of '(': Push(_URE_PAREN); ')': // check for the case of too many close parentheses begin if Peek = _URE_NOOP then begin FUREBuffer.Error := _URE_UNBALANCED_GROUP; Break; end; CollectPendingOperations(State); // remove the _URE_PAREN off the stack Pop; end; '*': State := MakeExpression(_URE_STAR, State, _URE_NOOP); '+': State := MakeExpression(_URE_PLUS, State, _URE_NOOP); '?': State := MakeExpression(_URE_QUEST, State, _URE_NOOP); '|': begin CollectPendingOperations(State); Push(State); Push(_URE_OR); end; '{': // expressions of the form {m, n} begin C := #0; M := 0; N := 0; // get first number while UnicodeIsWhiteSpace(UCS4(Head^)) do Inc(Head); // very slow implementation S := ''; while (Head^ >= WideChar('0')) and (Head^ <= WideChar('9')) do begin S := S + Head^; Inc(Head); end; if S <> '' then M := StrToInt(S); while UnicodeIsWhiteSpace(UCS4(Head^)) do Inc(Head); if (Head^ <> ',') and (Head^ <> '}') then begin FUREBuffer.Error := _URE_INVALID_RANGE; Break; end; // check for an upper limit if Head^ <> '}' then begin Inc(Head); // get second number while UnicodeIsWhiteSpace(UCS4(Head^)) do Inc(Head); // very slow implementation S := ''; while (Head^ >= WideChar('0')) and (Head^ <= WideChar('9')) do begin S := S + Head^; Inc(Head); end; if S <> '' then N := StrToInt(S); end else N := M; if Head^ <> '}' then begin FUREBuffer.Error := _URE_RANGE_OPEN; Break; end else Inc(Head); // N = 0 means unlimited number of occurences if N = 0 then begin case M of 0: // {,} {0,} {0, 0} mean the same as the star operator State := MakeExpression(_URE_STAR, State, _URE_NOOP); 1: // {1,} {1, 0} mean the same as the plus operator State := MakeExpression(_URE_PLUS, State, _URE_NOOP); else begin // encapsulate the expanded branches as would they be in parenthesis // in order to avoid unwanted concatenation with pending operations/symbols Push(_URE_PAREN); // {m,} {m, 0} mean M fixed occurences plus star operator // make E^m... for I := 1 to M - 1 do begin Push(State); Push(_URE_AND); end; // ...and repeat the last symbol one or more times State := MakeExpression(_URE_PLUS, State, _URE_NOOP); CollectPendingOperations(State); Pop; end; end; end else begin // check proper range limits if M > N then begin FUREBuffer.Error := _URE_INVALID_RANGE; Break; end; // check special case {0, 1} (which corresponds to the ? operator) if (M = 0) and (N = 1) then State := MakeExpression(_URE_QUEST, State, _URE_NOOP) else begin // handle the general case by expanding {m, n} into the equivalent // expression E^m | E^(m + 1) | ... | E^n // encapsulate the expanded branches as would they be in parenthesis // in order to avoid unwanted concatenation with pending operations/symbols Push(_URE_PAREN); // keep initial state as this is the one all alternatives start from LastState := State; // Consider the special case M = 0 first. Because there's no construct // to enter a pure epsilon-transition into the expression array I // work around with the question mark operator to describe the first // and second branch alternative. if M = 0 then begin State := MakeExpression(_URE_QUEST, State, _URE_NOOP); Inc(M, 2); // Mark the pending OR operation (there must always follow at // least on more alternative because the special case {0, 1} has // already been handled). Push(State); Push(_URE_OR); end; while M <= N do begin State := LastState; // create E^M for I := 1 to SizeInt(M) - 1 do begin Push(State); Push(_URE_AND); end; // finish the branch and mark it as pending OR operation if it // isn't the last one CollectPendingOperations(State); if M < N then begin Push(State); Push(_URE_OR); end; Inc(M); end; // remove the _URE_PAREN off the stack Pop; end; end; end; else Dec(Head); Symbol := MakeSymbol(Head, Tail - Head, Used); Inc(Head, Used); State := MakeExpression(_URE_SYMBOL, Symbol, _URE_NOOP); end; if (C <> '(') and (C <> '|') and (C <> '{') and (Head < Tail) and (not IsSpecial(Word(Head^)) or (Head^ = '(')) then begin Push(State); Push(_URE_AND); end; end; CollectPendingOperations(State); if FUREBuffer.Stack.ListUsed > 0 then FUREBuffer.Error := _URE_UNBALANCED_GROUP; if FUREBuffer.Error = _URE_OK then Result := State else Result := _URE_NOOP; end; procedure TURESearch.AddSymbolState(Symbol, State: SizeInt); var I, J: SizeInt; Found: Boolean; begin // Locate the symbol in the symbol table so the state can be added. // If the symbol doesn't exist, then we are in serious trouble. with FUREBuffer.SymbolTable do begin I := 0; while (I < SymbolsUsed) and (Symbol <> Symbols[I].ID) do Inc(I); Assert(I < SymbolsUsed); end; // Now find out if the state exists in the symbol's state list. with FUREBuffer.SymbolTable.Symbols[I].States do begin Found := False; for J := 0 to ListUsed - 1 do begin if State <= List[J] then begin Found := True; Break; end; end; if not Found then J := ListUsed; if not Found or (State < List[J]) then begin // Need to add the state in order. if ListUsed = Length(List) then SetLength(List, Length(List) + 8); if J < ListUsed then Move(List[J], List[J + 1], SizeOf(SizeInt) * (ListUsed - J)); List[J] := State; Inc(ListUsed); end; end; end; function TURESearch.AddState(NewStates: array of SizeInt): SizeInt; var I: SizeInt; Found: Boolean; begin Found := False; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if (FUREBuffer.States.States[I].StateList.ListUsed = Length(NewStates)) and CompareMem(@NewStates[0], @FUREBuffer.States.States[I].StateList.List[0], SizeOf(SizeInt) * Length(NewStates)) then begin Found := True; Break; end; end; if not Found then begin // Need to add a new DFA State (set of NFA states). if FUREBuffer.States.StatesUsed = Length(FUREBuffer.States.States) then SetLength(FUREBuffer.States.States, Length(FUREBuffer.States.States) + 8); with FUREBuffer.States.States[FUREBuffer.States.StatesUsed] do begin ID := FUREBuffer.States.StatesUsed; if (StateList.ListUsed + Length(NewStates)) >= Length(StateList.List) then SetLength(StateList.List, Length(StateList.List) + Length(NewStates) + 8); Move(NewStates[0], StateList.List[StateList.ListUsed], SizeOf(SizeInt) * Length(NewStates)); Inc(StateList.ListUsed, Length(NewStates)); end; Inc(FUREBuffer.States.StatesUsed); end; // Return the ID of the DFA state representing a group of NFA States. if Found then Result := I else Result := FUREBuffer.States.StatesUsed - 1; end; procedure TURESearch.Reduce(Start: SizeInt); var I, J, Symbols: SizeInt; State, RHS, s1, s2, ns1, ns2: SizeInt; Evaluating: Boolean; begin FUREBuffer.Reducing := True; // Add the starting state for the reduction. AddState([Start]); // Process each set of NFA states that get created. I := 0; // further states are added in the loop while I < FUREBuffer.States.StatesUsed do begin with FUREBuffer, States.States[I], ExpressionList do begin // Push the current states on the stack. for J := 0 to StateList.ListUsed - 1 do Push(StateList.List[J]); // Reduce the NFA states. Accepting := False; Symbols := 0; J := 0; // need a while loop here as the stack will be modified within the loop and // so also its usage count used to terminate the loop while J < FUREBuffer.Stack.ListUsed do begin State := FUREBuffer.Stack.List[J]; Evaluating := True; // This inner loop is the iterative equivalent of recursively // reducing subexpressions generated as a result of a reduction. while Evaluating do begin case Expressions[State].AType of _URE_SYMBOL: begin ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP); AddSymbolState(Expressions[State].LHS, ns1); Inc(Symbols); Evaluating := False; end; _URE_ONE: begin Accepting := True; Evaluating := False; end; _URE_QUEST: begin s1 := Expressions[State].LHS; ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP); State := MakeExpression(_URE_OR, ns1, s1); end; _URE_PLUS: begin s1 := Expressions[State].LHS; ns1 := MakeExpression(_URE_STAR, s1, _URE_NOOP); State := MakeExpression(_URE_AND, s1, ns1); end; _URE_STAR: begin s1 := Expressions[State].LHS; ns1 := MakeExpression(_URE_ONE, _URE_NOOP, _URE_NOOP); ns2 := MakeExpression(_URE_PLUS, s1, _URE_NOOP); State := MakeExpression(_URE_OR, ns1, ns2); end; _URE_OR: begin s1 := Expressions[State].LHS; s2 := Expressions[State].RHS; Push(s1); Push(s2); Evaluating := False; end; _URE_AND: begin s1 := Expressions[State].LHS; s2 := Expressions[State].RHS; case Expressions[s1].AType of _URE_SYMBOL: begin AddSymbolState(Expressions[s1].LHS, s2); Inc(Symbols); Evaluating := False; end; _URE_ONE: State := s2; _URE_QUEST: begin ns1 := Expressions[s1].LHS; ns2 := MakeExpression(_URE_AND, ns1, s2); State := MakeExpression(_URE_OR, s2, ns2); end; _URE_PLUS: begin ns1 := Expressions[s1].LHS; ns2 := MakeExpression(_URE_OR, s2, State); State := MakeExpression(_URE_AND, ns1, ns2); end; _URE_STAR: begin ns1 := Expressions[s1].LHS; ns2 := MakeExpression(_URE_AND, ns1, State); State := MakeExpression(_URE_OR, s2, ns2); end; _URE_OR: begin ns1 := Expressions[s1].LHS; ns2 := Expressions[s1].RHS; ns1 := MakeExpression(_URE_AND, ns1, s2); ns2 := MakeExpression(_URE_AND, ns2, s2); State := MakeExpression(_URE_OR, ns1, ns2); end; _URE_AND: begin ns1 := Expressions[s1].LHS; ns2 := Expressions[s1].RHS; ns2 := MakeExpression(_URE_AND, ns2, s2); State := MakeExpression(_URE_AND, ns1, ns2); end; end; end; end; end; Inc(J); end; // clear the state stack while Pop <> _URE_NOOP do { nothing }; // generate the DFA states for the symbols collected during the current reduction if (TransitionsUsed + Symbols) > Length(Transitions) then SetLength(Transitions, Length(Transitions) + Symbols); // go through the symbol table and generate the DFA state transitions for // each symbol that has collected NFA states Symbols := 0; J := 0; while J < FUREBuffer.SymbolTable.SymbolsUsed do begin begin if FUREBuffer.SymbolTable.Symbols[J].States.ListUsed > 0 then begin Transitions[Symbols].LHS := FUREBuffer.SymbolTable.Symbols[J].ID; with FUREBuffer.SymbolTable.Symbols[J] do begin RHS := AddState(Copy(States.List, 0, States.ListUsed)); States.ListUsed := 0; end; Transitions[Symbols].RHS := RHS; Inc(Symbols); end; end; Inc(J); end; // set the number of transitions actually used // Note: we need again to qualify a part of the TransistionsUsed path since the // state array could be reallocated in the AddState call above and the // with ... do will then be invalid. States.States[I].TransitionsUsed := Symbols; end; Inc(I); end; FUREBuffer.Reducing := False; end; procedure TURESearch.AddEquivalentPair(L, R: SizeInt); var I: SizeInt; begin L := FUREBuffer.States.States[L].ID; R := FUREBuffer.States.States[R].ID; if L <> R then begin if L > R then begin I := L; L := R; R := I; end; // Check to see if the equivalence pair already exists. I := 0; with FUREBuffer.EquivalentList do begin while (I < EquivalentsUsed) and ((Equivalents[I].Left <> L) or (Equivalents[I].Right <> R)) do Inc(I); if I >= EquivalentsUsed then begin if EquivalentsUsed = Length(Equivalents) then SetLength(Equivalents, Length(Equivalents) + 8); Equivalents[EquivalentsUsed].Left := L; Equivalents[EquivalentsUsed].Right := R; Inc(EquivalentsUsed); end; end; end; end; procedure TURESearch.MergeEquivalents; // merges the DFA states that are equivalent var I, J, K, Equal: SizeInt; Done: Boolean; State1, State2, LeftState, RightState: PUcState; begin for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin State1 := @FUREBuffer.States.States[I]; if State1.ID = SizeInt(I) then begin J := 0; while J < I do begin State2 := @FUREBuffer.States.States[J]; if State2.ID = SizeInt(J) then begin FUREBuffer.EquivalentList.EquivalentsUsed := 0; AddEquivalentPair(I, J); Done := False; Equal := 0; while Equal < FUREBuffer.EquivalentList.EquivalentsUsed do begin LeftState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Left]; RightState := @FUREBuffer.States.States[FUREBuffer.EquivalentList.Equivalents[Equal].Right]; if (LeftState.Accepting <> RightState.Accepting) or (LeftState.TransitionsUsed <> RightState.TransitionsUsed) then begin Done := True; Break; end; K := 0; while (K < LeftState.TransitionsUsed) and (LeftState.Transitions[K].LHS = RightState.Transitions[K].LHS) do Inc(K); if K < LeftState.TransitionsUsed then begin Done := True; Break; end; for K := 0 to LeftState.TransitionsUsed - 1 do AddEquivalentPair(LeftState.Transitions[K].RHS, RightState.Transitions[K].RHS); Inc(Equal); end; if not Done then Break; end; Inc(J); end; if J < I then begin with FUREBuffer do begin for Equal := 0 to EquivalentList.EquivalentsUsed - 1 do begin States.States[EquivalentList.Equivalents[Equal].Right].ID := States.States[EquivalentList.Equivalents[Equal].Left].ID; end; end; end; end; end; // Renumber the states appropriately State1 := @FUREBuffer.States.States[0]; Equal := 0; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if State1.ID = SizeInt(I) then begin State1.ID := Equal; Inc(Equal); end else State1.ID := FUREBuffer.States.States[State1.ID].ID; Inc(State1); end; end; procedure TURESearch.ClearUREBuffer; var I: SizeInt; begin with FUREBuffer do begin // quite a few dynamic arrays to free Stack.List := nil; ExpressionList.Expressions := nil; // the symbol table has been handed over to the DFA and will be freed on // release of the DFA SymbolTable.SymbolsUsed := 0; for I := 0 to States.StatesUsed - 1 do begin States.States[I].Transitions := nil; States.States[I].StateList.List := nil; States.States[I].StateList.ListUsed := 0; States.States[I].TransitionsUsed := 0; end; States.StatesUsed := 0; States.States := nil; EquivalentList.Equivalents := nil; end; ResetMemory(FUREBuffer, SizeOf(FUREBuffer)); end; procedure TURESearch.CompileURE(RE: PWideChar; RELength: SizeInt; Casefold: Boolean); var I, J: SizeInt; State: SizeInt; Run: PUcState; TP: SizeInt; procedure UREError(Text: string; RE: PWideChar); var S: string; begin S := RE; raise EJclUnicodeError.CreateResFmt(@RsUREErrorFmt, [LoadResString(@RsUREBaseString), Text, S]); end; begin // be paranoid if (RE <> nil) and (RE^ <> WideNull) and (RELength > 0) then begin // Reset the various fields of the compilation buffer. Default the Flags // to indicate the presense of the "^$" pattern. If any other pattern // occurs, then this flag will be removed. This is done to catch this // special pattern and handle it specially when matching. ClearUREBuffer; ClearDFA; FUREBuffer.Flags := _URE_DFA_BLANKLINE; if Casefold then FUREBuffer.Flags := FUREBuffer.Flags or _URE_DFA_CASEFOLD; // Construct the NFA. If this stage returns a 0, then an error occured or an // empty expression was passed. State := ConvertRegExpToNFA(RE, RELength); if State <> _URE_NOOP then begin // Do the expression reduction to get the initial DFA. Reduce(State); // Merge all the equivalent DFA States. MergeEquivalents; // Construct the minimal DFA. FDFA.Flags := FUREBuffer.Flags and (_URE_DFA_CASEFOLD or _URE_DFA_BLANKLINE); // Free up the NFA state groups and transfer the symbols from the buffer // to the DFA. FDFA.SymbolTable := FUREBuffer.SymbolTable; FUREBuffer.SymbolTable.Symbols := nil; // Collect the total number of states and transitions needed for the DFA. State := 0; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if FUREBuffer.States.States[I].ID = State then begin Inc(FDFA.StateList.StatesUsed); Inc(FDFA.TransitionList.TransitionsUsed, FUREBuffer.States.States[I].TransitionsUsed); Inc(State); end; end; // Allocate enough space for the states and transitions. SetLength(FDFA.StateList.States, FDFA.StateList.StatesUsed); SetLength(FDFA.TransitionList.Transitions, FDFA.TransitionList.TransitionsUsed); // Actually transfer the DFA States from the buffer. State := 0; TP := 0; Run := @FUREBuffer.States.States[0]; for I := 0 to FUREBuffer.States.StatesUsed - 1 do begin if Run.ID = State then begin FDFA.StateList.States[I].StartTransition := TP; FDFA.StateList.States[I].NumberTransitions := Run.TransitionsUsed; FDFA.StateList.States[I].Accepting := Run.Accepting; // Add the transitions for the state for J := 0 to FDFA.StateList.States[I].NumberTransitions - 1 do begin FDFA.TransitionList.Transitions[TP].Symbol := Run.Transitions[J].LHS; FDFA.TransitionList.Transitions[TP].NextState := FUREBuffer.States.States[Run.Transitions[J].RHS].ID; Inc(TP); end; Inc(State); end; Inc(Run); end; end else begin // there might be an error while parsing the pattern, show it if so case FUREBuffer.Error of _URE_UNEXPECTED_EOS: UREError(LoadResString(@RsUREUnexpectedEOS), RE); _URE_CCLASS_OPEN: UREError(LoadResString(@RsURECharacterClassOpen), RE); _URE_UNBALANCED_GROUP: UREError(LoadResString(@RsUREUnbalancedGroup), RE); _URE_INVALID_PROPERTY: UREError(LoadResString(@RsUREInvalidCharProperty), RE); _URE_INVALID_RANGE: UREError(LoadResString(@RsUREInvalidRepeatRange), RE); _URE_RANGE_OPEN: UREError(LoadResString(@RsURERepeatRangeOpen), RE); else // expression was empty raise EJclUnicodeError.CreateRes(@RsUREExpressionEmpty); end; end; end; end; procedure TURESearch.ClearDFA; var I: SizeInt; begin with FDFA do begin for I := 0 to SymbolTable.SymbolsUsed - 1 do begin if (SymbolTable.Symbols[I].AType = _URE_CCLASS) or (SymbolTable.Symbols[I].AType = _URE_NCCLASS) then SymbolTable.Symbols[I].Symbol.CCL.Ranges := nil; end; for I := 0 to SymbolTable.SymbolsUsed - 1 do begin FDFA.SymbolTable.Symbols[I].States.List := nil; FDFA.SymbolTable.Symbols[I].States.ListUsed := 0; end; SymbolTable.SymbolsUsed := 0; SymbolTable.Symbols := nil; StateList.States := nil; TransitionList.Transitions := nil; end; ResetMemory(FDFA, SizeOf(FDFA)); end; function IsSeparator(C: UCS4): Boolean; begin Result := (C = $D) or (C = $A) or (C = $2028) or (C = $2029); end; function TURESearch.ExecuteURE(Flags: Cardinal; Text: PUCS2; TextLen: SizeInt; var MatchStart, MatchEnd: SizeInt): Boolean; var I, J: SizeInt; Matched, Found: Boolean; Start, Stop: SizeInt; C: UCS4; Run, Tail, lp: PUCS2; LastState: PDFAState; Symbol: PUcSymbolTableEntry; Rp: PUcRange; LCMapping: TUCS4Array; begin Result := False; if Text <> nil then begin // Handle the special case of an empty string matching the "^$" pattern. if (Textlen = 0) and ((FDFA.Flags and _URE_DFA_BLANKLINE) <> 0) then begin MatchStart := 0; MatchEnd := 0; Result := True; Exit; end; Run := Text; Tail := Run + TextLen; Start := -1; Stop := -1; LastState := @FDFA.StateList.States[0]; Found := False; while not Found and (Run < Tail) do begin lp := Run; C := UCS4(Run^); Inc(Run); // Check to see if this is a high surrogate that should be combined with a // following low surrogate. if (Run < Tail) and (SurrogateHighStart <= C) and (C <= SurrogateHighEnd) and (SurrogateLowStart <= UCS4(Run^)) and (UCS4(Run^) <= SurrogateLowEnd) then begin C := $10000 + (((C and $03FF) shl 10) or (UCS4(Run^) and $03FF)); Inc(Run); end; // Determine if the character is non-spacing and should be skipped. if ((Flags and URE_IGNORE_NONSPACING) <> 0) and UnicodeIsNonSpacingMark(C) then begin Inc(Run); Continue; end; if (FDFA.Flags and _URE_DFA_CASEFOLD) <> 0 then begin SetLength(LCMapping, 0); { TODO : use the entire mapping, not only the first character } // (CaseLookup used for a little extra speed: avoids dynamic array allocation) if CaseLookup(C, ctLower, LCMapping) then C := LCMapping[0]; end; // See if one of the transitions matches. I := LastState.NumberTransitions - 1; Matched := False; while not Matched and (I >= 0) do begin Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol]; case Symbol.AType of _URE_ANY_CHAR: if ((Flags and URE_DONT_MATCHES_SEPARATORS) <> 0) or not IsSeparator(C) then Matched := True; _URE_CHAR: if C = Symbol.Symbol.Chr then Matched := True; _URE_BOL_ANCHOR: if Lp = Text then begin Run := lp; Matched := True; end else begin if IsSeparator(C) then begin if (C = $D) and (Run < Tail) and (Run^ = #$A) then Inc(Run); Lp := Run; Matched := True; end; end; _URE_EOL_ANCHOR: if IsSeparator(C) then begin // Put the pointer back before the separator so the match end // position will be correct. This will also cause the `Run' // pointer to be advanced over the current separator once the // match end point has been recorded. Run := Lp; Matched := True; end; _URE_CCLASS, _URE_NCCLASS: with Symbol^ do begin if Categories <> [] then Matched := CategoryLookup(C, Categories); if Symbol.CCL.RangesUsed > 0 then begin Rp := @Symbol.CCL.Ranges[0]; for J := 0 to Symbol.CCL.RangesUsed - 1 do begin if (Rp.MinCode <= C) and (C <= Rp.MaxCode) then begin Matched := True; Break; end; Inc(Rp); end; end; if AType = _URE_NCCLASS then Matched := not Matched; end; end; if Matched then begin if Start = -1 then Start := Lp - Text else Stop := Run - Text; LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState]; // If the match was an EOL anchor, adjust the pointer past the separator // that caused the match. The correct match position has been recorded // already. if Symbol.AType = _URE_EOL_ANCHOR then begin // skip the character that caused the match Inc(Run); // handle the infamous CRLF situation if (Run < Tail) and (C = $D) and (Run^ = #$A) then Inc(Run); end; end; Dec(I); end; if not Matched then begin Found := LastState.Accepting; if not Found then begin // If the last state was not accepting, then reset and start over. LastState := @FDFA.StateList.States[0]; Start := -1; Stop := -1; end else begin // set start and stop pointer if not yet done if Start = -1 then begin Start := Lp - Text; Stop := Run - Text; end else begin if Stop = -1 then Stop := Lp - Text; end; end; end else begin if Run = Tail then begin if not LastState.Accepting then begin // This ugly hack is to make sure the end-of-line anchors match // when the source text hits the end. This is only done if the last // subexpression matches. for I := 0 to LastState.NumberTransitions - 1 do begin if Found then Break; Symbol := @FDFA.SymbolTable.Symbols[FDFA.TransitionList.Transitions[LastState.StartTransition + I].Symbol]; if Symbol.AType =_URE_EOL_ANCHOR then begin LastState := @FDFA.StateList.States[FDFA.TransitionList.Transitions[LastState.StartTransition + I].NextState]; if LastState.Accepting then begin Stop := Run - Text; Found := True; end else Break; end; end; end else begin // Make sure any conditions that match all the way to the end of // the string match. Found := True; Stop := Run - Text; end; end; end; end; if Found then begin MatchStart := Start; MatchEnd := Stop; end; Result := Found; end; end; function TURESearch.FindAll(const Text: WideString): Boolean; begin Result := FindAll(PWideChar(Text), Length(Text)); end; function TURESearch.FindAll(Text: PWideChar; TextLen: SizeInt): Boolean; // Looks for all occurences of the pattern passed to FindPrepare and creates an // internal list of their positions. var Start, Stop: SizeInt; Run: PWideChar; RunLen: SizeInt; begin ClearResults; Run := Text; RunLen := TextLen; // repeat to find all occurences of the pattern Start := 0; Stop := 0; while ExecuteURE(0, Run, RunLen, Start, Stop) do begin // store this result (consider text pointer movement)... AddResult(Start + Run - Text, Stop + Run - Text); // ... and advance text position and length Inc(Run, Stop); Dec(RunLen, Stop); end; Result := FResults.Count > 0; end; function TURESearch.FindFirst(const Text: WideString; var Start, Stop: SizeInt): Boolean; begin Result := FindFirst(PWideChar(Text), Length(Text), Start, Stop); end; function TURESearch.FindFirst(Text: PWideChar; TextLen: SizeInt; var Start, Stop: SizeInt): Boolean; // Looks for the first occurence of the pattern passed to FindPrepare in Text and // returns True if one could be found (in which case Start and Stop are set to // the according indices) otherwise False. This function is in particular of // interest if only one occurence needs to be found. begin ClearResults; Result := ExecuteURE(0, Text, TextLen, Start, Stop); if Result then AddResult(Start, Stop); end; procedure TURESearch.FindPrepare(Pattern: PWideChar; PatternLength: SizeInt; Options: TSearchFlags); begin CompileURE(Pattern, PatternLength, not (sfCaseSensitive in Options)); end; procedure TURESearch.FindPrepare(const Pattern: WideString; Options: TSearchFlags); begin CompileURE(PWideChar(Pattern), Length(Pattern), not (sfCaseSensitive in Options)); end; //=== { TWideStrings } ======================================================= constructor TWideStrings.Create; begin inherited Create; FLanguage := GetUserDefaultLCID; FNormalizationForm := nfC; FSaveFormat := sfUnicodeLSB; end; procedure TWideStrings.SetLanguage(Value: LCID); begin FLanguage := Value; end; function TWideStrings.GetSaveUnicode: Boolean; begin Result := SaveFormat = sfUnicodeLSB; end; procedure TWideStrings.SetSaveUnicode(const Value: Boolean); begin if Value then SaveFormat := sfUnicodeLSB else SaveFormat := sfAnsi; end; function TWideStrings.Add(const S: WideString): Integer; begin Result := GetCount; Insert(Result, S); end; function TWideStrings.AddObject(const S: WideString; AObject: TObject): Integer; begin Result := Add(S); PutObject(Result, AObject); end; procedure TWideStrings.Append(const S: WideString); begin Add(S); end; procedure TWideStrings.AddStrings(Strings: TStrings); var I: Integer; {$IFNDEF SUPPORTS_UNICODE} CP: Word; {$ENDIF ~SUPPORTS_UNICODE} begin BeginUpdate; try {$IFNDEF SUPPORTS_UNICODE} CP := CodePageFromLocale(FLanguage); {$ENDIF ~SUPPORTS_UNICODE} for I := 0 to Strings.Count - 1 do begin {$IFDEF SUPPORTS_UNICODE} AddObject(Strings[I], Strings.Objects[I]) {$ELSE ~SUPPORTS_UNICODE} AddObject(StringToWideStringEx(Strings[I], CP), Strings.Objects[I]) {$ENDIF ~SUPPORTS_UNICODE} end; finally EndUpdate; end; end; procedure TWideStrings.AddStrings(Strings: TWideStrings); var I: Integer; begin Assert(Strings <> nil); BeginUpdate; try for I := 0 to Strings.Count - 1 do AddObject(Strings[I], Strings.Objects[I]); finally EndUpdate; end; end; procedure TWideStrings.Assign(Source: TPersistent); // usual assignment routine, but able to assign wide and small strings begin if Source is TWideStrings then begin BeginUpdate; try Clear; AddStrings(TWideStrings(Source)); finally EndUpdate; end; end else begin if Source is TStrings then begin BeginUpdate; try Clear; AddStrings(TStrings(Source)); finally EndUpdate; end; end else inherited Assign(Source); end; end; procedure TWideStrings.AssignTo(Dest: TPersistent); // need to do also assignment to old style TStrings, but this class doesn't know // TWideStrings, so we need to do it from here var I: Integer; {$IFNDEF SUPPORTS_UNICODE} CP: Word; {$ENDIF ~SUPPORTS_UNICODE} begin if Dest is TStrings then begin with Dest as TStrings do begin BeginUpdate; try {$IFNDEF SUPPORTS_UNICODE} CP := CodePageFromLocale(FLanguage); {$ENDIF SUPPORTS_UNICODE} Clear; for I := 0 to Self.Count - 1 do begin {$IFDEF SUPPORTS_UNICODE} AddObject(Self[I], Self.Objects[I]); {$ELSE ~SUPPORTS_UNICODE} AddObject(WideStringToStringEx(Self[I], CP), Self.Objects[I]); {$ENDIF ~SUPPORTS_UNICODE} end; finally EndUpdate; end; end; end else begin if Dest is TWideStrings then begin with Dest as TWideStrings do begin BeginUpdate; try Clear; AddStrings(Self); finally EndUpdate; end; end; end else inherited AssignTo(Dest); end; end; procedure TWideStrings.BeginUpdate; begin if FUpdateCount = 0 then SetUpdateState(True); Inc(FUpdateCount); end; procedure TWideStrings.DefineProperties(Filer: TFiler); // Defines a private property for the content of the list. // There's a bug in the handling of text DFMs in Classes.pas which prevents // WideStrings from loading under some circumstances. Zbysek Hlinka // (zhlinka att login dott cz) brought this to my attention and supplied also a solution. // See ReadData and WriteData methods for implementation details. //--------------- local function -------------------------------------------- function DoWrite: Boolean; begin if Filer.Ancestor <> nil then begin Result := True; if Filer.Ancestor is TWideStrings then Result := not Equals(TWideStrings(Filer.Ancestor)) end else Result := Count > 0; end; //--------------- end local function ---------------------------------------- begin Filer.DefineProperty('WideStrings', ReadData, WriteData, DoWrite); end; procedure TWideStrings.DoConfirmConversion(var Allowed: Boolean); begin if Assigned(FOnConfirmConversion) then FOnConfirmConversion(Self, Allowed); end; procedure TWideStrings.EndUpdate; begin Dec(FUpdateCount); if FUpdateCount = 0 then SetUpdateState(False); end; function TWideStrings.Equals(Strings: TWideStrings): Boolean; var I, Count: Integer; begin Assert(Strings <> nil); Result := False; Count := GetCount; if Count <> Strings.GetCount then Exit; { TODO : use internal comparation routine as soon as composition is implemented } for I := 0 to Count - 1 do if Get(I) <> Strings.Get(I) then Exit; Result := True; end; procedure TWideStrings.Error(const Msg: string; Data: Integer); function ReturnAddr: Pointer; asm {$IFDEF CPU32} MOV EAX, EBP MOV EAX, [EAX + 4] {$ENDIF CPU32} {$IFDEF CPU64} MOV RAX, RBP MOV RAX, [RAX + 8] {$ENDIF CPU64} end; begin raise EStringListError.CreateFmt(Msg, [Data]) at ReturnAddr; end; procedure TWideStrings.Exchange(Index1, Index2: Integer); var TempObject: TObject; TempString: WideString; begin BeginUpdate; try TempString := Strings[Index1]; TempObject := Objects[Index1]; Strings[Index1] := Strings[Index2]; Objects[Index1] := Objects[Index2]; Strings[Index2] := TempString; Objects[Index2] := TempObject; finally EndUpdate; end; end; function TWideStrings.GetCapacity: Integer; // Descendants may optionally override/replace this default implementation. begin Result := Count; end; function TWideStrings.GetCommaText: WideString; var S: WideString; P: PWideChar; I, Count: Integer; begin Count := GetCount; if (Count = 1) and (Get(0) = '') then Result := '""' else begin Result := ''; for I := 0 to Count - 1 do begin S := Get(I); P := PWideChar(S); while (P^ > WideSpace) and (P^ <> '"') and (P^ <> ',') do Inc(P); if P^ <> WideNull then S := WideQuotedStr(S, '"'); Result := Result + S + ','; end; System.Delete(Result, Length(Result), 1); end; end; function TWideStrings.GetName(Index: Integer): WideString; var P: Integer; begin Result := Get(Index); P := Pos('=', Result); if P > 0 then SetLength(Result, P - 1) else Result := ''; end; function TWideStrings.GetObject(Index: Integer): TObject; begin Result := nil; end; function TWideStrings.GetSeparatedText(Separators: WideString): WideString; // Same as GetText but with customizable separator characters. var I, L, Size, Count, SepSize: Integer; P: PWideChar; S: WideString; begin Count := GetCount; SepSize := Length(Separators); Size := 0; for I := 0 to Count - 1 do Inc(Size, Length(Get(I)) + SepSize); // set one separator less, the last line does not need a trailing separator SetLength(Result, Size - SepSize); if Size > 0 then begin P := Pointer(Result); I := 0; while True do begin S := Get(I); L := Length(S); if L <> 0 then begin // add current string System.Move(Pointer(S)^, P^, 2 * L); Inc(P, L); end; Inc(I); if I = Count then Break; // add separators System.Move(Pointer(Separators)^, P^, SizeOf(WideChar) * SepSize); Inc(P, SepSize); end; end; end; function TWideStrings.GetTextStr: WideString; begin Result := GetSeparatedText(WideCRLF); end; function TWideStrings.GetText: PWideChar; begin Result := StrNewW(GetTextStr); end; function TWideStrings.GetValue(const Name: WideString): WideString; var I: Integer; begin I := IndexOfName(Name); if I >= 0 then Result := Copy(Get(I), Length(Name) + 2, MaxInt) else Result := ''; end; function TWideStrings.IndexOf(const S: WideString): Integer; var NormString: WideString; begin NormString := WideNormalize(S, FNormalizationForm); for Result := 0 to GetCount - 1 do if WideCompareText(Get(Result), NormString, FLanguage) = 0 then Exit; Result := -1; end; function TWideStrings.IndexOfName(const Name: WideString): Integer; var P: Integer; S: WideString; NormName: WideString; begin NormName := WideNormalize(Name, FNormalizationForm); for Result := 0 to GetCount - 1 do begin S := Get(Result); P := Pos('=', S); if (P > 0) and (WideCompareText(Copy(S, 1, P - 1), NormName, FLanguage) = 0) then Exit; end; Result := -1; end; function TWideStrings.IndexOfObject(AObject: TObject): Integer; begin for Result := 0 to GetCount - 1 do if GetObject(Result) = AObject then Exit; Result := -1; end; procedure TWideStrings.InsertObject(Index: Integer; const S: WideString; AObject: TObject); begin Insert(Index, S); PutObject(Index, AObject); end; procedure TWideStrings.LoadFromFile(const FileName: TFileName); var Stream: TStream; begin try Stream := TFileStream.Create(FileName, fmOpenRead or fmShareDenyNone); try LoadFromStream(Stream); finally Stream.Free; end; except RaiseLastOSError; end; end; procedure TWideStrings.LoadFromStream(Stream: TStream); // usual loader routine, but enhanced to handle byte order marks in stream var Size, BytesRead: Integer; ByteOrderMask: array [0..5] of Byte; // BOM size is max 5 bytes (cf: wikipedia) // but it is easier to implement with a multiple of 2 Loaded: Boolean; SW: WideString; SA: AnsiString; begin BeginUpdate; try Loaded := False; Size := Stream.Size - Stream.Position; ByteOrderMask[0] := 0; BytesRead := Stream.Read(ByteOrderMask[0],SizeOf(ByteOrderMask)); // UTF16 LSB = Unicode LSB if (BytesRead >= 2) and (ByteOrderMask[0] = BOM_UTF16_LSB[0]) and (ByteOrderMask[1] = BOM_UTF16_LSB[1]) then begin FSaveFormat := sfUTF16LSB; SetLength(SW, (Size - 2) div SizeOf(WideChar)); Assert((Size and 1) <> 1,'Number of chars must be a multiple of 2'); if BytesRead > 2 then begin System.Move(ByteOrderMask[2], SW[1], BytesRead-2); // max 4 bytes = 2 widechars if Size > BytesRead then // first 2 chars (maximum) were copied by System.Move Stream.Read(SW[3], Size-BytesRead); end; SetText(SW); Loaded := True; end; // UTF16 MSB = Unicode MSB if (BytesRead >= 2) and (ByteOrderMask[0] = BOM_UTF16_MSB[0]) and (ByteOrderMask[1] = BOM_UTF16_MSB[1]) then begin FSaveFormat := sfUTF16MSB; SetLength(SW, (Size - 2) div SizeOf(WideChar)); Assert((Size and 1) <> 1,'Number of chars must be a multiple of 2'); if BytesRead > 2 then begin System.Move(ByteOrderMask[2],SW[1],BytesRead-2); // max 4 bytes = 2 widechars if Size > BytesRead then // first 2 chars (maximum) were copied by System.Move Stream.Read(SW[3], Size-BytesRead); StrSwapByteOrder(PWideChar(SW)); end; SetText(SW); Loaded := True; end; // UTF8 if (BytesRead >= 3) and (ByteOrderMask[0] = BOM_UTF8[0]) and (ByteOrderMask[1] = BOM_UTF8[1]) and (ByteOrderMask[2] = BOM_UTF8[2]) then begin FSaveFormat := sfUTF8; SetLength(SA, (Size-3) div SizeOf(AnsiChar)); if BytesRead > 3 then begin System.Move(ByteOrderMask[3],SA[1],BytesRead-3); // max 3 bytes = 3 chars if Size > BytesRead then // first 3 chars were copied by System.Move Stream.Read(SA[4], Size-BytesRead); SW := UTF8ToWideString(SA); end; SetText(SW); Loaded := True; end; // default case (Ansi) if not Loaded then begin FSaveFormat := sfAnsi; SetLength(SA, Size div SizeOf(AnsiChar)); if BytesRead > 0 then begin System.Move(ByteOrderMask[0], SA[1], BytesRead); // max 6 bytes = 6 chars if Size > BytesRead then Stream.Read(SA[7], Size-BytesRead); // first 6 chars were copied by System.Move end; SetText(StringToWideStringEx(SA, CodePageFromLocale(FLanguage))); end; finally EndUpdate; end; end; procedure TWideStrings.Move(CurIndex, NewIndex: Integer); var TempObject: TObject; TempString: WideString; begin if CurIndex <> NewIndex then begin BeginUpdate; try TempString := Get(CurIndex); TempObject := GetObject(CurIndex); Delete(CurIndex); InsertObject(NewIndex, TempString, TempObject); finally EndUpdate; end; end; end; procedure TWideStrings.ReadData(Reader: TReader); begin case Reader.NextValue of vaLString, vaString: SetText(Reader.ReadString); else SetText(Reader.ReadWideString); end; end; procedure TWideStrings.SaveToFile(const FileName: TFileName); var Stream: TStream; begin Stream := TFileStream.Create(FileName, fmCreate); try SaveToStream(Stream); finally Stream.Free; end; end; procedure TWideStrings.SaveToStream(Stream: TStream; WithBOM: Boolean = True); // Saves the currently loaded text into the given stream. WithBOM determines whether to write a // byte order mark or not. Note: when saved as ANSI text there will never be a BOM. var SW: WideString; SA: AnsiString; Allowed: Boolean; Run: PWideChar; begin // The application can decide in which format to save the content. // If FSaveUnicode is False then all strings are saved in standard ANSI format // which is also loadable by TStrings but you should be aware that all Unicode // strings are then converted to ANSI based on the current system locale. // An extra event is supplied to ask the user about the potential loss of // information when converting Unicode to ANSI strings. SW := GetTextStr; Allowed := True; FSaved := False; // be pessimistic // A check for potential information loss makes only sense if the application has // set an event to be used as call back to ask about the conversion. if (FSaveFormat = sfAnsi) and Assigned(FOnConfirmConversion) then begin // application requests to save only ANSI characters, so check the text and // call back in case information could be lost Run := PWideChar(SW); // only ask if there's at least one Unicode character in the text while (Run^>#0) and (run^<=#255) do Inc(Run); // Note: The application can still set FSaveUnicode to True in the callback. if Run^ <> WideNull then DoConfirmConversion(Allowed); end; if Allowed then begin // only save if allowed case SaveFormat of sfUTF16LSB : begin if WithBOM then Stream.WriteBuffer(BOM_UTF16_LSB[0],SizeOf(BOM_UTF16_LSB)); if Length(SW) > 0 then Stream.WriteBuffer(SW[1],Length(SW)*SizeOf(UTF16)); FSaved := True; end; sfUTF16MSB : begin if WithBOM then Stream.WriteBuffer(BOM_UTF16_MSB[0],SizeOf(BOM_UTF16_MSB)); if Length(SW) > 0 then begin StrSwapByteOrder(PWideChar(SW)); Stream.WriteBuffer(SW[1],Length(SW)*SizeOf(UTF16)); end; FSaved := True; end; sfUTF8 : begin if WithBOM then Stream.WriteBuffer(BOM_UTF8[0],SizeOf(BOM_UTF8)); if Length(SW) > 0 then begin SA := WideStringToUTF8(SW); Stream.WriteBuffer(SA[1],Length(SA)*SizeOf(UTF8)); end; FSaved := True; end; sfAnsi : begin if Length(SW) > 0 then begin SA := WideStringToStringEx(SW,CodePageFromLocale(FLanguage)); Stream.WriteBuffer(SA[1],Length(SA)*SizeOf(AnsiChar)); end; FSaved := True; end; end; end; end; procedure TWideStrings.SetCapacity(NewCapacity: Integer); begin // do nothing - descendants may optionally implement this method end; procedure TWideStrings.SetCommaText(const Value: WideString); var P, P1: PWideChar; S: WideString; begin BeginUpdate; try Clear; P := PWideChar(Value); while (P^ >= #1) and (P^ <= WideSpace) do Inc(P); while P^ <> WideNull do begin if P^ = '"' then S := WideExtractQuotedStr(P, '"') else begin P1 := P; while (P^ > WideSpace) and (P^ <> ',') do Inc(P); SetString(S, P1, P - P1); end; Add(S); while (P^ >= #1) and (P^ <= WideSpace) do Inc(P); if P^ = ',' then begin repeat Inc(P); until not ((P^ >= #1) and (P^ <= WideSpace)); end; end; finally EndUpdate; end; end; procedure TWideStrings.SetText(const Value: WideString); var Head, Tail: PWideChar; S: WideString; begin BeginUpdate; try Clear; Head := PWideChar(Value); while Head^ <> WideNull do begin Tail := Head; while (Tail^ <> WideNull) and (Tail^ <> WideLineFeed) and (Tail^ <> WideCarriageReturn) and (Tail^ <> WideVerticalTab) and (Tail^ <> WideFormFeed) and (Tail^ <> WideLineSeparator) and (Tail^ <> WideParagraphSeparator) do Inc(Tail); SetString(S, Head, Tail - Head); Add(S); Head := Tail; if Head^ <> WideNull then begin Inc(Head); if (Tail^ = WideCarriageReturn) and (Head^ = WideLineFeed) then Inc(Head); end; end; finally EndUpdate; end; end; procedure TWideStrings.SetUpdateState(Updating: Boolean); begin end; procedure TWideStrings.SetNormalizationForm(const Value: TNormalizationForm); var I: Integer; begin if FNormalizationForm <> Value then begin FNormalizationForm := Value; if FNormalizationForm <> nfNone then begin // renormalize all strings according to the new form for I := 0 to GetCount - 1 do Put(I, WideNormalize(Get(I), FNormalizationForm)); end; end; end; procedure TWideStrings.SetValue(const Name, Value: WideString); var I : Integer; begin I := IndexOfName(Name); if Value <> '' then begin if I < 0 then I := Add(''); Put(I, Name + '=' + Value); end else begin if I >= 0 then Delete(I); end; end; procedure TWideStrings.WriteData(Writer: TWriter); begin Writer.WriteWideString(GetTextStr); end; //=== { TWideStringList } ==================================================== destructor TWideStringList.Destroy; begin FOnChange := nil; FOnChanging := nil; Clear; inherited Destroy; end; function TWideStringList.Add(const S: WideString): Integer; begin if not Sorted then Result := FCount else begin if Find(S, Result) then begin case Duplicates of dupIgnore: Exit; dupError: Error(SDuplicateString, 0); end; end; end; InsertItem(Result, S); end; procedure TWideStringList.Changed; begin if (FUpdateCount = 0) and Assigned(FOnChange) then FOnChange(Self); end; procedure TWideStringList.Changing; begin if (FUpdateCount = 0) and Assigned(FOnChanging) then FOnChanging(Self); end; procedure TWideStringList.Clear; {$IFDEF OWN_WIDESTRING_MEMMGR} var I: Integer; {$ENDIF OWN_WIDESTRING_MEMMGR} begin if FCount <> 0 then begin Changing; {$IFDEF OWN_WIDESTRING_MEMMGR} for I := 0 to FCount - 1 do with FList[I] do if TDynWideCharArray(FString) <> nil then TDynWideCharArray(FString) := nil; {$ENDIF OWN_WIDESTRING_MEMMGR} // this will automatically finalize the array FList := nil; FCount := 0; SetCapacity(0); Changed; end; end; procedure TWideStringList.Delete(Index: Integer); begin if Cardinal(Index) >= Cardinal(FCount) then Error(SListIndexError, Index); Changing; {$IFDEF OWN_WIDESTRING_MEMMGR} SetListString(Index, ''); {$ELSE ~OWN_WIDESTRING_MEMMGR} FList[Index].FString := ''; {$ENDIF ~OWN_WIDESTRING_MEMMGR} Dec(FCount); if Index < FCount then begin System.Move(FList[Index + 1], FList[Index], (FCount - Index) * SizeOf(TWideStringItem)); Pointer(FList[FCount].FString) := nil; // avoid freeing the string, the address is now used in another element end; Changed; end; procedure TWideStringList.Exchange(Index1, Index2: Integer); begin if Cardinal(Index1) >= Cardinal(FCount) then Error(SListIndexError, Index1); if Cardinal(Index2) >= Cardinal(FCount) then Error(SListIndexError, Index2); Changing; ExchangeItems(Index1, Index2); Changed; end; procedure TWideStringList.ExchangeItems(Index1, Index2: Integer); var Temp: TWideStringItem; begin Temp := FList[Index1]; FList[Index1] := FList[Index2]; FList[Index2] := Temp; end; function TWideStringList.Find(const S: WideString; var Index: Integer): Boolean; var L, H, I, C: Integer; NormString: WideString; begin Result := False; NormString := WideNormalize(S, FNormalizationForm); L := 0; H := FCount - 1; while L <= H do begin I := (L + H) shr 1; C := WideCompareText(FList[I].FString, NormString, FLanguage); if C < 0 then L := I+1 else begin H := I - 1; if C = 0 then begin Result := True; if Duplicates <> dupAccept then L := I; end; end; end; Index := L; end; function TWideStringList.Get(Index: Integer): WideString; {$IFDEF OWN_WIDESTRING_MEMMGR} var Len: Integer; {$ENDIF OWN_WIDESTRING_MEMMGR} begin if Cardinal(Index) >= Cardinal(FCount) then Error(SListIndexError, Index); {$IFDEF OWN_WIDESTRING_MEMMGR} with FList[Index] do begin Len := Length(TDynWideCharArray(FString)); if Len > 0 then begin SetLength(Result, Len - 1); // exclude #0 if Result <> '' then System.Move(FString^, Result[1], Len * SizeOf(WideChar)); end else Result := ''; end; {$ELSE ~OWN_WIDESTRING_MEMMGR} Result := FList[Index].FString; {$ENDIF ~OWN_WIDESTRING_MEMMGR} end; function TWideStringList.GetCapacity: Integer; begin Result := Length(FList); end; function TWideStringList.GetCount: Integer; begin Result := FCount; end; function TWideStringList.GetObject(Index: Integer): TObject; begin if Cardinal(Index) >= Cardinal(FCount) then Error(SListIndexError, Index); Result := FList[Index].FObject; end; procedure TWideStringList.Grow; var Delta, Len: Integer; begin Len := Length(FList); if Len > 64 then Delta := Len div 4 else begin if Len > 8 then Delta := 16 else Delta := 4; end; SetCapacity(Len + Delta); end; function TWideStringList.IndexOf(const S: WideString): Integer; begin if not Sorted then Result := inherited IndexOf(S) else if not Find(S, Result) then Result := -1; end; procedure TWideStringList.Insert(Index: Integer; const S: WideString); begin if Sorted then Error(SSortedListError, 0); if Cardinal(Index) > Cardinal(FCount) then Error(SListIndexError, Index); InsertItem(Index, S); end; {$IFDEF OWN_WIDESTRING_MEMMGR} procedure TWideStringList.SetListString(Index: Integer; const S: WideString); var Len: Integer; A: TDynWideCharArray; begin with FList[Index] do begin Pointer(A) := TDynWideCharArray(FString); if A <> nil then A := nil; // free memory Len := Length(S); if Len > 0 then begin SetLength(A, Len + 1); // include #0 System.Move(S[1], A[0], Len * SizeOf(WideChar)); A[Len] := #0; end; FString := PWideChar(A); Pointer(A) := nil; // do not release the array on procedure exit end; end; {$ENDIF OWN_WIDESTRING_MEMMGR} procedure TWideStringList.InsertItem(Index: Integer; const S: WideString); begin Changing; if FCount = Length(FList) then Grow; if Index < FCount then System.Move(FList[Index], FList[Index + 1], (FCount - Index) * SizeOf(TWideStringItem)); with FList[Index] do begin Pointer(FString) := nil; // avoid freeing the string, the address is now used in another element FObject := nil; if (FNormalizationForm <> nfNone) and (Length(S) > 0) then {$IFDEF OWN_WIDESTRING_MEMMGR} SetListString(Index, WideNormalize(S, FNormalizationForm)) else SetListString(Index, S); {$ELSE ~OWN_WIDESTRING_MEMMGR} FString := WideNormalize(S, FNormalizationForm) else FString := S; {$ENDIF ~OWN_WIDESTRING_MEMMGR} end; Inc(FCount); Changed; end; procedure TWideStringList.Put(Index: Integer; const S: WideString); begin if Sorted then Error(SSortedListError, 0); if Cardinal(Index) >= Cardinal(FCount) then Error(SListIndexError, Index); Changing; if (FNormalizationForm <> nfNone) and (Length(S) > 0) then {$IFDEF OWN_WIDESTRING_MEMMGR} SetListString(Index, WideNormalize(S, FNormalizationForm)) else SetListString(Index, S); {$ELSE ~OWN_WIDESTRING_MEMMGR} FList[Index].FString := WideNormalize(S, FNormalizationForm) else FList[Index].FString := S; {$ENDIF ~OWN_WIDESTRING_MEMMGR} Changed; end; procedure TWideStringList.PutObject(Index: Integer; AObject: TObject); begin if Cardinal(Index) >= Cardinal(FCount) then Error(SListIndexError, Index); Changing; FList[Index].FObject := AObject; Changed; end; procedure TWideStringList.QuickSort(L, R: Integer); var I, J: Integer; P: WideString; begin repeat I := L; J := R; P := FList[(L + R) shr 1].FString; repeat while WideCompareText(FList[I].FString, P, FLanguage) < 0 do Inc(I); while WideCompareText(FList[J].FString, P, FLanguage) > 0 do Dec(J); if I <= J then begin ExchangeItems(I, J); Inc(I); Dec(J); end; until I > J; if L < J then QuickSort(L, J); L := I; until I >= R; end; procedure TWideStringList.SetCapacity(NewCapacity: Integer); begin SetLength(FList, NewCapacity); if NewCapacity < FCount then FCount := NewCapacity; end; procedure TWideStringList.SetSorted(Value: Boolean); begin if FSorted <> Value then begin if Value then Sort; FSorted := Value; end; end; procedure TWideStringList.SetUpdateState(Updating: Boolean); begin if Updating then Changing else Changed; end; procedure TWideStringList.Sort; begin if not Sorted and (FCount > 1) then begin Changing; QuickSort(0, FCount - 1); Changed; end; end; procedure TWideStringList.SetLanguage(Value: LCID); begin inherited SetLanguage(Value); if Sorted then Sort; end; // exchanges in each character of the given string the low order and high order // byte to go from LSB to MSB and vice versa. // EAX contains address of string function WideAdjustLineBreaks(const S: WideString): WideString; var Source, SourceEnd, Dest: PWideChar; begin Source := Pointer(S); SourceEnd := Source + Length(S); Source := Pointer(S); SetString(Result, nil, SourceEnd - Source); Dest := Pointer(Result); while Source < SourceEnd do begin case Source^ of WideLineFeed: begin Dest^ := WideLineSeparator; Inc(Dest); Inc(Source); end; WideCarriageReturn: begin Dest^ := WideLineSeparator; Inc(Dest); Inc(Source); if Source^ = WideLineFeed then Inc(Source); end; else Dest^ := Source^; Inc(Dest); Inc(Source); end; end; SetLength(Result, (TJclAddr(Dest) - TJclAddr(Result)) div 2); end; function WideQuotedStr(const S: WideString; Quote: WideChar): WideString; // works like QuotedStr from SysUtils.pas but can insert any quotation character var P, Src, Dest: PWideChar; AddCount: SizeInt; begin AddCount := 0; P := StrScanW(PWideChar(S), Quote); while (P <> nil) do begin Inc(P); Inc(AddCount); P := StrScanW(P, Quote); end; if AddCount = 0 then Result := Quote + S + Quote else begin SetLength(Result, Length(S) + AddCount + 2); Dest := PWideChar(Result); Dest^ := Quote; Inc(Dest); Src := PWideChar(S); P := StrScanW(Src, Quote); repeat Inc(P); Move(Src^, Dest^, 2 * (P - Src)); Inc(Dest, P - Src); Dest^ := Quote; Inc(Dest); Src := P; P := StrScanW(Src, Quote); until P = nil; P := StrEndW(Src); Move(Src^, Dest^, 2 * (P - Src)); Inc(Dest, P - Src); Dest^ := Quote; end; end; function WideExtractQuotedStr(var Src: PWideChar; Quote: WideChar): WideString; // extracts a string enclosed in quote characters given by Quote var P, Dest: PWideChar; DropCount: SizeInt; begin Result := ''; if (Src = nil) or (Src^ <> Quote) then Exit; Inc(Src); DropCount := 1; P := Src; Src := StrScanW(Src, Quote); while Src <> nil do // count adjacent pairs of quote chars begin Inc(Src); if Src^ <> Quote then Break; Inc(Src); Inc(DropCount); Src := StrScanW(Src, Quote); end; if Src = nil then Src := StrEndW(P); if (Src - P) <= 1 then Exit; if DropCount = 1 then SetString(Result, P, Src - P - 1) else begin SetLength(Result, Src - P - DropCount); Dest := PWideChar(Result); Src := StrScanW(P, Quote); while Src <> nil do begin Inc(Src); if Src^ <> Quote then Break; Move(P^, Dest^, 2 * (Src - P)); Inc(Dest, Src - P); Inc(Src); P := Src; Src := StrScanW(Src, Quote); end; if Src = nil then Src := StrEndW(P); Move(P^, Dest^, 2 * (Src - P - 1)); end; end; function WideStringOfChar(C: WideChar; Count: SizeInt): WideString; // returns a string of Count characters filled with C var I: SizeInt; begin SetLength(Result, Count); for I := 1 to Count do Result[I] := C; end; function WideTrim(const S: WideString): WideString; var I, L: SizeInt; begin L := Length(S); I := 1; while (I <= L) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do Inc(I); if I > L then Result := '' else begin while UnicodeIsWhiteSpace(UCS4(S[L])) or UnicodeIsControl(UCS4(S[L])) do Dec(L); Result := Copy(S, I, L - I + 1); end; end; function WideTrimLeft(const S: WideString): WideString; var I, L: SizeInt; begin L := Length(S); I := 1; while (I <= L) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do Inc(I); Result := Copy(S, I, Maxint); end; function WideTrimRight(const S: WideString): WideString; var I: SizeInt; begin I := Length(S); while (I > 0) and (UnicodeIsWhiteSpace(UCS4(S[I])) or UnicodeIsControl(UCS4(S[I]))) do Dec(I); Result := Copy(S, 1, I); end; // returns the index of character Ch in S, starts searching at index Index // Note: This is a quick memory search. No attempt is made to interpret either // the given charcter nor the string (ligatures, modifiers, surrogates etc.) // Code from Azret Botash. function WideCharPos(const S: WideString; const Ch: WideChar; const Index: SizeInt): SizeInt; var P, R: PWideChar; begin if (Index > 0) and (Index <= Length(S)) then begin P := PWideChar(@S[Index]); R := StrScanW(P, Ch); if R <> nil then Result := R - P + Index else Result := 0; end else Result := 0; end; function WideComposeHangul(const Source: WideString): WideString; var Len: SizeInt; Ch, Last: WideChar; I: SizeInt; LIndex, VIndex, SIndex, TIndex: SizeInt; begin Result := ''; Len := Length(Source); if Len > 0 then begin Last := Source[1]; Result := Last; for I := 2 to Len do begin Ch := Source[I]; // 1. check to see if two current characters are L and V LIndex := Word(Last) - LBase; if (0 <= LIndex) and (LIndex < LCount) then begin VIndex := Word(Ch) - VBase; if (0 <= VIndex) and (VIndex < VCount) then begin // make syllable of form LV Last := WideChar((SBase + (LIndex * VCount + VIndex) * TCount)); Result[Length(Result)] := Last; // reset last Continue; // discard Ch end; end; // 2. check to see if two current characters are LV and T SIndex := Word(Last) - SBase; if (0 <= SIndex) and (SIndex < SCount) and ((SIndex mod TCount) = 0) then begin TIndex := Word(Ch) - TBase; if (0 <= TIndex) and (TIndex <= TCount) then begin // make syllable of form LVT Inc(Word(Last), TIndex); Result[Length(Result)] := Last; // reset last Continue; // discard Ch end; end; // if neither case was true, just add the character Last := Ch; Result := Result + Ch; end; end; end; // Returns canonical composition of characters in S. function WideCompose(const S: WideString): WideString; var Buffer: array of UCS4; LastInPos, InPos, OutPos, BufferSize, NbProcessed: SizeInt; Composite: UCS4; begin // Set an arbitrary length for the result. This is automatically done when checking // for hangul composition. Result := WideComposeHangul(S); if Result = '' then Exit; if Compositions = nil then LoadCompositionData; LastInPos := Length(Result); if LastInPos > MaxCompositionSize then SetLength(Buffer, MaxCompositionSize) else SetLength(Buffer, LastInPos); BufferSize := 0; InPos := 0; OutPos := 0; while (InPos < LastInPos) or (BufferSize > 0) do begin // fill buffer from input while BufferSize < Length(Buffer) do begin if InPos < LastInPos then begin Inc(InPos); Buffer[BufferSize] := UCS4(Result[InPos]); Inc(BufferSize); end else SetLength(Buffer, BufferSize); end; if Length(Buffer) = 0 then Break; NbProcessed := UnicodeCompose(Buffer, Composite); if NbProcessed = 0 then Break; if BufferSize > NbProcessed then Move(Buffer[NbProcessed], Buffer[0], (BufferSize - NbProcessed) * SizeOf(UCS4)); Dec(BufferSize, NbProcessed); Inc(OutPos); Result[OutPos] := UCS2(Composite); end; // since we have likely shortened the source string we have to set the correct length on exit SetLength(Result, OutPos); end; procedure FixCanonical(var S: WideString); // Examines S and reorders all combining marks in the string so that they are in canonical order. var I: SizeInt; Temp: WideChar; CurrentClass, LastClass: Cardinal; begin I := Length(S); if I > 1 then begin CurrentClass := CanonicalCombiningClass(UCS4(S[I])); repeat Dec(I); LastClass := CurrentClass; CurrentClass := CanonicalCombiningClass(UCS4(S[I])); // A swap is presumed to be rare (and a double-swap very rare), // so don't worry about efficiency here. if (CurrentClass > LastClass) and (LastClass > 0) then begin // swap characters Temp := S[I]; S[I] := S[I + 1]; S[I + 1] := Temp; // if not at end, backup (one further, to compensate for loop) if I < Length(S) - 1 then Inc(I, 2); // reset type, since we swapped. CurrentClass := CanonicalCombiningClass(UCS4(S[I])); end; until I = 1; end; end; procedure GetDecompositions(Compatible: Boolean; Code: UCS4; var Buffer: TUCS4Array); // helper function to recursively decompose a code point var Decomp: TUCS4Array; I: SizeInt; begin Decomp := UnicodeDecompose(Code, Compatible); if Assigned(Decomp) then begin for I := 0 to High(Decomp) do GetDecompositions(Compatible, Decomp[I], Buffer); end else // if no decomp, append begin I := Length(Buffer); SetLength(Buffer, I + 1); Buffer[I] := Code; end; end; function WideDecompose(const S: WideString; Compatible: Boolean): WideString; // returns a string with all characters of S but decomposed, e.g. is returned as E^ etc. var I, J: SizeInt; Decomp: TUCS4Array; begin Result := ''; Decomp := nil; // iterate through each source code point for I := 1 to Length(S) do begin Decomp := nil; GetDecompositions(Compatible, UCS4(S[I]), Decomp); if Decomp = nil then Result := Result + S[I] else for J := 0 to High(Decomp) do Result := Result + WideChar(Decomp[J]); end; // combining marks must be sorted according to their canonical combining class FixCanonical(Result); end; function WideNormalize(const S: WideString; Form: TNormalizationForm): WideString; var Temp: WideString; Compatible: Boolean; begin Result := S; if Form = nfNone then Exit; // No normalization needed. Compatible := Form in [nfKC, nfKD]; if Form in [nfD, nfKD] then Result := WideDecompose(S, Compatible) else begin Temp := WideDecompose(S, Compatible); Result := WideCompose(Temp); end; end; function WideSameText(const Str1, Str2: WideString): Boolean; // Compares both strings case-insensitively and returns True if both are equal, otherwise False is returned. begin Result := Length(Str1) = Length(Str2); if Result then Result := StrICompW(PWideChar(Str1), PWideChar(Str2)) = 0; end; //----------------- general purpose case mapping --------------------------------------------------- function WideCaseConvert(C: WideChar; CaseType: TCaseType): WideString; var I, RPos: SizeInt; Mapping: TUCS4Array; begin SetLength(Mapping, 0); if not CaseLookup(UCS4(C), CaseType, Mapping) then Result := C else begin SetLength(Result, 2 * Length(Mapping)); RPos := 1; for I := Low(Mapping) to High(Mapping) do UTF16SetNextChar(Result, RPos, Mapping[I]); if RPos > 0 then SetLength(Result, RPos - 1) else raise EJclUnexpectedEOSequenceError.Create; end; end; function WideCaseConvert(const S: WideString; CaseType: TCaseType): WideString; var SLen, RLen, SPos, RPos, K, MapLen: SizeInt; Code: UCS4; Mapping: TUCS4Array; begin SetLength(Mapping, 0); SLen := Length(S); RLen := SLen; SetLength(Result, RLen); SPos := 1; RPos := 1; while (SPos > 0) and (SPos <= SLen) do begin Code := UTF16GetNextChar(S, SPos); if SPos = -1 then raise EJclUnexpectedEOSequenceError.Create; if CaseLookup(Code, CaseType, Mapping) then begin MapLen:= Length(Mapping); if MapLen = 1 then Code := Mapping[0]; end else MapLen := 1; if MapLen = 1 then begin if not UTF16SetNextChar(Result, RPos, Code) then begin Inc(RLen, SLen); SetLength(Result, RLen); UTF16SetNextChar(Result, RPos, Code); end; end else begin for K := Low(Mapping) to High(Mapping) do if not UTF16SetNextChar(Result, RPos, Code) then begin Inc(RLen, SLen); SetLength(Result, RLen); UTF16SetNextChar(Result, RPos, Code); end; end; end; if RPos > 0 then SetLength(Result, RPos - 1) else raise EJclUnexpectedEOSequenceError.Create; end; // Note that most of the assigned code points don't have a case mapping and are therefore // returned as they are. Other code points, however, might be converted into several characters // like the german (eszett) whose upper case mapping is SS. function WideCaseFolding(C: WideChar): WideString; // Special case folding function to map a string to either its lower case or // to special cases. This can be used for case-insensitive comparation. begin Result:= WideCaseConvert(C, ctFold); end; function WideCaseFolding(const S: WideString): WideString; begin Result:= WideCaseConvert(S, ctFold); end; function WideLowerCase(C: WideChar): WideString; begin Result:= WideCaseConvert(C, ctLower); end; function WideLowerCase(const S: WideString): WideString; begin Result:= WideCaseConvert(S, ctLower); end; function WideTitleCase(C: WideChar): WideString; begin Result:= WideCaseConvert(C, ctTitle); end; function WideTitleCase(const S: WideString): WideString; begin Result:= WideCaseConvert(S, ctTitle); end; function WideUpperCase(C: WideChar): WideString; begin Result:= WideCaseConvert(C, ctUpper); end; function WideUpperCase(const S: WideString): WideString; begin Result:= WideCaseConvert(S, ctUpper); end; //----------------- character test routines -------------------------------------------------------- function UnicodeIsAlpha(C: UCS4): Boolean; // Is the character alphabetic? begin Result := CategoryLookup(C, ClassLetter); end; function UnicodeIsDigit(C: UCS4): Boolean; // Is the character a digit? begin Result := CategoryLookup(C, [ccNumberDecimalDigit]); end; function UnicodeIsAlphaNum(C: UCS4): Boolean; // Is the character alphabetic or a number? begin Result := CategoryLookup(C, ClassLetter + [ccNumberDecimalDigit]); end; function UnicodeIsCased(C: UCS4): Boolean; // Is the character a "cased" character, i.e. either lower case, title case or upper case begin Result := CategoryLookup(C, [ccLetterLowercase, ccLetterTitleCase, ccLetterUppercase]); end; function UnicodeIsControl(C: UCS4): Boolean; // Is the character a control character? begin Result := CategoryLookup(C, [ccOtherControl, ccOtherFormat]); end; function UnicodeIsSpace(C: UCS4): Boolean; // Is the character a spacing character? begin Result := CategoryLookup(C, ClassSpace); end; function UnicodeIsWhiteSpace(C: UCS4): Boolean; // Is the character a white space character (same as UnicodeIsSpace plus // tabulator, new line etc.)? begin Result := CategoryLookup(C, ClassSpace + [ccWhiteSpace, ccSegmentSeparator]); end; function UnicodeIsBlank(C: UCS4): Boolean; // Is the character a space separator? begin Result := CategoryLookup(C, [ccSeparatorSpace]); end; function UnicodeIsPunctuation(C: UCS4): Boolean; // Is the character a punctuation mark? begin Result := CategoryLookup(C, ClassPunctuation); end; function UnicodeIsGraph(C: UCS4): Boolean; // Is the character graphical? begin Result := CategoryLookup(C, ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol); end; function UnicodeIsPrintable(C: UCS4): Boolean; // Is the character printable? begin Result := CategoryLookup(C, ClassMark + ClassNumber + ClassLetter + ClassPunctuation + ClassSymbol + [ccSeparatorSpace]); end; function UnicodeIsUpper(C: UCS4): Boolean; // Is the character already upper case? begin Result := CategoryLookup(C, [ccLetterUppercase]); end; function UnicodeIsLower(C: UCS4): Boolean; // Is the character already lower case? begin Result := CategoryLookup(C, [ccLetterLowercase]); end; function UnicodeIsTitle(C: UCS4): Boolean; // Is the character already title case? begin Result := CategoryLookup(C, [ccLetterTitlecase]); end; function UnicodeIsHexDigit(C: UCS4): Boolean; // Is the character a hex digit? begin Result := CategoryLookup(C, [ccHexDigit]); end; function UnicodeIsIsoControl(C: UCS4): Boolean; // Is the character a C0 control character (< 32)? begin Result := CategoryLookup(C, [ccOtherControl]); end; function UnicodeIsFormatControl(C: UCS4): Boolean; // Is the character a format control character? begin Result := CategoryLookup(C, [ccOtherFormat]); end; function UnicodeIsSymbol(C: UCS4): Boolean; // Is the character a symbol? begin Result := CategoryLookup(C, ClassSymbol); end; function UnicodeIsNumber(C: UCS4): Boolean; // Is the character a number or digit? begin Result := CategoryLookup(C, ClassNumber); end; function UnicodeIsNonSpacing(C: UCS4): Boolean; // Is the character non-spacing? begin Result := CategoryLookup(C, [ccMarkNonSpacing]); end; function UnicodeIsOpenPunctuation(C: UCS4): Boolean; // Is the character an open/left punctuation (e.g. '[')? begin Result := CategoryLookup(C, [ccPunctuationOpen]); end; function UnicodeIsClosePunctuation(C: UCS4): Boolean; // Is the character an close/right punctuation (e.g. ']')? begin Result := CategoryLookup(C, [ccPunctuationClose]); end; function UnicodeIsInitialPunctuation(C: UCS4): Boolean; // Is the character an initial punctuation (e.g. U+2018 LEFT SINGLE QUOTATION MARK)? begin Result := CategoryLookup(C, [ccPunctuationInitialQuote]); end; function UnicodeIsFinalPunctuation(C: UCS4): Boolean; // Is the character a final punctuation (e.g. U+2019 RIGHT SINGLE QUOTATION MARK)? begin Result := CategoryLookup(C, [ccPunctuationFinalQuote]); end; function UnicodeIsComposed(C: UCS4): Boolean; // Can the character be decomposed into a set of other characters? begin Result := CategoryLookup(C, [ccComposed]); end; function UnicodeIsQuotationMark(C: UCS4): Boolean; // Is the character one of the many quotation marks? begin Result := CategoryLookup(C, [ccQuotationMark]); end; function UnicodeIsSymmetric(C: UCS4): Boolean; // Is the character one that has an opposite form (i.e. <>)? begin Result := CategoryLookup(C, [ccSymmetric]); end; function UnicodeIsMirroring(C: UCS4): Boolean; // Is the character mirroring (superset of symmetric)? begin Result := CategoryLookup(C, [ccMirroring]); end; function UnicodeIsNonBreaking(C: UCS4): Boolean; // Is the character non-breaking (i.e. non-breaking space)? begin Result := CategoryLookup(C, [ccNonBreaking]); end; function UnicodeIsRightToLeft(C: UCS4): Boolean; // Does the character have strong right-to-left directionality (i.e. Arabic letters)? begin Result := CategoryLookup(C, [ccRightToLeft]); end; function UnicodeIsLeftToRight(C: UCS4): Boolean; // Does the character have strong left-to-right directionality (i.e. Latin letters)? begin Result := CategoryLookup(C, [ccLeftToRight]); end; function UnicodeIsStrong(C: UCS4): Boolean; // Does the character have strong directionality? begin Result := CategoryLookup(C, [ccLeftToRight, ccRightToLeft]); end; function UnicodeIsWeak(C: UCS4): Boolean; // Does the character have weak directionality (i.e. numbers)? begin Result := CategoryLookup(C, ClassEuropeanNumber + [ccArabicNumber, ccCommonNumberSeparator]); end; function UnicodeIsNeutral(C: UCS4): Boolean; // Does the character have neutral directionality (i.e. whitespace)? begin Result := CategoryLookup(C, [ccSeparatorParagraph, ccSegmentSeparator, ccWhiteSpace, ccOtherNeutrals]); end; function UnicodeIsSeparator(C: UCS4): Boolean; // Is the character a block or segment separator? begin Result := CategoryLookup(C, [ccSeparatorParagraph, ccSegmentSeparator]); end; function UnicodeIsMark(C: UCS4): Boolean; // Is the character a mark of some kind? begin Result := CategoryLookup(C, ClassMark); end; function UnicodeIsModifier(C: UCS4): Boolean; // Is the character a letter modifier? begin Result := CategoryLookup(C, [ccLetterModifier]); end; function UnicodeIsLetterNumber(C: UCS4): Boolean; // Is the character a number represented by a letter? begin Result := CategoryLookup(C, [ccNumberLetter]); end; function UnicodeIsConnectionPunctuation(C: UCS4): Boolean; // Is the character connecting punctuation? begin Result := CategoryLookup(C, [ccPunctuationConnector]); end; function UnicodeIsDash(C: UCS4): Boolean; // Is the character a dash punctuation? begin Result := CategoryLookup(C, [ccPunctuationDash]); end; function UnicodeIsMath(C: UCS4): Boolean; // Is the character a math character? begin Result := CategoryLookup(C, [ccSymbolMath]); end; function UnicodeIsCurrency(C: UCS4): Boolean; // Is the character a currency character? begin Result := CategoryLookup(C, [ccSymbolCurrency]); end; function UnicodeIsModifierSymbol(C: UCS4): Boolean; // Is the character a modifier symbol? begin Result := CategoryLookup(C, [ccSymbolModifier]); end; function UnicodeIsNonSpacingMark(C: UCS4): Boolean; // Is the character a non-spacing mark? begin Result := CategoryLookup(C, [ccMarkNonSpacing]); end; function UnicodeIsSpacingMark(C: UCS4): Boolean; // Is the character a spacing mark? begin Result := CategoryLookup(C, [ccMarkSpacingCombining]); end; function UnicodeIsEnclosing(C: UCS4): Boolean; // Is the character enclosing (i.e. enclosing box)? begin Result := CategoryLookup(C, [ccMarkEnclosing]); end; function UnicodeIsPrivate(C: UCS4): Boolean; // Is the character from the Private Use Area? begin Result := CategoryLookup(C, [ccOtherPrivate]); end; function UnicodeIsSurrogate(C: UCS4): Boolean; // Is the character one of the surrogate codes? begin Result := CategoryLookup(C, [ccOtherSurrogate]); end; function UnicodeIsLineSeparator(C: UCS4): Boolean; // Is the character a line separator? begin Result := CategoryLookup(C, [ccSeparatorLine]); end; function UnicodeIsParagraphSeparator(C: UCS4): Boolean; // Is th character a paragraph separator; begin Result := CategoryLookup(C, [ccSeparatorParagraph]); end; function UnicodeIsIdentifierStart(C: UCS4): Boolean; // Can the character begin an identifier? begin Result := CategoryLookup(C, ClassLetter + [ccNumberLetter]); end; function UnicodeIsIdentifierPart(C: UCS4): Boolean; // Can the character appear in an identifier? begin Result := CategoryLookup(C, ClassLetter + [ccNumberLetter, ccMarkNonSpacing, ccMarkSpacingCombining, ccNumberDecimalDigit, ccPunctuationConnector, ccOtherFormat]); end; function UnicodeIsDefined(C: UCS4): Boolean; // Is the character defined (appears in one of the data files)? begin Result := CategoryLookup(C, [ccAssigned]); end; function UnicodeIsUndefined(C: UCS4): Boolean; // Is the character undefined (not assigned in the Unicode database)? begin Result := not CategoryLookup(C, [ccAssigned]); end; function UnicodeIsHan(C: UCS4): Boolean; // Is the character a Han ideograph? begin Result := ((C >= $4E00) and (C <= $9FFF)) or ((C >= $F900) and (C <= $FAFF)); end; function UnicodeIsHangul(C: UCS4): Boolean; // Is the character a pre-composed Hangul syllable? begin Result := (C >= $AC00) and (C <= $D7FF); end; // I need to fix a problem (introduced by MS) here. The first parameter can be a pointer // (and is so defined) or can be a normal DWORD, depending on the dwFlags parameter. // As usual, lpSrc has been translated to a var parameter. But this does not work in // our case, hence the redeclaration of the function with a pointer as first parameter. function TranslateCharsetInfoEx(lpSrc: SizeInt; out lpCs: TCharsetInfo; dwFlags: DWORD): BOOL; stdcall; external 'gdi32.dll' name 'TranslateCharsetInfo'; function GetCharSetFromLocale(Language: LCID; out FontCharSet: Byte): Boolean; const TCI_SRCLOCALE = $1000; var CP: Word; CSI: TCharsetInfo; begin if GetWindowsVersion in [wvUnknown, wvWin95, wvWin95OSR2, wvWin98, wvWin98SE, wvWinME, wvWinNT31, wvWinNT35, wvWinNT351, wvWinNT4] then begin // these versions of Windows don't support TCI_SRCLOCALE CP := CodePageFromLocale(Language); if CP = 0 then RaiseLastOSError; Result := TranslateCharsetInfoEx(CP, CSI, TCI_SRCCODEPAGE); end else Result := TranslateCharsetInfoEx(Language, CSI, TCI_SRCLOCALE); if Result then FontCharset := CSI.ciCharset; end; function CharSetFromLocale(Language: LCID): Byte; begin if not GetCharSetFromLocale(Language, Result) then RaiseLastOSError; end; function CodePageFromLocale(Language: LCID): Word; // determines the code page for a given locale var Buf: array [0..6] of Char; begin GetLocaleInfo(Language, LOCALE_IDefaultAnsiCodePage, Buf, 6); Result := StrToIntDef(Buf, GetACP); end; function KeyboardCodePage: Word; begin Result := CodePageFromLocale(GetKeyboardLayout(0) and $FFFF); end; function KeyUnicode(C: Char): WideChar; // converts the given character (as it comes with a WM_CHAR message) into its // corresponding Unicode character depending on the active keyboard layout begin MultiByteToWideChar(KeyboardCodePage, MB_USEGLYPHCHARS, @C, 1, @Result, 1); end; function CodeBlockRange(const CB: TUnicodeBlock): TUnicodeBlockRange; // http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt begin Result := UnicodeBlockData[CB].Range; end; // Names taken from http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt function CodeBlockName(const CB: TUnicodeBlock): string; begin Result := UnicodeBlockData[CB].Name; end; // Returns an ID for the Unicode code block to which C belongs. // If C does not belong to any of the defined blocks then ubUndefined is returned. // Note: the code blocks listed here are based on Unicode Version 5.0.0 function CodeBlockFromChar(const C: UCS4): TUnicodeBlock; // http://www.unicode.org/Public/5.0.0/ucd/Blocks.txt var L, H, I: TUnicodeBlock; begin Result := ubUndefined; L := ubBasicLatin; H := High(TUnicodeBlock); while L <= H do begin I := TUnicodeBlock((Cardinal(L) + Cardinal(H)) shr 1); if (C >= UnicodeBlockData[I].Range.RangeStart) and (C <= UnicodeBlockData[I].Range.RangeEnd) then begin Result := I; Break; end else if C < UnicodeBlockData[I].Range.RangeStart then begin Dec(I); H := I; end else begin Inc(I); L := I; end; end; end; function CompareTextWin95(const W1, W2: WideString; Locale: LCID): SizeInt; // special comparation function for Win9x since there's no system defined // comparation function, returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2 var S1, S2: AnsiString; CP: Word; L1, L2: SizeInt; begin L1 := Length(W1); L2 := Length(W2); SetLength(S1, L1); SetLength(S2, L2); CP := CodePageFromLocale(Locale); WideCharToMultiByte(CP, 0, PWideChar(W1), L1, PAnsiChar(S1), L1, nil, nil); WideCharToMultiByte(CP, 0, PWideChar(W2), L2, PAnsiChar(S2), L2, nil, nil); Result := CompareStringA(Locale, NORM_IGNORECASE, PAnsiChar(S1), Length(S1), PAnsiChar(S2), Length(S2)) - 2; end; function CompareTextWinNT(const W1, W2: WideString; Locale: LCID): SizeInt; // Wrapper function for WinNT since there's no system defined comparation function // in Win9x and we need a central comparation function for TWideStringList. // Returns -1 if W1 < W2, 0 if W1 = W2 or 1 if W1 > W2 begin Result := CompareStringW(Locale, NORM_IGNORECASE, PWideChar(W1), Length(W1), PWideChar(W2), Length(W2)) - 2; end; function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString; var InputLength, OutputLength: SizeInt; begin InputLength := Length(S); OutputLength := MultiByteToWideChar(CodePage, 0, PAnsiChar(S), InputLength, nil, 0); SetLength(Result, OutputLength); MultiByteToWideChar(CodePage, 0, PAnsiChar(S), InputLength, PWideChar(Result), OutputLength); end; function WideStringToStringEx(const WS: WideString; CodePage: Word): AnsiString; var InputLength, OutputLength: SizeInt; begin InputLength := Length(WS); OutputLength := WideCharToMultiByte(CodePage, 0, PWideChar(WS), InputLength, nil, 0, nil, nil); SetLength(Result, OutputLength); WideCharToMultiByte(CodePage, 0, PWideChar(WS), InputLength, PAnsiChar(Result), OutputLength, nil, nil); end; function TranslateString(const S: AnsiString; CP1, CP2: Word): AnsiString; begin Result:= WideStringToStringEx(StringToWideStringEx(S, CP1), CP2); end; procedure PrepareUnicodeData; // Prepares structures which are globally needed. begin LoadInProgress := TJclCriticalSection.Create; if (Win32Platform and VER_PLATFORM_WIN32_NT) <> 0 then @WideCompareText := @CompareTextWinNT else @WideCompareText := @CompareTextWin95; end; procedure FreeUnicodeData; // Frees all data which has been allocated and which is not automatically freed by Delphi. begin FreeAndNil(LoadInProgress); end; initialization PrepareUnicodeData; {$IFDEF UNITVERSIONING} RegisterUnitVersion(HInstance, UnitVersioning); {$ENDIF UNITVERSIONING} finalization {$IFDEF UNITVERSIONING} UnregisterUnitVersion(HInstance); {$ENDIF UNITVERSIONING} FreeUnicodeData; end.