{********************************************************************} { } { Developer Express Visual Component Library } { ExpressSpellChecker } { } { Copyright (c) 1998-2009 Developer Express Inc. } { ALL RIGHTS RESERVED } { } { The entire contents of this file is protected by U.S. and } { International Copyright Laws. Unauthorized reproduction, } { reverse-engineering, and distribution of all or any portion of } { the code contained in this file is strictly prohibited and may } { result in severe civil and criminal penalties and will be } { prosecuted to the maximum extent possible under the law. } { } { RESTRICTIONS } { } { THIS SOURCE CODE AND ALL RESULTING INTERMEDIATE FILES } { (DCU, OBJ, DLL, ETC.) ARE CONFIDENTIAL AND PROPRIETARY TRADE } { SECRETS OF DEVELOPER EXPRESS INC. THE REGISTERED DEVELOPER IS } { LICENSED TO DISTRIBUTE THE EXPRESSSPELLCHECKER AND ALL } { ACCOMPANYING VCL CONTROLS AS PART OF AN EXECUTABLE PROGRAM ONLY. } { } { THE SOURCE CODE CONTAINED WITHIN THIS FILE AND ALL RELATED } { FILES OR ANY PORTION OF ITS CONTENTS SHALL AT NO TIME BE } { COPIED, TRANSFERRED, SOLD, DISTRIBUTED, OR OTHERWISE MADE } { AVAILABLE TO OTHER INDIVIDUALS WITHOUT EXPRESS WRITTEN CONSENT } { AND PERMISSION FROM DEVELOPER EXPRESS INC. } { } { CONSULT THE END USER LICENSE AGREEMENT FOR INFORMATION ON } { ADDITIONAL RESTRICTIONS. } { } {********************************************************************} unit dxHunspellDictionary; {$I cxVer.inc} interface uses SysUtils, Classes, dxHunspellWords, dxHunspellAffixes, dxSpellChecker; type TdxSpellInfoItem = (siiCompound, siiForbidden, siiInitialCapitalization); TdxSpellInfo = set of TdxSpellInfoItem; { TdxHunspellDictionary } TdxHunspellDictionary = class(TdxCustomSpellCheckerDictionary) private FDictionaryPath: TFileName; FGrammarAlphabet: WideString; FGrammarPath: TFileName; FAffixManager: TdxHunspellAffixManager; FIsUTF8: Boolean; FComplexPrefixes: Boolean; FWordBaseManager: TdxHunspellWordBaseManager; function CheckWord(const AWord: PAnsiChar; var AInfo: TdxSpellInfo): TdxHunspellWordItem; function MakeAllSmall(AWord: PAnsiChar; u: PWideChar; AWordLength: Integer): Integer; function MakeCapitalized(AWord: PAnsiChar; u: PWideChar; AWordLength: Integer): Integer; function SpellSharps(base, pos: PAnsiChar; n, repnum: Integer; tmp: PAnsiChar; var AInfo: TdxSpellInfo): TdxHunspellWordItem; function IsKeepCase(AWordItem: TdxHunspellWordItem): Boolean; function sharps_u8_l1(dest, source: PAnsiChar): PAnsiChar; function CleanWord(ADest: PAnsiChar; const ASource: PAnsiChar; ADestUTF: PWideChar; out ACharacterCount: Integer; out ACapitalizationType: TdxCapitalizationType; out AAbbreviationCount: Integer): Integer; //setters procedure SetDictionaryPath(const AValue: TFileName); procedure SetGrammarPath(const AValue: TFileName); protected function DoLoad: Boolean; override; function DoUnload: Boolean; override; procedure FreeContent; override; function GetActiveAlphabet: WideString; override; function GetDisplayName: string; override; function Spell(const AWord: PAnsiChar; var AInfo: TdxSpellInfo): Boolean; property AffixManager: TdxHunspellAffixManager read FAffixManager; property IsUTF8: Boolean read FIsUTF8; property WordBaseManager: TdxHunspellWordBaseManager read FWordBaseManager; public procedure Assign(Source: TPersistent); override; function HasWord(const AWord: WideString): Boolean; override; published property Alphabet; property DictionaryPath: TFileName read FDictionaryPath write SetDictionaryPath; property Enabled; property GrammarPath: TFileName read FGrammarPath write SetGrammarPath; property Language; property OnLoaded; property OnLoading; end; implementation uses dxCore, cxClasses, dxSpellCheckerStrs, dxHunspellUtils, dxHunspellTypes; const MAXSHARPS = 5; { TdxHunspellDictionary } procedure TdxHunspellDictionary.Assign(Source: TPersistent); begin inherited Assign(Source); if Source is TdxHunspellDictionary then begin DictionaryPath := TdxHunspellDictionary(Source).DictionaryPath; GrammarPath := TdxHunspellDictionary(Source).GrammarPath; end; end; function TdxHunspellDictionary.HasWord(const AWord: WideString): Boolean; var AInfo: TdxSpellInfo; Ansi: AnsiString; begin Result := (WordBaseManager <> nil) and (AffixManager <> nil); if Result then begin Ansi := dxWideStringToAnsiString(AWord, CodePage); AInfo := []; Result := Spell(PAnsiChar(Ansi), AInfo); end; end; function TdxHunspellDictionary.DoLoad: Boolean; begin FIsUTF8 := False; FWordBaseManager := TdxHunspellWordBaseManager.Create(Language); try FWordBaseManager.Load(DictionaryPath, GrammarPath); FAffixManager := TdxHunspellAffixManager.Create(GrammarPath, WordBaseManager); except FreeContent; raise; end; Language := AffixManager.Language; FGrammarAlphabet := dxAnsiStringToWideString(AffixManager.TryChars, CodePage); FComplexPrefixes := AffixManager.ComplexPrefixes; CodePage := AffixManager.CodePage; Result := True; end; function TdxHunspellDictionary.DoUnload: Boolean; begin FreeContent; Result := True; end; procedure TdxHunspellDictionary.FreeContent; begin FreeAndNil(FAffixManager); FreeAndNil(FWordBaseManager); end; function TdxHunspellDictionary.GetActiveAlphabet: WideString; begin if Alphabet <> '' then Result := Alphabet else Result := FGrammarAlphabet; end; function TdxHunspellDictionary.GetDisplayName: string; var AFileName: TFileName; begin Result := inherited GetDisplayName; AFileName := SysUtils.ExtractFileName(DictionaryPath); if AFileName <> '' then Result := Format('%s (%s)', [Result, AFileName]); end; function TdxHunspellDictionary.Spell(const AWord: PAnsiChar; var AInfo: TdxSpellInfo): Boolean; type TNumberSeparatorState = (nssNone, nssNumber, nssSeparatedNumber); var AWordItem: TdxHunspellWordItem; ACleanedWord, AWordForCompound: array [0..MAXWORDUTF8LEN - 1] of Char; AUnicodeWord, AWordForSharps: array [0..MaxWordLength - 1] of WideChar; AWordLen, AWordForCompoundLength, AAbbreviationCount, AWordLength, I, AWordBreakItemLength: Integer; ALocalInfo: TdxSpellInfo; ACapitalizationType: TdxCapitalizationType; ASeparatorState: TNumberSeparatorState; FinishCase: Boolean; AApostropheCursor, AWordBrakeItemCursor, AWordBreakItem: PAnsiChar; ATempChar: Char; ABreakTable: TdxBreakTable; begin AWordItem := nil; AWordLen := StrLen(AWord); AWordForCompoundLength := 0; if FIsUTF8 then begin //TODO: end else if AWordLen >= MaxWordLength then begin Result := False; Exit; end; ACapitalizationType := ctNoCapital; AAbbreviationCount := 0; // input conversion if AffixManager.InputReplaceTable.conv(AWord, AWordForCompound) <> 0 then AWordLength := CleanWord(ACleanedWord, AWordForCompound, AUnicodeWord, AWordLen, ACapitalizationType, AAbbreviationCount) else AWordLength := CleanWord(ACleanedWord, AWord, AUnicodeWord, AWordLen, ACapitalizationType, AAbbreviationCount); if AWordLength = 0 then begin Result := True; Exit; end; // allow numbers with dots, dashes and commas (but forbid double separators: "..", "--" etc.) ASeparatorState := nssNone; for I := 0 to AWordLength - 1 do begin if (ACleanedWord[I] <= '9') and (ACleanedWord[I] >= '0') then ASeparatorState := nssNumber else if (ACleanedWord[I] = ',') or (ACleanedWord[I] = '.') or(ACleanedWord[I] = '-') then //TODO: refactor begin if (ASeparatorState = nssSeparatedNumber) or (I = 0) then Break; ASeparatorState := nssSeparatedNumber; end else Break; end; if (I = AWordLength) and (ASeparatorState = nssNumber) then begin Result := True; Exit; end; case ACapitalizationType of ctMixedCapital, ctFirstMixedCapital, ctNoCapital: begin AWordItem := CheckWord(ACleanedWord, AInfo); if (AAbbreviationCount <> 0) and (AWordItem = nil) then begin Move(ACleanedWord, AWordForCompound, AWordLength); (AWordForCompound + AWordLength)^ := '.'; (AWordForCompound + AWordLength + 1)^ := #0; AWordItem := CheckWord(AWordForCompound, AInfo); end; end; ctAllCapital, ctFirstCapital: begin if ACapitalizationType = ctAllCapital then begin AWordItem := CheckWord(ACleanedWord, AInfo); if AWordItem <> nil then begin Result := True; Exit; end; if AAbbreviationCount <> 0 then begin Move(ACleanedWord, AWordForCompound, AWordLength); (AWordForCompound + AWordLength)^ := '.'; (AWordForCompound + AWordLength + 1)^ := #0; AWordItem := CheckWord(AWordForCompound, AInfo); if AWordItem <> nil then begin Result := True; Exit; end; end; if StrScan(ACleanedWord, '''') <> nil then begin MakeAllSmall(ACleanedWord, AUnicodeWord, AWordLen); AApostropheCursor := StrScan(ACleanedWord, ''''); if FIsUTF8 then begin //TODO: end else begin MakeCapitalized(AApostropheCursor + 1, AUnicodeWord, AWordLen); AWordItem := CheckWord(ACleanedWord, AInfo); if AWordItem <> nil then begin Result := True; Exit; end; end; MakeCapitalized(ACleanedWord, AUnicodeWord, AWordLen); AWordItem := CheckWord(ACleanedWord, AInfo); if AWordItem <> nil then begin Result := True; Exit; end; end; if (AffixManager.Checksharps <> 0) and (StrPos(ACleanedWord, 'SS') <> nil) then begin AWordLength := MakeAllSmall(ACleanedWord, AUnicodeWord, AWordLen); Move(ACleanedWord, AWordForCompound, AWordLength + 1); AWordItem := SpellSharps(AWordForCompound, AWordForCompound, 0, 0, @AWordForSharps, AInfo); if AWordItem = nil then begin AWordForCompoundLength := MakeCapitalized(ACleanedWord, AUnicodeWord, AWordLen); AWordItem := SpellSharps(ACleanedWord, ACleanedWord, 0, 0, @AWordForSharps, AInfo); end; if (AAbbreviationCount <> 0) and (AWordItem = nil) then begin (AWordForCompound + AWordLength)^ := '.'; (AWordForCompound + AWordLength + 1)^ := #0; AWordItem := SpellSharps(AWordForCompound, AWordForCompound, 0, 0, @AWordForSharps, AInfo); if AWordItem = nil then begin Move(ACleanedWord, AWordForCompound, AWordForCompoundLength); (AWordForCompound + AWordForCompoundLength)^ := '.'; (AWordForCompound + AWordForCompoundLength + 1)^ := #0; AWordItem := SpellSharps(AWordForCompound, AWordForCompound, 0, 0, @AWordForSharps, AInfo); end; end; if AWordItem <> nil then begin Result := True; Exit; end; end; end; //INITCAP section AWordLength := MakeAllSmall(ACleanedWord, AUnicodeWord, AWordLen); Move(ACleanedWord, AWordForCompound, AWordLength + 1); AWordForCompoundLength := MakeCapitalized(ACleanedWord, AUnicodeWord, AWordLen); if ACapitalizationType = ctFirstCapital then Include(AInfo, siiInitialCapitalization); AWordItem := CheckWord(ACleanedWord, AInfo); if ACapitalizationType = ctFirstCapital then Exclude(AInfo, siiInitialCapitalization); FinishCase := False; if siiForbidden in AInfo then begin AWordItem := nil; FinishCase := True; end; if not FinishCase then begin if (AWordItem <> nil) and IsKeepCase(AWordItem) and (ACapitalizationType = ctAllCapital) then AWordItem := nil; if AWordItem <> nil then begin Result := True; Exit; end; AWordItem := CheckWord(AWordForCompound, AInfo); if (AAbbreviationCount <> 0) and (AWordItem = nil) then begin (AWordForCompound + AWordLength)^ := '.'; (AWordForCompound + AWordLength + 1)^ := #0; AWordItem := CheckWord(AWordForCompound, AInfo); if AWordItem = nil then begin Move(ACleanedWord, AWordForCompound, AWordForCompoundLength); (AWordForCompound + AWordForCompoundLength)^ := '.'; (AWordForCompound + AWordForCompoundLength + 1)^ := #0; if ACapitalizationType = ctFirstCapital then Include(AInfo, siiInitialCapitalization); AWordItem := CheckWord(AWordForCompound, AInfo); if ACapitalizationType = ctFirstCapital then Exclude(AInfo, siiInitialCapitalization); if (AWordItem <> nil) and IsKeepCase(AWordItem) and (ACapitalizationType = ctAllCapital) then AWordItem := nil; FinishCase := True; end; end; if not FinishCase and (AWordItem <> nil) and IsKeepCase(AWordItem) and ((ACapitalizationType = ctAllCapital) or not ((AffixManager.CheckSharps <> 0) and ((IsUTF8 and (StrPos(AWordForCompound, #$C3#$9F) <> nil)) or (not IsUTF8 and (StrScan(AWordForCompound, #$DF) <> nil))))) then AWordItem := nil; end; end; end; if AWordItem <> nil then begin Result := True; Exit; end; AWordLength := StrLen(ACleanedWord); ABreakTable := AffixManager.BreakTable; for I := 0 to ABreakTable.Count - 1 do begin AWordBreakItem := ABreakTable[I]; AWordBreakItemLength := StrLen(AWordBreakItem); if (AWordBreakItemLength = 1) or (AWordBreakItemLength > AWordLength) then Continue; ALocalInfo := []; if (AWordBreakItem^ = '^') and (StrLComp(ACleanedWord, AWordBreakItem + 1, AWordBreakItemLength - 1) = 0) and Spell(ACleanedWord + AWordBreakItemLength - 1, ALocalInfo) then begin Result := True; Exit; end; if ((AWordBreakItem + AWordBreakItemLength - 1)^ = '$') and (StrLComp(ACleanedWord + AWordLength - AWordBreakItemLength + 1, AWordBreakItem, AWordBreakItemLength - 1) = 0) then begin ATempChar := ACleanedWord[AWordLength - AWordBreakItemLength + 1]; ACleanedWord[AWordLength - AWordBreakItemLength + 1] := #0; ALocalInfo := []; if Spell(ACleanedWord, ALocalInfo) then begin Result := True; Exit; end; ACleanedWord[AWordLength - AWordBreakItemLength + 1] := ATempChar; end; end; // other patterns for I := 0 to ABreakTable.Count - 1 do begin AWordBreakItem := ABreakTable[I]; AWordBreakItemLength := StrLen(AWordBreakItem); AWordBrakeItemCursor := StrPos(ACleanedWord, AWordBreakItem); if (AWordBrakeItemCursor <> nil) and (AWordBrakeItemCursor > ACleanedWord) and (AWordBrakeItemCursor < ACleanedWord + AWordLength - AWordBreakItemLength) then begin ALocalInfo := []; if not Spell(AWordBrakeItemCursor + AWordBreakItemLength, ALocalInfo) then Continue; ATempChar := AWordBrakeItemCursor^; AWordBrakeItemCursor^ := #0; // examine 2 sides of the break point ALocalInfo := []; if Spell(ACleanedWord, ALocalInfo) then begin Result := True; Exit; end; AWordBrakeItemCursor^ := ATempChar; {TODO: // LANG_hu: spec. dash rule if (Language = LANG_hu) and (StrComp(wordbreakItem, '-') = 0) then begin r := (s + 1)^; (s + 1)^ := #0; if Spell(cw) then // check the first part with dash begin Result := True; Exit; end; (s + 1)^ := r; end; // end of LANG speficic region} end; end; Result := False; end; function TdxHunspellDictionary.CheckWord(const AWord: PAnsiChar; var AInfo: TdxSpellInfo): TdxHunspellWordItem; var AWordLength: Integer; ATempWord: array[0..MAXWORDUTF8LEN - 1] of ShortInt; AUnderTestWord, AIgnoredChars: PAnsiChar; ATakenSuffix: TdxSuffix; ATakenPrefix: TdxPrefix; begin AIgnoredChars := AffixManager.Ignore; if AIgnoredChars <> nil then begin StrCopy(@ATempWord, AWord); RemoveIgnoredChars(@ATempWord, AIgnoredChars); AUnderTestWord := @ATempWord; end else AUnderTestWord := AWord; if FComplexPrefixes then begin if AUnderTestWord <> @ATempWord then begin StrCopy(@ATempWord, AUnderTestWord); AUnderTestWord := @ATempWord; end; if IsUTF8 then //TODO: else StrReverse(@ATempWord); end; Result := WordBaseManager.Lookup(AUnderTestWord); // check forbidden and onlyincompound words if (Result <> nil) and Result.IsCompatibleWithFlag(AffixManager.ForbiddenWord) then begin Include(AInfo, siiForbidden); {TODO: // LANG_hu section: set dash information for suggestions if Language = LANG_hu then if (AffixManager.CompoundFlag <> 0) and Result.IsCompatibleWithFlag(AffixManager.CompoundFlag) then if info <> nil then Inc(info^, SPELL_COMPOUND); } Result := nil; Exit; end; // he = next not needaffix, onlyincompound homonym or onlyupcase word while (Result <> nil) and ((AffixManager.NeedAffix <> NullFlag) and Result.IsCompatibleWithFlag(AffixManager.NeedAffix) or (AffixManager.OnlyInCompound <> NullFlag) and Result.IsCompatibleWithFlag(AffixManager.OnlyInCompound) or (siiInitialCapitalization in AInfo) and Result.IsCompatibleWithFlag(ONLYUPCASEFLAG)) do Result := Result.NextHomonym; // check with affixes if (Result = nil) then begin // try stripping off affixes AWordLength := StrLen(AUnderTestWord); Result := AffixManager.AffixCheck(AUnderTestWord, AWordLength, ATakenPrefix, ATakenSuffix, 0); // check compound restriction and onlyupcase if (Result <> nil) and (((AffixManager.OnlyInCompound <> NullFlag) and Result.IsCompatibleWithFlag(AffixManager.OnlyInCompound)) or ((siiInitialCapitalization in AInfo) and Result.IsCompatibleWithFlag(ONLYUPCASEFLAG))) then Result := nil; if Result <> nil then begin if Result.IsCompatibleWithFlag(AffixManager.ForbiddenWord) then begin Include(AInfo, siiForbidden); Result := nil; Exit; end; end else // try check compound word if AffixManager.IsCompoundWordsAvailable then begin Result := AffixManager.CompoundCheck(AUnderTestWord, 0, 0, 100, 0, nil, False, False); {TODO: // LANG_hu section: `moving rule' with last dash if (Result = nil) and (Language = LANG_hu) and ((AWord + AWordLength - 1)^ = '-') then begin dup := StrNew(AWord); if dup = nil then begin Result := nil; Exit; end; (dup + AWordLength - 1)^ := #0; Result := AffixManager.CompoundCheck(dup, -5, 0, 100, 0, nil, True, False); StrDispose(dup); end; // end of LANG speficic region} if Result <> nil then Include(AInfo, siiCompound); end; end; end; function TdxHunspellDictionary.MakeAllSmall(AWord: PAnsiChar; u: PWideChar; AWordLength: Integer): Integer; begin Result := AWordLength; if IsUTF8 then begin //TODO: Result := StrLen(AWord); end else dxHunspellUtils.MakeAllSmall(AWord, CodePage, Language); end; function TdxHunspellDictionary.MakeCapitalized(AWord: PAnsiChar; u: PWideChar; AWordLength: Integer): Integer; begin Result := AWordLength; dxHunspellUtils.MakeCapitalized(AWord, CodePage, Language); end; // recursive search for right ss - sharp s permutations function TdxHunspellDictionary.SpellSharps(base, pos: PAnsiChar; n, repnum: Integer; tmp: PAnsiChar; var AInfo: TdxSpellInfo): TdxHunspellWordItem; begin Result := nil; pos := StrPos(pos, 'ss'); if (pos <> nil) and (n < MAXSHARPS) then begin pos^ := #$C3; (pos + 1)^ := #$9F; Result := SpellSharps(base, pos + 2, n + 1, repnum + 1, tmp, AInfo); if Result = nil then begin pos^ := 's'; (pos + 1)^ := 's'; Result := SpellSharps(base, pos + 2, n + 1, repnum, tmp, AInfo); end; end else if repnum > 0 then begin if IsUTF8 then Result := CheckWord(base, AInfo) else Result := CheckWord(sharps_u8_l1(tmp, base), AInfo); end; end; function TdxHunspellDictionary.IsKeepCase(AWordItem: TdxHunspellWordItem): Boolean; begin Result := (AffixManager.KeepCase <> 0) and AWordItem.IsCompatibleWithFlag(AffixManager.KeepCase); end; // convert UTF-8 sharp S codes to latin 1 function TdxHunspellDictionary.sharps_u8_l1(dest, source: PAnsiChar): PAnsiChar; var p: PAnsiChar; begin p := dest; p^ := source^; Inc(p); Inc(source); while (source - 1)^ <> #0 do begin p^ := source^; if source^ = #$9F then begin Dec(p); p^ := #$DF; end; Inc(p); Inc(source); end; Result := dest; end; function TdxHunspellDictionary.CleanWord(ADest: PAnsiChar; const ASource: PAnsiChar; ADestUTF: PWideChar; out ACharacterCount: Integer; out ACapitalizationType: TdxCapitalizationType; out AAbbreviationCount: Integer): Integer; var ADestCursor, ASourceCursor: PAnsiChar; ALength: Integer; begin ADestCursor := ADest; ASourceCursor := ASource; // first skip over any leading blanks while (ASourceCursor^ <> #0) and (ASourceCursor^ = ' ') do Inc(ASourceCursor); // now strip off any trailing periods (recording their presence) AabbreviationCount := 0; ALength := StrLen(ASourceCursor); while (ALength > 0) and ((ASourceCursor + ALength - 1)^ = '.') do begin Dec(ALength); Inc(AabbreviationCount); end; // if no characters are left it can't be capitalized if ALength <= 0 then begin ACapitalizationType := ctNoCapital; ADestCursor^ := #0; Result := 0; Exit; end; StrLCopy(ADest, ASourceCursor, ALength); (ADest + ALength)^ := #0; ALength := StrLen(ADest); if IsUTF8 then begin //TODO: end else begin ACapitalizationType := GetCapitalizationType(ADest, ALength); ACharacterCount := ALength; end; Result := ALength; end; procedure TdxHunspellDictionary.SetDictionaryPath(const AValue: TFileName); begin if FDictionaryPath <> AValue then begin Unload; FDictionaryPath := AValue; end; end; procedure TdxHunspellDictionary.SetGrammarPath(const AValue: TFileName); begin if FGrammarPath <> AValue then begin Unload; FGrammarPath := AValue; end; end; initialization GetRegisteredDictionaryTypes.Register(TdxHunspellDictionary, cxGetResourceString(@sdxSpellCheckerHunspellDictionary)); end.