Componentes.Terceros.DevExp.../official/x.44/ExpressSpellChecker/Sources/dxSpellCheckerAlgorithms.pas

1123 lines
30 KiB
ObjectPascal

{********************************************************************}
{ }
{ Developer Express Visual Component Library }
{ ExpressSpellChecker }
{ }
{ Copyright (c) 1998-2009 Developer Express Inc. }
{ ALL RIGHTS RESERVED }
{ }
{ The entire contents of this file is protected by U.S. and }
{ International Copyright Laws. Unauthorized reproduction, }
{ reverse-engineering, and distribution of all or any portion of }
{ the code contained in this file is strictly prohibited and may }
{ result in severe civil and criminal penalties and will be }
{ prosecuted to the maximum extent possible under the law. }
{ }
{ RESTRICTIONS }
{ }
{ THIS SOURCE CODE AND ALL RESULTING INTERMEDIATE FILES }
{ (DCU, OBJ, DLL, ETC.) ARE CONFIDENTIAL AND PROPRIETARY TRADE }
{ SECRETS OF DEVELOPER EXPRESS INC. THE REGISTERED DEVELOPER IS }
{ LICENSED TO DISTRIBUTE THE EXPRESSSPELLCHECKER AND ALL }
{ ACCOMPANYING VCL CONTROLS AS PART OF AN EXECUTABLE PROGRAM ONLY. }
{ }
{ THE SOURCE CODE CONTAINED WITHIN THIS FILE AND ALL RELATED }
{ FILES OR ANY PORTION OF ITS CONTENTS SHALL AT NO TIME BE }
{ COPIED, TRANSFERRED, SOLD, DISTRIBUTED, OR OTHERWISE MADE }
{ AVAILABLE TO OTHER INDIVIDUALS WITHOUT EXPRESS WRITTEN CONSENT }
{ AND PERMISSION FROM DEVELOPER EXPRESS INC. }
{ }
{ CONSULT THE END USER LICENSE AGREEMENT FOR INFORMATION ON }
{ ADDITIONAL RESTRICTIONS. }
{ }
{********************************************************************}
unit dxSpellCheckerAlgorithms;
{$I cxVer.inc}
interface
const
MaxSoundLen = 4;
type
{ TdxDoubleMetaphone }
TdxDoubleMetaphone = class
private
FIsSlavoGermanic: Boolean;
FIsSlavoGermanicReady: Boolean;
FLast: PWideChar;
FLength: Integer;
FPrimary: PWideChar;
FPrimaryLen: Integer;
FAlternate: PWideChar;
FAlternateLen: Integer;
FValue: PWideChar;
function GetAlternate: WideString;
function GetAlternateKey: Word;
function GetPrimary: WideString;
function GetPrimaryKey: Word;
procedure MetaphPrimaryAdd(AChar: WideChar); {$IFDEF DELPHI9} inline; {$ENDIF}
procedure MetaphAlternateAdd(AChar: WideChar); {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessB; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessC; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessD; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessF; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessG; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessH; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessJ; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessK; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessL; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessM; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessN; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessP; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessQ; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessR; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessS; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessT; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessV; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessW; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessX; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessZ; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure Reset(P: PWideChar; ALength: Integer);
protected
Current: PWideChar;
function AtEnd: Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
function AtStart: Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
function Contains(AStart: PWideChar; ALength: Integer; APatterns: PWideChar): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
function CurrentIndex: Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
function IsSlavoGermanic: Boolean;
function IsVowel(P: PWideChar): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure MetaphAdd(APrimary: WideChar; AAlternate: WideChar = #0); {$IFDEF DELPHI9} inline; {$ENDIF}
property Last: PWideChar read FLast;
property Length: Integer read FLength;
property Value: PWideChar read FValue write FValue;
public
constructor Create;
destructor Destroy; override;
procedure DoubleMetaphone(P: PWideChar; ALength: Integer);
property Alternate: WideString read GetAlternate;
property AlternateKey: Word read GetAlternateKey;
property Primary: WideString read GetPrimary;
property PrimaryKey: Word read GetPrimaryKey;
end;
{ TdxStringSimilarityCalculator }
TdxStringSimilarityCalculator = class
protected
function DeleteCost(S: PWideChar): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
function InsertCost(S: PWideChar): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
function SubstitutionCost(S1: PWideChar; S2: PWideChar): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
public
function GetDistance(AStr1: PWideChar; ALen1: Integer; AStr2: PWideChar; ALen2: Integer): Integer;
end;
implementation
uses
SysUtils, dxSpellCheckerUtils, Windows;
function SameChars(P1, P2: PWideChar; ALength: Integer): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
begin
repeat
if P1^ <> P2^ then
begin
Result := False;
Exit;
end;
Inc(P1);
Inc(P2);
Dec(ALength);
until ALength = 0;
Result := True;
end;
function CharInString(AChar: WideChar; AString: PWideChar): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
begin
repeat
if AString^ = AChar then
begin
Result := True;
Exit;
end;
Inc(AString);
until AString^ = #0;
Result := False;
end;
function ContainsSubstr(const AString, ASubString: PWideChar): Boolean;
var
P, AStr, ASubStr: PWideChar;
AChar: WideChar;
begin
P := AString;
AChar := ASubString^;
repeat
if P^ = AChar then
begin
AStr := P;
ASubStr := ASubString;
repeat
Inc(AStr);
Inc(ASubStr);
if ASubStr^ = #0 then
begin
Result := True;
Exit;
end;
if AStr^ = #0 then
begin
Result := False;
Exit;
end;
if AStr^ <> ASubStr^ then
Break;
until False;
end;
Inc(P);
until (P^ = #0);
Result := False;
end;
function GetMetaphoneKey(P: PWideChar; ALength: Integer): Cardinal;
var
I: Integer;
begin
Result := 0;
for I := 0 to ALength - 1 do
case (P + I)^ of
'S': Result := Result or (1 shl (4 * I));
'A': Result := Result or (2 shl (4 * I));
'P': Result := Result or (3 shl (4 * I));
'K': Result := Result or (4 shl (4 * I));
'X': Result := Result or (5 shl (4 * I));
'J': Result := Result or (6 shl (4 * I));
'T': Result := Result or (7 shl (4 * I));
'F': Result := Result or (8 shl (4 * I));
'N': Result := Result or (9 shl (4 * I));
'L': Result := Result or (10 shl (4 * I));
'H': Result := Result or (11 shl (4 * I));
' ': Result := Result or (12 shl (4 * I));
'M': Result := Result or (13 shl (4 * I));
'R': Result := Result or (14 shl (4 * I));
'0': Result := Result or (15 shl (4 * I));
else
Result := 0;
Break;
end;
end;
function Minimum(A, B, C: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
begin
Result := A;
if B < A then
Result := B;
if C < Result then
Result := C;
end;
{ TdxDoubleMetaphone }
constructor TdxDoubleMetaphone.Create;
begin
inherited Create;
GetMem(FPrimary, (MaxSoundLen + 1) * SizeOf(WideChar));
GetMem(FAlternate, (MaxSoundLen + 1) * SizeOf(WideChar));
GetMem(FValue, 256 * SizeOf(WideChar));
end;
destructor TdxDoubleMetaphone.Destroy;
begin
FreeMem(FPrimary);
FreeMem(FAlternate);
FreeMem(FValue);
inherited Destroy;
end;
procedure TdxDoubleMetaphone.DoubleMetaphone(P: PWideChar; ALength: Integer);
begin
//pad the original string so that we can index beyond the edge of the world
if ALength = 0 then Exit;
Reset(P, ALength);
//skip these when at start of word
if Contains(Value, 2, 'GN;KN;PN;WR;PS;') then
Inc(Current)
else
begin
case Value^ of
'A', 'E', 'I', 'O', 'U', 'Y':
begin
//all init vowels now map to 'A'
MetaphAdd('A');
Inc(Current);
end;
'X': //Initial 'X' is pronounced 'Z' e.g. 'Xavier'
begin
MetaphAdd('S'); //'Z' maps to 'S'
Inc(Current);
end;
'W':
begin
if IsVowel(Value + 1) then //Wasserman should match Vasserman
begin
MetaphAdd('A', 'F');
Inc(Current);
end
else
if Contains(Current, 2, 'WH;') then
begin
//need Uomo to match Womo
MetaphAdd('A');
Inc(Current);
end;
end;
end;
end;
// main loop
while ((FPrimaryLen < MaxSoundLen) or (FAlternateLen < MaxSoundLen)) and (Current <= Last) do
begin
case Current^ of
'B': ProcessB;
'C': ProcessC;
'D': ProcessD;
'F': ProcessF;
'G': ProcessG;
'H': ProcessH;
'J': ProcessJ;
'K': ProcessK;
'L': ProcessL;
'M': ProcessM;
'N': ProcessN;
'P': ProcessP;
'Q': ProcessQ;
'R': ProcessR;
'S': ProcessS;
'T': ProcessT;
'V': ProcessV;
'W': ProcessW;
'X': ProcessX;
'Z': ProcessZ;
//special case
#$00C7: // A C with a Cedilla
MetaphAdd('S');
#$00D1: // N with a tilde (spanish ene)
MetaphAdd('N');
end;
Inc(Current);
end;
end;
function TdxDoubleMetaphone.IsSlavoGermanic: Boolean;
begin
if not FIsSlavoGermanicReady then
begin
FIsSlavoGermanic := ContainsSubstr(Value, 'W') or ContainsSubstr(Value, 'K') or
ContainsSubstr(Value, 'CZ') or ContainsSubstr(Value, 'WITZ');
FIsSlavoGermanicReady := True;
end;
Result := FIsSlavoGermanic;
end;
function TdxDoubleMetaphone.IsVowel(P: PWideChar): Boolean;
begin
if (P >= Value) and (P <= Last) then
case P^ of
'A', 'E', 'I', 'O', 'U', 'Y': Result := True;
else
Result := False;
end
else
Result := False;
end;
procedure TdxDoubleMetaphone.MetaphAdd(APrimary: WideChar; AAlternate: WideChar = #0);
begin
if APrimary <> #0 then
begin
MetaphPrimaryAdd(APrimary);
if AAlternate = #0 then
MetaphAlternateAdd(APrimary)
else
MetaphAlternateAdd(AAlternate);
end
else
MetaphAlternateAdd(AAlternate);
end;
procedure TdxDoubleMetaphone.MetaphPrimaryAdd(AChar: WideChar);
begin
if FPrimaryLen < MaxSoundLen then
begin
(FPrimary + FPrimaryLen)^ := AChar;
Inc(FPrimaryLen);
end;
end;
procedure TdxDoubleMetaphone.MetaphAlternateAdd(AChar: WideChar);
begin
if FAlternateLen < MaxSoundLen then
begin
(FAlternate + FAlternateLen)^ := AChar;
Inc(FAlternateLen);
end;
end;
function TdxDoubleMetaphone.AtEnd: Boolean;
begin
Result := Current = Last;
end;
function TdxDoubleMetaphone.AtStart: Boolean;
begin
Result := Current = Value;
end;
function TdxDoubleMetaphone.Contains(AStart: PWideChar; ALength: Integer;
APatterns: PWideChar): Boolean;
begin
Result := (AStart >= Value) and (AStart + ALength - 1 <= Last);
if Result then
begin
repeat
if SameChars(AStart, APatterns, ALength) then
begin
Result := True;
Exit;
end;
Inc(APatterns, ALength + 1);
until APatterns^ = #0;
Result := False;
end;
end;
function TdxDoubleMetaphone.CurrentIndex: Integer;
begin
Result := Current - Value;
end;
function TdxDoubleMetaphone.GetAlternate: WideString;
begin
SetString(Result, FAlternate, FAlternateLen);
end;
function TdxDoubleMetaphone.GetAlternateKey: Word;
begin
Result := GetMetaphoneKey(FAlternate, FAlternateLen);
end;
function TdxDoubleMetaphone.GetPrimary: WideString;
begin
SetString(Result, FPrimary, FPrimaryLen);
end;
function TdxDoubleMetaphone.GetPrimaryKey: Word;
begin
Result := GetMetaphoneKey(FPrimary, FPrimaryLen);
end;
procedure TdxDoubleMetaphone.ProcessB;
begin
//"-mb", e.g", "dumb", already skipped over...
MetaphAdd('P');
if (Current + 1)^ = 'B' then
Inc(Current);
end;
procedure TdxDoubleMetaphone.ProcessC;
begin
//various germanic
if (CurrentIndex > 1) and not IsVowel(Current - 2) and Contains(Current - 1, 3, 'ACH;') and
(((Current + 2)^ <> 'I') and (((Current + 2)^ <> 'E') or Contains(Current - 2, 6, 'BACHER;MACHER;'))) then
begin
MetaphAdd('K');
Inc(Current);
Exit;
end;
//special case 'caesar'
if AtStart and Contains(Current, 6, 'CAESAR;') then
begin
MetaphAdd('S');
Inc(Current);
Exit;
end;
//italian 'chianti'
if Contains(Current, 4, 'CHIA;') then
begin
MetaphAdd('K');
Inc(Current);
Exit;
end;
if Contains(Current, 2, 'CH;') then
begin
//find 'michael'
if (Current > Value) and Contains(Current, 4, 'CHAE;') then
begin
MetaphAdd('K', 'X');
Inc(Current);
Exit;
end;
//greek roots e.g. 'chemistry', 'chorus'
if (AtStart and (Contains(Current + 1, 5, 'HARAC;HARIS;') or
Contains(Current + 1, 3, 'HOR;HYM;HIA;HEM;')) and not Contains(Current, 5, 'CHORE;')) then
begin
MetaphAdd('K');
Inc(Current);
Exit;
end;
//germanic, greek, or otherwise 'ch' for 'kh' sound
if (Contains(Current, 4, 'VAN ;VON ;') or Contains(Current, 3, 'SCH;')) or
// 'architect but not 'arch', 'orchestra', 'orchid'
Contains(Current - 2, 6, 'ORCHES;ARCHIT;ORCHID;') or
CharInString((Current + 2)^, 'TS') or
(AtStart or CharInString((Current - 1)^, 'AOUE')) and
//e.g., 'wachtler', 'wechsler', but not 'tichner'
CharInString((Current + 2)^, 'LRNMBHFVW ') then
MetaphAdd('K')
else
begin
if Current > Value then
begin
if Contains(Current, 2, 'MC;') then
//e.g., 'McHugh'
MetaphAdd('K')
else
MetaphAdd('X', 'K');
end
else
MetaphAdd('X');
end;
Inc(Current);
Exit;
end;
//e.g, 'czerny'
if Contains(Current, 2, 'CZ;') and not Contains(Current - 2, 4, 'WICZ;') then
begin
MetaphAdd('S', 'X');
Inc(Current);
Exit;
end;
//e.g., 'focaccia'
if Contains(Current + 1, 3, 'CIA;') then
begin
MetaphAdd('X');
Inc(Current, 2);
Exit;
end;
//double 'C', but not if e.g. 'McClellan'
if Contains(Current, 2, 'CC;') and not ((CurrentIndex = 1) and (Value^ = 'M')) then
begin
//'bellocchio' but not 'bacchus'
if CharInString((Current + 2)^, 'IEH') and not Contains(Current + 2, 2, 'HU;') then
begin
//'accident', 'accede' 'succeed'
if (((CurrentIndex = 1) and ((Current - 1)^ = 'A'))
or Contains(Current - 1, 5, 'UCCEE;UCCES;')) then
begin
MetaphAdd('K');
MetaphAdd('S');
end
else
//'bacci', 'bertucci', other italian
MetaphAdd('X');
Inc(Current, 2);
end
else
begin
//Pierce's rule
MetaphAdd('K');
Inc(Current);
end;
Exit;
end;
if Contains(Current, 2, 'CK;CG;CQ;') then
begin
MetaphAdd('K');
Inc(Current);
Exit;
end;
if Contains(Current, 2, 'CI;CE;CY;') then
begin
//italian vs. english
if Contains(Current, 3, 'CIO;CIE;CIA;') then
MetaphAdd('S', 'X')
else
MetaphAdd('S');
Inc(Current);
Exit;
end;
//else
MetaphAdd('K');
//name sent in 'mac caffrey', 'mac gregor
if Contains(Current + 1, 2, ' C; Q; G;') then
Inc(Current, 2)
else
if CharInString((Value + 1)^, 'CKQ') and not Contains(Value + 1, 2, 'CE;CI;') then
Inc(Current);
end;
procedure TdxDoubleMetaphone.ProcessD;
begin
if Contains(Current, 2, 'DG;') then
begin
if CharInString((Current + 2)^, 'IEY') then
begin
//e.g. 'edge'
MetaphAdd('J');
Inc(Current, 2);
end
else
begin
//e.g. 'edgar'
MetaphAdd('T');
MetaphAdd('K');
Inc(Current);
end;
Exit;
end;
if Contains(Current, 2, 'DT;DD;') then
begin
MetaphAdd('T');
Inc(Current);
end
else
MetaphAdd('T');
end;
procedure TdxDoubleMetaphone.ProcessF;
begin
if (Current + 1)^ = 'F' then
Inc(Current);
MetaphAdd('F');
end;
procedure TdxDoubleMetaphone.ProcessG;
begin
if (Current + 1)^ = 'H' then
begin
if (CurrentIndex > 0) and not IsVowel(Current - 1) then
begin
MetaphAdd('K');
Inc(Current);
Exit;
end;
//'ghislane', ghiradelli
if Current = Value then
begin
if (Current + 2)^ = 'I' then
MetaphAdd('J')
else
MetaphAdd('K');
Inc(Current);
Exit;
end;
//Parker's rule (with some further refinements) - e.g., 'hugh'
if (CurrentIndex > 1) and CharInString((Current - 2)^, 'BHD') or
//e.g., 'bough'
(CurrentIndex > 2) and CharInString((Current - 3)^, 'BHD') or
//e.g., 'broughton'
(CurrentIndex > 3) and CharInString((Current - 4)^, 'BH') then
Inc(Current)
else
begin
//e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
if (CurrentIndex > 2) and ((Current - 1)^ = 'U') and CharInString((Current - 3)^, 'CGLRT') then
MetaphAdd('F')
else
if (CurrentIndex > 0) and ((Current - 1)^ <> 'I') then
MetaphAdd('K');
Inc(Current);
end;
Exit;
end;
if (Current + 1)^ = 'N' then
begin
if (CurrentIndex = 1) and IsVowel(Value) and not IsSlavoGermanic then
begin
MetaphAdd('K', 'N');
MetaphAdd('N', #0);
end
else
//not e.g. 'cagney'
if not Contains(Current + 2, 2, 'EY;') and ((Current + 1)^ <> 'Y') and not IsSlavoGermanic then
begin
MetaphAdd('N', 'K');
MetaphAdd(#0, 'N');
end
else
begin
MetaphAdd('K');
MetaphAdd('N');
end;
Inc(Current);
Exit;
end;
//'tagliaro'
if Contains(Current + 1, 2, 'LI;') and not IsSlavoGermanic then
begin
MetaphAdd('K', 'L');
MetaphAdd('L', #0);
Inc(Current);
Exit;
end;
//-ges-,-gep-,-gel-, -gie- at beginning
if (AtStart and (((Current + 1)^ = 'Y')
or Contains(Current + 1, 2, 'ES;EP;EB;EL;EY;IB;IL;IN;IE;EI;ER;'))) then
begin
MetaphAdd('K', 'J');
Inc(Current);
Exit;
end;
// -ger-, -gy-
if (Contains(Current + 1, 2, 'ER;') or ((Current + 1)^ = 'Y')) and
not Contains(Current, 6, 'DANGER;RANGER;MANGER;') and
not ((CurrentIndex > 0) and ((Current - 1)^ = 'E') or ((Current - 1)^ = 'I')) and
not Contains(Current - 1, 3, 'RGY;OGY;') then
begin
MetaphAdd('K', 'J');
Inc(Current);
Exit;
end;
// italian e.g, 'biaggi'
if CharInString((Current + 1)^, 'EIY') or Contains(Current - 1, 4, 'AGGI;OGGI;') then
begin
//obvious germanic
if Contains(Current, 4, 'VAN ;VON ;') or
Contains(Current, 3, 'SCH;') or Contains(Current + 1, 2, 'ET;') then
MetaphAdd('K')
else
//always soft if french ending
if Contains(Current + 1, 4, 'IER ;') then
MetaphAdd('J')
else
MetaphAdd('J', 'K');
Inc(Current);
Exit;
end;
if (Current + 1)^ = 'G' then
Inc(Current);
MetaphAdd('K');
end;
procedure TdxDoubleMetaphone.ProcessH;
begin
//only keep if first & before vowel or btw. 2 vowels
if (AtStart or IsVowel(Current - 1)) and IsVowel(Current + 1) then
begin
MetaphAdd('H');
Inc(Current);
end;
end;
procedure TdxDoubleMetaphone.ProcessJ;
begin
//obvious spanish, 'jose', 'san jacinto'
if Contains(Current, 4, 'JOSE;') or Contains(Current, 4, 'SAN ;') then
begin
if (AtStart and ((Current + 4)^ = ' ')) or Contains(Current, 4, 'SAN ;') then
MetaphAdd('H')
else
MetaphAdd('J', 'H');
Exit;
end;
if AtStart and not Contains(Current, 4, 'JOSE;') then
MetaphAdd('J', 'A') //Yankelovich/Jankelowicz
else
//spanish pron. of e.g. 'bajador'
if IsVowel(Current - 1) and not IsSlavoGermanic and (((Current + 1)^ = 'A') or ((Current + 1)^ = 'O')) then
MetaphAdd('J', 'H')
else
if AtEnd then
MetaphAdd('J', ' ')
else
if not CharInString((Current + 1)^, 'LTKSNMBZ') and
not ((CurrentIndex > 0) and CharInString((Current - 1)^, 'SKL')) then
MetaphAdd('J');
if (Current + 1)^ = 'J' then //it could happennot
Inc(Current);
end;
procedure TdxDoubleMetaphone.ProcessK;
begin
if (Current + 1)^ = 'K' then
Inc(Current);
MetaphAdd('K');
end;
procedure TdxDoubleMetaphone.ProcessL;
begin
if (Current + 1)^ = 'L' then
begin
//spanish e.g. 'cabrillo', 'gallegos'
if ((Current = Last - 2) and Contains(Current - 1, 4, 'ILLO;ILLA;ALLE;')) or
((Contains(Last - 1, 2, 'AS;OS;') or (Last^ = 'A') or (Last^ = 'O'))
and Contains(Current - 1, 4, 'ALLE;')) then
begin
MetaphAdd('L', ' ');
Inc(Current);
Exit;
end;
Inc(Current);
end;
MetaphAdd('L');
end;
procedure TdxDoubleMetaphone.ProcessM;
begin
if Contains(Current - 1, 3, 'UMB;') and
((Current + 1 = Last) or Contains(Current + 2, 2, 'ER;'))
//'dumb','thumb'
or ((Current + 1)^ = 'M') then
Inc(Current);
MetaphAdd('M');
end;
procedure TdxDoubleMetaphone.ProcessN;
begin
if (Current + 1)^ = 'N' then
Inc(Current);
MetaphAdd('N');
end;
procedure TdxDoubleMetaphone.ProcessP;
begin
if (Current + 1)^ = 'H' then
begin
MetaphAdd('F');
Inc(Current);
Exit;
end;
//also account for 'campbell', 'raspberry'
if ((Current + 1)^ = 'P') or ((Current + 1)^ = 'B') then
Inc(Current);
MetaphAdd('P');
end;
procedure TdxDoubleMetaphone.ProcessQ;
begin
if (Current + 1)^ = 'Q' then
Inc(Current);
MetaphAdd('K');
end;
procedure TdxDoubleMetaphone.ProcessR;
begin
//french e.g. 'rogier', but exclude 'hochmeier'
if AtEnd and not IsSlavoGermanic and Contains(Current - 2, 2, 'IE;')
and not Contains(Current - 4, 2, 'ME;MA;') then
MetaphAdd(#0, 'R')
else
MetaphAdd('R');
if (Current + 1)^ = 'R' then
Inc(Current);
end;
procedure TdxDoubleMetaphone.ProcessS;
begin
//special cases 'island', 'isle', 'carlisle', 'carlysle'
if Contains(Current - 1, 3, 'ISL;YSL;') then
Exit;
//special case 'sugar-'
if AtStart and Contains(Current, 5, 'SUGAR;') then
begin
MetaphAdd('X', 'S');
Exit;
end;
if Contains(Current, 2, 'SH;') then
begin
//germanic
if Contains(Current + 1, 4, 'HEIM;HOEK;HOLM;HOLZ;') then
MetaphAdd('S')
else
MetaphAdd('X');
Inc(Current);
Exit;
end;
//italian & armenian
if Contains(Current, 3, 'SIO;SIA;') or Contains(Current, 4, 'SIAN;') then
begin
if not IsSlavoGermanic then
MetaphAdd('S', 'X')
else
MetaphAdd('S');
Inc(Current, 2);
Exit;
end;
//german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
//also, -sz- in slavic language altho in hungarian it is pronounced 's'
if (AtStart and CharInString((Current + 1)^, 'MNLW')) or ((Current + 1)^ = 'Z') then
begin
MetaphAdd('S', 'X');
if (Current + 1)^ = 'Z' then
Inc(Current);
Exit;
end;
if Contains(Current, 2, 'SC;') then
begin
//Schlesinger's rule
if (Current + 2)^ = 'H' then
begin
//dutch origin, e.g. 'school', 'schooner'
if Contains(Current + 3, 2, 'OO;ER;EN;UY;ED;EM;') then
begin
//'schermerhorn', 'schenker'
if Contains(Current + 3, 2, 'ER;EN;') then
begin
MetaphAdd('X', 'S');
MetaphAdd(#0, 'K');
end
else
begin
MetaphAdd('S');
MetaphAdd('K');
end;
end
else
begin
if AtStart and not IsVowel(Value + 3) and ((Current + 3)^ <> 'W') then
MetaphAdd('X', 'S')
else
MetaphAdd('X');
end;
Inc(Current, 2);
Exit;
end;
if CharInString((Current + 2)^, 'IEY') then
MetaphAdd('S')
else
begin
MetaphAdd('S');
MetaphAdd('K');
end;
Inc(Current, 2);
Exit;
end;
//french e.g. 'resnais', 'artois'
if AtEnd and Contains(Current - 2, 2, 'AI;OI;') then
MetaphAdd(#0, 'S')
else
MetaphAdd('S');
if ((Current + 1)^ = 'S') or ((Current + 1)^ = 'Z') then
Inc(Current);
end;
procedure TdxDoubleMetaphone.ProcessT;
begin
if Contains(Current, 4, 'TION;') or Contains(Current, 3, 'TIA;TCH;') then
begin
MetaphAdd('X');
Inc(Current, 2);
Exit;
end;
if Contains(Current, 2, 'TH;') or Contains(Current, 3, 'TTH;') then
begin
//special case 'thomas', 'thames' or germanic
if Contains(Current + 2, 2, 'OM;AM;') or Contains(Current, 4, 'VAN ;VON ;')
or Contains(Current, 3, 'SCH;') then
MetaphAdd('T')
else
MetaphAdd('0', 'T');
Inc(Current);
Exit;
end;
if ((Current + 1)^ = 'T') or ((Current + 1)^ = 'D') then
Inc(Current);
MetaphAdd('T');
end;
procedure TdxDoubleMetaphone.ProcessV;
begin
if (Current + 1)^ = 'V' then
Inc(Current);
MetaphAdd('F');
end;
procedure TdxDoubleMetaphone.ProcessW;
begin
//can also be in middle of word
if Contains(Current, 2, 'WR;') then
begin
MetaphAdd('R');
Inc(Current);
Exit;
end;
//Arnow should match Arnoff
if (AtEnd and IsVowel(Current - 1)) or
Contains(Current - 1, 5, 'EWSKI;EWSKY;OWSKI;OWSKY;') or
Contains(Current, 3, 'SCH;') then
begin
MetaphAdd(#0, 'F');
Exit;
end;
//polish e.g. 'filipowicz'
if Contains(Current, 4, 'WICZ;WITZ;') then
begin
MetaphAdd('T', 'F');
MetaphAdd('S', 'X');
Inc(Current, 3);
end;
end;
procedure TdxDoubleMetaphone.ProcessX;
begin
//french e.g. breaux
if not (AtEnd and (Contains(Current - 3, 3, 'IAU;EAU;')
or Contains(Current - 2, 2, 'AU;OU;'))) then
begin
MetaphAdd('K');
MetaphAdd('S');
end;
if ((Current + 1)^ = 'C') or ((Current + 1)^ = 'X') then
Inc(Current);
end;
procedure TdxDoubleMetaphone.ProcessZ;
begin
//chinese pinyin e.g. 'zhao'
if (Current + 1)^ = 'H' then
begin
MetaphAdd('J');
Inc(Current);
Exit;
end
else
if Contains(Current + 1, 2, 'ZO;ZI;ZA;')
or (IsSlavoGermanic and ((CurrentIndex > 0) and ((Current - 1)^ <> 'T'))) then
begin
MetaphAdd('S', 'T');
MetaphAdd(#0, 'S');
end
else
MetaphAdd('S');
if (Current + 1)^ = 'Z' then
Inc(Current);
end;
procedure TdxDoubleMetaphone.Reset(P: PWideChar; ALength: Integer);
var
ATemp: PWideChar;
I: Integer;
begin
FLength := ALength;
ATemp := FValue;
I := 0;
while I < (Length + 6) do
begin
if I < Length then
(ATemp + I)^ := (P + I)^
else
if I = Length + 5 then
(ATemp + I)^ := #0
else
(ATemp + I)^ := ' ';
Inc(I);
end;
FLast := FValue + ALength - 1;
FPrimaryLen := 0;
FAlternateLen := 0;
Current := FValue;
FIsSlavoGermanicReady := False;
end;
{ TdxStringSimilarityCalculator }
function TdxStringSimilarityCalculator.GetDistance(AStr1: PWideChar; ALen1: Integer;
AStr2: PWideChar; ALen2: Integer): Integer;
var
AMatrix: array of array of Integer;
I, J: Integer;
begin
if (ALen1 = 0) or (ALen2 = 0) then
begin
Result := 0;
Exit;
end;
SetLength(AMatrix, ALen1 + 1, ALen2 + 1);
AMatrix[0, 0] := 0;
for I := 0 to ALen2 - 1 do
AMatrix[0, I + 1] := AMatrix[0, I] + InsertCost(AStr2 + I);
for I := 0 to ALen1 - 1 do
begin
AMatrix[I + 1, 0] := AMatrix[I, 0] + DeleteCost(AStr1 + I);
for J := 0 to ALen2 - 1 do
begin
AMatrix[I + 1, J + 1] := Minimum(
AMatrix[I, J] + SubstitutionCost(AStr1 + I, AStr2 + J),
AMatrix[I, J + 1] + DeleteCost(AStr1 + I),
AMatrix[I + 1, J] + InsertCost(AStr2 + J));
end;
end;
Result := AMatrix[ALen1, ALen2];
if Result > 0 then
begin
if AStr1^ <> AStr2^ then
Inc(Result);
if AStr1[ALen1 - 1] <> AStr2[ALen2 - 1] then
Inc(Result);
end;
end;
function TdxStringSimilarityCalculator.DeleteCost(S: PWideChar): Integer;
begin
Result := 2;
end;
function TdxStringSimilarityCalculator.InsertCost(S: PWideChar): Integer;
begin
Result := 2;
end;
function TdxStringSimilarityCalculator.SubstitutionCost(S1: PWideChar; S2: PWideChar): Integer;
begin
if S1^ = S2^ then
Result := 0
else
Result := 2;
end;
end.