1123 lines
30 KiB
ObjectPascal
1123 lines
30 KiB
ObjectPascal
|
|
{********************************************************************}
|
||
|
|
{ }
|
||
|
|
{ Developer Express Visual Component Library }
|
||
|
|
{ ExpressSpellChecker }
|
||
|
|
{ }
|
||
|
|
{ Copyright (c) 1998-2009 Developer Express Inc. }
|
||
|
|
{ ALL RIGHTS RESERVED }
|
||
|
|
{ }
|
||
|
|
{ The entire contents of this file is protected by U.S. and }
|
||
|
|
{ International Copyright Laws. Unauthorized reproduction, }
|
||
|
|
{ reverse-engineering, and distribution of all or any portion of }
|
||
|
|
{ the code contained in this file is strictly prohibited and may }
|
||
|
|
{ result in severe civil and criminal penalties and will be }
|
||
|
|
{ prosecuted to the maximum extent possible under the law. }
|
||
|
|
{ }
|
||
|
|
{ RESTRICTIONS }
|
||
|
|
{ }
|
||
|
|
{ THIS SOURCE CODE AND ALL RESULTING INTERMEDIATE FILES }
|
||
|
|
{ (DCU, OBJ, DLL, ETC.) ARE CONFIDENTIAL AND PROPRIETARY TRADE }
|
||
|
|
{ SECRETS OF DEVELOPER EXPRESS INC. THE REGISTERED DEVELOPER IS }
|
||
|
|
{ LICENSED TO DISTRIBUTE THE EXPRESSSPELLCHECKER AND ALL }
|
||
|
|
{ ACCOMPANYING VCL CONTROLS AS PART OF AN EXECUTABLE PROGRAM ONLY. }
|
||
|
|
{ }
|
||
|
|
{ THE SOURCE CODE CONTAINED WITHIN THIS FILE AND ALL RELATED }
|
||
|
|
{ FILES OR ANY PORTION OF ITS CONTENTS SHALL AT NO TIME BE }
|
||
|
|
{ COPIED, TRANSFERRED, SOLD, DISTRIBUTED, OR OTHERWISE MADE }
|
||
|
|
{ AVAILABLE TO OTHER INDIVIDUALS WITHOUT EXPRESS WRITTEN CONSENT }
|
||
|
|
{ AND PERMISSION FROM DEVELOPER EXPRESS INC. }
|
||
|
|
{ }
|
||
|
|
{ CONSULT THE END USER LICENSE AGREEMENT FOR INFORMATION ON }
|
||
|
|
{ ADDITIONAL RESTRICTIONS. }
|
||
|
|
{ }
|
||
|
|
{********************************************************************}
|
||
|
|
|
||
|
|
unit dxSpellCheckerAlgorithms;
|
||
|
|
|
||
|
|
{$I cxVer.inc}
|
||
|
|
|
||
|
|
interface
|
||
|
|
|
||
|
|
const
|
||
|
|
MaxSoundLen = 4;
|
||
|
|
|
||
|
|
type
|
||
|
|
{ TdxDoubleMetaphone }
|
||
|
|
|
||
|
|
TdxDoubleMetaphone = class
|
||
|
|
private
|
||
|
|
FIsSlavoGermanic: Boolean;
|
||
|
|
FIsSlavoGermanicReady: Boolean;
|
||
|
|
FLast: PWideChar;
|
||
|
|
FLength: Integer;
|
||
|
|
FPrimary: PWideChar;
|
||
|
|
FPrimaryLen: Integer;
|
||
|
|
FAlternate: PWideChar;
|
||
|
|
FAlternateLen: Integer;
|
||
|
|
FValue: PWideChar;
|
||
|
|
function GetAlternate: WideString;
|
||
|
|
function GetAlternateKey: Word;
|
||
|
|
function GetPrimary: WideString;
|
||
|
|
function GetPrimaryKey: Word;
|
||
|
|
procedure MetaphPrimaryAdd(AChar: WideChar); {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure MetaphAlternateAdd(AChar: WideChar); {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessB; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessC; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessD; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessF; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessG; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessH; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessJ; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessK; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessL; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessM; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessN; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessP; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessQ; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessR; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessS; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessT; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessV; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessW; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessX; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure ProcessZ; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure Reset(P: PWideChar; ALength: Integer);
|
||
|
|
protected
|
||
|
|
Current: PWideChar;
|
||
|
|
function AtEnd: Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
function AtStart: Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
function Contains(AStart: PWideChar; ALength: Integer; APatterns: PWideChar): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
function CurrentIndex: Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
function IsSlavoGermanic: Boolean;
|
||
|
|
function IsVowel(P: PWideChar): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
procedure MetaphAdd(APrimary: WideChar; AAlternate: WideChar = #0); {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
|
||
|
|
property Last: PWideChar read FLast;
|
||
|
|
property Length: Integer read FLength;
|
||
|
|
property Value: PWideChar read FValue write FValue;
|
||
|
|
public
|
||
|
|
constructor Create;
|
||
|
|
destructor Destroy; override;
|
||
|
|
procedure DoubleMetaphone(P: PWideChar; ALength: Integer);
|
||
|
|
|
||
|
|
property Alternate: WideString read GetAlternate;
|
||
|
|
property AlternateKey: Word read GetAlternateKey;
|
||
|
|
property Primary: WideString read GetPrimary;
|
||
|
|
property PrimaryKey: Word read GetPrimaryKey;
|
||
|
|
end;
|
||
|
|
|
||
|
|
{ TdxStringSimilarityCalculator }
|
||
|
|
|
||
|
|
TdxStringSimilarityCalculator = class
|
||
|
|
protected
|
||
|
|
function DeleteCost(S: PWideChar): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
function InsertCost(S: PWideChar): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
function SubstitutionCost(S1: PWideChar; S2: PWideChar): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
public
|
||
|
|
function GetDistance(AStr1: PWideChar; ALen1: Integer; AStr2: PWideChar; ALen2: Integer): Integer;
|
||
|
|
end;
|
||
|
|
|
||
|
|
implementation
|
||
|
|
|
||
|
|
uses
|
||
|
|
SysUtils, dxSpellCheckerUtils, Windows;
|
||
|
|
|
||
|
|
function SameChars(P1, P2: PWideChar; ALength: Integer): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
begin
|
||
|
|
repeat
|
||
|
|
if P1^ <> P2^ then
|
||
|
|
begin
|
||
|
|
Result := False;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
Inc(P1);
|
||
|
|
Inc(P2);
|
||
|
|
Dec(ALength);
|
||
|
|
until ALength = 0;
|
||
|
|
Result := True;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function CharInString(AChar: WideChar; AString: PWideChar): Boolean; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
begin
|
||
|
|
repeat
|
||
|
|
if AString^ = AChar then
|
||
|
|
begin
|
||
|
|
Result := True;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
Inc(AString);
|
||
|
|
until AString^ = #0;
|
||
|
|
Result := False;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function ContainsSubstr(const AString, ASubString: PWideChar): Boolean;
|
||
|
|
var
|
||
|
|
P, AStr, ASubStr: PWideChar;
|
||
|
|
AChar: WideChar;
|
||
|
|
begin
|
||
|
|
P := AString;
|
||
|
|
AChar := ASubString^;
|
||
|
|
repeat
|
||
|
|
if P^ = AChar then
|
||
|
|
begin
|
||
|
|
AStr := P;
|
||
|
|
ASubStr := ASubString;
|
||
|
|
repeat
|
||
|
|
Inc(AStr);
|
||
|
|
Inc(ASubStr);
|
||
|
|
if ASubStr^ = #0 then
|
||
|
|
begin
|
||
|
|
Result := True;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
if AStr^ = #0 then
|
||
|
|
begin
|
||
|
|
Result := False;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
if AStr^ <> ASubStr^ then
|
||
|
|
Break;
|
||
|
|
until False;
|
||
|
|
end;
|
||
|
|
Inc(P);
|
||
|
|
until (P^ = #0);
|
||
|
|
Result := False;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function GetMetaphoneKey(P: PWideChar; ALength: Integer): Cardinal;
|
||
|
|
var
|
||
|
|
I: Integer;
|
||
|
|
begin
|
||
|
|
Result := 0;
|
||
|
|
for I := 0 to ALength - 1 do
|
||
|
|
case (P + I)^ of
|
||
|
|
'S': Result := Result or (1 shl (4 * I));
|
||
|
|
'A': Result := Result or (2 shl (4 * I));
|
||
|
|
'P': Result := Result or (3 shl (4 * I));
|
||
|
|
'K': Result := Result or (4 shl (4 * I));
|
||
|
|
'X': Result := Result or (5 shl (4 * I));
|
||
|
|
'J': Result := Result or (6 shl (4 * I));
|
||
|
|
'T': Result := Result or (7 shl (4 * I));
|
||
|
|
'F': Result := Result or (8 shl (4 * I));
|
||
|
|
'N': Result := Result or (9 shl (4 * I));
|
||
|
|
'L': Result := Result or (10 shl (4 * I));
|
||
|
|
'H': Result := Result or (11 shl (4 * I));
|
||
|
|
' ': Result := Result or (12 shl (4 * I));
|
||
|
|
'M': Result := Result or (13 shl (4 * I));
|
||
|
|
'R': Result := Result or (14 shl (4 * I));
|
||
|
|
'0': Result := Result or (15 shl (4 * I));
|
||
|
|
else
|
||
|
|
Result := 0;
|
||
|
|
Break;
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function Minimum(A, B, C: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
||
|
|
begin
|
||
|
|
Result := A;
|
||
|
|
if B < A then
|
||
|
|
Result := B;
|
||
|
|
if C < Result then
|
||
|
|
Result := C;
|
||
|
|
end;
|
||
|
|
|
||
|
|
{ TdxDoubleMetaphone }
|
||
|
|
|
||
|
|
constructor TdxDoubleMetaphone.Create;
|
||
|
|
begin
|
||
|
|
inherited Create;
|
||
|
|
GetMem(FPrimary, (MaxSoundLen + 1) * SizeOf(WideChar));
|
||
|
|
GetMem(FAlternate, (MaxSoundLen + 1) * SizeOf(WideChar));
|
||
|
|
GetMem(FValue, 256 * SizeOf(WideChar));
|
||
|
|
end;
|
||
|
|
|
||
|
|
destructor TdxDoubleMetaphone.Destroy;
|
||
|
|
begin
|
||
|
|
FreeMem(FPrimary);
|
||
|
|
FreeMem(FAlternate);
|
||
|
|
FreeMem(FValue);
|
||
|
|
inherited Destroy;
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.DoubleMetaphone(P: PWideChar; ALength: Integer);
|
||
|
|
begin
|
||
|
|
//pad the original string so that we can index beyond the edge of the world
|
||
|
|
if ALength = 0 then Exit;
|
||
|
|
Reset(P, ALength);
|
||
|
|
|
||
|
|
//skip these when at start of word
|
||
|
|
if Contains(Value, 2, 'GN;KN;PN;WR;PS;') then
|
||
|
|
Inc(Current)
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
case Value^ of
|
||
|
|
'A', 'E', 'I', 'O', 'U', 'Y':
|
||
|
|
begin
|
||
|
|
//all init vowels now map to 'A'
|
||
|
|
MetaphAdd('A');
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
'X': //Initial 'X' is pronounced 'Z' e.g. 'Xavier'
|
||
|
|
begin
|
||
|
|
MetaphAdd('S'); //'Z' maps to 'S'
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
'W':
|
||
|
|
begin
|
||
|
|
if IsVowel(Value + 1) then //Wasserman should match Vasserman
|
||
|
|
begin
|
||
|
|
MetaphAdd('A', 'F');
|
||
|
|
Inc(Current);
|
||
|
|
end
|
||
|
|
else
|
||
|
|
if Contains(Current, 2, 'WH;') then
|
||
|
|
begin
|
||
|
|
//need Uomo to match Womo
|
||
|
|
MetaphAdd('A');
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
// main loop
|
||
|
|
while ((FPrimaryLen < MaxSoundLen) or (FAlternateLen < MaxSoundLen)) and (Current <= Last) do
|
||
|
|
begin
|
||
|
|
case Current^ of
|
||
|
|
'B': ProcessB;
|
||
|
|
'C': ProcessC;
|
||
|
|
'D': ProcessD;
|
||
|
|
'F': ProcessF;
|
||
|
|
'G': ProcessG;
|
||
|
|
'H': ProcessH;
|
||
|
|
'J': ProcessJ;
|
||
|
|
'K': ProcessK;
|
||
|
|
'L': ProcessL;
|
||
|
|
'M': ProcessM;
|
||
|
|
'N': ProcessN;
|
||
|
|
'P': ProcessP;
|
||
|
|
'Q': ProcessQ;
|
||
|
|
'R': ProcessR;
|
||
|
|
'S': ProcessS;
|
||
|
|
'T': ProcessT;
|
||
|
|
'V': ProcessV;
|
||
|
|
'W': ProcessW;
|
||
|
|
'X': ProcessX;
|
||
|
|
'Z': ProcessZ;
|
||
|
|
//special case
|
||
|
|
#$00C7: // A C with a Cedilla
|
||
|
|
MetaphAdd('S');
|
||
|
|
#$00D1: // N with a tilde (spanish ene)
|
||
|
|
MetaphAdd('N');
|
||
|
|
end;
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.IsSlavoGermanic: Boolean;
|
||
|
|
begin
|
||
|
|
if not FIsSlavoGermanicReady then
|
||
|
|
begin
|
||
|
|
FIsSlavoGermanic := ContainsSubstr(Value, 'W') or ContainsSubstr(Value, 'K') or
|
||
|
|
ContainsSubstr(Value, 'CZ') or ContainsSubstr(Value, 'WITZ');
|
||
|
|
FIsSlavoGermanicReady := True;
|
||
|
|
end;
|
||
|
|
Result := FIsSlavoGermanic;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.IsVowel(P: PWideChar): Boolean;
|
||
|
|
begin
|
||
|
|
if (P >= Value) and (P <= Last) then
|
||
|
|
case P^ of
|
||
|
|
'A', 'E', 'I', 'O', 'U', 'Y': Result := True;
|
||
|
|
else
|
||
|
|
Result := False;
|
||
|
|
end
|
||
|
|
else
|
||
|
|
Result := False;
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.MetaphAdd(APrimary: WideChar; AAlternate: WideChar = #0);
|
||
|
|
begin
|
||
|
|
if APrimary <> #0 then
|
||
|
|
begin
|
||
|
|
MetaphPrimaryAdd(APrimary);
|
||
|
|
if AAlternate = #0 then
|
||
|
|
MetaphAlternateAdd(APrimary)
|
||
|
|
else
|
||
|
|
MetaphAlternateAdd(AAlternate);
|
||
|
|
end
|
||
|
|
else
|
||
|
|
MetaphAlternateAdd(AAlternate);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.MetaphPrimaryAdd(AChar: WideChar);
|
||
|
|
begin
|
||
|
|
if FPrimaryLen < MaxSoundLen then
|
||
|
|
begin
|
||
|
|
(FPrimary + FPrimaryLen)^ := AChar;
|
||
|
|
Inc(FPrimaryLen);
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.MetaphAlternateAdd(AChar: WideChar);
|
||
|
|
begin
|
||
|
|
if FAlternateLen < MaxSoundLen then
|
||
|
|
begin
|
||
|
|
(FAlternate + FAlternateLen)^ := AChar;
|
||
|
|
Inc(FAlternateLen);
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.AtEnd: Boolean;
|
||
|
|
begin
|
||
|
|
Result := Current = Last;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.AtStart: Boolean;
|
||
|
|
begin
|
||
|
|
Result := Current = Value;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.Contains(AStart: PWideChar; ALength: Integer;
|
||
|
|
APatterns: PWideChar): Boolean;
|
||
|
|
begin
|
||
|
|
Result := (AStart >= Value) and (AStart + ALength - 1 <= Last);
|
||
|
|
if Result then
|
||
|
|
begin
|
||
|
|
repeat
|
||
|
|
if SameChars(AStart, APatterns, ALength) then
|
||
|
|
begin
|
||
|
|
Result := True;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
Inc(APatterns, ALength + 1);
|
||
|
|
until APatterns^ = #0;
|
||
|
|
Result := False;
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.CurrentIndex: Integer;
|
||
|
|
begin
|
||
|
|
Result := Current - Value;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.GetAlternate: WideString;
|
||
|
|
begin
|
||
|
|
SetString(Result, FAlternate, FAlternateLen);
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.GetAlternateKey: Word;
|
||
|
|
begin
|
||
|
|
Result := GetMetaphoneKey(FAlternate, FAlternateLen);
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.GetPrimary: WideString;
|
||
|
|
begin
|
||
|
|
SetString(Result, FPrimary, FPrimaryLen);
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxDoubleMetaphone.GetPrimaryKey: Word;
|
||
|
|
begin
|
||
|
|
Result := GetMetaphoneKey(FPrimary, FPrimaryLen);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessB;
|
||
|
|
begin
|
||
|
|
//"-mb", e.g", "dumb", already skipped over...
|
||
|
|
MetaphAdd('P');
|
||
|
|
if (Current + 1)^ = 'B' then
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessC;
|
||
|
|
begin
|
||
|
|
//various germanic
|
||
|
|
if (CurrentIndex > 1) and not IsVowel(Current - 2) and Contains(Current - 1, 3, 'ACH;') and
|
||
|
|
(((Current + 2)^ <> 'I') and (((Current + 2)^ <> 'E') or Contains(Current - 2, 6, 'BACHER;MACHER;'))) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//special case 'caesar'
|
||
|
|
if AtStart and Contains(Current, 6, 'CAESAR;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('S');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//italian 'chianti'
|
||
|
|
if Contains(Current, 4, 'CHIA;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if Contains(Current, 2, 'CH;') then
|
||
|
|
begin
|
||
|
|
//find 'michael'
|
||
|
|
if (Current > Value) and Contains(Current, 4, 'CHAE;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K', 'X');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//greek roots e.g. 'chemistry', 'chorus'
|
||
|
|
if (AtStart and (Contains(Current + 1, 5, 'HARAC;HARIS;') or
|
||
|
|
Contains(Current + 1, 3, 'HOR;HYM;HIA;HEM;')) and not Contains(Current, 5, 'CHORE;')) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//germanic, greek, or otherwise 'ch' for 'kh' sound
|
||
|
|
if (Contains(Current, 4, 'VAN ;VON ;') or Contains(Current, 3, 'SCH;')) or
|
||
|
|
// 'architect but not 'arch', 'orchestra', 'orchid'
|
||
|
|
Contains(Current - 2, 6, 'ORCHES;ARCHIT;ORCHID;') or
|
||
|
|
CharInString((Current + 2)^, 'TS') or
|
||
|
|
(AtStart or CharInString((Current - 1)^, 'AOUE')) and
|
||
|
|
//e.g., 'wachtler', 'wechsler', but not 'tichner'
|
||
|
|
CharInString((Current + 2)^, 'LRNMBHFVW ') then
|
||
|
|
MetaphAdd('K')
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
if Current > Value then
|
||
|
|
begin
|
||
|
|
if Contains(Current, 2, 'MC;') then
|
||
|
|
//e.g., 'McHugh'
|
||
|
|
MetaphAdd('K')
|
||
|
|
else
|
||
|
|
MetaphAdd('X', 'K');
|
||
|
|
end
|
||
|
|
else
|
||
|
|
MetaphAdd('X');
|
||
|
|
end;
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//e.g, 'czerny'
|
||
|
|
if Contains(Current, 2, 'CZ;') and not Contains(Current - 2, 4, 'WICZ;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('S', 'X');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//e.g., 'focaccia'
|
||
|
|
if Contains(Current + 1, 3, 'CIA;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('X');
|
||
|
|
Inc(Current, 2);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//double 'C', but not if e.g. 'McClellan'
|
||
|
|
if Contains(Current, 2, 'CC;') and not ((CurrentIndex = 1) and (Value^ = 'M')) then
|
||
|
|
begin
|
||
|
|
//'bellocchio' but not 'bacchus'
|
||
|
|
if CharInString((Current + 2)^, 'IEH') and not Contains(Current + 2, 2, 'HU;') then
|
||
|
|
begin
|
||
|
|
//'accident', 'accede' 'succeed'
|
||
|
|
if (((CurrentIndex = 1) and ((Current - 1)^ = 'A'))
|
||
|
|
or Contains(Current - 1, 5, 'UCCEE;UCCES;')) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
MetaphAdd('S');
|
||
|
|
end
|
||
|
|
else
|
||
|
|
//'bacci', 'bertucci', other italian
|
||
|
|
MetaphAdd('X');
|
||
|
|
Inc(Current, 2);
|
||
|
|
end
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
//Pierce's rule
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if Contains(Current, 2, 'CK;CG;CQ;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if Contains(Current, 2, 'CI;CE;CY;') then
|
||
|
|
begin
|
||
|
|
//italian vs. english
|
||
|
|
if Contains(Current, 3, 'CIO;CIE;CIA;') then
|
||
|
|
MetaphAdd('S', 'X')
|
||
|
|
else
|
||
|
|
MetaphAdd('S');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//else
|
||
|
|
MetaphAdd('K');
|
||
|
|
|
||
|
|
//name sent in 'mac caffrey', 'mac gregor
|
||
|
|
if Contains(Current + 1, 2, ' C; Q; G;') then
|
||
|
|
Inc(Current, 2)
|
||
|
|
else
|
||
|
|
if CharInString((Value + 1)^, 'CKQ') and not Contains(Value + 1, 2, 'CE;CI;') then
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessD;
|
||
|
|
begin
|
||
|
|
if Contains(Current, 2, 'DG;') then
|
||
|
|
begin
|
||
|
|
if CharInString((Current + 2)^, 'IEY') then
|
||
|
|
begin
|
||
|
|
//e.g. 'edge'
|
||
|
|
MetaphAdd('J');
|
||
|
|
Inc(Current, 2);
|
||
|
|
end
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
//e.g. 'edgar'
|
||
|
|
MetaphAdd('T');
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if Contains(Current, 2, 'DT;DD;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('T');
|
||
|
|
Inc(Current);
|
||
|
|
end
|
||
|
|
else
|
||
|
|
MetaphAdd('T');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessF;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'F' then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('F');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessG;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'H' then
|
||
|
|
begin
|
||
|
|
if (CurrentIndex > 0) and not IsVowel(Current - 1) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//'ghislane', ghiradelli
|
||
|
|
if Current = Value then
|
||
|
|
begin
|
||
|
|
if (Current + 2)^ = 'I' then
|
||
|
|
MetaphAdd('J')
|
||
|
|
else
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//Parker's rule (with some further refinements) - e.g., 'hugh'
|
||
|
|
if (CurrentIndex > 1) and CharInString((Current - 2)^, 'BHD') or
|
||
|
|
//e.g., 'bough'
|
||
|
|
(CurrentIndex > 2) and CharInString((Current - 3)^, 'BHD') or
|
||
|
|
//e.g., 'broughton'
|
||
|
|
(CurrentIndex > 3) and CharInString((Current - 4)^, 'BH') then
|
||
|
|
Inc(Current)
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
//e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
|
||
|
|
if (CurrentIndex > 2) and ((Current - 1)^ = 'U') and CharInString((Current - 3)^, 'CGLRT') then
|
||
|
|
MetaphAdd('F')
|
||
|
|
else
|
||
|
|
if (CurrentIndex > 0) and ((Current - 1)^ <> 'I') then
|
||
|
|
MetaphAdd('K');
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if (Current + 1)^ = 'N' then
|
||
|
|
begin
|
||
|
|
if (CurrentIndex = 1) and IsVowel(Value) and not IsSlavoGermanic then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K', 'N');
|
||
|
|
MetaphAdd('N', #0);
|
||
|
|
end
|
||
|
|
else
|
||
|
|
//not e.g. 'cagney'
|
||
|
|
if not Contains(Current + 2, 2, 'EY;') and ((Current + 1)^ <> 'Y') and not IsSlavoGermanic then
|
||
|
|
begin
|
||
|
|
MetaphAdd('N', 'K');
|
||
|
|
MetaphAdd(#0, 'N');
|
||
|
|
end
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
MetaphAdd('N');
|
||
|
|
end;
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//'tagliaro'
|
||
|
|
if Contains(Current + 1, 2, 'LI;') and not IsSlavoGermanic then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K', 'L');
|
||
|
|
MetaphAdd('L', #0);
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//-ges-,-gep-,-gel-, -gie- at beginning
|
||
|
|
if (AtStart and (((Current + 1)^ = 'Y')
|
||
|
|
or Contains(Current + 1, 2, 'ES;EP;EB;EL;EY;IB;IL;IN;IE;EI;ER;'))) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K', 'J');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
// -ger-, -gy-
|
||
|
|
if (Contains(Current + 1, 2, 'ER;') or ((Current + 1)^ = 'Y')) and
|
||
|
|
not Contains(Current, 6, 'DANGER;RANGER;MANGER;') and
|
||
|
|
not ((CurrentIndex > 0) and ((Current - 1)^ = 'E') or ((Current - 1)^ = 'I')) and
|
||
|
|
not Contains(Current - 1, 3, 'RGY;OGY;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K', 'J');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
// italian e.g, 'biaggi'
|
||
|
|
if CharInString((Current + 1)^, 'EIY') or Contains(Current - 1, 4, 'AGGI;OGGI;') then
|
||
|
|
begin
|
||
|
|
//obvious germanic
|
||
|
|
if Contains(Current, 4, 'VAN ;VON ;') or
|
||
|
|
Contains(Current, 3, 'SCH;') or Contains(Current + 1, 2, 'ET;') then
|
||
|
|
MetaphAdd('K')
|
||
|
|
else
|
||
|
|
//always soft if french ending
|
||
|
|
if Contains(Current + 1, 4, 'IER ;') then
|
||
|
|
MetaphAdd('J')
|
||
|
|
else
|
||
|
|
MetaphAdd('J', 'K');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if (Current + 1)^ = 'G' then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('K');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessH;
|
||
|
|
begin
|
||
|
|
//only keep if first & before vowel or btw. 2 vowels
|
||
|
|
if (AtStart or IsVowel(Current - 1)) and IsVowel(Current + 1) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('H');
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessJ;
|
||
|
|
begin
|
||
|
|
//obvious spanish, 'jose', 'san jacinto'
|
||
|
|
if Contains(Current, 4, 'JOSE;') or Contains(Current, 4, 'SAN ;') then
|
||
|
|
begin
|
||
|
|
if (AtStart and ((Current + 4)^ = ' ')) or Contains(Current, 4, 'SAN ;') then
|
||
|
|
MetaphAdd('H')
|
||
|
|
else
|
||
|
|
MetaphAdd('J', 'H');
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if AtStart and not Contains(Current, 4, 'JOSE;') then
|
||
|
|
MetaphAdd('J', 'A') //Yankelovich/Jankelowicz
|
||
|
|
else
|
||
|
|
//spanish pron. of e.g. 'bajador'
|
||
|
|
if IsVowel(Current - 1) and not IsSlavoGermanic and (((Current + 1)^ = 'A') or ((Current + 1)^ = 'O')) then
|
||
|
|
MetaphAdd('J', 'H')
|
||
|
|
else
|
||
|
|
if AtEnd then
|
||
|
|
MetaphAdd('J', ' ')
|
||
|
|
else
|
||
|
|
if not CharInString((Current + 1)^, 'LTKSNMBZ') and
|
||
|
|
not ((CurrentIndex > 0) and CharInString((Current - 1)^, 'SKL')) then
|
||
|
|
MetaphAdd('J');
|
||
|
|
|
||
|
|
if (Current + 1)^ = 'J' then //it could happennot
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessK;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'K' then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('K');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessL;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'L' then
|
||
|
|
begin
|
||
|
|
//spanish e.g. 'cabrillo', 'gallegos'
|
||
|
|
if ((Current = Last - 2) and Contains(Current - 1, 4, 'ILLO;ILLA;ALLE;')) or
|
||
|
|
((Contains(Last - 1, 2, 'AS;OS;') or (Last^ = 'A') or (Last^ = 'O'))
|
||
|
|
and Contains(Current - 1, 4, 'ALLE;')) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('L', ' ');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
MetaphAdd('L');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessM;
|
||
|
|
begin
|
||
|
|
if Contains(Current - 1, 3, 'UMB;') and
|
||
|
|
((Current + 1 = Last) or Contains(Current + 2, 2, 'ER;'))
|
||
|
|
//'dumb','thumb'
|
||
|
|
or ((Current + 1)^ = 'M') then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('M');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessN;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'N' then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('N');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessP;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'H' then
|
||
|
|
begin
|
||
|
|
MetaphAdd('F');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//also account for 'campbell', 'raspberry'
|
||
|
|
if ((Current + 1)^ = 'P') or ((Current + 1)^ = 'B') then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('P');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessQ;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'Q' then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('K');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessR;
|
||
|
|
begin
|
||
|
|
//french e.g. 'rogier', but exclude 'hochmeier'
|
||
|
|
if AtEnd and not IsSlavoGermanic and Contains(Current - 2, 2, 'IE;')
|
||
|
|
and not Contains(Current - 4, 2, 'ME;MA;') then
|
||
|
|
MetaphAdd(#0, 'R')
|
||
|
|
else
|
||
|
|
MetaphAdd('R');
|
||
|
|
|
||
|
|
if (Current + 1)^ = 'R' then
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessS;
|
||
|
|
begin
|
||
|
|
//special cases 'island', 'isle', 'carlisle', 'carlysle'
|
||
|
|
if Contains(Current - 1, 3, 'ISL;YSL;') then
|
||
|
|
Exit;
|
||
|
|
|
||
|
|
//special case 'sugar-'
|
||
|
|
if AtStart and Contains(Current, 5, 'SUGAR;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('X', 'S');
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if Contains(Current, 2, 'SH;') then
|
||
|
|
begin
|
||
|
|
//germanic
|
||
|
|
if Contains(Current + 1, 4, 'HEIM;HOEK;HOLM;HOLZ;') then
|
||
|
|
MetaphAdd('S')
|
||
|
|
else
|
||
|
|
MetaphAdd('X');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//italian & armenian
|
||
|
|
if Contains(Current, 3, 'SIO;SIA;') or Contains(Current, 4, 'SIAN;') then
|
||
|
|
begin
|
||
|
|
if not IsSlavoGermanic then
|
||
|
|
MetaphAdd('S', 'X')
|
||
|
|
else
|
||
|
|
MetaphAdd('S');
|
||
|
|
Inc(Current, 2);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
|
||
|
|
//also, -sz- in slavic language altho in hungarian it is pronounced 's'
|
||
|
|
if (AtStart and CharInString((Current + 1)^, 'MNLW')) or ((Current + 1)^ = 'Z') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('S', 'X');
|
||
|
|
if (Current + 1)^ = 'Z' then
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if Contains(Current, 2, 'SC;') then
|
||
|
|
begin
|
||
|
|
//Schlesinger's rule
|
||
|
|
if (Current + 2)^ = 'H' then
|
||
|
|
begin
|
||
|
|
//dutch origin, e.g. 'school', 'schooner'
|
||
|
|
if Contains(Current + 3, 2, 'OO;ER;EN;UY;ED;EM;') then
|
||
|
|
begin
|
||
|
|
//'schermerhorn', 'schenker'
|
||
|
|
if Contains(Current + 3, 2, 'ER;EN;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('X', 'S');
|
||
|
|
MetaphAdd(#0, 'K');
|
||
|
|
end
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
MetaphAdd('S');
|
||
|
|
MetaphAdd('K');
|
||
|
|
end;
|
||
|
|
end
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
if AtStart and not IsVowel(Value + 3) and ((Current + 3)^ <> 'W') then
|
||
|
|
MetaphAdd('X', 'S')
|
||
|
|
else
|
||
|
|
MetaphAdd('X');
|
||
|
|
end;
|
||
|
|
Inc(Current, 2);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if CharInString((Current + 2)^, 'IEY') then
|
||
|
|
MetaphAdd('S')
|
||
|
|
else
|
||
|
|
begin
|
||
|
|
MetaphAdd('S');
|
||
|
|
MetaphAdd('K');
|
||
|
|
end;
|
||
|
|
Inc(Current, 2);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//french e.g. 'resnais', 'artois'
|
||
|
|
if AtEnd and Contains(Current - 2, 2, 'AI;OI;') then
|
||
|
|
MetaphAdd(#0, 'S')
|
||
|
|
else
|
||
|
|
MetaphAdd('S');
|
||
|
|
|
||
|
|
if ((Current + 1)^ = 'S') or ((Current + 1)^ = 'Z') then
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessT;
|
||
|
|
begin
|
||
|
|
if Contains(Current, 4, 'TION;') or Contains(Current, 3, 'TIA;TCH;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('X');
|
||
|
|
Inc(Current, 2);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if Contains(Current, 2, 'TH;') or Contains(Current, 3, 'TTH;') then
|
||
|
|
begin
|
||
|
|
//special case 'thomas', 'thames' or germanic
|
||
|
|
if Contains(Current + 2, 2, 'OM;AM;') or Contains(Current, 4, 'VAN ;VON ;')
|
||
|
|
or Contains(Current, 3, 'SCH;') then
|
||
|
|
MetaphAdd('T')
|
||
|
|
else
|
||
|
|
MetaphAdd('0', 'T');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
if ((Current + 1)^ = 'T') or ((Current + 1)^ = 'D') then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('T');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessV;
|
||
|
|
begin
|
||
|
|
if (Current + 1)^ = 'V' then
|
||
|
|
Inc(Current);
|
||
|
|
MetaphAdd('F');
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessW;
|
||
|
|
begin
|
||
|
|
//can also be in middle of word
|
||
|
|
if Contains(Current, 2, 'WR;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('R');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//Arnow should match Arnoff
|
||
|
|
if (AtEnd and IsVowel(Current - 1)) or
|
||
|
|
Contains(Current - 1, 5, 'EWSKI;EWSKY;OWSKI;OWSKY;') or
|
||
|
|
Contains(Current, 3, 'SCH;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd(#0, 'F');
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
|
||
|
|
//polish e.g. 'filipowicz'
|
||
|
|
if Contains(Current, 4, 'WICZ;WITZ;') then
|
||
|
|
begin
|
||
|
|
MetaphAdd('T', 'F');
|
||
|
|
MetaphAdd('S', 'X');
|
||
|
|
Inc(Current, 3);
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessX;
|
||
|
|
begin
|
||
|
|
//french e.g. breaux
|
||
|
|
if not (AtEnd and (Contains(Current - 3, 3, 'IAU;EAU;')
|
||
|
|
or Contains(Current - 2, 2, 'AU;OU;'))) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('K');
|
||
|
|
MetaphAdd('S');
|
||
|
|
end;
|
||
|
|
if ((Current + 1)^ = 'C') or ((Current + 1)^ = 'X') then
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.ProcessZ;
|
||
|
|
begin
|
||
|
|
//chinese pinyin e.g. 'zhao'
|
||
|
|
if (Current + 1)^ = 'H' then
|
||
|
|
begin
|
||
|
|
MetaphAdd('J');
|
||
|
|
Inc(Current);
|
||
|
|
Exit;
|
||
|
|
end
|
||
|
|
else
|
||
|
|
if Contains(Current + 1, 2, 'ZO;ZI;ZA;')
|
||
|
|
or (IsSlavoGermanic and ((CurrentIndex > 0) and ((Current - 1)^ <> 'T'))) then
|
||
|
|
begin
|
||
|
|
MetaphAdd('S', 'T');
|
||
|
|
MetaphAdd(#0, 'S');
|
||
|
|
end
|
||
|
|
else
|
||
|
|
MetaphAdd('S');
|
||
|
|
|
||
|
|
if (Current + 1)^ = 'Z' then
|
||
|
|
Inc(Current);
|
||
|
|
end;
|
||
|
|
|
||
|
|
procedure TdxDoubleMetaphone.Reset(P: PWideChar; ALength: Integer);
|
||
|
|
var
|
||
|
|
ATemp: PWideChar;
|
||
|
|
I: Integer;
|
||
|
|
begin
|
||
|
|
FLength := ALength;
|
||
|
|
ATemp := FValue;
|
||
|
|
I := 0;
|
||
|
|
while I < (Length + 6) do
|
||
|
|
begin
|
||
|
|
if I < Length then
|
||
|
|
(ATemp + I)^ := (P + I)^
|
||
|
|
else
|
||
|
|
if I = Length + 5 then
|
||
|
|
(ATemp + I)^ := #0
|
||
|
|
else
|
||
|
|
(ATemp + I)^ := ' ';
|
||
|
|
Inc(I);
|
||
|
|
end;
|
||
|
|
FLast := FValue + ALength - 1;
|
||
|
|
FPrimaryLen := 0;
|
||
|
|
FAlternateLen := 0;
|
||
|
|
Current := FValue;
|
||
|
|
FIsSlavoGermanicReady := False;
|
||
|
|
end;
|
||
|
|
|
||
|
|
{ TdxStringSimilarityCalculator }
|
||
|
|
|
||
|
|
function TdxStringSimilarityCalculator.GetDistance(AStr1: PWideChar; ALen1: Integer;
|
||
|
|
AStr2: PWideChar; ALen2: Integer): Integer;
|
||
|
|
var
|
||
|
|
AMatrix: array of array of Integer;
|
||
|
|
I, J: Integer;
|
||
|
|
begin
|
||
|
|
if (ALen1 = 0) or (ALen2 = 0) then
|
||
|
|
begin
|
||
|
|
Result := 0;
|
||
|
|
Exit;
|
||
|
|
end;
|
||
|
|
SetLength(AMatrix, ALen1 + 1, ALen2 + 1);
|
||
|
|
AMatrix[0, 0] := 0;
|
||
|
|
for I := 0 to ALen2 - 1 do
|
||
|
|
AMatrix[0, I + 1] := AMatrix[0, I] + InsertCost(AStr2 + I);
|
||
|
|
for I := 0 to ALen1 - 1 do
|
||
|
|
begin
|
||
|
|
AMatrix[I + 1, 0] := AMatrix[I, 0] + DeleteCost(AStr1 + I);
|
||
|
|
for J := 0 to ALen2 - 1 do
|
||
|
|
begin
|
||
|
|
AMatrix[I + 1, J + 1] := Minimum(
|
||
|
|
AMatrix[I, J] + SubstitutionCost(AStr1 + I, AStr2 + J),
|
||
|
|
AMatrix[I, J + 1] + DeleteCost(AStr1 + I),
|
||
|
|
AMatrix[I + 1, J] + InsertCost(AStr2 + J));
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
Result := AMatrix[ALen1, ALen2];
|
||
|
|
if Result > 0 then
|
||
|
|
begin
|
||
|
|
if AStr1^ <> AStr2^ then
|
||
|
|
Inc(Result);
|
||
|
|
if AStr1[ALen1 - 1] <> AStr2[ALen2 - 1] then
|
||
|
|
Inc(Result);
|
||
|
|
end;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxStringSimilarityCalculator.DeleteCost(S: PWideChar): Integer;
|
||
|
|
begin
|
||
|
|
Result := 2;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxStringSimilarityCalculator.InsertCost(S: PWideChar): Integer;
|
||
|
|
begin
|
||
|
|
Result := 2;
|
||
|
|
end;
|
||
|
|
|
||
|
|
function TdxStringSimilarityCalculator.SubstitutionCost(S1: PWideChar; S2: PWideChar): Integer;
|
||
|
|
begin
|
||
|
|
if S1^ = S2^ then
|
||
|
|
Result := 0
|
||
|
|
else
|
||
|
|
Result := 2;
|
||
|
|
end;
|
||
|
|
|
||
|
|
end.
|