Componentes.Terceros.DevExp.../official/x.38/ExpressSpellChecker/Sources/dxSpellCheckerAlgorithms.pas
2008-08-27 11:56:15 +00:00

979 lines
25 KiB
ObjectPascal

{********************************************************************}
{ }
{ Developer Express Visual Component Library }
{ ExpressSpellChecker }
{ }
{ Copyright (c) 1998-2008 Developer Express Inc. }
{ ALL RIGHTS RESERVED }
{ }
{ The entire contents of this file is protected by U.S. and }
{ International Copyright Laws. Unauthorized reproduction, }
{ reverse-engineering, and distribution of all or any portion of }
{ the code contained in this file is strictly prohibited and may }
{ result in severe civil and criminal penalties and will be }
{ prosecuted to the maximum extent possible under the law. }
{ }
{ RESTRICTIONS }
{ }
{ THIS SOURCE CODE AND ALL RESULTING INTERMEDIATE FILES }
{ (DCU, OBJ, DLL, ETC.) ARE CONFIDENTIAL AND PROPRIETARY TRADE }
{ SECRETS OF DEVELOPER EXPRESS INC. THE REGISTERED DEVELOPER IS }
{ LICENSED TO DISTRIBUTE THE EXPRESSSPELLCHECKER AND ALL }
{ ACCOMPANYING VCL CONTROLS AS PART OF AN EXECUTABLE PROGRAM ONLY. }
{ }
{ THE SOURCE CODE CONTAINED WITHIN THIS FILE AND ALL RELATED }
{ FILES OR ANY PORTION OF ITS CONTENTS SHALL AT NO TIME BE }
{ COPIED, TRANSFERRED, SOLD, DISTRIBUTED, OR OTHERWISE MADE }
{ AVAILABLE TO OTHER INDIVIDUALS WITHOUT EXPRESS WRITTEN CONSENT }
{ AND PERMISSION FROM DEVELOPER EXPRESS INC. }
{ }
{ CONSULT THE END USER LICENSE AGREEMENT FOR INFORMATION ON }
{ ADDITIONAL RESTRICTIONS. }
{ }
{********************************************************************}
unit dxSpellCheckerAlgorithms;
{$I cxVer.inc}
interface
type
{ TdxDoubleMetaphone }
TdxDoubleMetaphone = class
private
FAlternate: Boolean;
FCurrent: Integer;
FLast: Integer;
FPrimary: WideString;
FSecondary: WideString;
FValue: WideString;
procedure IncCurrent(ADelta: Integer = 1); {$IFDEF DELPHI9} inline; {$ENDIF}
procedure ProcessAEIOUY;
procedure ProcessB;
procedure ProcessC;
procedure ProcessD;
procedure ProcessF;
procedure ProcessG;
procedure ProcessH;
procedure ProcessJ;
procedure ProcessK;
procedure ProcessL;
procedure ProcessM;
procedure ProcessN;
procedure ProcessP;
procedure ProcessQ;
procedure ProcessR;
procedure ProcessS;
procedure ProcessT;
procedure ProcessV;
procedure ProcessW;
procedure ProcessX;
procedure ProcessZ;
procedure Reset;
protected
function StringAt(AStart, ALength: Integer; const ASubStr: WideString): Boolean; overload;
function StringAt(AStart, ALength: Integer; const ASubStrs: array of WideString): Boolean; overload;
function IsSlavoGermanic: Boolean;
function IsVowel(AIndex: Integer): Boolean;
procedure MetaphAdd(const AMain: WideString); overload; {$IFDEF DELPHI9} inline; {$ENDIF}
procedure MetaphAdd(const AMain, AAlt: WideString); overload; {$IFDEF DELPHI9} inline; {$ENDIF}
property Current: Integer read FCurrent;
property Last: Integer read FLast;
property Value: WideString read FValue;
public
procedure DoubleMetaphone(const S: WideString); virtual;
property Primary: WideString read FPrimary;
property Secondary: WideString read FSecondary;
end;
{ TdxStringSimilarityCalculator }
TdxStringSimilarityCalculator = class
protected
function DeleteCost(const S: WideString; APos: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
function InsertCost(const S: WideString; APos: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
function Minimum(A, B, C: Integer): Integer;
function SubstitutionCost(const S1: WideString; APos1: Integer;
const S2: WideString; APos2: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
public
function GetDistance(const S1, S2: WideString): Integer; virtual;
end;
implementation
uses
SysUtils;
{ TdxDoubleMetaphone }
procedure TdxDoubleMetaphone.DoubleMetaphone(const S: WideString);
begin
Reset;
FLast := Length(S);
if FLast = 0 then Exit;
//pad the original string so that we can index beyond the edge of the world
FValue := WideUpperCase(S) + ' ';
//skip these when at start of word
if StringAt(1, 2, ['GN', 'KN', 'PN', 'WR', 'PS']) then
IncCurrent;
//Initial 'X' is pronounced 'Z' e.g. 'Xavier'
if Value[1] = 'X' then
begin
MetaphAdd('S'); //'Z' maps to 'S'
IncCurrent;
end;
// main loop
while ((Length(Primary) < 4) or (Length(Secondary) < 4)) and (Current <= Last) do
case FValue[Current] of
'A', 'E', 'I', 'O', 'U', 'Y': ProcessAEIOUY;
'B': ProcessB;
'C': ProcessC;
'D': ProcessD;
'F': ProcessF;
'G': ProcessG;
'H': ProcessH;
'J': ProcessJ;
'K': ProcessK;
'L': ProcessL;
'M': ProcessM;
'N': ProcessN;
'P': ProcessP;
'Q': ProcessQ;
'R': ProcessR;
'S': ProcessS;
'T': ProcessT;
'V': ProcessV;
'W': ProcessW;
'X': ProcessX;
'Z': ProcessZ;
else
Inc(FCurrent);
end;
if Length(Primary) > 4 then
Delete(FPrimary, 1, 4);
if FAlternate and (Length(Secondary) > 4) then
Delete(FSecondary, 1, 4);
end;
function TdxDoubleMetaphone.IsSlavoGermanic: Boolean;
begin
Result := (Pos('W', Value) > 0) or (Pos('K', Value) > 0) or
(Pos('CZ', Value) > 0) or (Pos('WITZ', Value) > 0);
end;
function TdxDoubleMetaphone.IsVowel(AIndex: Integer): Boolean;
begin
Result := (AIndex > 0) and (AIndex <= Last);
if Result then
case Value[AIndex] of
'A', 'E', 'I', 'O', 'U', 'Y':;
else
Result := False;
end;
end;
procedure TdxDoubleMetaphone.MetaphAdd(const AMain: WideString);
begin
if Length(AMain) <> 0 then
begin
FPrimary := FPrimary + AMain;
FSecondary := FSecondary + AMain;
end;
end;
procedure TdxDoubleMetaphone.MetaphAdd(const AMain, AAlt: WideString);
begin
if Length(AMain) <> 0 then
FPrimary := FPrimary + AMain;
if Length(AAlt) <> 0 then
begin
FAlternate := True;
if AAlt[1] <> ' ' then
FSecondary := FSecondary + AAlt;
end
else
if (Length(AMain) <> 0) and (AMain[1] <> ' ') then
FSecondary := FSecondary + AMain;
end;
function TdxDoubleMetaphone.StringAt(AStart, ALength: Integer;
const ASubStr: WideString): Boolean;
begin
Result := (AStart > 0) and (Copy(Value, AStart, ALength) = ASubStr);
end;
function TdxDoubleMetaphone.StringAt(AStart, ALength: Integer;
const ASubStrs: array of WideString): Boolean;
var
I: Integer;
T: WideString;
begin
Result := False;
if AStart < 1 then Exit;
T := Copy(Value, AStart, ALength);
for I := Low(ASubStrs) to High(ASubStrs) do
if T = ASubStrs[I] then
begin
Result := True;
Break;
end;
end;
procedure TdxDoubleMetaphone.IncCurrent(ADelta: Integer = 1);
begin
Inc(FCurrent, ADelta);
end;
procedure TdxDoubleMetaphone.ProcessAEIOUY;
begin
if Current = 0 then
//all init vowels now map to 'A'
MetaphAdd('A');
Inc(FCurrent);
end;
procedure TdxDoubleMetaphone.ProcessB;
begin
//"-mb", e.g", "dumb", already skipped over...
MetaphAdd('P');
if Value[Current + 1] = 'B' then
IncCurrent(2)
else
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessC;
begin
//various germanic
if ((Current > 2) and not IsVowel(Current - 2) and StringAt(Current - 1, 3, 'ACH') and
((Value[Current + 2] <> 'I') and ((Value[Current + 2] <> 'E') or StringAt(Current - 2, 6, ['BACHER', 'MACHER'])) )) then
begin
MetaphAdd('K');
IncCurrent(2);
Exit;
end;
//special case 'caesar'
if (Current = 1) AND StringAt(Current, 6, 'CAESAR') then
begin
MetaphAdd('S');
IncCurrent(2);
Exit;
end;
//italian 'chianti'
if StringAt(Current, 4, 'CHIA') then
begin
MetaphAdd('K');
IncCurrent(2);
Exit;
end;
if StringAt(Current, 2, 'CH') then
begin
//find 'michael'
if (Current > 1) and StringAt(Current, 4, 'CHAE') then
begin
MetaphAdd('K', 'X');
IncCurrent(2);
Exit;
end;
//greek roots e.g. 'chemistry', 'chorus'
if ((Current = 1) and (StringAt(Current + 1, 5, ['HARAC', 'HARIS']) or
StringAt(Current + 1, 3, ['HOR', 'HYM', 'HIA', 'HEM'])) and not StringAt(1, 5, 'CHORE')) then
begin
MetaphAdd('K');
IncCurrent(2);
Exit;
end;
//germanic, greek, or otherwise 'ch' for 'kh' sound
if ((StringAt(1, 4, ['VAN ', 'VON ']) or StringAt(1, 3, 'SCH'))
// 'architect but not 'arch', 'orchestra', 'orchid'
or StringAt(Current - 2, 6, ['ORCHES', 'ARCHIT', 'ORCHID'])
or StringAt(Current + 2, 1, ['T', 'S'])
or ((StringAt(Current - 1, 1, ['A', 'O', 'U', 'E']) or (Current = 1))
//e.g., 'wachtler', 'wechsler', but not 'tichner'
and StringAt(Current + 2, 1, ['L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', ' ']))) then
MetaphAdd('K')
else
begin
if Current > 1 then
begin
if StringAt(1, 2, 'MC') then
//e.g., 'McHugh'
MetaphAdd('K')
else
MetaphAdd('X', 'K');
end
else
MetaphAdd('X');
end;
IncCurrent(2);
Exit;
end;
//e.g, 'czerny'
if StringAt(Current, 2, 'CZ') and not StringAt(Current - 2, 4, 'WICZ') then
begin
MetaphAdd('S', 'X');
IncCurrent(2);
Exit;
end;
//e.g., 'focaccia'
if StringAt((Current + 1), 3, 'CIA') then
begin
MetaphAdd('X');
IncCurrent(3);
Exit;
end;
//double 'C', but not if e.g. 'McClellan'
if StringAt(Current, 2, 'CC') and not ((Current = 2) and (Value[1] = 'M')) then
begin
//'bellocchio' but not 'bacchus'
if StringAt(Current + 2, 1, ['I', 'E', 'H']) and not StringAt(Current + 2, 2, 'HU') then
begin
//'accident', 'accede' 'succeed'
if (((Current = 2) and (Value[Current - 1] = 'A'))
or StringAt(Current - 1, 5, ['UCCEE', 'UCCES'])) then
MetaphAdd('KS')
else
//'bacci', 'bertucci', other italian
MetaphAdd('X');
IncCurrent(3);
end
else
begin
//Pierce's rule
MetaphAdd('K');
IncCurrent(2);
end;
Exit;
end;
if StringAt(Current, 2, ['CK', 'CG', 'CQ']) then
begin
MetaphAdd('K');
IncCurrent(2);
Exit;
end;
if StringAt(Current, 2, ['CI', 'CE', 'CY']) then
begin
//italian vs. english
if StringAt(Current, 3, ['CIO', 'CIE', 'CIA']) then
MetaphAdd('S', 'X')
else
MetaphAdd('S');
IncCurrent(2);
Exit;
end;
//else
MetaphAdd('K');
//name sent in 'mac caffrey', 'mac gregor
if StringAt(Current + 1, 2, [' C', ' Q', ' G']) then
IncCurrent(3)
else
if StringAt(Current + 1, 1, ['C', 'K', 'Q']) and not StringAt(Current + 1, 2, ['CE', 'CI']) then
IncCurrent(2)
else
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessD;
begin
if StringAt(Current, 2, 'DG') then
begin
if StringAt(Current + 2, 1, ['I', 'E', 'Y']) then
begin
//e.g. 'edge'
MetaphAdd('J');
IncCurrent(3);
end
else
begin
//e.g. 'edgar'
MetaphAdd('TK');
IncCurrent(2)
end;
Exit;
end;
if StringAt(current, 2, ['DT', 'DD']) then
begin
MetaphAdd('T');
IncCurrent(2);
end
else
begin
MetaphAdd('T');
IncCurrent;
end;
end;
procedure TdxDoubleMetaphone.ProcessF;
begin
if Value[Current + 1] = 'F' then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('F');
end;
procedure TdxDoubleMetaphone.ProcessG;
begin
if Value[Current + 1] = 'H' then
begin
if (Current > 1) and not IsVowel(Current - 1) then
begin
MetaphAdd('K');
IncCurrent(2);
Exit;
end;
if Current < 4 then
begin
//'ghislane', ghiradelli
if Current = 1 then
begin
if Value[Current + 2] = 'I' then
MetaphAdd('J')
else
MetaphAdd('K');
IncCurrent(2);
Exit;
end;
end;
//Parker's rule (with some further refinements) - e.g., 'hugh'
if (((Current > 2) and StringAt(Current - 2, 1, ['B', 'H', 'D']))
//e.g., 'bough'
or ((Current > 3) and StringAt(Current - 3, 1, ['B', 'H', 'D']))
//e.g., 'broughton'
or ((Current > 4) and StringAt(Current - 4, 1, ['B', 'H']))) then
IncCurrent(2)
else
begin
//e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
if (Current > 3) and (Value[Current - 1] = 'U')
and StringAt(Current - 3, 1, ['C', 'G', 'L', 'R', 'T']) then
MetaphAdd('F')
else
if (Current > 1) and (Value[Current - 1] <> 'I') then
MetaphAdd('K');
IncCurrent(2);
end;
Exit;
end;
if Value[Current + 1] = 'N' then
begin
if (Current = 2) and IsVowel(1) and not IsSlavoGermanic then
MetaphAdd('KN', 'N')
else
//not e.g. 'cagney'
if not StringAt(Current + 2, 2, 'EY') and (Value[Current + 1] <> 'Y') and not IsSlavoGermanic then
MetaphAdd('N', 'KN')
else
MetaphAdd('KN');
IncCurrent(2);
Exit;
end;
//'tagliaro'
if StringAt(Current + 1, 2, 'LI') and not IsSlavoGermanic then
begin
MetaphAdd('KL', 'L');
IncCurrent(2);
Exit;
end;
//-ges-,-gep-,-gel-, -gie- at beginning
if ((Current = 1) and ((Value[Current + 1] = 'Y')
or StringAt(Current + 1, 2, ['ES', 'EP', 'EB', 'EL', 'EY', 'IB', 'IL', 'IN', 'IE', 'EI', 'ER']))) then
begin
MetaphAdd('K', 'J');
IncCurrent(2);
Exit;
end;
// -ger-, -gy-
if ((StringAt(Current + 1, 2, 'ER') or (Value[Current + 1] = 'Y'))
and not StringAt(1, 6, ['DANGER', 'RANGER', 'MANGER'])
and not StringAt(Current - 1, 1, ['E', 'I'])
and not StringAt(Current - 1, 3, ['RGY', 'OGY'])) then
begin
MetaphAdd('K', 'J');
IncCurrent(2);
Exit;
end;
// italian e.g, 'biaggi'
if StringAt(Current + 1, 1, ['E', 'I', 'Y']) or StringAt(Current - 1, 4, ['AGGI', 'OGGI']) then
begin
//obvious germanic
if StringAt(1, 4, ['VAN ', 'VON ']) or StringAt(1, 3, 'SCH') or StringAt(Current + 1, 2, 'ET') then
MetaphAdd('K')
else
//always soft if french ending
if StringAt((Current + 1), 4, 'IER ') then
MetaphAdd('J')
else
MetaphAdd('J', 'K');
IncCurrent(2);
Exit;
end;
if Value[Current + 1] = 'G' then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('K');
end;
procedure TdxDoubleMetaphone.ProcessH;
begin
//only keep if first & before vowel or btw. 2 vowels
if ((Current = 1) or IsVowel(Current - 1)) and IsVowel(Current + 1) then
begin
MetaphAdd('H');
IncCurrent(2);
end
else //also takes care of 'HH'
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessJ;
begin
//obvious spanish, 'jose', 'san jacinto'
if StringAt(Current, 4, 'JOSE') or StringAt(1, 4, 'SAN ') then
begin
if ((Current = 1) and (Value[Current + 4] = ' ')) or StringAt(1, 4, 'SAN ') then
MetaphAdd('H')
else
MetaphAdd('J', 'H');
IncCurrent;
Exit;
end;
if (Current = 1) and not StringAt(Current, 4, 'JOSE') then
MetaphAdd('J', 'A') //Yankelovich/Jankelowicz
else
//spanish pron. of e.g. 'bajador'
if IsVowel(Current - 1) and not IsSlavoGermanic and ((Value[Current + 1] = 'A') or (Value[Current + 1] = 'O')) then
MetaphAdd('J', 'H')
else
if Current = Last then
MetaphAdd('J', ' ')
else
if not StringAt(Current + 1, 1, ['L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'])
and not StringAt(Current - 1, 1, ['S', 'K', 'L']) then
MetaphAdd('J');
if Value[Current + 1] = 'J' then //it could happennot
IncCurrent(2)
else
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessK;
begin
if Value[Current + 1] = 'K' then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('K');
end;
procedure TdxDoubleMetaphone.ProcessL;
begin
if Value[Current + 1] = 'L' then
begin
//!!!TODO: check Last - 2 bellow
//spanish e.g. 'cabrillo', 'gallegos'
if ((Current = (Last - 2)) and StringAt(Current - 1, 4, ['ILLO', 'ILLA', 'ALLE']))
or ((StringAt(Last - 1, 2, ['AS', 'OS']) or StringAt(Last, 1, ['A', 'O']))
and StringAt(Current - 1, 4, 'ALLE')) then
begin
MetaphAdd('L', ' ');
IncCurrent(2);
Exit;
end;
IncCurrent(2);
end
else
IncCurrent;
MetaphAdd('L');
end;
procedure TdxDoubleMetaphone.ProcessM;
begin
if (StringAt(Current - 1, 3, 'UMB')
and ((Current + 1 = Last) or StringAt(Current + 2, 2, 'ER')))
//'dumb','thumb'
or (Value[Current + 1] = 'M') then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('M');
end;
procedure TdxDoubleMetaphone.ProcessN;
begin
if Value[Current + 1] = 'N' then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('N');
end;
procedure TdxDoubleMetaphone.ProcessP;
begin
if Value[Current + 1] = 'H' then
begin
MetaphAdd('F');
IncCurrent(2);
Exit;
end;
//also account for 'campbell', 'raspberry'
if StringAt(Current + 1, 1, ['P', 'B']) then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('P');
end;
procedure TdxDoubleMetaphone.ProcessQ;
begin
if Value[Current + 1] = 'Q' then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('K');
end;
procedure TdxDoubleMetaphone.ProcessR;
begin
//french e.g. 'rogier', but exclude 'hochmeier'
if (Current = Last) and not IsSlavoGermanic and StringAt(Current - 2, 2, 'IE')
and not StringAt(Current - 4, 2, ['ME', 'MA']) then
MetaphAdd('', 'R')
else
MetaphAdd('R');
if Value[Current + 1] = 'R' then
IncCurrent(2)
else
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessS;
begin
//special cases 'island', 'isle', 'carlisle', 'carlysle'
if StringAt(Current - 1, 3, ['ISL', 'YSL']) then
begin
IncCurrent;
Exit;
end;
//special case 'sugar-'
if (Current = 1) and StringAt(Current, 5, 'SUGAR') then
begin
MetaphAdd('X', 'S');
IncCurrent;
Exit;
end;
if StringAt(Current, 2, 'SH') then
begin
//germanic
if StringAt(Current + 1, 4, ['HEIM', 'HOEK', 'HOLM', 'HOLZ']) then
MetaphAdd('S')
else
MetaphAdd('X');
IncCurrent(2);
Exit;
end;
//italian & armenian
if StringAt(Current, 3, ['SIO', 'SIA']) or StringAt(Current, 4, 'SIAN') then
begin
if not IsSlavoGermanic then
MetaphAdd('S', 'X')
else
MetaphAdd('S');
IncCurrent(3);
Exit;
end;
//german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
//also, -sz- in slavic language altho in hungarian it is pronounced 's'
if ((Current = 1) and StringAt(Current + 1, 1, ['M', 'N', 'L', 'W']))
or StringAt(Current + 1, 1, 'Z') then
begin
MetaphAdd('S', 'X');
if StringAt(Current + 1, 1, 'Z') then
IncCurrent(2)
else
IncCurrent;
Exit;
end;
if StringAt(Current, 2, 'SC') then
begin
//Schlesinger's rule
if Value[Current + 2] = 'H' then
begin
//dutch origin, e.g. 'school', 'schooner'
if StringAt(Current + 3, 2, ['OO', 'ER', 'EN', 'UY', 'ED', 'EM']) then
begin
//'schermerhorn', 'schenker'
if StringAt(Current + 3, 2, ['ER', 'EN']) then
MetaphAdd('X', 'SK')
else
MetaphAdd('SK');
end
else
begin
if (Current = 1) and not IsVowel(4) and (Value[4] <> 'W') then
MetaphAdd('X', 'S')
else
MetaphAdd('X');
end;
IncCurrent(3);
Exit;
end;
if StringAt(Current + 2, 1, ['I', 'E', 'Y']) then
begin
MetaphAdd('S');
IncCurrent(3);
Exit;
end;
//else
MetaphAdd('SK');
IncCurrent(3);
Exit;
end;
//french e.g. 'resnais', 'artois'
if (Current = Last) and StringAt(Current - 2, 2, ['AI', 'OI']) then
MetaphAdd('', 'S')
else
MetaphAdd('S');
if StringAt(Current + 1, 1, ['S', 'Z']) then
IncCurrent(2)
else
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessT;
begin
if StringAt(Current, 4, 'TION') or StringAt(Current, 3, ['TIA', 'TCH']) then
begin
MetaphAdd('X');
IncCurrent(3);
Exit;
end;
if StringAt(Current, 2, 'TH') or StringAt(Current, 3, 'TTH') then
begin
//special case 'thomas', 'thames' or germanic
if StringAt(Current + 2, 2, ['OM', 'AM']) or StringAt(1, 4, ['VAN ', 'VON '])
or StringAt(1, 3, 'SCH') then
MetaphAdd('T')
else
MetaphAdd('0', 'T');
IncCurrent(2);
Exit;
end;
if StringAt(Current + 1, 1, ['T', 'D']) then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('T');
end;
procedure TdxDoubleMetaphone.ProcessV;
begin
if Value[Current + 1] = 'V' then
IncCurrent(2)
else
IncCurrent;
MetaphAdd('F');
end;
procedure TdxDoubleMetaphone.ProcessW;
begin
//can also be in middle of word
if StringAt(Current, 2, 'WR') then
begin
MetaphAdd('R');
IncCurrent(2);
Exit;
end;
if (Current = 1) and (IsVowel(Current + 1) or StringAt(Current, 2, 'WH')) then
begin
//Wasserman should match Vasserman
if IsVowel(Current + 1) then
MetaphAdd('A', 'F')
else
//need Uomo to match Womo
MetaphAdd('A');
end;
//Arnow should match Arnoff
if ((Current = Last) and IsVowel(Current - 1))
or StringAt(Current - 1, 5, ['EWSKI', 'EWSKY', 'OWSKI', 'OWSKY']) or StringAt(1, 3, 'SCH') then
begin
MetaphAdd('', 'F');
IncCurrent;
Exit;
end;
//polish e.g. 'filipowicz'
if StringAt(Current, 4, ['WICZ', 'WITZ']) then
begin
MetaphAdd('TS', 'FX');
IncCurrent(4);
Exit;
end;
//else skip it
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessX;
begin
//french e.g. breaux
if not ((Current = Last) and (StringAt(Current - 3, 3, ['IAU', 'EAU'])
or StringAt(Current - 2, 2, ['AU', 'OU']))) then
MetaphAdd('KS');
if StringAt(Current + 1, 1, ['C', 'X']) then
IncCurrent(2)
else
IncCurrent;
end;
procedure TdxDoubleMetaphone.ProcessZ;
begin
//chinese pinyin e.g. 'zhao'
if Value[Current + 1] = 'H' then
begin
MetaphAdd('J');
IncCurrent(2);
Exit;
end
else
if StringAt(Current + 1, 2, ['ZO', 'ZI', 'ZA'])
or (IsSlavoGermanic and ((Current > 1) and (Value[Current - 1] <> 'T'))) then
MetaphAdd('S', 'TS')
else
MetaphAdd('S');
if Value[Current + 1] = 'Z' then
IncCurrent(2)
else
IncCurrent;
end;
procedure TdxDoubleMetaphone.Reset;
begin
FCurrent := 1;
FAlternate := False;
FPrimary := '';
FSecondary := '';
end;
{ TdxStringSimilarityCalculator }
function TdxStringSimilarityCalculator.GetDistance(const S1, S2: WideString): Integer;
var
AMatrix: array of array of Integer;
ALoString1, ALoString2: WideString;
I, J, ALen1, ALen2: Integer;
begin
if (Length(S1) = 0) or (Length(S2) = 0) then
begin
Result := 0;
Exit;
end;
ALoString1 := WideLowerCase(S1);
ALoString2 := WideLowerCase(S2);
ALen1 := Length(ALoString1);
ALen2 := Length(ALoString2);
SetLength(AMatrix, ALen1 + 1, ALen2 + 1);
AMatrix[0, 0] := 0;
for I := 0 to ALen2 - 1 do
AMatrix[0, I + 1] := AMatrix[0, I] + InsertCost(ALoString2, I + 1);
for I := 0 to ALen1 - 1 do
begin
AMatrix[I + 1, 0] := AMatrix[I, 0] + DeleteCost(ALoString1, I + 1);
for J := 0 to ALen2 - 1 do
begin
AMatrix[I + 1, J + 1] := Minimum(
AMatrix[I, J] + SubstitutionCost(ALoString1, I + 1, ALoString2, J + 1),
AMatrix[I, J + 1] + DeleteCost(ALoString1, I + 1),
AMatrix[I + 1, J] + InsertCost(ALoString2, J + 1));
end;
end;
Result := AMatrix[ALen1, ALen2];
if Result > 0 then
begin
if ALoString1[1] <> ALoString2[1] then
Inc(Result);
if ALoString1[ALen1] <> ALoString2[ALen2] then
Inc(Result);
end;
end;
function TdxStringSimilarityCalculator.DeleteCost(const S: WideString;
APos: Integer): Integer;
begin
Result := 2;
end;
function TdxStringSimilarityCalculator.InsertCost(const S: WideString;
APos: Integer): Integer;
begin
Result := 2;
end;
function TdxStringSimilarityCalculator.Minimum(A, B, C: Integer): Integer;
begin
Result := A;
if B < A then
Result := B;
if C < Result then
Result := C;
end;
function TdxStringSimilarityCalculator.SubstitutionCost(const S1: WideString;
APos1: Integer; const S2: WideString; APos2: Integer): Integer;
begin
if S1[APos1] = S2[APos2] then
Result := 0
else
Result := 2;
end;
end.