git-svn-id: https://192.168.0.254/svn/Componentes.Terceros.DevExpressVCL@29 05c56307-c608-d34a-929d-697000501d7a
979 lines
25 KiB
ObjectPascal
979 lines
25 KiB
ObjectPascal
{********************************************************************}
|
|
{ }
|
|
{ Developer Express Visual Component Library }
|
|
{ ExpressSpellChecker }
|
|
{ }
|
|
{ Copyright (c) 1998-2008 Developer Express Inc. }
|
|
{ ALL RIGHTS RESERVED }
|
|
{ }
|
|
{ The entire contents of this file is protected by U.S. and }
|
|
{ International Copyright Laws. Unauthorized reproduction, }
|
|
{ reverse-engineering, and distribution of all or any portion of }
|
|
{ the code contained in this file is strictly prohibited and may }
|
|
{ result in severe civil and criminal penalties and will be }
|
|
{ prosecuted to the maximum extent possible under the law. }
|
|
{ }
|
|
{ RESTRICTIONS }
|
|
{ }
|
|
{ THIS SOURCE CODE AND ALL RESULTING INTERMEDIATE FILES }
|
|
{ (DCU, OBJ, DLL, ETC.) ARE CONFIDENTIAL AND PROPRIETARY TRADE }
|
|
{ SECRETS OF DEVELOPER EXPRESS INC. THE REGISTERED DEVELOPER IS }
|
|
{ LICENSED TO DISTRIBUTE THE EXPRESSSPELLCHECKER AND ALL }
|
|
{ ACCOMPANYING VCL CONTROLS AS PART OF AN EXECUTABLE PROGRAM ONLY. }
|
|
{ }
|
|
{ THE SOURCE CODE CONTAINED WITHIN THIS FILE AND ALL RELATED }
|
|
{ FILES OR ANY PORTION OF ITS CONTENTS SHALL AT NO TIME BE }
|
|
{ COPIED, TRANSFERRED, SOLD, DISTRIBUTED, OR OTHERWISE MADE }
|
|
{ AVAILABLE TO OTHER INDIVIDUALS WITHOUT EXPRESS WRITTEN CONSENT }
|
|
{ AND PERMISSION FROM DEVELOPER EXPRESS INC. }
|
|
{ }
|
|
{ CONSULT THE END USER LICENSE AGREEMENT FOR INFORMATION ON }
|
|
{ ADDITIONAL RESTRICTIONS. }
|
|
{ }
|
|
{********************************************************************}
|
|
|
|
unit dxSpellCheckerAlgorithms;
|
|
|
|
{$I cxVer.inc}
|
|
|
|
interface
|
|
|
|
type
|
|
{ TdxDoubleMetaphone }
|
|
|
|
TdxDoubleMetaphone = class
|
|
private
|
|
FAlternate: Boolean;
|
|
FCurrent: Integer;
|
|
FLast: Integer;
|
|
FPrimary: WideString;
|
|
FSecondary: WideString;
|
|
FValue: WideString;
|
|
procedure IncCurrent(ADelta: Integer = 1); {$IFDEF DELPHI9} inline; {$ENDIF}
|
|
procedure ProcessAEIOUY;
|
|
procedure ProcessB;
|
|
procedure ProcessC;
|
|
procedure ProcessD;
|
|
procedure ProcessF;
|
|
procedure ProcessG;
|
|
procedure ProcessH;
|
|
procedure ProcessJ;
|
|
procedure ProcessK;
|
|
procedure ProcessL;
|
|
procedure ProcessM;
|
|
procedure ProcessN;
|
|
procedure ProcessP;
|
|
procedure ProcessQ;
|
|
procedure ProcessR;
|
|
procedure ProcessS;
|
|
procedure ProcessT;
|
|
procedure ProcessV;
|
|
procedure ProcessW;
|
|
procedure ProcessX;
|
|
procedure ProcessZ;
|
|
procedure Reset;
|
|
protected
|
|
function StringAt(AStart, ALength: Integer; const ASubStr: WideString): Boolean; overload;
|
|
function StringAt(AStart, ALength: Integer; const ASubStrs: array of WideString): Boolean; overload;
|
|
function IsSlavoGermanic: Boolean;
|
|
function IsVowel(AIndex: Integer): Boolean;
|
|
procedure MetaphAdd(const AMain: WideString); overload; {$IFDEF DELPHI9} inline; {$ENDIF}
|
|
procedure MetaphAdd(const AMain, AAlt: WideString); overload; {$IFDEF DELPHI9} inline; {$ENDIF}
|
|
|
|
property Current: Integer read FCurrent;
|
|
property Last: Integer read FLast;
|
|
property Value: WideString read FValue;
|
|
public
|
|
procedure DoubleMetaphone(const S: WideString); virtual;
|
|
|
|
property Primary: WideString read FPrimary;
|
|
property Secondary: WideString read FSecondary;
|
|
end;
|
|
|
|
{ TdxStringSimilarityCalculator }
|
|
|
|
TdxStringSimilarityCalculator = class
|
|
protected
|
|
function DeleteCost(const S: WideString; APos: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
|
function InsertCost(const S: WideString; APos: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
|
function Minimum(A, B, C: Integer): Integer;
|
|
function SubstitutionCost(const S1: WideString; APos1: Integer;
|
|
const S2: WideString; APos2: Integer): Integer; {$IFDEF DELPHI9} inline; {$ENDIF}
|
|
public
|
|
function GetDistance(const S1, S2: WideString): Integer; virtual;
|
|
end;
|
|
|
|
implementation
|
|
|
|
uses
|
|
SysUtils;
|
|
|
|
{ TdxDoubleMetaphone }
|
|
|
|
procedure TdxDoubleMetaphone.DoubleMetaphone(const S: WideString);
|
|
begin
|
|
Reset;
|
|
FLast := Length(S);
|
|
if FLast = 0 then Exit;
|
|
|
|
//pad the original string so that we can index beyond the edge of the world
|
|
FValue := WideUpperCase(S) + ' ';
|
|
|
|
//skip these when at start of word
|
|
if StringAt(1, 2, ['GN', 'KN', 'PN', 'WR', 'PS']) then
|
|
IncCurrent;
|
|
|
|
//Initial 'X' is pronounced 'Z' e.g. 'Xavier'
|
|
if Value[1] = 'X' then
|
|
begin
|
|
MetaphAdd('S'); //'Z' maps to 'S'
|
|
IncCurrent;
|
|
end;
|
|
|
|
// main loop
|
|
while ((Length(Primary) < 4) or (Length(Secondary) < 4)) and (Current <= Last) do
|
|
case FValue[Current] of
|
|
'A', 'E', 'I', 'O', 'U', 'Y': ProcessAEIOUY;
|
|
'B': ProcessB;
|
|
'C': ProcessC;
|
|
'D': ProcessD;
|
|
'F': ProcessF;
|
|
'G': ProcessG;
|
|
'H': ProcessH;
|
|
'J': ProcessJ;
|
|
'K': ProcessK;
|
|
'L': ProcessL;
|
|
'M': ProcessM;
|
|
'N': ProcessN;
|
|
'P': ProcessP;
|
|
'Q': ProcessQ;
|
|
'R': ProcessR;
|
|
'S': ProcessS;
|
|
'T': ProcessT;
|
|
'V': ProcessV;
|
|
'W': ProcessW;
|
|
'X': ProcessX;
|
|
'Z': ProcessZ;
|
|
else
|
|
Inc(FCurrent);
|
|
end;
|
|
if Length(Primary) > 4 then
|
|
Delete(FPrimary, 1, 4);
|
|
if FAlternate and (Length(Secondary) > 4) then
|
|
Delete(FSecondary, 1, 4);
|
|
end;
|
|
|
|
function TdxDoubleMetaphone.IsSlavoGermanic: Boolean;
|
|
begin
|
|
Result := (Pos('W', Value) > 0) or (Pos('K', Value) > 0) or
|
|
(Pos('CZ', Value) > 0) or (Pos('WITZ', Value) > 0);
|
|
end;
|
|
|
|
function TdxDoubleMetaphone.IsVowel(AIndex: Integer): Boolean;
|
|
begin
|
|
Result := (AIndex > 0) and (AIndex <= Last);
|
|
if Result then
|
|
case Value[AIndex] of
|
|
'A', 'E', 'I', 'O', 'U', 'Y':;
|
|
else
|
|
Result := False;
|
|
end;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.MetaphAdd(const AMain: WideString);
|
|
begin
|
|
if Length(AMain) <> 0 then
|
|
begin
|
|
FPrimary := FPrimary + AMain;
|
|
FSecondary := FSecondary + AMain;
|
|
end;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.MetaphAdd(const AMain, AAlt: WideString);
|
|
begin
|
|
if Length(AMain) <> 0 then
|
|
FPrimary := FPrimary + AMain;
|
|
if Length(AAlt) <> 0 then
|
|
begin
|
|
FAlternate := True;
|
|
if AAlt[1] <> ' ' then
|
|
FSecondary := FSecondary + AAlt;
|
|
end
|
|
else
|
|
if (Length(AMain) <> 0) and (AMain[1] <> ' ') then
|
|
FSecondary := FSecondary + AMain;
|
|
end;
|
|
|
|
function TdxDoubleMetaphone.StringAt(AStart, ALength: Integer;
|
|
const ASubStr: WideString): Boolean;
|
|
begin
|
|
Result := (AStart > 0) and (Copy(Value, AStart, ALength) = ASubStr);
|
|
end;
|
|
|
|
function TdxDoubleMetaphone.StringAt(AStart, ALength: Integer;
|
|
const ASubStrs: array of WideString): Boolean;
|
|
var
|
|
I: Integer;
|
|
T: WideString;
|
|
begin
|
|
Result := False;
|
|
if AStart < 1 then Exit;
|
|
T := Copy(Value, AStart, ALength);
|
|
for I := Low(ASubStrs) to High(ASubStrs) do
|
|
if T = ASubStrs[I] then
|
|
begin
|
|
Result := True;
|
|
Break;
|
|
end;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.IncCurrent(ADelta: Integer = 1);
|
|
begin
|
|
Inc(FCurrent, ADelta);
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessAEIOUY;
|
|
begin
|
|
if Current = 0 then
|
|
//all init vowels now map to 'A'
|
|
MetaphAdd('A');
|
|
Inc(FCurrent);
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessB;
|
|
begin
|
|
//"-mb", e.g", "dumb", already skipped over...
|
|
MetaphAdd('P');
|
|
|
|
if Value[Current + 1] = 'B' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessC;
|
|
begin
|
|
//various germanic
|
|
if ((Current > 2) and not IsVowel(Current - 2) and StringAt(Current - 1, 3, 'ACH') and
|
|
((Value[Current + 2] <> 'I') and ((Value[Current + 2] <> 'E') or StringAt(Current - 2, 6, ['BACHER', 'MACHER'])) )) then
|
|
begin
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//special case 'caesar'
|
|
if (Current = 1) AND StringAt(Current, 6, 'CAESAR') then
|
|
begin
|
|
MetaphAdd('S');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//italian 'chianti'
|
|
if StringAt(Current, 4, 'CHIA') then
|
|
begin
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current, 2, 'CH') then
|
|
begin
|
|
//find 'michael'
|
|
if (Current > 1) and StringAt(Current, 4, 'CHAE') then
|
|
begin
|
|
MetaphAdd('K', 'X');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//greek roots e.g. 'chemistry', 'chorus'
|
|
if ((Current = 1) and (StringAt(Current + 1, 5, ['HARAC', 'HARIS']) or
|
|
StringAt(Current + 1, 3, ['HOR', 'HYM', 'HIA', 'HEM'])) and not StringAt(1, 5, 'CHORE')) then
|
|
begin
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//germanic, greek, or otherwise 'ch' for 'kh' sound
|
|
if ((StringAt(1, 4, ['VAN ', 'VON ']) or StringAt(1, 3, 'SCH'))
|
|
// 'architect but not 'arch', 'orchestra', 'orchid'
|
|
or StringAt(Current - 2, 6, ['ORCHES', 'ARCHIT', 'ORCHID'])
|
|
or StringAt(Current + 2, 1, ['T', 'S'])
|
|
or ((StringAt(Current - 1, 1, ['A', 'O', 'U', 'E']) or (Current = 1))
|
|
//e.g., 'wachtler', 'wechsler', but not 'tichner'
|
|
and StringAt(Current + 2, 1, ['L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', ' ']))) then
|
|
MetaphAdd('K')
|
|
else
|
|
begin
|
|
if Current > 1 then
|
|
begin
|
|
if StringAt(1, 2, 'MC') then
|
|
//e.g., 'McHugh'
|
|
MetaphAdd('K')
|
|
else
|
|
MetaphAdd('X', 'K');
|
|
end
|
|
else
|
|
MetaphAdd('X');
|
|
end;
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//e.g, 'czerny'
|
|
if StringAt(Current, 2, 'CZ') and not StringAt(Current - 2, 4, 'WICZ') then
|
|
begin
|
|
MetaphAdd('S', 'X');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//e.g., 'focaccia'
|
|
if StringAt((Current + 1), 3, 'CIA') then
|
|
begin
|
|
MetaphAdd('X');
|
|
IncCurrent(3);
|
|
Exit;
|
|
end;
|
|
|
|
//double 'C', but not if e.g. 'McClellan'
|
|
if StringAt(Current, 2, 'CC') and not ((Current = 2) and (Value[1] = 'M')) then
|
|
begin
|
|
//'bellocchio' but not 'bacchus'
|
|
if StringAt(Current + 2, 1, ['I', 'E', 'H']) and not StringAt(Current + 2, 2, 'HU') then
|
|
begin
|
|
//'accident', 'accede' 'succeed'
|
|
if (((Current = 2) and (Value[Current - 1] = 'A'))
|
|
or StringAt(Current - 1, 5, ['UCCEE', 'UCCES'])) then
|
|
MetaphAdd('KS')
|
|
else
|
|
//'bacci', 'bertucci', other italian
|
|
MetaphAdd('X');
|
|
IncCurrent(3);
|
|
end
|
|
else
|
|
begin
|
|
//Pierce's rule
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
end;
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current, 2, ['CK', 'CG', 'CQ']) then
|
|
begin
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current, 2, ['CI', 'CE', 'CY']) then
|
|
begin
|
|
//italian vs. english
|
|
if StringAt(Current, 3, ['CIO', 'CIE', 'CIA']) then
|
|
MetaphAdd('S', 'X')
|
|
else
|
|
MetaphAdd('S');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//else
|
|
MetaphAdd('K');
|
|
|
|
//name sent in 'mac caffrey', 'mac gregor
|
|
if StringAt(Current + 1, 2, [' C', ' Q', ' G']) then
|
|
IncCurrent(3)
|
|
else
|
|
if StringAt(Current + 1, 1, ['C', 'K', 'Q']) and not StringAt(Current + 1, 2, ['CE', 'CI']) then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessD;
|
|
begin
|
|
if StringAt(Current, 2, 'DG') then
|
|
begin
|
|
if StringAt(Current + 2, 1, ['I', 'E', 'Y']) then
|
|
begin
|
|
//e.g. 'edge'
|
|
MetaphAdd('J');
|
|
IncCurrent(3);
|
|
end
|
|
else
|
|
begin
|
|
//e.g. 'edgar'
|
|
MetaphAdd('TK');
|
|
IncCurrent(2)
|
|
end;
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(current, 2, ['DT', 'DD']) then
|
|
begin
|
|
MetaphAdd('T');
|
|
IncCurrent(2);
|
|
end
|
|
else
|
|
begin
|
|
MetaphAdd('T');
|
|
IncCurrent;
|
|
end;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessF;
|
|
begin
|
|
if Value[Current + 1] = 'F' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('F');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessG;
|
|
begin
|
|
if Value[Current + 1] = 'H' then
|
|
begin
|
|
if (Current > 1) and not IsVowel(Current - 1) then
|
|
begin
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
if Current < 4 then
|
|
begin
|
|
//'ghislane', ghiradelli
|
|
if Current = 1 then
|
|
begin
|
|
if Value[Current + 2] = 'I' then
|
|
MetaphAdd('J')
|
|
else
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
end;
|
|
//Parker's rule (with some further refinements) - e.g., 'hugh'
|
|
if (((Current > 2) and StringAt(Current - 2, 1, ['B', 'H', 'D']))
|
|
//e.g., 'bough'
|
|
or ((Current > 3) and StringAt(Current - 3, 1, ['B', 'H', 'D']))
|
|
//e.g., 'broughton'
|
|
or ((Current > 4) and StringAt(Current - 4, 1, ['B', 'H']))) then
|
|
IncCurrent(2)
|
|
else
|
|
begin
|
|
//e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
|
|
if (Current > 3) and (Value[Current - 1] = 'U')
|
|
and StringAt(Current - 3, 1, ['C', 'G', 'L', 'R', 'T']) then
|
|
MetaphAdd('F')
|
|
else
|
|
if (Current > 1) and (Value[Current - 1] <> 'I') then
|
|
MetaphAdd('K');
|
|
IncCurrent(2);
|
|
end;
|
|
Exit;
|
|
end;
|
|
|
|
if Value[Current + 1] = 'N' then
|
|
begin
|
|
if (Current = 2) and IsVowel(1) and not IsSlavoGermanic then
|
|
MetaphAdd('KN', 'N')
|
|
else
|
|
//not e.g. 'cagney'
|
|
if not StringAt(Current + 2, 2, 'EY') and (Value[Current + 1] <> 'Y') and not IsSlavoGermanic then
|
|
MetaphAdd('N', 'KN')
|
|
else
|
|
MetaphAdd('KN');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//'tagliaro'
|
|
if StringAt(Current + 1, 2, 'LI') and not IsSlavoGermanic then
|
|
begin
|
|
MetaphAdd('KL', 'L');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//-ges-,-gep-,-gel-, -gie- at beginning
|
|
if ((Current = 1) and ((Value[Current + 1] = 'Y')
|
|
or StringAt(Current + 1, 2, ['ES', 'EP', 'EB', 'EL', 'EY', 'IB', 'IL', 'IN', 'IE', 'EI', 'ER']))) then
|
|
begin
|
|
MetaphAdd('K', 'J');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
// -ger-, -gy-
|
|
if ((StringAt(Current + 1, 2, 'ER') or (Value[Current + 1] = 'Y'))
|
|
and not StringAt(1, 6, ['DANGER', 'RANGER', 'MANGER'])
|
|
and not StringAt(Current - 1, 1, ['E', 'I'])
|
|
and not StringAt(Current - 1, 3, ['RGY', 'OGY'])) then
|
|
begin
|
|
MetaphAdd('K', 'J');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
// italian e.g, 'biaggi'
|
|
if StringAt(Current + 1, 1, ['E', 'I', 'Y']) or StringAt(Current - 1, 4, ['AGGI', 'OGGI']) then
|
|
begin
|
|
//obvious germanic
|
|
if StringAt(1, 4, ['VAN ', 'VON ']) or StringAt(1, 3, 'SCH') or StringAt(Current + 1, 2, 'ET') then
|
|
MetaphAdd('K')
|
|
else
|
|
//always soft if french ending
|
|
if StringAt((Current + 1), 4, 'IER ') then
|
|
MetaphAdd('J')
|
|
else
|
|
MetaphAdd('J', 'K');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
if Value[Current + 1] = 'G' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('K');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessH;
|
|
begin
|
|
//only keep if first & before vowel or btw. 2 vowels
|
|
if ((Current = 1) or IsVowel(Current - 1)) and IsVowel(Current + 1) then
|
|
begin
|
|
MetaphAdd('H');
|
|
IncCurrent(2);
|
|
end
|
|
else //also takes care of 'HH'
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessJ;
|
|
begin
|
|
//obvious spanish, 'jose', 'san jacinto'
|
|
if StringAt(Current, 4, 'JOSE') or StringAt(1, 4, 'SAN ') then
|
|
begin
|
|
if ((Current = 1) and (Value[Current + 4] = ' ')) or StringAt(1, 4, 'SAN ') then
|
|
MetaphAdd('H')
|
|
else
|
|
MetaphAdd('J', 'H');
|
|
IncCurrent;
|
|
Exit;
|
|
end;
|
|
|
|
if (Current = 1) and not StringAt(Current, 4, 'JOSE') then
|
|
MetaphAdd('J', 'A') //Yankelovich/Jankelowicz
|
|
else
|
|
//spanish pron. of e.g. 'bajador'
|
|
if IsVowel(Current - 1) and not IsSlavoGermanic and ((Value[Current + 1] = 'A') or (Value[Current + 1] = 'O')) then
|
|
MetaphAdd('J', 'H')
|
|
else
|
|
if Current = Last then
|
|
MetaphAdd('J', ' ')
|
|
else
|
|
if not StringAt(Current + 1, 1, ['L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'])
|
|
and not StringAt(Current - 1, 1, ['S', 'K', 'L']) then
|
|
MetaphAdd('J');
|
|
|
|
if Value[Current + 1] = 'J' then //it could happennot
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessK;
|
|
begin
|
|
if Value[Current + 1] = 'K' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('K');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessL;
|
|
begin
|
|
if Value[Current + 1] = 'L' then
|
|
begin
|
|
//!!!TODO: check Last - 2 bellow
|
|
//spanish e.g. 'cabrillo', 'gallegos'
|
|
if ((Current = (Last - 2)) and StringAt(Current - 1, 4, ['ILLO', 'ILLA', 'ALLE']))
|
|
or ((StringAt(Last - 1, 2, ['AS', 'OS']) or StringAt(Last, 1, ['A', 'O']))
|
|
and StringAt(Current - 1, 4, 'ALLE')) then
|
|
begin
|
|
MetaphAdd('L', ' ');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
IncCurrent(2);
|
|
end
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('L');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessM;
|
|
begin
|
|
if (StringAt(Current - 1, 3, 'UMB')
|
|
and ((Current + 1 = Last) or StringAt(Current + 2, 2, 'ER')))
|
|
//'dumb','thumb'
|
|
or (Value[Current + 1] = 'M') then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('M');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessN;
|
|
begin
|
|
if Value[Current + 1] = 'N' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('N');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessP;
|
|
begin
|
|
if Value[Current + 1] = 'H' then
|
|
begin
|
|
MetaphAdd('F');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//also account for 'campbell', 'raspberry'
|
|
if StringAt(Current + 1, 1, ['P', 'B']) then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('P');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessQ;
|
|
begin
|
|
if Value[Current + 1] = 'Q' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('K');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessR;
|
|
begin
|
|
//french e.g. 'rogier', but exclude 'hochmeier'
|
|
if (Current = Last) and not IsSlavoGermanic and StringAt(Current - 2, 2, 'IE')
|
|
and not StringAt(Current - 4, 2, ['ME', 'MA']) then
|
|
MetaphAdd('', 'R')
|
|
else
|
|
MetaphAdd('R');
|
|
|
|
if Value[Current + 1] = 'R' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessS;
|
|
begin
|
|
//special cases 'island', 'isle', 'carlisle', 'carlysle'
|
|
if StringAt(Current - 1, 3, ['ISL', 'YSL']) then
|
|
begin
|
|
IncCurrent;
|
|
Exit;
|
|
end;
|
|
|
|
//special case 'sugar-'
|
|
if (Current = 1) and StringAt(Current, 5, 'SUGAR') then
|
|
begin
|
|
MetaphAdd('X', 'S');
|
|
IncCurrent;
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current, 2, 'SH') then
|
|
begin
|
|
//germanic
|
|
if StringAt(Current + 1, 4, ['HEIM', 'HOEK', 'HOLM', 'HOLZ']) then
|
|
MetaphAdd('S')
|
|
else
|
|
MetaphAdd('X');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
//italian & armenian
|
|
if StringAt(Current, 3, ['SIO', 'SIA']) or StringAt(Current, 4, 'SIAN') then
|
|
begin
|
|
if not IsSlavoGermanic then
|
|
MetaphAdd('S', 'X')
|
|
else
|
|
MetaphAdd('S');
|
|
IncCurrent(3);
|
|
Exit;
|
|
end;
|
|
|
|
//german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
|
|
//also, -sz- in slavic language altho in hungarian it is pronounced 's'
|
|
if ((Current = 1) and StringAt(Current + 1, 1, ['M', 'N', 'L', 'W']))
|
|
or StringAt(Current + 1, 1, 'Z') then
|
|
begin
|
|
MetaphAdd('S', 'X');
|
|
if StringAt(Current + 1, 1, 'Z') then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current, 2, 'SC') then
|
|
begin
|
|
//Schlesinger's rule
|
|
if Value[Current + 2] = 'H' then
|
|
begin
|
|
//dutch origin, e.g. 'school', 'schooner'
|
|
if StringAt(Current + 3, 2, ['OO', 'ER', 'EN', 'UY', 'ED', 'EM']) then
|
|
begin
|
|
//'schermerhorn', 'schenker'
|
|
if StringAt(Current + 3, 2, ['ER', 'EN']) then
|
|
MetaphAdd('X', 'SK')
|
|
else
|
|
MetaphAdd('SK');
|
|
end
|
|
else
|
|
begin
|
|
if (Current = 1) and not IsVowel(4) and (Value[4] <> 'W') then
|
|
MetaphAdd('X', 'S')
|
|
else
|
|
MetaphAdd('X');
|
|
end;
|
|
IncCurrent(3);
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current + 2, 1, ['I', 'E', 'Y']) then
|
|
begin
|
|
MetaphAdd('S');
|
|
IncCurrent(3);
|
|
Exit;
|
|
end;
|
|
//else
|
|
MetaphAdd('SK');
|
|
IncCurrent(3);
|
|
Exit;
|
|
end;
|
|
|
|
//french e.g. 'resnais', 'artois'
|
|
if (Current = Last) and StringAt(Current - 2, 2, ['AI', 'OI']) then
|
|
MetaphAdd('', 'S')
|
|
else
|
|
MetaphAdd('S');
|
|
|
|
if StringAt(Current + 1, 1, ['S', 'Z']) then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessT;
|
|
begin
|
|
if StringAt(Current, 4, 'TION') or StringAt(Current, 3, ['TIA', 'TCH']) then
|
|
begin
|
|
MetaphAdd('X');
|
|
IncCurrent(3);
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current, 2, 'TH') or StringAt(Current, 3, 'TTH') then
|
|
begin
|
|
//special case 'thomas', 'thames' or germanic
|
|
if StringAt(Current + 2, 2, ['OM', 'AM']) or StringAt(1, 4, ['VAN ', 'VON '])
|
|
or StringAt(1, 3, 'SCH') then
|
|
MetaphAdd('T')
|
|
else
|
|
MetaphAdd('0', 'T');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
if StringAt(Current + 1, 1, ['T', 'D']) then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('T');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessV;
|
|
begin
|
|
if Value[Current + 1] = 'V' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
MetaphAdd('F');
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessW;
|
|
begin
|
|
//can also be in middle of word
|
|
if StringAt(Current, 2, 'WR') then
|
|
begin
|
|
MetaphAdd('R');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end;
|
|
|
|
if (Current = 1) and (IsVowel(Current + 1) or StringAt(Current, 2, 'WH')) then
|
|
begin
|
|
//Wasserman should match Vasserman
|
|
if IsVowel(Current + 1) then
|
|
MetaphAdd('A', 'F')
|
|
else
|
|
//need Uomo to match Womo
|
|
MetaphAdd('A');
|
|
end;
|
|
|
|
//Arnow should match Arnoff
|
|
if ((Current = Last) and IsVowel(Current - 1))
|
|
or StringAt(Current - 1, 5, ['EWSKI', 'EWSKY', 'OWSKI', 'OWSKY']) or StringAt(1, 3, 'SCH') then
|
|
begin
|
|
MetaphAdd('', 'F');
|
|
IncCurrent;
|
|
Exit;
|
|
end;
|
|
|
|
//polish e.g. 'filipowicz'
|
|
if StringAt(Current, 4, ['WICZ', 'WITZ']) then
|
|
begin
|
|
MetaphAdd('TS', 'FX');
|
|
IncCurrent(4);
|
|
Exit;
|
|
end;
|
|
|
|
//else skip it
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessX;
|
|
begin
|
|
//french e.g. breaux
|
|
if not ((Current = Last) and (StringAt(Current - 3, 3, ['IAU', 'EAU'])
|
|
or StringAt(Current - 2, 2, ['AU', 'OU']))) then
|
|
MetaphAdd('KS');
|
|
|
|
if StringAt(Current + 1, 1, ['C', 'X']) then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.ProcessZ;
|
|
begin
|
|
//chinese pinyin e.g. 'zhao'
|
|
if Value[Current + 1] = 'H' then
|
|
begin
|
|
MetaphAdd('J');
|
|
IncCurrent(2);
|
|
Exit;
|
|
end
|
|
else
|
|
if StringAt(Current + 1, 2, ['ZO', 'ZI', 'ZA'])
|
|
or (IsSlavoGermanic and ((Current > 1) and (Value[Current - 1] <> 'T'))) then
|
|
MetaphAdd('S', 'TS')
|
|
else
|
|
MetaphAdd('S');
|
|
|
|
if Value[Current + 1] = 'Z' then
|
|
IncCurrent(2)
|
|
else
|
|
IncCurrent;
|
|
end;
|
|
|
|
procedure TdxDoubleMetaphone.Reset;
|
|
begin
|
|
FCurrent := 1;
|
|
FAlternate := False;
|
|
FPrimary := '';
|
|
FSecondary := '';
|
|
end;
|
|
|
|
{ TdxStringSimilarityCalculator }
|
|
|
|
function TdxStringSimilarityCalculator.GetDistance(const S1, S2: WideString): Integer;
|
|
var
|
|
AMatrix: array of array of Integer;
|
|
ALoString1, ALoString2: WideString;
|
|
I, J, ALen1, ALen2: Integer;
|
|
begin
|
|
if (Length(S1) = 0) or (Length(S2) = 0) then
|
|
begin
|
|
Result := 0;
|
|
Exit;
|
|
end;
|
|
ALoString1 := WideLowerCase(S1);
|
|
ALoString2 := WideLowerCase(S2);
|
|
ALen1 := Length(ALoString1);
|
|
ALen2 := Length(ALoString2);
|
|
SetLength(AMatrix, ALen1 + 1, ALen2 + 1);
|
|
AMatrix[0, 0] := 0;
|
|
for I := 0 to ALen2 - 1 do
|
|
AMatrix[0, I + 1] := AMatrix[0, I] + InsertCost(ALoString2, I + 1);
|
|
for I := 0 to ALen1 - 1 do
|
|
begin
|
|
AMatrix[I + 1, 0] := AMatrix[I, 0] + DeleteCost(ALoString1, I + 1);
|
|
for J := 0 to ALen2 - 1 do
|
|
begin
|
|
AMatrix[I + 1, J + 1] := Minimum(
|
|
AMatrix[I, J] + SubstitutionCost(ALoString1, I + 1, ALoString2, J + 1),
|
|
AMatrix[I, J + 1] + DeleteCost(ALoString1, I + 1),
|
|
AMatrix[I + 1, J] + InsertCost(ALoString2, J + 1));
|
|
end;
|
|
end;
|
|
Result := AMatrix[ALen1, ALen2];
|
|
if Result > 0 then
|
|
begin
|
|
if ALoString1[1] <> ALoString2[1] then
|
|
Inc(Result);
|
|
if ALoString1[ALen1] <> ALoString2[ALen2] then
|
|
Inc(Result);
|
|
end;
|
|
end;
|
|
|
|
function TdxStringSimilarityCalculator.DeleteCost(const S: WideString;
|
|
APos: Integer): Integer;
|
|
begin
|
|
Result := 2;
|
|
end;
|
|
|
|
function TdxStringSimilarityCalculator.InsertCost(const S: WideString;
|
|
APos: Integer): Integer;
|
|
begin
|
|
Result := 2;
|
|
end;
|
|
|
|
function TdxStringSimilarityCalculator.Minimum(A, B, C: Integer): Integer;
|
|
begin
|
|
Result := A;
|
|
if B < A then
|
|
Result := B;
|
|
if C < Result then
|
|
Result := C;
|
|
end;
|
|
|
|
function TdxStringSimilarityCalculator.SubstitutionCost(const S1: WideString;
|
|
APos1: Integer; const S2: WideString; APos2: Integer): Integer;
|
|
begin
|
|
if S1[APos1] = S2[APos2] then
|
|
Result := 0
|
|
else
|
|
Result := 2;
|
|
end;
|
|
|
|
end.
|