Skip to content

Commit

Permalink
perf: prevent further operations if score already lower than threshold (
Browse files Browse the repository at this point in the history
  • Loading branch information
gabynevada authored Mar 2, 2024
1 parent 6b3825e commit b5c999d
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 36 deletions.
4 changes: 2 additions & 2 deletions src/Biomatch.Benchmark/FindDuplicateBenchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ public FindDuplicateBenchmark()
/// <summary>
/// Benchmark that calls <c>Match.GetPotentialMatchesFromSameDataSet</c> with
/// <c>RecordsToMatch</c> and <c>SampleRecords</c>; the returned value is discarded.
/// </summary>
[Benchmark]
public void DuplicateBenchmarkSameDataSet()
{
// NOTE(review): the two calls differ only in the third argument (0.85 vs 0.80,
// presumably the lower score threshold; the fourth, 1.0, presumably the upper one).
// They look like the before/after lines of a diff shown without +/- markers.
// Confirm that only one call is meant to be timed.
Match.GetPotentialMatchesFromSameDataSet(RecordsToMatch, SampleRecords, 0.85, 1.0);
Match.GetPotentialMatchesFromSameDataSet(RecordsToMatch, SampleRecords, 0.80, 1.0);
}

/// <summary>
/// Benchmark that calls <c>Match.GetPotentialMatchesFromDifferentDataSet</c> with
/// <c>RecordsToMatch</c> and <c>SampleRecords</c>; the returned value is discarded.
/// </summary>
[Benchmark]
public void DuplicateBenchmarkDifferentDataSet()
{
// NOTE(review): the two calls differ only in the third argument (0.85 vs 0.80,
// presumably the lower score threshold; the fourth, 1.0, presumably the upper one).
// They look like the before/after lines of a diff shown without +/- markers.
// Confirm that only one call is meant to be timed.
Match.GetPotentialMatchesFromDifferentDataSet(RecordsToMatch, SampleRecords, 0.85, 1.0);
Match.GetPotentialMatchesFromDifferentDataSet(RecordsToMatch, SampleRecords, 0.80, 1.0);
}
}
121 changes: 106 additions & 15 deletions src/Biomatch.Domain/Match.cs
Original file line number Diff line number Diff line change
Expand Up @@ -235,36 +235,127 @@ private static void CompareRecords(
double upperScoreThreshold
)
{
//get the distance vector for the ith vector of the first table and the jth record of the second table
var distanceVector = DistanceVector.CalculateDistance(ref primaryRecord, ref secondaryRecord);
var tempScore = Score.CalculateFinalScore(ref distanceVector);
if (tempScore >= lowerScoreThreshold && tempScore <= upperScoreThreshold)
{
potentialMatches.Add(
new PotentialMatch(primaryRecord, secondaryRecord, distanceVector, tempScore)
);
}
var potentialMatch = CompareRecords(
ref primaryRecord,
ref secondaryRecord,
lowerScoreThreshold,
upperScoreThreshold
);
if (potentialMatch is not null)
potentialMatches.Add(potentialMatch.Value);
}

/// <summary>
/// Scores a pair of records field by field and returns a <c>PotentialMatch</c> when the
/// total score lies within [<paramref name="lowerScoreThreshold"/>,
/// <paramref name="upperScoreThreshold"/>]; otherwise returns <c>null</c>.
/// </summary>
/// <remarks>
/// The incremental path keeps <c>maxScore</c>, the best total still attainable, and returns
/// <c>null</c> as soon as it drops below the lower threshold, so later field distances are
/// not computed for pairs that can no longer qualify.
/// </remarks>
/// <param name="primaryRecord">First record of the pair.</param>
/// <param name="secondaryRecord">Second record of the pair.</param>
/// <param name="lowerScoreThreshold">Inclusive minimum total score for a match.</param>
/// <param name="upperScoreThreshold">Inclusive maximum total score for a match.</param>
/// <returns>The potential match with its distance vector and total score, or <c>null</c>.</returns>
// NOTE(review): this span holds two signature lines (private/public) and two sets of
// statements (the distanceVector/tempScore path and the incremental path). It looks like
// the before/after lines of a diff shown without +/- markers; confirm which lines are current.
private static PotentialMatch? CompareRecords(
public static PotentialMatch? CompareRecords(
ref PersonRecordForMatch primaryRecord,
ref PersonRecordForMatch secondaryRecord,
double lowerScoreThreshold,
double upperScoreThreshold
)
{
//get the distance vector for the ith vector of the first table and the jth record of the second table
var distanceVector = DistanceVector.CalculateDistance(ref primaryRecord, ref secondaryRecord);
var tempScore = Score.CalculateFinalScore(ref distanceVector);
if (tempScore >= lowerScoreThreshold && tempScore <= upperScoreThreshold)
// Largest contribution each field group can make to the total; the four values sum to 1.0.
const double maxNameScore = 0.62;
const double maxBirthDateScore = 0.22;
const double maxCityScore = 0.08;
const double maxPhoneNumberScore = 0.08;
// Upper bound on the total still attainable; lowered by each field's shortfall from its maximum.
var maxScore = 1.0;
//get the distance vector for the ith vector of the first table and the jth record of the second table

var birthDateDistance = StringDistance.DateDemographicFieldDistance(
primaryRecord.BirthDateText,
secondaryRecord.BirthDateText
);
var birthDateScore = Score.GetScore(birthDateDistance, maxBirthDateScore, 1);

// Birth date is scored first; stop if the lower threshold is already unreachable.
maxScore -= maxBirthDateScore - birthDateScore;
if (maxScore < lowerScoreThreshold)
return null;
// Name parts
// The four part weights below (0.18 + 0.1 + 0.17 + 0.17) add up to maxNameScore.
var firstNameDistance = StringDistance.GeneralDemographicFieldDistance(
primaryRecord.FirstName,
secondaryRecord.FirstName
);
var firstNameScore = Score.GetScore(firstNameDistance, 0.18, 2);
var middleNameDistance = StringDistance.MiddleNameDemographicFieldDistance(
primaryRecord.MiddleName,
secondaryRecord.MiddleName
);
var middleNameScore = Score.GetScore(middleNameDistance, 0.1, 1);
var lastNameDistance = StringDistance.GeneralDemographicFieldDistance(
primaryRecord.LastName,
secondaryRecord.LastName
);
var lastNameScore = Score.GetScore(lastNameDistance, 0.17, 2);
var secondLastNameDistance = StringDistance.GeneralDemographicFieldDistance(
primaryRecord.SecondLastName,
secondaryRecord.SecondLastName
);
var secondLastNameScore = Score.GetScore(secondLastNameDistance, 0.17, 2);

var separateNameScore = firstNameScore + middleNameScore + lastNameScore + secondLastNameScore;

// Fullname
var fullNameDistance = StringDistance.GeneralDemographicFieldDistance(
primaryRecord.FullName,
secondaryRecord.FullName
);
var fullNameScore = Score.GetScore(fullNameDistance, maxNameScore, 5);

// The name contribution is the larger of the per-part sum and the full-name score.
var nameScore = separateNameScore > fullNameScore ? separateNameScore : fullNameScore;

// Stop if the name shortfall makes the lower threshold unreachable.
maxScore -= maxNameScore - nameScore;
if (maxScore < lowerScoreThreshold)
return null;

var cityDistance = StringDistance.GeneralDemographicFieldDistance(
primaryRecord.City,
secondaryRecord.City
);
var cityScore = Score.GetScore(cityDistance, maxCityScore, 2);

// Stop if the city shortfall makes the lower threshold unreachable.
maxScore -= maxCityScore - cityScore;
if (maxScore < lowerScoreThreshold)
return null;

var phoneNumberDistance = StringDistance.GeneralDemographicFieldDistance(
primaryRecord.PhoneNumber,
secondaryRecord.PhoneNumber
);
var phoneNumberScore = Score.GetScore(phoneNumberDistance, maxPhoneNumberScore, 1);

// Last field: after this subtraction every field's shortfall has been applied to maxScore.
maxScore -= maxPhoneNumberScore - phoneNumberScore;
if (maxScore < lowerScoreThreshold)
return null;

// Scoring
// Then compute the weighted average
// (each field score is already multiplied by its weight, so this is a plain sum)
var totalScore = nameScore;
totalScore += birthDateScore;
totalScore += cityScore;
totalScore += phoneNumberScore;

if (totalScore >= lowerScoreThreshold && totalScore <= upperScoreThreshold)
{
return new PotentialMatch(primaryRecord, secondaryRecord, distanceVector, tempScore);
return new PotentialMatch(
primaryRecord,
secondaryRecord,
new DistanceVector(
firstNameDistance,
middleNameDistance,
lastNameDistance,
secondLastNameDistance,
fullNameDistance,
birthDateDistance,
cityDistance,
phoneNumberDistance
),
totalScore
);
}

// Total fell outside the [lower, upper] window.
return null;
}

public static (int, int)[] GetCharactersStartAndEndIndex(
private static (int, int)[] GetCharactersStartAndEndIndex(
ReadOnlySpan<PersonRecordForMatch> records
)
{
Expand Down
76 changes: 58 additions & 18 deletions src/Biomatch.Domain/Models/Score.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,44 @@ private static double SingleFieldScoreStepMode(int distance, int threshold = 2,
return singleFieldScore;
}

/// <summary>
/// Converts a field distance into that field's weighted contribution to the match score.
/// </summary>
/// <param name="distance">
/// Distance between the two field values. <c>-1</c> is the special value for the case
/// where one value is empty and the other is not.
/// </param>
/// <param name="weight">Contribution returned for an exact match (distance 0).</param>
/// <param name="threshold">Largest distance that still yields a non-zero score.</param>
/// <returns>
/// <c>0</c> when the distance exceeds the threshold, half the weight for the <c>-1</c>
/// special case, otherwise the weight scaled linearly from 1.0 (distance 0) down to
/// 0.7 (distance equal to the threshold).
/// </returns>
public static double GetScore(int distance, double weight, int threshold)
{
    // Fraction of the score lost when the distance reaches the threshold.
    const double step = 0.3;

    double singleFieldScore;

    if (distance > threshold)
    {
        // Too far apart: the field contributes nothing.
        singleFieldScore = 0.0;
    }
    else if (distance == -1)
    {
        // One value is empty and the other is not: neutral half score.
        singleFieldScore = 0.5;
    }
    else if (distance == 0)
    {
        // Exact match. Handled explicitly so that a threshold of 0 does not evaluate
        // 0.0 / 0 below and return NaN; for any other threshold the formula also gives 1.0.
        singleFieldScore = 1.0;
    }
    else
    {
        // Linear falloff: 1 - (step / threshold) * distance, reaching 1 - step at the threshold.
        singleFieldScore = 1 - step * ((double)distance / threshold);
    }

    return singleFieldScore * weight;
}

public static double CalculateFinalScore(
ref DistanceVector d,
int firstNameDistance,
int middleNameDistance,
int lastNameDistance,
int secondLastNameDistance,
int fullNameDistance,
int birthDateDistance,
int cityDistance,
int phoneNumberDistance,
int firstNameThreshold = 2,
int middleNameThreshold = 1,
int lastNameThreshold = 2,
Expand All @@ -51,31 +87,35 @@ public static double CalculateFinalScore(
)
{
// Get the individual field score distances
var firstNameDistance = SingleFieldScoreStepMode(d.FirstNameDistance, firstNameThreshold);
var middleNameDistance = SingleFieldScoreStepMode(d.MiddleNameDistance, middleNameThreshold);
var lastNameDistance = SingleFieldScoreStepMode(d.LastNameDistance, lastNameThreshold);
var secondLastNameDistance = SingleFieldScoreStepMode(
d.SecondLastNameDistance,
var firstNameSingleScore = SingleFieldScoreStepMode(firstNameDistance, firstNameThreshold);
var middleNameSingleScore = SingleFieldScoreStepMode(middleNameDistance, middleNameThreshold);
var lastNameSingleScore = SingleFieldScoreStepMode(lastNameDistance, lastNameThreshold);
var secondLastNameSingleScore = SingleFieldScoreStepMode(
secondLastNameDistance,
secondLastNameThreshold
);
var fullNameDistance = SingleFieldScoreStepMode(d.FullNameDistance, fullNameThreshold);
var birthDateDistance = SingleFieldScoreStepMode(d.BirthDateDistance, birthDateThreshold);
var cityDistance = SingleFieldScoreStepMode(d.CityDistance, cityThreshold);
var phoneNumberDistance = SingleFieldScoreStepMode(d.PhoneNumberDistance, phoneNumberThreshold);
var fullNameSingleScore = SingleFieldScoreStepMode(fullNameDistance, fullNameThreshold);
var birthDateSingleScore = SingleFieldScoreStepMode(birthDateDistance, birthDateThreshold);
var citySingleScore = SingleFieldScoreStepMode(cityDistance, cityThreshold);
var phoneNumberSingleScore = SingleFieldScoreStepMode(
phoneNumberDistance,
phoneNumberThreshold
);

var separateNameScore =
firstNameDistance * firstNameWeight
+ middleNameDistance * middleNameWeight
+ lastNameDistance * lastNameWeight
+ secondLastNameDistance * secondLastNameWeight;
var fullNameScore = fullNameDistance * fullNameWeight;
firstNameSingleScore * firstNameWeight
+ middleNameSingleScore * middleNameWeight
+ lastNameSingleScore * lastNameWeight
+ secondLastNameSingleScore * secondLastNameWeight;
var fullNameScore = fullNameSingleScore * fullNameWeight;

var nameScore = separateNameScore > fullNameScore ? separateNameScore : fullNameScore;

// Then compute the weighted average
var totalScore = nameScore;
totalScore += birthDateWeight * birthDateDistance;
totalScore += cityWeight * cityDistance;
totalScore += phoneNumberWeight * phoneNumberDistance;
totalScore += birthDateWeight * birthDateSingleScore;
totalScore += cityWeight * citySingleScore;
totalScore += phoneNumberWeight * phoneNumberSingleScore;
return totalScore;
}
}
2 changes: 1 addition & 1 deletion src/Biomatch.Domain/Models/StringDistance.cs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public static int DateDemographicFieldDistance(ReadOnlySpan<byte> date1, ReadOnl

if (date1.IsEmpty || date2.IsEmpty)
{
return 0;
return -1;
}

// Check for inverted day and month
Expand Down
48 changes: 48 additions & 0 deletions tests/Biomatch.Domain.Tests.Unit/MatchTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
using Biomatch.Domain.Models;
using FluentAssertions;

namespace Biomatch.Domain.Tests.Unit;

/// <summary>
/// Unit tests for the public record-comparison entry point on Match.
/// </summary>
public class MatchTests
{
    [Fact]
    public void CompareRecords_ShouldReturnPotentialMatch_WhenMatchIsFound()
    {
        // Arrange: two records that differ only in their id and in the day/month
        // order of the birth date.
        var firstPerson = new PersonRecord(
            "1230",
            "Clara",
            "",
            "Pique",
            "",
            new DateOnly(1995, 02, 01),
            "Adjuntas",
            ""
        );
        var secondPerson = new PersonRecord(
            "1875",
            "Clara",
            "",
            "Pique",
            "",
            new DateOnly(1995, 01, 02),
            "Adjuntas",
            ""
        );
        var rawRecords = new List<IPersonRecord> { firstPerson, secondPerson };
        var recordsForMatch = rawRecords.PreprocessData().ToArray();

        // Act
        var comparison = Match.CompareRecords(
            ref recordsForMatch[0],
            ref recordsForMatch[1],
            0.60,
            1.0
        );

        // Assert
        comparison.Should().NotBeNull();
    }
}

0 comments on commit b5c999d

Please sign in to comment.