Reconstructed patch: line breaks follow the line counts in each @@ hunk header.
Indentation (2 spaces per level) and the positions of blank context lines are inferred,
not recorded in the collapsed copy; use `git apply --ignore-whitespace` if context fails to match.
NOTE(review): the Match.cs context line `ReadOnlySpan records` appears to have lost a generic
type argument that cannot be recovered from this patch - confirm against the repository.

diff --git a/src/Biomatch.Benchmark/FindDuplicateBenchmark.cs b/src/Biomatch.Benchmark/FindDuplicateBenchmark.cs
index 3d8ba6a..908b179 100644
--- a/src/Biomatch.Benchmark/FindDuplicateBenchmark.cs
+++ b/src/Biomatch.Benchmark/FindDuplicateBenchmark.cs
@@ -28,12 +28,12 @@ public FindDuplicateBenchmark()
   [Benchmark]
   public void DuplicateBenchmarkSameDataSet()
   {
-    Match.GetPotentialMatchesFromSameDataSet(RecordsToMatch, SampleRecords, 0.85, 1.0);
+    Match.GetPotentialMatchesFromSameDataSet(RecordsToMatch, SampleRecords, 0.80, 1.0);
   }
 
   [Benchmark]
   public void DuplicateBenchmarkDifferentDataSet()
   {
-    Match.GetPotentialMatchesFromDifferentDataSet(RecordsToMatch, SampleRecords, 0.85, 1.0);
+    Match.GetPotentialMatchesFromDifferentDataSet(RecordsToMatch, SampleRecords, 0.80, 1.0);
   }
 }
diff --git a/src/Biomatch.Domain/Match.cs b/src/Biomatch.Domain/Match.cs
index c1c8a10..15bada4 100644
--- a/src/Biomatch.Domain/Match.cs
+++ b/src/Biomatch.Domain/Match.cs
@@ -235,18 +235,17 @@ private static void CompareRecords(
     double upperScoreThreshold
   )
   {
-    //get the distance vector for the ith vector of the first table and the jth record of the second table
-    var distanceVector = DistanceVector.CalculateDistance(ref primaryRecord, ref secondaryRecord);
-    var tempScore = Score.CalculateFinalScore(ref distanceVector);
-    if (tempScore >= lowerScoreThreshold && tempScore <= upperScoreThreshold)
-    {
-      potentialMatches.Add(
-        new PotentialMatch(primaryRecord, secondaryRecord, distanceVector, tempScore)
-      );
-    }
+    var potentialMatch = CompareRecords(
+      ref primaryRecord,
+      ref secondaryRecord,
+      lowerScoreThreshold,
+      upperScoreThreshold
+    );
+    if (potentialMatch is not null)
+      potentialMatches.Add(potentialMatch.Value);
   }
 
-  private static PotentialMatch? CompareRecords(
+  public static PotentialMatch? CompareRecords(
     ref PersonRecordForMatch primaryRecord,
     ref PersonRecordForMatch secondaryRecord,
     double lowerScoreThreshold,
@@ -254,17 +253,109 @@ double upperScoreThreshold
   )
   {
     //get the distance vector for the ith vector of the first table and the jth record of the second table
-    var distanceVector = DistanceVector.CalculateDistance(ref primaryRecord, ref secondaryRecord);
-    var tempScore = Score.CalculateFinalScore(ref distanceVector);
-    if (tempScore >= lowerScoreThreshold && tempScore <= upperScoreThreshold)
+    const double maxNameScore = 0.62;
+    const double maxBirthDateScore = 0.22;
+    const double maxCityScore = 0.08;
+    const double maxPhoneNumberScore = 0.08;
+    var maxScore = 1.0;
+    //get the distance vector for the ith vector of the first table and the jth record of the second table
+
+    var birthDateDistance = StringDistance.DateDemographicFieldDistance(
+      primaryRecord.BirthDateText,
+      secondaryRecord.BirthDateText
+    );
+    var birthDateScore = Score.GetScore(birthDateDistance, maxBirthDateScore, 1);
+
+    maxScore -= maxBirthDateScore - birthDateScore;
+    if (maxScore < lowerScoreThreshold)
+      return null;
+    // Name parts
+    var firstNameDistance = StringDistance.GeneralDemographicFieldDistance(
+      primaryRecord.FirstName,
+      secondaryRecord.FirstName
+    );
+    var firstNameScore = Score.GetScore(firstNameDistance, 0.18, 2);
+    var middleNameDistance = StringDistance.MiddleNameDemographicFieldDistance(
+      primaryRecord.MiddleName,
+      secondaryRecord.MiddleName
+    );
+    var middleNameScore = Score.GetScore(middleNameDistance, 0.1, 1);
+    var lastNameDistance = StringDistance.GeneralDemographicFieldDistance(
+      primaryRecord.LastName,
+      secondaryRecord.LastName
+    );
+    var lastNameScore = Score.GetScore(lastNameDistance, 0.17, 2);
+    var secondLastNameDistance = StringDistance.GeneralDemographicFieldDistance(
+      primaryRecord.SecondLastName,
+      secondaryRecord.SecondLastName
+    );
+    var secondLastNameScore = Score.GetScore(secondLastNameDistance, 0.17, 2);
+
+    var separateNameScore = firstNameScore + middleNameScore + lastNameScore + secondLastNameScore;
+
+    // Fullname
+    var fullNameDistance = StringDistance.GeneralDemographicFieldDistance(
+      primaryRecord.FullName,
+      secondaryRecord.FullName
+    );
+    var fullNameScore = Score.GetScore(fullNameDistance, maxNameScore, 5);
+
+    var nameScore = separateNameScore > fullNameScore ? separateNameScore : fullNameScore;
+
+    maxScore -= maxNameScore - nameScore;
+    if (maxScore < lowerScoreThreshold)
+      return null;
+
+    var cityDistance = StringDistance.GeneralDemographicFieldDistance(
+      primaryRecord.City,
+      secondaryRecord.City
+    );
+    var cityScore = Score.GetScore(cityDistance, maxCityScore, 2);
+
+    maxScore -= maxCityScore - cityScore;
+    if (maxScore < lowerScoreThreshold)
+      return null;
+
+    var phoneNumberDistance = StringDistance.GeneralDemographicFieldDistance(
+      primaryRecord.PhoneNumber,
+      secondaryRecord.PhoneNumber
+    );
+    var phoneNumberScore = Score.GetScore(phoneNumberDistance, maxPhoneNumberScore, 1);
+
+    maxScore -= maxPhoneNumberScore - phoneNumberScore;
+    if (maxScore < lowerScoreThreshold)
+      return null;
+
+    // Scoring
+    // Then compute the weighted average
+    var totalScore = nameScore;
+    totalScore += birthDateScore;
+    totalScore += cityScore;
+    totalScore += phoneNumberScore;
+
+    if (totalScore >= lowerScoreThreshold && totalScore <= upperScoreThreshold)
     {
-      return new PotentialMatch(primaryRecord, secondaryRecord, distanceVector, tempScore);
+      return new PotentialMatch(
+        primaryRecord,
+        secondaryRecord,
+        new DistanceVector(
+          firstNameDistance,
+          middleNameDistance,
+          lastNameDistance,
+          secondLastNameDistance,
+          fullNameDistance,
+          birthDateDistance,
+          cityDistance,
+          phoneNumberDistance
+        ),
+        totalScore
+      );
     }
 
     return null;
   }
 
-  public static (int, int)[] GetCharactersStartAndEndIndex(
+  private static (int, int)[] GetCharactersStartAndEndIndex(
     ReadOnlySpan records
   )
   {
diff --git a/src/Biomatch.Domain/Models/Score.cs b/src/Biomatch.Domain/Models/Score.cs
index 738227a..233e7b6 100644
--- a/src/Biomatch.Domain/Models/Score.cs
+++ b/src/Biomatch.Domain/Models/Score.cs
@@ -30,8 +30,44 @@ private static double SingleFieldScoreStepMode(int distance, int threshold = 2,
     return singleFieldScore;
   }
 
+  public static double GetScore(int distance, double weight, int threshold)
+  {
+    const double step = 0.3;
+    // Experimental version for now, assigns 1 if the distance is 0, and lowers in increments to
+    // 0.7
+    double singleFieldScore;
+
+    // If the distance > threshold, assign 0
+    if (distance > threshold)
+    {
+      singleFieldScore = 0.0;
+    }
+    // else if the distance is -1 (special exception) return 0.5
+    // this corresponds to the case where one is empty and the other is not
+    else if (distance == -1)
+    {
+      singleFieldScore = 0.5;
+    }
+    // else, return 1-(step/threshold) * dist.
+    // This results in a higher individual score the closer the distance is to 0
+    // culminating at 1-step at the threshold
+    else
+    {
+      singleFieldScore = 1 - step * ((double)distance / threshold);
+    }
+
+    return singleFieldScore * weight;
+  }
+
   public static double CalculateFinalScore(
-    ref DistanceVector d,
+    int firstNameDistance,
+    int middleNameDistance,
+    int lastNameDistance,
+    int secondLastNameDistance,
+    int fullNameDistance,
+    int birthDateDistance,
+    int cityDistance,
+    int phoneNumberDistance,
     int firstNameThreshold = 2,
     int middleNameThreshold = 1,
     int lastNameThreshold = 2,
@@ -51,31 +87,35 @@ public static double CalculateFinalScore(
   )
   {
     // Get the individual field score distances
-    var firstNameDistance = SingleFieldScoreStepMode(d.FirstNameDistance, firstNameThreshold);
-    var middleNameDistance = SingleFieldScoreStepMode(d.MiddleNameDistance, middleNameThreshold);
-    var lastNameDistance = SingleFieldScoreStepMode(d.LastNameDistance, lastNameThreshold);
-    var secondLastNameDistance = SingleFieldScoreStepMode(
-      d.SecondLastNameDistance,
+    var firstNameSingleScore = SingleFieldScoreStepMode(firstNameDistance, firstNameThreshold);
+    var middleNameSingleScore = SingleFieldScoreStepMode(middleNameDistance, middleNameThreshold);
+    var lastNameSingleScore = SingleFieldScoreStepMode(lastNameDistance, lastNameThreshold);
+    var secondLastNameSingleScore = SingleFieldScoreStepMode(
+      secondLastNameDistance,
       secondLastNameThreshold
     );
-    var fullNameDistance = SingleFieldScoreStepMode(d.FullNameDistance, fullNameThreshold);
-    var birthDateDistance = SingleFieldScoreStepMode(d.BirthDateDistance, birthDateThreshold);
-    var cityDistance = SingleFieldScoreStepMode(d.CityDistance, cityThreshold);
-    var phoneNumberDistance = SingleFieldScoreStepMode(d.PhoneNumberDistance, phoneNumberThreshold);
+    var fullNameSingleScore = SingleFieldScoreStepMode(fullNameDistance, fullNameThreshold);
+    var birthDateSingleScore = SingleFieldScoreStepMode(birthDateDistance, birthDateThreshold);
+    var citySingleScore = SingleFieldScoreStepMode(cityDistance, cityThreshold);
+    var phoneNumberSingleScore = SingleFieldScoreStepMode(
+      phoneNumberDistance,
+      phoneNumberThreshold
+    );
 
     var separateNameScore =
-      firstNameDistance * firstNameWeight
-      + middleNameDistance * middleNameWeight
-      + lastNameDistance * lastNameWeight
-      + secondLastNameDistance * secondLastNameWeight;
-    var fullNameScore = fullNameDistance * fullNameWeight;
+      firstNameSingleScore * firstNameWeight
+      + middleNameSingleScore * middleNameWeight
+      + lastNameSingleScore * lastNameWeight
+      + secondLastNameSingleScore * secondLastNameWeight;
+    var fullNameScore = fullNameSingleScore * fullNameWeight;
+
     var nameScore = separateNameScore > fullNameScore ? separateNameScore : fullNameScore;
 
     // Then compute the weighted average
     var totalScore = nameScore;
-    totalScore += birthDateWeight * birthDateDistance;
-    totalScore += cityWeight * cityDistance;
-    totalScore += phoneNumberWeight * phoneNumberDistance;
+    totalScore += birthDateWeight * birthDateSingleScore;
+    totalScore += cityWeight * citySingleScore;
+    totalScore += phoneNumberWeight * phoneNumberSingleScore;
     return totalScore;
   }
 }
diff --git a/src/Biomatch.Domain/Models/StringDistance.cs b/src/Biomatch.Domain/Models/StringDistance.cs
index f980d1d..4e92d2f 100644
--- a/src/Biomatch.Domain/Models/StringDistance.cs
+++ b/src/Biomatch.Domain/Models/StringDistance.cs
@@ -48,7 +48,7 @@ public static int DateDemographicFieldDistance(ReadOnlySpan<char> date1, ReadOnl
 
     if (date1.IsEmpty || date2.IsEmpty)
     {
-      return 0;
+      return -1;
     }
 
     // Check for inverted day and month
diff --git a/tests/Biomatch.Domain.Tests.Unit/MatchTests.cs b/tests/Biomatch.Domain.Tests.Unit/MatchTests.cs
new file mode 100644
index 0000000..327a19b
--- /dev/null
+++ b/tests/Biomatch.Domain.Tests.Unit/MatchTests.cs
@@ -0,0 +1,48 @@
+using Biomatch.Domain.Models;
+using FluentAssertions;
+
+namespace Biomatch.Domain.Tests.Unit;
+
+public class MatchTests
+{
+  [Fact]
+  public void CompareRecords_ShouldReturnPotentialMatch_WhenMatchIsFound()
+  {
+    var patientRecords = new List<PersonRecord>
+    {
+      new PersonRecord(
+        "1230",
+        "Clara",
+        "",
+        "Pique",
+        "",
+        new DateOnly(1995, 02, 01),
+        "Adjuntas",
+        ""
+      ),
+      new PersonRecord(
+        "1875",
+        "Clara",
+        "",
+        "Pique",
+        "",
+        new DateOnly(1995, 01, 02),
+        "Adjuntas",
+        ""
+      ),
+    };
+
+    var preprocessedRecords = patientRecords.PreprocessData().ToArray();
+
+    // Act
+    var duplicatesToFix = Match.CompareRecords(
+      ref preprocessedRecords[0],
+      ref preprocessedRecords[1],
+      0.60,
+      1.0
+    );
+
+    // Assert
+    duplicatesToFix.Should().NotBeNull();
+  }
+}