From 38727e31d978350b76edba1683a081a114fe345f Mon Sep 17 00:00:00 2001 From: "Bruno P. Kinoshita" Date: Thu, 30 Jul 2020 17:21:53 +1200 Subject: [PATCH] [TEXT-158]: empty strings must have similarity of 1, and distance of 0 (i.e. identical) --- src/changes/changes.xml | 1 + .../org/apache/commons/text/similarity/JaccardSimilarity.java | 3 +++ .../apache/commons/text/similarity/JaccardDistanceTest.java | 2 +- .../apache/commons/text/similarity/JaccardSimilarityTest.java | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 109d7406f6..a88d0f1986 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -45,6 +45,7 @@ The type attribute can be add,update,fix,remove. + Incorrect values for Jaccard similarity with empty strings Release Notes page hasn't been updated for 1.9 release yet. Update spotbugs.plugin.version 4.0.0 to 4.0.4. diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java index d1478cbca8..4f29139260 100644 --- a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java @@ -64,6 +64,9 @@ public Double apply(final CharSequence left, final CharSequence right) { private Double calculateJaccardSimilarity(final CharSequence left, final CharSequence right) { final int leftLength = left.length(); final int rightLength = right.length(); + if (leftLength == 0 && rightLength == 0) { + return 1d; + } if (leftLength == 0 || rightLength == 0) { return 0d; } diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java index 979354f911..7b43665799 100644 --- a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java @@ -37,7 +37,7 @@ public static void setUp() { @Test public void testGettingJaccardDistance() { // Expected Jaccard distance = 1.0 - (intersect / union) - assertEquals(1.0, classBeingTested.apply("", "")); + assertEquals(0.0, classBeingTested.apply("", "")); assertEquals(1.0, classBeingTested.apply("left", "")); assertEquals(1.0, classBeingTested.apply("", "right")); assertEquals(1.0 - (3.0 / 4), classBeingTested.apply("frog", "fog")); diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java index bb46122138..827a35f68c 100644 --- a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java +++ b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java @@ -37,7 +37,7 @@ public static void setUp() { @Test public void testGettingJaccardSimilarity() { // Expected Jaccard similarity = (intersect / union) - assertEquals(0.0, classBeingTested.apply("", "")); + assertEquals(1.0, classBeingTested.apply("", "")); assertEquals(0.0, classBeingTested.apply("left", "")); assertEquals(0.0, classBeingTested.apply("", "right")); assertEquals(3.0 / 4, classBeingTested.apply("frog", "fog"));