> converter;
+
+ // The following is adapted from commons-collections for a Bag.
+ // A Bag is a collection that can store the count of the number
+ // of copies of each element.
+
+ /**
+ * Mutable counter class for storing the count of elements.
+ */
+ private static class BagCount {
+ /** The count. Starts at 1 because a counter is only created when its element is first added to the bag. */
+ int count = 1;
+ }
+
+ /**
+ * A minimal implementation of a Bag that can store elements and a count.
+ *
+ * For the intended purpose the Bag does not have to be a {@link Collection}. It does not
+ * even have to know its own size.
+ */
+    private class TinyBag {
+        /** The backing map from each distinct element to its mutable occurrence counter. */
+        private final Map<T, BagCount> map;
+
+        /**
+         * Create a new tiny bag.
+         *
+         * @param initialCapacity the initial capacity of the backing map
+         */
+        TinyBag(int initialCapacity) {
+            map = new HashMap<>(initialCapacity);
+        }
+
+        /**
+         * Adds a new element to the bag, incrementing its count in the underlying map.
+         *
+         * @param object the object to add
+         */
+        void add(T object) {
+            final BagCount mut = map.get(object);
+            if (mut == null) {
+                // First occurrence: a freshly constructed BagCount already holds a count of 1.
+                map.put(object, new BagCount());
+            } else {
+                mut.count++;
+            }
+        }
+
+        /**
+         * Returns the number of occurrences of the given element in this bag by
+         * looking up its count in the underlying map.
+         *
+         * @param object the object to search for
+         * @return the number of occurrences of the object, zero if not found
+         */
+        int getCount(final Object object) {
+            final BagCount count = map.get(object);
+            if (count != null) {
+                return count.count;
+            }
+            return 0;
+        }
+
+        /**
+         * Returns a possibly parallel Stream of all the entries in the bag.
+         * Each entry maps a distinct element to its occurrence counter.
+         *
+         * @return the stream
+         */
+        Stream<Map.Entry<T, BagCount>> parallelStream() {
+            return map.entrySet().parallelStream();
+        }
+    }
/**
* Create a new set similarity using the provided converter.
*
- * @param converter the converter used to create the set
+ * If the converter returns a {@link Set} then the intersection result will
+ * not include duplicates. Any other {@link Collection} is used to produce a result
+ * that will include duplicates in the intersection and union.
+ *
+ * @param converter the converter used to create the elements from the characters.
* @throws IllegalArgumentException if the converter is null
*/
- public IntersectionSimilarity(Function> converter) {
+ public IntersectionSimilarity(Function> converter) {
if (converter == null) {
throw new IllegalArgumentException("Converter must not be null");
}
@@ -62,18 +139,59 @@ public IntersectionResult apply(final CharSequence left, final CharSequence righ
if (left == null || right == null) {
throw new IllegalArgumentException("Input cannot be null");
}
- final Set setA = converter.apply(left);
- final Set setB = converter.apply(right);
- final int sizeA = setA.size();
- final int sizeB = setB.size();
- // Short-cut if either set is empty
+
+ // Create the elements from the sequences
+ final Collection objectsA = converter.apply(left);
+ final Collection objectsB = converter.apply(right);
+ final int sizeA = objectsA.size();
+ final int sizeB = objectsB.size();
+
+ // Short-cut if either collection is empty
if (Math.min(sizeA, sizeB) == 0) {
// No intersection
return new IntersectionResult(sizeA, sizeB, 0);
}
- // We can use intValue() to convert the Long output from the
- // collector as the intersection cannot be bigger than either set.
- final int intersection = setA.stream().filter(setB::contains).collect(Collectors.counting()).intValue();
+
+ // Intersection = count the number of shared elements
+ int intersection;
+ if (objectsA instanceof Set) {
+ // If a Set then the elements will only have a count of 1.
+ // Stream the elements in the set A and check if also in set B.
+ // Note: Even if objectsB is a plain collection this will work
+ // since the contains(Object) method will return true when present.
+ // The fact that objectsA is a Set ensures non duplicate counting.
+ intersection = objectsA.parallelStream()
+ .mapToInt(element -> objectsB.contains(element) ? 1 : 0)
+ .sum();
+ } else {
+ // Create a bag for each collection
+ final TinyBag bagA = toBag(objectsA);
+ final TinyBag bagB = toBag(objectsB);
+ // Stream the count of each element in bag A and find the intersection with bag B
+ intersection = bagA.parallelStream()
+ .mapToInt(entry -> {
+ // The intersection of this entry in both bags is the min count
+ final T element = entry.getKey();
+ final int count = entry.getValue().count;
+ return Math.min(count, bagB.getCount(element));
+ })
+ .sum();
+ }
+
return new IntersectionResult(sizeA, sizeB, intersection);
}
+
+ /**
+ * Convert the collection to a bag. The bag will contain the count of each element in the collection.
+ *
+ * @param objects the objects
+ * @return the bag
+ */
+    private TinyBag toBag(Collection<T> objects) {
+        // Size the bag from the element count: the number of distinct elements cannot exceed it.
+        final TinyBag bag = new TinyBag(objects.size());
+        for (T t : objects) {
+            bag.add(t);
+        }
+        return bag;
+    }
}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
index a0c0923e95..b21275bdf8 100644
--- a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
@@ -122,28 +122,28 @@ public void testJaccard() {
}
private static double getJaccard(int sizeA, int sizeB, int intersection) {
- return new IntersectionResult(sizeA, sizeB, intersection).getJaccard();
+ return new IntersectionResult(sizeA, sizeB, intersection).getJaccardIndex();
}
@Test
- public void testF1Score() {
- // F1-score is 2 * intersection divided by the size of each set
- Assertions.assertEquals(0, getF1Score(0, 0, 0));
+ public void testSorensenDice() {
+ // Sorensen-Dice is 2 * intersection divided by the sum of the sizes of the two sets
+ Assertions.assertEquals(0, getSorensenDice(0, 0, 0));
- Assertions.assertEquals(0, getF1Score(1, 0, 0));
- Assertions.assertEquals(0, getF1Score(1, 1, 0));
- Assertions.assertEquals(1, getF1Score(1, 1, 1));
+ Assertions.assertEquals(0, getSorensenDice(1, 0, 0));
+ Assertions.assertEquals(0, getSorensenDice(1, 1, 0));
+ Assertions.assertEquals(2.0 * 1 / (1 + 1), getSorensenDice(1, 1, 1));
- Assertions.assertEquals(0, getF1Score(2, 0, 0));
- Assertions.assertEquals(0, getF1Score(2, 1, 0));
- Assertions.assertEquals(2 * 1.0 / (2 + 1), getF1Score(2, 1, 1));
- Assertions.assertEquals(1, getF1Score(2, 2, 2));
+ Assertions.assertEquals(0, getSorensenDice(2, 0, 0));
+ Assertions.assertEquals(0, getSorensenDice(2, 1, 0));
+ Assertions.assertEquals(2.0 * 1 / (2 + 1), getSorensenDice(2, 1, 1));
+ Assertions.assertEquals(2.0 * 2 / (2 + 2), getSorensenDice(2, 2, 2));
- Assertions.assertEquals(2 * 2.0 / (20 + 3), getF1Score(20, 3, 2));
+ Assertions.assertEquals(2.0 * 2 / (20 + 3), getSorensenDice(20, 3, 2));
}
- private static double getF1Score(int sizeA, int sizeB, int intersection) {
- return new IntersectionResult(sizeA, sizeB, intersection).getF1Score();
+ private static double getSorensenDice(int sizeA, int sizeB, int intersection) {
+ return new IntersectionResult(sizeA, sizeB, intersection).getSorensenDiceCoefficient();
}
@Test
@@ -192,11 +192,12 @@ public void testHashCode() {
new IntersectionResult(10, 10, 10),
};
final HashMap map = new HashMap<>();
+ final int offset = 123;
for (int i = 0; i < results.length; i++) {
- map.put(results[i], i);
+ map.put(results[i], i + offset);
}
for (int i = 0; i < results.length; i++) {
- Assertions.assertEquals(i, map.get(results[i]));
+ Assertions.assertEquals(i + offset, map.get(results[i]));
}
}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
index b1054ac6f2..25f3016920 100644
--- a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
@@ -21,8 +21,11 @@
import org.junit.jupiter.api.Test;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.function.Function;
@@ -31,10 +34,10 @@
*/
public class IntersectionSimilarityTest {
@Test
- public void testGetJaccardSimilarityUsingChars() {
+ public void testJaccardIndexUsingSetCharacter() {
// Match the functionality of the JaccardSimilarity class by dividing
// the sequence into single characters
- final Function> converter = (cs) -> {
+ final Function> converter = (cs) -> {
final int length = cs.length();
final Set set = new HashSet<>(length);
for (int i = 0; i < length; i++) {
@@ -44,31 +47,63 @@ public void testGetJaccardSimilarityUsingChars() {
};
final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
- // This is explicitly implemented instead of using the JaccardSimilarity
- // since that class does rounding to 2 D.P.
- // Results generated using the python distance library using: 1 - distance.jaccard(seq1, seq2)
- assertEquals(0.0, similarity.apply("", "").getJaccard());
- assertEquals(0.0, similarity.apply("left", "").getJaccard());
- assertEquals(0.0, similarity.apply("", "right").getJaccard());
- assertEquals(0.75, similarity.apply("frog", "fog").getJaccard());
- assertEquals(0.0, similarity.apply("fly", "ant").getJaccard());
- assertEquals(0.2222222222222222, similarity.apply("elephant", "hippo").getJaccard());
- assertEquals(0.6363636363636364, similarity.apply("ABC Corporation", "ABC Corp").getJaccard());
- assertEquals(0.7647058823529411,
- similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccard());
- assertEquals(0.8888888888888888,
- similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccard());
- assertEquals(0.9, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccard());
- assertEquals(0.125, similarity.apply("left", "right").getJaccard());
- assertEquals(0.125, similarity.apply("leettteft", "ritttght").getJaccard());
- assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccard());
+ // Expected Jaccard index = (intersect / union)
+ // intersection = count of unique matching characters (exclude duplicates)
+ // union = count of unique characters
+ assertEquals(0.0, similarity.apply("", "").getJaccardIndex());
+ assertEquals(0.0, similarity.apply("left", "").getJaccardIndex());
+ assertEquals(0.0, similarity.apply("", "right").getJaccardIndex());
+ assertEquals(3.0 / 4, similarity.apply("frog", "fog").getJaccardIndex());
+ assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex());
+ assertEquals(2.0 / 9, similarity.apply("elephant", "hippo").getJaccardIndex());
+ assertEquals(7.0 / 11, similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex());
+ assertEquals(13.0 / 17, similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccardIndex());
+ assertEquals(16.0 / 18,
+ similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex());
+ assertEquals(9.0 / 10, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex());
+ assertEquals(1.0 / 8, similarity.apply("left", "right").getJaccardIndex());
+ assertEquals(1.0 / 8, similarity.apply("leettteft", "ritttght").getJaccardIndex());
+ assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex());
}
@Test
- public void testGetF1ScoreUsingBigrams() {
- // Compute the F1-score using pairs of characters (bigrams).
+ public void testJaccardIndexUsingListCharacter() {
+ // This test uses a list and so duplicates should be matched
+ final Function> converter = (cs) -> {
+ final int length = cs.length();
+ final List list = new ArrayList<>(length);
+ for (int i = 0; i < length; i++) {
+ list.add(cs.charAt(i));
+ }
+ return list;
+ };
+ final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
+
+ // Expected Jaccard index = (intersect / union)
+ // intersection = count of matching characters including duplicates
+ // union = left.length() + right.length() - intersection
+ assertEquals(0.0, similarity.apply("", "").getJaccardIndex());
+ assertEquals(0.0, similarity.apply("left", "").getJaccardIndex());
+ assertEquals(0.0, similarity.apply("", "right").getJaccardIndex());
+ assertEquals(3.0 / (4 + 3 - 3), similarity.apply("frog", "fog").getJaccardIndex());
+ assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex());
+ assertEquals(2.0 / (8 + 5 - 2), similarity.apply("elephant", "hippo").getJaccardIndex());
+ assertEquals(8.0 / (15 + 8 - 8), similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex());
+ assertEquals(20.0 / (21 + 23 - 20),
+ similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccardIndex());
+ assertEquals(24.0 / (32 + 25 - 24),
+ similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex());
+ assertEquals(11.0 / (12 + 13 - 11), similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex());
+ assertEquals(1.0 / (4 + 5 - 1), similarity.apply("left", "right").getJaccardIndex());
+ assertEquals(4.0 / (9 + 8 - 4), similarity.apply("leettteft", "ritttght").getJaccardIndex());
+ assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex());
+ }
+
+ @Test
+ public void testSorensenDiceCoefficientUsingSetBigrams() {
+ // Compute using pairs of characters (bigrams).
// This can be done using a 32-bit int to store two 16-bit characters
- final Function> converter = (cs) -> {
+ final Function> converter = (cs) -> {
final int length = cs.length();
final Set set = new HashSet<>(length);
if (length > 1) {
@@ -83,30 +118,99 @@ public void testGetF1ScoreUsingBigrams() {
};
final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
- // Note that when there are no bigrams then the similarity is zero.
-
- assertEquals(0d, similarity.apply("", "").getF1Score());
- assertEquals(0d, similarity.apply("", "a").getF1Score());
- assertEquals(0d, similarity.apply("a", "").getF1Score());
- assertEquals(0d, similarity.apply("a", "a").getF1Score());
- assertEquals(0d, similarity.apply("a", "b").getF1Score());
- assertEquals(1.0d, similarity.apply("foo", "foo").getF1Score());
- assertEquals(0.8d, similarity.apply("foo", "foo ").getF1Score());
- assertEquals(0.4d, similarity.apply("frog", "fog").getF1Score());
- assertEquals(0.0d, similarity.apply("fly", "ant").getF1Score());
- assertEquals(0.0d, similarity.apply("elephant", "hippo").getF1Score());
- assertEquals(0.0d, similarity.apply("hippo", "elephant").getF1Score());
- assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz").getF1Score());
- assertEquals(0.5d, similarity.apply("hello", "hallo").getF1Score());
- assertEquals(0.7d, similarity.apply("ABC Corporation", "ABC Corp").getF1Score());
- assertEquals(0.7391304347826086d,
- similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getF1Score());
- assertEquals(0.8076923076923077d,
- similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getF1Score());
- assertEquals(0.6956521739130435, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getF1Score());
- assertEquals(0.9230769230769231, similarity.apply("/opt/software1", "/opt/software2").getF1Score());
- assertEquals(0.5d, similarity.apply("aaabcd", "aaacdb").getF1Score());
- assertEquals(0.631578947368421, similarity.apply("John Horn", "John Hopkins").getF1Score());
+ // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|)
+ // intersection = count of unique matching bigrams (exclude duplicates)
+ // |A| = count of unique bigrams in A
+ // |B| = count of unique bigrams in B
+ assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (2 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (2 + 2), similarity.apply("ababa", "abab").getSorensenDiceCoefficient());
+
+ assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient());
+ assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", "fog").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("fly", "ant").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 7 / (13 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 17 / (20 + 26),
+ similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 21 / (28 + 24), similarity
+ .apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 8 / (11 + 12),
+ similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 12 / (13 + 13),
+ similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (4 + 4), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient());
+ }
+
+ @Test
+ public void testSorensenDiceCoefficientUsingListBigrams() {
+ // Compute using pairs of characters (bigrams).
+ // This can be done using a 32-bit int to store two 16-bit characters
+ final Function> converter = (cs) -> {
+ final int length = cs.length();
+ final List set = new ArrayList<>(length);
+ if (length > 1) {
+ char ch2 = cs.charAt(0);
+ for (int i = 1; i < length; i++) {
+ final char ch1 = ch2;
+ ch2 = cs.charAt(i);
+ set.add(Integer.valueOf((ch1 << 16) | ch2));
+ }
+ }
+ return set;
+ };
+ final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
+
+ // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|)
+ // intersection = count of matching bigrams including duplicates
+ // |A| = max(0, left.length() - 1)
+ // |B| = max(0, right.length() - 1)
+ assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 1 / (2 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 1 / (3 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (3 + 2), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (4 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 3 / (4 + 3), similarity.apply("ababa", "abab").getSorensenDiceCoefficient());
+
+ assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient());
+ assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", "fog").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("fly", "ant").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient());
+ assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 7 / (14 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 17 / (20 + 26),
+ similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 21 / (31 + 24), similarity
+ .apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 8 / (11 + 12),
+ similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 12 / (13 + 13),
+ similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 3 / (5 + 5), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient());
+ assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient());
}
@Test
From dae816a09e2b27e837ac198c8b444d49824733c6 Mon Sep 17 00:00:00 2001
From: aherbert
Date: Fri, 8 Mar 2019 16:00:09 +0000
Subject: [PATCH 08/16] TEXT-155: Add word letter pairs test to
IntersectionSimilarityTest
---
.../IntersectionSimilarityTest.java | 100 +++++++++++++++---
1 file changed, 85 insertions(+), 15 deletions(-)
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
index 25f3016920..c76330b7b1 100644
--- a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
@@ -28,6 +28,7 @@
import java.util.List;
import java.util.Set;
import java.util.function.Function;
+import java.util.regex.Pattern;
/**
* Unit tests for {@link IntersectionSimilarity}.
@@ -37,7 +38,7 @@ public class IntersectionSimilarityTest {
public void testJaccardIndexUsingSetCharacter() {
// Match the functionality of the JaccardSimilarity class by dividing
// the sequence into single characters
- final Function> converter = (cs) -> {
+ final Function> converter = cs -> {
final int length = cs.length();
final Set set = new HashSet<>(length);
for (int i = 0; i < length; i++) {
@@ -49,7 +50,7 @@ public void testJaccardIndexUsingSetCharacter() {
// Expected Jaccard index = (intersect / union)
// intersection = count of unique matching characters (exclude duplicates)
- // union = count of unique characters
+ // union = count of unique characters
assertEquals(0.0, similarity.apply("", "").getJaccardIndex());
assertEquals(0.0, similarity.apply("left", "").getJaccardIndex());
assertEquals(0.0, similarity.apply("", "right").getJaccardIndex());
@@ -69,7 +70,7 @@ public void testJaccardIndexUsingSetCharacter() {
@Test
public void testJaccardIndexUsingListCharacter() {
// This test uses a list and so duplicates should be matched
- final Function> converter = (cs) -> {
+ final Function> converter = cs -> {
final int length = cs.length();
final List list = new ArrayList<>(length);
for (int i = 0; i < length; i++) {
@@ -103,7 +104,7 @@ public void testJaccardIndexUsingListCharacter() {
public void testSorensenDiceCoefficientUsingSetBigrams() {
// Compute using pairs of characters (bigrams).
// This can be done using a 32-bit int to store two 16-bit characters
- final Function> converter = (cs) -> {
+ final Function> converter = cs -> {
final int length = cs.length();
final Set set = new HashSet<>(length);
if (length > 1) {
@@ -120,8 +121,8 @@ public void testSorensenDiceCoefficientUsingSetBigrams() {
// Expected Sorensen-Dice = 2 * intersection / (|A| + |B|)
// intersection = count of unique matching bigrams (exclude duplicates)
- // |A| = count of unique bigrams in A
- // |B| = count of unique bigrams in B
+ // |A| = count of unique bigrams in A
+ // |B| = count of unique bigrams in B
assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient());
assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient());
assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient());
@@ -160,25 +161,25 @@ public void testSorensenDiceCoefficientUsingSetBigrams() {
public void testSorensenDiceCoefficientUsingListBigrams() {
// Compute using pairs of characters (bigrams).
// This can be done using a 32-bit int to store two 16-bit characters
- final Function> converter = (cs) -> {
+ final Function> converter = cs -> {
final int length = cs.length();
- final List set = new ArrayList<>(length);
+ final List list = new ArrayList<>(length);
if (length > 1) {
char ch2 = cs.charAt(0);
for (int i = 1; i < length; i++) {
final char ch1 = ch2;
ch2 = cs.charAt(i);
- set.add(Integer.valueOf((ch1 << 16) | ch2));
+ list.add(Integer.valueOf((ch1 << 16) | ch2));
}
}
- return set;
+ return list;
};
final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
// Expected Sorensen-Dice = 2 * intersection / (|A| + |B|)
// intersection = count of matching bigrams including duplicates
- // |A| = max(0, left.length() - 1)
- // |B| = max(0, right.length() - 1)
+ // |A| = max(0, left.length() - 1)
+ // |B| = max(0, right.length() - 1)
assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient());
assertEquals(2.0 * 1 / (2 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient());
assertEquals(2.0 * 1 / (3 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient());
@@ -213,6 +214,75 @@ public void testSorensenDiceCoefficientUsingListBigrams() {
assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient());
}
+    @Test
+    public void testSorensenDiceCoefficientUsingListWordBigrams() {
+        // Example of a word letter pairs algorithm:
+        // http://www.catalysoft.com/articles/StrikeAMatch.html
+
+        // Split on whitespace
+        final Pattern pattern = Pattern.compile("\\s+");
+
+        // Compute using pairs of characters (bigrams) for each word.
+        // This can be done using a 32-bit int to store two 16-bit characters.
+        // A List is returned (not a Set) so that duplicate bigrams are counted.
+        final Function<CharSequence, Collection<Integer>> converter = cs -> {
+            final List<Integer> list = new ArrayList<>();
+            for (String word : pattern.split(cs)) {
+                if (word.length() > 1) {
+                    // The strings are converted to upper case
+                    char ch2 = Character.toUpperCase(word.charAt(0));
+                    for (int i = 1; i < word.length(); i++) {
+                        final char ch1 = ch2;
+                        ch2 = Character.toUpperCase(word.charAt(i));
+                        list.add(Integer.valueOf((ch1 << 16) | ch2));
+                    }
+                }
+            }
+            return list;
+        };
+        final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
+
+        String bookTitle;
+        final String search1 = "Web Database Applications";
+        final String search2 = "PHP Web Applications";
+        final String search3 = "Web Aplications";
+        bookTitle = "Web Database Applications with PHP & MySQL";
+        assertEquals(82, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(68, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(59, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+        bookTitle = "Creating Database Web Applications with PHP and ASP";
+        assertEquals(71, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(59, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(50, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+        bookTitle = "Building Database Applications on the Web Using PHP3";
+        assertEquals(70, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(58, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(49, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+        bookTitle = "Building Web Database Applications with Visual Studio 6";
+        assertEquals(67, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(47, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(46, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+        bookTitle = "Web Application Development With PHP";
+        assertEquals(51, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(67, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(56, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+        bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
+        assertEquals(49, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(34, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(32, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+        bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
+        assertEquals(12, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(7, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(7, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+        bookTitle = "How to Find a Scholarship Online";
+        assertEquals(10, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
+        assertEquals(11, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
+        assertEquals(12, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
+    }
+
+    /**
+     * Converts a fraction to a whole-number percentage, rounding to the nearest integer.
+     *
+     * @param value the fraction to convert (e.g. 0.82)
+     * @return the value multiplied by 100 and rounded (e.g. 82)
+     */
+    private static int toPercent(double value) {
+        return (int) Math.round(value * 100);
+    }
+
+
@Test
public void testConstructorWithNullConverterThrows() {
assertThatIllegalArgumentException().isThrownBy(() -> {
@@ -223,21 +293,21 @@ public void testConstructorWithNullConverterThrows() {
@Test
public void testApplyNullNull() {
assertThatIllegalArgumentException().isThrownBy(() -> {
- new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
+ new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
});
}
@Test
public void testApplyStringNull() {
assertThatIllegalArgumentException().isThrownBy(() -> {
- new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
+ new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
});
}
@Test
public void testApplyNullString() {
assertThatIllegalArgumentException().isThrownBy(() -> {
- new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
+ new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
});
}
}
From ae21c63dcb7f57893d095c802c596b7db268d673 Mon Sep 17 00:00:00 2001
From: aherbert
Date: Fri, 8 Mar 2019 16:08:45 +0000
Subject: [PATCH 09/16] Text-155: Javadoc fix in IntersectionResult
---
.../apache/commons/text/similarity/IntersectionResult.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
index e63f2b9503..f06d021b71 100644
--- a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
@@ -21,7 +21,7 @@
/**
* Container class to store the intersection results between two sets.
*
- * Stores the size of set A, set B and the intersection of A and B (A ∩ B).
+ * Stores the size of set A, set B and the intersection of A and B (|A ∩ B|).
* The result can be used to produce various similarity metrics, for example the Jaccard index or
* Sørensen-Dice coefficient (F1 score).
*
@@ -51,7 +51,7 @@ public class IntersectionResult {
*
* @param sizeA the size of set A ({@code |A|})
* @param sizeB the size of set B ({@code |B|})
- * @param intersection the size of the intersection of A and B (A ∩ B)
+ * @param intersection the size of the intersection of A and B (|A ∩ B|)
* @throws IllegalArgumentException if the sizes are negative or the intersection is greater
* than the minimum of the two set sizes
*/
From 9a7d018c3e85031749166195ebab66c07b7d94c6 Mon Sep 17 00:00:00 2001
From: Alex Herbert
Date: Sat, 9 Mar 2019 21:19:47 +0000
Subject: [PATCH 10/16] TEXT-155: Renamed to OverlapSimilarity.
Removed computation of metrics from the OverlapResult.
---
...rsectionResult.java => OverlapResult.java} | 57 +---
...Similarity.java => OverlapSimilarity.java} | 51 ++-
.../IntersectionSimilarityTest.java | 313 ------------------
...ResultTest.java => OverlapResultTest.java} | 97 ++----
.../similarity/OverlapSimilarityTest.java | 288 ++++++++++++++++
5 files changed, 350 insertions(+), 456 deletions(-)
rename src/main/java/org/apache/commons/text/similarity/{IntersectionResult.java => OverlapResult.java} (60%)
rename src/main/java/org/apache/commons/text/similarity/{IntersectionSimilarity.java => OverlapSimilarity.java} (77%)
delete mode 100644 src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
rename src/test/java/org/apache/commons/text/similarity/{IntersectionResultTest.java => OverlapResultTest.java} (60%)
create mode 100644 src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/OverlapResult.java
similarity index 60%
rename from src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
rename to src/main/java/org/apache/commons/text/similarity/OverlapResult.java
index f06d021b71..021fab9c5a 100644
--- a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
+++ b/src/main/java/org/apache/commons/text/similarity/OverlapResult.java
@@ -19,20 +19,20 @@
import java.util.Objects;
/**
- * Container class to store the intersection results between two sets.
+ * Container class to store the overlap results between two sets.
*
- * Stores the size of set A, set B and the intersection of A and B (|A ∩ B|).
- * The result can be used to produce various similarity metrics, for example the Jaccard index or
- * Sørensen-Dice coefficient (F1 score).
+ * Stores the size of set A ({@code |A|}), set B ({@code |B|}) and the
+ * intersection of A and B (|A ∩ B|). The result can be used
+ * to produce the union of A and B (|A ∪ B|).
*
* This class is immutable.
*
* @since 1.7
- * @see Jaccard index
- * @see Sørensen Dice coefficient
- * @see F1 score
+ * @see Intersection
+ * @see Union
*/
-public class IntersectionResult {
+public class OverlapResult {
/**
* The size of set A.
*/
@@ -55,7 +55,7 @@ public class IntersectionResult {
* @throws IllegalArgumentException if the sizes are negative or the intersection is greater
* than the minimum of the two set sizes
*/
- public IntersectionResult(final int sizeA, final int sizeB, final int intersection) {
+ public OverlapResult(final int sizeA, final int sizeB, final int intersection) {
if (sizeA < 0) {
throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA);
}
@@ -105,43 +105,6 @@ public long getUnion() {
return (long) sizeA + sizeB - intersection;
}
- /**
- * Gets the Jaccard index. The Jaccard is the intersection divided by the union.
- *
- * |A ∩ B| / |A ∪ B|
- *
- * This implementation defines the result as zero if there is no intersection,
- * even when the union is zero to avoid a {@link Double#NaN} result.
- *
- * @return the Jaccard index
- * @see Jaccard index
- */
- public double getJaccardIndex() {
- return intersection == 0 ? 0.0 : (double) intersection / getUnion();
- }
-
- /**
- * Gets the Sørensen-Dice coefficient. The coefficient is twice the size of the intersection
- * divided by the size of both sets.
- *
- *
- * 2|A ∩ B| / (|A| + |B|)
- *
- *
- * This is also known as the F1 score.
- *
- * This implementation defines the result as zero if there is no intersection, even when the size
- * of both sets is zero to avoid a {@link Double#NaN} result.
- *
- * @return the Sørensen-Dice coefficient
- * @see Sørensen
- * Dice coefficient
- * @see F1 score
- */
- public double getSorensenDiceCoefficient() {
- return intersection == 0 ? 0.0 : 2.0 * intersection / ((long) sizeA + sizeB);
- }
-
@Override
public boolean equals(final Object o) {
if (this == o) {
@@ -150,7 +113,7 @@ public boolean equals(final Object o) {
if (o == null || getClass() != o.getClass()) {
return false;
}
- final IntersectionResult result = (IntersectionResult) o;
+ final OverlapResult result = (OverlapResult) o;
return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection;
}
diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java
similarity index 77%
rename from src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
rename to src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java
index 0751553066..df71654a96 100644
--- a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java
@@ -19,12 +19,12 @@
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Function;
-import java.util.stream.Stream;
/**
- * Measures the intersection of two sets created from a pair of character sequences.
+ * Measures the overlap of two sets created from a pair of character sequences.
*
* It is assumed that the type {@code T} correctly conforms to the requirements for storage
* within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
@@ -35,7 +35,7 @@
* @see Set
* @see HashMap
*/
-public class IntersectionSimilarity implements SimilarityScore {
+public class OverlapSimilarity implements SimilarityScore {
/** The converter used to create the elements from the characters. */
private final Function> converter;
@@ -100,17 +100,17 @@ int getCount(final Object object) {
}
/**
- * Returns a possibly parallel Stream of all the entries in the bag.
+ * Returns a Set view of the mappings contained in this bag.
*
- * @return the stream
+ * @return the Set view
*/
- Stream> parallelStream() {
- return map.entrySet().parallelStream();
+ Set> entrySet() {
+ return map.entrySet();
}
}
/**
- * Create a new set similarity using the provided converter.
+ * Create a new overlap similarity using the provided converter.
*
* If the converter returns a {@link Set} then the intersection result will
* not include duplicates. Any other {@link Collection} is used to produce a result
@@ -119,7 +119,7 @@ Stream> parallelStream() {
* @param converter the converter used to create the elements from the characters.
* @throws IllegalArgumentException if the converter is null
*/
- public IntersectionSimilarity(Function> converter) {
+ public OverlapSimilarity(Function> converter) {
if (converter == null) {
throw new IllegalArgumentException("Converter must not be null");
}
@@ -135,7 +135,7 @@ public IntersectionSimilarity(Function> converter) {
* @throws IllegalArgumentException if either input sequence is {@code null}
*/
@Override
- public IntersectionResult apply(final CharSequence left, final CharSequence right) {
+ public OverlapResult apply(final CharSequence left, final CharSequence right) {
if (left == null || right == null) {
throw new IllegalArgumentException("Input cannot be null");
}
@@ -149,36 +149,35 @@ public IntersectionResult apply(final CharSequence left, final CharSequence righ
// Short-cut if either collection is empty
if (Math.min(sizeA, sizeB) == 0) {
// No intersection
- return new IntersectionResult(sizeA, sizeB, 0);
+ return new OverlapResult(sizeA, sizeB, 0);
}
// Intersection = count the number of shared elements
- int intersection;
+ int intersection = 0;
if (objectsA instanceof Set) {
// If a Set then the elements will only have a count of 1.
- // Stream the elements in the set A and check if also in set B.
// Note: Even if objectsB is a plain collection this will work
// since the contains(Object) method will return true when present.
// The fact that objectsA is a Set ensures non duplicate counting.
- intersection = objectsA.parallelStream()
- .mapToInt(element -> objectsB.contains(element) ? 1 : 0)
- .sum();
+ for (T element : objectsA) {
+ if (objectsB.contains(element)) {
+ intersection++;
+ }
+ }
} else {
// Create a bag for each collection
final TinyBag bagA = toBag(objectsA);
final TinyBag bagB = toBag(objectsB);
- // Stream the count of each element in bag A and find the intersection with bag B
- intersection = bagA.parallelStream()
- .mapToInt(entry -> {
- // The intersection of this entry in both bags is the min count
- final T element = entry.getKey();
- final int count = entry.getValue().count;
- return Math.min(count, bagB.getCount(element));
- })
- .sum();
+ // Find the intersection of each element in bag A with bag B
+ for (Entry entry : bagA.entrySet()) {
+ final T element = entry.getKey();
+ final int count = entry.getValue().count;
+ // The intersection of this entry in both bags is the minimum count
+ intersection += Math.min(count, bagB.getCount(element));
+ }
}
- return new IntersectionResult(sizeA, sizeB, intersection);
+ return new OverlapResult(sizeA, sizeB, intersection);
}
/**
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
deleted file mode 100644
index c76330b7b1..0000000000
--- a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity;
-
-import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import org.junit.jupiter.api.Test;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.function.Function;
-import java.util.regex.Pattern;
-
-/**
- * Unit tests for {@link IntersectionSimilarity}.
- */
-public class IntersectionSimilarityTest {
- @Test
- public void testJaccardIndexUsingSetCharacter() {
- // Match the functionality of the JaccardSimilarity class by dividing
- // the sequence into single characters
- final Function> converter = cs -> {
- final int length = cs.length();
- final Set set = new HashSet<>(length);
- for (int i = 0; i < length; i++) {
- set.add(cs.charAt(i));
- }
- return set;
- };
- final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
-
- // Expected Jaccard index = (intersect / union)
- // intersection = count of unique matching characters (exclude duplicates)
- // union = count of unique characters
- assertEquals(0.0, similarity.apply("", "").getJaccardIndex());
- assertEquals(0.0, similarity.apply("left", "").getJaccardIndex());
- assertEquals(0.0, similarity.apply("", "right").getJaccardIndex());
- assertEquals(3.0 / 4, similarity.apply("frog", "fog").getJaccardIndex());
- assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex());
- assertEquals(2.0 / 9, similarity.apply("elephant", "hippo").getJaccardIndex());
- assertEquals(7.0 / 11, similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex());
- assertEquals(13.0 / 17, similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccardIndex());
- assertEquals(16.0 / 18,
- similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex());
- assertEquals(9.0 / 10, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex());
- assertEquals(1.0 / 8, similarity.apply("left", "right").getJaccardIndex());
- assertEquals(1.0 / 8, similarity.apply("leettteft", "ritttght").getJaccardIndex());
- assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex());
- }
-
- @Test
- public void testJaccardIndexUsingListCharacter() {
- // This test uses a list and so duplicates should be matched
- final Function> converter = cs -> {
- final int length = cs.length();
- final List list = new ArrayList<>(length);
- for (int i = 0; i < length; i++) {
- list.add(cs.charAt(i));
- }
- return list;
- };
- final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
-
- // Expected Jaccard index = (intersect / union)
- // intersection = count of matching characters including duplicates
- // union = left.length() + right.length() - intersection
- assertEquals(0.0, similarity.apply("", "").getJaccardIndex());
- assertEquals(0.0, similarity.apply("left", "").getJaccardIndex());
- assertEquals(0.0, similarity.apply("", "right").getJaccardIndex());
- assertEquals(3.0 / (4 + 3 - 3), similarity.apply("frog", "fog").getJaccardIndex());
- assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex());
- assertEquals(2.0 / (8 + 5 - 2), similarity.apply("elephant", "hippo").getJaccardIndex());
- assertEquals(8.0 / (15 + 8 - 8), similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex());
- assertEquals(20.0 / (21 + 23 - 20),
- similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccardIndex());
- assertEquals(24.0 / (32 + 25 - 24),
- similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex());
- assertEquals(11.0 / (12 + 13 - 11), similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex());
- assertEquals(1.0 / (4 + 5 - 1), similarity.apply("left", "right").getJaccardIndex());
- assertEquals(4.0 / (9 + 8 - 4), similarity.apply("leettteft", "ritttght").getJaccardIndex());
- assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex());
- }
-
- @Test
- public void testSorensenDiceCoefficientUsingSetBigrams() {
- // Compute using pairs of characters (bigrams).
- // This can be done using a 32-bit int to store two 16-bit characters
- final Function> converter = cs -> {
- final int length = cs.length();
- final Set set = new HashSet<>(length);
- if (length > 1) {
- char ch2 = cs.charAt(0);
- for (int i = 1; i < length; i++) {
- final char ch1 = ch2;
- ch2 = cs.charAt(i);
- set.add(Integer.valueOf((ch1 << 16) | ch2));
- }
- }
- return set;
- };
- final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
-
- // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|)
- // intersection = count of unique matching bigrams (exclude duplicates)
- // |A| = count of unique bigrams in A
- // |B| = count of unique bigrams in B
- assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (2 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (2 + 2), similarity.apply("ababa", "abab").getSorensenDiceCoefficient());
-
- assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient());
- assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient());
- assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", "fog").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("fly", "ant").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient());
- assertEquals(2.0 * 7 / (13 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient());
- assertEquals(2.0 * 17 / (20 + 26),
- similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient());
- assertEquals(2.0 * 21 / (28 + 24), similarity
- .apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getSorensenDiceCoefficient());
- assertEquals(2.0 * 8 / (11 + 12),
- similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient());
- assertEquals(2.0 * 12 / (13 + 13),
- similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (4 + 4), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient());
- assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient());
- }
-
- @Test
- public void testSorensenDiceCoefficientUsingListBigrams() {
- // Compute using pairs of characters (bigrams).
- // This can be done using a 32-bit int to store two 16-bit characters
- final Function> converter = cs -> {
- final int length = cs.length();
- final List list = new ArrayList<>(length);
- if (length > 1) {
- char ch2 = cs.charAt(0);
- for (int i = 1; i < length; i++) {
- final char ch1 = ch2;
- ch2 = cs.charAt(i);
- list.add(Integer.valueOf((ch1 << 16) | ch2));
- }
- }
- return list;
- };
- final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
-
- // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|)
- // intersection = count of matching bigrams including duplicates
- // |A| = max(0, left.length() - 1)
- // |B| = max(0, right.length() - 1)
- assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 1 / (2 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 1 / (3 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (3 + 2), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (4 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient());
- assertEquals(2.0 * 3 / (4 + 3), similarity.apply("ababa", "abab").getSorensenDiceCoefficient());
-
- assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient());
- assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient());
- assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", "fog").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("fly", "ant").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient());
- assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient());
- assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient());
- assertEquals(2.0 * 7 / (14 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient());
- assertEquals(2.0 * 17 / (20 + 26),
- similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient());
- assertEquals(2.0 * 21 / (31 + 24), similarity
- .apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getSorensenDiceCoefficient());
- assertEquals(2.0 * 8 / (11 + 12),
- similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient());
- assertEquals(2.0 * 12 / (13 + 13),
- similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient());
- assertEquals(2.0 * 3 / (5 + 5), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient());
- assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient());
- }
-
- @Test
- public void testSorensenDiceCoefficientUsingListWordBigrams() {
- // Example of a word letter pairs algorithm:
- // http://www.catalysoft.com/articles/StrikeAMatch.html
-
- // Split on whitespace
- final Pattern pattern = Pattern.compile("\\s+");
-
- // Compute using pairs of characters (bigrams) for each word.
- // This can be done using a 32-bit int to store two 16-bit characters
- final Function> converter = cs -> {
- final List set = new ArrayList<>();
- for (String word : pattern.split(cs)) {
- if (word.length() > 1) {
- // The strings are converted to upper case
- char ch2 = Character.toUpperCase(word.charAt(0));
- for (int i = 1; i < word.length(); i++) {
- final char ch1 = ch2;
- ch2 = Character.toUpperCase(word.charAt(i));
- set.add(Integer.valueOf((ch1 << 16) | ch2));
- }
- }
- }
- return set;
- };
- final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter);
-
- String bookTitle;
- final String search1 = "Web Database Applications";
- final String search2 = "PHP Web Applications";
- final String search3 = "Web Aplications";
- bookTitle = "Web Database Applications with PHP & MySQL";
- assertEquals(82, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(68, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(59, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- bookTitle = "Creating Database Web Applications with PHP and ASP";
- assertEquals(71, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(59, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(50, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- bookTitle = "Building Database Applications on the Web Using PHP3";
- assertEquals(70, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(58, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(49, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- bookTitle = "Building Web Database Applications with Visual Studio 6";
- assertEquals(67, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(47, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(46, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- bookTitle = "Web Application Development With PHP";
- assertEquals(51, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(67, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(56, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
- assertEquals(49, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(34, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(32, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
- assertEquals(12, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(7, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(7, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- bookTitle = "How to Find a Scholarship Online";
- assertEquals(10, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient()));
- assertEquals(11, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient()));
- assertEquals(12, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient()));
- }
-
- private static int toPercent(double value) {
- return (int) Math.round(value * 100);
- }
-
- @Test
- public void testConstructorWithNullConverterThrows() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new IntersectionSimilarity<>(null);
- });
- }
-
- @Test
- public void testApplyNullNull() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
- });
- }
-
- @Test
- public void testApplyStringNull() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
- });
- }
-
- @Test
- public void testApplyNullString() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
- });
- }
-}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java
similarity index 60%
rename from src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
rename to src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java
index b21275bdf8..8de8aa099b 100644
--- a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java
@@ -23,58 +23,57 @@
import java.util.concurrent.ThreadLocalRandom;
/**
- * Unit tests for {@link FuzzyScore}.
+ * Unit tests for {@link OverlapResult}.
*/
-public class IntersectionResultTest {
-
+public class OverlapResultTest {
@Test
- public void testNewIntersectionResult_WithZeros() {
+ public void testNewOverlapResult_WithZeros() {
final int sizeA = 0;
final int sizeB = 0;
final int intersection = 0;
- new IntersectionResult(sizeA, sizeB, intersection);
+ new OverlapResult(sizeA, sizeB, intersection);
}
@Test
- public void testNewIntersectionResult_WithNegativeSizeA() {
+ public void testNewOverlapResult_WithNegativeSizeA() {
final int sizeA = -1;
final int sizeB = 0;
final int intersection = 0;
Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new IntersectionResult(sizeA, sizeB, intersection);
+ new OverlapResult(sizeA, sizeB, intersection);
});
}
@Test
- public void testNewIntersectionResult_WithNegativeSizeB() {
+ public void testNewOverlapResult_WithNegativeSizeB() {
final int sizeA = 0;
final int sizeB = -1;
final int intersection = 0;
Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new IntersectionResult(sizeA, sizeB, intersection);
+ new OverlapResult(sizeA, sizeB, intersection);
});
}
@Test
- public void testNewIntersectionResult_WithNegativeIntersection() {
+ public void testNewOverlapResult_WithNegativeIntersection() {
final int sizeA = 0;
final int sizeB = 0;
final int intersection = -1;
Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new IntersectionResult(sizeA, sizeB, intersection);
+ new OverlapResult(sizeA, sizeB, intersection);
});
}
@Test
- public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() {
+ public void testNewOverlapResult_WithIntersectionAboveSizeAorB() {
final int sizeA = 1;
final int sizeB = 2;
final int intersection = Math.max(sizeA, sizeB) + 1;
Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new IntersectionResult(sizeA, sizeB, intersection);
+ new OverlapResult(sizeA, sizeB, intersection);
});
Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new IntersectionResult(sizeB, sizeA, intersection);
+ new OverlapResult(sizeB, sizeA, intersection);
});
}
@@ -101,49 +100,7 @@ public void testUnion() {
}
private static long getUnion(int sizeA, int sizeB, int intersection) {
- return new IntersectionResult(sizeA, sizeB, intersection).getUnion();
- }
-
- @Test
- public void testJaccard() {
- // Jaccard is the intersection divided by the union
- Assertions.assertEquals(0, getJaccard(0, 0, 0));
-
- Assertions.assertEquals(0, getJaccard(1, 0, 0));
- Assertions.assertEquals(0, getJaccard(1, 1, 0));
- Assertions.assertEquals(1, getJaccard(1, 1, 1));
-
- Assertions.assertEquals(0, getJaccard(2, 0, 0));
- Assertions.assertEquals(0, getJaccard(2, 1, 0));
- Assertions.assertEquals(1.0 / 2, getJaccard(2, 1, 1));
- Assertions.assertEquals(1, getJaccard(2, 2, 2));
-
- Assertions.assertEquals(2.0 / 21, getJaccard(20, 3, 2));
- }
-
- private static double getJaccard(int sizeA, int sizeB, int intersection) {
- return new IntersectionResult(sizeA, sizeB, intersection).getJaccardIndex();
- }
-
- @Test
- public void testSorensenDice() {
- // Sorensen-Dice is 2 * intersection divided by the size of each set
- Assertions.assertEquals(0, getSorensenDice(0, 0, 0));
-
- Assertions.assertEquals(0, getSorensenDice(1, 0, 0));
- Assertions.assertEquals(0, getSorensenDice(1, 1, 0));
- Assertions.assertEquals(2.0 * 1 / (1 + 1), getSorensenDice(1, 1, 1));
-
- Assertions.assertEquals(0, getSorensenDice(2, 0, 0));
- Assertions.assertEquals(0, getSorensenDice(2, 1, 0));
- Assertions.assertEquals(2.0 * 1 / (2 + 1), getSorensenDice(2, 1, 1));
- Assertions.assertEquals(2.0 * 2 / (2 + 2), getSorensenDice(2, 2, 2));
-
- Assertions.assertEquals(2.0 * 2 / (20 + 3), getSorensenDice(20, 3, 2));
- }
-
- private static double getSorensenDice(int sizeA, int sizeB, int intersection) {
- return new IntersectionResult(sizeA, sizeB, intersection).getSorensenDiceCoefficient();
+ return new OverlapResult(sizeA, sizeB, intersection).getUnion();
}
@Test
@@ -155,7 +112,7 @@ public void testProperties() {
final int sizeA = rand.nextInt(max) + 1;
final int sizeB = rand.nextInt(max) + 1;
final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
- final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+ final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection);
Assertions.assertEquals(sizeA, result.getSizeA());
Assertions.assertEquals(sizeB, result.getSizeB());
Assertions.assertEquals(intersection, result.getIntersection());
@@ -164,15 +121,15 @@ public void testProperties() {
@Test
public void testEquals() {
- final IntersectionResult[] results = new IntersectionResult[] {
- new IntersectionResult(0, 0, 0),
- new IntersectionResult(10, 0, 0),
- new IntersectionResult(10, 10, 0),
- new IntersectionResult(10, 10, 10),
+ final OverlapResult[] results = new OverlapResult[] {
+ new OverlapResult(0, 0, 0),
+ new OverlapResult(10, 0, 0),
+ new OverlapResult(10, 10, 0),
+ new OverlapResult(10, 10, 10),
};
// Test difference instance with same values
- Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0)));
+ Assertions.assertTrue(results[0].equals(new OverlapResult(0, 0, 0)));
final Object something = new Object();
for (int i = 0; i < results.length; i++) {
@@ -186,12 +143,12 @@ public void testEquals() {
@Test
public void testHashCode() {
- final IntersectionResult[] results = new IntersectionResult[] {
- new IntersectionResult(10, 0, 0),
- new IntersectionResult(10, 10, 0),
- new IntersectionResult(10, 10, 10),
+ final OverlapResult[] results = new OverlapResult[] {
+ new OverlapResult(10, 0, 0),
+ new OverlapResult(10, 10, 0),
+ new OverlapResult(10, 10, 10),
};
- final HashMap map = new HashMap<>();
+ final HashMap map = new HashMap<>();
final int offset = 123;
for (int i = 0; i < results.length; i++) {
map.put(results[i], i + offset);
@@ -210,7 +167,7 @@ public void testToString() {
final int sizeA = rand.nextInt(max) + 1;
final int sizeB = rand.nextInt(max) + 1;
final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
- final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+ final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection);
final String string = result.toString();
// Not perfect as this will match substrings too. The chance of error
// is limited by restricting the numbers to a max of 10.
diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
new file mode 100644
index 0000000000..0c93ba4324
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+
+/**
+ * Unit tests for {@link OverlapSimilarity}.
+ */
+public class OverlapSimilarityTest {
+ @Test
+ public void testOverlapUsingSetCharacter() {
+ // Compute using single characters.
+ // This test uses a set and so should not allow duplicates.
+ final Function> converter = cs -> {
+ final int length = cs.length();
+ final Set set = new HashSet<>(length);
+ for (int i = 0; i < length; i++) {
+ set.add(cs.charAt(i));
+ }
+ return set;
+ };
+ final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
+
+ // Expected:
+ // size A or B = count of unique characters (exclude duplicates)
+ // intersection = count of unique matching characters (exclude duplicates)
+ // union = count of unique characters in total (exclude duplicates)
+ assertOverlap(similarity, "", "", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "", 1, 0, 0, 1);
+ assertOverlap(similarity, "a", "a", 1, 1, 1, 1);
+ assertOverlap(similarity, "a", "b", 1, 1, 0, 2);
+ assertOverlap(similarity, "aa", "ab", 1, 2, 1, 2);
+ assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2);
+ assertOverlap(similarity, "aaba", "abaa", 2, 2, 2, 2);
+ assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1);
+ assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1);
+ assertOverlap(similarity, "aabab", "ababa", 2, 2, 2, 2);
+ assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7);
+ assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15);
+ }
+
+ @Test
+ public void testOverlapUsingListCharacter() {
+ // Compute using single characters.
+ // This test uses a list and so duplicates should be matched.
+ final Function> converter = cs -> {
+ final int length = cs.length();
+ final List list = new ArrayList<>(length);
+ for (int i = 0; i < length; i++) {
+ list.add(cs.charAt(i));
+ }
+ return list;
+ };
+ final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
+
+ // Expected:
+ // size A or B = sequence length
+ // intersection = count of matching characters (include duplicates)
+ // union = count of matching characters (include duplicates) plus unmatched
+ // = size A + size B - intersection
+ assertOverlap(similarity, "", "", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "", 1, 0, 0, 1);
+ assertOverlap(similarity, "a", "a", 1, 1, 1, 1);
+ assertOverlap(similarity, "a", "b", 1, 1, 0, 2);
+ assertOverlap(similarity, "aa", "ab", 2, 2, 1, 3);
+ assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2);
+ assertOverlap(similarity, "aaba", "abaa", 4, 4, 4, 4);
+ assertOverlap(similarity, "aaaa", "aa", 4, 2, 2, 4);
+ assertOverlap(similarity, "aaaa", "aaa", 4, 3, 3, 4);
+ assertOverlap(similarity, "aabab", "ababa", 5, 5, 5, 5);
+ assertOverlap(similarity, "the same", "the same", 8, 8, 8, 8);
+ assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15);
+ }
+
+ @Test
+ public void testOverlapUsingSetBigrams() {
+ // Compute using pairs of characters (bigrams).
+ // This can be done using a 32-bit int to store two 16-bit characters.
+ // This test uses a set and so should not allow duplicates.
+ final Function> converter = cs -> {
+ final int length = cs.length();
+ final Set set = new HashSet<>(length);
+ if (length > 1) {
+ char ch2 = cs.charAt(0);
+ for (int i = 1; i < length; i++) {
+ final char ch1 = ch2;
+ ch2 = cs.charAt(i);
+ set.add(Integer.valueOf((ch1 << 16) | ch2));
+ }
+ }
+ return set;
+ };
+ final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
+
+ // Expected:
+ // size A or B = count of unique bigrams (exclude duplicates)
+ // intersection = count of unique matching bigrams (exclude duplicates)
+ // union = count of unique bigrams in total (exclude duplicates)
+ assertOverlap(similarity, "", "", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "a", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "b", 0, 0, 0, 0);
+ assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2);
+ assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1);
+ assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3);
+ assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1);
+ assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1);
+ assertOverlap(similarity, "aabab", "ababa", 3, 2, 2, 3);
+ assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7);
+ assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16);
+ }
+
+ @Test
+ public void testOverlapUsingListBigrams() {
+ // Compute using pairs of characters (bigrams).
+ // This can be done using a 32-bit int to store two 16-bit characters.
+ // This test uses a list and so duplicates should be matched.
+ final Function> converter = cs -> {
+ final int length = cs.length();
+ final List list = new ArrayList<>(length);
+ if (length > 1) {
+ char ch2 = cs.charAt(0);
+ for (int i = 1; i < length; i++) {
+ final char ch1 = ch2;
+ ch2 = cs.charAt(i);
+ list.add(Integer.valueOf((ch1 << 16) | ch2));
+ }
+ }
+ return list;
+ };
+ final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
+
+ // Expected:
+ // size A or B = sequence length - 1
+ // intersection = count of matching bigrams (include duplicates)
+ // union = count of matching bigrams (include duplicates)
+ // = size A + size B - intersection
+ assertOverlap(similarity, "", "", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "a", 0, 0, 0, 0);
+ assertOverlap(similarity, "a", "b", 0, 0, 0, 0);
+ assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2);
+ assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1);
+ assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3);
+ assertOverlap(similarity, "aaaa", "aa", 3, 1, 1, 3);
+ assertOverlap(similarity, "aaaa", "aaa", 3, 2, 2, 3);
+ assertOverlap(similarity, "aabab", "ababa", 4, 4, 3, 5);
+ assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7);
+ assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16);
+ }
+
+ private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2,
+ int sizeA, int sizeB, int intersection, int union) {
+ OverlapResult overlap = similarity.apply(cs1, cs2);
+ assertEquals(sizeA, overlap.getSizeA(), "Size A error");
+ assertEquals(sizeB, overlap.getSizeB(), "Size B error");
+ assertEquals(intersection, overlap.getIntersection(), "Intersection error");
+ assertEquals(union, overlap.getUnion(), "Union error");
+ }
+
+ @Test
+ public void testF1ScoreUsingListWordBigrams() {
+ // Example of a word letter pairs algorithm by Simon White:
+ // http://www.catalysoft.com/articles/StrikeAMatch.html
+ // This splits into words using whitespace and then computes uppercase
+ // bigrams for each word.
+
+ // Split on whitespace
+ final Pattern pattern = Pattern.compile("\\s+");
+
+ // Compute using pairs of characters (bigrams) for each word.
+ // This can be done using a 32-bit int to store two 16-bit characters
+ final Function> converter = cs -> {
+ final List set = new ArrayList<>();
+ for (String word : pattern.split(cs)) {
+ if (word.length() > 1) {
+ // The strings are converted to upper case
+ char ch2 = Character.toUpperCase(word.charAt(0));
+ for (int i = 1; i < word.length(); i++) {
+ final char ch1 = ch2;
+ ch2 = Character.toUpperCase(word.charAt(i));
+ set.add(Integer.valueOf((ch1 << 16) | ch2));
+ }
+ }
+ }
+ return set;
+ };
+ final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
+
+ String bookTitle;
+ final String search1 = "Web Database Applications";
+ final String search2 = "PHP Web Applications";
+ final String search3 = "Web Aplications";
+ bookTitle = "Web Database Applications with PHP & MySQL";
+ assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Creating Database Web Applications with PHP and ASP";
+ assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Building Database Applications on the Web Using PHP3";
+ assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Building Web Database Applications with Visual Studio 6";
+ assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Web Application Development With PHP";
+ assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
+ assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
+ assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "How to Find a Scholarship Online";
+ assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ }
+
+ private static int toF1ScorePercent(OverlapResult overlap) {
+ final double value = 2.0 * overlap.getIntersection() / (overlap.getSizeA() + overlap.getSizeB());
+ // Convert to percentage
+ return (int) Math.round(value * 100);
+ }
+
+ @Test
+ public void testConstructorWithNullConverterThrows() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new OverlapSimilarity<>(null);
+ });
+ }
+
+ @Test
+ public void testApplyNullNull() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
+ });
+ }
+
+ @Test
+ public void testApplyStringNull() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
+ });
+ }
+
+ @Test
+ public void testApplyNullString() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
+ });
+ }
+}
From 7a4aee810bf8c4d067abdb7a726fc990a9c28714 Mon Sep 17 00:00:00 2001
From: Amey Jadiye
Date: Sun, 10 Mar 2019 13:28:26 +0530
Subject: [PATCH 11/16] using OverlapSimilarity for scoring Sorensendice
similarity
---
.../similarity/SorensenDiceSimilarity.java | 48 +++++++++++++++++--
1 file changed, 43 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
index dc59d0fa74..0195e9be4c 100644
--- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
@@ -16,9 +16,12 @@
*/
package org.apache.commons.text.similarity;
+import java.util.ArrayList;
+import java.util.Collection;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
-import java.util.stream.Collectors;
+import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
@@ -49,6 +52,22 @@
*/
public class SorensenDiceSimilarity implements SimilarityScore {
+ /**
+ * For shifting bigrams to fit in single integer.
+ */
+ public static final int SHIFT_NUMBER = 16;
+
+ /**
+ * Converter function for conversion of string to bigrams.
+ */
+ final Function> converter = new SorensenDiceConverter();
+
+ /**
+ * Measures the overlap of two sets created from a pair of character sequences.
+ * {@link OverlapSimilarity}}
+ */
+ final OverlapSimilarity similarity = new OverlapSimilarity<>(this.converter);
+
/**
* Calculates Sorensen-Dice Similarity of two character sequences passed as
* input.
@@ -98,11 +117,10 @@ public Double apply(final CharSequence left, final CharSequence right) {
return 0d;
}
- Set nLeft = createBigrams(left);
- Set nRight = createBigrams(right);
+ OverlapResult overlap = similarity.apply(left, right);
- final int total = nLeft.size() + nRight.size();
- final long intersection = nLeft.stream().filter(nRight::contains).collect(Collectors.counting());
+ final int total = overlap.getSizeA() + overlap.getSizeB();
+ final long intersection = overlap.getIntersection();
return (2.0d * intersection) / total;
}
@@ -124,4 +142,24 @@ protected Set createBigrams(CharSequence charSequence) {
}
return set;
}
+
+ /**
+ * Converter class for creating Bigrams for SorensenDice similarity.
+ */
+ class SorensenDiceConverter implements Function> {
+ @Override
+ public Collection apply(CharSequence cs) {
+ final int length = cs.length();
+ final List list = new ArrayList<>(length);
+ if (length > 1) {
+ char ch2 = cs.charAt(0);
+ for (int i = 1; i < length; i++) {
+ final char ch1 = ch2;
+ ch2 = cs.charAt(i);
+ list.add(Integer.valueOf((ch1 << SHIFT_NUMBER) | ch2));
+ }
+ }
+ return list;
+ }
+ }
}
From 211077b2e36f316c5e8c123a036dfd6543a778aa Mon Sep 17 00:00:00 2001
From: Amey Jadiye
Date: Sun, 10 Mar 2019 13:29:37 +0530
Subject: [PATCH 12/16] rounded resulting scores for tests
---
.../SorensenDiceSimilarityTest.java | 21 ++++++++++++-------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java
index 2e144e7628..3d59823ea0 100644
--- a/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java
@@ -55,14 +55,14 @@ public void testGetSorensenDicesSimilarity_StringString() {
assertEquals(0.0d, similarity.apply("hippo", "elephant"));
assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz"));
assertEquals(0.5d, similarity.apply("hello", "hallo"));
- assertEquals(0.7d, similarity.apply("ABC Corporation", "ABC Corp"));
- assertEquals(0.7391304347826086d, similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."));
- assertEquals(0.8076923076923077d,
- similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"));
- assertEquals(0.6956521739130435, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA"));
- assertEquals(0.9230769230769231, similarity.apply("/opt/software1", "/opt/software2"));
- assertEquals(0.5d, similarity.apply("aaabcd", "aaacdb"));
- assertEquals(0.631578947368421, similarity.apply("John Horn", "John Hopkins"));
+ assertEquals(0.7d, round(similarity.apply("ABC Corporation", "ABC Corp"), 1));
+ assertEquals(0.7d, round(similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 1));
+ assertEquals(0.8d,
+ round(similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 1));
+ assertEquals(0.7d, round(similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 1));
+ assertEquals(0.9d, round(similarity.apply("/opt/software1", "/opt/software2"), 1));
+ assertEquals(0.6d, round(similarity.apply("aaabcd", "aaacdb"), 1));
+ assertEquals(0.6d, round(similarity.apply("John Horn", "John Hopkins"), 1));
}
@@ -86,4 +86,9 @@ public void testGetSorensenDicesSimilarity_NullString() {
similarity.apply(null, "clear");
});
}
+
+ public static double round(double value, int precision) {
+ int scale = (int) Math.pow(10, precision);
+ return (double) Math.round(value * scale) / scale;
+ }
}
From 29fd2daee55dfd92467e64796c4e00f9a1599106 Mon Sep 17 00:00:00 2001
From: Amey Jadiye
Date: Sun, 10 Mar 2019 13:35:58 +0530
Subject: [PATCH 13/16] fixed SpotBugs and Checkstyle errors
---
.../commons/text/similarity/SorensenDiceSimilarity.java | 2 +-
.../commons/text/similarity/OverlapSimilarityTest.java | 8 ++++----
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
index 0195e9be4c..ead01779d1 100644
--- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
@@ -146,7 +146,7 @@ protected Set createBigrams(CharSequence charSequence) {
/**
* Converter class for creating Bigrams for SorensenDice similarity.
*/
- class SorensenDiceConverter implements Function> {
+ static class SorensenDiceConverter implements Function> {
@Override
public Collection apply(CharSequence cs) {
final int length = cs.length();
diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
index 0c93ba4324..ebad367e91 100644
--- a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
@@ -84,7 +84,7 @@ public void testOverlapUsingListCharacter() {
// size A or B = sequence length
// intersection = count of matching characters (include duplicates)
// union = count of matching characters (include duplicates) plus unmatched
- // = size A + size B - intersection
+ // = size A + size B - intersection
assertOverlap(similarity, "", "", 0, 0, 0, 0);
assertOverlap(similarity, "a", "", 1, 0, 0, 1);
assertOverlap(similarity, "a", "a", 1, 1, 1, 1);
@@ -161,7 +161,7 @@ public void testOverlapUsingListBigrams() {
// size A or B = sequence length - 1
// intersection = count of matching bigrams (include duplicates)
// union = count of matching bigrams (include duplicates)
- // = size A + size B - intersection
+ // = size A + size B - intersection
assertOverlap(similarity, "", "", 0, 0, 0, 0);
assertOverlap(similarity, "a", "", 0, 0, 0, 0);
assertOverlap(similarity, "a", "a", 0, 0, 0, 0);
@@ -176,7 +176,7 @@ public void testOverlapUsingListBigrams() {
assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16);
}
- private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2,
+ private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2,
int sizeA, int sizeB, int intersection, int union) {
OverlapResult overlap = similarity.apply(cs1, cs2);
assertEquals(sizeA, overlap.getSizeA(), "Size A error");
@@ -189,7 +189,7 @@ private static void assertOverlap(OverlapSimilarity similarity, CharSeque
public void testF1ScoreUsingListWordBigrams() {
// Example of a word letter pairs algorithm by Simon White:
// http://www.catalysoft.com/articles/StrikeAMatch.html
- // This splits into words using whitespace and then computes uppercase
+ // This splits into words using whitespace and then computes uppercase
// bigrams for each word.
// Split on whitespace
From bcff9748d24727da8758f06df3beeab28e446264 Mon Sep 17 00:00:00 2001
From: Amey Jadiye
Date: Mon, 18 Mar 2019 21:22:48 +0530
Subject: [PATCH 14/16] remove OverlapResult and OverlapSimilarity and their tests (no need)
---
.../text/similarity/OverlapResult.java | 129 --------
.../text/similarity/OverlapSimilarity.java | 196 ------------
.../text/similarity/OverlapResultTest.java | 179 -----------
.../similarity/OverlapSimilarityTest.java | 288 ------------------
4 files changed, 792 deletions(-)
delete mode 100644 src/main/java/org/apache/commons/text/similarity/OverlapResult.java
delete mode 100644 src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java
delete mode 100644 src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java
delete mode 100644 src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
diff --git a/src/main/java/org/apache/commons/text/similarity/OverlapResult.java b/src/main/java/org/apache/commons/text/similarity/OverlapResult.java
deleted file mode 100644
index 021fab9c5a..0000000000
--- a/src/main/java/org/apache/commons/text/similarity/OverlapResult.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity;
-
-import java.util.Objects;
-
-/**
- * Container class to store the overlap results between two sets.
- *
- * Stores the size of set A ({@code |A|}), set B ({@code |A|}) and the
- * intersection of A and B (|A ∩ B|). The result can be used
- * to produce the union of A and B (|A ∪ B|).
- *
- * This class is immutable.
- *
- * @since 1.7
- * @see Intersection
- * @see Union
- */
-public class OverlapResult {
- /**
- * The size of set A.
- */
- private final int sizeA;
- /**
- * The size of set B.
- */
- private final int sizeB;
- /**
- * The size of the intersection between set A and B.
- */
- private final int intersection;
-
- /**
- * Create the results for an intersection between two sets.
- *
- * @param sizeA the size of set A ({@code |A|})
- * @param sizeB the size of set B ({@code |B|})
- * @param intersection the size of the intersection of A and B (|A ∩ B|)
- * @throws IllegalArgumentException if the sizes are negative or the intersection is greater
- * than the minimum of the two set sizes
- */
- public OverlapResult(final int sizeA, final int sizeB, final int intersection) {
- if (sizeA < 0) {
- throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA);
- }
- if (sizeB < 0) {
- throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB);
- }
- if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) {
- throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection);
- }
- this.sizeA = sizeA;
- this.sizeB = sizeB;
- this.intersection = intersection;
- }
-
- /**
- * Get the size of set A.
- *
- * @return |A|
- */
- public int getSizeA() {
- return sizeA;
- }
-
- /**
- * Get the size of set B.
- *
- * @return |B|
- */
- public int getSizeB() {
- return sizeB;
- }
-
- /**
- * Get the size of the intersection between set A and B.
- *
- * @return |A ∩ B|
- */
- public int getIntersection() {
- return intersection;
- }
- /**
- * Get the size of the union between set A and B.
- *
- * @return |A ∪ B|
- */
- public long getUnion() {
- return (long) sizeA + sizeB - intersection;
- }
-
- @Override
- public boolean equals(final Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
- final OverlapResult result = (OverlapResult) o;
- return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection;
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(sizeA, sizeB, intersection);
- }
-
- @Override
- public String toString() {
- return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection;
- }
-}
diff --git a/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java b/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java
deleted file mode 100644
index df71654a96..0000000000
--- a/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.function.Function;
-
-/**
- * Measures the overlap of two sets created from a pair of character sequences.
- *
- * It is assumed that the type {@code T} correctly conforms to the requirements for storage
- * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
- * {@link Object#equals(Object)} and {@link Object#hashCode()}.
- *
- * @param the type of the elements extracted from the character sequence
- * @since 1.7
- * @see Set
- * @see HashMap
- */
-public class OverlapSimilarity implements SimilarityScore {
- /** The converter used to create the elements from the characters. */
- private final Function> converter;
-
- // The following is adapted from commons-collections for a Bag.
- // A Bag is a collection that can store the count of the number
- // of copies of each element.
-
- /**
- * Mutable counter class for storing the count of elements.
- */
- private static class BagCount {
- /** The count. This is initialised to 1 upon construction. */
- int count = 1;
- }
-
- /**
- * A minimal implementation of a Bag that can store elements and a count.
- *
- * For the intended purpose the Bag does not have to be a {@link Collection}. It does not
- * even have to know its own size.
- */
- private class TinyBag {
- /** The backing map. */
- private final Map map;
-
- /**
- * Create a new tiny bag.
- *
- * @param initialCapacity the initial capacity
- */
- TinyBag(int initialCapacity) {
- map = new HashMap<>(initialCapacity);
- }
-
- /**
- * Adds a new element to the bag, incrementing its count in the underlying map.
- *
- * @param object the object to add
- */
- void add(T object) {
- final BagCount mut = map.get(object);
- if (mut == null) {
- map.put(object, new BagCount());
- } else {
- mut.count++;
- }
- }
-
- /**
- * Returns the number of occurrence of the given element in this bag by
- * looking up its count in the underlying map.
- *
- * @param object the object to search for
- * @return the number of occurrences of the object, zero if not found
- */
- int getCount(final Object object) {
- final BagCount count = map.get(object);
- if (count != null) {
- return count.count;
- }
- return 0;
- }
-
- /**
- * Returns a Set view of the mappings contained in this bag.
- *
- * @return the Set view
- */
- Set> entrySet() {
- return map.entrySet();
- }
- }
-
- /**
- * Create a new overlap similarity using the provided converter.
- *
- * If the converter returns a {@link Set} then the intersection result will
- * not include duplicates. Any other {@link Collection} is used to produce a result
- * that will include duplicates in the intersect and union.
- *
- * @param converter the converter used to create the elements from the characters.
- * @throws IllegalArgumentException if the converter is null
- */
- public OverlapSimilarity(Function> converter) {
- if (converter == null) {
- throw new IllegalArgumentException("Converter must not be null");
- }
- this.converter = converter;
- }
-
- /**
- * Calculates the intersection of two character sequences passed as input.
- *
- * @param left first character sequence
- * @param right second character sequence
- * @return the intersection result
- * @throws IllegalArgumentException if either input sequence is {@code null}
- */
- @Override
- public OverlapResult apply(final CharSequence left, final CharSequence right) {
- if (left == null || right == null) {
- throw new IllegalArgumentException("Input cannot be null");
- }
-
- // Create the elements from the sequences
- final Collection objectsA = converter.apply(left);
- final Collection objectsB = converter.apply(right);
- final int sizeA = objectsA.size();
- final int sizeB = objectsB.size();
-
- // Short-cut if either collection is empty
- if (Math.min(sizeA, sizeB) == 0) {
- // No intersection
- return new OverlapResult(sizeA, sizeB, 0);
- }
-
- // Intersection = count the number of shared elements
- int intersection = 0;
- if (objectsA instanceof Set) {
- // If a Set then the elements will only have a count of 1.
- // Note: Even if objectsB is a plain collection this will work
- // since the contains(Object) method will return true when present.
- // The fact that objectsA is a Set ensures non duplicate counting.
- for (T element : objectsA) {
- if (objectsB.contains(element)) {
- intersection++;
- }
- }
- } else {
- // Create a bag for each collection
- final TinyBag bagA = toBag(objectsA);
- final TinyBag bagB = toBag(objectsB);
- // Find the intersection of each element in bag A with bag B
- for (Entry entry : bagA.entrySet()) {
- final T element = entry.getKey();
- final int count = entry.getValue().count;
- // The intersection of this entry in both bags is the minimum count
- intersection += Math.min(count, bagB.getCount(element));
- }
- }
-
- return new OverlapResult(sizeA, sizeB, intersection);
- }
-
- /**
- * Convert the collection to a bag. The bag will contain the count of each element in the collection.
- *
- * @param objects the objects
- * @return the bag
- */
- private TinyBag toBag(Collection objects) {
- final TinyBag bag = new TinyBag(objects.size());
- for (T t : objects) {
- bag.add(t);
- }
- return bag;
- }
-}
diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java
deleted file mode 100644
index 8de8aa099b..0000000000
--- a/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity;
-
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-
-import java.util.HashMap;
-import java.util.concurrent.ThreadLocalRandom;
-
-/**
- * Unit tests for {@link OverlapResult}.
- */
-public class OverlapResultTest {
- @Test
- public void testNewOverlapResult_WithZeros() {
- final int sizeA = 0;
- final int sizeB = 0;
- final int intersection = 0;
- new OverlapResult(sizeA, sizeB, intersection);
- }
-
- @Test
- public void testNewOverlapResult_WithNegativeSizeA() {
- final int sizeA = -1;
- final int sizeB = 0;
- final int intersection = 0;
- Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new OverlapResult(sizeA, sizeB, intersection);
- });
- }
-
- @Test
- public void testNewOverlapResult_WithNegativeSizeB() {
- final int sizeA = 0;
- final int sizeB = -1;
- final int intersection = 0;
- Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new OverlapResult(sizeA, sizeB, intersection);
- });
- }
-
- @Test
- public void testNewOverlapResult_WithNegativeIntersection() {
- final int sizeA = 0;
- final int sizeB = 0;
- final int intersection = -1;
- Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new OverlapResult(sizeA, sizeB, intersection);
- });
- }
-
- @Test
- public void testNewOverlapResult_WithIntersectionAboveSizeAorB() {
- final int sizeA = 1;
- final int sizeB = 2;
- final int intersection = Math.max(sizeA, sizeB) + 1;
- Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new OverlapResult(sizeA, sizeB, intersection);
- });
- Assertions.assertThrows(IllegalArgumentException.class, () -> {
- new OverlapResult(sizeB, sizeA, intersection);
- });
- }
-
- @Test
- public void testUnion() {
- // Union is the combined size minus the intersection
- Assertions.assertEquals(0, getUnion(0, 0, 0));
-
- Assertions.assertEquals(1, getUnion(1, 0, 0));
- Assertions.assertEquals(2, getUnion(1, 1, 0));
- Assertions.assertEquals(1, getUnion(1, 1, 1));
-
- Assertions.assertEquals(2, getUnion(2, 0, 0));
- Assertions.assertEquals(3, getUnion(2, 1, 0));
- Assertions.assertEquals(2, getUnion(2, 1, 1));
- Assertions.assertEquals(2, getUnion(2, 2, 2));
-
- // Test overflow of int addition
- Assertions.assertEquals((long) Integer.MAX_VALUE + 1, getUnion(Integer.MAX_VALUE, 1, 0));
- Assertions.assertEquals((long) Integer.MAX_VALUE + 1,
- getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE - 1));
- Assertions.assertEquals((long) Integer.MAX_VALUE + Integer.MAX_VALUE,
- getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, 0));
- }
-
- private static long getUnion(int sizeA, int sizeB, int intersection) {
- return new OverlapResult(sizeA, sizeB, intersection).getUnion();
- }
-
- @Test
- public void testProperties() {
- final ThreadLocalRandom rand = ThreadLocalRandom.current();
- final int max = 1024;
- for (int i = 0; i < 5; i++) {
- // Ensure the min is above 0
- final int sizeA = rand.nextInt(max) + 1;
- final int sizeB = rand.nextInt(max) + 1;
- final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
- final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection);
- Assertions.assertEquals(sizeA, result.getSizeA());
- Assertions.assertEquals(sizeB, result.getSizeB());
- Assertions.assertEquals(intersection, result.getIntersection());
- }
- }
-
- @Test
- public void testEquals() {
- final OverlapResult[] results = new OverlapResult[] {
- new OverlapResult(0, 0, 0),
- new OverlapResult(10, 0, 0),
- new OverlapResult(10, 10, 0),
- new OverlapResult(10, 10, 10),
- };
-
- // Test difference instance with same values
- Assertions.assertTrue(results[0].equals(new OverlapResult(0, 0, 0)));
-
- final Object something = new Object();
- for (int i = 0; i < results.length; i++) {
- Assertions.assertFalse(results[i].equals(something));
- Assertions.assertFalse(results[i].equals(null));
- for (int j = 0; j < results.length; j++) {
- Assertions.assertTrue(results[i].equals(results[j]) == (i == j));
- }
- }
- }
-
- @Test
- public void testHashCode() {
- final OverlapResult[] results = new OverlapResult[] {
- new OverlapResult(10, 0, 0),
- new OverlapResult(10, 10, 0),
- new OverlapResult(10, 10, 10),
- };
- final HashMap map = new HashMap<>();
- final int offset = 123;
- for (int i = 0; i < results.length; i++) {
- map.put(results[i], i + offset);
- }
- for (int i = 0; i < results.length; i++) {
- Assertions.assertEquals(i + offset, map.get(results[i]));
- }
- }
-
- @Test
- public void testToString() {
- final ThreadLocalRandom rand = ThreadLocalRandom.current();
- final int max = 9;
- for (int i = 0; i < 5; i++) {
- // Ensure the min is above 0
- final int sizeA = rand.nextInt(max) + 1;
- final int sizeB = rand.nextInt(max) + 1;
- final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
- final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection);
- final String string = result.toString();
- // Not perfect as this will match substrings too. The chance of error
- // is limited by restricting the numbers to a max of 10.
- Assertions.assertTrue(string.contains(String.valueOf(sizeA)));
- Assertions.assertTrue(string.contains(String.valueOf(sizeB)));
- Assertions.assertTrue(string.contains(String.valueOf(intersection)));
- }
- }
-}
diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
deleted file mode 100644
index ebad367e91..0000000000
--- a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java
+++ /dev/null
@@ -1,288 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.commons.text.similarity;
-
-import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-import org.junit.jupiter.api.Test;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.function.Function;
-import java.util.regex.Pattern;
-
-/**
- * Unit tests for {@link OverlapSimilarity}.
- */
-public class OverlapSimilarityTest {
- @Test
- public void testOverlapUsingSetCharacter() {
- // Compute using single characters.
- // This test uses a set and so should not allow duplicates.
- final Function> converter = cs -> {
- final int length = cs.length();
- final Set set = new HashSet<>(length);
- for (int i = 0; i < length; i++) {
- set.add(cs.charAt(i));
- }
- return set;
- };
- final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
-
- // Expected:
- // size A or B = count of unique characters (exclude duplicates)
- // intersection = count of unique matching characters (exclude duplicates)
- // union = count of unique characters in total (exclude duplicates)
- assertOverlap(similarity, "", "", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "", 1, 0, 0, 1);
- assertOverlap(similarity, "a", "a", 1, 1, 1, 1);
- assertOverlap(similarity, "a", "b", 1, 1, 0, 2);
- assertOverlap(similarity, "aa", "ab", 1, 2, 1, 2);
- assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2);
- assertOverlap(similarity, "aaba", "abaa", 2, 2, 2, 2);
- assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1);
- assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1);
- assertOverlap(similarity, "aabab", "ababa", 2, 2, 2, 2);
- assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7);
- assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15);
- }
-
- @Test
- public void testOverlapUsingListCharacter() {
- // Compute using single characters.
- // This test uses a list and so duplicates should be matched.
- final Function> converter = cs -> {
- final int length = cs.length();
- final List list = new ArrayList<>(length);
- for (int i = 0; i < length; i++) {
- list.add(cs.charAt(i));
- }
- return list;
- };
- final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
-
- // Expected:
- // size A or B = sequence length
- // intersection = count of matching characters (include duplicates)
- // union = count of matching characters (include duplicates) plus unmatched
- // = size A + size B - intersection
- assertOverlap(similarity, "", "", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "", 1, 0, 0, 1);
- assertOverlap(similarity, "a", "a", 1, 1, 1, 1);
- assertOverlap(similarity, "a", "b", 1, 1, 0, 2);
- assertOverlap(similarity, "aa", "ab", 2, 2, 1, 3);
- assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2);
- assertOverlap(similarity, "aaba", "abaa", 4, 4, 4, 4);
- assertOverlap(similarity, "aaaa", "aa", 4, 2, 2, 4);
- assertOverlap(similarity, "aaaa", "aaa", 4, 3, 3, 4);
- assertOverlap(similarity, "aabab", "ababa", 5, 5, 5, 5);
- assertOverlap(similarity, "the same", "the same", 8, 8, 8, 8);
- assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15);
- }
-
- @Test
- public void testOverlapUsingSetBigrams() {
- // Compute using pairs of characters (bigrams).
- // This can be done using a 32-bit int to store two 16-bit characters.
- // This test uses a set and so should not allow duplicates.
- final Function> converter = cs -> {
- final int length = cs.length();
- final Set set = new HashSet<>(length);
- if (length > 1) {
- char ch2 = cs.charAt(0);
- for (int i = 1; i < length; i++) {
- final char ch1 = ch2;
- ch2 = cs.charAt(i);
- set.add(Integer.valueOf((ch1 << 16) | ch2));
- }
- }
- return set;
- };
- final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
-
- // Expected:
- // size A or B = count of unique bigrams (exclude duplicates)
- // intersection = count of unique matching bigrams (exclude duplicates)
- // union = count of unique bigrams in total (exclude duplicates)
- assertOverlap(similarity, "", "", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "a", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "b", 0, 0, 0, 0);
- assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2);
- assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1);
- assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3);
- assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1);
- assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1);
- assertOverlap(similarity, "aabab", "ababa", 3, 2, 2, 3);
- assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7);
- assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16);
- }
-
- @Test
- public void testOverlapUsingListBigrams() {
- // Compute using pairs of characters (bigrams).
- // This can be done using a 32-bit int to store two 16-bit characters.
- // This test uses a list and so duplicates should be matched.
- final Function> converter = cs -> {
- final int length = cs.length();
- final List list = new ArrayList<>(length);
- if (length > 1) {
- char ch2 = cs.charAt(0);
- for (int i = 1; i < length; i++) {
- final char ch1 = ch2;
- ch2 = cs.charAt(i);
- list.add(Integer.valueOf((ch1 << 16) | ch2));
- }
- }
- return list;
- };
- final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
-
- // Expected:
- // size A or B = sequence length - 1
- // intersection = count of matching bigrams (include duplicates)
- // union = count of matching bigrams (include duplicates)
- // = size A + size B - intersection
- assertOverlap(similarity, "", "", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "a", 0, 0, 0, 0);
- assertOverlap(similarity, "a", "b", 0, 0, 0, 0);
- assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2);
- assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1);
- assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3);
- assertOverlap(similarity, "aaaa", "aa", 3, 1, 1, 3);
- assertOverlap(similarity, "aaaa", "aaa", 3, 2, 2, 3);
- assertOverlap(similarity, "aabab", "ababa", 4, 4, 3, 5);
- assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7);
- assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16);
- }
-
- private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2,
- int sizeA, int sizeB, int intersection, int union) {
- OverlapResult overlap = similarity.apply(cs1, cs2);
- assertEquals(sizeA, overlap.getSizeA(), "Size A error");
- assertEquals(sizeB, overlap.getSizeB(), "Size B error");
- assertEquals(intersection, overlap.getIntersection(), "Intersection error");
- assertEquals(union, overlap.getUnion(), "Union error");
- }
-
- @Test
- public void testF1ScoreUsingListWordBigrams() {
- // Example of a word letter pairs algorithm by Simon White:
- // http://www.catalysoft.com/articles/StrikeAMatch.html
- // This splits into words using whitespace and then computes uppercase
- // bigrams for each word.
-
- // Split on whitespace
- final Pattern pattern = Pattern.compile("\\s+");
-
- // Compute using pairs of characters (bigrams) for each word.
- // This can be done using a 32-bit int to store two 16-bit characters
- final Function> converter = cs -> {
- final List set = new ArrayList<>();
- for (String word : pattern.split(cs)) {
- if (word.length() > 1) {
- // The strings are converted to upper case
- char ch2 = Character.toUpperCase(word.charAt(0));
- for (int i = 1; i < word.length(); i++) {
- final char ch1 = ch2;
- ch2 = Character.toUpperCase(word.charAt(i));
- set.add(Integer.valueOf((ch1 << 16) | ch2));
- }
- }
- }
- return set;
- };
- final OverlapSimilarity similarity = new OverlapSimilarity<>(converter);
-
- String bookTitle;
- final String search1 = "Web Database Applications";
- final String search2 = "PHP Web Applications";
- final String search3 = "Web Aplications";
- bookTitle = "Web Database Applications with PHP & MySQL";
- assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- bookTitle = "Creating Database Web Applications with PHP and ASP";
- assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- bookTitle = "Building Database Applications on the Web Using PHP3";
- assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- bookTitle = "Building Web Database Applications with Visual Studio 6";
- assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- bookTitle = "Web Application Development With PHP";
- assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
- assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
- assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- bookTitle = "How to Find a Scholarship Online";
- assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
- assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
- assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
- }
-
- private static int toF1ScorePercent(OverlapResult overlap) {
- final double value = 2.0 * overlap.getIntersection() / (overlap.getSizeA() + overlap.getSizeB());
- // Convert to percentage
- return (int) Math.round(value * 100);
- }
-
- @Test
- public void testConstructorWithNullConverterThrows() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new OverlapSimilarity<>(null);
- });
- }
-
- @Test
- public void testApplyNullNull() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
- });
- }
-
- @Test
- public void testApplyStringNull() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
- });
- }
-
- @Test
- public void testApplyNullString() {
- assertThatIllegalArgumentException().isThrownBy(() -> {
- new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
- });
- }
-}
From 2de106eafaacbcd94afb18c084ce18888c3a1818 Mon Sep 17 00:00:00 2001
From: Amey Jadiye
Date: Mon, 18 Mar 2019 21:29:27 +0530
Subject: [PATCH 15/16] using new IntersectionSimilarity
---
.../commons/text/similarity/SorensenDiceSimilarity.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
index ead01779d1..d46372d225 100644
--- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
@@ -66,7 +66,7 @@ public class SorensenDiceSimilarity implements SimilarityScore {
* Measures the overlap of two sets created from a pair of character sequences.
* {@link OverlapSimilarity}}
*/
- final OverlapSimilarity similarity = new OverlapSimilarity<>(this.converter);
+ final IntersectionSimilarity similarity = new IntersectionSimilarity<>(this.converter);
/**
* Calculates Sorensen-Dice Similarity of two character sequences passed as
@@ -117,7 +117,7 @@ public Double apply(final CharSequence left, final CharSequence right) {
return 0d;
}
- OverlapResult overlap = similarity.apply(left, right);
+ IntersectionResult overlap = similarity.apply(left, right);
final int total = overlap.getSizeA() + overlap.getSizeB();
final long intersection = overlap.getIntersection();
From 8f0a97cfa6f61a1aaa0f55ffdf4ff2ca48ec9a04 Mon Sep 17 00:00:00 2001
From: Amey Jadiye
Date: Sun, 24 Mar 2019 18:00:33 +0530
Subject: [PATCH 16/16] corrected javadoc, removed unused code and made
instance var private.
---
.../similarity/SorensenDiceSimilarity.java | 35 +++++--------------
1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
index d46372d225..3121bdc2f1 100644
--- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java
@@ -18,9 +18,7 @@
import java.util.ArrayList;
import java.util.Collection;
-import java.util.HashSet;
import java.util.List;
-import java.util.Set;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
@@ -30,17 +28,17 @@
* between two character sequences.
*
*
- * The Sørensen–Dice coefficient is a statistic used for comparing the
+ * The Sørensen-Dice coefficient is a statistic used for comparing the
* similarity of two samples. It was independently developed by the botanists
* Thorvald Sørensen and Lee Raymond Dice, who published in 1948 and 1945
* respectively. The index is known by several other names, especially
- * Sørensen–Dice index, Sørensen index and Dice's coefficient. Other
+ * Sørensen-Dice index, Sørensen index and Dice's coefficient. Other
* variations include the "similarity coefficient" or "index", such as Dice
* similarity coefficient (DSC).
*
*
*
- * This implementation is based on the Sørensen–Dice similarity algorithm
+ * This implementation is based on the Sørensen-Dice similarity algorithm
* from
* http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient.
@@ -55,18 +53,18 @@ public class SorensenDiceSimilarity implements SimilarityScore {
/**
* For shifting bigrams to fit in single integer.
*/
- public static final int SHIFT_NUMBER = 16;
+ private static final int SHIFT_NUMBER = 16;
/**
* Converter function for conversion of string to bigrams.
*/
- final Function> converter = new SorensenDiceConverter();
+ private final Function> converter = new SorensenDiceConverter();
/**
* Measures the overlap of two sets created from a pair of character sequences.
- * {@link OverlapSimilarity}}
+ * {@link IntersectionSimilarity}
*/
- final IntersectionSimilarity similarity = new IntersectionSimilarity<>(this.converter);
+ private final IntersectionSimilarity similarity = new IntersectionSimilarity<>(this.converter);
/**
* Calculates Sorensen-Dice Similarity of two character sequences passed as
@@ -113,6 +111,7 @@ public Double apply(final CharSequence left, final CharSequence right) {
return 1d;
}
+ // A sequence shorter than two characters yields no bigrams, so there is nothing to intersect: similarity is 0.
if (left.length() < 2 || right.length() < 2) {
return 0d;
}
@@ -125,28 +124,10 @@ public Double apply(final CharSequence left, final CharSequence right) {
return (2.0d * intersection) / total;
}
- /**
- * Method for creating bigrams - two consecutive characters. Returns a set of
- * bigrams.
- *
- * @param charSequence The char sequence for which we need set of bigrams.
- * @return set of bigrams.
- */
- protected Set createBigrams(CharSequence charSequence) {
- Set set = new HashSet();
- for (int i = 0; i < charSequence.length() - 1; i++) {
- char chr = charSequence.charAt(i);
- char nextChr = charSequence.charAt(i + 1);
- String bi = "" + chr + nextChr;
- set.add(bi);
- }
- return set;
- }
-
/**
* Converter class for creating Bigrams for SorensenDice similarity.
*/
- static class SorensenDiceConverter implements Function> {
+ private static class SorensenDiceConverter implements Function> {
@Override
public Collection apply(CharSequence cs) {
final int length = cs.length();