From 7aae02b96ca7cf5f6801f81c214c4d437a1e072c Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Tue, 26 Feb 2019 23:46:06 +0530 Subject: [PATCH 01/16] Adding Sorensen-Dice similarity algoritham --- .../similarity/SorensenDicesSimilarity.java | 74 +++++++++++++++++ .../SorensenDicesSimilarityTest.java | 80 +++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java create mode 100644 src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java new file mode 100644 index 0000000000..1c7710815d --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +import java.util.HashSet; +import java.util.Set; + +/** + * + * @since 1.7 + */ +public class SorensenDicesSimilarity implements SimilarityScore { + + /** + * @param left the first CharSequence, must not be null + * @param right the second CharSequence, must not be null + * @return result similarity + * @throws IllegalArgumentException if either CharSequence input is {@code null} + */ + + @Override + public Double apply(final CharSequence left, final CharSequence right) { + + if (left == null || right == null) { + throw new IllegalArgumentException("CharSequences must not be null"); + } + + if (left.equals(right)) { + return 1d; + } + + if ("".equals(left) || "".equals(right)) { + return 0d; + } + + Set nLeft = new HashSet(); + Set nRight = new HashSet(); + + for (int i = 0; i < left.length() - 1; i++) { + char chr = left.charAt(i); + char nextChr = left.charAt(i + 1); + String bi = "" + chr + nextChr; + nLeft.add(bi); + } + for (int j = 0; j < right.length() - 1; j++) { + char chr = right.charAt(j); + char nextChr = right.charAt(j + 1); + String bi = "" + chr + nextChr; + nRight.add(bi); + } + + final int total = nLeft.size() + nRight.size(); + final Set union = new HashSet(total); + union.addAll(nLeft); + union.addAll(nRight); + + final int intersection = total - union.size(); + return (2.0d * intersection) / total; + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java new file mode 100644 index 0000000000..1298bf4337 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link SorensenDicesSimilarity}. + */ +public class SorensenDicesSimilarityTest { + + private static SorensenDicesSimilarity similarity; + + @BeforeAll + public static void setUp() { + similarity = new SorensenDicesSimilarity(); + } + + @Test + public void testGetJaroWinklerSimilarity_StringString() { + + assertEquals(1d, similarity.apply("", "")); + assertEquals(1.0d, similarity.apply("foo", "foo")); + assertEquals(0.8d, similarity.apply("foo", "foo ")); + assertEquals(0.4d, similarity.apply("frog", "fog")); + assertEquals(0.0d, similarity.apply("fly", "ant")); + assertEquals(0.0d, similarity.apply("elephant", "hippo")); + assertEquals(0.0d, similarity.apply("hippo", "elephant")); + assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz")); + assertEquals(0.5d, similarity.apply("hello", "hallo")); + assertEquals(0.7d, similarity.apply("ABC Corporation", "ABC Corp")); + assertEquals(0.7391304347826086d, similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.")); + assertEquals(0.8076923076923077d, + similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness")); + assertEquals(0.6956521739130435, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA")); + assertEquals(0.9230769230769231, similarity.apply("/opt/software1", "/opt/software2")); + assertEquals(0.5d, similarity.apply("aaabcd", "aaacdb")); + assertEquals(0.631578947368421, similarity.apply("John Horn", "John Hopkins")); + + } + + @Test + public void testGetDicesCoefficientSimilarity_NullNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + similarity.apply(null, null); + }); + } + + @Test + public void testGetDicesCoefficientSimilarity_StringNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + similarity.apply(" ", null); + }); + } + + @Test + public void testGetDicesCoefficientSimilarity_NullString() { + assertThatIllegalArgumentException().isThrownBy(() -> { + similarity.apply(null, "clear"); + }); + } +} From 325658bb259d4ba49b56fa95f519623bf20af5cf Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Wed, 27 Feb 2019 23:16:07 +0530 Subject: [PATCH 02/16] added documentation and optimised code --- .../similarity/SorensenDicesSimilarity.java | 84 ++++++++++++++----- .../SorensenDicesSimilarityTest.java | 13 ++- 2 files changed, 73 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java index 1c7710815d..cc936fdb8c 100644 --- a/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java @@ -20,18 +20,60 @@ import java.util.Set; /** + * A similarity algorithm indicating the percentage of matched characters + * between two character sequences. + * + *

+ * The Sørensen–Dice coefficient is a statistic used for comparing the + * similarity of two samples. It was independently developed by the botanists + * Thorvald Sørensen and Lee Raymond Dice, who published in 1948 and 1945 + * respectively. The index is known by several other names, especially + * Sørensen–Dice index, Sørensen index and Dice's coefficient. Other variations + * include the "similarity coefficient" or "index", such as Dice similarity + * coefficient (DSC). + *

+ * + *

+ * This implementation is based on the Sørensen–Dice similarity algorithm from + * + * http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. + * + * + *

* * @since 1.7 */ public class SorensenDicesSimilarity implements SimilarityScore { /** + * + *
+     * similarity.apply("", "")                     = 1.0
+     * similarity.apply("foo", "foo")               = 1.0
+     * similarity.apply("foo", "foo ")              = 0.8
+     * similarity.apply("foo", "foo  ")             = 0.66
+     * similarity.apply("foo", " foo ")             = 0.66
+     * similarity.apply("foo", "  foo")             = 0.66
+     * similarity.apply("", "a")                    = 0.0
+     * similarity.apply("aaapppp", "")              = 0.0
+     * similarity.apply("frog", "fog")              = 0.4
+     * similarity.apply("fly", "ant")               = 0.0
+     * similarity.apply("elephant", "hippo")        = 0.0
+     * similarity.apply("hippo", "elephant")        = 0.0
+     * similarity.apply("hippo", "zzzzzzzz")        = 0.0
+     * similarity.apply("hello", "hallo")           = 0.5
+     * similarity.apply("ABC Corporation", "ABC Corp") = 0.7
+     * similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.74
+     * similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.81
+     * similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.69
+     * 
+ * * @param left the first CharSequence, must not be null * @param right the second CharSequence, must not be null * @return result similarity * @throws IllegalArgumentException if either CharSequence input is {@code null} */ - @Override public Double apply(final CharSequence left, final CharSequence right) { @@ -47,28 +89,30 @@ public Double apply(final CharSequence left, final CharSequence right) { return 0d; } - Set nLeft = new HashSet(); - Set nRight = new HashSet(); - - for (int i = 0; i < left.length() - 1; i++) { - char chr = left.charAt(i); - char nextChr = left.charAt(i + 1); - String bi = "" + chr + nextChr; - nLeft.add(bi); - } - for (int j = 0; j < right.length() - 1; j++) { - char chr = right.charAt(j); - char nextChr = right.charAt(j + 1); - String bi = "" + chr + nextChr; - nRight.add(bi); - } + Set nLeft = createBigrams(left); + Set nRight = createBigrams(right); final int total = nLeft.size() + nRight.size(); - final Set union = new HashSet(total); - union.addAll(nLeft); - union.addAll(nRight); + nLeft.retainAll(nRight); + final int intersection = nLeft.size(); - final int intersection = total - union.size(); return (2.0d * intersection) / total; } + + /** + * Method for creating Bigrams, bigrams are nothing but set of two consecutive + * characters. + * @param charSequence The char sequence for which we need set of bigrams. + * @return set of bigrams. 
+ */ + protected Set createBigrams(CharSequence charSequence) { + Set set = new HashSet(); + for (int i = 0; i < charSequence.length() - 1; i++) { + char chr = charSequence.charAt(i); + char nextChr = charSequence.charAt(i + 1); + String bi = "" + chr + nextChr; + set.add(bi); + } + return set; + } } diff --git a/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java index 1298bf4337..252e801126 100644 --- a/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java +++ b/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java @@ -35,7 +35,12 @@ public static void setUp() { } @Test - public void testGetJaroWinklerSimilarity_StringString() { + public void test() { + assertEquals(0.25d, similarity.apply("night", "nacht")); + } + + @Test + public void testGetSorensenDicesSimilarity_StringString() { assertEquals(1d, similarity.apply("", "")); assertEquals(1.0d, similarity.apply("foo", "foo")); @@ -58,21 +63,21 @@ public void testGetJaroWinklerSimilarity_StringString() { } @Test - public void testGetDicesCoefficientSimilarity_NullNull() { + public void testGetSorensenDicesSimilarity_NullNull() { assertThatIllegalArgumentException().isThrownBy(() -> { similarity.apply(null, null); }); } @Test - public void testGetDicesCoefficientSimilarity_StringNull() { + public void testGetSorensenDicesSimilarity_StringNull() { assertThatIllegalArgumentException().isThrownBy(() -> { similarity.apply(" ", null); }); } @Test - public void testGetDicesCoefficientSimilarity_NullString() { + public void testGetSorensenDicesSimilarity_NullString() { assertThatIllegalArgumentException().isThrownBy(() -> { similarity.apply(null, "clear"); }); From 4191840d20c54d1ef434bd0826b35a798e14accd Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Sat, 2 Mar 2019 00:20:56 +0530 Subject: [PATCH 03/16] replacing bit faster version of empty check --- 
.../apache/commons/text/similarity/SorensenDicesSimilarity.java | 2 +- .../commons/text/similarity/SorensenDicesSimilarityTest.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java index cc936fdb8c..d4df1d7138 100644 --- a/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java @@ -85,7 +85,7 @@ public Double apply(final CharSequence left, final CharSequence right) { return 1d; } - if ("".equals(left) || "".equals(right)) { + if (left.length() == 0 || right.length() == 0) { return 0d; } diff --git a/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java index 252e801126..9ba56b7c16 100644 --- a/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java +++ b/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java @@ -43,6 +43,8 @@ public void test() { public void testGetSorensenDicesSimilarity_StringString() { assertEquals(1d, similarity.apply("", "")); + assertEquals(0d, similarity.apply("", "a")); + assertEquals(0d, similarity.apply("a", "")); assertEquals(1.0d, similarity.apply("foo", "foo")); assertEquals(0.8d, similarity.apply("foo", "foo ")); assertEquals(0.4d, similarity.apply("frog", "fog")); From 9070f05f02740da15bdf4e30119cd9118f1cf599 Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Sun, 3 Mar 2019 18:13:43 +0530 Subject: [PATCH 04/16] Improved javadocs and handled some edge cases --- ...arity.java => SorensenDiceSimilarity.java} | 37 ++++++++++++------- ...t.java => SorensenDiceSimilarityTest.java} | 8 ++-- 2 files changed, 28 insertions(+), 17 deletions(-) rename 
src/main/java/org/apache/commons/text/similarity/{SorensenDicesSimilarity.java => SorensenDiceSimilarity.java} (74%) rename src/test/java/org/apache/commons/text/similarity/{SorensenDicesSimilarityTest.java => SorensenDiceSimilarityTest.java} (92%) diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java similarity index 74% rename from src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java rename to src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java index d4df1d7138..dc59d0fa74 100644 --- a/src/main/java/org/apache/commons/text/similarity/SorensenDicesSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java @@ -18,24 +18,27 @@ import java.util.HashSet; import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; /** * A similarity algorithm indicating the percentage of matched characters * between two character sequences. * *

- * The Sørensen–Dice coefficient is a statistic used for comparing the + * The Sørensen–Dice coefficient is a statistic used for comparing the * similarity of two samples. It was independently developed by the botanists - * Thorvald Sørensen and Lee Raymond Dice, who published in 1948 and 1945 + * Thorvald Sørensen and Lee Raymond Dice, who published in 1948 and 1945 * respectively. The index is known by several other names, especially - * Sørensen–Dice index, Sørensen index and Dice's coefficient. Other variations - * include the "similarity coefficient" or "index", such as Dice similarity - * coefficient (DSC). + * Sørensen–Dice index, Sørensen index and Dice's coefficient. Other + * variations include the "similarity coefficient" or "index", such as Dice + * similarity coefficient (DSC). *

* *

- * This implementation is based on the Sørensen–Dice similarity algorithm from - * * http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. * @@ -44,11 +47,17 @@ * * @since 1.7 */ -public class SorensenDicesSimilarity implements SimilarityScore { +public class SorensenDiceSimilarity implements SimilarityScore { /** + * Calculates Sorensen-Dice Similarity of two character sequences passed as + * input. * *

+     * similarity.apply(null, null)                 = IllegalArgumentException
+     * similarity.apply("foo", null)                = IllegalArgumentException
+     * similarity.apply(null, "foo")                = IllegalArgumentException
+     * similarity.apply("night", "nacht")           = 0.25
      * similarity.apply("", "")                     = 1.0
      * similarity.apply("foo", "foo")               = 1.0
      * similarity.apply("foo", "foo ")              = 0.8
@@ -81,11 +90,11 @@ public Double apply(final CharSequence left, final CharSequence right) {
             throw new IllegalArgumentException("CharSequences must not be null");
         }
 
-        if (left.equals(right)) {
+        if (StringUtils.equals(left, right)) {
             return 1d;
         }
 
-        if (left.length() == 0 || right.length() == 0) {
+        if (left.length() < 2 || right.length() < 2) {
             return 0d;
         }
 
@@ -93,15 +102,15 @@ public Double apply(final CharSequence left, final CharSequence right) {
         Set nRight = createBigrams(right);
 
         final int total = nLeft.size() + nRight.size();
-        nLeft.retainAll(nRight);
-        final int intersection = nLeft.size();
+        final long intersection = nLeft.stream().filter(nRight::contains).collect(Collectors.counting());
 
         return (2.0d * intersection) / total;
     }
 
     /**
-     * Method for creating Bigrams, bigrams are nothing but set of two consecutive
-     * characters.
+     * Method for creating bigrams - two consecutive characters. Returns a set of
+     * bigrams.
+     *
      * @param charSequence The char sequence for which we need set of bigrams.
      * @return set of bigrams.
      */
diff --git a/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java
similarity index 92%
rename from src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java
rename to src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java
index 9ba56b7c16..2e144e7628 100644
--- a/src/test/java/org/apache/commons/text/similarity/SorensenDicesSimilarityTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java
@@ -25,13 +25,13 @@
 /**
- * Unit tests for {@link SorensenDicesSimilarity}.
+ * Unit tests for {@link SorensenDiceSimilarity}.
  */
-public class SorensenDicesSimilarityTest {
+public class SorensenDiceSimilarityTest {
 
-    private static SorensenDicesSimilarity similarity;
+    private static SorensenDiceSimilarity similarity;
 
     @BeforeAll
     public static void setUp() {
-        similarity = new SorensenDicesSimilarity();
+        similarity = new SorensenDiceSimilarity();
     }
 
     @Test
@@ -45,6 +45,8 @@ public void testGetSorensenDicesSimilarity_StringString() {
         assertEquals(1d, similarity.apply("", ""));
         assertEquals(0d, similarity.apply("", "a"));
         assertEquals(0d, similarity.apply("a", ""));
+        assertEquals(1d, similarity.apply("a", "a"));
+        assertEquals(0d, similarity.apply("a", "b"));
         assertEquals(1.0d, similarity.apply("foo", "foo"));
         assertEquals(0.8d, similarity.apply("foo", "foo "));
         assertEquals(0.4d, similarity.apply("frog", "fog"));

From 505066c984c7b4bd99d64aa3d29f78a23f1db648 Mon Sep 17 00:00:00 2001
From: Alex Herbert 
Date: Thu, 7 Mar 2019 13:44:54 +0000
Subject: [PATCH 05/16] TEXT-155: Add a generic IntersectionSimilarity measure

---
 .../text/similarity/IntersectionResult.java   | 153 ++++++++++++
 .../similarity/IntersectionSimilarity.java    |  79 +++++++
 .../similarity/IntersectionResultTest.java    | 221 ++++++++++++++++++
 .../IntersectionSimilarityTest.java           | 143 ++++++++++++
 4 files changed, 596 insertions(+)
 create mode 100644 src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
 create mode 100644 src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
 create mode 100644 src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
 create mode 100644 src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java

diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
new file mode 100644
index 0000000000..faa2b42d57
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Objects;
+
+/**
+ * Container class to store the intersection results between two sets.
+ *
+ * 

Stores the size of set A, set B and the intersection of A and B (A ∩ B). + * The result can be used to produce various similarity metrics, for example the Jaccard or F1-score.

+ * + *

This class is immutable.

+ * + * @since 1.7 + * @see Jaccard index + * @see F1 score + */ +public class IntersectionResult { + /** + * The size of set A. + */ + private final int sizeA; + /** + * The size of set B. + */ + private final int sizeB; + /** + * The size of the intersection between set A and B. + */ + private final int intersection; + + /** + * Create the results for an intersection between two sets. + * + * @param sizeA the size of set A ({@code |A|}) + * @param sizeB the size of set B ({@code |B|}) + * @param intersection the size of the intersection of A and B (A ∩ B) + * @throws IllegalArgumentException if the sizes are negative or the intersection is greater + * than the minimum of the two set sizes + */ + public IntersectionResult(final int sizeA, final int sizeB, final int intersection) { + if (sizeA < 0) { + throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA); + } + if (sizeB < 0) { + throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB); + } + if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) { + throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection); + } + this.sizeA = sizeA; + this.sizeB = sizeB; + this.intersection = intersection; + } + + /** + * Get the size of set A (|A|). + * + * @return |A| + */ + public int getSizeA() { + return sizeA; + } + + /** + * Get the size of set B (|B|). + * + * @return |B| + */ + public int getSizeB() { + return sizeB; + } + + /** + * Get the size of the intersection between set A and B. + * + * @return A ∩ B + */ + public int getIntersection() { + return intersection; + } + /** + * Get the size of the union between set A and B. + * + * @return A ∩ B + */ + public long getUnion() { + return (long) sizeA + sizeB - intersection; + } + + /** + * Gets the Jaccard. + * + *

This implementation defines the result as zero if there is no intersection, + * even when the size of both sets is zero.

+ * + * @return the Jaccard + * @see Jaccard index + */ + public double getJaccard() { + return intersection == 0 ? 0.0 : (double) intersection / getUnion(); + } + + /** + * Gets the F1 score. + * + *

This implementation defines the result as zero if there is no intersection, + * even when the size of both sets is zero.

+ * + * @return the F1 score + * @see F1 score + */ + public double getF1Score() { + return intersection == 0 ? 0.0 : 2.0 * intersection / ((long) sizeA + sizeB); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final IntersectionResult result = (IntersectionResult) o; + return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection; + } + + @Override + public int hashCode() { + return Objects.hash(sizeA, sizeB, intersection); + } + + @Override + public String toString() { + return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection; + } +} diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java new file mode 100644 index 0000000000..f0deb06c48 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * Measures the intersection of two sets created from a pair of character + * sequences. + * + *

It is assumed that the type {@code T} correctly conforms to the + * requirements for storage within a {@link Set}, ideally the type is + * immutable and implements {@link Object#equals(Object)}.

+ * + * @param the type of the set extracted from the character sequence + * @since 1.7 + * @see Set + */ +public class IntersectionSimilarity implements SimilarityScore { + /** The converter used to create the set elements. */ + private final Function> converter; + + /** + * Create a new set similarity using the provided converter. + * + * @param converter the converter used to create the set + * @throws IllegalArgumentException if the converter is null + */ + public IntersectionSimilarity(Function> converter) { + if (converter == null) { + throw new IllegalArgumentException("Converter must not be null"); + } + this.converter = converter; + } + + /** + * Calculates the intersection of two character sequences passed as input. + * + * @param left first character sequence + * @param right second character sequence + * @return the intersection result + * @throws IllegalArgumentException if either input sequence is {@code null} + */ + @Override + public IntersectionResult apply(final CharSequence left, final CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Input cannot be null"); + } + final Set setA = converter.apply(left); + final Set setB = converter.apply(right); + final int sizeA = setA.size(); + final int sizeB = setB.size(); + // Short-cut if either set is empty + if (Math.min(sizeA, sizeB) == 0) { + // No intersection + return new IntersectionResult(sizeA, sizeB, 0); + } + // We can use intValue() to convert the Long output from the + // collector as the intersection cannot be bigger than either set. 
+ final int intersection = setA.stream().filter(setB::contains).collect(Collectors.counting()).intValue(); + return new IntersectionResult(sizeA, sizeB, intersection); + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java new file mode 100644 index 0000000000..a0c0923e95 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.concurrent.ThreadLocalRandom; + +/** + * Unit tests for {@link FuzzyScore}. 
+ */ +public class IntersectionResultTest { + + @Test + public void testNewIntersectionResult_WithZeros() { + final int sizeA = 0; + final int sizeB = 0; + final int intersection = 0; + new IntersectionResult(sizeA, sizeB, intersection); + } + + @Test + public void testNewIntersectionResult_WithNegativeSizeA() { + final int sizeA = -1; + final int sizeB = 0; + final int intersection = 0; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithNegativeSizeB() { + final int sizeA = 0; + final int sizeB = -1; + final int intersection = 0; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithNegativeIntersection() { + final int sizeA = 0; + final int sizeB = 0; + final int intersection = -1; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() { + final int sizeA = 1; + final int sizeB = 2; + final int intersection = Math.max(sizeA, sizeB) + 1; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeB, sizeA, intersection); + }); + } + + @Test + public void testUnion() { + // Union is the combined size minus the intersection + Assertions.assertEquals(0, getUnion(0, 0, 0)); + + Assertions.assertEquals(1, getUnion(1, 0, 0)); + Assertions.assertEquals(2, getUnion(1, 1, 0)); + Assertions.assertEquals(1, getUnion(1, 1, 1)); + + Assertions.assertEquals(2, getUnion(2, 0, 0)); + Assertions.assertEquals(3, getUnion(2, 1, 0)); + Assertions.assertEquals(2, getUnion(2, 1, 1)); + Assertions.assertEquals(2, 
getUnion(2, 2, 2)); + + // Test overflow of int addition + Assertions.assertEquals((long) Integer.MAX_VALUE + 1, getUnion(Integer.MAX_VALUE, 1, 0)); + Assertions.assertEquals((long) Integer.MAX_VALUE + 1, + getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE - 1)); + Assertions.assertEquals((long) Integer.MAX_VALUE + Integer.MAX_VALUE, + getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, 0)); + } + + private static long getUnion(int sizeA, int sizeB, int intersection) { + return new IntersectionResult(sizeA, sizeB, intersection).getUnion(); + } + + @Test + public void testJaccard() { + // Jaccard is the intersection divided by the union + Assertions.assertEquals(0, getJaccard(0, 0, 0)); + + Assertions.assertEquals(0, getJaccard(1, 0, 0)); + Assertions.assertEquals(0, getJaccard(1, 1, 0)); + Assertions.assertEquals(1, getJaccard(1, 1, 1)); + + Assertions.assertEquals(0, getJaccard(2, 0, 0)); + Assertions.assertEquals(0, getJaccard(2, 1, 0)); + Assertions.assertEquals(1.0 / 2, getJaccard(2, 1, 1)); + Assertions.assertEquals(1, getJaccard(2, 2, 2)); + + Assertions.assertEquals(2.0 / 21, getJaccard(20, 3, 2)); + } + + private static double getJaccard(int sizeA, int sizeB, int intersection) { + return new IntersectionResult(sizeA, sizeB, intersection).getJaccard(); + } + + @Test + public void testF1Score() { + // F1-score is 2 * intersection divided by the size of each set + Assertions.assertEquals(0, getF1Score(0, 0, 0)); + + Assertions.assertEquals(0, getF1Score(1, 0, 0)); + Assertions.assertEquals(0, getF1Score(1, 1, 0)); + Assertions.assertEquals(1, getF1Score(1, 1, 1)); + + Assertions.assertEquals(0, getF1Score(2, 0, 0)); + Assertions.assertEquals(0, getF1Score(2, 1, 0)); + Assertions.assertEquals(2 * 1.0 / (2 + 1), getF1Score(2, 1, 1)); + Assertions.assertEquals(1, getF1Score(2, 2, 2)); + + Assertions.assertEquals(2 * 2.0 / (20 + 3), getF1Score(20, 3, 2)); + } + + private static double getF1Score(int sizeA, int sizeB, int intersection) { + return new 
IntersectionResult(sizeA, sizeB, intersection).getF1Score(); + } + + @Test + public void testProperties() { + final ThreadLocalRandom rand = ThreadLocalRandom.current(); + final int max = 1024; + for (int i = 0; i < 5; i++) { + // Ensure the min is above 0 + final int sizeA = rand.nextInt(max) + 1; + final int sizeB = rand.nextInt(max) + 1; + final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); + final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + Assertions.assertEquals(sizeA, result.getSizeA()); + Assertions.assertEquals(sizeB, result.getSizeB()); + Assertions.assertEquals(intersection, result.getIntersection()); + } + } + + @Test + public void testEquals() { + final IntersectionResult[] results = new IntersectionResult[] { + new IntersectionResult(0, 0, 0), + new IntersectionResult(10, 0, 0), + new IntersectionResult(10, 10, 0), + new IntersectionResult(10, 10, 10), + }; + + // Test difference instance with same values + Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0))); + + final Object something = new Object(); + for (int i = 0; i < results.length; i++) { + Assertions.assertFalse(results[i].equals(something)); + Assertions.assertFalse(results[i].equals(null)); + for (int j = 0; j < results.length; j++) { + Assertions.assertTrue(results[i].equals(results[j]) == (i == j)); + } + } + } + + @Test + public void testHashCode() { + final IntersectionResult[] results = new IntersectionResult[] { + new IntersectionResult(10, 0, 0), + new IntersectionResult(10, 10, 0), + new IntersectionResult(10, 10, 10), + }; + final HashMap map = new HashMap<>(); + for (int i = 0; i < results.length; i++) { + map.put(results[i], i); + } + for (int i = 0; i < results.length; i++) { + Assertions.assertEquals(i, map.get(results[i])); + } + } + + @Test + public void testToString() { + final ThreadLocalRandom rand = ThreadLocalRandom.current(); + final int max = 9; + for (int i = 0; i < 5; i++) { + // Ensure the min is 
above 0 + final int sizeA = rand.nextInt(max) + 1; + final int sizeB = rand.nextInt(max) + 1; + final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); + final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + final String string = result.toString(); + // Not perfect as this will match substrings too. The chance of error + // is limited by restricting the numbers to a max of 10. + Assertions.assertTrue(string.contains(String.valueOf(sizeA))); + Assertions.assertTrue(string.contains(String.valueOf(sizeB))); + Assertions.assertTrue(string.contains(String.valueOf(intersection))); + } + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java new file mode 100644 index 0000000000..b5c66c3ad2 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.function.Function; + +/** + * Unit tests for {@link IntersectionSimilarity}. + */ +public class IntersectionSimilarityTest { + @Test + public void testGettingJaccardSimilarity() { + // Match the functionality of the JaccardSimilarity class + final Function> converter = (cs) -> { + final Set set = new HashSet<>(); + for (int i = 0; i < cs.length(); i++) { + set.add(cs.charAt(i)); + } + return set; + }; + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); + + assertEquals(0.00d, round(similarity.apply("", "").getJaccard()), 0.00000000000000000001d); + assertEquals(0.00d, round(similarity.apply("left", "").getJaccard()), 0.00000000000000000001d); + assertEquals(0.00d, round(similarity.apply("", "right").getJaccard()), 0.00000000000000000001d); + assertEquals(0.75d, round(similarity.apply("frog", "fog").getJaccard()), 0.00000000000000000001d); + assertEquals(0.00d, round(similarity.apply("fly", "ant").getJaccard()), 0.00000000000000000001d); + assertEquals(0.22d, round(similarity.apply("elephant", "hippo").getJaccard()), 0.00000000000000000001d); + assertEquals(0.64d, round(similarity.apply("ABC Corporation", "ABC Corp").getJaccard()), + 0.00000000000000000001d); + assertEquals(0.76d, round(similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccard()), + 0.00000000000000000001d); + assertEquals(0.89d, + round(similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getJaccard()), + 0.00000000000000000001d); + assertEquals(0.9d, round(similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccard()), + 0.00000000000000000001d); + assertEquals(0.13d, round(similarity.apply("left", "right").getJaccard()), 0.00000000000000000001d); + assertEquals(0.13d, round(similarity.apply("leettteft", "ritttght").getJaccard()), 0.00000000000000000001d); + assertEquals(1.0d, round(similarity.apply("the same string", "the same string").getJaccard()), + 0.00000000000000000001d); + } + + private static double round(double value) { + // For some undocumented reason the JaccardSimilarity rounds to 2 D.P. + return Math.round(value * 100d) / 100d; + } + + @Test + public void testGettingF1ScoreUsingBigrams() { + // Compute the F1-score using pairs of characters (bigrams) + final Function> converter = (cs) -> { + final Set set = new HashSet<>(); + final int length = cs.length(); + if (length > 1) { + final char[] bigram = new char[2]; + bigram[1] = cs.charAt(0); + for (int i = 1; i < cs.length(); i++) { + bigram[0] = bigram[1]; + bigram[1] = cs.charAt(i); + set.add(new String(bigram)); + } + } + return set; + }; + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); + + // Note that when there are no bigrams then the similarity is zero. 
+ + assertEquals(0d, similarity.apply("", "").getF1Score()); + assertEquals(0d, similarity.apply("", "a").getF1Score()); + assertEquals(0d, similarity.apply("a", "").getF1Score()); + assertEquals(0d, similarity.apply("a", "a").getF1Score()); + assertEquals(0d, similarity.apply("a", "b").getF1Score()); + assertEquals(1.0d, similarity.apply("foo", "foo").getF1Score()); + assertEquals(0.8d, similarity.apply("foo", "foo ").getF1Score()); + assertEquals(0.4d, similarity.apply("frog", "fog").getF1Score()); + assertEquals(0.0d, similarity.apply("fly", "ant").getF1Score()); + assertEquals(0.0d, similarity.apply("elephant", "hippo").getF1Score()); + assertEquals(0.0d, similarity.apply("hippo", "elephant").getF1Score()); + assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz").getF1Score()); + assertEquals(0.5d, similarity.apply("hello", "hallo").getF1Score()); + assertEquals(0.7d, similarity.apply("ABC Corporation", "ABC Corp").getF1Score()); + assertEquals(0.7391304347826086d, + similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getF1Score()); + assertEquals(0.8076923076923077d, + similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getF1Score()); + assertEquals(0.6956521739130435, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getF1Score()); + assertEquals(0.9230769230769231, similarity.apply("/opt/software1", "/opt/software2").getF1Score()); + assertEquals(0.5d, similarity.apply("aaabcd", "aaacdb").getF1Score()); + assertEquals(0.631578947368421, similarity.apply("John Horn", "John Hopkins").getF1Score()); + } + + @Test + public void testConstructorWithNullConverterThrows() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>(null); + }); + } + + @Test + public void testGettingSetSimilarityNullNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, null); + }); + } + + @Test + public void testGettingSetSimilarityStringNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply("left", null); + }); + } + + @Test + public void testGettingSetSimilarityNullString() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); + }); + } +} From 8b92150563182de5dd8922a1345e36237ab43cd0 Mon Sep 17 00:00:00 2001 From: Alex Herbert Date: Thu, 7 Mar 2019 13:44:54 +0000 Subject: [PATCH 06/16] TEXT-155: Add a generic IntersectionSimilarity measure --- .../text/similarity/IntersectionResult.java | 153 ++++++++++++ .../similarity/IntersectionSimilarity.java | 79 +++++++ .../similarity/IntersectionResultTest.java | 221 ++++++++++++++++++ .../IntersectionSimilarityTest.java | 139 +++++++++++ 4 files changed, 592 insertions(+) create mode 100644 src/main/java/org/apache/commons/text/similarity/IntersectionResult.java create mode 100644 src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java create mode 100644 
src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java create mode 100644 src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java new file mode 100644 index 0000000000..faa2b42d57 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Objects; + +/** + * Container class to store the intersection results between two sets. + * + *

<p>Stores the size of set A, set B and the intersection of A and B (A ∩ B). + * The result can be used to produce various similarity metrics, for example the Jaccard or F1-score.</p>

+ * + *

<p>This class is immutable.</p>

+ * + * @since 1.7 + * @see Jaccard index + * @see F1 score + */ +public class IntersectionResult { + /** + * The size of set A. + */ + private final int sizeA; + /** + * The size of set B. + */ + private final int sizeB; + /** + * The size of the intersection between set A and B. + */ + private final int intersection; + + /** + * Create the results for an intersection between two sets. + * + * @param sizeA the size of set A ({@code |A|}) + * @param sizeB the size of set B ({@code |B|}) + * @param intersection the size of the intersection of A and B (A ∩ B) + * @throws IllegalArgumentException if the sizes are negative or the intersection is greater + * than the minimum of the two set sizes + */ + public IntersectionResult(final int sizeA, final int sizeB, final int intersection) { + if (sizeA < 0) { + throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA); + } + if (sizeB < 0) { + throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB); + } + if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) { + throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection); + } + this.sizeA = sizeA; + this.sizeB = sizeB; + this.intersection = intersection; + } + + /** + * Get the size of set A (|A|). + * + * @return |A| + */ + public int getSizeA() { + return sizeA; + } + + /** + * Get the size of set B (|B|). + * + * @return |B| + */ + public int getSizeB() { + return sizeB; + } + + /** + * Get the size of the intersection between set A and B. + * + * @return A ∩ B + */ + public int getIntersection() { + return intersection; + } + /** + * Get the size of the union between set A and B. + * + * @return A ∪ B + */ + public long getUnion() { + return (long) sizeA + sizeB - intersection; + } + + /** + * Gets the Jaccard. + * + *

<p>This implementation defines the result as zero if there is no intersection, + * even when the size of both sets is zero.</p>

+ * + * @return the Jaccard + * @see Jaccard index + */ + public double getJaccard() { + return intersection == 0 ? 0.0 : (double) intersection / getUnion(); + } + + /** + * Gets the F1 score. + * + *

<p>This implementation defines the result as zero if there is no intersection, + * even when the size of both sets is zero.</p>

+ * + * @return the F1 score + * @see F1 score + */ + public double getF1Score() { + return intersection == 0 ? 0.0 : 2.0 * intersection / ((long) sizeA + sizeB); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final IntersectionResult result = (IntersectionResult) o; + return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection; + } + + @Override + public int hashCode() { + return Objects.hash(sizeA, sizeB, intersection); + } + + @Override + public String toString() { + return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection; + } +} diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java new file mode 100644 index 0000000000..f0deb06c48 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * Measures the intersection of two sets created from a pair of character + * sequences. + * + *

<p>It is assumed that the type {@code T} correctly conforms to the + * requirements for storage within a {@link Set}; ideally the type is + * immutable and implements {@link Object#equals(Object)} and {@link Object#hashCode()}.</p>

+ * + * @param the type of the set extracted from the character sequence + * @since 1.7 + * @see Set + */ +public class IntersectionSimilarity implements SimilarityScore { + /** The converter used to create the set elements. */ + private final Function> converter; + + /** + * Create a new set similarity using the provided converter. + * + * @param converter the converter used to create the set + * @throws IllegalArgumentException if the converter is null + */ + public IntersectionSimilarity(Function> converter) { + if (converter == null) { + throw new IllegalArgumentException("Converter must not be null"); + } + this.converter = converter; + } + + /** + * Calculates the intersection of two character sequences passed as input. + * + * @param left first character sequence + * @param right second character sequence + * @return the intersection result + * @throws IllegalArgumentException if either input sequence is {@code null} + */ + @Override + public IntersectionResult apply(final CharSequence left, final CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Input cannot be null"); + } + final Set setA = converter.apply(left); + final Set setB = converter.apply(right); + final int sizeA = setA.size(); + final int sizeB = setB.size(); + // Short-cut if either set is empty + if (Math.min(sizeA, sizeB) == 0) { + // No intersection + return new IntersectionResult(sizeA, sizeB, 0); + } + // We can use intValue() to convert the Long output from the + // collector as the intersection cannot be bigger than either set. 
+ final int intersection = setA.stream().filter(setB::contains).collect(Collectors.counting()).intValue(); + return new IntersectionResult(sizeA, sizeB, intersection); + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java new file mode 100644 index 0000000000..a0c0923e95 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.concurrent.ThreadLocalRandom; + +/** + * Unit tests for {@link IntersectionResult}.
+ */ +public class IntersectionResultTest { + + @Test + public void testNewIntersectionResult_WithZeros() { + final int sizeA = 0; + final int sizeB = 0; + final int intersection = 0; + new IntersectionResult(sizeA, sizeB, intersection); + } + + @Test + public void testNewIntersectionResult_WithNegativeSizeA() { + final int sizeA = -1; + final int sizeB = 0; + final int intersection = 0; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithNegativeSizeB() { + final int sizeA = 0; + final int sizeB = -1; + final int intersection = 0; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithNegativeIntersection() { + final int sizeA = 0; + final int sizeB = 0; + final int intersection = -1; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + } + + @Test + public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() { + final int sizeA = 1; + final int sizeB = 2; + final int intersection = Math.max(sizeA, sizeB) + 1; + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeA, sizeB, intersection); + }); + Assertions.assertThrows(IllegalArgumentException.class, () -> { + new IntersectionResult(sizeB, sizeA, intersection); + }); + } + + @Test + public void testUnion() { + // Union is the combined size minus the intersection + Assertions.assertEquals(0, getUnion(0, 0, 0)); + + Assertions.assertEquals(1, getUnion(1, 0, 0)); + Assertions.assertEquals(2, getUnion(1, 1, 0)); + Assertions.assertEquals(1, getUnion(1, 1, 1)); + + Assertions.assertEquals(2, getUnion(2, 0, 0)); + Assertions.assertEquals(3, getUnion(2, 1, 0)); + Assertions.assertEquals(2, getUnion(2, 1, 1)); + Assertions.assertEquals(2, 
getUnion(2, 2, 2)); + + // Test overflow of int addition + Assertions.assertEquals((long) Integer.MAX_VALUE + 1, getUnion(Integer.MAX_VALUE, 1, 0)); + Assertions.assertEquals((long) Integer.MAX_VALUE + 1, + getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE - 1)); + Assertions.assertEquals((long) Integer.MAX_VALUE + Integer.MAX_VALUE, + getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, 0)); + } + + private static long getUnion(int sizeA, int sizeB, int intersection) { + return new IntersectionResult(sizeA, sizeB, intersection).getUnion(); + } + + @Test + public void testJaccard() { + // Jaccard is the intersection divided by the union + Assertions.assertEquals(0, getJaccard(0, 0, 0)); + + Assertions.assertEquals(0, getJaccard(1, 0, 0)); + Assertions.assertEquals(0, getJaccard(1, 1, 0)); + Assertions.assertEquals(1, getJaccard(1, 1, 1)); + + Assertions.assertEquals(0, getJaccard(2, 0, 0)); + Assertions.assertEquals(0, getJaccard(2, 1, 0)); + Assertions.assertEquals(1.0 / 2, getJaccard(2, 1, 1)); + Assertions.assertEquals(1, getJaccard(2, 2, 2)); + + Assertions.assertEquals(2.0 / 21, getJaccard(20, 3, 2)); + } + + private static double getJaccard(int sizeA, int sizeB, int intersection) { + return new IntersectionResult(sizeA, sizeB, intersection).getJaccard(); + } + + @Test + public void testF1Score() { + // F1-score is 2 * intersection divided by the size of each set + Assertions.assertEquals(0, getF1Score(0, 0, 0)); + + Assertions.assertEquals(0, getF1Score(1, 0, 0)); + Assertions.assertEquals(0, getF1Score(1, 1, 0)); + Assertions.assertEquals(1, getF1Score(1, 1, 1)); + + Assertions.assertEquals(0, getF1Score(2, 0, 0)); + Assertions.assertEquals(0, getF1Score(2, 1, 0)); + Assertions.assertEquals(2 * 1.0 / (2 + 1), getF1Score(2, 1, 1)); + Assertions.assertEquals(1, getF1Score(2, 2, 2)); + + Assertions.assertEquals(2 * 2.0 / (20 + 3), getF1Score(20, 3, 2)); + } + + private static double getF1Score(int sizeA, int sizeB, int intersection) { + return new 
IntersectionResult(sizeA, sizeB, intersection).getF1Score(); + } + + @Test + public void testProperties() { + final ThreadLocalRandom rand = ThreadLocalRandom.current(); + final int max = 1024; + for (int i = 0; i < 5; i++) { + // Ensure the min is above 0 + final int sizeA = rand.nextInt(max) + 1; + final int sizeB = rand.nextInt(max) + 1; + final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); + final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + Assertions.assertEquals(sizeA, result.getSizeA()); + Assertions.assertEquals(sizeB, result.getSizeB()); + Assertions.assertEquals(intersection, result.getIntersection()); + } + } + + @Test + public void testEquals() { + final IntersectionResult[] results = new IntersectionResult[] { + new IntersectionResult(0, 0, 0), + new IntersectionResult(10, 0, 0), + new IntersectionResult(10, 10, 0), + new IntersectionResult(10, 10, 10), + }; + + // Test difference instance with same values + Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0))); + + final Object something = new Object(); + for (int i = 0; i < results.length; i++) { + Assertions.assertFalse(results[i].equals(something)); + Assertions.assertFalse(results[i].equals(null)); + for (int j = 0; j < results.length; j++) { + Assertions.assertTrue(results[i].equals(results[j]) == (i == j)); + } + } + } + + @Test + public void testHashCode() { + final IntersectionResult[] results = new IntersectionResult[] { + new IntersectionResult(10, 0, 0), + new IntersectionResult(10, 10, 0), + new IntersectionResult(10, 10, 10), + }; + final HashMap map = new HashMap<>(); + for (int i = 0; i < results.length; i++) { + map.put(results[i], i); + } + for (int i = 0; i < results.length; i++) { + Assertions.assertEquals(i, map.get(results[i])); + } + } + + @Test + public void testToString() { + final ThreadLocalRandom rand = ThreadLocalRandom.current(); + final int max = 9; + for (int i = 0; i < 5; i++) { + // Ensure the min is 
above 0 + final int sizeA = rand.nextInt(max) + 1; + final int sizeB = rand.nextInt(max) + 1; + final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); + final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + final String string = result.toString(); + // Not perfect as this will match substrings too. The chance of error + // is limited by restricting the numbers to a max of 10. + Assertions.assertTrue(string.contains(String.valueOf(sizeA))); + Assertions.assertTrue(string.contains(String.valueOf(sizeB))); + Assertions.assertTrue(string.contains(String.valueOf(intersection))); + } + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java new file mode 100644 index 0000000000..b1054ac6f2 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.function.Function; + +/** + * Unit tests for {@link IntersectionSimilarity}. + */ +public class IntersectionSimilarityTest { + @Test + public void testGetJaccardSimilarityUsingChars() { + // Match the functionality of the JaccardSimilarity class by dividing + // the sequence into single characters + final Function> converter = (cs) -> { + final int length = cs.length(); + final Set set = new HashSet<>(length); + for (int i = 0; i < length; i++) { + set.add(cs.charAt(i)); + } + return set; + }; + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); + + // This is explicitly implemented instead of using the JaccardSimilarity + // since that class does rounding to 2 D.P. + // Results generated using the python distance library using: 1 - distance.jaccard(seq1, seq2) + assertEquals(0.0, similarity.apply("", "").getJaccard()); + assertEquals(0.0, similarity.apply("left", "").getJaccard()); + assertEquals(0.0, similarity.apply("", "right").getJaccard()); + assertEquals(0.75, similarity.apply("frog", "fog").getJaccard()); + assertEquals(0.0, similarity.apply("fly", "ant").getJaccard()); + assertEquals(0.2222222222222222, similarity.apply("elephant", "hippo").getJaccard()); + assertEquals(0.6363636363636364, similarity.apply("ABC Corporation", "ABC Corp").getJaccard()); + assertEquals(0.7647058823529411, + similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccard()); + assertEquals(0.8888888888888888, + similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getJaccard()); + assertEquals(0.9, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccard()); + assertEquals(0.125, similarity.apply("left", "right").getJaccard()); + assertEquals(0.125, similarity.apply("leettteft", "ritttght").getJaccard()); + assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccard()); + } + + @Test + public void testGetF1ScoreUsingBigrams() { + // Compute the F1-score using pairs of characters (bigrams). + // This can be done using a 32-bit int to store two 16-bit characters + final Function> converter = (cs) -> { + final int length = cs.length(); + final Set set = new HashSet<>(length); + if (length > 1) { + char ch2 = cs.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = cs.charAt(i); + set.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + return set; + }; + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); + + // Note that when there are no bigrams then the similarity is zero. 
+ + assertEquals(0d, similarity.apply("", "").getF1Score()); + assertEquals(0d, similarity.apply("", "a").getF1Score()); + assertEquals(0d, similarity.apply("a", "").getF1Score()); + assertEquals(0d, similarity.apply("a", "a").getF1Score()); + assertEquals(0d, similarity.apply("a", "b").getF1Score()); + assertEquals(1.0d, similarity.apply("foo", "foo").getF1Score()); + assertEquals(0.8d, similarity.apply("foo", "foo ").getF1Score()); + assertEquals(0.4d, similarity.apply("frog", "fog").getF1Score()); + assertEquals(0.0d, similarity.apply("fly", "ant").getF1Score()); + assertEquals(0.0d, similarity.apply("elephant", "hippo").getF1Score()); + assertEquals(0.0d, similarity.apply("hippo", "elephant").getF1Score()); + assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz").getF1Score()); + assertEquals(0.5d, similarity.apply("hello", "hallo").getF1Score()); + assertEquals(0.7d, similarity.apply("ABC Corporation", "ABC Corp").getF1Score()); + assertEquals(0.7391304347826086d, + similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getF1Score()); + assertEquals(0.8076923076923077d, + similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getF1Score()); + assertEquals(0.6956521739130435, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getF1Score()); + assertEquals(0.9230769230769231, similarity.apply("/opt/software1", "/opt/software2").getF1Score()); + assertEquals(0.5d, similarity.apply("aaabcd", "aaacdb").getF1Score()); + assertEquals(0.631578947368421, similarity.apply("John Horn", "John Hopkins").getF1Score()); + } + + @Test + public void testConstructorWithNullConverterThrows() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>(null); + }); + } + + @Test + public void testApplyNullNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, null); + }); + } + + @Test + public void testApplyStringNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply("left", null); + }); + } + + @Test + public void testApplyNullString() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); + }); + } +} From 0c66921f59257450fc5a030b2499b815062cdeb0 Mon Sep 17 00:00:00 2001 From: aherbert Date: Fri, 8 Mar 2019 15:09:08 +0000 Subject: [PATCH 07/16] TEXT-155: IntersectionSimilarity to support duplicates in union. This adds Bag functionality to allow counting duplicate elements. 
--- .../text/similarity/IntersectionResult.java | 41 ++-- .../similarity/IntersectionSimilarity.java | 156 ++++++++++++-- .../similarity/IntersectionResultTest.java | 33 +-- .../IntersectionSimilarityTest.java | 198 +++++++++++++----- 4 files changed, 332 insertions(+), 96 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java index faa2b42d57..e63f2b9503 100644 --- a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java @@ -22,12 +22,14 @@ * Container class to store the intersection results between two sets. * *

Stores the size of set A, set B and the intersection of A and B (A ∩ B). - * The result can be used to produce various similarity metrics, for example the Jaccard or F1-score.

+ * The result can be used to produce various similarity metrics, for example the Jaccard index or + * Sørensen-Dice coefficient (F1 score).

* *

This class is immutable.

* * @since 1.7 * @see Jaccard index + * @see Sørensen Dice coefficient * @see F1 score */ public class IntersectionResult { @@ -69,7 +71,7 @@ public IntersectionResult(final int sizeA, final int sizeB, final int intersecti } /** - * Get the size of set A (|A|). + * Get the size of set A. * * @return |A| */ @@ -78,7 +80,7 @@ public int getSizeA() { } /** - * Get the size of set B (|B|). + * Get the size of set B. * * @return |B| */ @@ -89,7 +91,7 @@ public int getSizeB() { /** * Get the size of the intersection between set A and B. * - * @return A ∩ B + * @return |A ∩ B| */ public int getIntersection() { return intersection; @@ -97,35 +99,46 @@ public int getIntersection() { /** * Get the size of the union between set A and B. * - * @return A ∩ B + * @return |A ∪ B| */ public long getUnion() { return (long) sizeA + sizeB - intersection; } /** - * Gets the Jaccard. + * Gets the Jaccard index. The Jaccard is the intersection divided by the union. + * + *
|A ∩ B| / |A ∪ B| 
* *

This implementation defines the result as zero if there is no intersection, - * even when the size of both sets is zero.

+ * even when the union is zero, to avoid a {@link Double#NaN} result.

* - * @return the Jaccard + * @return the Jaccard index * @see Jaccard index */ - public double getJaccard() { + public double getJaccardIndex() { return intersection == 0 ? 0.0 : (double) intersection / getUnion(); } /** - * Gets the F1 score. + * Gets the Sørensen-Dice coefficient. The coefficient is twice the size of the intersection + * divided by the size of both sets. * - *

This implementation defines the result as zero if there is no intersection, - * even when the size of both sets is zero.

+ *
+     * 2|A ∩ B| / (|A| + |B|) 
+     * 
+ * + *

This is also known as the F1 score. + * + *

This implementation defines the result as zero if there is no intersection, even when the size + * of both sets is zero, to avoid a {@link Double#NaN} result.

* - * @return the F1 score + * @return the Sørensen-Dice coefficient + * @see Sørensen + * Dice coefficient * @see F1 score */ - public double getF1Score() { + public double getSorensenDiceCoefficient() { return intersection == 0 ? 0.0 : 2.0 * intersection / ((long) sizeA + sizeB); } diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java index f0deb06c48..0751553066 100644 --- a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java @@ -16,33 +16,110 @@ */ package org.apache.commons.text.similarity; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; import java.util.Set; import java.util.function.Function; -import java.util.stream.Collectors; +import java.util.stream.Stream; /** - * Measures the intersection of two sets created from a pair of character - * sequences. + * Measures the intersection of two sets created from a pair of character sequences. * - *

It is assumed that the type {@code T} correctly conforms to the - * requirements for storage within a {@link Set}, ideally the type is - * immutable and implements {@link Object#equals(Object)}.

+ *

It is assumed that the type {@code T} correctly conforms to the requirements for storage + * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements + * {@link Object#equals(Object)} and {@link Object#hashCode()}.

* - * @param the type of the set extracted from the character sequence + * @param the type of the elements extracted from the character sequence * @since 1.7 * @see Set + * @see HashMap */ public class IntersectionSimilarity implements SimilarityScore { - /** The converter used to create the set elements. */ - private final Function> converter; + /** The converter used to create the elements from the characters. */ + private final Function> converter; + + // The following is adapted from commons-collections for a Bag. + // A Bag is a collection that can store the count of the number + // of copies of each element. + + /** + * Mutable counter class for storing the count of elements. + */ + private static class BagCount { + /** The count. This is initialised to 1 upon construction. */ + int count = 1; + } + + /** + * A minimal implementation of a Bag that can store elements and a count. + * + *

For the intended purpose the Bag does not have to be a {@link Collection}. It does not + * even have to know its own size. + */ + private class TinyBag { + /** The backing map. */ + private final Map map; + + /** + * Create a new tiny bag. + * + * @param initialCapacity the initial capacity + */ + TinyBag(int initialCapacity) { + map = new HashMap<>(initialCapacity); + } + + /** + * Adds a new element to the bag, incrementing its count in the underlying map. + * + * @param object the object to add + */ + void add(T object) { + final BagCount mut = map.get(object); + if (mut == null) { + map.put(object, new BagCount()); + } else { + mut.count++; + } + } + + /** + * Returns the number of occurrences of the given element in this bag by + * looking up its count in the underlying map. + * + * @param object the object to search for + * @return the number of occurrences of the object, zero if not found + */ + int getCount(final Object object) { + final BagCount count = map.get(object); + if (count != null) { + return count.count; + } + return 0; + } + + /** + * Returns a possibly parallel Stream of all the entries in the bag. + * + * @return the stream + */ + Stream> parallelStream() { + return map.entrySet().parallelStream(); + } + } /** * Create a new set similarity using the provided converter. * - * @param converter the converter used to create the set + *

If the converter returns a {@link Set} then the intersection result will + * not include duplicates. Any other {@link Collection} is used to produce a result + * that will include duplicates in the intersect and union. + * + * @param converter the converter used to create the elements from the characters. * @throws IllegalArgumentException if the converter is null */ - public IntersectionSimilarity(Function> converter) { + public IntersectionSimilarity(Function> converter) { if (converter == null) { throw new IllegalArgumentException("Converter must not be null"); } @@ -62,18 +139,59 @@ public IntersectionResult apply(final CharSequence left, final CharSequence righ if (left == null || right == null) { throw new IllegalArgumentException("Input cannot be null"); } - final Set setA = converter.apply(left); - final Set setB = converter.apply(right); - final int sizeA = setA.size(); - final int sizeB = setB.size(); - // Short-cut if either set is empty + + // Create the elements from the sequences + final Collection objectsA = converter.apply(left); + final Collection objectsB = converter.apply(right); + final int sizeA = objectsA.size(); + final int sizeB = objectsB.size(); + + // Short-cut if either collection is empty if (Math.min(sizeA, sizeB) == 0) { // No intersection return new IntersectionResult(sizeA, sizeB, 0); } - // We can use intValue() to convert the Long output from the - // collector as the intersection cannot be bigger than either set. - final int intersection = setA.stream().filter(setB::contains).collect(Collectors.counting()).intValue(); + + // Intersection = count the number of shared elements + int intersection; + if (objectsA instanceof Set) { + // If a Set then the elements will only have a count of 1. + // Stream the elements in the set A and check if also in set B. + // Note: Even if objectsB is a plain collection this will work + // since the contains(Object) method will return true when present. 
+ // The fact that objectsA is a Set ensures non duplicate counting. + intersection = objectsA.parallelStream() + .mapToInt(element -> objectsB.contains(element) ? 1 : 0) + .sum(); + } else { + // Create a bag for each collection + final TinyBag bagA = toBag(objectsA); + final TinyBag bagB = toBag(objectsB); + // Stream the count of each element in bag A and find the intersection with bag B + intersection = bagA.parallelStream() + .mapToInt(entry -> { + // The intersection of this entry in both bags is the min count + final T element = entry.getKey(); + final int count = entry.getValue().count; + return Math.min(count, bagB.getCount(element)); + }) + .sum(); + } + return new IntersectionResult(sizeA, sizeB, intersection); } + + /** + * Convert the collection to a bag. The bag will contain the count of each element in the collection. + * + * @param objects the objects + * @return the bag + */ + private TinyBag toBag(Collection objects) { + final TinyBag bag = new TinyBag(objects.size()); + for (T t : objects) { + bag.add(t); + } + return bag; + } } diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java index a0c0923e95..b21275bdf8 100644 --- a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java @@ -122,28 +122,28 @@ public void testJaccard() { } private static double getJaccard(int sizeA, int sizeB, int intersection) { - return new IntersectionResult(sizeA, sizeB, intersection).getJaccard(); + return new IntersectionResult(sizeA, sizeB, intersection).getJaccardIndex(); } @Test - public void testF1Score() { - // F1-score is 2 * intersection divided by the size of each set - Assertions.assertEquals(0, getF1Score(0, 0, 0)); + public void testSorensenDice() { + // Sorensen-Dice is 2 * intersection divided by the size of each set + 
Assertions.assertEquals(0, getSorensenDice(0, 0, 0)); - Assertions.assertEquals(0, getF1Score(1, 0, 0)); - Assertions.assertEquals(0, getF1Score(1, 1, 0)); - Assertions.assertEquals(1, getF1Score(1, 1, 1)); + Assertions.assertEquals(0, getSorensenDice(1, 0, 0)); + Assertions.assertEquals(0, getSorensenDice(1, 1, 0)); + Assertions.assertEquals(2.0 * 1 / (1 + 1), getSorensenDice(1, 1, 1)); - Assertions.assertEquals(0, getF1Score(2, 0, 0)); - Assertions.assertEquals(0, getF1Score(2, 1, 0)); - Assertions.assertEquals(2 * 1.0 / (2 + 1), getF1Score(2, 1, 1)); - Assertions.assertEquals(1, getF1Score(2, 2, 2)); + Assertions.assertEquals(0, getSorensenDice(2, 0, 0)); + Assertions.assertEquals(0, getSorensenDice(2, 1, 0)); + Assertions.assertEquals(2.0 * 1 / (2 + 1), getSorensenDice(2, 1, 1)); + Assertions.assertEquals(2.0 * 2 / (2 + 2), getSorensenDice(2, 2, 2)); - Assertions.assertEquals(2 * 2.0 / (20 + 3), getF1Score(20, 3, 2)); + Assertions.assertEquals(2.0 * 2 / (20 + 3), getSorensenDice(20, 3, 2)); } - private static double getF1Score(int sizeA, int sizeB, int intersection) { - return new IntersectionResult(sizeA, sizeB, intersection).getF1Score(); + private static double getSorensenDice(int sizeA, int sizeB, int intersection) { + return new IntersectionResult(sizeA, sizeB, intersection).getSorensenDiceCoefficient(); } @Test @@ -192,11 +192,12 @@ public void testHashCode() { new IntersectionResult(10, 10, 10), }; final HashMap map = new HashMap<>(); + final int offset = 123; for (int i = 0; i < results.length; i++) { - map.put(results[i], i); + map.put(results[i], i + offset); } for (int i = 0; i < results.length; i++) { - Assertions.assertEquals(i, map.get(results[i])); + Assertions.assertEquals(i + offset, map.get(results[i])); } } diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java index b1054ac6f2..25f3016920 100644 --- 
a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java +++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java @@ -21,8 +21,11 @@ import org.junit.jupiter.api.Test; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.function.Function; @@ -31,10 +34,10 @@ */ public class IntersectionSimilarityTest { @Test - public void testGetJaccardSimilarityUsingChars() { + public void testJaccardIndexUsingSetCharacter() { // Match the functionality of the JaccardSimilarity class by dividing // the sequence into single characters - final Function> converter = (cs) -> { + final Function> converter = (cs) -> { final int length = cs.length(); final Set set = new HashSet<>(length); for (int i = 0; i < length; i++) { @@ -44,31 +47,63 @@ public void testGetJaccardSimilarityUsingChars() { }; final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); - // This is explicitly implemented instead of using the JaccardSimilarity - // since that class does rounding to 2 D.P. - // Results generated using the python distance library using: 1 - distance.jaccard(seq1, seq2) - assertEquals(0.0, similarity.apply("", "").getJaccard()); - assertEquals(0.0, similarity.apply("left", "").getJaccard()); - assertEquals(0.0, similarity.apply("", "right").getJaccard()); - assertEquals(0.75, similarity.apply("frog", "fog").getJaccard()); - assertEquals(0.0, similarity.apply("fly", "ant").getJaccard()); - assertEquals(0.2222222222222222, similarity.apply("elephant", "hippo").getJaccard()); - assertEquals(0.6363636363636364, similarity.apply("ABC Corporation", "ABC Corp").getJaccard()); - assertEquals(0.7647058823529411, - similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccard()); - assertEquals(0.8888888888888888, - similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getJaccard()); - assertEquals(0.9, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccard()); - assertEquals(0.125, similarity.apply("left", "right").getJaccard()); - assertEquals(0.125, similarity.apply("leettteft", "ritttght").getJaccard()); - assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccard()); + // Expected Jaccard index = (intersect / union) + // intersection = count of unique matching characters (exclude duplicates) + // union = count of unique characters + assertEquals(0.0, similarity.apply("", "").getJaccardIndex()); + assertEquals(0.0, similarity.apply("left", "").getJaccardIndex()); + assertEquals(0.0, similarity.apply("", "right").getJaccardIndex()); + assertEquals(3.0 / 4, similarity.apply("frog", "fog").getJaccardIndex()); + assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex()); + assertEquals(2.0 / 9, similarity.apply("elephant", "hippo").getJaccardIndex()); + assertEquals(7.0 / 11, similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex()); + assertEquals(13.0 / 17, similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccardIndex()); + assertEquals(16.0 / 18, + similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex()); + assertEquals(9.0 / 10, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex()); + assertEquals(1.0 / 8, similarity.apply("left", "right").getJaccardIndex()); + assertEquals(1.0 / 8, similarity.apply("leettteft", "ritttght").getJaccardIndex()); + assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex()); } @Test - public void testGetF1ScoreUsingBigrams() { - // Compute the F1-score using pairs of characters (bigrams). 
+ public void testJaccardIndexUsingListCharacter() { + // This test uses a list and so duplicates should be matched + final Function> converter = (cs) -> { + final int length = cs.length(); + final List list = new ArrayList<>(length); + for (int i = 0; i < length; i++) { + list.add(cs.charAt(i)); + } + return list; + }; + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); + + // Expected Jaccard index = (intersect / union) + // intersection = count of matching characters including duplicates + // union = left.length() + right.length() - intersection + assertEquals(0.0, similarity.apply("", "").getJaccardIndex()); + assertEquals(0.0, similarity.apply("left", "").getJaccardIndex()); + assertEquals(0.0, similarity.apply("", "right").getJaccardIndex()); + assertEquals(3.0 / (4 + 3 - 3), similarity.apply("frog", "fog").getJaccardIndex()); + assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex()); + assertEquals(2.0 / (8 + 5 - 2), similarity.apply("elephant", "hippo").getJaccardIndex()); + assertEquals(8.0 / (15 + 8 - 8), similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex()); + assertEquals(20.0 / (21 + 23 - 20), + similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccardIndex()); + assertEquals(24.0 / (32 + 25 - 24), + similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex()); + assertEquals(11.0 / (12 + 13 - 11), similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex()); + assertEquals(1.0 / (4 + 5 - 1), similarity.apply("left", "right").getJaccardIndex()); + assertEquals(4.0 / (9 + 8 - 4), similarity.apply("leettteft", "ritttght").getJaccardIndex()); + assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex()); + } + + @Test + public void testSorensenDiceCoefficientUsingSetBigrams() { + // Compute using pairs of characters (bigrams). 
// This can be done using a 32-bit int to store two 16-bit characters - final Function> converter = (cs) -> { + final Function> converter = (cs) -> { final int length = cs.length(); final Set set = new HashSet<>(length); if (length > 1) { @@ -83,30 +118,99 @@ public void testGetF1ScoreUsingBigrams() { }; final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); - // Note that when there are no bigrams then the similarity is zero. - - assertEquals(0d, similarity.apply("", "").getF1Score()); - assertEquals(0d, similarity.apply("", "a").getF1Score()); - assertEquals(0d, similarity.apply("a", "").getF1Score()); - assertEquals(0d, similarity.apply("a", "a").getF1Score()); - assertEquals(0d, similarity.apply("a", "b").getF1Score()); - assertEquals(1.0d, similarity.apply("foo", "foo").getF1Score()); - assertEquals(0.8d, similarity.apply("foo", "foo ").getF1Score()); - assertEquals(0.4d, similarity.apply("frog", "fog").getF1Score()); - assertEquals(0.0d, similarity.apply("fly", "ant").getF1Score()); - assertEquals(0.0d, similarity.apply("elephant", "hippo").getF1Score()); - assertEquals(0.0d, similarity.apply("hippo", "elephant").getF1Score()); - assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz").getF1Score()); - assertEquals(0.5d, similarity.apply("hello", "hallo").getF1Score()); - assertEquals(0.7d, similarity.apply("ABC Corporation", "ABC Corp").getF1Score()); - assertEquals(0.7391304347826086d, - similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getF1Score()); - assertEquals(0.8076923076923077d, - similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getF1Score()); - assertEquals(0.6956521739130435, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getF1Score()); - assertEquals(0.9230769230769231, similarity.apply("/opt/software1", "/opt/software2").getF1Score()); - assertEquals(0.5d, similarity.apply("aaabcd", "aaacdb").getF1Score()); - assertEquals(0.631578947368421, similarity.apply("John Horn", "John Hopkins").getF1Score()); + // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|) + // intersection = count of unique matching bigrams (exclude duplicates) + // |A| = count of unique bigrams in A + // |B| = count of unique bigrams in B + assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (2 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (2 + 2), similarity.apply("ababa", "abab").getSorensenDiceCoefficient()); + + assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient()); + assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient()); + assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", "fog").getSorensenDiceCoefficient()); + assertEquals(0.0, 
similarity.apply("fly", "ant").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient()); + assertEquals(2.0 * 7 / (13 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient()); + assertEquals(2.0 * 17 / (20 + 26), + similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient()); + assertEquals(2.0 * 21 / (28 + 24), similarity + .apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getSorensenDiceCoefficient()); + assertEquals(2.0 * 8 / (11 + 12), + similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient()); + assertEquals(2.0 * 12 / (13 + 13), + similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (4 + 4), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient()); + assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient()); + } + + @Test + public void testSorensenDiceCoefficientUsingListBigrams() { + // Compute using pairs of characters (bigrams). 
+ // This can be done using a 32-bit int to store two 16-bit characters + final Function> converter = (cs) -> { + final int length = cs.length(); + final List set = new ArrayList<>(length); + if (length > 1) { + char ch2 = cs.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = cs.charAt(i); + set.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + return set; + }; + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); + + // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|) + // intersection = count of matching bigrams including duplicates + // |A| = max(0, left.length() - 1) + // |B| = max(0, right.length() - 1) + assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 1 / (2 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 1 / (3 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (3 + 2), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (4 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient()); + assertEquals(2.0 * 3 / (4 + 3), similarity.apply("ababa", "abab").getSorensenDiceCoefficient()); + + assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient()); + assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient()); + assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", 
"fog").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("fly", "ant").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient()); + assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient()); + assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient()); + assertEquals(2.0 * 7 / (14 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient()); + assertEquals(2.0 * 17 / (20 + 26), + similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient()); + assertEquals(2.0 * 21 / (31 + 24), similarity + .apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getSorensenDiceCoefficient()); + assertEquals(2.0 * 8 / (11 + 12), + similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient()); + assertEquals(2.0 * 12 / (13 + 13), + similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient()); + assertEquals(2.0 * 3 / (5 + 5), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient()); + assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient()); } @Test From dae816a09e2b27e837ac198c8b444d49824733c6 Mon Sep 17 00:00:00 2001 From: aherbert Date: Fri, 8 Mar 2019 16:00:09 +0000 Subject: [PATCH 08/16] TEXT-155: Add word letter pairs test to IntersectionSimilarityTest --- .../IntersectionSimilarityTest.java | 100 +++++++++++++++--- 1 file changed, 85 insertions(+), 15 deletions(-) diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java index 25f3016920..c76330b7b1 100644 --- a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java +++ 
b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Set; import java.util.function.Function; +import java.util.regex.Pattern; /** * Unit tests for {@link IntersectionSimilarity}. @@ -37,7 +38,7 @@ public class IntersectionSimilarityTest { public void testJaccardIndexUsingSetCharacter() { // Match the functionality of the JaccardSimilarity class by dividing // the sequence into single characters - final Function> converter = (cs) -> { + final Function> converter = cs -> { final int length = cs.length(); final Set set = new HashSet<>(length); for (int i = 0; i < length; i++) { @@ -49,7 +50,7 @@ public void testJaccardIndexUsingSetCharacter() { // Expected Jaccard index = (intersect / union) // intersection = count of unique matching characters (exclude duplicates) - // union = count of unique characters + // union = count of unique characters assertEquals(0.0, similarity.apply("", "").getJaccardIndex()); assertEquals(0.0, similarity.apply("left", "").getJaccardIndex()); assertEquals(0.0, similarity.apply("", "right").getJaccardIndex()); @@ -69,7 +70,7 @@ public void testJaccardIndexUsingSetCharacter() { @Test public void testJaccardIndexUsingListCharacter() { // This test uses a list and so duplicates should be matched - final Function> converter = (cs) -> { + final Function> converter = cs -> { final int length = cs.length(); final List list = new ArrayList<>(length); for (int i = 0; i < length; i++) { @@ -103,7 +104,7 @@ public void testJaccardIndexUsingListCharacter() { public void testSorensenDiceCoefficientUsingSetBigrams() { // Compute using pairs of characters (bigrams). 
// This can be done using a 32-bit int to store two 16-bit characters - final Function> converter = (cs) -> { + final Function> converter = cs -> { final int length = cs.length(); final Set set = new HashSet<>(length); if (length > 1) { @@ -120,8 +121,8 @@ public void testSorensenDiceCoefficientUsingSetBigrams() { // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|) // intersection = count of unique matching bigrams (exclude duplicates) - // |A| = count of unique bigrams in A - // |B| = count of unique bigrams in B + // |A| = count of unique bigrams in A + // |B| = count of unique bigrams in B assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient()); assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient()); assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient()); @@ -160,25 +161,25 @@ public void testSorensenDiceCoefficientUsingSetBigrams() { public void testSorensenDiceCoefficientUsingListBigrams() { // Compute using pairs of characters (bigrams). 
// This can be done using a 32-bit int to store two 16-bit characters - final Function> converter = (cs) -> { + final Function> converter = cs -> { final int length = cs.length(); - final List set = new ArrayList<>(length); + final List list = new ArrayList<>(length); if (length > 1) { char ch2 = cs.charAt(0); for (int i = 1; i < length; i++) { final char ch1 = ch2; ch2 = cs.charAt(i); - set.add(Integer.valueOf((ch1 << 16) | ch2)); + list.add(Integer.valueOf((ch1 << 16) | ch2)); } } - return set; + return list; }; final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|) // intersection = count of matching bigrams including duplicates - // |A| = max(0, left.length() - 1) - // |B| = max(0, right.length() - 1) + // |A| = max(0, left.length() - 1) + // |B| = max(0, right.length() - 1) assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient()); assertEquals(2.0 * 1 / (2 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient()); assertEquals(2.0 * 1 / (3 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient()); @@ -213,6 +214,75 @@ public void testSorensenDiceCoefficientUsingListBigrams() { assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient()); } + @Test + public void testSorensenDiceCoefficientUsingListWordBigrams() { + // Example of a word letter pairs algorithm: + // http://www.catalysoft.com/articles/StrikeAMatch.html + + // Split on whitespace + final Pattern pattern = Pattern.compile("\\s+"); + + // Compute using pairs of characters (bigrams) for each word. 
+ // This can be done using a 32-bit int to store two 16-bit characters + final Function> converter = cs -> { + final List set = new ArrayList<>(); + for (String word : pattern.split(cs)) { + if (word.length() > 1) { + // The strings are converted to upper case + char ch2 = Character.toUpperCase(word.charAt(0)); + for (int i = 1; i < word.length(); i++) { + final char ch1 = ch2; + ch2 = Character.toUpperCase(word.charAt(i)); + set.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + } + return set; + }; + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); + + String bookTitle; + final String search1 = "Web Database Applications"; + final String search2 = "PHP Web Applications"; + final String search3 = "Web Aplications"; + bookTitle = "Web Database Applications with PHP & MySQL"; + assertEquals(82, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); + assertEquals(68, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(59, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + bookTitle = "Creating Database Web Applications with PHP and ASP"; + assertEquals(71, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); + assertEquals(59, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(50, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + bookTitle = "Building Database Applications on the Web Using PHP3"; + assertEquals(70, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); + assertEquals(58, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(49, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + bookTitle = "Building Web Database Applications with Visual Studio 6"; + assertEquals(67, toPercent(similarity.apply(bookTitle, 
search1).getSorensenDiceCoefficient())); + assertEquals(47, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(46, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + bookTitle = "Web Application Development With PHP"; + assertEquals(51, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); + assertEquals(67, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(56, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection"; + assertEquals(49, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); + assertEquals(34, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(32, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing"; + assertEquals(12, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); + assertEquals(7, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(7, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + bookTitle = "How to Find a Scholarship Online"; + assertEquals(10, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); + assertEquals(11, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); + assertEquals(12, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); + } + + private static int toPercent(double value) { + return (int) Math.round(value * 100); + } + @Test public void testConstructorWithNullConverterThrows() { assertThatIllegalArgumentException().isThrownBy(() -> { @@ -223,21 +293,21 @@ public void testConstructorWithNullConverterThrows() { @Test 
public void testApplyNullNull() { assertThatIllegalArgumentException().isThrownBy(() -> { - new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, null); + new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null); }); } @Test public void testApplyStringNull() { assertThatIllegalArgumentException().isThrownBy(() -> { - new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply("left", null); + new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null); }); } @Test public void testApplyNullString() { assertThatIllegalArgumentException().isThrownBy(() -> { - new IntersectionSimilarity<>((cs) -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); + new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); }); } } From ae21c63dcb7f57893d095c802c596b7db268d673 Mon Sep 17 00:00:00 2001 From: aherbert Date: Fri, 8 Mar 2019 16:08:45 +0000 Subject: [PATCH 09/16] Text-155: Javadoc fix in IntersectionResult --- .../apache/commons/text/similarity/IntersectionResult.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java index e63f2b9503..f06d021b71 100644 --- a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java +++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java @@ -21,7 +21,7 @@ /** * Container class to store the intersection results between two sets. * - *

Stores the size of set A, set B and the intersection of A and B (A ∩ B). + *

Stores the size of set A, set B and the intersection of A and B (|A ∩ B|). * The result can be used to produce various similarity metrics, for example the Jaccard index or * Sørensen-Dice coefficient (F1 score).

* @@ -51,7 +51,7 @@ public class IntersectionResult { * * @param sizeA the size of set A ({@code |A|}) * @param sizeB the size of set B ({@code |B|}) - * @param intersection the size of the intersection of A and B (A ∩ B) + * @param intersection the size of the intersection of A and B (|A ∩ B|) * @throws IllegalArgumentException if the sizes are negative or the intersection is greater * than the minimum of the two set sizes */ From 9a7d018c3e85031749166195ebab66c07b7d94c6 Mon Sep 17 00:00:00 2001 From: Alex Herbert Date: Sat, 9 Mar 2019 21:19:47 +0000 Subject: [PATCH 10/16] TEXT-155: Renamed to OverlapSimilarity. Removed computation of metrics from the OverlapResult. --- ...rsectionResult.java => OverlapResult.java} | 57 +--- ...Similarity.java => OverlapSimilarity.java} | 51 ++- .../IntersectionSimilarityTest.java | 313 ------------------ ...ResultTest.java => OverlapResultTest.java} | 97 ++---- .../similarity/OverlapSimilarityTest.java | 288 ++++++++++++++++ 5 files changed, 350 insertions(+), 456 deletions(-) rename src/main/java/org/apache/commons/text/similarity/{IntersectionResult.java => OverlapResult.java} (60%) rename src/main/java/org/apache/commons/text/similarity/{IntersectionSimilarity.java => OverlapSimilarity.java} (77%) delete mode 100644 src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java rename src/test/java/org/apache/commons/text/similarity/{IntersectionResultTest.java => OverlapResultTest.java} (60%) create mode 100644 src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/OverlapResult.java similarity index 60% rename from src/main/java/org/apache/commons/text/similarity/IntersectionResult.java rename to src/main/java/org/apache/commons/text/similarity/OverlapResult.java index f06d021b71..021fab9c5a 100644 --- 
a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java +++ b/src/main/java/org/apache/commons/text/similarity/OverlapResult.java @@ -19,20 +19,20 @@ import java.util.Objects; /** - * Container class to store the intersection results between two sets. + * Container class to store the overlap results between two sets. * - *

Stores the size of set A, set B and the intersection of A and B (|A ∩ B|). - * The result can be used to produce various similarity metrics, for example the Jaccard index or - * Sørensen-Dice coefficient (F1 score).

+ *

Stores the size of set A ({@code |A|}), set B ({@code |B|}) and the + * intersection of A and B (|A ∩ B|). The result can be used + * to produce the union of A and B (|A ∪ B|).

* *

This class is immutable.

* * @since 1.7 - * @see Jaccard index - * @see Sørensen Dice coefficient - * @see F1 score + * @see Intersection + * @see Union */ -public class IntersectionResult { +public class OverlapResult { /** * The size of set A. */ @@ -55,7 +55,7 @@ public class IntersectionResult { * @throws IllegalArgumentException if the sizes are negative or the intersection is greater * than the minimum of the two set sizes */ - public IntersectionResult(final int sizeA, final int sizeB, final int intersection) { + public OverlapResult(final int sizeA, final int sizeB, final int intersection) { if (sizeA < 0) { throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA); } @@ -105,43 +105,6 @@ public long getUnion() { return (long) sizeA + sizeB - intersection; } - /** - * Gets the Jaccard index. The Jaccard is the intersection divided by the union. - * - *
|A ∩ B| / |A ∪ B| 
- * - *

This implementation defines the result as zero if there is no intersection, - * even when the union is zero to avoid a {@link Double#NaN} result.

- * - * @return the Jaccard index - * @see Jaccard index - */ - public double getJaccardIndex() { - return intersection == 0 ? 0.0 : (double) intersection / getUnion(); - } - - /** - * Gets the Sørensen-Dice coefficient. The coefficient is twice the size of the intersection - * divided by the size of both sets. - * - *
-     * 2|A ∩ B| / (|A| + |B|) 
-     * 
- * - *

This is also known as the F1 score. - * - *

This implementation defines the result as zero if there is no intersection, even when the size - * of both sets is zero to avoid a {@link Double#NaN} result.

- * - * @return the Sørensen-Dice coefficient - * @see Sørensen - * Dice coefficient - * @see F1 score - */ - public double getSorensenDiceCoefficient() { - return intersection == 0 ? 0.0 : 2.0 * intersection / ((long) sizeA + sizeB); - } - @Override public boolean equals(final Object o) { if (this == o) { @@ -150,7 +113,7 @@ public boolean equals(final Object o) { if (o == null || getClass() != o.getClass()) { return false; } - final IntersectionResult result = (IntersectionResult) o; + final OverlapResult result = (OverlapResult) o; return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection; } diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java similarity index 77% rename from src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java rename to src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java index 0751553066..df71654a96 100644 --- a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java @@ -19,12 +19,12 @@ import java.util.Collection; import java.util.HashMap; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.function.Function; -import java.util.stream.Stream; /** - * Measures the intersection of two sets created from a pair of character sequences. + * Measures the overlap of two sets created from a pair of character sequences. * *

It is assumed that the type {@code T} correctly conforms to the requirements for storage * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements @@ -35,7 +35,7 @@ * @see Set * @see HashMap */ -public class IntersectionSimilarity implements SimilarityScore { +public class OverlapSimilarity implements SimilarityScore { /** The converter used to create the elements from the characters. */ private final Function> converter; @@ -100,17 +100,17 @@ int getCount(final Object object) { } /** - * Returns a possibly parallel Stream of all the entries in the bag. + * Returns a Set view of the mappings contained in this bag. * - * @return the stream + * @return the Set view */ - Stream> parallelStream() { - return map.entrySet().parallelStream(); + Set> entrySet() { + return map.entrySet(); } } /** - * Create a new set similarity using the provided converter. + * Create a new overlap similarity using the provided converter. * *

If the converter returns a {@link Set} then the intersection result will * not include duplicates. Any other {@link Collection} is used to produce a result @@ -119,7 +119,7 @@ Stream> parallelStream() { * @param converter the converter used to create the elements from the characters. * @throws IllegalArgumentException if the converter is null */ - public IntersectionSimilarity(Function> converter) { + public OverlapSimilarity(Function> converter) { if (converter == null) { throw new IllegalArgumentException("Converter must not be null"); } @@ -135,7 +135,7 @@ public IntersectionSimilarity(Function> converter) { * @throws IllegalArgumentException if either input sequence is {@code null} */ @Override - public IntersectionResult apply(final CharSequence left, final CharSequence right) { + public OverlapResult apply(final CharSequence left, final CharSequence right) { if (left == null || right == null) { throw new IllegalArgumentException("Input cannot be null"); } @@ -149,36 +149,35 @@ public IntersectionResult apply(final CharSequence left, final CharSequence righ // Short-cut if either collection is empty if (Math.min(sizeA, sizeB) == 0) { // No intersection - return new IntersectionResult(sizeA, sizeB, 0); + return new OverlapResult(sizeA, sizeB, 0); } // Intersection = count the number of shared elements - int intersection; + int intersection = 0; if (objectsA instanceof Set) { // If a Set then the elements will only have a count of 1. - // Stream the elements in the set A and check if also in set B. // Note: Even if objectsB is a plain collection this will work // since the contains(Object) method will return true when present. // The fact that objectsA is a Set ensures non duplicate counting. - intersection = objectsA.parallelStream() - .mapToInt(element -> objectsB.contains(element) ? 
1 : 0) - .sum(); + for (T element : objectsA) { + if (objectsB.contains(element)) { + intersection++; + } + } } else { // Create a bag for each collection final TinyBag bagA = toBag(objectsA); final TinyBag bagB = toBag(objectsB); - // Stream the count of each element in bag A and find the intersection with bag B - intersection = bagA.parallelStream() - .mapToInt(entry -> { - // The intersection of this entry in both bags is the min count - final T element = entry.getKey(); - final int count = entry.getValue().count; - return Math.min(count, bagB.getCount(element)); - }) - .sum(); + // Find the intersection of each element in bag A with bag B + for (Entry entry : bagA.entrySet()) { + final T element = entry.getKey(); + final int count = entry.getValue().count; + // The intersection of this entry in both bags is the minimum count + intersection += Math.min(count, bagB.getCount(element)); + } } - return new IntersectionResult(sizeA, sizeB, intersection); + return new OverlapResult(sizeA, sizeB, intersection); } /** diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java deleted file mode 100644 index c76330b7b1..0000000000 --- a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.junit.jupiter.api.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.function.Function; -import java.util.regex.Pattern; - -/** - * Unit tests for {@link IntersectionSimilarity}. - */ -public class IntersectionSimilarityTest { - @Test - public void testJaccardIndexUsingSetCharacter() { - // Match the functionality of the JaccardSimilarity class by dividing - // the sequence into single characters - final Function> converter = cs -> { - final int length = cs.length(); - final Set set = new HashSet<>(length); - for (int i = 0; i < length; i++) { - set.add(cs.charAt(i)); - } - return set; - }; - final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); - - // Expected Jaccard index = (intersect / union) - // intersection = count of unique matching characters (exclude duplicates) - // union = count of unique characters - assertEquals(0.0, similarity.apply("", "").getJaccardIndex()); - assertEquals(0.0, similarity.apply("left", "").getJaccardIndex()); - assertEquals(0.0, similarity.apply("", "right").getJaccardIndex()); - assertEquals(3.0 / 4, similarity.apply("frog", "fog").getJaccardIndex()); - assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex()); - assertEquals(2.0 / 9, 
similarity.apply("elephant", "hippo").getJaccardIndex()); - assertEquals(7.0 / 11, similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex()); - assertEquals(13.0 / 17, similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getJaccardIndex()); - assertEquals(16.0 / 18, - similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex()); - assertEquals(9.0 / 10, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex()); - assertEquals(1.0 / 8, similarity.apply("left", "right").getJaccardIndex()); - assertEquals(1.0 / 8, similarity.apply("leettteft", "ritttght").getJaccardIndex()); - assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex()); - } - - @Test - public void testJaccardIndexUsingListCharacter() { - // This test uses a list and so duplicates should be matched - final Function> converter = cs -> { - final int length = cs.length(); - final List list = new ArrayList<>(length); - for (int i = 0; i < length; i++) { - list.add(cs.charAt(i)); - } - return list; - }; - final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); - - // Expected Jaccard index = (intersect / union) - // intersection = count of matching characters including duplicates - // union = left.length() + right.length() - intersection - assertEquals(0.0, similarity.apply("", "").getJaccardIndex()); - assertEquals(0.0, similarity.apply("left", "").getJaccardIndex()); - assertEquals(0.0, similarity.apply("", "right").getJaccardIndex()); - assertEquals(3.0 / (4 + 3 - 3), similarity.apply("frog", "fog").getJaccardIndex()); - assertEquals(0.0, similarity.apply("fly", "ant").getJaccardIndex()); - assertEquals(2.0 / (8 + 5 - 2), similarity.apply("elephant", "hippo").getJaccardIndex()); - assertEquals(8.0 / (15 + 8 - 8), similarity.apply("ABC Corporation", "ABC Corp").getJaccardIndex()); - assertEquals(20.0 / (21 + 23 - 20), - similarity.apply("D N H Enterprises Inc", "D & H 
Enterprises, Inc.").getJaccardIndex()); - assertEquals(24.0 / (32 + 25 - 24), - similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness").getJaccardIndex()); - assertEquals(11.0 / (12 + 13 - 11), similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getJaccardIndex()); - assertEquals(1.0 / (4 + 5 - 1), similarity.apply("left", "right").getJaccardIndex()); - assertEquals(4.0 / (9 + 8 - 4), similarity.apply("leettteft", "ritttght").getJaccardIndex()); - assertEquals(1.0, similarity.apply("the same string", "the same string").getJaccardIndex()); - } - - @Test - public void testSorensenDiceCoefficientUsingSetBigrams() { - // Compute using pairs of characters (bigrams). - // This can be done using a 32-bit int to store two 16-bit characters - final Function> converter = cs -> { - final int length = cs.length(); - final Set set = new HashSet<>(length); - if (length > 1) { - char ch2 = cs.charAt(0); - for (int i = 1; i < length; i++) { - final char ch1 = ch2; - ch2 = cs.charAt(i); - set.add(Integer.valueOf((ch1 << 16) | ch2)); - } - } - return set; - }; - final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); - - // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|) - // intersection = count of unique matching bigrams (exclude duplicates) - // |A| = count of unique bigrams in A - // |B| = count of unique bigrams in B - assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (2 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (2 + 2), 
similarity.apply("ababa", "abab").getSorensenDiceCoefficient()); - - assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient()); - assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient()); - assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", "fog").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("fly", "ant").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient()); - assertEquals(2.0 * 7 / (13 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient()); - assertEquals(2.0 * 17 / (20 + 26), - similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient()); - assertEquals(2.0 * 21 / (28 + 24), similarity - .apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getSorensenDiceCoefficient()); - assertEquals(2.0 * 8 / (11 + 12), - similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient()); - assertEquals(2.0 * 12 / (13 + 13), - similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (4 + 4), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient()); - assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient()); - } - - @Test - public void testSorensenDiceCoefficientUsingListBigrams() { - // Compute using pairs of characters (bigrams). - // This can be done using a 32-bit int to store two 16-bit characters - final Function> converter = cs -> { - final int length = cs.length(); - final List list = new ArrayList<>(length); - if (length > 1) { - char ch2 = cs.charAt(0); - for (int i = 1; i < length; i++) { - final char ch1 = ch2; - ch2 = cs.charAt(i); - list.add(Integer.valueOf((ch1 << 16) | ch2)); - } - } - return list; - }; - final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); - - // Expected Sorensen-Dice = 2 * intersection / (|A| + |B|) - // intersection = count of matching bigrams including duplicates - // |A| = max(0, left.length() - 1) - // |B| = max(0, right.length() - 1) - assertEquals(2.0 * 1 / (1 + 1), similarity.apply("aa", "aa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 1 / (2 + 1), similarity.apply("aaa", "aa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 1 / (3 + 1), similarity.apply("aaaa", "aa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (3 + 2), similarity.apply("aaaa", "aaa").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (3 + 2), similarity.apply("abaa", "aba").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (4 + 2), similarity.apply("ababa", "aba").getSorensenDiceCoefficient()); - assertEquals(2.0 * 3 / (4 + 3), similarity.apply("ababa", "abab").getSorensenDiceCoefficient()); - - 
assertEquals(0.0, similarity.apply("", "").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("", "a").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("a", "").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("a", "a").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("a", "b").getSorensenDiceCoefficient()); - assertEquals(1.0, similarity.apply("foo", "foo").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (2 + 3), similarity.apply("foo", "foo ").getSorensenDiceCoefficient()); - assertEquals(2.0 * 1 / (3 + 2), similarity.apply("frog", "fog").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("fly", "ant").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("elephant", "hippo").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("hippo", "elephant").getSorensenDiceCoefficient()); - assertEquals(0.0, similarity.apply("hippo", "zzzzzzzz").getSorensenDiceCoefficient()); - assertEquals(2.0 * 2 / (4 + 4), similarity.apply("hello", "hallo").getSorensenDiceCoefficient()); - assertEquals(2.0 * 7 / (14 + 7), similarity.apply("ABC Corporation", "ABC Corp").getSorensenDiceCoefficient()); - assertEquals(2.0 * 17 / (20 + 26), - similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.").getSorensenDiceCoefficient()); - assertEquals(2.0 * 21 / (31 + 24), similarity - .apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness").getSorensenDiceCoefficient()); - assertEquals(2.0 * 8 / (11 + 12), - similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA").getSorensenDiceCoefficient()); - assertEquals(2.0 * 12 / (13 + 13), - similarity.apply("/opt/software1", "/opt/software2").getSorensenDiceCoefficient()); - assertEquals(2.0 * 3 / (5 + 5), similarity.apply("aaabcd", "aaacdb").getSorensenDiceCoefficient()); - assertEquals(2.0 * 6 / (8 + 11), similarity.apply("John Horn", "John Hopkins").getSorensenDiceCoefficient()); - } - - @Test - public void testSorensenDiceCoefficientUsingListWordBigrams() { - // Example of a word letter pairs algorithm: - // http://www.catalysoft.com/articles/StrikeAMatch.html - - // Split on whitespace - final Pattern pattern = Pattern.compile("\\s+"); - - // Compute using pairs of characters (bigrams) for each word. - // This can be done using a 32-bit int to store two 16-bit characters - final Function> converter = cs -> { - final List set = new ArrayList<>(); - for (String word : pattern.split(cs)) { - if (word.length() > 1) { - // The strings are converted to upper case - char ch2 = Character.toUpperCase(word.charAt(0)); - for (int i = 1; i < word.length(); i++) { - final char ch1 = ch2; - ch2 = Character.toUpperCase(word.charAt(i)); - set.add(Integer.valueOf((ch1 << 16) | ch2)); - } - } - } - return set; - }; - final IntersectionSimilarity similarity = new IntersectionSimilarity<>(converter); - - String bookTitle; - final String search1 = "Web Database Applications"; - final String search2 = "PHP Web Applications"; - final String search3 = "Web Aplications"; - bookTitle = "Web Database Applications with PHP & MySQL"; - assertEquals(82, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - assertEquals(68, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(59, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - bookTitle = "Creating Database Web 
Applications with PHP and ASP"; - assertEquals(71, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - assertEquals(59, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(50, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - bookTitle = "Building Database Applications on the Web Using PHP3"; - assertEquals(70, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - assertEquals(58, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(49, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - bookTitle = "Building Web Database Applications with Visual Studio 6"; - assertEquals(67, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - assertEquals(47, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(46, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - bookTitle = "Web Application Development With PHP"; - assertEquals(51, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - assertEquals(67, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(56, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection"; - assertEquals(49, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - assertEquals(34, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(32, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing"; - assertEquals(12, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - 
assertEquals(7, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(7, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - bookTitle = "How to Find a Scholarship Online"; - assertEquals(10, toPercent(similarity.apply(bookTitle, search1).getSorensenDiceCoefficient())); - assertEquals(11, toPercent(similarity.apply(bookTitle, search2).getSorensenDiceCoefficient())); - assertEquals(12, toPercent(similarity.apply(bookTitle, search3).getSorensenDiceCoefficient())); - } - - private static int toPercent(double value) { - return (int) Math.round(value * 100); - } - - @Test - public void testConstructorWithNullConverterThrows() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new IntersectionSimilarity<>(null); - }); - } - - @Test - public void testApplyNullNull() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null); - }); - } - - @Test - public void testApplyStringNull() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null); - }); - } - - @Test - public void testApplyNullString() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); - }); - } -} diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java similarity index 60% rename from src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java rename to src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java index b21275bdf8..8de8aa099b 100644 --- a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java +++ b/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java @@ -23,58 +23,57 
@@ import java.util.concurrent.ThreadLocalRandom; /** - * Unit tests for {@link FuzzyScore}. + * Unit tests for {@link OverlapResult}. */ -public class IntersectionResultTest { - +public class OverlapResultTest { @Test - public void testNewIntersectionResult_WithZeros() { + public void testNewOverlapResult_WithZeros() { final int sizeA = 0; final int sizeB = 0; final int intersection = 0; - new IntersectionResult(sizeA, sizeB, intersection); + new OverlapResult(sizeA, sizeB, intersection); } @Test - public void testNewIntersectionResult_WithNegativeSizeA() { + public void testNewOverlapResult_WithNegativeSizeA() { final int sizeA = -1; final int sizeB = 0; final int intersection = 0; Assertions.assertThrows(IllegalArgumentException.class, () -> { - new IntersectionResult(sizeA, sizeB, intersection); + new OverlapResult(sizeA, sizeB, intersection); }); } @Test - public void testNewIntersectionResult_WithNegativeSizeB() { + public void testNewOverlapResult_WithNegativeSizeB() { final int sizeA = 0; final int sizeB = -1; final int intersection = 0; Assertions.assertThrows(IllegalArgumentException.class, () -> { - new IntersectionResult(sizeA, sizeB, intersection); + new OverlapResult(sizeA, sizeB, intersection); }); } @Test - public void testNewIntersectionResult_WithNegativeIntersection() { + public void testNewOverlapResult_WithNegativeIntersection() { final int sizeA = 0; final int sizeB = 0; final int intersection = -1; Assertions.assertThrows(IllegalArgumentException.class, () -> { - new IntersectionResult(sizeA, sizeB, intersection); + new OverlapResult(sizeA, sizeB, intersection); }); } @Test - public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() { + public void testNewOverlapResult_WithIntersectionAboveSizeAorB() { final int sizeA = 1; final int sizeB = 2; final int intersection = Math.max(sizeA, sizeB) + 1; Assertions.assertThrows(IllegalArgumentException.class, () -> { - new IntersectionResult(sizeA, sizeB, intersection); + new 
OverlapResult(sizeA, sizeB, intersection); }); Assertions.assertThrows(IllegalArgumentException.class, () -> { - new IntersectionResult(sizeB, sizeA, intersection); + new OverlapResult(sizeB, sizeA, intersection); }); } @@ -101,49 +100,7 @@ public void testUnion() { } private static long getUnion(int sizeA, int sizeB, int intersection) { - return new IntersectionResult(sizeA, sizeB, intersection).getUnion(); - } - - @Test - public void testJaccard() { - // Jaccard is the intersection divided by the union - Assertions.assertEquals(0, getJaccard(0, 0, 0)); - - Assertions.assertEquals(0, getJaccard(1, 0, 0)); - Assertions.assertEquals(0, getJaccard(1, 1, 0)); - Assertions.assertEquals(1, getJaccard(1, 1, 1)); - - Assertions.assertEquals(0, getJaccard(2, 0, 0)); - Assertions.assertEquals(0, getJaccard(2, 1, 0)); - Assertions.assertEquals(1.0 / 2, getJaccard(2, 1, 1)); - Assertions.assertEquals(1, getJaccard(2, 2, 2)); - - Assertions.assertEquals(2.0 / 21, getJaccard(20, 3, 2)); - } - - private static double getJaccard(int sizeA, int sizeB, int intersection) { - return new IntersectionResult(sizeA, sizeB, intersection).getJaccardIndex(); - } - - @Test - public void testSorensenDice() { - // Sorensen-Dice is 2 * intersection divided by the size of each set - Assertions.assertEquals(0, getSorensenDice(0, 0, 0)); - - Assertions.assertEquals(0, getSorensenDice(1, 0, 0)); - Assertions.assertEquals(0, getSorensenDice(1, 1, 0)); - Assertions.assertEquals(2.0 * 1 / (1 + 1), getSorensenDice(1, 1, 1)); - - Assertions.assertEquals(0, getSorensenDice(2, 0, 0)); - Assertions.assertEquals(0, getSorensenDice(2, 1, 0)); - Assertions.assertEquals(2.0 * 1 / (2 + 1), getSorensenDice(2, 1, 1)); - Assertions.assertEquals(2.0 * 2 / (2 + 2), getSorensenDice(2, 2, 2)); - - Assertions.assertEquals(2.0 * 2 / (20 + 3), getSorensenDice(20, 3, 2)); - } - - private static double getSorensenDice(int sizeA, int sizeB, int intersection) { - return new IntersectionResult(sizeA, sizeB, 
intersection).getSorensenDiceCoefficient(); + return new OverlapResult(sizeA, sizeB, intersection).getUnion(); } @Test @@ -155,7 +112,7 @@ public void testProperties() { final int sizeA = rand.nextInt(max) + 1; final int sizeB = rand.nextInt(max) + 1; final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); - final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection); Assertions.assertEquals(sizeA, result.getSizeA()); Assertions.assertEquals(sizeB, result.getSizeB()); Assertions.assertEquals(intersection, result.getIntersection()); @@ -164,15 +121,15 @@ public void testProperties() { @Test public void testEquals() { - final IntersectionResult[] results = new IntersectionResult[] { - new IntersectionResult(0, 0, 0), - new IntersectionResult(10, 0, 0), - new IntersectionResult(10, 10, 0), - new IntersectionResult(10, 10, 10), + final OverlapResult[] results = new OverlapResult[] { + new OverlapResult(0, 0, 0), + new OverlapResult(10, 0, 0), + new OverlapResult(10, 10, 0), + new OverlapResult(10, 10, 10), }; // Test difference instance with same values - Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0))); + Assertions.assertTrue(results[0].equals(new OverlapResult(0, 0, 0))); final Object something = new Object(); for (int i = 0; i < results.length; i++) { @@ -186,12 +143,12 @@ public void testEquals() { @Test public void testHashCode() { - final IntersectionResult[] results = new IntersectionResult[] { - new IntersectionResult(10, 0, 0), - new IntersectionResult(10, 10, 0), - new IntersectionResult(10, 10, 10), + final OverlapResult[] results = new OverlapResult[] { + new OverlapResult(10, 0, 0), + new OverlapResult(10, 10, 0), + new OverlapResult(10, 10, 10), }; - final HashMap map = new HashMap<>(); + final HashMap map = new HashMap<>(); final int offset = 123; for (int i = 0; i < results.length; i++) { map.put(results[i], i + 
offset); @@ -210,7 +167,7 @@ public void testToString() { final int sizeA = rand.nextInt(max) + 1; final int sizeB = rand.nextInt(max) + 1; final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); - final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection); + final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection); final String string = result.toString(); // Not perfect as this will match substrings too. The chance of error // is limited by restricting the numbers to a max of 10. diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java new file mode 100644 index 0000000000..0c93ba4324 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Function; +import java.util.regex.Pattern; + +/** + * Unit tests for {@link OverlapSimilarity}. + */ +public class OverlapSimilarityTest { + @Test + public void testOverlapUsingSetCharacter() { + // Compute using single characters. + // This test uses a set and so should not allow duplicates. + final Function> converter = cs -> { + final int length = cs.length(); + final Set set = new HashSet<>(length); + for (int i = 0; i < length; i++) { + set.add(cs.charAt(i)); + } + return set; + }; + final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); + + // Expected: + // size A or B = count of unique characters (exclude duplicates) + // intersection = count of unique matching characters (exclude duplicates) + // union = count of unique characters in total (exclude duplicates) + assertOverlap(similarity, "", "", 0, 0, 0, 0); + assertOverlap(similarity, "a", "", 1, 0, 0, 1); + assertOverlap(similarity, "a", "a", 1, 1, 1, 1); + assertOverlap(similarity, "a", "b", 1, 1, 0, 2); + assertOverlap(similarity, "aa", "ab", 1, 2, 1, 2); + assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2); + assertOverlap(similarity, "aaba", "abaa", 2, 2, 2, 2); + assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1); + assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1); + assertOverlap(similarity, "aabab", "ababa", 2, 2, 2, 2); + assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7); + assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15); + } + + @Test + public void testOverlapUsingListCharacter() { + // Compute using single characters. 
+ // This test uses a list and so duplicates should be matched. + final Function> converter = cs -> { + final int length = cs.length(); + final List list = new ArrayList<>(length); + for (int i = 0; i < length; i++) { + list.add(cs.charAt(i)); + } + return list; + }; + final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); + + // Expected: + // size A or B = sequence length + // intersection = count of matching characters (include duplicates) + // union = count of matching characters (include duplicates) plus unmatched + // = size A + size B - intersection + assertOverlap(similarity, "", "", 0, 0, 0, 0); + assertOverlap(similarity, "a", "", 1, 0, 0, 1); + assertOverlap(similarity, "a", "a", 1, 1, 1, 1); + assertOverlap(similarity, "a", "b", 1, 1, 0, 2); + assertOverlap(similarity, "aa", "ab", 2, 2, 1, 3); + assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2); + assertOverlap(similarity, "aaba", "abaa", 4, 4, 4, 4); + assertOverlap(similarity, "aaaa", "aa", 4, 2, 2, 4); + assertOverlap(similarity, "aaaa", "aaa", 4, 3, 3, 4); + assertOverlap(similarity, "aabab", "ababa", 5, 5, 5, 5); + assertOverlap(similarity, "the same", "the same", 8, 8, 8, 8); + assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15); + } + + @Test + public void testOverlapUsingSetBigrams() { + // Compute using pairs of characters (bigrams). + // This can be done using a 32-bit int to store two 16-bit characters. + // This test uses a set and so should not allow duplicates. 
+ final Function> converter = cs -> { + final int length = cs.length(); + final Set set = new HashSet<>(length); + if (length > 1) { + char ch2 = cs.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = cs.charAt(i); + set.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + return set; + }; + final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); + + // Expected: + // size A or B = count of unique bigrams (exclude duplicates) + // intersection = count of unique matching bigrams (exclude duplicates) + // union = count of unique bigrams in total (exclude duplicates) + assertOverlap(similarity, "", "", 0, 0, 0, 0); + assertOverlap(similarity, "a", "", 0, 0, 0, 0); + assertOverlap(similarity, "a", "a", 0, 0, 0, 0); + assertOverlap(similarity, "a", "b", 0, 0, 0, 0); + assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2); + assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1); + assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3); + assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1); + assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1); + assertOverlap(similarity, "aabab", "ababa", 3, 2, 2, 3); + assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7); + assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16); + } + + @Test + public void testOverlapUsingListBigrams() { + // Compute using pairs of characters (bigrams). + // This can be done using a 32-bit int to store two 16-bit characters. + // This test uses a list and so duplicates should be matched. 
+ final Function> converter = cs -> { + final int length = cs.length(); + final List list = new ArrayList<>(length); + if (length > 1) { + char ch2 = cs.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = cs.charAt(i); + list.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + return list; + }; + final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); + + // Expected: + // size A or B = sequence length - 1 + // intersection = count of matching bigrams (include duplicates) + // union = count of matching bigrams (include duplicates) + // = size A + size B - intersection + assertOverlap(similarity, "", "", 0, 0, 0, 0); + assertOverlap(similarity, "a", "", 0, 0, 0, 0); + assertOverlap(similarity, "a", "a", 0, 0, 0, 0); + assertOverlap(similarity, "a", "b", 0, 0, 0, 0); + assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2); + assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1); + assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3); + assertOverlap(similarity, "aaaa", "aa", 3, 1, 1, 3); + assertOverlap(similarity, "aaaa", "aaa", 3, 2, 2, 3); + assertOverlap(similarity, "aabab", "ababa", 4, 4, 3, 5); + assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7); + assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16); + } + + private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2, + int sizeA, int sizeB, int intersection, int union) { + OverlapResult overlap = similarity.apply(cs1, cs2); + assertEquals(sizeA, overlap.getSizeA(), "Size A error"); + assertEquals(sizeB, overlap.getSizeB(), "Size B error"); + assertEquals(intersection, overlap.getIntersection(), "Intersection error"); + assertEquals(union, overlap.getUnion(), "Union error"); + } + + @Test + public void testF1ScoreUsingListWordBigrams() { + // Example of a word letter pairs algorithm by Simon White: + // http://www.catalysoft.com/articles/StrikeAMatch.html + // This splits into words using whitespace 
and then computes uppercase + // bigrams for each word. + + // Split on whitespace + final Pattern pattern = Pattern.compile("\\s+"); + + // Compute using pairs of characters (bigrams) for each word. + // This can be done using a 32-bit int to store two 16-bit characters + final Function> converter = cs -> { + final List set = new ArrayList<>(); + for (String word : pattern.split(cs)) { + if (word.length() > 1) { + // The strings are converted to upper case + char ch2 = Character.toUpperCase(word.charAt(0)); + for (int i = 1; i < word.length(); i++) { + final char ch1 = ch2; + ch2 = Character.toUpperCase(word.charAt(i)); + set.add(Integer.valueOf((ch1 << 16) | ch2)); + } + } + } + return set; + }; + final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); + + String bookTitle; + final String search1 = "Web Database Applications"; + final String search2 = "PHP Web Applications"; + final String search3 = "Web Aplications"; + bookTitle = "Web Database Applications with PHP & MySQL"; + assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Creating Database Web Applications with PHP and ASP"; + assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Building Database Applications on the Web Using PHP3"; + assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Building Web Database Applications with Visual Studio 6"; + assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1))); + 
assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Web Application Development With PHP"; + assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection"; + assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing"; + assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3))); + bookTitle = "How to Find a Scholarship Online"; + assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1))); + assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2))); + assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3))); + } + + private static int toF1ScorePercent(OverlapResult overlap) { + final double value = 2.0 * overlap.getIntersection() / (overlap.getSizeA() + overlap.getSizeB()); + // Convert to percentage + return (int) Math.round(value * 100); + } + + @Test + public void testConstructorWithNullConverterThrows() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new OverlapSimilarity<>(null); + }); + } + + @Test + public void testApplyNullNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null); + }); + } + + @Test + public void 
testApplyStringNull() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null); + }); + } + + @Test + public void testApplyNullString() { + assertThatIllegalArgumentException().isThrownBy(() -> { + new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); + }); + } +} From 7a4aee810bf8c4d067abdb7a726fc990a9c28714 Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Sun, 10 Mar 2019 13:28:26 +0530 Subject: [PATCH 11/16] using OverlapSimilarity for scoring Sorensendice similarity --- .../similarity/SorensenDiceSimilarity.java | 48 +++++++++++++++++-- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java index dc59d0fa74..0195e9be4c 100644 --- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java @@ -16,9 +16,12 @@ */ package org.apache.commons.text.similarity; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashSet; +import java.util.List; import java.util.Set; -import java.util.stream.Collectors; +import java.util.function.Function; import org.apache.commons.lang3.StringUtils; @@ -49,6 +52,22 @@ */ public class SorensenDiceSimilarity implements SimilarityScore { + /** + * For shifting bigrams to fit in single integer. + */ + public static final int SHIFT_NUMBER = 16; + + /** + * Converter function for conversion of string to bigrams. + */ + final Function> converter = new SorensenDiceConverter(); + + /** + * Measures the overlap of two sets created from a pair of character sequences. 
+ * {@link OverlapSimilarity}} + */ + final OverlapSimilarity similarity = new OverlapSimilarity<>(this.converter); + /** * Calculates Sorensen-Dice Similarity of two character sequences passed as * input. @@ -98,11 +117,10 @@ public Double apply(final CharSequence left, final CharSequence right) { return 0d; } - Set nLeft = createBigrams(left); - Set nRight = createBigrams(right); + OverlapResult overlap = similarity.apply(left, right); - final int total = nLeft.size() + nRight.size(); - final long intersection = nLeft.stream().filter(nRight::contains).collect(Collectors.counting()); + final int total = overlap.getSizeA() + overlap.getSizeB(); + final long intersection = overlap.getIntersection(); return (2.0d * intersection) / total; } @@ -124,4 +142,24 @@ protected Set createBigrams(CharSequence charSequence) { } return set; } + + /** + * Converter class for creating Bigrams for SorensenDice similarity. + */ + class SorensenDiceConverter implements Function> { + @Override + public Collection apply(CharSequence cs) { + final int length = cs.length(); + final List list = new ArrayList<>(length); + if (length > 1) { + char ch2 = cs.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = cs.charAt(i); + list.add(Integer.valueOf((ch1 << SHIFT_NUMBER) | ch2)); + } + } + return list; + } + } } From 211077b2e36f316c5e8c123a036dfd6543a778aa Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Sun, 10 Mar 2019 13:29:37 +0530 Subject: [PATCH 12/16] rounded resulring scores for tests --- .../SorensenDiceSimilarityTest.java | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java index 2e144e7628..3d59823ea0 100644 --- a/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java +++ 
b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java @@ -55,14 +55,14 @@ public void testGetSorensenDicesSimilarity_StringString() { assertEquals(0.0d, similarity.apply("hippo", "elephant")); assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz")); assertEquals(0.5d, similarity.apply("hello", "hallo")); - assertEquals(0.7d, similarity.apply("ABC Corporation", "ABC Corp")); - assertEquals(0.7391304347826086d, similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.")); - assertEquals(0.8076923076923077d, - similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness")); - assertEquals(0.6956521739130435, similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA")); - assertEquals(0.9230769230769231, similarity.apply("/opt/software1", "/opt/software2")); - assertEquals(0.5d, similarity.apply("aaabcd", "aaacdb")); - assertEquals(0.631578947368421, similarity.apply("John Horn", "John Hopkins")); + assertEquals(0.7d, round(similarity.apply("ABC Corporation", "ABC Corp"), 1)); + assertEquals(0.7d, round(similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 1)); + assertEquals(0.8d, + round(similarity.apply("My Gym Children's Fitness Center", "My Gym. 
Childrens Fitness"), 1)); + assertEquals(0.7d, round(similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 1)); + assertEquals(0.9d, round(similarity.apply("/opt/software1", "/opt/software2"), 1)); + assertEquals(0.6d, round(similarity.apply("aaabcd", "aaacdb"), 1)); + assertEquals(0.6d, round(similarity.apply("John Horn", "John Hopkins"), 1)); } @@ -86,4 +86,9 @@ public void testGetSorensenDicesSimilarity_NullString() { similarity.apply(null, "clear"); }); } + + public static double round(double value, int precision) { + int scale = (int) Math.pow(10, precision); + return (double) Math.round(value * scale) / scale; + } } From 29fd2daee55dfd92467e64796c4e00f9a1599106 Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Sun, 10 Mar 2019 13:35:58 +0530 Subject: [PATCH 13/16] fixed spotbug checkstyle errors --- .../commons/text/similarity/SorensenDiceSimilarity.java | 2 +- .../commons/text/similarity/OverlapSimilarityTest.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java index 0195e9be4c..ead01779d1 100644 --- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java @@ -146,7 +146,7 @@ protected Set createBigrams(CharSequence charSequence) { /** * Converter class for creating Bigrams for SorensenDice similarity. 
*/ - class SorensenDiceConverter implements Function> { + static class SorensenDiceConverter implements Function> { @Override public Collection apply(CharSequence cs) { final int length = cs.length(); diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java index 0c93ba4324..ebad367e91 100644 --- a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java +++ b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java @@ -84,7 +84,7 @@ public void testOverlapUsingListCharacter() { // size A or B = sequence length // intersection = count of matching characters (include duplicates) // union = count of matching characters (include duplicates) plus unmatched - // = size A + size B - intersection + // = size A + size B - intersection assertOverlap(similarity, "", "", 0, 0, 0, 0); assertOverlap(similarity, "a", "", 1, 0, 0, 1); assertOverlap(similarity, "a", "a", 1, 1, 1, 1); @@ -161,7 +161,7 @@ public void testOverlapUsingListBigrams() { // size A or B = sequence length - 1 // intersection = count of matching bigrams (include duplicates) // union = count of matching bigrams (include duplicates) - // = size A + size B - intersection + // = size A + size B - intersection assertOverlap(similarity, "", "", 0, 0, 0, 0); assertOverlap(similarity, "a", "", 0, 0, 0, 0); assertOverlap(similarity, "a", "a", 0, 0, 0, 0); @@ -176,7 +176,7 @@ public void testOverlapUsingListBigrams() { assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16); } - private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2, + private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2, int sizeA, int sizeB, int intersection, int union) { OverlapResult overlap = similarity.apply(cs1, cs2); assertEquals(sizeA, overlap.getSizeA(), "Size A error"); @@ -189,7 +189,7 
@@ private static void assertOverlap(OverlapSimilarity similarity, CharSeque public void testF1ScoreUsingListWordBigrams() { // Example of a word letter pairs algorithm by Simon White: // http://www.catalysoft.com/articles/StrikeAMatch.html - // This splits into words using whitespace and then computes uppercase + // This splits into words using whitespace and then computes uppercase // bigrams for each word. // Split on whitespace From bcff9748d24727da8758f06df3beeab28e446264 Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Mon, 18 Mar 2019 21:22:48 +0530 Subject: [PATCH 14/16] no need --- .../text/similarity/OverlapResult.java | 129 -------- .../text/similarity/OverlapSimilarity.java | 196 ------------ .../text/similarity/OverlapResultTest.java | 179 ----------- .../similarity/OverlapSimilarityTest.java | 288 ------------------ 4 files changed, 792 deletions(-) delete mode 100644 src/main/java/org/apache/commons/text/similarity/OverlapResult.java delete mode 100644 src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java delete mode 100644 src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java delete mode 100644 src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java diff --git a/src/main/java/org/apache/commons/text/similarity/OverlapResult.java b/src/main/java/org/apache/commons/text/similarity/OverlapResult.java deleted file mode 100644 index 021fab9c5a..0000000000 --- a/src/main/java/org/apache/commons/text/similarity/OverlapResult.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import java.util.Objects; - -/** - * Container class to store the overlap results between two sets. - * - *

Stores the size of set A ({@code |A|}), set B ({@code |A|}) and the - * intersection of A and B (|A ∩ B|). The result can be used - * to produce the union of A and B (|A ∪ B|).

- * - *

This class is immutable.

- * - * @since 1.7 - * @see Intersection - * @see Union - */ -public class OverlapResult { - /** - * The size of set A. - */ - private final int sizeA; - /** - * The size of set B. - */ - private final int sizeB; - /** - * The size of the intersection between set A and B. - */ - private final int intersection; - - /** - * Create the results for an intersection between two sets. - * - * @param sizeA the size of set A ({@code |A|}) - * @param sizeB the size of set B ({@code |B|}) - * @param intersection the size of the intersection of A and B (|A ∩ B|) - * @throws IllegalArgumentException if the sizes are negative or the intersection is greater - * than the minimum of the two set sizes - */ - public OverlapResult(final int sizeA, final int sizeB, final int intersection) { - if (sizeA < 0) { - throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA); - } - if (sizeB < 0) { - throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB); - } - if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) { - throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection); - } - this.sizeA = sizeA; - this.sizeB = sizeB; - this.intersection = intersection; - } - - /** - * Get the size of set A. - * - * @return |A| - */ - public int getSizeA() { - return sizeA; - } - - /** - * Get the size of set B. - * - * @return |B| - */ - public int getSizeB() { - return sizeB; - } - - /** - * Get the size of the intersection between set A and B. - * - * @return |A ∩ B| - */ - public int getIntersection() { - return intersection; - } - /** - * Get the size of the union between set A and B. 
- * - * @return |A ∪ B| - */ - public long getUnion() { - return (long) sizeA + sizeB - intersection; - } - - @Override - public boolean equals(final Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - final OverlapResult result = (OverlapResult) o; - return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection; - } - - @Override - public int hashCode() { - return Objects.hash(sizeA, sizeB, intersection); - } - - @Override - public String toString() { - return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection; - } -} diff --git a/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java b/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java deleted file mode 100644 index df71654a96..0000000000 --- a/src/main/java/org/apache/commons/text/similarity/OverlapSimilarity.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.commons.text.similarity; - -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.function.Function; - -/** - * Measures the overlap of two sets created from a pair of character sequences. - * - *

It is assumed that the type {@code T} correctly conforms to the requirements for storage - * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements - * {@link Object#equals(Object)} and {@link Object#hashCode()}.

- * - * @param the type of the elements extracted from the character sequence - * @since 1.7 - * @see Set - * @see HashMap - */ -public class OverlapSimilarity implements SimilarityScore { - /** The converter used to create the elements from the characters. */ - private final Function> converter; - - // The following is adapted from commons-collections for a Bag. - // A Bag is a collection that can store the count of the number - // of copies of each element. - - /** - * Mutable counter class for storing the count of elements. - */ - private static class BagCount { - /** The count. This is initialised to 1 upon construction. */ - int count = 1; - } - - /** - * A minimal implementation of a Bag that can store elements and a count. - * - *

For the intended purpose the Bag does not have to be a {@link Collection}. It does not - * even have to know its own size. - */ - private class TinyBag { - /** The backing map. */ - private final Map map; - - /** - * Create a new tiny bag. - * - * @param initialCapacity the initial capacity - */ - TinyBag(int initialCapacity) { - map = new HashMap<>(initialCapacity); - } - - /** - * Adds a new element to the bag, incrementing its count in the underlying map. - * - * @param object the object to add - */ - void add(T object) { - final BagCount mut = map.get(object); - if (mut == null) { - map.put(object, new BagCount()); - } else { - mut.count++; - } - } - - /** - * Returns the number of occurrence of the given element in this bag by - * looking up its count in the underlying map. - * - * @param object the object to search for - * @return the number of occurrences of the object, zero if not found - */ - int getCount(final Object object) { - final BagCount count = map.get(object); - if (count != null) { - return count.count; - } - return 0; - } - - /** - * Returns a Set view of the mappings contained in this bag. - * - * @return the Set view - */ - Set> entrySet() { - return map.entrySet(); - } - } - - /** - * Create a new overlap similarity using the provided converter. - * - *

If the converter returns a {@link Set} then the intersection result will - * not include duplicates. Any other {@link Collection} is used to produce a result - * that will include duplicates in the intersect and union. - * - * @param converter the converter used to create the elements from the characters. - * @throws IllegalArgumentException if the converter is null - */ - public OverlapSimilarity(Function> converter) { - if (converter == null) { - throw new IllegalArgumentException("Converter must not be null"); - } - this.converter = converter; - } - - /** - * Calculates the intersection of two character sequences passed as input. - * - * @param left first character sequence - * @param right second character sequence - * @return the intersection result - * @throws IllegalArgumentException if either input sequence is {@code null} - */ - @Override - public OverlapResult apply(final CharSequence left, final CharSequence right) { - if (left == null || right == null) { - throw new IllegalArgumentException("Input cannot be null"); - } - - // Create the elements from the sequences - final Collection objectsA = converter.apply(left); - final Collection objectsB = converter.apply(right); - final int sizeA = objectsA.size(); - final int sizeB = objectsB.size(); - - // Short-cut if either collection is empty - if (Math.min(sizeA, sizeB) == 0) { - // No intersection - return new OverlapResult(sizeA, sizeB, 0); - } - - // Intersection = count the number of shared elements - int intersection = 0; - if (objectsA instanceof Set) { - // If a Set then the elements will only have a count of 1. - // Note: Even if objectsB is a plain collection this will work - // since the contains(Object) method will return true when present. - // The fact that objectsA is a Set ensures non duplicate counting. 
- for (T element : objectsA) { - if (objectsB.contains(element)) { - intersection++; - } - } - } else { - // Create a bag for each collection - final TinyBag bagA = toBag(objectsA); - final TinyBag bagB = toBag(objectsB); - // Find the intersection of each element in bag A with bag B - for (Entry entry : bagA.entrySet()) { - final T element = entry.getKey(); - final int count = entry.getValue().count; - // The intersection of this entry in both bags is the minimum count - intersection += Math.min(count, bagB.getCount(element)); - } - } - - return new OverlapResult(sizeA, sizeB, intersection); - } - - /** - * Convert the collection to a bag. The bag will contain the count of each element in the collection. - * - * @param objects the objects - * @return the bag - */ - private TinyBag toBag(Collection objects) { - final TinyBag bag = new TinyBag(objects.size()); - for (T t : objects) { - bag.add(t); - } - return bag; - } -} diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java deleted file mode 100644 index 8de8aa099b..0000000000 --- a/src/test/java/org/apache/commons/text/similarity/OverlapResultTest.java +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import java.util.HashMap; -import java.util.concurrent.ThreadLocalRandom; - -/** - * Unit tests for {@link OverlapResult}. - */ -public class OverlapResultTest { - @Test - public void testNewOverlapResult_WithZeros() { - final int sizeA = 0; - final int sizeB = 0; - final int intersection = 0; - new OverlapResult(sizeA, sizeB, intersection); - } - - @Test - public void testNewOverlapResult_WithNegativeSizeA() { - final int sizeA = -1; - final int sizeB = 0; - final int intersection = 0; - Assertions.assertThrows(IllegalArgumentException.class, () -> { - new OverlapResult(sizeA, sizeB, intersection); - }); - } - - @Test - public void testNewOverlapResult_WithNegativeSizeB() { - final int sizeA = 0; - final int sizeB = -1; - final int intersection = 0; - Assertions.assertThrows(IllegalArgumentException.class, () -> { - new OverlapResult(sizeA, sizeB, intersection); - }); - } - - @Test - public void testNewOverlapResult_WithNegativeIntersection() { - final int sizeA = 0; - final int sizeB = 0; - final int intersection = -1; - Assertions.assertThrows(IllegalArgumentException.class, () -> { - new OverlapResult(sizeA, sizeB, intersection); - }); - } - - @Test - public void testNewOverlapResult_WithIntersectionAboveSizeAorB() { - final int sizeA = 1; - final int sizeB = 2; - final int intersection = Math.max(sizeA, sizeB) + 1; - Assertions.assertThrows(IllegalArgumentException.class, () -> { - new OverlapResult(sizeA, sizeB, intersection); - }); - Assertions.assertThrows(IllegalArgumentException.class, () -> { - new OverlapResult(sizeB, sizeA, intersection); - }); - } - - @Test - public void testUnion() { - // Union is the combined size minus the intersection - Assertions.assertEquals(0, getUnion(0, 0, 0)); - - 
Assertions.assertEquals(1, getUnion(1, 0, 0)); - Assertions.assertEquals(2, getUnion(1, 1, 0)); - Assertions.assertEquals(1, getUnion(1, 1, 1)); - - Assertions.assertEquals(2, getUnion(2, 0, 0)); - Assertions.assertEquals(3, getUnion(2, 1, 0)); - Assertions.assertEquals(2, getUnion(2, 1, 1)); - Assertions.assertEquals(2, getUnion(2, 2, 2)); - - // Test overflow of int addition - Assertions.assertEquals((long) Integer.MAX_VALUE + 1, getUnion(Integer.MAX_VALUE, 1, 0)); - Assertions.assertEquals((long) Integer.MAX_VALUE + 1, - getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, Integer.MAX_VALUE - 1)); - Assertions.assertEquals((long) Integer.MAX_VALUE + Integer.MAX_VALUE, - getUnion(Integer.MAX_VALUE, Integer.MAX_VALUE, 0)); - } - - private static long getUnion(int sizeA, int sizeB, int intersection) { - return new OverlapResult(sizeA, sizeB, intersection).getUnion(); - } - - @Test - public void testProperties() { - final ThreadLocalRandom rand = ThreadLocalRandom.current(); - final int max = 1024; - for (int i = 0; i < 5; i++) { - // Ensure the min is above 0 - final int sizeA = rand.nextInt(max) + 1; - final int sizeB = rand.nextInt(max) + 1; - final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); - final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection); - Assertions.assertEquals(sizeA, result.getSizeA()); - Assertions.assertEquals(sizeB, result.getSizeB()); - Assertions.assertEquals(intersection, result.getIntersection()); - } - } - - @Test - public void testEquals() { - final OverlapResult[] results = new OverlapResult[] { - new OverlapResult(0, 0, 0), - new OverlapResult(10, 0, 0), - new OverlapResult(10, 10, 0), - new OverlapResult(10, 10, 10), - }; - - // Test difference instance with same values - Assertions.assertTrue(results[0].equals(new OverlapResult(0, 0, 0))); - - final Object something = new Object(); - for (int i = 0; i < results.length; i++) { - Assertions.assertFalse(results[i].equals(something)); - 
Assertions.assertFalse(results[i].equals(null)); - for (int j = 0; j < results.length; j++) { - Assertions.assertTrue(results[i].equals(results[j]) == (i == j)); - } - } - } - - @Test - public void testHashCode() { - final OverlapResult[] results = new OverlapResult[] { - new OverlapResult(10, 0, 0), - new OverlapResult(10, 10, 0), - new OverlapResult(10, 10, 10), - }; - final HashMap map = new HashMap<>(); - final int offset = 123; - for (int i = 0; i < results.length; i++) { - map.put(results[i], i + offset); - } - for (int i = 0; i < results.length; i++) { - Assertions.assertEquals(i + offset, map.get(results[i])); - } - } - - @Test - public void testToString() { - final ThreadLocalRandom rand = ThreadLocalRandom.current(); - final int max = 9; - for (int i = 0; i < 5; i++) { - // Ensure the min is above 0 - final int sizeA = rand.nextInt(max) + 1; - final int sizeB = rand.nextInt(max) + 1; - final int intersection = rand.nextInt(Math.min(sizeA, sizeB)); - final OverlapResult result = new OverlapResult(sizeA, sizeB, intersection); - final String string = result.toString(); - // Not perfect as this will match substrings too. The chance of error - // is limited by restricting the numbers to a max of 10. - Assertions.assertTrue(string.contains(String.valueOf(sizeA))); - Assertions.assertTrue(string.contains(String.valueOf(sizeB))); - Assertions.assertTrue(string.contains(String.valueOf(intersection))); - } - } -} diff --git a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java deleted file mode 100644 index ebad367e91..0000000000 --- a/src/test/java/org/apache/commons/text/similarity/OverlapSimilarityTest.java +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.text.similarity; - -import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; -import static org.junit.jupiter.api.Assertions.assertEquals; - -import org.junit.jupiter.api.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.function.Function; -import java.util.regex.Pattern; - -/** - * Unit tests for {@link OverlapSimilarity}. - */ -public class OverlapSimilarityTest { - @Test - public void testOverlapUsingSetCharacter() { - // Compute using single characters. - // This test uses a set and so should not allow duplicates. 
- final Function> converter = cs -> { - final int length = cs.length(); - final Set set = new HashSet<>(length); - for (int i = 0; i < length; i++) { - set.add(cs.charAt(i)); - } - return set; - }; - final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); - - // Expected: - // size A or B = count of unique characters (exclude duplicates) - // intersection = count of unique matching characters (exclude duplicates) - // union = count of unique characters in total (exclude duplicates) - assertOverlap(similarity, "", "", 0, 0, 0, 0); - assertOverlap(similarity, "a", "", 1, 0, 0, 1); - assertOverlap(similarity, "a", "a", 1, 1, 1, 1); - assertOverlap(similarity, "a", "b", 1, 1, 0, 2); - assertOverlap(similarity, "aa", "ab", 1, 2, 1, 2); - assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2); - assertOverlap(similarity, "aaba", "abaa", 2, 2, 2, 2); - assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1); - assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1); - assertOverlap(similarity, "aabab", "ababa", 2, 2, 2, 2); - assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7); - assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15); - } - - @Test - public void testOverlapUsingListCharacter() { - // Compute using single characters. - // This test uses a list and so duplicates should be matched. 
- final Function> converter = cs -> { - final int length = cs.length(); - final List list = new ArrayList<>(length); - for (int i = 0; i < length; i++) { - list.add(cs.charAt(i)); - } - return list; - }; - final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); - - // Expected: - // size A or B = sequence length - // intersection = count of matching characters (include duplicates) - // union = count of matching characters (include duplicates) plus unmatched - // = size A + size B - intersection - assertOverlap(similarity, "", "", 0, 0, 0, 0); - assertOverlap(similarity, "a", "", 1, 0, 0, 1); - assertOverlap(similarity, "a", "a", 1, 1, 1, 1); - assertOverlap(similarity, "a", "b", 1, 1, 0, 2); - assertOverlap(similarity, "aa", "ab", 2, 2, 1, 3); - assertOverlap(similarity, "ab", "ab", 2, 2, 2, 2); - assertOverlap(similarity, "aaba", "abaa", 4, 4, 4, 4); - assertOverlap(similarity, "aaaa", "aa", 4, 2, 2, 4); - assertOverlap(similarity, "aaaa", "aaa", 4, 3, 3, 4); - assertOverlap(similarity, "aabab", "ababa", 5, 5, 5, 5); - assertOverlap(similarity, "the same", "the same", 8, 8, 8, 8); - assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11, 15); - } - - @Test - public void testOverlapUsingSetBigrams() { - // Compute using pairs of characters (bigrams). - // This can be done using a 32-bit int to store two 16-bit characters. - // This test uses a set and so should not allow duplicates. 
- final Function> converter = cs -> { - final int length = cs.length(); - final Set set = new HashSet<>(length); - if (length > 1) { - char ch2 = cs.charAt(0); - for (int i = 1; i < length; i++) { - final char ch1 = ch2; - ch2 = cs.charAt(i); - set.add(Integer.valueOf((ch1 << 16) | ch2)); - } - } - return set; - }; - final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); - - // Expected: - // size A or B = count of unique bigrams (exclude duplicates) - // intersection = count of unique matching bigrams (exclude duplicates) - // union = count of unique bigrams in total (exclude duplicates) - assertOverlap(similarity, "", "", 0, 0, 0, 0); - assertOverlap(similarity, "a", "", 0, 0, 0, 0); - assertOverlap(similarity, "a", "a", 0, 0, 0, 0); - assertOverlap(similarity, "a", "b", 0, 0, 0, 0); - assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2); - assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1); - assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3); - assertOverlap(similarity, "aaaa", "aa", 1, 1, 1, 1); - assertOverlap(similarity, "aaaa", "aaa", 1, 1, 1, 1); - assertOverlap(similarity, "aabab", "ababa", 3, 2, 2, 3); - assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7); - assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16); - } - - @Test - public void testOverlapUsingListBigrams() { - // Compute using pairs of characters (bigrams). - // This can be done using a 32-bit int to store two 16-bit characters. - // This test uses a list and so duplicates should be matched. 
- final Function> converter = cs -> { - final int length = cs.length(); - final List list = new ArrayList<>(length); - if (length > 1) { - char ch2 = cs.charAt(0); - for (int i = 1; i < length; i++) { - final char ch1 = ch2; - ch2 = cs.charAt(i); - list.add(Integer.valueOf((ch1 << 16) | ch2)); - } - } - return list; - }; - final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); - - // Expected: - // size A or B = sequence length - 1 - // intersection = count of matching bigrams (include duplicates) - // union = count of matching bigrams (include duplicates) - // = size A + size B - intersection - assertOverlap(similarity, "", "", 0, 0, 0, 0); - assertOverlap(similarity, "a", "", 0, 0, 0, 0); - assertOverlap(similarity, "a", "a", 0, 0, 0, 0); - assertOverlap(similarity, "a", "b", 0, 0, 0, 0); - assertOverlap(similarity, "aa", "ab", 1, 1, 0, 2); - assertOverlap(similarity, "ab", "ab", 1, 1, 1, 1); - assertOverlap(similarity, "aaba", "abaa", 3, 3, 3, 3); - assertOverlap(similarity, "aaaa", "aa", 3, 1, 1, 3); - assertOverlap(similarity, "aaaa", "aaa", 3, 2, 2, 3); - assertOverlap(similarity, "aabab", "ababa", 4, 4, 3, 5); - assertOverlap(similarity, "the same", "the same", 7, 7, 7, 7); - assertOverlap(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8, 16); - } - - private static void assertOverlap(OverlapSimilarity similarity, CharSequence cs1, CharSequence cs2, - int sizeA, int sizeB, int intersection, int union) { - OverlapResult overlap = similarity.apply(cs1, cs2); - assertEquals(sizeA, overlap.getSizeA(), "Size A error"); - assertEquals(sizeB, overlap.getSizeB(), "Size B error"); - assertEquals(intersection, overlap.getIntersection(), "Intersection error"); - assertEquals(union, overlap.getUnion(), "Union error"); - } - - @Test - public void testF1ScoreUsingListWordBigrams() { - // Example of a word letter pairs algorithm by Simon White: - // http://www.catalysoft.com/articles/StrikeAMatch.html - // This splits into words using whitespace 
and then computes uppercase - // bigrams for each word. - - // Split on whitespace - final Pattern pattern = Pattern.compile("\\s+"); - - // Compute using pairs of characters (bigrams) for each word. - // This can be done using a 32-bit int to store two 16-bit characters - final Function> converter = cs -> { - final List set = new ArrayList<>(); - for (String word : pattern.split(cs)) { - if (word.length() > 1) { - // The strings are converted to upper case - char ch2 = Character.toUpperCase(word.charAt(0)); - for (int i = 1; i < word.length(); i++) { - final char ch1 = ch2; - ch2 = Character.toUpperCase(word.charAt(i)); - set.add(Integer.valueOf((ch1 << 16) | ch2)); - } - } - } - return set; - }; - final OverlapSimilarity similarity = new OverlapSimilarity<>(converter); - - String bookTitle; - final String search1 = "Web Database Applications"; - final String search2 = "PHP Web Applications"; - final String search3 = "Web Aplications"; - bookTitle = "Web Database Applications with PHP & MySQL"; - assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1))); - assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3))); - bookTitle = "Creating Database Web Applications with PHP and ASP"; - assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1))); - assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3))); - bookTitle = "Building Database Applications on the Web Using PHP3"; - assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1))); - assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3))); - bookTitle = "Building Web Database Applications with Visual Studio 6"; - assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1))); - 
assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3))); - bookTitle = "Web Application Development With PHP"; - assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1))); - assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3))); - bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection"; - assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1))); - assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3))); - bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing"; - assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1))); - assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3))); - bookTitle = "How to Find a Scholarship Online"; - assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1))); - assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2))); - assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3))); - } - - private static int toF1ScorePercent(OverlapResult overlap) { - final double value = 2.0 * overlap.getIntersection() / (overlap.getSizeA() + overlap.getSizeB()); - // Convert to percentage - return (int) Math.round(value * 100); - } - - @Test - public void testConstructorWithNullConverterThrows() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new OverlapSimilarity<>(null); - }); - } - - @Test - public void testApplyNullNull() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null); - }); - } - - @Test - public void 
testApplyStringNull() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null); - }); - } - - @Test - public void testApplyNullString() { - assertThatIllegalArgumentException().isThrownBy(() -> { - new OverlapSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right"); - }); - } -} From 2de106eafaacbcd94afb18c084ce18888c3a1818 Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Mon, 18 Mar 2019 21:29:27 +0530 Subject: [PATCH 15/16] using new IntersectionSimilarity --- .../commons/text/similarity/SorensenDiceSimilarity.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java index ead01779d1..d46372d225 100644 --- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java @@ -66,7 +66,7 @@ public class SorensenDiceSimilarity implements SimilarityScore { * Measures the overlap of two sets created from a pair of character sequences. 
* {@link OverlapSimilarity}} */ - final OverlapSimilarity similarity = new OverlapSimilarity<>(this.converter); + final IntersectionSimilarity similarity = new IntersectionSimilarity<>(this.converter); /** * Calculates Sorensen-Dice Similarity of two character sequences passed as @@ -117,7 +117,7 @@ public Double apply(final CharSequence left, final CharSequence right) { return 0d; } - OverlapResult overlap = similarity.apply(left, right); + IntersectionResult overlap = similarity.apply(left, right); final int total = overlap.getSizeA() + overlap.getSizeB(); final long intersection = overlap.getIntersection(); From 8f0a97cfa6f61a1aaa0f55ffdf4ff2ca48ec9a04 Mon Sep 17 00:00:00 2001 From: Amey Jadiye Date: Sun, 24 Mar 2019 18:00:33 +0530 Subject: [PATCH 16/16] corrected javadoc, removed unused code and made instance var private. --- .../similarity/SorensenDiceSimilarity.java | 35 +++++-------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java index d46372d225..3121bdc2f1 100644 --- a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java @@ -18,9 +18,7 @@ import java.util.ArrayList; import java.util.Collection; -import java.util.HashSet; import java.util.List; -import java.util.Set; import java.util.function.Function; import org.apache.commons.lang3.StringUtils; @@ -30,17 +28,17 @@ * between two character sequences. * *

- * The Sørensen–Dice coefficient is a statistic used for comparing the + * The Sørensen-Dice coefficient is a statistic used for comparing the * similarity of two samples. It was independently developed by the botanists * Thorvald Sørensen and Lee Raymond Dice, who published in 1948 and 1945 * respectively. The index is known by several other names, especially - * Sørensen–Dice index, Sørensen index and Dice's coefficient. Other + * Sørensen-Dice index, Sørensen index and Dice's coefficient. Other * variations include the "similarity coefficient" or "index", such as Dice * similarity coefficient (DSC). *

* *

- * This implementation is based on the Sørensen–Dice similarity algorithm + * This implementation is based on the Sørensen-Dice similarity algorithm * from * http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. @@ -55,18 +53,18 @@ public class SorensenDiceSimilarity implements SimilarityScore { /** * For shifting bigrams to fit in single integer. */ - public static final int SHIFT_NUMBER = 16; + private static final int SHIFT_NUMBER = 16; /** * Converter function for conversion of string to bigrams. */ - final Function> converter = new SorensenDiceConverter(); + private final Function> converter = new SorensenDiceConverter(); /** * Measures the overlap of two sets created from a pair of character sequences. * {@link OverlapSimilarity}} */ - final IntersectionSimilarity similarity = new IntersectionSimilarity<>(this.converter); + private final IntersectionSimilarity similarity = new IntersectionSimilarity<>(this.converter); /** * Calculates Sorensen-Dice Similarity of two character sequences passed as @@ -113,6 +111,7 @@ public Double apply(final CharSequence left, final CharSequence right) { return 1d; } + // if bigram is not formed out of any given string, clearly both are not similar. if (left.length() < 2 || right.length() < 2) { return 0d; } @@ -125,28 +124,10 @@ public Double apply(final CharSequence left, final CharSequence right) { return (2.0d * intersection) / total; } - /** - * Method for creating bigrams - two consecutive characters. Returns a set of - * bigrams. - * - * @param charSequence The char sequence for which we need set of bigrams. - * @return set of bigrams. - */ - protected Set createBigrams(CharSequence charSequence) { - Set set = new HashSet(); - for (int i = 0; i < charSequence.length() - 1; i++) { - char chr = charSequence.charAt(i); - char nextChr = charSequence.charAt(i + 1); - String bi = "" + chr + nextChr; - set.add(bi); - } - return set; - } - /** * Converter class for creating Bigrams for SorensenDice similarity. 
*/ - static class SorensenDiceConverter implements Function> { + private static class SorensenDiceConverter implements Function> { @Override public Collection apply(CharSequence cs) { final int length = cs.length();