diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java new file mode 100644 index 0000000000..3121bdc2f1 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.function.Function; + +import org.apache.commons.lang3.StringUtils; + +/** + * A similarity algorithm indicating the percentage of matched characters + * between two character sequences. + * + *

+ * The Sørensen-Dice coefficient is a statistic used for comparing the + * similarity of two samples. It was independently developed by the botanists + * Thorvald Sørensen and Lee Raymond Dice, who published in 1948 and 1945 + * respectively. The index is known by several other names, especially + * Sørensen-Dice index, Sørensen index and Dice's coefficient. Other + * variations include the "similarity coefficient" or "index", such as Dice + * similarity coefficient (DSC). + *

+ * + *

+ * This implementation is based on the Sørensen-Dice similarity algorithm + * from + * http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. + * + * + *

+ * + * @since 1.7 + */ +public class SorensenDiceSimilarity implements SimilarityScore { + + /** + * For shifting bigrams to fit in single integer. + */ + private static final int SHIFT_NUMBER = 16; + + /** + * Converter function for conversion of string to bigrams. + */ + private final Function> converter = new SorensenDiceConverter(); + + /** + * Measures the overlap of two sets created from a pair of character sequences. + * {@link OverlapSimilarity}} + */ + private final IntersectionSimilarity similarity = new IntersectionSimilarity<>(this.converter); + + /** + * Calculates Sorensen-Dice Similarity of two character sequences passed as + * input. + * + *
+     * similarity.apply(null, null)                 = IllegalArgumentException
+     * similarity.apply("foo", null)                = IllegalArgumentException
+     * similarity.apply(null, "foo")                = IllegalArgumentException
+     * similarity.apply("night", "nacht")           = 0.25
+     * similarity.apply("", "")                     = 1.0
+     * similarity.apply("foo", "foo")               = 1.0
+     * similarity.apply("foo", "foo ")              = 0.8
+     * similarity.apply("foo", "foo ")              = 0.66
+     * similarity.apply("foo", " foo ")             = 0.66
+     * similarity.apply("foo", " foo")              = 0.66
+     * similarity.apply("", "a")                    = 0.0
+     * similarity.apply("aaapppp", "")              = 0.0
+     * similarity.apply("frog", "fog")              = 0.4
+     * similarity.apply("fly", "ant")               = 0.0
+     * similarity.apply("elephant", "hippo")        = 0.0
+     * similarity.apply("hippo", "elephant")        = 0.0
+     * similarity.apply("hippo", "zzzzzzzz")        = 0.0
+     * similarity.apply("hello", "hallo")           = 0.5
+     * similarity.apply("ABC Corporation", "ABC Corp") = 0.7
+     * similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.74
+     * similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.81
+     * similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.69
+     * 
+ * + * @param left the first CharSequence, must not be null + * @param right the second CharSequence, must not be null + * @return result similarity + * @throws IllegalArgumentException if either CharSequence input is {@code null} + */ + @Override + public Double apply(final CharSequence left, final CharSequence right) { + + if (left == null || right == null) { + throw new IllegalArgumentException("CharSequences must not be null"); + } + + if (StringUtils.equals(left, right)) { + return 1d; + } + + // if bigram is not formed out of any given string, clearly both are not similar. + if (left.length() < 2 || right.length() < 2) { + return 0d; + } + + IntersectionResult overlap = similarity.apply(left, right); + + final int total = overlap.getSizeA() + overlap.getSizeB(); + final long intersection = overlap.getIntersection(); + + return (2.0d * intersection) / total; + } + + /** + * Converter class for creating Bigrams for SorensenDice similarity. + */ + private static class SorensenDiceConverter implements Function> { + @Override + public Collection apply(CharSequence cs) { + final int length = cs.length(); + final List list = new ArrayList<>(length); + if (length > 1) { + char ch2 = cs.charAt(0); + for (int i = 1; i < length; i++) { + final char ch1 = ch2; + ch2 = cs.charAt(i); + list.add(Integer.valueOf((ch1 << SHIFT_NUMBER) | ch2)); + } + } + return list; + } + } +} diff --git a/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java new file mode 100644 index 0000000000..3d59823ea0 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/SorensenDiceSimilarityTest.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for {@link SorensenDiceSimilarity}.
+ */
+public class SorensenDiceSimilarityTest {
+
+    /** Instance under test, created once for all test methods. */
+    private static SorensenDiceSimilarity similarity;
+
+    /** Creates the shared instance under test. */
+    @BeforeAll
+    public static void setUp() {
+        similarity = new SorensenDiceSimilarity();
+    }
+
+    /** Checks the score of a single pair sharing one of eight bigrams. */
+    @Test
+    public void test() {
+        assertEquals(0.25d, similarity.apply("night", "nacht"));
+    }
+
+    /** Checks exact scores for short inputs and scores rounded to one decimal for longer ones. */
+    @Test
+    public void testGetSorensenDicesSimilarity_StringString() {
+
+        // Exact scores.
+        assertEquals(1d, similarity.apply("", ""));
+        assertEquals(0d, similarity.apply("", "a"));
+        assertEquals(0d, similarity.apply("a", ""));
+        assertEquals(1d, similarity.apply("a", "a"));
+        assertEquals(0d, similarity.apply("a", "b"));
+        assertEquals(1.0d, similarity.apply("foo", "foo"));
+        assertEquals(0.8d, similarity.apply("foo", "foo "));
+        assertEquals(0.4d, similarity.apply("frog", "fog"));
+        assertEquals(0.0d, similarity.apply("fly", "ant"));
+        assertEquals(0.0d, similarity.apply("elephant", "hippo"));
+        assertEquals(0.0d, similarity.apply("hippo", "elephant"));
+        assertEquals(0.0d, similarity.apply("hippo", "zzzzzzzz"));
+        assertEquals(0.5d, similarity.apply("hello", "hallo"));
+
+        // Scores compared after rounding to one decimal place.
+        assertEquals(0.7d, round(similarity.apply("ABC Corporation", "ABC Corp"), 1));
+        assertEquals(0.7d, round(similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 1));
+        assertEquals(0.8d,
+                round(similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 1));
+        assertEquals(0.7d, round(similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 1));
+        assertEquals(0.9d, round(similarity.apply("/opt/software1", "/opt/software2"), 1));
+        assertEquals(0.6d, round(similarity.apply("aaabcd", "aaacdb"), 1));
+        assertEquals(0.6d, round(similarity.apply("John Horn", "John Hopkins"), 1));
+
+    }
+
+    /** Both arguments {@code null} must be rejected. */
+    @Test
+    public void testGetSorensenDicesSimilarity_NullNull() {
+        assertThatIllegalArgumentException().isThrownBy(() -> similarity.apply(null, null));
+    }
+
+    /** A {@code null} second argument must be rejected. */
+    @Test
+    public void testGetSorensenDicesSimilarity_StringNull() {
+        assertThatIllegalArgumentException().isThrownBy(() -> similarity.apply(" ", null));
+    }
+
+    /** A {@code null} first argument must be rejected. */
+    @Test
+    public void testGetSorensenDicesSimilarity_NullString() {
+        assertThatIllegalArgumentException().isThrownBy(() -> similarity.apply(null, "clear"));
+    }
+
+    /**
+     * Rounds a value to the given number of decimal places.
+     *
+     * @param value the value to round
+     * @param precision the number of decimal places to keep
+     * @return the rounded value
+     */
+    public static double round(double value, int precision) {
+        final int factor = (int) Math.pow(10, precision);
+        final long scaled = Math.round(value * factor);
+        return (double) scaled / factor;
+    }
+}