diff --git a/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java new file mode 100644 index 0000000000..3121bdc2f1 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/SorensenDiceSimilarity.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.function.Function; + +import org.apache.commons.lang3.StringUtils; + +/** + * A similarity algorithm indicating the percentage of matched characters + * between two character sequences. + * + *
+ * The Sørensen-Dice coefficient is a statistic used for comparing the + * similarity of two samples. It was independently developed by the botanists + * Thorvald Sørensen and Lee Raymond Dice, who published in 1948 and 1945 + * respectively. The index is known by several other names, especially + * Sørensen-Dice index, Sørensen index and Dice's coefficient. Other + * variations include the "similarity coefficient" or "index", such as Dice + * similarity coefficient (DSC). + *
+ * + *+ * This implementation is based on the Sørensen-Dice similarity algorithm + * from + * http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. + * + * + *
+ * + * @since 1.7 + */ +public class SorensenDiceSimilarity implements SimilarityScore
+ * similarity.apply(null, null) = IllegalArgumentException
+ * similarity.apply("foo", null) = IllegalArgumentException
+ * similarity.apply(null, "foo") = IllegalArgumentException
+ * similarity.apply("night", "nacht") = 0.25
+ * similarity.apply("", "") = 1.0
+ * similarity.apply("foo", "foo") = 1.0
+ * similarity.apply("foo", "foo ") = 0.8
+ * similarity.apply("foo", "foo  ") = 0.66
+ * similarity.apply("foo", " foo ") = 0.66
+ * similarity.apply("foo", "  foo") = 0.66
+ * similarity.apply("", "a") = 0.0
+ * similarity.apply("aaapppp", "") = 0.0
+ * similarity.apply("frog", "fog") = 0.4
+ * similarity.apply("fly", "ant") = 0.0
+ * similarity.apply("elephant", "hippo") = 0.0
+ * similarity.apply("hippo", "elephant") = 0.0
+ * similarity.apply("hippo", "zzzzzzzz") = 0.0
+ * similarity.apply("hello", "hallo") = 0.5
+ * similarity.apply("ABC Corporation", "ABC Corp") = 0.7
+ * similarity.apply("D N H Enterprises Inc", "D & H Enterprises, Inc.") = 0.74
+ * similarity.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness") = 0.81
+ * similarity.apply("PENNSYLVANIA", "PENNCISYLVNIA") = 0.69
+ *
+ *
+ * @param left the first CharSequence, must not be null
+ * @param right the second CharSequence, must not be null
+ * @return the Sørensen-Dice similarity score of the two character sequences
+ * @throws IllegalArgumentException if either CharSequence input is {@code null}
+ */
+    @Override
+    public Double apply(final CharSequence left, final CharSequence right) {
+        // Both sequences are mandatory; reject a null on either side up front.
+        final boolean anyNull = left == null || right == null;
+        if (anyNull) {
+            throw new IllegalArgumentException("CharSequences must not be null");
+        }
+
+        // Equal sequences (two empty ones included) are a perfect match.
+        if (StringUtils.equals(left, right)) {
+            return 1.0d;
+        }
+
+        // A sequence shorter than two characters cannot form a bigram,
+        // so the two inputs have nothing in common.
+        final int shortest = Math.min(left.length(), right.length());
+        if (shortest < 2) {
+            return 0.0d;
+        }
+
+        final IntersectionResult result = similarity.apply(left, right);
+        final long shared = result.getIntersection();
+        final int combinedSize = result.getSizeA() + result.getSizeB();
+
+        // Sorensen-Dice coefficient: twice the shared count over the summed sizes.
+        return (2.0d * shared) / combinedSize;
+    }
+
+ /**
+ * Converter class for creating Bigrams for SorensenDice similarity.
+ */
+ private static class SorensenDiceConverter implements Function