merge: cache negative cache results for similarity metrics

pks-t · pks-t · commit 4dfcc50fe88e · 2020-04-01T15:16:18.000+02:00
When computing renames, we cache the hash signatures for each of the
potentially conflicting entries so that we do not need to repeatedly
read the file and can at least halfway efficiently determine whether two
files are similar enough to be deemed a rename. In order to make the
hash signatures meaningful, we require at least four lines of data to be
present, resulting in at least four different hashes that can be
compared. Files that are deemed too small are not cached at all and
will thus be repeatedly re-hashed, which is usually not a huge issue.

The issue with above heuristic is in case a file does _not_ have at
least four lines, where a line is anything separated by a consecutive
run of "\n" or "\0" characters. For example "a\nb" is two lines, but
"a\0\0b" is also just two lines. Taken to the extreme, a file that has
megabytes of consecutive space- or NUL-only may also be deemed as too
small and thus not get cached. As a result, we will repeatedly load its
blob, calculate its hash signature just to finally throw it away as we
notice it's not of any value. When you've got a comparitively big file
that you compare against a big set of potentially renamed files, then
the cost simply expodes.

The issue can be trivially fixed by introducing negative cache entries.
Whenever we determine that a given blob does not have a meaningful
representation via a hash signature, we store this negative cache marker
and will from then on not hash it again, but also ignore it as a
potential rename target. This should help the "normal" case already
where you have a lot of small files as rename candidates, but in the
above scenario it's savings are extraordinarily high.

To verify we do not hit the issue anymore with described solution, this
commit adds a test that uses the exact same setup described above with
one 50 megabyte blob of '\0' characters and 1000 other files that get
renamed. Without the negative cache:

$ time ./libgit2_clar -smerge::trees::renames::cache_recomputation &gt;/dev/null
real    11m48.377s
user    11m11.576s
sys     0m35.187s

And with the negative cache:

$ time ./libgit2_clar -smerge::trees::renames::cache_recomputation &gt;/dev/null
real    0m1.972s
user    0m1.851s
sys     0m0.118s

So this represents a ~350-fold performance improvement, but it obviously
depends on how many files you have and how big the blob is. The test
number were chosen in a way that one will immediately notice as soon as
the bug resurfaces.
diff --git a/src/merge.c b/src/merge.c
@@ -68,6 +68,16 @@ struct merge_diff_df_data {
 	git_merge_diff *prev_conflict;
 };
 
+/*
+ * This acts as a negative cache entry marker. In case we've tried to calculate
+ * similarity metrics for a given blob already but `git_hashsig` determined
+ * that it's too small in order to have a meaningful hash signature, we will
+ * insert the address of this marker instead of `NULL`. Like this, we can
+ * easily check whether we have checked a gien entry already and skip doing the
+ * calculation again and again.
+ */
+static int cache_invalid_marker;
+
 /* Merge base computation */
 
 int merge_bases_many(git_commit_list **out, git_revwalk **walk_out, git_repository *repo, size_t length, const git_oid input_array[])
@@ -1027,6 +1037,9 @@ static int index_entry_similarity_calc(
 	git_object_size_t blobsize;
 	int error;
 
+	if (*out || *out == &cache_invalid_marker)
+		return 0;
+
 	*out = NULL;
 
 	if ((error = git_blob_lookup(&blob, repo, &entry->id)) < 0)
@@ -1047,6 +1060,8 @@ static int index_entry_similarity_calc(
 	error = opts->metric->buffer_signature(out, &diff_file,
 		git_blob_rawcontent(blob), (size_t)blobsize,
 		opts->metric->payload);
+	if (error == GIT_EBUFS)
+		*out = &cache_invalid_marker;
 
 	git_blob_free(blob);
 
@@ -1069,18 +1084,16 @@ static int index_entry_similarity_inexact(
 		return 0;
 
 	/* update signature cache if needed */
-	if (!cache[a_idx] && (error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0)
-		return error;
-	if (!cache[b_idx] && (error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
+	if ((error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0 ||
+	    (error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
 		return error;
 
 	/* some metrics may not wish to process this file (too big / too small) */
-	if (!cache[a_idx] || !cache[b_idx])
+	if (cache[a_idx] == &cache_invalid_marker || cache[b_idx] == &cache_invalid_marker)
 		return 0;
 
 	/* compare signatures */
-	if (opts->metric->similarity(
-		&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
+	if (opts->metric->similarity(&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
 		return -1;
 
 	/* clip score */
@@ -1550,7 +1563,7 @@ int git_merge_diff_list__find_renames(
 done:
 	if (cache != NULL) {
 		for (i = 0; i < cache_size; ++i) {
-			if (cache[i] != NULL)
+			if (cache[i] != NULL && cache[i] != &cache_invalid_marker)
 				opts->metric->free_signature(cache[i], opts->metric->payload);
 		}
 
diff --git a/tests/merge/trees/renames.c b/tests/merge/trees/renames.c
@@ -274,3 +274,80 @@ void test_merge_trees_renames__submodules(void)
 	cl_assert(merge_test_index(index, merge_index_entries, 7));
 	git_index_free(index);
 }
+
+void test_merge_trees_renames__cache_recomputation(void)
+{
+	git_oid blob, binary, ancestor_oid, theirs_oid, ours_oid;
+	git_merge_options opts = GIT_MERGE_OPTIONS_INIT;
+	git_buf path = GIT_BUF_INIT;
+	git_treebuilder *builder;
+	git_tree *ancestor_tree, *their_tree, *our_tree;
+	git_index *index;
+	size_t blob_size;
+	void *data;
+	size_t i;
+
+	cl_git_pass(git_oid_fromstr(&blob, "a2d8d1824c68541cca94ffb90f79291eba495921"));
+
+	/*
+	 * Create a 50MB blob that consists of NUL bytes only. It is important
+	 * that this blob is of a special format, most importantly it cannot
+	 * contain more than four non-consecutive newlines or NUL bytes. This
+	 * is because of git_hashsig's inner workings where all files with less
+	 * than four "lines" are deemed to small.
+	 */
+	blob_size = 50 * 1024 * 1024;
+	cl_assert(data = git__calloc(blob_size, 1));
+	cl_git_pass(git_blob_create_from_buffer(&binary, repo, data, blob_size));
+
+	/*
+	 * Create the common ancestor, which has 1000 dummy blobs and the binary
+	 * blob. The dummy blobs serve as potential rename targets for the
+	 * dummy blob.
+	 */
+	cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
+	for (i = 0; i < 1000; i++) {
+		cl_git_pass(git_buf_printf(&path, "%"PRIuMAX".txt", i));
+		cl_git_pass(git_treebuilder_insert(NULL, builder, path.ptr, &blob, GIT_FILEMODE_BLOB));
+		git_buf_clear(&path);
+	}
+	cl_git_pass(git_treebuilder_insert(NULL, builder, "original.bin", &binary, GIT_FILEMODE_BLOB));
+	cl_git_pass(git_treebuilder_write(&ancestor_oid, builder));
+
+	/* We now the binary blob in our tree. */
+	cl_git_pass(git_treebuilder_remove(builder, "original.bin"));
+	cl_git_pass(git_treebuilder_insert(NULL, builder, "renamed.bin", &binary, GIT_FILEMODE_BLOB));
+	cl_git_pass(git_treebuilder_write(&ours_oid, builder));
+
+	git_treebuilder_free(builder);
+
+	/* And move everything into a subdirectory in their tree. */
+	cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
+	cl_git_pass(git_treebuilder_insert(NULL, builder, "subdir", &ancestor_oid, GIT_FILEMODE_TREE));
+	cl_git_pass(git_treebuilder_write(&theirs_oid, builder));
+
+	/*
+	 * Now merge ancestor, ours and theirs. As `git_hashsig` refuses to
+	 * create a hash signature for the 50MB binary file, we historically
+	 * didn't cache the hashsig computation for it. As a result, we now
+	 * started looking up the 50MB blob and scanning it at least 1000
+	 * times, which takes a long time.
+	 *
+	 * The number of 1000 blobs is chosen in such a way that it's
+	 * noticeable when the bug creeps in again, as it takes around 12
+	 * minutes on my machine to compute the following merge.
+	 */
+	opts.target_limit = 5000;
+	cl_git_pass(git_tree_lookup(&ancestor_tree, repo, &ancestor_oid));
+	cl_git_pass(git_tree_lookup(&their_tree, repo, &theirs_oid));
+	cl_git_pass(git_tree_lookup(&our_tree, repo, &ours_oid));
+	cl_git_pass(git_merge_trees(&index, repo, ancestor_tree, our_tree, their_tree, &opts));
+
+	git_treebuilder_free(builder);
+	git_buf_dispose(&path);
+	git_index_free(index);
+	git_tree_free(ancestor_tree);
+	git_tree_free(their_tree);
+	git_tree_free(our_tree);
+	git__free(data);
+}