Skip to content

Commit 4dfcc50

Browse files
committed
merge: cache negative cache results for similarity metrics
When computing renames, we cache the hash signatures for each of the potentially conflicting entries so that we do not need to repeatedly read the file and can at least halfway efficiently determine whether two files are similar enough to be deemed a rename. In order to make the hash signatures meaningful, we require at least four lines of data to be present, resulting in at least four different hashes that can be compared. Files that are deemed too small are not cached at all and will thus be repeatedly re-hashed, which is usually not a huge issue. The issue with above heuristic is in case a file does _not_ have at least four lines, where a line is anything separated by a consecutive run of "\n" or "\0" characters. For example "a\nb" is two lines, but "a\0\0b" is also just two lines. Taken to the extreme, a file that has megabytes of consecutive space- or NUL-only may also be deemed as too small and thus not get cached. As a result, we will repeatedly load its blob, calculate its hash signature just to finally throw it away as we notice it's not of any value. When you've got a comparitively big file that you compare against a big set of potentially renamed files, then the cost simply expodes. The issue can be trivially fixed by introducing negative cache entries. Whenever we determine that a given blob does not have a meaningful representation via a hash signature, we store this negative cache marker and will from then on not hash it again, but also ignore it as a potential rename target. This should help the "normal" case already where you have a lot of small files as rename candidates, but in the above scenario it's savings are extraordinarily high. To verify we do not hit the issue anymore with described solution, this commit adds a test that uses the exact same setup described above with one 50 megabyte blob of '\0' characters and 1000 other files that get renamed. Without the negative cache: $ time ./libgit2_clar -smerge::trees::renames::cache_recomputation >/dev/null real 11m48.377s user 11m11.576s sys 0m35.187s And with the negative cache: $ time ./libgit2_clar -smerge::trees::renames::cache_recomputation >/dev/null real 0m1.972s user 0m1.851s sys 0m0.118s So this represents a ~350-fold performance improvement, but it obviously depends on how many files you have and how big the blob is. The test number were chosen in a way that one will immediately notice as soon as the bug resurfaces.
1 parent ca782c9 commit 4dfcc50

File tree

2 files changed

+97
-7
lines changed

2 files changed

+97
-7
lines changed

src/merge.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,16 @@ struct merge_diff_df_data {
6868
git_merge_diff *prev_conflict;
6969
};
7070

71+
/*
72+
* This acts as a negative cache entry marker. In case we've tried to calculate
73+
* similarity metrics for a given blob already but `git_hashsig` determined
74+
* that it's too small in order to have a meaningful hash signature, we will
75+
* insert the address of this marker instead of `NULL`. Like this, we can
76+
* easily check whether we have checked a gien entry already and skip doing the
77+
* calculation again and again.
78+
*/
79+
static int cache_invalid_marker;
80+
7181
/* Merge base computation */
7282

7383
int merge_bases_many(git_commit_list **out, git_revwalk **walk_out, git_repository *repo, size_t length, const git_oid input_array[])
@@ -1027,6 +1037,9 @@ static int index_entry_similarity_calc(
10271037
git_object_size_t blobsize;
10281038
int error;
10291039

1040+
if (*out || *out == &cache_invalid_marker)
1041+
return 0;
1042+
10301043
*out = NULL;
10311044

10321045
if ((error = git_blob_lookup(&blob, repo, &entry->id)) < 0)
@@ -1047,6 +1060,8 @@ static int index_entry_similarity_calc(
10471060
error = opts->metric->buffer_signature(out, &diff_file,
10481061
git_blob_rawcontent(blob), (size_t)blobsize,
10491062
opts->metric->payload);
1063+
if (error == GIT_EBUFS)
1064+
*out = &cache_invalid_marker;
10501065

10511066
git_blob_free(blob);
10521067

@@ -1069,18 +1084,16 @@ static int index_entry_similarity_inexact(
10691084
return 0;
10701085

10711086
/* update signature cache if needed */
1072-
if (!cache[a_idx] && (error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0)
1073-
return error;
1074-
if (!cache[b_idx] && (error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
1087+
if ((error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0 ||
1088+
(error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
10751089
return error;
10761090

10771091
/* some metrics may not wish to process this file (too big / too small) */
1078-
if (!cache[a_idx] || !cache[b_idx])
1092+
if (cache[a_idx] == &cache_invalid_marker || cache[b_idx] == &cache_invalid_marker)
10791093
return 0;
10801094

10811095
/* compare signatures */
1082-
if (opts->metric->similarity(
1083-
&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
1096+
if (opts->metric->similarity(&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
10841097
return -1;
10851098

10861099
/* clip score */
@@ -1550,7 +1563,7 @@ int git_merge_diff_list__find_renames(
15501563
done:
15511564
if (cache != NULL) {
15521565
for (i = 0; i < cache_size; ++i) {
1553-
if (cache[i] != NULL)
1566+
if (cache[i] != NULL && cache[i] != &cache_invalid_marker)
15541567
opts->metric->free_signature(cache[i], opts->metric->payload);
15551568
}
15561569

tests/merge/trees/renames.c

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,3 +274,80 @@ void test_merge_trees_renames__submodules(void)
274274
cl_assert(merge_test_index(index, merge_index_entries, 7));
275275
git_index_free(index);
276276
}
277+
278+
void test_merge_trees_renames__cache_recomputation(void)
279+
{
280+
git_oid blob, binary, ancestor_oid, theirs_oid, ours_oid;
281+
git_merge_options opts = GIT_MERGE_OPTIONS_INIT;
282+
git_buf path = GIT_BUF_INIT;
283+
git_treebuilder *builder;
284+
git_tree *ancestor_tree, *their_tree, *our_tree;
285+
git_index *index;
286+
size_t blob_size;
287+
void *data;
288+
size_t i;
289+
290+
cl_git_pass(git_oid_fromstr(&blob, "a2d8d1824c68541cca94ffb90f79291eba495921"));
291+
292+
/*
293+
* Create a 50MB blob that consists of NUL bytes only. It is important
294+
* that this blob is of a special format, most importantly it cannot
295+
* contain more than four non-consecutive newlines or NUL bytes. This
296+
* is because of git_hashsig's inner workings where all files with less
297+
* than four "lines" are deemed to small.
298+
*/
299+
blob_size = 50 * 1024 * 1024;
300+
cl_assert(data = git__calloc(blob_size, 1));
301+
cl_git_pass(git_blob_create_from_buffer(&binary, repo, data, blob_size));
302+
303+
/*
304+
* Create the common ancestor, which has 1000 dummy blobs and the binary
305+
* blob. The dummy blobs serve as potential rename targets for the
306+
* dummy blob.
307+
*/
308+
cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
309+
for (i = 0; i < 1000; i++) {
310+
cl_git_pass(git_buf_printf(&path, "%"PRIuMAX".txt", i));
311+
cl_git_pass(git_treebuilder_insert(NULL, builder, path.ptr, &blob, GIT_FILEMODE_BLOB));
312+
git_buf_clear(&path);
313+
}
314+
cl_git_pass(git_treebuilder_insert(NULL, builder, "original.bin", &binary, GIT_FILEMODE_BLOB));
315+
cl_git_pass(git_treebuilder_write(&ancestor_oid, builder));
316+
317+
/* We now the binary blob in our tree. */
318+
cl_git_pass(git_treebuilder_remove(builder, "original.bin"));
319+
cl_git_pass(git_treebuilder_insert(NULL, builder, "renamed.bin", &binary, GIT_FILEMODE_BLOB));
320+
cl_git_pass(git_treebuilder_write(&ours_oid, builder));
321+
322+
git_treebuilder_free(builder);
323+
324+
/* And move everything into a subdirectory in their tree. */
325+
cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
326+
cl_git_pass(git_treebuilder_insert(NULL, builder, "subdir", &ancestor_oid, GIT_FILEMODE_TREE));
327+
cl_git_pass(git_treebuilder_write(&theirs_oid, builder));
328+
329+
/*
330+
* Now merge ancestor, ours and theirs. As `git_hashsig` refuses to
331+
* create a hash signature for the 50MB binary file, we historically
332+
* didn't cache the hashsig computation for it. As a result, we now
333+
* started looking up the 50MB blob and scanning it at least 1000
334+
* times, which takes a long time.
335+
*
336+
* The number of 1000 blobs is chosen in such a way that it's
337+
* noticeable when the bug creeps in again, as it takes around 12
338+
* minutes on my machine to compute the following merge.
339+
*/
340+
opts.target_limit = 5000;
341+
cl_git_pass(git_tree_lookup(&ancestor_tree, repo, &ancestor_oid));
342+
cl_git_pass(git_tree_lookup(&their_tree, repo, &theirs_oid));
343+
cl_git_pass(git_tree_lookup(&our_tree, repo, &ours_oid));
344+
cl_git_pass(git_merge_trees(&index, repo, ancestor_tree, our_tree, their_tree, &opts));
345+
346+
git_treebuilder_free(builder);
347+
git_buf_dispose(&path);
348+
git_index_free(index);
349+
git_tree_free(ancestor_tree);
350+
git_tree_free(their_tree);
351+
git_tree_free(our_tree);
352+
git__free(data);
353+
}

0 commit comments

Comments
 (0)