Skip to content

Commit 966db47

Browse files
authored
Merge pull request libgit2#5477 from pks-t/pks/rename-detection-negative-caches
merge: cache negative cache results for similarity metrics
2 parents dfd7fcc + 4dfcc50 commit 966db47

File tree

2 files changed

+97
-7
lines changed

2 files changed

+97
-7
lines changed

src/merge.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,16 @@ struct merge_diff_df_data {
6868
git_merge_diff *prev_conflict;
6969
};
7070

71+
/*
72+
* This acts as a negative cache entry marker. In case we've tried to calculate
73+
* similarity metrics for a given blob already but `git_hashsig` determined
74+
* that it's too small in order to have a meaningful hash signature, we will
75+
* insert the address of this marker instead of `NULL`. Like this, we can
76+
* easily check whether we have checked a gien entry already and skip doing the
77+
* calculation again and again.
78+
*/
79+
static int cache_invalid_marker;
80+
7181
/* Merge base computation */
7282

7383
int merge_bases_many(git_commit_list **out, git_revwalk **walk_out, git_repository *repo, size_t length, const git_oid input_array[])
@@ -1027,6 +1037,9 @@ static int index_entry_similarity_calc(
10271037
git_object_size_t blobsize;
10281038
int error;
10291039

1040+
if (*out || *out == &cache_invalid_marker)
1041+
return 0;
1042+
10301043
*out = NULL;
10311044

10321045
if ((error = git_blob_lookup(&blob, repo, &entry->id)) < 0)
@@ -1047,6 +1060,8 @@ static int index_entry_similarity_calc(
10471060
error = opts->metric->buffer_signature(out, &diff_file,
10481061
git_blob_rawcontent(blob), (size_t)blobsize,
10491062
opts->metric->payload);
1063+
if (error == GIT_EBUFS)
1064+
*out = &cache_invalid_marker;
10501065

10511066
git_blob_free(blob);
10521067

@@ -1069,18 +1084,16 @@ static int index_entry_similarity_inexact(
10691084
return 0;
10701085

10711086
/* update signature cache if needed */
1072-
if (!cache[a_idx] && (error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0)
1073-
return error;
1074-
if (!cache[b_idx] && (error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
1087+
if ((error = index_entry_similarity_calc(&cache[a_idx], repo, a, opts)) < 0 ||
1088+
(error = index_entry_similarity_calc(&cache[b_idx], repo, b, opts)) < 0)
10751089
return error;
10761090

10771091
/* some metrics may not wish to process this file (too big / too small) */
1078-
if (!cache[a_idx] || !cache[b_idx])
1092+
if (cache[a_idx] == &cache_invalid_marker || cache[b_idx] == &cache_invalid_marker)
10791093
return 0;
10801094

10811095
/* compare signatures */
1082-
if (opts->metric->similarity(
1083-
&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
1096+
if (opts->metric->similarity(&score, cache[a_idx], cache[b_idx], opts->metric->payload) < 0)
10841097
return -1;
10851098

10861099
/* clip score */
@@ -1550,7 +1563,7 @@ int git_merge_diff_list__find_renames(
15501563
done:
15511564
if (cache != NULL) {
15521565
for (i = 0; i < cache_size; ++i) {
1553-
if (cache[i] != NULL)
1566+
if (cache[i] != NULL && cache[i] != &cache_invalid_marker)
15541567
opts->metric->free_signature(cache[i], opts->metric->payload);
15551568
}
15561569

tests/merge/trees/renames.c

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,3 +274,80 @@ void test_merge_trees_renames__submodules(void)
274274
cl_assert(merge_test_index(index, merge_index_entries, 7));
275275
git_index_free(index);
276276
}
277+
278+
void test_merge_trees_renames__cache_recomputation(void)
279+
{
280+
git_oid blob, binary, ancestor_oid, theirs_oid, ours_oid;
281+
git_merge_options opts = GIT_MERGE_OPTIONS_INIT;
282+
git_buf path = GIT_BUF_INIT;
283+
git_treebuilder *builder;
284+
git_tree *ancestor_tree, *their_tree, *our_tree;
285+
git_index *index;
286+
size_t blob_size;
287+
void *data;
288+
size_t i;
289+
290+
cl_git_pass(git_oid_fromstr(&blob, "a2d8d1824c68541cca94ffb90f79291eba495921"));
291+
292+
/*
293+
* Create a 50MB blob that consists of NUL bytes only. It is important
294+
* that this blob is of a special format, most importantly it cannot
295+
* contain more than four non-consecutive newlines or NUL bytes. This
296+
* is because of git_hashsig's inner workings where all files with less
297+
* than four "lines" are deemed to small.
298+
*/
299+
blob_size = 50 * 1024 * 1024;
300+
cl_assert(data = git__calloc(blob_size, 1));
301+
cl_git_pass(git_blob_create_from_buffer(&binary, repo, data, blob_size));
302+
303+
/*
304+
* Create the common ancestor, which has 1000 dummy blobs and the binary
305+
* blob. The dummy blobs serve as potential rename targets for the
306+
* dummy blob.
307+
*/
308+
cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
309+
for (i = 0; i < 1000; i++) {
310+
cl_git_pass(git_buf_printf(&path, "%"PRIuMAX".txt", i));
311+
cl_git_pass(git_treebuilder_insert(NULL, builder, path.ptr, &blob, GIT_FILEMODE_BLOB));
312+
git_buf_clear(&path);
313+
}
314+
cl_git_pass(git_treebuilder_insert(NULL, builder, "original.bin", &binary, GIT_FILEMODE_BLOB));
315+
cl_git_pass(git_treebuilder_write(&ancestor_oid, builder));
316+
317+
/* We now the binary blob in our tree. */
318+
cl_git_pass(git_treebuilder_remove(builder, "original.bin"));
319+
cl_git_pass(git_treebuilder_insert(NULL, builder, "renamed.bin", &binary, GIT_FILEMODE_BLOB));
320+
cl_git_pass(git_treebuilder_write(&ours_oid, builder));
321+
322+
git_treebuilder_free(builder);
323+
324+
/* And move everything into a subdirectory in their tree. */
325+
cl_git_pass(git_treebuilder_new(&builder, repo, NULL));
326+
cl_git_pass(git_treebuilder_insert(NULL, builder, "subdir", &ancestor_oid, GIT_FILEMODE_TREE));
327+
cl_git_pass(git_treebuilder_write(&theirs_oid, builder));
328+
329+
/*
330+
* Now merge ancestor, ours and theirs. As `git_hashsig` refuses to
331+
* create a hash signature for the 50MB binary file, we historically
332+
* didn't cache the hashsig computation for it. As a result, we now
333+
* started looking up the 50MB blob and scanning it at least 1000
334+
* times, which takes a long time.
335+
*
336+
* The number of 1000 blobs is chosen in such a way that it's
337+
* noticeable when the bug creeps in again, as it takes around 12
338+
* minutes on my machine to compute the following merge.
339+
*/
340+
opts.target_limit = 5000;
341+
cl_git_pass(git_tree_lookup(&ancestor_tree, repo, &ancestor_oid));
342+
cl_git_pass(git_tree_lookup(&their_tree, repo, &theirs_oid));
343+
cl_git_pass(git_tree_lookup(&our_tree, repo, &ours_oid));
344+
cl_git_pass(git_merge_trees(&index, repo, ancestor_tree, our_tree, their_tree, &opts));
345+
346+
git_treebuilder_free(builder);
347+
git_buf_dispose(&path);
348+
git_index_free(index);
349+
git_tree_free(ancestor_tree);
350+
git_tree_free(their_tree);
351+
git_tree_free(our_tree);
352+
git__free(data);
353+
}

0 commit comments

Comments
 (0)