Skip to content

Commit d8896bd

Browse files
committed
diff_generate: avoid excessive stats of .gitattributes files
When generating a diff between two trees, for each file that is to be diffed we have to determine whether it shall be treated as a text or as a binary file. While git has heuristics to determine which kind of diff to generate, users can also override that default behaviour by setting or unsetting the 'diff' attribute for specific files. Because of that, we have to query gitattributes in order to determine how to diff the current files. Instead of hitting the '.gitattributes' file every time we need to query an attribute, which can get expensive especially on networked file systems, we try to cache them instead. This works perfectly fine for every '.gitattributes' file that is found, but we hit cache invalidation problems when we determine that an attributes file does _not_ exist. We do create an entry in the cache for missing '.gitattributes' files, but as soon as we hit that file again we invalidate it and stat it again to see if it has now appeared. In the case of diffing large trees with each other, this behaviour is very suboptimal. For each pair of files that is to be diffed, we will repeatedly query every directory component leading towards their respective location for an attributes file. This leads to thousands or even hundreds of thousands of wasted syscalls. The attributes cache already has a mechanism to help in that scenario in the form of the `git_attr_session`. As long as the same attributes session is still active, we will not try to re-query the gitattributes files at all but simply retain our currently cached results. To fix our problem, we can create a session at the top-most level, which is the initialization of the `git_diff` structure, and use it in order to look up the correct diff driver. As the `git_diff` structure is used to generate patches for multiple files at once, this neatly solves our problem by retaining the session until patches for all files have been generated.
The fix has been tested with linux.git by calling `git_diff_tree_to_tree` and `git_diff_to_buf` with v4.10^{tree} and v4.14^{tree}.

|             | time    | .gitattributes stats |
|-------------|---------|----------------------|
| without fix | 33.201s | 844614               |
| with fix    | 30.327s | 4441                 |

While execution time only improved by roughly 10%, the stat(3) syscalls for .gitattributes files decreased by 99.5%. The benchmarks were quite simple, with best-of-three timings on Linux ext4 systems. One can assume that for network-based file systems the performance gain will be a lot larger due to a much higher latency.
1 parent 7610638 commit d8896bd

File tree

5 files changed

+20
-10
lines changed

5 files changed

+20
-10
lines changed

src/diff.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ typedef enum {
3434
struct git_diff {
3535
git_refcount rc;
3636
git_repository *repo;
37+
git_attr_session attrsession;
3738
git_diff_origin_t type;
3839
git_diff_options opts;
3940
git_vector deltas; /* vector of git_diff_delta */

src/diff_driver.c

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -354,27 +354,30 @@ static int git_diff_driver_load(
354354
}
355355

356356
int git_diff_driver_lookup(
357-
git_diff_driver **out, git_repository *repo, const char *path)
357+
git_diff_driver **out, git_repository *repo,
358+
git_attr_session *attrsession, const char *path)
358359
{
359360
int error = 0;
360-
const char *value;
361+
const char *values[1], *attrs[] = { "diff" };
361362

362363
assert(out);
363364
*out = NULL;
364365

365366
if (!repo || !path || !strlen(path))
366367
/* just use the auto value */;
367-
else if ((error = git_attr_get(&value, repo, 0, path, "diff")) < 0)
368+
else if ((error = git_attr_get_many_with_session(values, repo,
369+
attrsession, 0, path, 1, attrs)) < 0)
368370
/* return error below */;
369-
else if (GIT_ATTR_UNSPECIFIED(value))
371+
372+
else if (GIT_ATTR_UNSPECIFIED(values[0]))
370373
/* just use the auto value */;
371-
else if (GIT_ATTR_FALSE(value))
374+
else if (GIT_ATTR_FALSE(values[0]))
372375
*out = &global_drivers[DIFF_DRIVER_BINARY];
373-
else if (GIT_ATTR_TRUE(value))
376+
else if (GIT_ATTR_TRUE(values[0]))
374377
*out = &global_drivers[DIFF_DRIVER_TEXT];
375378

376379
/* otherwise look for driver information in config and build driver */
377-
else if ((error = git_diff_driver_load(out, repo, value)) < 0) {
380+
else if ((error = git_diff_driver_load(out, repo, values[0])) < 0) {
378381
if (error == GIT_ENOTFOUND) {
379382
error = 0;
380383
giterr_clear();

src/diff_driver.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include "common.h"
1111

12+
#include "attr_file.h"
1213
#include "buffer.h"
1314

1415
typedef struct git_diff_driver_registry git_diff_driver_registry;
@@ -18,7 +19,8 @@ void git_diff_driver_registry_free(git_diff_driver_registry *);
1819

1920
typedef struct git_diff_driver git_diff_driver;
2021

21-
int git_diff_driver_lookup(git_diff_driver **, git_repository *, const char *);
22+
int git_diff_driver_lookup(git_diff_driver **, git_repository *,
23+
git_attr_session *attrsession, const char *);
2224
void git_diff_driver_free(git_diff_driver *);
2325

2426
/* diff option flags to force off and on for this driver */

src/diff_file.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ static int diff_file_content_init_common(
5454
fc->src = GIT_ITERATOR_TYPE_TREE;
5555

5656
if (!fc->driver &&
57-
git_diff_driver_lookup(&fc->driver, fc->repo, fc->file->path) < 0)
57+
git_diff_driver_lookup(&fc->driver, fc->repo,
58+
NULL, fc->file->path) < 0)
5859
return -1;
5960

6061
/* give driver a chance to modify options */
@@ -101,7 +102,8 @@ int git_diff_file_content__init_from_diff(
101102
fc->file = use_old ? &delta->old_file : &delta->new_file;
102103
fc->src = use_old ? diff->old_src : diff->new_src;
103104

104-
if (git_diff_driver_lookup(&fc->driver, fc->repo, fc->file->path) < 0)
105+
if (git_diff_driver_lookup(&fc->driver, fc->repo,
106+
&diff->attrsession, fc->file->path) < 0)
105107
return -1;
106108

107109
switch (delta->status) {

src/diff_generate.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ static void diff_generated_free(git_diff *d)
389389
{
390390
git_diff_generated *diff = (git_diff_generated *)d;
391391

392+
git_attr_session__free(&diff->base.attrsession);
392393
git_vector_free_deep(&diff->base.deltas);
393394

394395
git_pathspec__vfree(&diff->pathspec);
@@ -418,6 +419,7 @@ static git_diff_generated *diff_generated_alloc(
418419
diff->base.new_src = new_iter->type;
419420
diff->base.patch_fn = git_patch_generated_from_diff;
420421
diff->base.free_fn = diff_generated_free;
422+
git_attr_session__init(&diff->base.attrsession, repo);
421423
memcpy(&diff->base.opts, &dflt, sizeof(git_diff_options));
422424

423425
git_pool_init(&diff->base.pool, 1);

0 commit comments

Comments
 (0)