Skip to content

Commit a616fb1

Browse files
committed
indexer: check pack file connectivity
When passing `--strict` to `git-unpack-objects`, core git will verify the pack file that is currently being read. In addition to the typical checksum verification, this will especially cause it to verify object connectivity of the received pack file. So it checks, for every received object, if all the objects it references are either part of the local object database or part of the pack file. In libgit2, we currently have no such mechanism, which leaves us unable to verify received pack files prior to writing them into our local object database. This commit introduces the concept of `expected_oids` to the indexer. When pack file verification is turned on by a new flag, the indexer will try to parse each received object first. If the object has any links to other objects, it will check if those links are already satisfied by known objects, which are either part of the object database or objects it has already seen as part of that pack file. If not, it will add them to the list of `expected_oids`. Furthermore, the indexer will remove the current object from the `expected_oids` if it is currently being expected. Like this, we are able to verify whether all object links are being satisfied. As soon as we hit the end of the object stream and have resolved all objects as well as deltified objects, we assert that `expected_oids` is in fact empty. This should always be the case for a valid pack file with full connectivity.
1 parent be41c38 commit a616fb1

File tree

1 file changed

+151
-1
lines changed

1 file changed

+151
-1
lines changed

src/indexer.c

Lines changed: 151 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
#include "git2/indexer.h"
1111
#include "git2/object.h"
1212

13+
#include "commit.h"
14+
#include "tree.h"
15+
#include "tag.h"
1316
#include "pack.h"
1417
#include "mwindow.h"
1518
#include "posix.h"
@@ -36,12 +39,15 @@ struct git_indexer {
3639
pack_committed :1,
3740
have_stream :1,
3841
have_delta :1,
39-
do_fsync :1;
42+
do_fsync :1,
43+
do_verify :1;
4044
struct git_pack_header hdr;
4145
struct git_pack_file *pack;
4246
unsigned int mode;
4347
git_off_t off;
4448
git_off_t entry_start;
49+
git_otype entry_type;
50+
git_buf entry_data;
4551
git_packfile_stream stream;
4652
size_t nr_objects;
4753
git_vector objects;
@@ -53,6 +59,9 @@ struct git_indexer {
5359
void *progress_payload;
5460
char objbuf[8*1024];
5561

62+
/* OIDs referenced from pack objects. Used for verification. */
63+
git_oidmap *expected_oids;
64+
5665
/* Needed to look up objects which we want to inject to fix a thin pack */
5766
git_odb *odb;
5867

@@ -125,6 +134,11 @@ int git_indexer_new(
125134
idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
126135
git_hash_ctx_init(&idx->hash_ctx);
127136
git_hash_ctx_init(&idx->trailer);
137+
git_buf_init(&idx->entry_data, 0);
138+
idx->expected_oids = git_oidmap_alloc();
139+
GITERR_CHECK_ALLOC(idx->expected_oids);
140+
141+
idx->do_verify = !!idx->odb;
128142

129143
if (git_repository__fsync_gitdir)
130144
idx->do_fsync = 1;
@@ -210,6 +224,9 @@ static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
210224
if ((read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf))) < 0)
211225
break;
212226

227+
if (idx->do_verify)
228+
git_buf_put(&idx->entry_data, idx->objbuf, read);
229+
213230
git_hash_update(&idx->hash_ctx, idx->objbuf, read);
214231
} while (read > 0);
215232

@@ -279,6 +296,97 @@ static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, git_off_t start,
279296
return 0;
280297
}
281298

299+
static void add_expected_oid(git_indexer *idx, const git_oid *oid)
300+
{
301+
int ret;
302+
303+
/*
304+
* If we know about that object because it is stored in our ODB or
305+
* because we have already processed it as part of our pack file, we do
306+
* not have to expect it.
307+
*/
308+
if (!git_odb_exists(idx->odb, oid) &&
309+
!git_oidmap_exists(idx->pack->idx_cache, oid) &&
310+
!git_oidmap_exists(idx->expected_oids, oid)) {
311+
git_oid *dup = git__malloc(sizeof(*oid));
312+
git_oid_cpy(dup, oid);
313+
git_oidmap_put(idx->expected_oids, dup, &ret);
314+
}
315+
}
316+
317+
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
318+
{
319+
git_object *object;
320+
size_t keyidx;
321+
int error;
322+
323+
if (obj->type != GIT_OBJ_BLOB &&
324+
obj->type != GIT_OBJ_TREE &&
325+
obj->type != GIT_OBJ_COMMIT &&
326+
obj->type != GIT_OBJ_TAG)
327+
return 0;
328+
329+
if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
330+
goto out;
331+
332+
keyidx = git_oidmap_lookup_index(idx->expected_oids, &object->cached.oid);
333+
if (git_oidmap_valid_index(idx->expected_oids, keyidx)) {
334+
const git_oid *key = git_oidmap_key(idx->expected_oids, keyidx);
335+
git__free((git_oid *) key);
336+
git_oidmap_delete_at(idx->expected_oids, keyidx);
337+
}
338+
339+
/*
340+
* Check whether this is a known object. If so, we can just continue as
341+
* we assume that the ODB has a complete graph.
342+
*/
343+
if (git_odb_exists(idx->odb, &object->cached.oid))
344+
return 0;
345+
346+
switch (obj->type) {
347+
case GIT_OBJ_TREE:
348+
{
349+
git_tree *tree = (git_tree *) object;
350+
git_tree_entry *entry;
351+
size_t i;
352+
353+
git_array_foreach(tree->entries, i, entry)
354+
add_expected_oid(idx, entry->oid);
355+
356+
break;
357+
}
358+
case GIT_OBJ_COMMIT:
359+
{
360+
git_commit *commit = (git_commit *) object;
361+
git_oid *parent_oid;
362+
size_t i;
363+
364+
git_array_foreach(commit->parent_ids, i, parent_oid)
365+
add_expected_oid(idx, parent_oid);
366+
367+
add_expected_oid(idx, &commit->tree_id);
368+
369+
break;
370+
}
371+
case GIT_OBJ_TAG:
372+
{
373+
git_tag *tag = (git_tag *) object;
374+
375+
add_expected_oid(idx, &tag->target);
376+
377+
break;
378+
}
379+
case GIT_OBJ_BLOB:
380+
default:
381+
break;
382+
}
383+
384+
out:
385+
git_object_free(object);
386+
387+
return error;
388+
}
389+
282390
static int store_object(git_indexer *idx)
283391
{
284392
int i, error;
@@ -304,6 +412,17 @@ static int store_object(git_indexer *idx)
304412
entry->offset = (uint32_t)entry_start;
305413
}
306414

415+
if (idx->do_verify) {
416+
git_rawobj rawobj = {
417+
idx->entry_data.ptr,
418+
idx->entry_data.size,
419+
idx->entry_type
420+
};
421+
422+
if ((error = check_object_connectivity(idx, &rawobj)) < 0)
423+
goto on_error;
424+
}
425+
307426
git_oid_cpy(&pentry->sha1, &oid);
308427
pentry->offset = entry_start;
309428

@@ -549,6 +668,7 @@ static int read_stream_object(git_indexer *idx, git_transfer_progress *stats)
549668
git_mwindow_close(&w);
550669
idx->entry_start = entry_start;
551670
git_hash_init(&idx->hash_ctx);
671+
git_buf_clear(&idx->entry_data);
552672

553673
if (type == GIT_OBJ_REF_DELTA || type == GIT_OBJ_OFS_DELTA) {
554674
error = advance_delta_offset(idx, type);
@@ -569,6 +689,7 @@ static int read_stream_object(git_indexer *idx, git_transfer_progress *stats)
569689
}
570690

571691
idx->have_stream = 1;
692+
idx->entry_type = type;
572693

573694
error = git_packfile_stream_open(stream, idx->pack, idx->off);
574695
if (error < 0)
@@ -884,6 +1005,10 @@ static int resolve_deltas(git_indexer *idx, git_transfer_progress *stats)
8841005
return -1;
8851006
}
8861007

1008+
if (idx->do_verify && check_object_connectivity(idx, &obj) < 0)
1009+
/* TODO: error? continue? */
1010+
continue;
1011+
8871012
if (hash_and_save(idx, &obj, delta->delta_off) < 0)
8881013
continue;
8891014

@@ -1014,6 +1139,18 @@ int git_indexer_commit(git_indexer *idx, git_transfer_progress *stats)
10141139
write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ);
10151140
}
10161141

1142+
/*
1143+
* Is the resulting graph fully connected or are we still
1144+
* missing some objects? In the second case, we can
1145+
* bail out due to an incomplete and thus corrupt
1146+
* packfile.
1147+
*/
1148+
if (git_oidmap_size(idx->expected_oids) > 0) {
1149+
giterr_set(GITERR_INDEXER, "packfile is missing %"PRIuZ" objects",
1150+
git_oidmap_size(idx->expected_oids));
1151+
return -1;
1152+
}
1153+
10171154
git_vector_sort(&idx->objects);
10181155

10191156
/* Use the trailer hash as the pack file name to ensure
@@ -1143,6 +1280,8 @@ int git_indexer_commit(git_indexer *idx, git_transfer_progress *stats)
11431280

11441281
void git_indexer_free(git_indexer *idx)
11451282
{
1283+
khiter_t pos;
1284+
11461285
if (idx == NULL)
11471286
return;
11481287

@@ -1170,7 +1309,18 @@ void git_indexer_free(git_indexer *idx)
11701309
git_mutex_unlock(&git__mwindow_mutex);
11711310
}
11721311

1312+
for (pos = git_oidmap_begin(idx->expected_oids);
1313+
pos != git_oidmap_end(idx->expected_oids); pos++)
1314+
{
1315+
if (git_oidmap_has_data(idx->expected_oids, pos)) {
1316+
git__free((git_oid *) git_oidmap_key(idx->expected_oids, pos));
1317+
git_oidmap_delete_at(idx->expected_oids, pos);
1318+
}
1319+
}
1320+
11731321
git_hash_ctx_cleanup(&idx->trailer);
11741322
git_hash_ctx_cleanup(&idx->hash_ctx);
1323+
git_buf_dispose(&idx->entry_data);
1324+
git_oidmap_free(idx->expected_oids);
11751325
git__free(idx);
11761326
}

0 commit comments

Comments
 (0)