Skip to content

Commit 9d83a2b

Browse files
committed
Sanitize the hunk header to ensure it contains valid UTF-8 data
The diff driver truncates the hunk header text to 80 bytes, which can cut a multi-byte UTF-8 character in half and introduce garbage characters in the diff output. This change sanitizes the hunk header before it is displayed. The new test mirrors the corresponding test in git: https://github.com/git/git/blob/master/t/t4025-hunk-header.sh Closes libgit2/rugged#716
1 parent 0ad2372 commit 9d83a2b

File tree

4 files changed

+130
-0
lines changed

4 files changed

+130
-0
lines changed

src/diff_xdiff.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
*/
77

88
#include "diff_xdiff.h"
9+
#include "util.h"
910

1011
#include "git2/errors.h"
1112
#include "diff.h"
@@ -115,6 +116,7 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
115116
const git_diff_delta *delta = patch->base.delta;
116117
git_patch_generated_output *output = &info->xo->output;
117118
git_diff_line line;
119+
size_t buffer_len;
118120

119121
if (len == 1) {
120122
output->error = git_xdiff_parse_hunk(&info->hunk, bufs[0].ptr);
@@ -124,6 +126,16 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
124126
info->hunk.header_len = bufs[0].size;
125127
if (info->hunk.header_len >= sizeof(info->hunk.header))
126128
info->hunk.header_len = sizeof(info->hunk.header) - 1;
129+
130+
/* Sanitize the hunk header in case there is invalid Unicode */
131+
buffer_len = git__utf8_valid_buf_length((const uint8_t *) bufs[0].ptr, info->hunk.header_len);
132+
/* Sanitizing the hunk header may delete the newline, so add it back again if there is room */
133+
if (buffer_len < info->hunk.header_len) {
134+
bufs[0].ptr[buffer_len] = '\n';
135+
buffer_len += 1;
136+
info->hunk.header_len = buffer_len;
137+
}
138+
127139
memcpy(info->hunk.header, bufs[0].ptr, info->hunk.header_len);
128140
info->hunk.header[info->hunk.header_len] = '\0';
129141

src/util.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,22 @@ double git_time_monotonic(void)
806806
return git__timer();
807807
}
808808

809+
/*
 * Measure the leading portion of a byte buffer that git__utf8_charlen()
 * accepts, one character at a time.
 *
 * Scanning starts at the first byte and advances by whatever length
 * git__utf8_charlen() reports for the character at the current position.
 * The scan ends when the whole buffer has been consumed or as soon as
 * git__utf8_charlen() returns a negative value (presumably an invalid or
 * cut-off sequence; see that function for the exact meaning).
 *
 * @param str     buffer to scan
 * @param str_len number of bytes available in `str`
 * @return number of bytes consumed before the scan stopped
 */
size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len)
{
	size_t valid = 0;
	int step;

	for (; valid < str_len; valid += step) {
		step = git__utf8_charlen(str + valid, str_len - valid);

		/* A negative length ends the scan; `valid` is left untouched. */
		if (step < 0)
			break;
	}

	return valid;
}
824+
809825
#ifdef GIT_WIN32
810826
int git__getenv(git_buf *out, const char *name)
811827
{

src/util.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,16 @@ extern size_t git__unescape(char *str);
453453
*/
454454
extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
455455

456+
/*
457+
* Iterate through a UTF-8 string and stop after finding any invalid UTF-8
458+
* codepoints.
459+
*
460+
* @param str string to scan
461+
* @param str_len size of the string
462+
* @return length in bytes of the leading part of the string that is valid UTF-8
463+
*/
464+
extern size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len);
465+
456466
/*
457467
* Safely zero-out memory, making sure that the compiler
458468
* doesn't optimize away the operation.

tests/diff/patch.c

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ void test_diff_patch__cleanup(void)
2525

2626
#define EXPECTED_HUNK "@@ -1,2 +0,0 @@\n"
2727

28+
/*
 * Fixture text used by test_diff_patch__can_strip_bad_utf8.
 *
 * The same nine-byte unit (three 3-byte UTF-8 sequences: E6 97 A5,
 * E6 9C AC, E8 AA 9E) repeated back to back and ended with a newline.
 */
#define UTF8_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\n"
29+
30+
/*
 * Shorter run of the same unit that stops after a complete E6 9C AC
 * sequence, followed by a newline. Used in the expected hunk header for
 * the "A " line.
 */
#define UTF8_TRUNCATED_A_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\n"
31+
32+
/*
 * Shorter run of the same unit that stops after a complete E6 97 A5
 * sequence, followed by a newline. Used in the expected hunk header for
 * the "L " line.
 */
#define UTF8_TRUNCATED_L_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\n"
33+
2834
static int check_removal_cb(
2935
const git_diff_delta *delta,
3036
const git_diff_hunk *hunk,
@@ -610,3 +616,89 @@ void test_diff_patch__line_counts_with_eofnl(void)
610616

611617
git_buf_free(&content);
612618
}
619+
620+
/*
 * Diffs two in-memory buffers whose "A ..." and "L ..." lines carry a long
 * run of multi-byte UTF-8 text (UTF8_HUNK_HEADER), renders the patch to a
 * buffer, and asserts that the output equals `expected`. In `expected` the
 * hunk header lines end with UTF8_TRUNCATED_A_HUNK_HEADER and
 * UTF8_TRUNCATED_L_HUNK_HEADER, i.e. a shortened run that still ends on a
 * complete UTF-8 sequence.
 */
void test_diff_patch__can_strip_bad_utf8(void)
621+
{
622+
/* First buffer handed to git_patch_from_buffers (the "-" side in `expected`). */
const char *a = "A " UTF8_HUNK_HEADER
623+
" B\n"
624+
" C\n"
625+
" D\n"
626+
" E\n"
627+
" F\n"
628+
" G\n"
629+
" H\n"
630+
" I\n"
631+
" J\n"
632+
" K\n"
633+
"L " UTF8_HUNK_HEADER
634+
" M\n"
635+
" N\n"
636+
" O\n"
637+
" P\n"
638+
" Q\n"
639+
" R\n"
640+
" S\n"
641+
" T\n"
642+
" U\n"
643+
" V\n";
644+
645+
/* Second buffer: same as `a` except the E and P lines gain " modified". */
const char *b = "A " UTF8_HUNK_HEADER
646+
" B\n"
647+
" C\n"
648+
" D\n"
649+
" E modified\n"
650+
" F\n"
651+
" G\n"
652+
" H\n"
653+
" I\n"
654+
" J\n"
655+
" K\n"
656+
"L " UTF8_HUNK_HEADER
657+
" M\n"
658+
" N\n"
659+
" O\n"
660+
" P modified\n"
661+
" Q\n"
662+
" R\n"
663+
" S\n"
664+
" T\n"
665+
" U\n"
666+
" V\n";
667+
668+
/* Exact patch text the test requires: two hunks, one per modified line. */
const char *expected = "diff --git a/file b/file\n"
669+
"index d0647c4..7827ce5 100644\n"
670+
"--- a/file\n"
671+
"+++ b/file\n"
672+
"@@ -2,7 +2,7 @@ A " UTF8_TRUNCATED_A_HUNK_HEADER
673+
" B\n"
674+
" C\n"
675+
" D\n"
676+
"- E\n"
677+
"+ E modified\n"
678+
" F\n"
679+
" G\n"
680+
" H\n"
681+
"@@ -13,7 +13,7 @@ L " UTF8_TRUNCATED_L_HUNK_HEADER
682+
" M\n"
683+
" N\n"
684+
" O\n"
685+
"- P\n"
686+
"+ P modified\n"
687+
" Q\n"
688+
" R\n"
689+
" S\n";
690+
691+
git_diff_options opts;
692+
git_patch *patch;
693+
git_buf buf = GIT_BUF_INIT;
694+
695+
/* Initialize the diff options for GIT_DIFF_OPTIONS_VERSION. */
cl_git_pass(git_diff_init_options(&opts, GIT_DIFF_OPTIONS_VERSION));
696+
697+
/* Build the patch from the two buffers, then render it into `buf`. */
cl_git_pass(git_patch_from_buffers(&patch, a, strlen(a), NULL, b, strlen(b), NULL, &opts));
698+
cl_git_pass(git_patch_to_buf(&buf, patch));
699+
700+
/* The rendered patch must match `expected` exactly. */
cl_assert_equal_s(expected, buf.ptr);
701+
702+
/* Release the patch and the output buffer. */
git_patch_free(patch);
703+
git_buf_free(&buf);
704+
}

0 commit comments

Comments
 (0)