sourcebot-dev · sudhanshu112233shukla · Jan 20, 2026 · Jan 21, 2026 · Jan 21, 2026
diff --git a/.github/workflows/semgrep.yml b/.github/workflows/semgrep.yml
diff --git a/build/builder.go b/build/builder.go
@@ -120,6 +120,9 @@ type Options struct {
 
 	// ShardPrefix is the prefix of the shard. It defaults to the repository name.
 	ShardPrefix string
+
+	// AllowBinary allows indexing of binary files (files containing null bytes).
+	AllowBinary bool
 }
 
 // HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building.
@@ -607,7 +610,7 @@ func (b *Builder) Add(doc zoekt.Document) error {
 		// files, the corresponding shard would be mostly empty, so
 		// insert a reason here too.
 		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
-	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
+	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile, b.opts.AllowBinary); err != nil {
 		doc.SkipReason = err.Error()
 		doc.Language = "binary"
 	}
@@ -1032,6 +1035,7 @@ func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
 	}
 	shardBuilder.IndexTime = b.indexTime
 	shardBuilder.ID = b.id
+	shardBuilder.AllowBinary = b.opts.AllowBinary
 	return shardBuilder, nil
 }
 

diff --git a/cmd/zoekt-git-index/main.go b/cmd/zoekt-git-index/main.go
@@ -48,6 +48,8 @@ func run() int {
 	tenantID := flag.Int("tenant_id", 0, "tenant ID to use for indexed repositories")
 	repoID := flag.Uint("repo_id", 0, "opaque ID to use for indexed repositories. Surfaces as `RepositoryID` in the REST search response.")
 
+	allowBinary := flag.Bool("allow_binary", false, "allow binary files (containing null bytes) to be indexed.")
+
 	cpuProfile := flag.String("cpuprofile", "", "write cpu profile to `file`")
 
 	flag.Parse()
@@ -79,6 +81,7 @@ func run() int {
 	opts.IsDelta = *isDelta
 	opts.RepositoryDescription.TenantID = *tenantID
 	opts.RepositoryDescription.ID = uint32(*repoID)
+	opts.AllowBinary = *allowBinary
 
 	var branches []string
 	if *branchesStr != "" {

diff --git a/index_test.go b/index_test.go
@@ -3206,27 +3206,34 @@ func TestDocChecker(t *testing.T) {
 
 	// Test valid and invalid text
 	for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} {
-		if err := docChecker.Check([]byte(text), 20000, false); err != nil {
+		if err := docChecker.Check([]byte(text), 20000, false, false); err != nil {
 			t.Errorf("Check(%q): %v", text, err)
 		}
 	}
 	for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} {
-		if err := docChecker.Check([]byte(text), 15, false); err == nil {
+		if err := docChecker.Check([]byte(text), 15, false, false); err == nil {
 			t.Errorf("Check(%q) succeeded", text)
 		}
 	}
 
 	// Test valid and invalid text with an allowed large file
 	for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} {
-		if err := docChecker.Check([]byte(text), 15, true); err != nil {
+		if err := docChecker.Check([]byte(text), 15, true, false); err != nil {
 			t.Errorf("Check(%q): %v", text, err)
 		}
 	}
 	for _, text := range []string{"zero\x00byte", "xx"} {
-		if err := docChecker.Check([]byte(text), 15, true); err == nil {
+		if err := docChecker.Check([]byte(text), 15, true, false); err == nil {
 			t.Errorf("Check(%q) succeeded", text)
 		}
 	}
+
+	// Test that content containing a null byte is accepted when allowBinary=true
+	for _, text := range []string{"zero\x00byte"} {
+		if err := docChecker.Check([]byte(text), 15, false, true); err != nil {
+			t.Errorf("Check(%q) failed with allowBinary=true: %v", text, err)
+		}
+	}
 }
 
 func TestLineAnd(t *testing.T) {

diff --git a/indexbuilder.go b/indexbuilder.go
@@ -211,6 +211,9 @@ type IndexBuilder struct {
 
 	// a sortable 20 chars long id.
 	ID string
+
+	// AllowBinary allows indexing of binary files (files with null bytes).
+	AllowBinary bool
 }
 
 func (d *Repository) verify() error {
@@ -425,9 +428,11 @@ func DetermineLanguageIfUnknown(doc *Document) {
 func (b *IndexBuilder) Add(doc Document) error {
 	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
 
-	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
-		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
-		doc.Language = "binary"
+	if !b.AllowBinary {
+		if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
+			doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
+			doc.Language = "binary"
+		}
 	}
 
 	if doc.SkipReason != "" {
@@ -532,7 +537,7 @@ type DocChecker struct {
 }
 
 // Check returns a reason why the given contents are probably not source texts.
-func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
+func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool, allowBinary bool) error {
 	if len(content) == 0 {
 		return nil
 	}
@@ -541,8 +546,10 @@ func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile b
 		return fmt.Errorf("file size smaller than %d", ngramSize)
 	}
 
-	if index := bytes.IndexByte(content, 0); index > 0 {
-		return fmt.Errorf("binary data at byte offset %d", index)
+	if !allowBinary {
+		if index := bytes.IndexByte(content, 0); index > 0 {
+			return fmt.Errorf("binary data at byte offset %d", index)
+		}
 	}
 
 	// PERF: we only need to do the trigram check if the upperbound on content is greater than