Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 0 additions & 37 deletions .github/workflows/semgrep.yml

This file was deleted.

6 changes: 5 additions & 1 deletion build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ type Options struct {

// ShardPrefix is the prefix of the shard. It defaults to the repository name.
ShardPrefix string

// AllowBinary allows indexing of binary files in the repository.
AllowBinary bool
}

// HashOptions contains only the options in Options that, when modified, lead to an IndexState of IndexStateMismatch during the next index build.
Expand Down Expand Up @@ -607,7 +610,7 @@ func (b *Builder) Add(doc zoekt.Document) error {
// files, the corresponding shard would be mostly empty, so
// insert a reason here too.
doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile, b.opts.AllowBinary); err != nil {
doc.SkipReason = err.Error()
doc.Language = "binary"
}
Expand Down Expand Up @@ -1032,6 +1035,7 @@ func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) {
}
shardBuilder.IndexTime = b.indexTime
shardBuilder.ID = b.id
shardBuilder.AllowBinary = b.opts.AllowBinary
return shardBuilder, nil
}

Expand Down
3 changes: 3 additions & 0 deletions cmd/zoekt-git-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ func run() int {
tenantID := flag.Int("tenant_id", 0, "tenant ID to use for indexed repositories")
repoID := flag.Uint("repo_id", 0, "opaque ID to use for indexed repositories. Surfaces as `RepositoryID` in the REST search response.")

allowBinary := flag.Bool("allow_binary", false, "allow binary files (containing null bytes) to be indexed.")

cpuProfile := flag.String("cpuprofile", "", "write cpu profile to `file`")

flag.Parse()
Expand Down Expand Up @@ -79,6 +81,7 @@ func run() int {
opts.IsDelta = *isDelta
opts.RepositoryDescription.TenantID = *tenantID
opts.RepositoryDescription.ID = uint32(*repoID)
opts.AllowBinary = *allowBinary

var branches []string
if *branchesStr != "" {
Expand Down
15 changes: 11 additions & 4 deletions index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3206,27 +3206,34 @@ func TestDocChecker(t *testing.T) {

// Test valid and invalid text
for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} {
if err := docChecker.Check([]byte(text), 20000, false); err != nil {
if err := docChecker.Check([]byte(text), 20000, false, false); err != nil {
t.Errorf("Check(%q): %v", text, err)
}
}
for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} {
if err := docChecker.Check([]byte(text), 15, false); err == nil {
if err := docChecker.Check([]byte(text), 15, false, false); err == nil {
t.Errorf("Check(%q) succeeded", text)
}
}

// Test valid and invalid text with an allowed large file
for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} {
if err := docChecker.Check([]byte(text), 15, true); err != nil {
if err := docChecker.Check([]byte(text), 15, true, false); err != nil {
t.Errorf("Check(%q): %v", text, err)
}
}
for _, text := range []string{"zero\x00byte", "xx"} {
if err := docChecker.Check([]byte(text), 15, true); err == nil {
if err := docChecker.Check([]byte(text), 15, true, false); err == nil {
t.Errorf("Check(%q) succeeded", text)
}
}

// Test allowBinary=true
for _, text := range []string{"zero\x00byte"} {
if err := docChecker.Check([]byte(text), 15, false, true); err != nil {
t.Errorf("Check(%q) failed with allowBinary=true: %v", text, err)
}
}
}

func TestLineAnd(t *testing.T) {
Expand Down
19 changes: 13 additions & 6 deletions indexbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,9 @@ type IndexBuilder struct {

// a sortable, 20-character-long ID.
ID string

// AllowBinary allows indexing of binary files (files with null bytes).
AllowBinary bool
}

func (d *Repository) verify() error {
Expand Down Expand Up @@ -425,9 +428,11 @@ func DetermineLanguageIfUnknown(doc *Document) {
func (b *IndexBuilder) Add(doc Document) error {
hasher := crc64.New(crc64.MakeTable(crc64.ISO))

if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
doc.Language = "binary"
if !b.AllowBinary {
if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
doc.Language = "binary"
}
}

if doc.SkipReason != "" {
Expand Down Expand Up @@ -532,7 +537,7 @@ type DocChecker struct {
}

// Check returns a reason why the given contents are probably not source texts.
func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool, allowBinary bool) error {
if len(content) == 0 {
return nil
}
Expand All @@ -541,8 +546,10 @@ func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile b
return fmt.Errorf("file size smaller than %d", ngramSize)
}

if index := bytes.IndexByte(content, 0); index > 0 {
return fmt.Errorf("binary data at byte offset %d", index)
if !allowBinary {
if index := bytes.IndexByte(content, 0); index > 0 {
return fmt.Errorf("binary data at byte offset %d", index)
}
}

// PERF: we only need to do the trigram check if the upperbound on content is greater than
Expand Down
Loading