diff --git a/.github/workflows/semgrep.yml b/.github/workflows/semgrep.yml deleted file mode 100644 index af52f4fcf..000000000 --- a/.github/workflows/semgrep.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Semgrep - SAST Scan - -on: - pull_request_target: - types: [ closed, edited, opened, synchronize, ready_for_review ] - -jobs: - semgrep: - permissions: - contents: read # for actions/checkout to fetch code - security-events: write # for github/codeql-action/upload-sarif to upload SARIF results - actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status - runs-on: ubuntu-latest - container: - image: returntocorp/semgrep - - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - - - name: Checkout semgrep-rules repo - uses: actions/checkout@v4 - with: - repository: sourcegraph/security-semgrep-rules - token: ${{ secrets.GH_SEMGREP_SAST_TOKEN }} - path: semgrep-rules - - - name: Run Semgrep SAST Scan - run: | - mv semgrep-rules ../ - semgrep ci -f ../semgrep-rules/semgrep-rules/ --metrics=off --oss-only --suppress-errors --sarif -o results.sarif --exclude='semgrep-rules' --baseline-commit "$(git merge-base main HEAD)" || true - - name: Upload SARIF file - uses: github/codeql-action/upload-sarif@v3 - with: - sarif_file: results.sarif \ No newline at end of file diff --git a/build/builder.go b/build/builder.go index 498318699..0b4e0bcae 100644 --- a/build/builder.go +++ b/build/builder.go @@ -120,6 +120,9 @@ type Options struct { // ShardPrefix is the prefix of the shard. It defaults to the repository name. ShardPrefix string + + // AllowBinary allows indexing of binary files in the repository. + AllowBinary bool } // HashOptions contains only the options in Options that upon modification leads to IndexState of IndexStateMismatch during the next index building. @@ -607,7 +610,7 @@ func (b *Builder) Add(doc zoekt.Document) error { // files, the corresponding shard would be mostly empty, so // insert a reason here too. doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) - } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { + } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile, b.opts.AllowBinary); err != nil { doc.SkipReason = err.Error() doc.Language = "binary" } @@ -1032,6 +1035,7 @@ func (b *Builder) newShardBuilder() (*zoekt.IndexBuilder, error) { } shardBuilder.IndexTime = b.indexTime shardBuilder.ID = b.id + shardBuilder.AllowBinary = b.opts.AllowBinary return shardBuilder, nil } diff --git a/cmd/zoekt-git-index/main.go b/cmd/zoekt-git-index/main.go index cde395800..b214986af 100644 --- a/cmd/zoekt-git-index/main.go +++ b/cmd/zoekt-git-index/main.go @@ -48,6 +48,8 @@ func run() int { tenantID := flag.Int("tenant_id", 0, "tenant ID to use for indexed repositories") repoID := flag.Uint("repo_id", 0, "opaque ID to use for indexed repositories. Surfaces as `RepositoryID` in the REST search response.") + allowBinary := flag.Bool("allow_binary", false, "allow binary files (containing null bytes) to be indexed.") + cpuProfile := flag.String("cpuprofile", "", "write cpu profile to `file`") flag.Parse() @@ -79,6 +81,7 @@ func run() int { opts.IsDelta = *isDelta opts.RepositoryDescription.TenantID = *tenantID opts.RepositoryDescription.ID = uint32(*repoID) + opts.AllowBinary = *allowBinary var branches []string if *branchesStr != "" { diff --git a/index_test.go b/index_test.go index bdb92f5a4..f2b02d5c2 100644 --- a/index_test.go +++ b/index_test.go @@ -3206,27 +3206,34 @@ func TestDocChecker(t *testing.T) { // Test valid and invalid text for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { - if err := docChecker.Check([]byte(text), 20000, false); err != nil { + if err := docChecker.Check([]byte(text), 20000, false, false); err != nil { t.Errorf("Check(%q): %v", text, err) } } for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { - if err := docChecker.Check([]byte(text), 15, false); err == nil { + if err := docChecker.Check([]byte(text), 15, false, false); err == nil { t.Errorf("Check(%q) succeeded", text) } } // Test valid and invalid text with an allowed large file for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} { - if err := docChecker.Check([]byte(text), 15, true); err != nil { + if err := docChecker.Check([]byte(text), 15, true, false); err != nil { t.Errorf("Check(%q): %v", text, err) } } for _, text := range []string{"zero\x00byte", "xx"} { - if err := docChecker.Check([]byte(text), 15, true); err == nil { + if err := docChecker.Check([]byte(text), 15, true, false); err == nil { t.Errorf("Check(%q) succeeded", text) } } + + // Test allowBinary=true + for _, text := range []string{"zero\x00byte"} { + if err := docChecker.Check([]byte(text), 15, false, true); err != nil { + t.Errorf("Check(%q) failed with allowBinary=true: %v", text, err) + } + } } func TestLineAnd(t *testing.T) { diff --git a/indexbuilder.go b/indexbuilder.go index 027edf9f4..e4da3f9ec 100644 --- a/indexbuilder.go +++ b/indexbuilder.go @@ -211,6 +211,9 @@ type IndexBuilder struct { // a sortable 20 chars long id. ID string + + // AllowBinary allows indexing of binary files (files with null bytes). + AllowBinary bool } func (d *Repository) verify() error { @@ -425,9 +428,11 @@ func DetermineLanguageIfUnknown(doc *Document) { func (b *IndexBuilder) Add(doc Document) error { hasher := crc64.New(crc64.MakeTable(crc64.ISO)) - if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { - doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) - doc.Language = "binary" + if !b.AllowBinary { + if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { + doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) + doc.Language = "binary" + } } if doc.SkipReason != "" { @@ -532,7 +537,7 @@ type DocChecker struct { } // Check returns a reason why the given contents are probably not source texts. -func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { +func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool, allowBinary bool) error { if len(content) == 0 { return nil } @@ -541,8 +546,10 @@ func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile b return fmt.Errorf("file size smaller than %d", ngramSize) } - if index := bytes.IndexByte(content, 0); index > 0 { - return fmt.Errorf("binary data at byte offset %d", index) + if !allowBinary { + if index := bytes.IndexByte(content, 0); index > 0 { + return fmt.Errorf("binary data at byte offset %d", index) + } } // PERF: we only need to do the trigram check if the upperbound on content is greater than