Skip to content

Commit 8175ecc

Browse files
Shivaji Kharse
authored and committed
resolve review comments
1 parent 84a5ef5 commit 8175ecc

7 files changed

Lines changed: 362 additions & 100 deletions

File tree

.github/workflows/ci-dgraph-integration2-tests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ jobs:
3333
uses: actions/cache/restore@v4
3434
with:
3535
path: dgraphtest/datafiles
36-
key: dataset-dgraphtest-v1
36+
key: dataset-dgraphtest-${{ hashFiles('testutil/benchmark-data-version') }}
3737

3838
- name: Ensure datafiles directory
3939
run: mkdir -p dgraphtest/datafiles
@@ -70,4 +70,4 @@ jobs:
7070
uses: actions/cache/save@v4
7171
with:
7272
path: dgraphtest/datafiles
73-
key: dataset-dgraphtest-v1
73+
key: dataset-dgraphtest-${{ hashFiles('testutil/benchmark-data-version') }}

.github/workflows/ci-dgraph-ldbc-tests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
uses: actions/cache/restore@v4
3333
with:
3434
path: ${{ github.workspace }}/test-data
35-
key: dataset-ldbc-v1
35+
key: dataset-ldbc-${{ hashFiles('testutil/benchmark-data-version') }}
3636

3737
- name: Set up Go
3838
uses: actions/setup-go@v6
@@ -67,7 +67,7 @@ jobs:
6767
# move the binary
6868
cp dgraph/dgraph ~/go/bin/dgraph
6969
# run the ldbc tests
70-
cd t; ./t --suite=ldbc --tmp=${{ github.workspace }}/test-data
70+
cd t; ./t --suite=ldbc --tmp=${{ github.workspace }}/test-data --keep-data
7171
# clean up docker containers after test execution
7272
./t -r
7373
@@ -76,4 +76,4 @@ jobs:
7676
uses: actions/cache/save@v4
7777
with:
7878
path: ${{ github.workspace }}/test-data
79-
key: dataset-ldbc-v1
79+
key: dataset-ldbc-${{ hashFiles('testutil/benchmark-data-version') }}

.github/workflows/ci-dgraph-load-tests.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
uses: actions/cache/restore@v4
3232
with:
3333
path: ${{ github.workspace }}/test-data
34-
key: dataset-load-v1
34+
key: dataset-load-${{ hashFiles('testutil/benchmark-data-version') }}
3535

3636
- name: Set up Go
3737
uses: actions/setup-go@v6
@@ -66,7 +66,7 @@ jobs:
6666
# move the binary
6767
cp dgraph/dgraph ~/go/bin/dgraph
6868
# run the load tests
69-
cd t; ./t --suite=load --tmp=${{ github.workspace }}/test-data
69+
cd t; ./t --suite=load --tmp=${{ github.workspace }}/test-data --keep-data
7070
# clean up docker containers after test execution
7171
./t -r
7272
# sleep
@@ -77,4 +77,4 @@ jobs:
7777
uses: actions/cache/save@v4
7878
with:
7979
path: ${{ github.workspace }}/test-data
80-
key: dataset-load-v1
80+
key: dataset-load-${{ hashFiles('testutil/benchmark-data-version') }}

dgraphtest/load.go

Lines changed: 19 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@ import (
2020
"runtime"
2121
"strconv"
2222
"strings"
23-
"time"
2423

2524
"github.com/pkg/errors"
2625

2726
"github.com/dgraph-io/dgo/v250/protos/api"
2827
"github.com/dgraph-io/dgraph/v25/dgraphapi"
2928
"github.com/dgraph-io/dgraph/v25/enc"
29+
"github.com/dgraph-io/dgraph/v25/testutil"
3030
"github.com/dgraph-io/dgraph/v25/x"
3131
)
3232

@@ -41,11 +41,13 @@ func (c *LocalCluster) HostDgraphBinaryPath() string {
4141
return filepath.Join(c.tempBinDir, "dgraph_host")
4242
}
4343

44-
var datafiles = map[string]string{
45-
"1million.schema": "https://raw.githubusercontent.com/dgraph-io/dgraph-benchmarks/refs/heads/main/data/1million.schema",
46-
"1million.rdf.gz": "https://media.githubusercontent.com/media/dgraph-io/dgraph-benchmarks/refs/heads/main/data/1million.rdf.gz",
47-
"21million.schema": "https://raw.githubusercontent.com/dgraph-io/dgraph-benchmarks/refs/heads/main/data/21million.schema",
48-
"21million.rdf.gz": "https://media.githubusercontent.com/media/dgraph-io/dgraph-benchmarks/refs/heads/main/data/21million.rdf.gz",
44+
// datafilePaths maps filenames to their paths inside the dgraph-benchmarks repo.
45+
// URLs are constructed at runtime by combining these with the configured ref.
46+
var datafilePaths = map[string]string{
47+
"1million.schema": "data/1million.schema",
48+
"1million.rdf.gz": "data/1million.rdf.gz",
49+
"21million.schema": "data/21million.schema",
50+
"21million.rdf.gz": "data/21million.rdf.gz",
4951
}
5052

5153
type DatasetType int
@@ -592,35 +594,21 @@ func (d *Dataset) GqlSchemaPath() string {
592594

593595
func (d *Dataset) ensureFile(filename string) string {
594596
fullPath := filepath.Join(datasetFilesPath, filename)
595-
if exists, _ := fileExists(fullPath); !exists {
596-
url, ok := datafiles[filename]
597+
if !testutil.FileExistsAndValid(fullPath) {
598+
repoPath, ok := datafilePaths[filename]
597599
if !ok {
598-
panic(fmt.Sprintf("dataset file %s not found in datafiles map", filename))
600+
panic(fmt.Sprintf("dataset file %s not found in datafilePaths map", filename))
599601
}
600-
if err := downloadFile(filename, url); err != nil {
602+
ref := testutil.BenchmarkDataRef("")
603+
var err error
604+
if strings.HasSuffix(filename, ".rdf.gz") {
605+
err = testutil.DownloadLFSFile(filename, ref, repoPath, datasetFilesPath)
606+
} else {
607+
err = testutil.DownloadFile(filename, testutil.BenchmarkRawURL(ref, repoPath), datasetFilesPath)
608+
}
609+
if err != nil {
601610
panic(fmt.Sprintf("failed to download %s: %v", filename, err))
602611
}
603612
}
604613
return fullPath
605614
}
606-
607-
func downloadFile(fname, url string) error {
608-
const maxRetries = 3
609-
fpath := filepath.Join(datasetFilesPath, fname)
610-
for attempt := 1; attempt <= maxRetries; attempt++ {
611-
cmd := exec.Command("wget", "--tries=3", "--waitretry=5", "--retry-connrefused", "-O", fname, url)
612-
cmd.Dir = datasetFilesPath
613-
614-
if out, err := cmd.CombinedOutput(); err != nil {
615-
log.Printf("attempt %d/%d failed to download %s: %v\n%s", attempt, maxRetries, fname, err, string(out))
616-
if attempt < maxRetries {
617-
time.Sleep(time.Duration(attempt*5) * time.Second)
618-
continue
619-
}
620-
_ = os.Remove(fpath)
621-
return fmt.Errorf("error downloading file %s after %d attempts: %w", fname, maxRetries, err)
622-
}
623-
return nil
624-
}
625-
return nil
626-
}

t/t.go

Lines changed: 74 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/docker/docker/client"
3434
"github.com/golang/glog"
3535
"github.com/spf13/pflag"
36+
"golang.org/x/sync/errgroup"
3637
"golang.org/x/tools/go/packages"
3738

3839
"github.com/dgraph-io/dgraph/v25/testutil"
@@ -93,7 +94,13 @@ var (
9394
"unit = true unit tests only (no Docker, no integration tag). "+
9495
"integration = everything except ldbc, load, and systest-heavy (with Docker). "+
9596
"systest = systest-baseline + systest-heavy.")
96-
tmp = pflag.String("tmp", "", "Temporary directory used to download data.")
97+
tmp = pflag.String("tmp", "", "Temporary directory used to download data.")
98+
keepData = pflag.Bool("keep-data", false,
99+
"If true, do not remove the data directory after tests complete. "+
100+
"Useful in CI where data is cached between runs.")
101+
dataRef = pflag.String("data-ref", "",
102+
"Git ref (branch, tag, or SHA) for dgraph-benchmarks data. "+
103+
"Overrides DGRAPH_TEST_DATA_REF env var and benchmark-data-version file.")
97104
downloadResources = pflag.BoolP("download", "d", true,
98105
"Flag to specify whether to download resources or not")
99106
race = pflag.Bool("race", false, "Set true to build with race")
@@ -1121,18 +1128,17 @@ func isHeavyPackage(pkg string) bool {
11211128
return false
11221129
}
11231130

1124-
var datafiles = map[string]string{
1125-
"1million-noindex.schema": "https://raw.githubusercontent.com/dgraph-io/dgraph-benchmarks/refs/heads/main/data/1million-noindex.schema",
1126-
"1million.schema": "https://raw.githubusercontent.com/dgraph-io/dgraph-benchmarks/refs/heads/main/data/1million.schema",
1127-
"1million.rdf.gz": "https://media.githubusercontent.com/media/dgraph-io/dgraph-benchmarks/refs/heads/main/data/1million.rdf.gz",
1128-
"21million.schema": "https://raw.githubusercontent.com/dgraph-io/dgraph-benchmarks/refs/heads/main/data/21million.schema",
1129-
"21million.rdf.gz": "https://media.githubusercontent.com/media/dgraph-io/dgraph-benchmarks/refs/heads/main/data/21million.rdf.gz",
1131+
// datafilePaths maps filenames to their paths inside the dgraph-benchmarks repo.
1132+
// URLs are constructed at runtime from BenchmarkDataRef().
1133+
var datafilePaths = map[string]string{
1134+
"1million-noindex.schema": "data/1million-noindex.schema",
1135+
"1million.schema": "data/1million.schema",
1136+
"1million.rdf.gz": "data/1million.rdf.gz",
1137+
"21million.schema": "data/21million.schema",
1138+
"21million.rdf.gz": "data/21million.rdf.gz",
11301139
}
11311140

1132-
var baseUrl = "https://media.githubusercontent.com/media/dgraph-io/dgraph-benchmarks/refs/heads/main/ldbc/sf0.3/ldbc_rdf_0.3/"
1133-
var suffix = "?raw=true"
1134-
1135-
var rdfFileNames = [...]string{
1141+
var ldbcRdfFileNames = [...]string{
11361142
"Deltas.rdf",
11371143
"comment_0.rdf",
11381144
"containerOf_0.rdf",
@@ -1156,83 +1162,81 @@ var rdfFileNames = [...]string{
11561162
"studyAt_0.rdf",
11571163
"tag_0.rdf",
11581164
"tagclass_0.rdf",
1159-
"workAt_0.rdf"}
1160-
1161-
var ldbcDataFiles = map[string]string{
1162-
"ldbcTypes.schema": "https://media.githubusercontent.com/media/dgraph-io/dgraph-benchmarks/refs/heads/main/ldbc/sf0.3/ldbcTypes.schema",
1165+
"workAt_0.rdf",
11631166
}
11641167

1165-
func wgetWithRetry(fname, url, dir string) error {
1166-
const maxRetries = 3
1167-
fpath := filepath.Join(dir, fname)
1168-
for attempt := 1; attempt <= maxRetries; attempt++ {
1169-
cmd := exec.Command("wget", "--tries=3", "--waitretry=5", "--retry-connrefused", "-O", fname, url)
1170-
cmd.Dir = dir
1171-
if out, err := cmd.CombinedOutput(); err != nil {
1172-
fmt.Printf("attempt %d/%d failed to download %s: %v\n%s\n", attempt, maxRetries, fname, err, string(out))
1173-
if attempt < maxRetries {
1174-
time.Sleep(time.Duration(attempt*5) * time.Second)
1175-
continue
1176-
}
1177-
_ = os.Remove(fpath)
1178-
return fmt.Errorf("failed to download %s after %d attempts: %w", fname, maxRetries, err)
1179-
}
1180-
return nil
1181-
}
1182-
return nil
1168+
// ldbcFilePaths maps filenames to their paths inside the dgraph-benchmarks repo.
1169+
var ldbcFilePaths = map[string]string{
1170+
"ldbcTypes.schema": "ldbc/sf0.3/ldbcTypes.schema",
11831171
}
11841172

1185-
func downloadDataFiles() {
1173+
func downloadDataFiles() error {
11861174
if !*downloadResources {
11871175
fmt.Print("Skipping downloading of resources\n")
1188-
return
1176+
return nil
11891177
}
1190-
for fname, link := range datafiles {
1178+
ref := testutil.BenchmarkDataRef(*dataRef)
1179+
fmt.Printf("Using benchmark data ref: %s\n", ref)
1180+
for fname, repoPath := range datafilePaths {
11911181
fpath := filepath.Join(*tmp, fname)
1192-
if fi, err := os.Stat(fpath); err == nil && fi.Size() > 0 {
1182+
if testutil.FileExistsAndValid(fpath) {
11931183
fmt.Printf("Skipping %s (already exists)\n", fname)
11941184
continue
11951185
}
1196-
if err := wgetWithRetry(fname, link, *tmp); err != nil {
1197-
panic(fmt.Sprintf("error downloading %s: %v", fname, err))
1186+
var err error
1187+
if strings.HasSuffix(fname, ".rdf.gz") {
1188+
err = testutil.DownloadLFSFile(fname, ref, repoPath, *tmp)
1189+
} else {
1190+
err = testutil.DownloadFile(fname, testutil.BenchmarkRawURL(ref, repoPath), *tmp)
1191+
}
1192+
if err != nil {
1193+
return fmt.Errorf("error downloading %s: %v", fname, err)
11981194
}
11991195
}
1196+
return nil
12001197
}
12011198

1202-
func downloadLDBCFiles(dir string) {
1199+
func downloadLDBCFiles(dir string) error {
12031200
if !*downloadResources {
12041201
fmt.Print("Skipping downloading of resources\n")
1205-
return
1202+
return nil
12061203
}
12071204

1208-
for _, name := range rdfFileNames {
1209-
ldbcDataFiles[name] = baseUrl + name + suffix
1205+
ref := testutil.BenchmarkDataRef(*dataRef)
1206+
fmt.Printf("Using benchmark data ref: %s\n", ref)
1207+
1208+
// All LDBC files (schema + RDF) are LFS-tracked.
1209+
allFiles := make(map[string]string)
1210+
for fname, repoPath := range ldbcFilePaths {
1211+
allFiles[fname] = repoPath
1212+
}
1213+
for _, name := range ldbcRdfFileNames {
1214+
allFiles[name] = "ldbc/sf0.3/ldbc_rdf_0.3/" + name
12101215
}
12111216

12121217
start := time.Now()
1213-
sem := make(chan struct{}, 5)
1214-
var wg sync.WaitGroup
1215-
for fname, link := range ldbcDataFiles {
1218+
g, _ := errgroup.WithContext(context.Background())
1219+
g.SetLimit(5)
1220+
for fname, repoPath := range allFiles {
12161221
fpath := filepath.Join(dir, fname)
1217-
if fi, err := os.Stat(fpath); err == nil && fi.Size() > 0 {
1222+
if testutil.FileExistsAndValid(fpath) {
12181223
fmt.Printf("Skipping %s (already exists)\n", fname)
12191224
continue
12201225
}
1221-
wg.Add(1)
1222-
go func(fname, link string) {
1223-
defer wg.Done()
1224-
sem <- struct{}{}
1225-
defer func() { <-sem }()
1226-
1226+
g.Go(func() error {
12271227
dlStart := time.Now()
1228-
if err := wgetWithRetry(fname, link, dir); err != nil {
1229-
panic(fmt.Sprintf("error downloading %s: %v", fname, err))
1228+
if err := testutil.DownloadLFSFile(fname, ref, repoPath, dir); err != nil {
1229+
return fmt.Errorf("error downloading %s: %v", fname, err)
12301230
}
1231-
fmt.Printf("Downloaded %s to %s in %s \n", fname, dir, time.Since(dlStart))
1232-
}(fname, link)
1231+
fmt.Printf("Downloaded %s to %s in %s\n", fname, dir, time.Since(dlStart))
1232+
return nil
1233+
})
12331234
}
1234-
wg.Wait()
1235-
fmt.Printf("Downloaded %d files in %s \n", len(ldbcDataFiles), time.Since(start))
1235+
if err := g.Wait(); err != nil {
1236+
return err
1237+
}
1238+
fmt.Printf("Downloaded %d files in %s\n", len(allFiles), time.Since(start))
1239+
return nil
12361240
}
12371241

12381242
func createTestCoverageFile(path string) error {
@@ -1419,15 +1423,21 @@ func run() error {
14191423
x.Check(os.MkdirAll(*tmp, 0755))
14201424
}
14211425
if testSuiteContainsAny("load", "all") {
1422-
downloadDataFiles()
1426+
if err := downloadDataFiles(); err != nil {
1427+
fmt.Printf("Failed to download data files: %v\n", err)
1428+
return
1429+
}
14231430
}
14241431
if testSuiteContainsAny("ldbc", "all") {
14251432
// LDBC files go into a subdirectory because the LDBC test bulk-loads
14261433
// the entire directory (-f <dir>). Mixing load data (1million, 21million)
14271434
// with LDBC data causes schema mismatches.
14281435
ldbcDir := filepath.Join(*tmp, "ldbc")
14291436
x.Check(os.MkdirAll(ldbcDir, 0755))
1430-
downloadLDBCFiles(ldbcDir)
1437+
if err := downloadLDBCFiles(ldbcDir); err != nil {
1438+
fmt.Printf("Failed to download LDBC files: %v\n", err)
1439+
return
1440+
}
14311441
}
14321442
for i, task := range valid {
14331443
select {
@@ -1478,6 +1488,9 @@ func main() {
14781488
procId = rand.Intn(1000)
14791489

14801490
err := run()
1491+
if !*keepData && *tmp != "" {
1492+
_ = os.RemoveAll(*tmp)
1493+
}
14811494
if err != nil {
14821495
os.Exit(1)
14831496
}

testutil/benchmark-data-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
main

0 commit comments

Comments (0)