Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions jvector-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@
<version>7.3.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>io.nosqlbench</groupId>
<artifactId>datatools-vectordata</artifactId>
<version>0.1.22</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,6 @@ public interface DataSetLoader {
* @return a {@link DataSet}, if found
*/
Optional<DataSet> loadDataSet(String dataSetName);

/**
 * Returns the name of this loader.
 *
 * @return a short identifier for the loader implementation, e.g. {@code "HDF5"} or {@code "MFD"}
 */
String getName();
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,63 @@
* This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
*/
public class DataSetLoaderHDF5 implements DataSetLoader {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DataSetLoaderHDF5.class);
public static final Path HDF5_DIR = Path.of("hdf5");
private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
public static final String HDF5_EXTN = ".hdf5";

/** Identifier returned by {@link #getName()} for this loader. */
public static final String NAME = "HDF5";

/**
 * Returns the identifier of this loader.
 *
 * @return {@link #NAME}
 */
@Override
public String getName() {
    return NAME;
}

/**
 * Dataset names that this loader is willing to request from ann-benchmarks.com.
 * A name that is not in this set never triggers a download attempt.
 */
private static final java.util.Set<String> KNOWN_DATASETS = java.util.Set.of(
"deep-image-96-angular",
"fashion-mnist-784-euclidean",
"gist-960-euclidean",
"glove-25-angular",
"glove-50-angular",
"glove-100-angular",
"glove-200-angular",
"kosarak-jaccard",
"mnist-784-euclidean",
"movielens10m-jaccard",
"nytimes-256-angular",
"sift-128-euclidean",
"lastfm-64-dot",
"coco-i2i-512-angular",
"coco-t2i-512-angular"
);


/**
 * {@inheritDoc}
 *
 * <p>Names containing {@code ':'} (i.e. carrying a profile) are not handled by this loader.
 * A file already present under {@link #HDF5_DIR} is read directly; otherwise a download is
 * attempted only for names listed in {@code KNOWN_DATASETS}.
 *
 * @param datasetName dataset name without the {@code .hdf5} extension
 * @return the loaded dataset, or empty if this loader does not handle the name or the
 *         download step yields no file
 */
@Override
public Optional<DataSet> loadDataSet(String datasetName) {

    // HDF5 loader does not support profiles
    if (datasetName.contains(":")) {
        logger.trace("Dataset '{}' has a profile, which is not supported by the HDF5 loader.", datasetName);
        return Optional.empty();
    }

    // A local copy needs no download, so check for it before consulting the known list;
    // the list only gates what we are willing to fetch from ann-benchmarks.com.
    var dsFilePath = HDF5_DIR.resolve(datasetName + HDF5_EXTN);
    if (Files.exists(dsFilePath)) {
        logger.trace("Dataset '{}' already downloaded.", datasetName);
        return Optional.of(readHdf5Data(dsFilePath));
    }

    // Not local: only download if it's explicitly known to be on ann-benchmarks.com
    if (!KNOWN_DATASETS.contains(datasetName)) {
        logger.trace("Dataset '{}' not in known list, skipping HDF5 download.", datasetName);
        return Optional.empty();
    }

    return maybeDownloadHdf5(datasetName).map(this::readHdf5Data);
}


private DataSet readHdf5Data(Path path) {

// infer the similarity
Expand Down Expand Up @@ -114,16 +160,12 @@ else if (filename.toString().contains("-euclidean")) {
}

private Optional<Path> maybeDownloadHdf5(String datasetName) {

var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN);

if (Files.exists(dsFilePath)) {
return Optional.of(dsFilePath);
}
var dsFilePath = HDF5_DIR.resolve(datasetName + HDF5_EXTN);

// Download from https://ann-benchmarks.com/datasetName
var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN;
System.out.println("Downloading: " + url);
logger.info("Downloading: {}", url);


HttpURLConnection connection;
while (true) {
Expand All @@ -139,7 +181,7 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {
}
if (responseCode == HttpURLConnection.HTTP_MOVED_PERM || responseCode == HttpURLConnection.HTTP_MOVED_TEMP) {
String newUrl = connection.getHeaderField("Location");
System.out.println("Redirect detected to URL: " + newUrl);
logger.info("Redirect detected to URL: {}", newUrl);
url = newUrl;
} else {
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,21 @@ public class DataSetLoaderMFD implements DataSetLoader {
private static final String bucketName = "astra-vector";
private static final List<String> bucketNames = List.of(bucketName, infraBucketName);

/** Identifier returned by {@link #getName()} for this loader. */
public static final String NAME = "MFD";

/**
 * Returns the identifier of this loader.
 *
 * @return {@link #NAME}
 */
@Override
public String getName() {
    return NAME;
}

/**
 * {@inheritDoc}
 *
 * <p>Names containing {@code ':'} (i.e. carrying a profile) are not handled by this loader
 * and yield an empty result without any download attempt.
 *
 * @param fileName name of the dataset to load
 * @return the loaded dataset, or empty if the name carries a profile or the download step
 *         yields nothing
 */
@Override
public Optional<DataSet> loadDataSet(String fileName) {

    // Profiles are not supported by the MFD loader
    if (fileName.contains(":")) {
        logger.trace("Dataset {} with profile is not supported by MFD loader", fileName);
        return Optional.empty();
    }

    return maybeDownloadFvecs(fileName).map(MultiFileDatasource::load);
}

Expand Down
Loading