From c93b797cec8a563d0a548d52cf91e8f4f5fe4c3f Mon Sep 17 00:00:00 2001 From: romit Date: Fri, 12 Dec 2025 11:00:17 +0000 Subject: [PATCH 1/2] Added kmeans from cuml Signed-off-by: romit --- .../online-data-mixing/artifacts/custom_loop_usage.py | 10 +++++----- plugins/online-data-mixing/pyproject.toml | 7 +++++-- .../src/fms_acceleration_odm/odm/auto_categorizer.py | 9 ++++++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/plugins/online-data-mixing/artifacts/custom_loop_usage.py b/plugins/online-data-mixing/artifacts/custom_loop_usage.py index 4c7f0cba..e4ec15c0 100644 --- a/plugins/online-data-mixing/artifacts/custom_loop_usage.py +++ b/plugins/online-data-mixing/artifacts/custom_loop_usage.py @@ -19,7 +19,7 @@ from fms_acceleration_odm import OnlineMixingDataset from fms_acceleration_odm.odm.reward import Reward -model_name = "ibm-granite/granite-4.0-h-1b" +model_name = "ibm-granite/granite-4.0-350m" output_dir = "./odm_custom_use" max_steps = 125 batch_size = 4 @@ -27,10 +27,10 @@ # odm related step_idx = 0 -update_interval = 1 # every step +update_interval = 10 # every 10 steps # model -model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16) +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) # tokenizer tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -102,7 +102,7 @@ def collate_fn(batch, tokenizer): dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=None) # distributed setup -dataloader_config = DataLoaderConfiguration(split_batches=True, dispatch_batches=True) +dataloader_config = DataLoaderConfiguration(dispatch_batches=False) accelerator = Accelerator(dataloader_config=dataloader_config) model, dataloader = accelerator.prepare(model, dataloader) @@ -141,7 +141,7 @@ class State: if step_idx % update_interval == 0: with torch.no_grad(): model.eval() - dataloader.dataset.update_sampling_weights(model, accelerator, state) + dataset.update_sampling_weights(model, accelerator, state) model.train() if step_idx > max_steps: break diff --git a/plugins/online-data-mixing/pyproject.toml b/plugins/online-data-mixing/pyproject.toml index d459ceea..8c8f588a 100644 --- a/plugins/online-data-mixing/pyproject.toml +++ b/plugins/online-data-mixing/pyproject.toml @@ -15,7 +15,7 @@ license = {text = "Apache-2.0"} readme = "README.md" requires-python = "~=3.11" keywords = ['fms-hf-tuning', 'acceleration', 'online-data-mixing'] -classifiers=[ +classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 4 - Beta", "Programming Language :: Python :: 3", @@ -23,6 +23,9 @@ classifiers=[ ] dependencies = [ + "torch==2.8.0", + "torchvision==0.23.0", + "torchaudio==2.8.0", "scikit-learn", "datasets==4.*", "torchdata==0.11.0", @@ -43,4 +46,4 @@ include = [ "src", "pyproject.toml", "README.md", -] \ No newline at end of file +] diff --git a/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py b/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py index af75b0f1..b1cd62ac 100644 --- a/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py +++ b/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py @@ -27,7 +27,6 @@ # Third Party from datasets import Dataset, DatasetDict from sentence_transformers import SentenceTransformer -from sklearn.cluster import KMeans import numpy as np import torch @@ -175,6 +174,14 @@ def _cluster_embeddings( "Unsupported clustering algorithm '%s'. Only 'kmeans' is currently supported." % self.config.cluster_algo ) + + try: + from cuml import KMeans + print(f"Using GPU accelerated Kmeans") + except ImportError as e: + print("GPU accelerated KMeans is not avaialble. Falling back to CPU based KMeans") + from sklearn.cluster import KMeans + kwargs = {"n_init": 10} kwargs.update(self.config.cluster_kwargs) model = KMeans(n_clusters=num_categories, **kwargs) From b928325c13856245139e2c225bc41c1f5249040c Mon Sep 17 00:00:00 2001 From: romit Date: Fri, 12 Dec 2025 14:49:36 +0000 Subject: [PATCH 2/2] Fixed ci cd Signed-off-by: romit --- .../src/fms_acceleration_odm/odm/auto_categorizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py b/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py index b1cd62ac..0ac3332c 100644 --- a/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py +++ b/plugins/online-data-mixing/src/fms_acceleration_odm/odm/auto_categorizer.py @@ -176,11 +176,11 @@ def _cluster_embeddings( ) try: - from cuml import KMeans - print(f"Using GPU accelerated Kmeans") - except ImportError as e: + from cuml import KMeans # pylint: disable=import-outside-toplevel + print("Using GPU accelerated Kmeans") + except ImportError: print("GPU accelerated KMeans is not avaialble. Falling back to CPU based KMeans") - from sklearn.cluster import KMeans + from sklearn.cluster import KMeans # pylint: disable=import-outside-toplevel kwargs = {"n_init": 10} kwargs.update(self.config.cluster_kwargs)