From c3fa9350bf0aa7f1d97bb0363c0d4ab0f589e9dd Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 13 May 2025 11:07:38 +0800
Subject: [PATCH 1/4] multi devices support

---
 cv/classification/main.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/cv/classification/main.py b/cv/classification/main.py
index b780d7e9..91f9de19 100755
--- a/cv/classification/main.py
+++ b/cv/classification/main.py
@@ -122,10 +122,17 @@ def parse_option():
         required=False,
         help="local rank for DistributedDataParallel",
     )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Specify the device to run the model on. Options: 'cuda', 'cpu', or 'npu'.",
+    )
 
     args, unparsed = parser.parse_known_args()
 
     config = get_config(args)
+    config["DEVICE"] = args.device.lower()
 
     return args, config
 
@@ -141,7 +148,7 @@ def main(config):
 
     logger.info(f"Creating model:{config.MODEL.ARCH}")
     model = build_model(config)
-    model.cuda()
+    model.to(config.DEVICE)
 
     optimizer = build_optimizer(config, model)
     model = flow.nn.parallel.DistributedDataParallel(model, broadcast_buffers=False, use_bucket=False)
@@ -255,8 +262,8 @@ def train_one_epoch(
     start = time.time()
     end = time.time()
     for idx, (samples, targets) in enumerate(data_loader):
-        samples = samples.cuda()
-        targets = targets.cuda()
+        samples = samples.to(config.DEVICE)
+        targets = targets.to(config.DEVICE).to(flow.int32)
 
         if mixup_fn is not None:
             samples, targets = mixup_fn(samples, targets)
@@ -324,8 +331,8 @@ def validate(config, data_loader, model):
 
     end = time.time()
     for idx, (images, target) in enumerate(data_loader):
-        images = images.cuda()
-        target = target.cuda()
+        images = images.to(config.DEVICE)
+        target = target.to(config.DEVICE).to(flow.int32)
 
         # compute output
         output = model(images)
@@ -370,7 +377,7 @@ def throughput(data_loader, model, logger):
     model.eval()
 
     for idx, (images, _) in enumerate(data_loader):
-        images = images.cuda()
+        images = images.to(config.DEVICE)
         batch_size = images.shape[0]
         for i in range(50):
             model(images)
@@ -453,4 +460,7 @@ def throughput(data_loader, model, logger):
     # print config
     logger.info(config.dump())
 
+    if config.DEVICE == "npu":
+        import oneflow_npu
+
     main(config)

From 656ca33820c9e8d6114cf9847a3cc6079a58ce84 Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 13 May 2025 11:11:16 +0800
Subject: [PATCH 2/4] update README

---
 cv/classification/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cv/classification/README.md b/cv/classification/README.md
index 41020387..bf04f48a 100644
--- a/cv/classification/README.md
+++ b/cv/classification/README.md
@@ -105,5 +105,13 @@ Bash script `infer.sh` is used to infer the trained model.
 sh infer.sh
 ```
 
+### Multi-Device Support (Experimental)
 
+This branch introduces preliminary support for running on different device types. To train on an NPU device, add the following argument to your train.sh command:
+
+```bash
+--device=npu
+```
+
+> Note: The label_smoothing feature is currently not supported in this branch. If your configuration file (e.g., configs/default_settings.yaml) includes label_smoothing, please disable it (set it to 0.0) to avoid errors.
 

From 206ab33bba30e95db54ff13497e65c72e8cb625d Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 13 May 2025 11:37:59 +0800
Subject: [PATCH 3/4] auto detect device

---
 cv/classification/main.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/cv/classification/main.py b/cv/classification/main.py
index 91f9de19..6841b782 100755
--- a/cv/classification/main.py
+++ b/cv/classification/main.py
@@ -7,6 +7,7 @@
 import argparse
 import datetime
 import numpy as np
+import importlib.util
 import oneflow as flow
 import oneflow.backends.cudnn as cudnn
 
@@ -39,6 +40,15 @@ def build_model(config):
     return model
 
 
+def detect_device():
+    if flow.cuda.is_available():
+        return "cuda"
+    elif importlib.util.find_spec("oneflow_npu") is not None:
+        return "npu"
+    else:
+        return "cpu"
+
+
 def parse_option():
     parser = argparse.ArgumentParser(
         "Flowvision image classification training and evaluation script", add_help=False
@@ -125,7 +135,7 @@ def parse_option():
     parser.add_argument(
         "--device",
         type=str,
-        default="cuda",
+        default=detect_device(),
         help="Specify the device to run the model on. Options: 'cuda', 'cpu', or 'npu'.",
     )
 

From ddf1ff2fd49266b8c5387b6bef68f349dd1a6b7d Mon Sep 17 00:00:00 2001
From: ShawnXuan <xiexuanx2@gmail.com>
Date: Tue, 13 May 2025 11:45:16 +0800
Subject: [PATCH 4/4] update readme

---
 cv/classification/README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cv/classification/README.md b/cv/classification/README.md
index bf04f48a..97b926d5 100644
--- a/cv/classification/README.md
+++ b/cv/classification/README.md
@@ -107,7 +107,12 @@ sh infer.sh
 
 ### Multi-Device Support (Experimental)
 
-This branch introduces preliminary support for running on different device types. To train on an NPU device, add the following argument to your train.sh command:
+This branch introduces preliminary support for running on different device types. By default, the training script now automatically selects the first available device in the following order of priority:
+1. CUDA (GPU)
+2. NPU (if oneflow_npu is installed)
+3. CPU (fallback)
+
+If you want to explicitly run on a specific device (e.g., NPU), you can still override the default by adding the following argument to your train.sh command:
 
 ```bash
 --device=npu