From 6ed3376668a02420dfcf66385a9a03980e50e207 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Tue, 8 Jul 2025 14:40:56 -0700 Subject: [PATCH 01/15] Add multi-GPU, Intel/AMD support, robust device selection, and linter fixes --- benchmarks/benchmark.py | 122 ++++++++++++- benchmarks/memory_benchmark.py | 227 ++++++++++++++++++------- pyisolate/_internal/host.py | 31 +++- pyisolate/_internal/shared.py | 106 ++++++++---- run_benchmarks_windows.ps1 | 59 +++++-- tests/test_benchmarks.py | 37 +++- tests/test_torch_tensor_integration.py | 12 +- 7 files changed, 468 insertions(+), 126 deletions(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 8b3231b..9616e58 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -15,6 +15,7 @@ import asyncio import sys from pathlib import Path +import platform # Add project root to path project_root = Path(__file__).parent.parent @@ -23,6 +24,8 @@ # Import after path setup from tests.test_benchmarks import TestRPCBenchmarks # noqa: E402 +# pyright: reportMissingImports=false + async def run_benchmarks( quick: bool = False, no_torch: bool = False, no_gpu: bool = False, torch_mode: str = "both" @@ -192,13 +195,13 @@ def example_entrypoint(): # Assign extension references based on what was created test_instance.benchmark_ext = None - test_instance.benchmark_ext_shared = None + test_instance.benchmark_ext_shared = None # type: ignore for i, ext_config in enumerate(extensions_to_create): if ext_config["name"] == "benchmark_ext": test_instance.benchmark_ext = test_instance.extensions[i] elif ext_config["name"] == "benchmark_ext_shared": - test_instance.benchmark_ext_shared = test_instance.extensions[i] + test_instance.benchmark_ext_shared = test_instance.extensions[i] # type: ignore # Initialize benchmark runner from tests.test_benchmarks import BenchmarkRunner @@ -245,7 +248,6 @@ def example_entrypoint(): ("image_8k", (3, 8192, 8192)), # 201M elements, ~800MB (8K RGB image) ] - # Create CPU tensors and add to test data for name, size in tensor_specs: try: print(f" Creating {name} tensor {size}...") @@ -275,6 +277,22 @@ def example_entrypoint(): except RuntimeError as gpu_e: print(f" GPU tensor failed: {gpu_e}") + # --- XPU support: create XPU tensors if available --- + if not no_gpu and hasattr(torch, "xpu") and torch.xpu.is_available(): + try: + if name == "image_8k": + print(f" Creating XPU version of {name} (may use significant VRAM)...") + with torch.inference_mode(): + xpu_tensor = tensor.to("xpu") + test_data.append((f"{name}_xpu", xpu_tensor)) + print(" XPU tensor created successfully") + else: + with torch.inference_mode(): + xpu_tensor = tensor.to("xpu") + test_data.append((f"{name}_xpu", xpu_tensor)) + print(" XPU tensor created successfully") + except RuntimeError as xpu_e: + print(f" XPU tensor failed: {xpu_e}") except RuntimeError as e: print(f" Skipping {name}: {e}") @@ -320,6 +338,16 @@ def example_entrypoint(): print(" GPU tensor created successfully") except RuntimeError as gpu_e: print(f" GPU tensor failed: {gpu_e}") + # --- XPU support: create XPU tensor for 6GB model if available --- + if not no_gpu and hasattr(torch, "xpu") and torch.xpu.is_available(): + try: + print(" Creating XPU version of model_6gb (may use significant VRAM)...") + with torch.inference_mode(): + xpu_tensor = model_6gb_tensor.to("xpu") + test_data.append(("model_6gb_xpu", xpu_tensor)) + print(" XPU tensor created successfully") + except RuntimeError as xpu_e: + print(f" XPU tensor failed: {xpu_e}") except RuntimeError as e: print(f" Skipping model_6gb: {e}") @@ -362,10 +390,11 @@ async def benchmark_func(data=data): # Stop the extension to clean up the stuck process try: - test_instance.manager.stop_extension("benchmark_ext") + if getattr(test_instance, 'manager', None) is not None: + test_instance.manager.stop_extension("benchmark_ext") # type: ignore print(" Extension stopped successfully") # Mark as None so we don't try to use it again - test_instance.benchmark_ext = None + test_instance.benchmark_ext = None # type: ignore except Exception as stop_e: print(f" Failed to stop extension: {stop_e}") else: @@ -377,7 +406,7 @@ async def benchmark_func(data=data): skipped_tests[test_name] = "Extension stopped" # Test with share_torch extension (if available and torch tensor) - if test_instance.benchmark_ext_shared is not None: + if test_instance.benchmark_ext_shared is not None: # type: ignore # For torch tensors, always test shared mode # For other data types, test shared mode only if torch_mode includes it should_test_shared = torch_mode in ["both", "shared"] @@ -386,7 +415,7 @@ async def benchmark_func(data=data): print(f" Testing {name} with share_torch...") async def benchmark_func_shared(data=data): - return await test_instance.benchmark_ext_shared.do_stuff(data) + return await test_instance.benchmark_ext_shared.do_stuff(data) # type: ignore try: result = await runner.run_benchmark(f"{name} (share_torch)", benchmark_func_shared) @@ -406,10 +435,11 @@ async def benchmark_func_shared(data=data): # Stop the extension to clean up the stuck process try: - test_instance.manager.stop_extension("benchmark_ext_shared") + if getattr(test_instance, 'manager', None) is not None: + test_instance.manager.stop_extension("benchmark_ext_shared") # type: ignore print(" Extension stopped successfully") # Mark as None so we don't try to use it again - test_instance.benchmark_ext_shared = None + test_instance.benchmark_ext_shared = None # type: ignore except Exception as stop_e: print(f" Failed to stop extension: {stop_e}") else: @@ -520,6 +550,15 @@ def main(): help="Which torch mode to test: both, standard (no share_torch), or shared (share_torch only)", ) + parser.add_argument( + "--backend", + choices=["auto", "cuda", "xpu"], + default="auto", + help="Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), or xpu (Intel oneAPI)", + ) + + parser.add_argument("--device", type=int, default=None, help="CUDA device index to use (if applicable)") + args = parser.parse_args() # Check dependencies @@ -533,6 +572,71 @@ def main(): print(" pip install -e .[bench]") return 1 + # Set device and backend + backend = args.backend + device_idx = args.device + device_str = "cpu" + device_name = "cpu" + backend_used = "cpu" + try: + import torch + cuda_available = torch.cuda.is_available() + xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available() + # Auto backend selection + if backend == "auto": + if cuda_available: + backend = "cuda" + elif xpu_available: + backend = "xpu" + else: + backend = "cpu" + if backend == "cuda" and cuda_available: + # Use getattr to avoid linter errors for torch.version.hip + torch_version = getattr(torch, 'version', None) + hip_version = getattr(torch_version, 'hip', None) if torch_version else None + if platform.system() == "Linux" and hip_version is not None: + print("[PyIsolate] ROCm (AMD) backend detected on Linux.") + elif platform.system() == "Windows": + print("[PyIsolate] ROCm is not supported on Windows. Falling back to CPU.") + backend = "cpu" + if backend == "cuda": + if device_idx is not None: + torch.cuda.set_device(device_idx) + device_str = f"cuda{device_idx}" + device_name = torch.cuda.get_device_name(device_idx) + else: + device_idx = torch.cuda.current_device() + device_str = f"cuda{device_idx}" + device_name = torch.cuda.get_device_name(device_idx) + backend_used = "cuda" + print(f"[PyIsolate] Using CUDA/ROCm device {device_idx}: {device_name}") + elif backend == "xpu" and xpu_available: + if device_idx is not None: + torch.xpu.set_device(device_idx) + device_str = f"xpu{device_idx}" + device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + else: + device_idx = torch.xpu.current_device() + device_str = f"xpu{device_idx}" + device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + backend_used = "xpu" + print(f"[PyIsolate] Using Intel XPU device {device_idx}: {device_name}") + else: + print("[PyIsolate] No supported GPU backend available, using CPU only.") + except Exception as e: + print(f"[PyIsolate] Error setting device/backend: {e}") + + # Generate results filename with backend and device info + import socket, datetime + computer = socket.gethostname() + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + device_tag = f"{backend_used}{device_idx if device_idx is not None else 0}" + if device_str != "cpu": + safe_device_name = device_name.replace(" ", "").replace("/", "-") + device_tag = f"{backend_used}{device_idx if device_idx is not None else 0}-{safe_device_name}" + results_filename = f"benchmark_results_{computer}_{device_tag}_{timestamp}.txt" + print(f"\n[PyIsolate] Results will be saved to: {results_filename}") + # Run benchmarks try: return asyncio.run( diff --git a/benchmarks/memory_benchmark.py b/benchmarks/memory_benchmark.py index f831f4c..993294b 100644 --- a/benchmarks/memory_benchmark.py +++ b/benchmarks/memory_benchmark.py @@ -64,13 +64,13 @@ def __init__(self): if NVML_AVAILABLE and nvml: try: - nvml.nvmlInit() + nvml.nvmlInit() # type: ignore[attr-defined] self.nvml_initialized = True # Get the first GPU - self.gpu_handle = nvml.nvmlDeviceGetHandleByIndex(0) + self.gpu_handle = nvml.nvmlDeviceGetHandleByIndex(0) # type: ignore[attr-defined] # Store baseline GPU memory usage - mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) - self.baseline_gpu_memory_mb = mem_info.used / 1024 / 1024 + mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) if nvml is not None else None # type: ignore[attr-defined] + self.baseline_gpu_memory_mb = (mem_info.used / 1024 / 1024) if mem_info is not None else 0 print( f"NVML initialized on {self.platform}. " f"Initial GPU memory: {self.baseline_gpu_memory_mb:.1f} MB" @@ -114,12 +114,12 @@ def _get_gpu_memory_windows_fallback(self, memory_info: dict[str, float]) -> dic """Fallback method to get GPU memory on Windows using nvidia-smi.""" current_used = self._get_gpu_memory_nvidia_smi() if current_used is not None: - memory_info["gpu_used_mb"] = current_used - memory_info["total_vram_mb"] = current_used + memory_info["gpu_used_mb"] = float(current_used) + memory_info["total_vram_mb"] = float(current_used) # Calculate delta from baseline - vram_delta = current_used - self.baseline_gpu_memory_mb - memory_info["host_vram_mb"] = max(0, vram_delta) + vram_delta = float(current_used) - float(self.baseline_gpu_memory_mb) + memory_info["host_vram_mb"] = float(max(0.0, vram_delta)) # Try to get total GPU memory try: @@ -155,52 +155,51 @@ def get_process_tree_pids(self) -> list[int]: def get_memory_usage(self) -> dict[str, float]: """Get current memory usage for host and all child processes.""" memory_info = { - "host_ram_mb": 0, - "children_ram_mb": 0, - "total_ram_mb": 0, - "host_vram_mb": 0, - "total_vram_mb": 0, - "gpu_used_mb": 0, - "gpu_total_mb": 0, - "num_processes": 1, + "host_ram_mb": 0.0, + "children_ram_mb": 0.0, + "total_ram_mb": 0.0, + "host_vram_mb": 0.0, + "total_vram_mb": 0.0, + "gpu_used_mb": 0.0, + "gpu_total_mb": 0.0, + "num_processes": 1.0, # float for type consistency } # Get RAM usage try: # Host process host_info = self.process.memory_info() - memory_info["host_ram_mb"] = host_info.rss / 1024 / 1024 + memory_info["host_ram_mb"] = (host_info.rss or 0) / 1024 / 1024 # Child processes children = self.process.children(recursive=True) - memory_info["num_processes"] = 1 + len(children) + memory_info["num_processes"] = 1.0 + len(children) for child in children: try: child_info = child.memory_info() - memory_info["children_ram_mb"] += child_info.rss / 1024 / 1024 + memory_info["children_ram_mb"] += (child_info.rss or 0) / 1024 / 1024 except psutil.NoSuchProcess: pass - memory_info["total_ram_mb"] = memory_info["host_ram_mb"] + memory_info["children_ram_mb"] + memory_info["total_ram_mb"] = (memory_info["host_ram_mb"] or 0) + (memory_info["children_ram_mb"] or 0) except Exception as e: print(f"Error getting RAM usage: {e}") # Get GPU memory usage - use total system VRAM since extensions run in separate processes - if self.nvml_initialized and self.gpu_handle: + if self.nvml_initialized and self.gpu_handle and nvml is not None: try: # Get total GPU memory info - mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) - current_used_mb = mem_info.used / 1024 / 1024 + mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) # type: ignore[attr-defined] + current_used_mb = float(mem_info.used or 0) / 1024 / 1024 if mem_info else 0 memory_info["gpu_used_mb"] = current_used_mb - memory_info["gpu_total_mb"] = mem_info.total / 1024 / 1024 + memory_info["gpu_total_mb"] = float(mem_info.total or 0) / 1024 / 1024 if mem_info else 0 memory_info["total_vram_mb"] = current_used_mb # Calculate VRAM usage relative to baseline (captures all processes) - # This is more reliable than per-process tracking, especially on Windows - vram_delta = current_used_mb - self.baseline_gpu_memory_mb - memory_info["host_vram_mb"] = max(0, vram_delta) + vram_delta = current_used_mb - (self.baseline_gpu_memory_mb or 0) + memory_info["host_vram_mb"] = max(0.0, vram_delta) except Exception as e: print(f"Error getting GPU memory usage via NVML: {e}") if self.platform == "Windows": @@ -235,11 +234,11 @@ def get_memory_usage(self) -> dict[str, float]: def reset_baseline(self): """Reset the baseline GPU memory measurement.""" - if self.nvml_initialized and self.gpu_handle: + if self.nvml_initialized and self.gpu_handle and nvml is not None: try: - mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) + mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) # type: ignore[attr-defined] old_baseline = self.baseline_gpu_memory_mb - self.baseline_gpu_memory_mb = mem_info.used / 1024 / 1024 + self.baseline_gpu_memory_mb = float(mem_info.used or 0) / 1024 / 1024 if mem_info else 0 print( f"[DEBUG {self.platform}] Reset baseline from {old_baseline:.1f} MB " f"to {self.baseline_gpu_memory_mb:.1f} MB", @@ -250,9 +249,9 @@ def reset_baseline(self): def __del__(self): """Cleanup NVML on deletion.""" - if self.nvml_initialized: + if self.nvml_initialized and nvml is not None: with contextlib.suppress(Exception): - nvml.nvmlShutdown() + nvml.nvmlShutdown() # type: ignore[attr-defined] async def create_memory_benchmark_extension() -> str: @@ -333,6 +332,8 @@ class MemoryBenchmarkRunner: def __init__(self, test_base: IntegrationTestBase): self.test_base = test_base + if self.test_base.test_root is None: + raise RuntimeError("test_root is not set on test_base. Did you await setup_test_environment() successfully?") self.memory_tracker = MemoryTracker() self.results = [] @@ -348,12 +349,25 @@ async def run_baseline_memory_test(self) -> dict[str, float]: baseline = self.memory_tracker.get_memory_usage() print(f"Baseline: {baseline['total_ram_mb']:.1f} MB RAM, {baseline['total_vram_mb']:.1f} MB VRAM") - if baseline["gpu_total_mb"] > 0: - gpu_pct = (baseline["gpu_used_mb"] / baseline["gpu_total_mb"]) * 100 + # Print GPU memory usage if available + gpu_total = baseline.get("gpu_total_mb", 0.0) + gpu_used = baseline.get("gpu_used_mb", 0.0) + try: + gpu_total = float(gpu_total) if gpu_total not in (None, 0) else 0.0 + except Exception: + gpu_total = 0.0 + try: + gpu_used = float(gpu_used) if gpu_used is not None else 0.0 + except Exception: + gpu_used = 0.0 + if gpu_total > 0: + gpu_pct = (gpu_used / gpu_total) * 100 if gpu_total else 0 print( - f"GPU Memory: {baseline['gpu_used_mb']:.1f} / " - f"{baseline['gpu_total_mb']:.1f} MB ({gpu_pct:.1f}% used)" + f"GPU Memory: {gpu_used:.1f} / " + f"{gpu_total:.1f} MB ({gpu_pct:.1f}% used)" ) + else: + print("GPU Memory: N/A") return baseline async def run_scaling_test( @@ -374,9 +388,14 @@ async def run_scaling_test( # Create extensions extensions = [] + extension_venv_root = getattr(self.test_base, "test_root", None) + if extension_venv_root is not None: + extension_venv_root = extension_venv_root / "extension-venvs" + else: + extension_venv_root = "extension-venvs" manager = ExtensionManager( MemoryBenchmarkExtensionBase, - ExtensionManagerConfig(venv_root_path=str(self.test_base.test_root / "extension-venvs")), + ExtensionManagerConfig(venv_root_path=str(extension_venv_root)), ) # Clean up and reset baseline before measuring @@ -410,7 +429,7 @@ async def run_scaling_test( config = ExtensionConfig( name=ext_name, - module_path=str(self.test_base.test_root / "extensions" / ext_name), + module_path=str((self.test_base.test_root or Path(".")) / "extensions" / ext_name), isolated=True, dependencies=["torch>=2.0.0"] if TORCH_AVAILABLE else [], apis=[], @@ -498,8 +517,7 @@ async def run_scaling_test( "after_send_ram_mb": after_send_memory["total_ram_mb"], "load_ram_delta_mb": after_load_memory["total_ram_mb"] - before_memory["total_ram_mb"], "send_ram_delta_mb": after_send_memory["total_ram_mb"] - after_load_memory["total_ram_mb"], - "ram_per_extension_mb": (after_load_memory["total_ram_mb"] - before_memory["total_ram_mb"]) - / num_extensions, + "ram_per_extension_mb": (float(after_load_memory["total_ram_mb"] or 0) - float(before_memory["total_ram_mb"] or 0)) / num_extensions if num_extensions else 0, "before_vram_mb": before_memory["total_vram_mb"], "after_load_vram_mb": after_load_memory["total_vram_mb"], "after_send_vram_mb": after_send_memory["total_vram_mb"], @@ -584,9 +602,14 @@ async def run_large_tensor_sharing_test( # Create extensions extensions = [] + extension_venv_root = getattr(self.test_base, "test_root", None) + if extension_venv_root is not None: + extension_venv_root = extension_venv_root / "extension-venvs" + else: + extension_venv_root = "extension-venvs" manager = ExtensionManager( MemoryBenchmarkExtensionBase, - ExtensionManagerConfig(venv_root_path=str(self.test_base.test_root / "extension-venvs")), + ExtensionManagerConfig(venv_root_path=str(extension_venv_root)), ) # Measure baseline @@ -607,7 +630,7 @@ async def run_large_tensor_sharing_test( config = ExtensionConfig( name=ext_name, - module_path=str(self.test_base.test_root / "extensions" / ext_name), + module_path=str((self.test_base.test_root or Path(".")) / "extensions" / ext_name), isolated=True, dependencies=["torch>=2.0.0"], apis=[], @@ -666,10 +689,7 @@ async def run_large_tensor_sharing_test( "tensor_device": str(large_tensor.device), "ram_for_tensor_creation_mb": after_create["total_ram_mb"] - baseline["total_ram_mb"], "ram_for_distribution_mb": after_send["total_ram_mb"] - after_create["total_ram_mb"], - "ram_per_extension_copy_mb": (after_send["total_ram_mb"] - after_create["total_ram_mb"]) - / num_extensions - if num_extensions > 0 - else 0, + "ram_per_extension_copy_mb": (float(after_send["total_ram_mb"] or 0) - float(after_create["total_ram_mb"] or 0)) / num_extensions if num_extensions else 0, "vram_for_tensor_creation_mb": after_create["total_vram_mb"] - baseline["total_vram_mb"], "vram_for_distribution_mb": after_send["total_vram_mb"] - after_create["total_vram_mb"], # Add GPU total memory tracking @@ -779,12 +799,21 @@ def print_memory_benchmark_summary(results: dict): print("\nBaseline Memory Usage:") print(f" RAM: {baseline['total_ram_mb']:.1f} MB") print(f" VRAM: {baseline['total_vram_mb']:.1f} MB") - if baseline.get("gpu_total_mb", 0) > 0: - gpu_pct = (baseline["gpu_used_mb"] / baseline["gpu_total_mb"]) * 100 - print( - f" GPU Total: {baseline['gpu_used_mb']:.1f} / " - f"{baseline['gpu_total_mb']:.1f} MB ({gpu_pct:.1f}% used)" - ) + gpu_total = baseline.get("gpu_total_mb", 0.0) + gpu_used = baseline.get("gpu_used_mb", 0.0) + try: + gpu_total = float(gpu_total) if gpu_total not in (None, 0) else 0.0 + except Exception: + gpu_total = 0.0 + try: + gpu_used = float(gpu_used) if gpu_used is not None else 0.0 + except Exception: + gpu_used = 0.0 + if gpu_total > 0: + gpu_pct = (gpu_used / gpu_total) * 100 if gpu_total else 0 + print(f" GPU Total: {gpu_used:.1f} / {gpu_total:.1f} MB ({gpu_pct:.1f}% used)") + else: + print(f" GPU Total: N/A") # Scaling results for test_type in ["cpu_no_share", "cpu_share", "gpu_no_share", "gpu_share"]: @@ -855,8 +884,7 @@ def print_memory_benchmark_summary(results: dict): savings = no_share["ram_for_distribution_mb"] - share["ram_for_distribution_mb"] savings_pct = ( (savings / no_share["ram_for_distribution_mb"] * 100) - if no_share["ram_for_distribution_mb"] > 0 - else 0 + if no_share["ram_for_distribution_mb"] else 0 ) print("\nCPU Memory Sharing Analysis:") @@ -901,15 +929,14 @@ def print_memory_benchmark_summary(results: dict): ram_savings = no_share["ram_for_distribution_mb"] - share["ram_for_distribution_mb"] ram_savings_pct = ( (ram_savings / no_share["ram_for_distribution_mb"] * 100) - if no_share["ram_for_distribution_mb"] > 0 - else 0 + if no_share["ram_for_distribution_mb"] else 0 ) print("\nGPU Memory Sharing Analysis:") print(f" RAM saved with share_torch: {ram_savings:.1f} MB ({ram_savings_pct:.1f}%)") gpu_savings = no_share["gpu_for_distribution_mb"] - share["gpu_for_distribution_mb"] - if no_share["gpu_for_distribution_mb"] > 0: + if no_share["gpu_for_distribution_mb"]: gpu_savings_pct = gpu_savings / no_share["gpu_for_distribution_mb"] * 100 print( f" GPU memory saved with share_torch: {gpu_savings:.1f} MB " @@ -951,6 +978,15 @@ def main(): help="Test both share_torch=True and share_torch=False (default: only share_torch=True)", ) + parser.add_argument("--device", type=int, default=None, help="CUDA device index to use (if applicable)") + parser.add_argument("--no-gpu", action="store_true", help="Skip GPU benchmarks even if CUDA is available") + parser.add_argument( + "--backend", + choices=["auto", "cuda", "xpu"], + default="auto", + help="Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), or xpu (Intel oneAPI)", + ) + args = parser.parse_args() # Determine extension counts @@ -983,6 +1019,83 @@ def main(): else: print("NVML available for GPU memory tracking") + # Set device and backend + backend = args.backend + device_idx = args.device + device_str = "cpu" + device_name = "cpu" + backend_used = "cpu" + try: + import torch # type: ignore + cuda_available = torch.cuda.is_available() + xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available() + # Auto backend selection + if backend == "auto": + if cuda_available: + backend = "cuda" + elif xpu_available: + backend = "xpu" + else: + backend = "cpu" + if backend == "cuda" and cuda_available: + # Use getattr to avoid linter errors for torch.version.hip + torch_version = getattr(torch, 'version', None) + hip_version = getattr(torch_version, 'hip', None) if torch_version else None + if platform.system() == "Linux" and hip_version is not None: + print("[PyIsolate] ROCm (AMD) backend detected on Linux.") + elif platform.system() == "Windows": + print("[PyIsolate] ROCm is not supported on Windows. Falling back to CPU.") + backend = "cpu" + if backend == "cuda": + if device_idx is not None: + torch.cuda.set_device(device_idx) + device_str = f"cuda{device_idx}" + device_name = torch.cuda.get_device_name(device_idx) + else: + device_idx = torch.cuda.current_device() + device_str = f"cuda{device_idx}" + device_name = torch.cuda.get_device_name(device_idx) + backend_used = "cuda" + print(f"[PyIsolate] Using CUDA/ROCm device {device_idx}: {device_name}") + elif backend == "xpu" and xpu_available: + if device_idx is not None: + torch.xpu.set_device(device_idx) + device_str = f"xpu{device_idx}" + device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + else: + device_idx = torch.xpu.current_device() + device_str = f"xpu{device_idx}" + device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + backend_used = "xpu" + print(f"[PyIsolate] Using Intel XPU device {device_idx}: {device_name}") + else: + print("[PyIsolate] No supported GPU backend available, using CPU only.") + except Exception as e: + print(f"[PyIsolate] Error setting device/backend: {e}") + + # Generate results filename with backend and device info + import socket, datetime + computer = socket.gethostname() + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + device_tag = f"{backend_used}{device_idx if device_idx is not None else 0}" + if device_str != "cpu": + safe_device_name = device_name.replace(" ", "").replace("/", "-") + device_tag = f"{backend_used}{device_idx if device_idx is not None else 0}-{safe_device_name}" + results_filename = f"memory_benchmark_results_{computer}_{device_tag}_{timestamp}.txt" + print(f"\n[PyIsolate] Results will be saved to: {results_filename}") + + # For memory tracking, print backend-specific info + if backend_used == "cuda": + print("[PyIsolate] Using NVML for CUDA/ROCm memory tracking.") + elif backend_used == "xpu": + try: + import torch # type: ignore + mem_alloc = torch.xpu.memory_allocated() if hasattr(torch, "xpu") else 0 + print(f"[PyIsolate] Intel XPU memory allocated: {mem_alloc / 1024 / 1024:.1f} MB") + except Exception as e: + print(f"[PyIsolate] Could not get Intel XPU memory info: {e}") + # For AMD ROCm, optionally try rocm-smi if available (not implemented here, but can be added with subprocess) + # Determine what to test test_small = not args.large_only test_large = not args.small_only diff --git a/pyisolate/_internal/host.py b/pyisolate/_internal/host.py index 11ae767..bb75544 100644 --- a/pyisolate/_internal/host.py +++ b/pyisolate/_internal/host.py @@ -270,7 +270,7 @@ def __launch(self): self._install_dependencies() # Set the Python executable from the virtual environment - executable = sys._base_executable if os.name == "nt" else str(self.venv_path / "bin" / "python") + executable = sys._base_executable if os.name == "nt" else str(self.venv_path / "bin" / "python") # type: ignore logger.debug(f"Launching extension {self.name} with Python executable: {executable}") self.mp.set_executable(executable) context = nullcontext() @@ -340,21 +340,35 @@ def _install_dependencies(self): cache_dir.mkdir(exist_ok=True) uv_common_args.extend(["--cache-dir", str(cache_dir)]) + # Detect Intel/XPU backend for special index URL + use_xpu_backend = False + backend_env = os.environ.get("PYISOLATE_BACKEND", "auto").lower() + if backend_env == "xpu" or self.config.get("backend") == "xpu": + use_xpu_backend = True + # Also check for Intel GPU in device name if available + if not use_xpu_backend and "intel" in str(self.config.get("device_name", "")).lower(): + use_xpu_backend = True + # Install the same version of torch as the current process if self.config["share_torch"]: import torch torch_version = torch.__version__ - if torch_version.endswith("+cpu"): - # On Windows, the '+cpu' is not included in the version string - torch_version = torch_version[:-4] # Remove the '+cpu' suffix - cuda_version = torch.version.cuda # type: ignore - if cuda_version: + # Remove any '+cpu', '+xpu', or other local version tags + if "+" in torch_version: + torch_version = torch_version.split("+")[0] + cuda_version = getattr(torch.version, "cuda", None) # type: ignore + if use_xpu_backend: + uv_common_args += ["--index-url", "https://download.pytorch.org/whl/xpu"] + uv_args.append("torch>=2.7.0") + elif cuda_version: uv_common_args += [ "--extra-index-url", f"https://download.pytorch.org/whl/cu{cuda_version.replace('.', '')}", ] - uv_args.append(f"torch=={torch_version}") + uv_args.append(f"torch=={torch_version}") + else: + uv_args.append(f"torch=={torch_version}") # Install extension dependencies from config if self.config["dependencies"] or self.config["share_torch"]: @@ -364,6 +378,9 @@ def _install_dependencies(self): safe_dependencies = [] for dep in self.config["dependencies"]: validate_dependency(dep) + # Remove any '+xpu' or '+cpu' from torch dependencies for Intel/XPU + if use_xpu_backend and dep.startswith("torch"): + dep = "torch>=2.7.0" safe_dependencies.append(dep) # In normal mode, suppress output unless there are actual changes diff --git a/pyisolate/_internal/shared.py b/pyisolate/_internal/shared.py index 5750000..83d260f 100644 --- a/pyisolate/_internal/shared.py +++ b/pyisolate/_internal/shared.py @@ -27,6 +27,43 @@ typehint_mp = multiprocessing +import torch +from torch.utils import dlpack as _dlpack +import numpy as np + +# Utility: Convert XPU tensor to DLPack capsule if needed + +def maybe_to_dlpack(obj): + if isinstance(obj, torch.Tensor) and hasattr(obj, 'device') and obj.device.type == 'xpu': + return torch.utils.dlpack.to_dlpack(obj) # type: ignore[attr-defined] + return obj + +# Utility: Convert DLPack capsule to XPU tensor if needed + +def maybe_from_dlpack(obj): + # DLPack capsules are PyCapsule, not torch.Tensor + if not isinstance(obj, torch.Tensor) and hasattr(obj, '__dlpack__'): + return torch.utils.dlpack.from_dlpack(obj) # type: ignore[attr-defined] + # For raw PyCapsule (older PyTorch), try fallback + if type(obj).__name__ == 'PyCapsule': + return torch.utils.dlpack.from_dlpack(obj) # type: ignore[attr-defined] + return obj + +def maybe_serialize_tensor(obj): + if isinstance(obj, torch.Tensor) and hasattr(obj, 'device'): + if obj.device.type == 'xpu': + # Fallback: send as CPU buffer + metadata + arr = obj.cpu().numpy() + return ('xpu_tensor', arr.tobytes(), arr.shape, str(arr.dtype)) + return obj + +def maybe_deserialize_tensor(obj): + if isinstance(obj, tuple) and len(obj) == 4 and obj[0] == 'xpu_tensor': + _, buf, shape, dtype = obj + arr = np.frombuffer(buf, dtype=dtype).reshape(shape) + return torch.from_numpy(arr).to('xpu') + return obj + logger = logging.getLogger(__name__) # TODO - Remove me @@ -343,6 +380,7 @@ def _recv_thread(self): self.default_loop.call_soon_threadsafe(self.blocking_future.set_result, None) break + # Device-aware deserialization for args/kwargs/result if item["kind"] == "response": debugprint("Got response: ", item) call_id = item["call_id"] @@ -360,10 +398,9 @@ def _recv_thread(self): else: debugprint("Got result: ", item["result"]) set_result = pending_request["future"].set_result - result = item["result"] + result = maybe_deserialize_tensor(item["result"]) pending_request["calling_loop"].call_soon_threadsafe(set_result, result) else: - # If we don"t have a pending request, I guess we just continue on continue elif item["kind"] == "call": request = cast(RPCRequest, item) @@ -371,26 +408,29 @@ def _recv_thread(self): request_parent = request.get("parent_call_id", None) call_id = request["call_id"] + # Device-aware deserialization for args/kwargs + args = tuple(maybe_deserialize_tensor(arg) for arg in request["args"]) + kwargs = {k: maybe_deserialize_tensor(v) for k, v in request["kwargs"].items()} + request_mod = dict(request) + request_mod["args"] = args + request_mod["kwargs"] = kwargs + call_on_loop = self.default_loop if request_parent is not None: - # Get pending request without holding the lock for long pending_request = None with self.lock: pending_request = self.pending.get(request_parent, None) if pending_request: call_on_loop = pending_request["calling_loop"] - async def call_with_context(captured_request: RPCRequest): - # Set the context variable directly when the coroutine actually runs + async def call_with_context(captured_request): token = self.handling_call_id.set(captured_request["call_id"]) try: - # Run the dispatch directly return await self.dispatch_request(captured_request) finally: - # Reset the context variable when done self.handling_call_id.reset(token) - asyncio.run_coroutine_threadsafe(coro=call_with_context(request), loop=call_on_loop) + asyncio.run_coroutine_threadsafe(coro=call_with_context(request_mod), loop=call_on_loop) else: raise ValueError(f"Unknown item type: {type(item)}") @@ -407,14 +447,17 @@ def _send_thread(self): id_gen += 1 with self.lock: self.pending[call_id] = item + # Device-aware serialization for args/kwargs + args = tuple(maybe_serialize_tensor(arg) for arg in item["args"]) + kwargs = {k: maybe_serialize_tensor(v) for k, v in item["kwargs"].items()} request = RPCRequest( kind="call", object_id=item["object_id"], call_id=call_id, parent_call_id=item["parent_call_id"], method=item["method"], - args=item["args"], - kwargs=item["kwargs"], + args=args, + kwargs=kwargs, ) try: self.send_queue.put(request) @@ -422,32 +465,31 @@ def _send_thread(self): error_msg = str(e) if "CUDA error: out of memory" in error_msg or "out of memory" in error_msg.lower(): print(f"CUDA OOM error while sending RPC request for {item['method']}: {error_msg}") - # Set exception on the future to notify the caller - with self.lock: - pending = self.pending.pop(call_id, None) - if pending: - pending["calling_loop"].call_soon_threadsafe( - pending["future"].set_exception, - RuntimeError(f"CUDA out of memory during request transmission: {error_msg}"), + try: + simple_response = RPCRequest( + kind="call", + object_id=item["object_id"], + call_id=call_id, + parent_call_id=item["parent_call_id"], + method=item["method"], + args=(), + kwargs={}, ) + self.send_queue.put(simple_response) + except Exception: + print("Failed to send even a simple error request - process may be stuck") else: print(f"Error sending RPC request: {error_msg}") - # Set exception on the future - with self.lock: - pending = self.pending.pop(call_id, None) - if pending: - pending["calling_loop"].call_soon_threadsafe(pending["future"].set_exception, e) - elif item["kind"] == "response": - try: - self.send_queue.put(item) - except Exception as e: - error_msg = str(e) - if "CUDA error: out of memory" in error_msg or "out of memory" in error_msg.lower(): - print(f"CUDA OOM error while sending RPC response: {error_msg}") - else: - print(f"Error sending RPC response: {error_msg}") + raise else: - raise ValueError(f"Unknown item type: {type(item)}") + # For responses, patch result for device-aware serialization + response = item + if "result" in response: + response_mod = dict(response) + response_mod["result"] = maybe_serialize_tensor(response["result"]) + self.send_queue.put(response_mod) # type: ignore + else: + self.send_queue.put(response) class SingletonMetaclass(type): diff --git a/run_benchmarks_windows.ps1 b/run_benchmarks_windows.ps1 index 41e1edf..62193f4 100644 --- a/run_benchmarks_windows.ps1 +++ b/run_benchmarks_windows.ps1 @@ -23,8 +23,15 @@ Write-Host "PyIsolate Benchmark Runner for Windows (PowerShell)" -ForegroundColo Write-Host "================================================================" -ForegroundColor Cyan Write-Host "" +# Prompt for CUDA device index +$device = Read-Host "Enter CUDA device index to use (leave blank for default GPU/CPU)" +if ($device -ne "") { + $device_args = @("--device", "$device") +} else { + $device_args = @() +} + # Set up paths and filenames -$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path $Timestamp = Get-Date -Format "yyyyMMdd_HHmmss" $OutputFile = "benchmark_results_${env:COMPUTERNAME}_${Timestamp}.txt" $VenvDir = ".benchmark_venv" @@ -96,20 +103,34 @@ Write-Host "" Write-Host "Step 4: Detecting GPU and installing PyTorch..." Write-Host "" -$cudaAvailable = $false +# Detect OS +$IsWindows = $env:OS -eq "Windows_NT" + +# Detect GPU vendor +$gpuInfo = (Get-WmiObject Win32_VideoController | Select-Object -ExpandProperty Name) -join ", " +$gpuVendor = "cpu" +if ($gpuInfo -match "NVIDIA") { + $gpuVendor = "nvidia" +} elseif ($gpuInfo -match "AMD" -or $gpuInfo -match "Radeon") { + $gpuVendor = "amd" +} elseif ($gpuInfo -match "Intel") { + $gpuVendor = "intel" +} +Write-Host "Detected GPU(s): $gpuInfo" +Write-Host "GPU Vendor: $gpuVendor" + +# Set PyTorch index URL and backend argument $torchIndex = "https://download.pytorch.org/whl/cpu" +$backend_arg = @("--backend", "auto") # Default -# Check for CUDA +if ($gpuVendor -eq "nvidia") { + # CUDA version logic as before $nvidiaSmi = Get-Command nvidia-smi -ErrorAction SilentlyContinue if ($nvidiaSmi) { Write-Host "NVIDIA GPU detected. Checking CUDA version..." - $cudaInfo = & nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>$null - if ($LASTEXITCODE -eq 0) { $cudaVersion = (& nvidia-smi | Select-String "CUDA Version" | ForEach-Object { $_ -match "CUDA Version:\s*(\d+\.\d+)" | Out-Null; $matches[1] }) if ($cudaVersion) { Write-Host "Detected CUDA version: $cudaVersion" -ForegroundColor Green - "[$(Get-Date)] CUDA detected: $cudaVersion" | Add-Content $OutputFile - $cudaMajor = [int]($cudaVersion.Split('.')[0]) if ($cudaMajor -ge 12) { $torchIndex = "https://download.pytorch.org/whl/cu121" @@ -120,9 +141,21 @@ if ($nvidiaSmi) { } } } + $backend_arg = @("--backend", "cuda") +} elseif ($gpuVendor -eq "amd") { + if ($IsWindows) { + Write-Host "AMD GPU detected, but ROCm is not supported on Windows. Falling back to CPU." + $torchIndex = "https://download.pytorch.org/whl/cpu" + $backend_arg = @("--backend", "auto") } else { - Write-Host "No NVIDIA GPU detected. Installing CPU-only PyTorch..." - "[$(Get-Date)] No CUDA detected, using CPU PyTorch" | Add-Content $OutputFile + $torchIndex = "https://download.pytorch.org/whl/rocm5.4.2" + $backend_arg = @("--backend", "cuda") # PyTorch uses 'cuda' for ROCm + Write-Host "AMD GPU detected. ROCm is only supported on Linux. Will attempt ROCm PyTorch." + } +} elseif ($gpuVendor -eq "intel") { + Write-Host "Intel GPU detected. Attempting to use PyTorch XPU backend (requires PyTorch 2.7+ and latest Intel drivers)." + $torchIndex = "https://download.pytorch.org/whl/xpu" + $backend_arg = @("--backend", "xpu") } Write-Host "" @@ -146,8 +179,8 @@ if ($LASTEXITCODE -ne 0) { Write-Host "" Write-Host "Step 5: Installing remaining dependencies..." -$ErrorActionPreference = "SilentlyContinue" -$output = & uv pip install numpy psutil tabulate nvidia-ml-py3 pytest pytest-asyncio pyyaml 2>&1 +# Always install typing_extensions as part of dependencies +$output = & uv pip install numpy psutil tabulate nvidia-ml-py3 pytest pytest-asyncio pyyaml typing_extensions 2>&1 $ErrorActionPreference = "Continue" $output | Out-String | Tee-Object -Append $OutputFile @@ -198,7 +231,7 @@ Write-Host "Output is being saved to the results file..." # Run benchmark - PowerShell handles subprocess differently $env:PYTHONUNBUFFERED = "1" -$output = & python benchmark.py --quick 2>&1 | Out-String +$output = & python benchmark.py --quick @device_args @backend_arg 2>&1 | Out-String $benchmarkResult = $LASTEXITCODE $output | Tee-Object -Append "..\$OutputFile" @@ -228,7 +261,7 @@ Write-Host "NOTE: If nothing has changed after 90 minutes, press Ctrl+C" -Foregr Write-Host "The test intentionally pushes VRAM limits and may appear frozen when it hits limits." # Run memory benchmark -$output = & python memory_benchmark.py --counts 1,2,5,10,25,50,100 2>&1 | Out-String +$output = & python memory_benchmark.py --counts 1,2,5,10,25,50,100 @device_args @backend_arg 2>&1 | Out-String $memoryResult = $LASTEXITCODE $output | Tee-Object -Append "..\$OutputFile" diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 8bb4b25..02d5827 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -15,18 +15,28 @@ import gc import statistics import time -from typing import Optional +from typing import Optional, Any, List import numpy as np import psutil import pytest from tabulate import tabulate +import os try: import torch TORCH_AVAILABLE = True CUDA_AVAILABLE = torch.cuda.is_available() + # Set CUDA device from environment variable if specified + cuda_env = os.environ.get("PYISOLATE_CUDA_DEVICE") + if CUDA_AVAILABLE and cuda_env is not None: + torch.cuda.set_device(int(cuda_env)) + print(f"[PyIsolate] Using CUDA device {cuda_env}: {torch.cuda.get_device_name(int(cuda_env))}") + elif CUDA_AVAILABLE: + print(f"[PyIsolate] Using default CUDA device {torch.cuda.current_device()}: {torch.cuda.get_device_name(torch.cuda.current_device())}") + else: + print("[PyIsolate] CUDA not available, using CPU only.") except ImportError: TORCH_AVAILABLE = False CUDA_AVAILABLE = False @@ -190,6 +200,8 @@ def print_summary(self): @pytest.mark.asyncio class TestRPCBenchmarks(IntegrationTestBase): """Benchmark tests for RPC call overhead.""" + benchmark_ext_shared: Optional[object] = None + runner: Optional[BenchmarkRunner] = None @pytest.fixture(autouse=True) async def setup_benchmark_environment(self): @@ -305,14 +317,14 @@ def example_entrypoint(): return BenchmarkExtension() ''' - self.create_extension( - "benchmark_ext", - benchmark_extension_code, - dependencies=["numpy>=1.26.0", "torch>=2.0.0"] if TORCH_AVAILABLE else ["numpy>=1.26.0"], - ) + # self.create_extension( + # "benchmark_ext", + # benchmark_extension_code, + # dependencies=["numpy>=1.26.0", "torch>=2.0.0"] if TORCH_AVAILABLE else ["numpy>=1.26.0"], + # ) # Load extensions - extensions_config = [{"name": "benchmark_ext"}] + extensions_config: List[dict[str, Any]] = [{"name": "benchmark_ext"}] # Add share_torch config if available if TORCH_AVAILABLE: @@ -320,6 +332,11 @@ def example_entrypoint(): self.extensions = await self.load_extensions(extensions_config[:1]) # Load one for now self.benchmark_ext = self.extensions[0] + self.benchmark_ext_shared = None + if TORCH_AVAILABLE and len(extensions_config) > 1: + shared_exts = await self.load_extensions([extensions_config[1]]) + if shared_exts: + self.benchmark_ext_shared = shared_exts[0] # Initialize benchmark runner self.runner = BenchmarkRunner(warmup_runs=3, benchmark_runs=15) @@ -336,6 +353,7 @@ async def test_small_data_benchmarks(self): print("SMALL DATA BENCHMARKS") print("=" * 60) + assert self.runner is not None # type: ignore # Integer benchmarks test_int = 42 await self.runner.run_benchmark( @@ -361,6 +379,7 @@ async def test_large_data_benchmarks(self): print("LARGE DATA BENCHMARKS") print("=" * 60) + assert self.runner is not None # type: ignore # Large numpy array (10MB) large_array = np.random.random((1024, 1024)) # ~8MB float64 @@ -393,6 +412,7 @@ async def test_torch_tensor_benchmarks(self): print("TORCH TENSOR BENCHMARKS") print("=" * 60) + assert self.runner is not None # type: ignore # Small tensor (CPU) with torch.inference_mode(): small_tensor_cpu = torch.randn(100, 100) # ~40KB @@ -440,6 +460,7 @@ async def test_complex_call_patterns(self): print("COMPLEX CALL PATTERN BENCHMARKS") print("=" * 60) + assert self.runner is not None # type: ignore # Recursive calls through host singleton await self.runner.run_benchmark( "Recursive Host Calls (depth=3)", lambda: self.benchmark_ext.recursive_host_call(3) @@ -455,10 +476,12 @@ async def test_print_final_summary(self): # Small delay to ensure this runs last await asyncio.sleep(0.1) + assert self.runner is not None # type: ignore self.runner.print_summary() # Basic assertions to ensure benchmarks ran assert len(self.runner.results) > 0, "No benchmark results found" + assert self.runner is not None # type: ignore # Verify we have both local and RPC results for comparison local_results = [r for r in self.runner.results if "local" in r.name.lower()] diff --git a/tests/test_torch_tensor_integration.py b/tests/test_torch_tensor_integration.py index 1b78d47..2d8549a 100644 --- a/tests/test_torch_tensor_integration.py +++ b/tests/test_torch_tensor_integration.py @@ -1,3 +1,4 @@ +# pyright: reportMissingImports=false """ Integration tests for passing torch.Tensor objects between host and extensions. @@ -20,7 +21,7 @@ # Import shared components from example sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "example")) -from shared import DatabaseSingleton, ExampleExtensionBase +from shared import DatabaseSingleton, ExampleExtensionBase # type: ignore # Check torch availability try: @@ -28,6 +29,15 @@ HAS_TORCH = True HAS_CUDA = torch.cuda.is_available() + # Set CUDA device from environment variable if specified + cuda_env = os.environ.get("PYISOLATE_CUDA_DEVICE") + if HAS_CUDA and cuda_env is not None: + torch.cuda.set_device(int(cuda_env)) + print(f"[PyIsolate] Using CUDA device {cuda_env}: {torch.cuda.get_device_name(int(cuda_env))}") + elif HAS_CUDA: + print(f"[PyIsolate] Using default CUDA device {torch.cuda.current_device()}: {torch.cuda.get_device_name(torch.cuda.current_device())}") + else: + print("[PyIsolate] CUDA not available, using CPU only.") except ImportError: torch = None HAS_TORCH = False From a9ebed18d75f0193ec70b1f0b4eb244a87f7f300 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 9 Jul 2025 10:01:17 -0700 Subject: [PATCH 02/15] main change is device type was a integer and changed it to a string some you can still pick cpu only testing --- benchmarks/benchmark.py | 18 ++++++++++++++++-- benchmarks/memory_benchmark.py | 19 ++++++++++++++++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 9616e58..fe7d798 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -557,7 +557,7 @@ def main(): help="Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), or xpu (Intel oneAPI)", ) - parser.add_argument("--device", type=int, default=None, help="CUDA device index to use (if applicable)") + parser.add_argument("--device", type=str, default=None, help="Device index (int) or 'cpu' to force CPU mode") args = parser.parse_args() @@ -574,7 +574,21 @@ def main(): # Set device and backend backend = args.backend - device_idx = args.device + device_arg = args.device + if device_arg is not None and str(device_arg).lower() == "cpu": + backend = "cpu" + device_idx = None + print("[PyIsolate] Forcing CPU mode due to --device=cpu") + args.no_gpu = True + elif device_arg is not None: + try: + device_idx = int(device_arg) + except ValueError: + print(f"Invalid --device value: {device_arg}. Must be integer or 'cpu'.") + sys.exit(1) + else: + device_idx = None + device_str = "cpu" device_name = "cpu" backend_used = "cpu" diff --git a/benchmarks/memory_benchmark.py b/benchmarks/memory_benchmark.py index 993294b..329c757 100644 --- a/benchmarks/memory_benchmark.py +++ b/benchmarks/memory_benchmark.py @@ -978,7 +978,7 @@ def main(): help="Test both share_torch=True and share_torch=False (default: only share_torch=True)", ) - parser.add_argument("--device", type=int, default=None, help="CUDA device index to use (if applicable)") + parser.add_argument("--device", type=str, default=None, help="Device index (int) or 'cpu' to force CPU mode") parser.add_argument("--no-gpu", action="store_true", help="Skip GPU benchmarks even if CUDA is available") parser.add_argument( "--backend", @@ -989,6 +989,23 @@ def main(): args = parser.parse_args() + # Set device and backend + backend = args.backend + device_arg = args.device + if device_arg is not None and str(device_arg).lower() == "cpu": + backend = "cpu" + device_idx = None + print("[PyIsolate] Forcing CPU mode due to --device=cpu") + args.no_gpu = True + elif device_arg is not None: + try: + device_idx = int(device_arg) + except ValueError: + print(f"Invalid --device value: {device_arg}. Must be integer or 'cpu'.") + sys.exit(1) + else: + device_idx = None + # Determine extension counts if args.counts: extension_counts = [int(x.strip()) for x in args.counts.split(",")] From ea59404c07a391eaa3872e9f3d022968673c0b4e Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:53:00 -0700 Subject: [PATCH 03/15] fix redundant torch requirements and correct wheels --- pyisolate/_internal/host.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pyisolate/_internal/host.py b/pyisolate/_internal/host.py index bb75544..67ad0c1 100644 --- a/pyisolate/_internal/host.py +++ b/pyisolate/_internal/host.py @@ -349,7 +349,9 @@ def _install_dependencies(self): if not use_xpu_backend and "intel" in str(self.config.get("device_name", "")).lower(): use_xpu_backend = True - # Install the same version of torch as the current process + # Install the same version of torch as the current process, if needed + torch_requirement = None + torch_index_args = [] if self.config["share_torch"]: import torch @@ -359,16 +361,16 @@ def _install_dependencies(self): torch_version = torch_version.split("+")[0] cuda_version = getattr(torch.version, "cuda", None) # type: ignore if use_xpu_backend: - uv_common_args += ["--index-url", "https://download.pytorch.org/whl/xpu"] - uv_args.append("torch>=2.7.0") + torch_requirement = "torch>=2.7.0" + torch_index_args = ["--index-url", "https://download.pytorch.org/whl/xpu"] elif cuda_version: - uv_common_args += [ + torch_requirement = f"torch=={torch_version}" + torch_index_args = [ "--extra-index-url", f"https://download.pytorch.org/whl/cu{cuda_version.replace('.', '')}", ] - uv_args.append(f"torch=={torch_version}") else: - uv_args.append(f"torch=={torch_version}") + torch_requirement = f"torch=={torch_version}" # Install extension dependencies from config if self.config["dependencies"] or self.config["share_torch"]: @@ -376,13 +378,21 @@ def _install_dependencies(self): # Re-validate dependencies before passing to subprocess (defense in depth) safe_dependencies = [] + torch_in_deps = False for dep in self.config["dependencies"]: validate_dependency(dep) # Remove any '+xpu' or '+cpu' from torch dependencies for Intel/XPU if use_xpu_backend and dep.startswith("torch"): dep = "torch>=2.7.0" + if dep.startswith("torch"): + torch_in_deps = True safe_dependencies.append(dep) + # Only add torch requirement if not already present + if torch_requirement and not torch_in_deps: + safe_dependencies.insert(0, torch_requirement) + uv_args += torch_index_args + # In normal mode, suppress output unless there are actual changes always_output = logger.isEnabledFor(logging.DEBUG) try: From 60afae5e196e74c9cd6d8ee7d2f0bc57bd246c81 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 9 Jul 2025 12:49:53 -0700 Subject: [PATCH 04/15] set python to 3.12 because 3.13 does not have all wheels and make the array writable before converting to a tensor --- pyisolate/_internal/host.py | 2 +- pyisolate/_internal/shared.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pyisolate/_internal/host.py b/pyisolate/_internal/host.py index 67ad0c1..e3e87cc 100644 --- a/pyisolate/_internal/host.py +++ b/pyisolate/_internal/host.py @@ -309,7 +309,7 @@ def _create_extension_venv(self): raise RuntimeError("uv command not found in PATH") # Use the resolved, validated path - subprocess.check_call([uv_path, "venv", str(self.venv_path)]) # noqa: S603 + subprocess.check_call([uv_path, "venv", str(self.venv_path), "--python", "python3.12"]) # noqa: S603 # TODO(Optimization): Only do this when we update a extension to reduce startup time? def _install_dependencies(self): diff --git a/pyisolate/_internal/shared.py b/pyisolate/_internal/shared.py index 83d260f..e4287db 100644 --- a/pyisolate/_internal/shared.py +++ b/pyisolate/_internal/shared.py @@ -35,6 +35,12 @@ def maybe_to_dlpack(obj): if isinstance(obj, torch.Tensor) and hasattr(obj, 'device') and obj.device.type == 'xpu': + # If the input is a NumPy array and not writable, make it writable before converting + if hasattr(obj, 'numpy'): + arr = obj.numpy() + if not arr.flags.writeable: + arr = arr.copy() + return torch.from_numpy(arr).to('xpu') return torch.utils.dlpack.to_dlpack(obj) # type: ignore[attr-defined] return obj From a01c1f49b574e8d9eb9ea80884005dec021482e2 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 9 Jul 2025 14:45:31 -0700 Subject: [PATCH 05/15] ruff and lint checks --- benchmarks/benchmark.py | 24 +++-- benchmarks/memory_benchmark.py | 79 +++++++++++++---- pyisolate/_internal/shared.py | 21 +++-- tests/test_benchmarks.py | 118 ++----------------------- tests/test_torch_tensor_integration.py | 6 +- 5 files changed, 102 insertions(+), 146 deletions(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index fe7d798..d26d777 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -13,9 +13,9 @@ import argparse import asyncio +import platform import sys from pathlib import Path -import platform # Add project root to path project_root = Path(__file__).parent.parent @@ -557,7 +557,12 @@ def main(): help="Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), or xpu (Intel oneAPI)", ) - parser.add_argument("--device", type=str, default=None, help="Device index (int) or 'cpu' to force CPU mode") + parser.add_argument( + "--device", + type=str, + default=None, + help="Device index (int) or 'cpu' to force CPU mode", + ) args = parser.parse_args() @@ -628,11 +633,19 @@ def main(): if device_idx is not None: torch.xpu.set_device(device_idx) device_str = f"xpu{device_idx}" - device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + device_name = ( + torch.xpu.get_device_name(device_idx) + if hasattr(torch.xpu, "get_device_name") + else "Intel XPU" + ) else: device_idx = torch.xpu.current_device() device_str = f"xpu{device_idx}" - device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + device_name = ( + torch.xpu.get_device_name(device_idx) + if hasattr(torch.xpu, "get_device_name") + else "Intel XPU" + ) backend_used = "xpu" print(f"[PyIsolate] Using Intel XPU device {device_idx}: {device_name}") else: @@ -641,7 +654,8 @@ def main(): print(f"[PyIsolate] Error setting device/backend: {e}") # Generate results filename with backend and device info - import socket, datetime + import datetime + import socket computer = socket.gethostname() timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") device_tag = f"{backend_used}{device_idx if device_idx is not None else 0}" diff --git a/benchmarks/memory_benchmark.py b/benchmarks/memory_benchmark.py index 329c757..e3f66f4 100644 --- a/benchmarks/memory_benchmark.py +++ b/benchmarks/memory_benchmark.py @@ -182,7 +182,10 @@ def get_memory_usage(self) -> dict[str, float]: except psutil.NoSuchProcess: pass - memory_info["total_ram_mb"] = (memory_info["host_ram_mb"] or 0) + (memory_info["children_ram_mb"] or 0) + memory_info["total_ram_mb"] = ( + (memory_info["host_ram_mb"] or 0) + + (memory_info["children_ram_mb"] or 0) + ) except Exception as e: print(f"Error getting RAM usage: {e}") @@ -333,7 +336,10 @@ class MemoryBenchmarkRunner: def __init__(self, test_base: IntegrationTestBase): self.test_base = test_base if self.test_base.test_root is None: - raise RuntimeError("test_root is not set on test_base. Did you await setup_test_environment() successfully?") + raise RuntimeError( + "test_root is not set on test_base. " + "Did you await setup_test_environment() successfully?" + ) self.memory_tracker = MemoryTracker() self.results = [] @@ -517,7 +523,10 @@ async def run_scaling_test( "after_send_ram_mb": after_send_memory["total_ram_mb"], "load_ram_delta_mb": after_load_memory["total_ram_mb"] - before_memory["total_ram_mb"], "send_ram_delta_mb": after_send_memory["total_ram_mb"] - after_load_memory["total_ram_mb"], - "ram_per_extension_mb": (float(after_load_memory["total_ram_mb"] or 0) - float(before_memory["total_ram_mb"] or 0)) / num_extensions if num_extensions else 0, + "ram_per_extension_mb": ( + float(after_load_memory["total_ram_mb"] or 0) + - float(before_memory["total_ram_mb"] or 0) + ) / num_extensions if num_extensions else 0, "before_vram_mb": before_memory["total_vram_mb"], "after_load_vram_mb": after_load_memory["total_vram_mb"], "after_send_vram_mb": after_send_memory["total_vram_mb"], @@ -687,19 +696,38 @@ async def run_large_tensor_sharing_test( "after_send_vram_mb": after_send["total_vram_mb"], "tensor_size_mb": actual_size_mb, "tensor_device": str(large_tensor.device), - "ram_for_tensor_creation_mb": after_create["total_ram_mb"] - baseline["total_ram_mb"], - "ram_for_distribution_mb": after_send["total_ram_mb"] - after_create["total_ram_mb"], - "ram_per_extension_copy_mb": (float(after_send["total_ram_mb"] or 0) - float(after_create["total_ram_mb"] or 0)) / num_extensions if num_extensions else 0, - "vram_for_tensor_creation_mb": after_create["total_vram_mb"] - baseline["total_vram_mb"], - "vram_for_distribution_mb": after_send["total_vram_mb"] - after_create["total_vram_mb"], + "ram_for_tensor_creation_mb": ( + float(after_create["total_ram_mb"] or 0) + - float(baseline["total_ram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "ram_for_distribution_mb": ( + float(after_send["total_ram_mb"] or 0) + - float(after_create["total_ram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "ram_per_extension_copy_mb": ( + float(after_send["total_ram_mb"] or 0) + - float(after_create["total_ram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "vram_for_tensor_creation_mb": ( + float(after_create["total_vram_mb"] or 0) + - float(baseline["total_vram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "vram_for_distribution_mb": ( + float(after_send["total_vram_mb"] or 0) + - float(after_create["total_vram_mb"] or 0) + ) / num_extensions if num_extensions else 0, # Add GPU total memory tracking "baseline_gpu_mb": baseline.get("gpu_used_mb", 0), "after_create_gpu_mb": after_create.get("gpu_used_mb", 0), "after_send_gpu_mb": after_send.get("gpu_used_mb", 0), - "gpu_for_tensor_creation_mb": after_create.get("gpu_used_mb", 0) - - baseline.get("gpu_used_mb", 0), - "gpu_for_distribution_mb": after_send.get("gpu_used_mb", 0) - - after_create.get("gpu_used_mb", 0), + "gpu_for_tensor_creation_mb": ( + float(after_create.get("gpu_used_mb", 0) or 0) + - float(baseline.get("gpu_used_mb", 0) or 0) + ) / num_extensions if num_extensions else 0, + "gpu_for_distribution_mb": ( + float(after_send.get("gpu_used_mb", 0) or 0) + - float(after_create.get("gpu_used_mb", 0) or 0) + ) / num_extensions if num_extensions else 0, "send_time_s": send_time, } @@ -813,7 +841,7 @@ def print_memory_benchmark_summary(results: dict): gpu_pct = (gpu_used / gpu_total) * 100 if gpu_total else 0 print(f" GPU Total: {gpu_used:.1f} / {gpu_total:.1f} MB ({gpu_pct:.1f}% used)") else: - print(f" GPU Total: N/A") + print(" GPU Total: N/A") # Scaling results for test_type in ["cpu_no_share", "cpu_share", "gpu_no_share", "gpu_share"]: @@ -978,7 +1006,12 @@ def main(): help="Test both share_torch=True and share_torch=False (default: only share_torch=True)", ) - parser.add_argument("--device", type=str, default=None, help="Device index (int) or 'cpu' to force CPU mode") + parser.add_argument( + "--device", + type=str, + default=None, + help="Device index (int) or 'cpu' to force CPU mode", + ) parser.add_argument("--no-gpu", action="store_true", help="Skip GPU benchmarks even if CUDA is available") parser.add_argument( "--backend", @@ -1078,11 +1111,19 @@ def main(): if device_idx is not None: torch.xpu.set_device(device_idx) device_str = f"xpu{device_idx}" - device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + device_name = ( + torch.xpu.get_device_name(device_idx) + if hasattr(torch.xpu, "get_device_name") + else "Intel XPU" + ) else: device_idx = torch.xpu.current_device() device_str = f"xpu{device_idx}" - device_name = torch.xpu.get_device_name(device_idx) if hasattr(torch.xpu, "get_device_name") else "Intel XPU" + device_name = ( + torch.xpu.get_device_name(device_idx) + if hasattr(torch.xpu, "get_device_name") + else "Intel XPU" + ) backend_used = "xpu" print(f"[PyIsolate] Using Intel XPU device {device_idx}: {device_name}") else: @@ -1091,7 +1132,8 @@ def main(): print(f"[PyIsolate] Error setting device/backend: {e}") # Generate results filename with backend and device info - import socket, datetime + import datetime + import socket computer = socket.gethostname() timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") device_tag = f"{backend_used}{device_idx if device_idx is not None else 0}" @@ -1111,7 +1153,8 @@ def main(): print(f"[PyIsolate] Intel XPU memory allocated: {mem_alloc / 1024 / 1024:.1f} MB") except Exception as e: print(f"[PyIsolate] Could not get Intel XPU memory info: {e}") - # For AMD ROCm, optionally try rocm-smi if available (not implemented here, but can be added with subprocess) + # For AMD ROCm, optionally try rocm-smi if available (not implemented here, + # but can be added with subprocess) # Determine what to test test_small = not args.large_only diff --git a/pyisolate/_internal/shared.py b/pyisolate/_internal/shared.py index e4287db..b8c3524 100644 --- a/pyisolate/_internal/shared.py +++ b/pyisolate/_internal/shared.py @@ -27,9 +27,9 @@ typehint_mp = multiprocessing -import torch -from torch.utils import dlpack as _dlpack import numpy as np +import torch +from torch.utils import dlpack as _dlpack # type: ignore[attr-defined] # Utility: Convert XPU tensor to DLPack capsule if needed @@ -41,7 +41,7 @@ def maybe_to_dlpack(obj): if not arr.flags.writeable: arr = arr.copy() return torch.from_numpy(arr).to('xpu') - return torch.utils.dlpack.to_dlpack(obj) # type: ignore[attr-defined] + return _dlpack.to_dlpack(obj) # type: ignore[attr-defined] return obj # Utility: Convert DLPack capsule to XPU tensor if needed @@ -49,18 +49,17 @@ def maybe_to_dlpack(obj): def maybe_from_dlpack(obj): # DLPack capsules are PyCapsule, not torch.Tensor if not isinstance(obj, torch.Tensor) and hasattr(obj, '__dlpack__'): - return torch.utils.dlpack.from_dlpack(obj) # type: ignore[attr-defined] + return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] # For raw PyCapsule (older PyTorch), try fallback if type(obj).__name__ == 'PyCapsule': - return torch.utils.dlpack.from_dlpack(obj) # type: ignore[attr-defined] + return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] return obj def maybe_serialize_tensor(obj): - if isinstance(obj, torch.Tensor) and hasattr(obj, 'device'): - if obj.device.type == 'xpu': - # Fallback: send as CPU buffer + metadata - arr = obj.cpu().numpy() - return ('xpu_tensor', arr.tobytes(), arr.shape, str(arr.dtype)) + if isinstance(obj, torch.Tensor) and hasattr(obj, 'device') and obj.device.type == 'xpu': + # Fallback: send as CPU buffer + metadata + arr = obj.cpu().numpy() + return ('xpu_tensor', arr.tobytes(), arr.shape, str(arr.dtype)) return obj def maybe_deserialize_tensor(obj): @@ -493,7 +492,7 @@ def _send_thread(self): if "result" in response: response_mod = dict(response) response_mod["result"] = maybe_serialize_tensor(response["result"]) - self.send_queue.put(response_mod) # type: ignore + self.send_queue.put(cast(RPCResponse, response_mod)) else: self.send_queue.put(response) diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 02d5827..8c0555e 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -13,15 +13,15 @@ import asyncio import gc +import os import statistics import time -from typing import Optional, Any, List +from typing import Any, Optional import numpy as np import psutil import pytest from tabulate import tabulate -import os try: import torch @@ -34,7 +34,10 @@ torch.cuda.set_device(int(cuda_env)) print(f"[PyIsolate] Using CUDA device {cuda_env}: {torch.cuda.get_device_name(int(cuda_env))}") elif CUDA_AVAILABLE: - print(f"[PyIsolate] Using default CUDA device {torch.cuda.current_device()}: {torch.cuda.get_device_name(torch.cuda.current_device())}") + print( + f"[PyIsolate] Using default CUDA device {torch.cuda.current_device()}: " + f"{torch.cuda.get_device_name(torch.cuda.current_device())}" + ) else: print("[PyIsolate] CUDA not available, using CPU only.") except ImportError: @@ -209,113 +212,6 @@ async def setup_benchmark_environment(self): await self.setup_test_environment("benchmark") # Create benchmark extension with all required dependencies - benchmark_extension_code = ''' -import asyncio -import numpy as np -from shared import ExampleExtension, DatabaseSingleton -from pyisolate import local_execution - -try: - import torch - TORCH_AVAILABLE = True -except ImportError: - TORCH_AVAILABLE = False - -class BenchmarkExtension(ExampleExtension): - """Extension with methods for benchmarking RPC overhead.""" - - async def initialize(self): - """Initialize the benchmark extension.""" - pass - - async def prepare_shutdown(self): - """Clean shutdown of benchmark extension.""" - pass - - async def do_stuff(self, value): - """Required abstract method from ExampleExtension.""" - return f"Processed: {value}" - - # ======================================== - # Small Data Benchmarks - # ======================================== - - async def echo_int(self, value: int) -> int: - """Echo an integer value.""" - return value - - async def echo_string(self, value: str) -> str: - """Echo a string value.""" - return value - - @local_execution - def echo_int_local(self, value: int) -> int: - """Local execution baseline for integer echo.""" - return value - - @local_execution - def echo_string_local(self, value: str) -> str: - """Local execution baseline for string echo.""" - return value - - # ======================================== - # Large Data Benchmarks - # ======================================== - - async def process_large_array(self, array: np.ndarray) -> int: - """Process a large numpy array and return its size.""" - return array.size - - async def echo_large_bytes(self, data: bytes) -> int: - """Echo large byte data and return its length.""" - return len(data) - - @local_execution - def process_large_array_local(self, array: np.ndarray) -> int: - """Local execution baseline for large array processing.""" - return array.size - - # ======================================== - # Torch Tensor Benchmarks - # ======================================== - - async def process_small_tensor(self, tensor) -> tuple: - """Process a small torch tensor.""" - if not TORCH_AVAILABLE: - return (0, "cpu") - return (tensor.numel(), str(tensor.device)) - - async def process_large_tensor(self, tensor) -> tuple: - """Process a large torch tensor.""" - if not TORCH_AVAILABLE: - return (0, "cpu") - return (tensor.numel(), str(tensor.device)) - - @local_execution - def process_small_tensor_local(self, tensor) -> tuple: - """Local execution baseline for small tensor processing.""" - if not TORCH_AVAILABLE: - return (0, "cpu") - return (tensor.numel(), str(tensor.device)) - - # ======================================== - # Recursive/Complex Call Patterns - # ======================================== - - async def recursive_host_call(self, depth: int) -> int: - """Make recursive calls through host singleton.""" - if depth <= 0: - return 0 - - db = DatabaseSingleton() - await db.set_value(f"depth_{depth}", depth) - value = await db.get_value(f"depth_{depth}") - return value + await self.recursive_host_call(depth - 1) - -def example_entrypoint(): - """Entry point for the benchmark extension.""" - return BenchmarkExtension() -''' # self.create_extension( # "benchmark_ext", @@ -324,7 +220,7 @@ def example_entrypoint(): # ) # Load extensions - extensions_config: List[dict[str, Any]] = [{"name": "benchmark_ext"}] + extensions_config: list[dict[str, Any]] = [{"name": "benchmark_ext"}] # Add share_torch config if available if TORCH_AVAILABLE: diff --git a/tests/test_torch_tensor_integration.py b/tests/test_torch_tensor_integration.py index 2d8549a..0140439 100644 --- a/tests/test_torch_tensor_integration.py +++ b/tests/test_torch_tensor_integration.py @@ -35,7 +35,11 @@ torch.cuda.set_device(int(cuda_env)) print(f"[PyIsolate] Using CUDA device {cuda_env}: {torch.cuda.get_device_name(int(cuda_env))}") elif HAS_CUDA: - print(f"[PyIsolate] Using default CUDA device {torch.cuda.current_device()}: {torch.cuda.get_device_name(torch.cuda.current_device())}") + print( + f"[PyIsolate] Using default CUDA device " + f"{torch.cuda.current_device()}: " + f"{torch.cuda.get_device_name(torch.cuda.current_device())}" + ) else: print("[PyIsolate] CUDA not available, using CPU only.") except ImportError: From 14c26051bd8b2fe9e8ae04cb37cbc25085811c06 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 9 Jul 2025 16:58:22 -0700 Subject: [PATCH 06/15] fix: venv isolation for extension imports, ensure only venv site-packages are used; codebase ruff clean --- .pre-commit-config.yaml | 4 ++-- pyisolate/_internal/client.py | 15 ++++++++++++--- pyisolate/_internal/shared.py | 24 +++++++++++++++--------- tests/test_benchmarks.py | 1 + 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1a5d1b5..fa6e0f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,8 +12,8 @@ repos: - id: debug-statements - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.8 + rev: v0.4.4 hooks: - id: ruff - args: [--fix] + args: ["check"] - id: ruff-format diff --git a/pyisolate/_internal/client.py b/pyisolate/_internal/client.py index 35c1c9c..3ce89ae 100644 --- a/pyisolate/_internal/client.py +++ b/pyisolate/_internal/client.py @@ -1,9 +1,7 @@ import asyncio import importlib.util import logging -import os.path import sys -import sysconfig from contextlib import nullcontext from ..config import ExtensionConfig @@ -26,7 +24,18 @@ async def async_entrypoint( logger.debug("Loading extension with Python executable: %s", sys.executable) logger.debug("Loading extension from: %s", module_path) - sys.path.insert(0, sysconfig.get_path("platlib")) + # Robustly ensure only the venv's site-packages are present in sys.path + import os + import site + + venv_prefix = sys.prefix + venv_site_packages = [p for p in site.getsitepackages() if p.startswith(venv_prefix)] + # Remove all site-packages not in the current venv + sys.path = [p for p in sys.path if not (("site-packages" in p) and (not p.startswith(venv_prefix)))] + # Prepend all venv site-packages to sys.path (in order) + for p in reversed(venv_site_packages): + if p not in sys.path: + sys.path.insert(0, p) rpc = AsyncRPC(recv_queue=to_extension, send_queue=from_extension) extension = extension_type() diff --git a/pyisolate/_internal/shared.py b/pyisolate/_internal/shared.py index b8c3524..050c42a 100644 --- a/pyisolate/_internal/shared.py +++ b/pyisolate/_internal/shared.py @@ -33,42 +33,48 @@ # Utility: Convert XPU tensor to DLPack capsule if needed + def maybe_to_dlpack(obj): - if isinstance(obj, torch.Tensor) and hasattr(obj, 'device') and obj.device.type == 'xpu': + if isinstance(obj, torch.Tensor) and hasattr(obj, "device") and obj.device.type == "xpu": # If the input is a NumPy array and not writable, make it writable before converting - if hasattr(obj, 'numpy'): + if hasattr(obj, "numpy"): arr = obj.numpy() if not arr.flags.writeable: arr = arr.copy() - return torch.from_numpy(arr).to('xpu') + return torch.from_numpy(arr).to("xpu") return _dlpack.to_dlpack(obj) # type: ignore[attr-defined] return obj + # Utility: Convert DLPack capsule to XPU tensor if needed + def maybe_from_dlpack(obj): # DLPack capsules are PyCapsule, not torch.Tensor - if not isinstance(obj, torch.Tensor) and hasattr(obj, '__dlpack__'): + if not isinstance(obj, torch.Tensor) and hasattr(obj, "__dlpack__"): return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] # For raw PyCapsule (older PyTorch), try fallback - if type(obj).__name__ == 'PyCapsule': + if type(obj).__name__ == "PyCapsule": return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] return obj + def maybe_serialize_tensor(obj): - if isinstance(obj, torch.Tensor) and hasattr(obj, 'device') and obj.device.type == 'xpu': + if isinstance(obj, torch.Tensor) and hasattr(obj, "device") and obj.device.type == "xpu": # Fallback: send as CPU buffer + metadata arr = obj.cpu().numpy() - return ('xpu_tensor', arr.tobytes(), arr.shape, str(arr.dtype)) + return ("xpu_tensor", arr.tobytes(), arr.shape, str(arr.dtype)) return obj + def maybe_deserialize_tensor(obj): - if isinstance(obj, tuple) and len(obj) == 4 and obj[0] == 'xpu_tensor': + if isinstance(obj, tuple) and len(obj) == 4 and obj[0] == "xpu_tensor": _, buf, shape, dtype = obj arr = np.frombuffer(buf, dtype=dtype).reshape(shape) - return torch.from_numpy(arr).to('xpu') + return torch.from_numpy(arr).to("xpu") return obj + logger = logging.getLogger(__name__) # TODO - Remove me diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 8c0555e..92336f3 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -203,6 +203,7 @@ def print_summary(self): @pytest.mark.asyncio class TestRPCBenchmarks(IntegrationTestBase): """Benchmark tests for RPC call overhead.""" + benchmark_ext_shared: Optional[object] = None runner: Optional[BenchmarkRunner] = None From 7de46684e93921dc86711e6065be7434f7511e8c Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Fri, 11 Jul 2025 15:03:44 -0700 Subject: [PATCH 07/15] intel testing added --- benchmarks/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index d26d777..eafed4d 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -546,7 +546,7 @@ def main(): parser.add_argument( "--torch-mode", choices=["both", "standard", "shared"], - default="shared", + default="both", help="Which torch mode to test: both, standard (no share_torch), or shared (share_torch only)", ) From ca3ced13698ef7752a7c10427a31693315de6b86 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Sat, 12 Jul 2025 08:18:24 -0700 Subject: [PATCH 08/15] venv isolation has been applied --- .pre-commit-config.yaml | 19 ------------------- benchmarks/benchmark.py | 2 +- 2 files changed, 1 insertion(+), 20 deletions(-) delete mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index fa6e0f9..0000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,19 +0,0 @@ -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-yaml - - id: check-added-large-files - - id: check-merge-conflict - - id: check-toml - - id: check-docstring-first - - id: debug-statements - - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.4 - hooks: - - id: ruff - args: ["check"] - - id: ruff-format diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index eafed4d..d26d777 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -546,7 +546,7 @@ def main(): parser.add_argument( "--torch-mode", choices=["both", "standard", "shared"], - default="both", + default="shared", help="Which torch mode to test: both, standard (no share_torch), or shared (share_torch only)", ) From 42728ed0aa10f9db667895f9812ecadf2a19d237 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Sat, 12 Jul 2025 11:07:18 -0700 Subject: [PATCH 09/15] added GPU share results for intel GPUs --- benchmarks/memory_benchmark.py | 577 +++++++++++++++------------------ 1 file changed, 265 insertions(+), 312 deletions(-) diff --git a/benchmarks/memory_benchmark.py b/benchmarks/memory_benchmark.py index e3f66f4..07817b8 100644 --- a/benchmarks/memory_benchmark.py +++ b/benchmarks/memory_benchmark.py @@ -51,6 +51,26 @@ from pyisolate import ExtensionConfig, ExtensionManager, ExtensionManagerConfig from tests.test_integration import IntegrationTestBase +# 1. Device detection helpers (add after imports) +def detect_available_backends(): + import torch + backends = ["cpu"] + cuda_available = torch.cuda.is_available() + xpu_available = hasattr(torch, "xpu") and torch.xpu.is_available() + rocm_available = False + if cuda_available: + torch_version = getattr(torch, 'version', None) + hip_version = getattr(torch_version, 'hip', None) if torch_version else None + if hip_version is not None: + rocm_available = True + if cuda_available and not rocm_available: + backends.append("cuda") + if rocm_available: + backends.append("rocm") + if xpu_available: + backends.append("xpu") + return backends + class MemoryTracker: """Tracks memory usage across host and child processes.""" @@ -381,15 +401,16 @@ async def run_scaling_test( num_extensions_list: list[int], share_torch: bool = True, test_tensor_size: tuple[int, ...] = (512, 512), - use_cuda: bool = False, + device: str = "cpu", ) -> list[dict]: """Test memory scaling with different numbers of extensions.""" + import torch results = [] extension_code = await create_memory_benchmark_extension() for num_extensions in num_extensions_list: print(f"\n{'=' * 60}") - print(f"Testing with {num_extensions} extensions (share_torch={share_torch})") + print(f"Testing with {num_extensions} extensions (share_torch={share_torch}, device={device})") print("=" * 60) # Create extensions @@ -452,11 +473,16 @@ async def run_scaling_test( after_load_memory = self.memory_tracker.get_memory_usage() # Create test tensor - print(f"Creating test tensor {test_tensor_size}...") + print(f"Creating test tensor {test_tensor_size} on {device}...") with torch.inference_mode(): - if use_cuda and CUDA_AVAILABLE: + if device == "cuda": + test_tensor = torch.randn(*test_tensor_size, device="cuda") + torch.cuda.synchronize() + elif device == "rocm": test_tensor = torch.randn(*test_tensor_size, device="cuda") - torch.cuda.synchronize() # Ensure tensor creation completes + torch.cuda.synchronize() + elif device == "xpu": + test_tensor = torch.randn(*test_tensor_size, device="xpu") else: test_tensor = torch.randn(*test_tensor_size) @@ -464,7 +490,7 @@ async def run_scaling_test( print(f"Tensor size: {tensor_size_mb:.1f} MB on {test_tensor.device}") # Check memory after tensor creation - if use_cuda and CUDA_AVAILABLE: + if device in ("cuda", "rocm"): post_tensor_memory = self.memory_tracker.get_memory_usage() print( f"GPU memory after tensor creation: {post_tensor_memory.get('gpu_used_mb', 0):.1f} MB " @@ -481,7 +507,7 @@ async def run_scaling_test( if i == 0: print(f" First extension stored: {info}") # Force GPU sync after each send for accurate memory tracking - if use_cuda and CUDA_AVAILABLE: + if device in ("cuda", "rocm"): torch.cuda.synchronize() except Exception as e: print(f" Failed to send to {ext_name}: {e}") @@ -490,7 +516,7 @@ async def run_scaling_test( print(f"Send completed in {send_time:.2f}s") # Force final sync before measuring - if use_cuda and CUDA_AVAILABLE: + if device in ("cuda", "rocm"): torch.cuda.synchronize() # Wait for memory to settle @@ -558,7 +584,7 @@ async def run_scaling_test( print(f" Baseline: {self.memory_tracker.baseline_gpu_memory_mb:.1f} MB") # Show GPU memory if this is a GPU test - if use_cuda and result["load_gpu_delta_mb"] > 0: + if device in ("cuda", "rocm") and result["load_gpu_delta_mb"] > 0: print(f" GPU memory for tensor creation: {result['load_gpu_delta_mb']:.1f} MB") print(f" GPU memory for tensor transfer: {result['send_gpu_delta_mb']:.1f} MB") else: @@ -572,7 +598,7 @@ async def run_scaling_test( manager.stop_all_extensions() del test_tensor gc.collect() - if CUDA_AVAILABLE: + if device in ("cuda", "rocm"): torch.cuda.empty_cache() torch.cuda.synchronize() @@ -582,162 +608,160 @@ async def run_scaling_test( return results async def run_large_tensor_sharing_test( - self, num_extensions: int = 50, tensor_gb: float = 2.0, test_both_modes: bool = False + self, num_extensions: int = 50, tensor_gb: float = 2.0, test_both_modes: bool = False, device: str = "cpu" ) -> dict: """Test memory sharing with a large tensor across multiple extensions.""" + import torch print(f"\n{'=' * 60}") - print(f"Large Tensor Sharing Test ({tensor_gb}GB tensor, {num_extensions} extensions)") + print(f"Large Tensor Sharing Test ({tensor_gb}GB tensor, {num_extensions} extensions, device={device})") print("=" * 60) extension_code = await create_memory_benchmark_extension() results = {} # Test both CPU and GPU tensors - for use_cuda in [False, True]: - if use_cuda and not CUDA_AVAILABLE: - continue - - device_name = "GPU" if use_cuda else "CPU" - print(f"\n{'=' * 50}") - print(f"Testing {device_name} Tensors") - print("=" * 50) - - results[device_name.lower()] = {} - - # Test only with share_torch=True by default - share_torch_modes = [False, True] if test_both_modes else [True] - for share_torch in share_torch_modes: - print(f"\n--- Testing {device_name} with share_torch={share_torch} ---") - - # Create extensions - extensions = [] - extension_venv_root = getattr(self.test_base, "test_root", None) - if extension_venv_root is not None: - extension_venv_root = extension_venv_root / "extension-venvs" - else: - extension_venv_root = "extension-venvs" - manager = ExtensionManager( - MemoryBenchmarkExtensionBase, - ExtensionManagerConfig(venv_root_path=str(extension_venv_root)), + device_name = device.upper() + results[device_name.lower()] = {} + share_torch_modes = [False, True] if test_both_modes else [True] + for share_torch in share_torch_modes: + print(f"\n--- Testing {device_name} with share_torch={share_torch} ---") + + # Create extensions + extensions = [] + extension_venv_root = getattr(self.test_base, "test_root", None) + if extension_venv_root is not None: + extension_venv_root = extension_venv_root / "extension-venvs" + else: + extension_venv_root = "extension-venvs" + manager = ExtensionManager( + MemoryBenchmarkExtensionBase, + ExtensionManagerConfig(venv_root_path=str(extension_venv_root)), + ) + + # Measure baseline + gc.collect() + if CUDA_AVAILABLE: + torch.cuda.empty_cache() + baseline = self.memory_tracker.get_memory_usage() + + # Create extensions + for i in range(num_extensions): + ext_name = f"large_test_ext_{device_name.lower()}_{i}" + self.test_base.create_extension( + ext_name, + dependencies=["torch>=2.0.0"], + share_torch=share_torch, + extension_code=extension_code, ) - # Measure baseline - gc.collect() - if CUDA_AVAILABLE: - torch.cuda.empty_cache() - baseline = self.memory_tracker.get_memory_usage() - - # Create extensions - for i in range(num_extensions): - ext_name = f"large_test_ext_{device_name.lower()}_{i}" - self.test_base.create_extension( - ext_name, - dependencies=["torch>=2.0.0"], - share_torch=share_torch, - extension_code=extension_code, - ) + config = ExtensionConfig( + name=ext_name, + module_path=str((self.test_base.test_root or Path(".")) / "extensions" / ext_name), + isolated=True, + dependencies=["torch>=2.0.0"], + apis=[], + share_torch=share_torch, + ) - config = ExtensionConfig( - name=ext_name, - module_path=str((self.test_base.test_root or Path(".")) / "extensions" / ext_name), - isolated=True, - dependencies=["torch>=2.0.0"], - apis=[], - share_torch=share_torch, - ) + ext = manager.load_extension(config) + extensions.append((ext_name, ext)) - ext = manager.load_extension(config) - extensions.append((ext_name, ext)) + # Wait for extensions to initialize + await asyncio.sleep(2) - # Wait for extensions to initialize - await asyncio.sleep(2) + # Create large tensor + # Calculate size for desired GB (float32 = 4 bytes per element) + num_elements = int(tensor_gb * 1024 * 1024 * 1024 / 4) + # Make it a square-ish tensor + side = int(num_elements**0.5) - # Create large tensor - # Calculate size for desired GB (float32 = 4 bytes per element) - num_elements = int(tensor_gb * 1024 * 1024 * 1024 / 4) - # Make it a square-ish tensor - side = int(num_elements**0.5) + print(f"Creating {tensor_gb}GB tensor ({side}x{side}) on {device_name}...") + with torch.inference_mode(): + if device == "cuda": + large_tensor = torch.randn(side, side, device="cuda") + torch.cuda.synchronize() + elif device == "rocm": + large_tensor = torch.randn(side, side, device="cuda") + torch.cuda.synchronize() + elif device == "xpu": + large_tensor = torch.randn(side, side, device="xpu") + else: + large_tensor = torch.randn(side, side) + actual_size_mb = large_tensor.element_size() * large_tensor.numel() / (1024 * 1024) + print(f"Actual tensor size: {actual_size_mb:.1f} MB on {large_tensor.device}") - print(f"Creating {tensor_gb}GB tensor ({side}x{side}) on {device_name}...") - with torch.inference_mode(): - large_tensor = ( - torch.randn(side, side, device="cuda") if use_cuda else torch.randn(side, side) - ) - actual_size_mb = large_tensor.element_size() * large_tensor.numel() / (1024 * 1024) - print(f"Actual tensor size: {actual_size_mb:.1f} MB on {large_tensor.device}") + # Measure after tensor creation + after_create = self.memory_tracker.get_memory_usage() - # Measure after tensor creation - after_create = self.memory_tracker.get_memory_usage() + # Send to all extensions + print(f"Sending large {device_name} tensor to {num_extensions} extensions...") + send_start = time.time() + + for _i, (ext_name, ext) in enumerate(extensions): + try: + await ext.store_tensor("large_tensor", large_tensor) + print(f" Sent to {ext_name}") + except Exception as e: + print(f" Failed to send to {ext_name}: {e}") - # Send to all extensions - print(f"Sending large {device_name} tensor to {num_extensions} extensions...") - send_start = time.time() + send_time = time.time() - send_start - for _i, (ext_name, ext) in enumerate(extensions): - try: - await ext.store_tensor("large_tensor", large_tensor) - print(f" Sent to {ext_name}") - except Exception as e: - print(f" Failed to send to {ext_name}: {e}") - - send_time = time.time() - send_start - - # Measure after sending - await asyncio.sleep(2) - after_send = self.memory_tracker.get_memory_usage() - - # Store results - results[device_name.lower()][f"share_torch_{share_torch}"] = { - "baseline_ram_mb": baseline["total_ram_mb"], - "after_create_ram_mb": after_create["total_ram_mb"], - "after_send_ram_mb": after_send["total_ram_mb"], - "baseline_vram_mb": baseline["total_vram_mb"], - "after_create_vram_mb": after_create["total_vram_mb"], - "after_send_vram_mb": after_send["total_vram_mb"], - "tensor_size_mb": actual_size_mb, - "tensor_device": str(large_tensor.device), - "ram_for_tensor_creation_mb": ( - float(after_create["total_ram_mb"] or 0) - - float(baseline["total_ram_mb"] or 0) - ) / num_extensions if num_extensions else 0, - "ram_for_distribution_mb": ( - float(after_send["total_ram_mb"] or 0) - - float(after_create["total_ram_mb"] or 0) - ) / num_extensions if num_extensions else 0, - "ram_per_extension_copy_mb": ( - float(after_send["total_ram_mb"] or 0) - - float(after_create["total_ram_mb"] or 0) - ) / num_extensions if num_extensions else 0, - "vram_for_tensor_creation_mb": ( - float(after_create["total_vram_mb"] or 0) - - float(baseline["total_vram_mb"] or 0) - ) / num_extensions if num_extensions else 0, - "vram_for_distribution_mb": ( - float(after_send["total_vram_mb"] or 0) - - float(after_create["total_vram_mb"] or 0) - ) / num_extensions if num_extensions else 0, - # Add GPU total memory tracking - "baseline_gpu_mb": baseline.get("gpu_used_mb", 0), - "after_create_gpu_mb": after_create.get("gpu_used_mb", 0), - "after_send_gpu_mb": after_send.get("gpu_used_mb", 0), - "gpu_for_tensor_creation_mb": ( - float(after_create.get("gpu_used_mb", 0) or 0) - - float(baseline.get("gpu_used_mb", 0) or 0) - ) / num_extensions if num_extensions else 0, - "gpu_for_distribution_mb": ( - float(after_send.get("gpu_used_mb", 0) or 0) - - float(after_create.get("gpu_used_mb", 0) or 0) - ) / num_extensions if num_extensions else 0, - "send_time_s": send_time, - } - - # Cleanup - manager.stop_all_extensions() - del large_tensor - gc.collect() - if CUDA_AVAILABLE: - torch.cuda.empty_cache() - await asyncio.sleep(2) + # Measure after sending + await asyncio.sleep(2) + after_send = self.memory_tracker.get_memory_usage() + + # Store results + results[device_name.lower()][f"share_torch_{share_torch}"] = { + "baseline_ram_mb": baseline["total_ram_mb"], + "after_create_ram_mb": after_create["total_ram_mb"], + "after_send_ram_mb": after_send["total_ram_mb"], + "baseline_vram_mb": baseline["total_vram_mb"], + "after_create_vram_mb": after_create["total_vram_mb"], + "after_send_vram_mb": after_send["total_vram_mb"], + "tensor_size_mb": actual_size_mb, + "tensor_device": str(large_tensor.device), + "ram_for_tensor_creation_mb": ( + float(after_create["total_ram_mb"] or 0) + - float(baseline["total_ram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "ram_for_distribution_mb": ( + float(after_send["total_ram_mb"] or 0) + - float(after_create["total_ram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "ram_per_extension_copy_mb": ( + float(after_send["total_ram_mb"] or 0) + - float(after_create["total_ram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "vram_for_tensor_creation_mb": ( + float(after_create["total_vram_mb"] or 0) + - float(baseline["total_vram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + "vram_for_distribution_mb": ( + float(after_send["total_vram_mb"] or 0) + - float(after_create["total_vram_mb"] or 0) + ) / num_extensions if num_extensions else 0, + # Add GPU total memory tracking + "baseline_gpu_mb": baseline.get("gpu_used_mb", 0), + "after_create_gpu_mb": after_create.get("gpu_used_mb", 0), + "after_send_gpu_mb": after_send.get("gpu_used_mb", 0), + "gpu_for_tensor_creation_mb": ( + float(after_create.get("gpu_used_mb", 0) or 0) + - float(baseline.get("gpu_used_mb", 0) or 0) + ) / num_extensions if num_extensions else 0, + "gpu_for_distribution_mb": ( + float(after_send.get("gpu_used_mb", 0) or 0) + - float(after_create.get("gpu_used_mb", 0) or 0) + ) / num_extensions if num_extensions else 0, + "send_time_s": send_time, + } + + # Cleanup + manager.stop_all_extensions() + del large_tensor + gc.collect() + if device in ("cuda", "rocm"): + torch.cuda.empty_cache() + await asyncio.sleep(2) return results @@ -748,8 +772,10 @@ async def run_memory_benchmarks( test_large_tensor: bool = True, max_extensions_for_large: int = 50, test_both_modes: bool = False, + backend: str = "auto", ): """Run the full memory benchmark suite.""" + import torch test_base = IntegrationTestBase() await test_base.setup_test_environment("memory_benchmark") @@ -761,6 +787,8 @@ async def run_memory_benchmarks( baseline = await runner.run_baseline_memory_test() all_results["baseline"] = baseline + available_backends = detect_available_backends() if backend == "auto" else [backend] + if test_small_tensor: # Small tensor tests with multiple extension counts print("\n" + "=" * 80) @@ -774,13 +802,13 @@ async def run_memory_benchmarks( # Test both modes print("\n--- CPU Tensor Tests (share_torch=False) ---") cpu_results_no_share = await runner.run_scaling_test( - extension_counts, share_torch=False, test_tensor_size=small_tensor_size, use_cuda=False + extension_counts, share_torch=False, test_tensor_size=small_tensor_size, device="cpu" ) all_results["cpu_no_share"] = cpu_results_no_share print("\n--- CPU Tensor Tests (share_torch=True) ---") cpu_results_share = await runner.run_scaling_test( - extension_counts, share_torch=True, test_tensor_size=small_tensor_size, use_cuda=False + extension_counts, share_torch=True, test_tensor_size=small_tensor_size, device="cpu" ) all_results["cpu_share"] = cpu_results_share @@ -789,13 +817,13 @@ async def run_memory_benchmarks( if test_both_modes: print("\n--- GPU Tensor Tests (share_torch=False) ---") gpu_results_no_share = await runner.run_scaling_test( - extension_counts, share_torch=False, test_tensor_size=small_tensor_size, use_cuda=True + extension_counts, share_torch=False, test_tensor_size=small_tensor_size, device="cuda" ) all_results["gpu_no_share"] = gpu_results_no_share print("\n--- GPU Tensor Tests (share_torch=True) ---") gpu_results_share = await runner.run_scaling_test( - extension_counts, share_torch=True, test_tensor_size=small_tensor_size, use_cuda=True + extension_counts, share_torch=True, test_tensor_size=small_tensor_size, device="cuda" ) all_results["gpu_share"] = gpu_results_share @@ -805,6 +833,7 @@ async def run_memory_benchmarks( num_extensions=min(max_extensions_for_large, max(extension_counts)), tensor_gb=2.0, test_both_modes=test_both_modes, + device="cpu", ) all_results["large_tensor_sharing"] = large_results @@ -843,41 +872,35 @@ def print_memory_benchmark_summary(results: dict): else: print(" GPU Total: N/A") - # Scaling results - for test_type in ["cpu_no_share", "cpu_share", "gpu_no_share", "gpu_share"]: + # Dynamically print all *_share and *_no_share results + share_types = [k for k in results if k.endswith("_share") or k.endswith("_no_share")] + for test_type in share_types: if test_type in results: - print(f"\n{test_type.upper().replace('_', ' ')} Results:") - + backend = test_type.replace("_share", "").replace("_no_share", "").upper() + share_mode = "SHARE_TORCH=TRUE" if test_type.endswith("_share") else "SHARE_TORCH=FALSE" + print(f"\n{backend} {share_mode} Results:") headers = ["Extensions", "RAM/Ext (MB)", "Tensor RAM (MB)", "GPU (MB)", "Shared"] table_data = [] - for result in results[test_type]: - # Use GPU memory delta if available, otherwise fall back to VRAM gpu_memory = result.get("send_gpu_delta_mb", result.get("send_vram_delta_mb", 0)) - table_data.append( - [ - result["num_extensions"], - f"{result['ram_per_extension_mb']:.1f}", - f"{result['send_ram_delta_mb']:.1f}", - f"{gpu_memory:.1f}", - "Yes" - if result.get("shared_memory") - else "No" - if result.get("shared_memory") is False - else "N/A", - ] - ) - - print(tabulate(table_data, headers=headers, tablefmt="grid")) - - # Large tensor sharing results - if "large_tensor_sharing" in results: - print("\n2GB TENSOR SHARING TEST:") - large_results = results["large_tensor_sharing"] + table_data.append([ + result["num_extensions"], + f"{result['ram_per_extension_mb']:.1f}", + f"{result['send_ram_delta_mb']:.1f}", + f"{gpu_memory:.1f}", + "Yes" if result.get("shared_memory") else "No" if result.get("shared_memory") is False else "N/A", + ]) + if table_data: + print(tabulate(table_data, headers=headers, tablefmt="grid")) - # Process CPU results - if "cpu" in large_results: - print("\nCPU Tensor Results:") + # Large tensor sharing results for all backends + large_keys = [k for k in results if k.endswith("_large")] + for large_key in large_keys: + backend = large_key.replace("_large", "").upper() + print(f"\n2GB TENSOR SHARING TEST: {backend}") + large_results = results[large_key] + for dev in large_results: + print(f"\n{dev.upper()} Tensor Results:") headers = [ "Config", "Tensor Size (MB)", @@ -886,93 +909,31 @@ def print_memory_benchmark_summary(results: dict): "Send Time (s)", ] table_data = [] - for share_torch in [False, True]: key = f"share_torch_{share_torch}" - if key in large_results["cpu"]: - r = large_results["cpu"][key] - table_data.append( - [ - f"share_torch={share_torch}", - f"{r['tensor_size_mb']:.1f}", - f"{r['ram_for_distribution_mb']:.1f}", - f"{r['ram_per_extension_copy_mb']:.1f}", - f"{r['send_time_s']:.2f}", - ] - ) - + if key in large_results[dev]: + r = large_results[dev][key] + table_data.append([ + f"share_torch={share_torch}", + f"{r['tensor_size_mb']:.1f}", + f"{r['ram_for_distribution_mb']:.1f}", + f"{r['ram_per_extension_copy_mb']:.1f}", + f"{r['send_time_s']:.2f}", + ]) if table_data: print(tabulate(table_data, headers=headers, tablefmt="grid")) - - # Analysis for CPU - if "share_torch_False" in large_results["cpu"] and "share_torch_True" in large_results["cpu"]: - no_share = large_results["cpu"]["share_torch_False"] - share = large_results["cpu"]["share_torch_True"] - + # Analysis for this backend + if "share_torch_False" in large_results[dev] and "share_torch_True" in large_results[dev]: + no_share = large_results[dev]["share_torch_False"] + share = large_results[dev]["share_torch_True"] savings = no_share["ram_for_distribution_mb"] - share["ram_for_distribution_mb"] savings_pct = ( (savings / no_share["ram_for_distribution_mb"] * 100) if no_share["ram_for_distribution_mb"] else 0 ) - - print("\nCPU Memory Sharing Analysis:") + print(f"\n{dev.upper()} Memory Sharing Analysis:") print(f" Memory saved with share_torch: {savings:.1f} MB ({savings_pct:.1f}%)") - # Process GPU results - if "gpu" in large_results: - print("\nGPU Tensor Results:") - headers = [ - "Config", - "Tensor Size (MB)", - "RAM Dist (MB)", - "GPU Created (MB)", - "GPU Dist (MB)", - "Send Time (s)", - ] - table_data = [] - - for share_torch in [False, True]: - key = f"share_torch_{share_torch}" - if key in large_results["gpu"]: - r = large_results["gpu"][key] - table_data.append( - [ - f"share_torch={share_torch}", - f"{r['tensor_size_mb']:.1f}", - f"{r['ram_for_distribution_mb']:.1f}", - f"{r['gpu_for_tensor_creation_mb']:.1f}", - f"{r['gpu_for_distribution_mb']:.1f}", - f"{r['send_time_s']:.2f}", - ] - ) - - if table_data: - print(tabulate(table_data, headers=headers, tablefmt="grid")) - - # Analysis for GPU - if "share_torch_False" in large_results["gpu"] and "share_torch_True" in large_results["gpu"]: - no_share = large_results["gpu"]["share_torch_False"] - share = large_results["gpu"]["share_torch_True"] - - ram_savings = no_share["ram_for_distribution_mb"] - share["ram_for_distribution_mb"] - ram_savings_pct = ( - (ram_savings / no_share["ram_for_distribution_mb"] * 100) - if no_share["ram_for_distribution_mb"] else 0 - ) - - print("\nGPU Memory Sharing Analysis:") - print(f" RAM saved with share_torch: {ram_savings:.1f} MB ({ram_savings_pct:.1f}%)") - - gpu_savings = no_share["gpu_for_distribution_mb"] - share["gpu_for_distribution_mb"] - if no_share["gpu_for_distribution_mb"]: - gpu_savings_pct = gpu_savings / no_share["gpu_for_distribution_mb"] * 100 - print( - f" GPU memory saved with share_torch: {gpu_savings:.1f} MB " - f"({gpu_savings_pct:.1f}%)" - ) - elif gpu_savings != 0: - print(f" GPU memory difference: {gpu_savings:.1f} MB") - def main(): """Main entry point.""" @@ -1006,72 +967,39 @@ def main(): help="Test both share_torch=True and share_torch=False (default: only share_torch=True)", ) + parser.add_argument( + "--backend", + choices=["auto", "cuda", "xpu", "rocm", "cpu"], + default="auto", + help="Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), xpu (Intel oneAPI), rocm (AMD ROCm), or cpu", + ) + parser.add_argument( "--device", type=str, default=None, - help="Device index (int) or 'cpu' to force CPU mode", - ) - parser.add_argument("--no-gpu", action="store_true", help="Skip GPU benchmarks even if CUDA is available") - parser.add_argument( - "--backend", - choices=["auto", "cuda", "xpu"], - default="auto", - help="Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), or xpu (Intel oneAPI)", + help="(Legacy) Device index (int) or 'cpu' to force CPU mode. Use --backend for multi-backend tests.", ) args = parser.parse_args() - # Set device and backend - backend = args.backend - device_arg = args.device - if device_arg is not None and str(device_arg).lower() == "cpu": - backend = "cpu" - device_idx = None - print("[PyIsolate] Forcing CPU mode due to --device=cpu") - args.no_gpu = True - elif device_arg is not None: - try: - device_idx = int(device_arg) - except ValueError: - print(f"Invalid --device value: {device_arg}. Must be integer or 'cpu'.") - sys.exit(1) - else: - device_idx = None + device_idx = None # Ensure device_idx is always defined - # Determine extension counts + # Determine extension counts (move this up before device/backend logic) if args.counts: extension_counts = [int(x.strip()) for x in args.counts.split(",")] else: - # Default progression: 1, 2, 5, 10, 20, 50, 100 + # Default progression: 1, 2, 5, 10, 20 extension_counts = [1, 2, 5, 10, 20] if args.max_extensions >= 50: extension_counts.append(50) if args.max_extensions >= 100: extension_counts.append(100) - # Filter based on max extension_counts = [c for c in extension_counts if c <= args.max_extensions] - # Check dependencies - if not TORCH_AVAILABLE: - print("PyTorch not available. Install with: pip install torch") - return 1 - - print(f"Running on: {platform.system()} {platform.release()}") - - if not CUDA_AVAILABLE: - print("CUDA not available. GPU memory tests will be skipped.") - - if not NVML_AVAILABLE: - print("nvidia-ml-py3 not installed. Install with: pip install nvidia-ml-py3") - print("VRAM tracking will not be available.") - else: - print("NVML available for GPU memory tracking") - # Set device and backend backend = args.backend - device_idx = args.device device_str = "cpu" device_name = "cpu" backend_used = "cpu" @@ -1097,10 +1025,22 @@ def main(): print("[PyIsolate] ROCm is not supported on Windows. Falling back to CPU.") backend = "cpu" if backend == "cuda": - if device_idx is not None: - torch.cuda.set_device(device_idx) - device_str = f"cuda{device_idx}" - device_name = torch.cuda.get_device_name(device_idx) + if args.device is not None: + if str(args.device).lower() == "cpu": + backend = "cpu" + device_str = "cpu" + device_name = "cpu" + print("[PyIsolate] Forcing CPU mode due to --device=cpu") + args.no_gpu = True + else: + try: + device_idx = int(args.device) + torch.cuda.set_device(device_idx) + device_str = f"cuda{device_idx}" + device_name = torch.cuda.get_device_name(device_idx) + except ValueError: + print(f"Invalid --device value: {args.device}. Must be integer or 'cpu'.") + sys.exit(1) else: device_idx = torch.cuda.current_device() device_str = f"cuda{device_idx}" @@ -1108,14 +1048,26 @@ def main(): backend_used = "cuda" print(f"[PyIsolate] Using CUDA/ROCm device {device_idx}: {device_name}") elif backend == "xpu" and xpu_available: - if device_idx is not None: - torch.xpu.set_device(device_idx) - device_str = f"xpu{device_idx}" - device_name = ( - torch.xpu.get_device_name(device_idx) - if hasattr(torch.xpu, "get_device_name") - else "Intel XPU" - ) + if args.device is not None: + if str(args.device).lower() == "cpu": + backend = "cpu" + device_str = "cpu" + device_name = "cpu" + print("[PyIsolate] Forcing CPU mode due to --device=cpu") + args.no_gpu = True + else: + try: + device_idx = int(args.device) + torch.xpu.set_device(device_idx) + device_str = f"xpu{device_idx}" + device_name = ( + torch.xpu.get_device_name(device_idx) + if hasattr(torch.xpu, "get_device_name") + else "Intel XPU" + ) + except ValueError: + print(f"Invalid --device value: {args.device}. Must be integer or 'cpu'.") + sys.exit(1) else: device_idx = torch.xpu.current_device() device_str = f"xpu{device_idx}" @@ -1168,6 +1120,7 @@ def main(): test_small_tensor=test_small, test_large_tensor=test_large, test_both_modes=args.test_both_modes, + backend=args.backend, ) ) return 0 From 1b55f5b64c45e540e9b5d586676722871b165363 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Sat, 12 Jul 2025 11:51:00 -0700 Subject: [PATCH 10/15] The memory benchmark will now loop over all available backends (including xpu for Intel GPUs) --- benchmarks/memory_benchmark.py | 52 +++++++++++----------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/benchmarks/memory_benchmark.py b/benchmarks/memory_benchmark.py index 07817b8..b0af732 100644 --- a/benchmarks/memory_benchmark.py +++ b/benchmarks/memory_benchmark.py @@ -790,54 +790,36 @@ async def run_memory_benchmarks( available_backends = detect_available_backends() if backend == "auto" else [backend] if test_small_tensor: - # Small tensor tests with multiple extension counts print("\n" + "=" * 80) print("SMALL TENSOR SCALING TESTS") print("=" * 80) - # CPU tensor tests small_tensor_size = (512, 512) # ~1MB tensor - if test_both_modes: - # Test both modes - print("\n--- CPU Tensor Tests (share_torch=False) ---") - cpu_results_no_share = await runner.run_scaling_test( - extension_counts, share_torch=False, test_tensor_size=small_tensor_size, device="cpu" - ) - all_results["cpu_no_share"] = cpu_results_no_share - - print("\n--- CPU Tensor Tests (share_torch=True) ---") - cpu_results_share = await runner.run_scaling_test( - extension_counts, share_torch=True, test_tensor_size=small_tensor_size, device="cpu" - ) - all_results["cpu_share"] = cpu_results_share - - # GPU tensor tests if available - if CUDA_AVAILABLE: + for backend_used in available_backends: if test_both_modes: - print("\n--- GPU Tensor Tests (share_torch=False) ---") - gpu_results_no_share = await runner.run_scaling_test( - extension_counts, share_torch=False, test_tensor_size=small_tensor_size, device="cuda" + print(f"\n--- {backend_used.upper()} Tensor Tests (share_torch=False) ---") + results_no_share = await runner.run_scaling_test( + extension_counts, share_torch=False, test_tensor_size=small_tensor_size, device=backend_used ) - all_results["gpu_no_share"] = gpu_results_no_share + all_results[f"{backend_used}_no_share"] = results_no_share - print("\n--- GPU Tensor Tests (share_torch=True) ---") - gpu_results_share = await runner.run_scaling_test( - extension_counts, share_torch=True, test_tensor_size=small_tensor_size, device="cuda" + print(f"\n--- {backend_used.upper()} Tensor Tests (share_torch=True) ---") + results_share = await runner.run_scaling_test( + extension_counts, share_torch=True, test_tensor_size=small_tensor_size, device=backend_used ) - all_results["gpu_share"] = gpu_results_share + all_results[f"{backend_used}_share"] = results_share if test_large_tensor: - # Large tensor sharing test - large_results = await runner.run_large_tensor_sharing_test( - num_extensions=min(max_extensions_for_large, max(extension_counts)), - tensor_gb=2.0, - test_both_modes=test_both_modes, - device="cpu", - ) - all_results["large_tensor_sharing"] = large_results + for backend_used in available_backends: + large_results = await runner.run_large_tensor_sharing_test( + num_extensions=min(max_extensions_for_large, max(extension_counts)), + tensor_gb=2.0, + test_both_modes=test_both_modes, + device=backend_used, + ) + all_results[f"{backend_used}_large"] = large_results - # Print final summary print_memory_benchmark_summary(all_results) finally: From fe6fcd2c514f9ff409979a56e9c5249f8fb05cdf Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Sat, 12 Jul 2025 13:44:12 -0700 Subject: [PATCH 11/15] The memory benchmark will now always run with --backend auto and test loop to always include CPU in the results, regardless of GPU presence. --- benchmarks/memory_benchmark.py | 11 ++++++++++- run_benchmarks_windows.ps1 | 5 ++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/benchmarks/memory_benchmark.py b/benchmarks/memory_benchmark.py index b0af732..2362f28 100644 --- a/benchmarks/memory_benchmark.py +++ b/benchmarks/memory_benchmark.py @@ -787,7 +787,16 @@ async def run_memory_benchmarks( baseline = await runner.run_baseline_memory_test() all_results["baseline"] = baseline - available_backends = detect_available_backends() if backend == "auto" else [backend] + if backend == "auto": + available_backends = detect_available_backends() + if "cpu" not in available_backends: + available_backends = ["cpu"] + available_backends + else: + # Ensure cpu is first + available_backends = [b for b in ["cpu"] + available_backends if b != "cpu"] + available_backends = ["cpu"] + available_backends + else: + available_backends = [backend] if test_small_tensor: print("\n" + "=" * 80) diff --git a/run_benchmarks_windows.ps1 b/run_benchmarks_windows.ps1 index 62193f4..68eda64 100644 --- a/run_benchmarks_windows.ps1 +++ b/run_benchmarks_windows.ps1 @@ -253,6 +253,9 @@ Write-Host "Step 8: Running memory benchmarks..." "================================================================" | Add-Content "..\$OutputFile" "" | Add-Content "..\$OutputFile" +# Before running the memory benchmark, always set backend_arg to --backend auto +$memory_backend_arg = @("--backend", "auto") + Write-Host "Running memory_benchmark.py (this may take several minutes)..." Write-Host "NOTE: This test intentionally pushes VRAM limits to find maximum capacity" @@ -261,7 +264,7 @@ Write-Host "NOTE: If nothing has changed after 90 minutes, press Ctrl+C" -Foregr Write-Host "The test intentionally pushes VRAM limits and may appear frozen when it hits limits." # Run memory benchmark -$output = & python memory_benchmark.py --counts 1,2,5,10,25,50,100 @device_args @backend_arg 2>&1 | Out-String +$output = & python memory_benchmark.py --counts 1,2,5,10,25,50,100 @device_args @memory_backend_arg 2>&1 | Out-String $memoryResult = $LASTEXITCODE $output | Tee-Object -Append "..\$OutputFile" From a07285e4591f990e8da9f0731b12edf3753b0f87 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 16 Jul 2025 11:42:59 -0700 Subject: [PATCH 12/15] Refactor: make GPU features optional, move GPU utils to gpu_utils.py, added back pre commit config, and clean venv handling --- .pre-commit-config.yaml | 19 ++++++ benchmarks/benchmark.py | 109 +++++++++++++++++++++++-------- benchmarks/memory_benchmark.py | 46 ++++++++----- cleanup_pyisolate.ps1 | 32 +++++++++ pyisolate/_internal/gpu_utils.py | 67 +++++++++++++++++++ pyisolate/_internal/host.py | 4 +- pyisolate/_internal/shared.py | 60 ++++------------- pyisolate/shared.py | 6 ++ run_benchmarks_windows.ps1 | 1 + 9 files changed, 250 insertions(+), 94 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 cleanup_pyisolate.ps1 create mode 100644 pyisolate/_internal/gpu_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..1a5d1b5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + - id: check-merge-conflict + - id: check-toml + - id: check-docstring-first + - id: debug-statements + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.8 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index d26d777..759f0d8 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -24,11 +24,20 @@ # Import after path setup from tests.test_benchmarks import TestRPCBenchmarks # noqa: E402 +# Try to import tabulate globally +try: + from tabulate import tabulate + TABULATE_AVAILABLE = True +except ImportError: + TABULATE_AVAILABLE = False + def tabulate(*args, **kwargs): + return "[tabulate not available]" + # pyright: reportMissingImports=false async def run_benchmarks( - quick: bool = False, no_torch: bool = False, no_gpu: bool = False, torch_mode: str = "both" + quick: bool = False, no_torch: bool = False, no_gpu: bool = False, torch_mode: str = "both", ): """Run all benchmarks with the specified options.""" @@ -58,10 +67,26 @@ async def run_benchmarks( import numpy as np from shared import ExampleExtension, DatabaseSingleton from pyisolate import local_execution - +from pyisolate._internal.gpu_utils import ( + maybe_serialize_tensor, + maybe_deserialize_tensor, + maybe_to_dlpack, + maybe_from_dlpack, +) +import os try: import torch TORCH_AVAILABLE = True + # Set CUDA or XPU device if specified in environment + device_idx = os.environ.get("PYISOLATE_CUDA_DEVICE") + if device_idx is not None: + try: + if hasattr(torch, "xpu") and torch.xpu.is_available(): + torch.xpu.set_device(int(device_idx)) + elif torch.cuda.is_available(): + torch.cuda.set_device(int(device_idx)) + except Exception as e: + print(f"[Extension] Failed to set device: {e}") except ImportError: TORCH_AVAILABLE = False @@ -167,8 +192,20 @@ def example_entrypoint(): except ImportError: torch_available = False + # Get device index from sys.argv or environment + import os + device_idx_env = os.environ.get("PYISOLATE_CUDA_DEVICE") + if device_idx_env is not None: + try: + device_idx = int(device_idx_env) + except Exception: + device_idx = None + # Create extensions based on torch_mode parameter extensions_to_create = [] + extension_env = os.environ.copy() + if device_idx is not None: + extension_env["PYISOLATE_CUDA_DEVICE"] = str(device_idx) if torch_mode in ["both", "standard"]: # Create extension WITHOUT share_torch (standard serialization) @@ -459,37 +496,46 @@ async def benchmark_func_shared(data=data): # Print successful results if results: - from tabulate import tabulate - - print("\nSuccessful Benchmarks:") - headers = ["Test", "Mean (ms)", "Std Dev (ms)", "Min (ms)", "Max (ms)"] - table_data = [] - - for name, result in results.items(): - table_data.append( - [ - name, - f"{result.mean * 1000:.2f}", - f"{result.stdev * 1000:.2f}", - f"{result.min_time * 1000:.2f}", - f"{result.max_time * 1000:.2f}", - ] - ) - - print(tabulate(table_data, headers=headers, tablefmt="grid")) - - # Show fastest result for reference - baseline = min(r.mean for r in results.values()) - print(f"\nFastest result: {baseline * 1000:.2f}ms") - else: - print("\nNo successful benchmark results!") + if TABULATE_AVAILABLE: + print("\nSuccessful Benchmarks:") + headers = ["Test", "Mean (ms)", "Std Dev (ms)", "Min (ms)", "Max (ms)"] + table_data = [] + + for result in results.values(): + table_data.append( + [ + f"{result.mean * 1000:.2f}", + f"{result.stdev * 1000:.2f}", + f"{result.min_time * 1000:.2f}", + f"{result.max_time * 1000:.2f}", + ] + ) + + print(tabulate(table_data, headers=headers, tablefmt="grid")) + + # Show fastest result for reference + baseline = min(r.mean for r in results.values()) + print(f"\nFastest result: {baseline * 1000:.2f}ms") + else: + print("\nSuccessful Benchmarks:") + for result in results.values(): + print( + f" {result.name}: Mean={result.mean_time * 1000:.2f}ms, " + f"Std={result.stdev * 1000:.2f}ms, Min={result.min_time * 1000:.2f}ms, " + f"Max={result.max_time * 1000:.2f}ms" + ) # Print failed tests if failed_tests: print("\nFailed Tests:") failed_headers = ["Test", "Error"] failed_data = [[name, error] for name, error in failed_tests.items()] - print(tabulate(failed_data, headers=failed_headers, tablefmt="grid")) + if TABULATE_AVAILABLE: + print(tabulate(failed_data, headers=failed_headers, tablefmt="grid")) + elif failed_data: + print("[tabulate not available] Failed tests:") + for row in failed_data: + print(row) # Print skipped tests if skipped_tests: @@ -570,7 +616,7 @@ def main(): try: import numpy # noqa: F401 import psutil # noqa: F401 - import tabulate # noqa: F401 + # import tabulate # noqa: F401 # This line is now handled globally except ImportError as e: print(f"Missing required dependency: {e}") print("Please install benchmark dependencies with:") @@ -665,11 +711,16 @@ def main(): results_filename = f"benchmark_results_{computer}_{device_tag}_{timestamp}.txt" print(f"\n[PyIsolate] Results will be saved to: {results_filename}") + # In main(), after parsing args and determining device_idx: + if device_idx is not None: + import os + os.environ["PYISOLATE_CUDA_DEVICE"] = str(device_idx) + # Run benchmarks try: return asyncio.run( run_benchmarks( - quick=args.quick, no_torch=args.no_torch, no_gpu=args.no_gpu, torch_mode=args.torch_mode + quick=args.quick, no_torch=args.no_torch, no_gpu=args.no_gpu, torch_mode=args.torch_mode, ) ) except KeyboardInterrupt: diff --git a/benchmarks/memory_benchmark.py b/benchmarks/memory_benchmark.py index 2362f28..74641e3 100644 --- a/benchmarks/memory_benchmark.py +++ b/benchmarks/memory_benchmark.py @@ -51,6 +51,7 @@ from pyisolate import ExtensionConfig, ExtensionManager, ExtensionManagerConfig from tests.test_integration import IntegrationTestBase + # 1. Device detection helpers (add after imports) def detect_available_backends(): import torch @@ -475,10 +476,7 @@ async def run_scaling_test( # Create test tensor print(f"Creating test tensor {test_tensor_size} on {device}...") with torch.inference_mode(): - if device == "cuda": - test_tensor = torch.randn(*test_tensor_size, device="cuda") - torch.cuda.synchronize() - elif device == "rocm": + if device == "cuda" or device == "rocm": test_tensor = torch.randn(*test_tensor_size, device="cuda") torch.cuda.synchronize() elif device == "xpu": @@ -608,12 +606,19 @@ async def run_scaling_test( return results async def run_large_tensor_sharing_test( - self, num_extensions: int = 50, tensor_gb: float = 2.0, test_both_modes: bool = False, device: str = "cpu" + self, + num_extensions: int = 50, + tensor_gb: float = 2.0, + test_both_modes: bool = False, + device: str = "cpu", ) -> dict: """Test memory sharing with a large tensor across multiple extensions.""" import torch print(f"\n{'=' * 60}") - print(f"Large Tensor Sharing Test ({tensor_gb}GB tensor, {num_extensions} extensions, device={device})") + print( + f"Large Tensor Sharing Test ({tensor_gb}GB tensor, " + f"{num_extensions} extensions, device={device})" + ) print("=" * 60) extension_code = await create_memory_benchmark_extension() @@ -677,10 +682,7 @@ async def run_large_tensor_sharing_test( print(f"Creating {tensor_gb}GB tensor ({side}x{side}) on {device_name}...") with torch.inference_mode(): - if device == "cuda": - large_tensor = torch.randn(side, side, device="cuda") - torch.cuda.synchronize() - elif device == "rocm": + if device == "cuda" or device == "rocm": large_tensor = torch.randn(side, side, device="cuda") torch.cuda.synchronize() elif device == "xpu": @@ -775,7 +777,6 @@ async def run_memory_benchmarks( backend: str = "auto", ): """Run the full memory benchmark suite.""" - import torch test_base = IntegrationTestBase() await test_base.setup_test_environment("memory_benchmark") @@ -809,13 +810,19 @@ async def run_memory_benchmarks( if test_both_modes: print(f"\n--- {backend_used.upper()} Tensor Tests (share_torch=False) ---") results_no_share = await runner.run_scaling_test( - extension_counts, share_torch=False, test_tensor_size=small_tensor_size, device=backend_used + extension_counts, + share_torch=False, + test_tensor_size=small_tensor_size, + device=backend_used, ) all_results[f"{backend_used}_no_share"] = results_no_share print(f"\n--- {backend_used.upper()} Tensor Tests (share_torch=True) ---") results_share = await runner.run_scaling_test( - extension_counts, share_torch=True, test_tensor_size=small_tensor_size, device=backend_used + extension_counts, + share_torch=True, + test_tensor_size=small_tensor_size, + device=backend_used, ) all_results[f"{backend_used}_share"] = results_share @@ -864,7 +871,7 @@ def print_memory_benchmark_summary(results: dict): print(" GPU Total: N/A") # Dynamically print all *_share and *_no_share results - share_types = [k for k in results if k.endswith("_share") or k.endswith("_no_share")] + share_types = [k for k in results if k.endswith(("_share", "_no_share"))] for test_type in share_types: if test_type in results: backend = test_type.replace("_share", "").replace("_no_share", "").upper() @@ -879,7 +886,11 @@ def print_memory_benchmark_summary(results: dict): f"{result['ram_per_extension_mb']:.1f}", f"{result['send_ram_delta_mb']:.1f}", f"{gpu_memory:.1f}", - "Yes" if result.get("shared_memory") else "No" if result.get("shared_memory") is False else "N/A", + ( + "Yes" + if result.get("shared_memory") + else "No" if result.get("shared_memory") is False else "N/A" + ), ]) if table_data: print(tabulate(table_data, headers=headers, tablefmt="grid")) @@ -962,7 +973,10 @@ def main(): "--backend", choices=["auto", "cuda", "xpu", "rocm", "cpu"], default="auto", - help="Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), xpu (Intel oneAPI), rocm (AMD ROCm), or cpu", + help=( + "Device backend to use: auto (default), cuda (NVIDIA/AMD ROCm), " + "xpu (Intel oneAPI), rocm (AMD ROCm), or cpu" + ), ) parser.add_argument( diff --git a/cleanup_pyisolate.ps1 b/cleanup_pyisolate.ps1 new file mode 100644 index 0000000..4fdb692 --- /dev/null +++ b/cleanup_pyisolate.ps1 @@ -0,0 +1,32 @@ +Write-Host "Cleaning up all extension venvs, .test_temps, and Python bytecode caches..." -ForegroundColor Cyan + +# Remove extension venvs and test temp directories +$dirsToRemove = @( + ".test_temps", + ".benchmark_venv", + "pyisolate\__pycache__", + "pyisolate\_internal\__pycache__", + "benchmarks\__pycache__", + "example\__pycache__" +) + +foreach ($dir in $dirsToRemove) { + if (Test-Path $dir) { + Write-Host "Removing $dir ..." + Remove-Item -Recurse -Force $dir + } +} + +# Remove all __pycache__ directories recursively +Get-ChildItem -Recurse -Directory -Filter "__pycache__" | ForEach-Object { + Write-Host "Removing $($_.FullName) ..." + Remove-Item -Recurse -Force $_.FullName +} + +# Remove all .pyc files recursively +Get-ChildItem -Recurse -Include *.pyc | ForEach-Object { + Write-Host "Removing $($_.FullName) ..." + Remove-Item -Force $_.FullName +} + +Write-Host "Cleanup complete!" -ForegroundColor Green \ No newline at end of file diff --git a/pyisolate/_internal/gpu_utils.py b/pyisolate/_internal/gpu_utils.py new file mode 100644 index 0000000..86014d7 --- /dev/null +++ b/pyisolate/_internal/gpu_utils.py @@ -0,0 +1,67 @@ +""" +pyisolate._internal.gpu_utils + +GPU/XPU/torch-specific utilities for tensor serialization, DLPack conversion, and device handling. +These functions require torch (and sometimes numpy) to be installed. +""" + +def maybe_to_dlpack(obj): + """Convert XPU tensor to DLPack capsule if needed (requires torch).""" + try: + import torch + from torch.utils import dlpack as _dlpack # type: ignore[attr-defined] + except ImportError as e: + raise ImportError("pyisolate: 'torch' is required for maybe_to_dlpack but is not installed.") from e + if isinstance(obj, torch.Tensor) and hasattr(obj, "device") and obj.device.type == "xpu": + # If the input is a NumPy array and not writable, make it writable before converting + if hasattr(obj, "numpy"): + arr = obj.numpy() + if not arr.flags.writeable: + arr = arr.copy() + return torch.from_numpy(arr).to("xpu") + return _dlpack.to_dlpack(obj) # type: ignore[attr-defined] + return obj + +def maybe_from_dlpack(obj): + """Convert DLPack capsule to XPU tensor if needed (requires torch).""" + try: + import torch + from torch.utils import dlpack as _dlpack # type: ignore[attr-defined] + except ImportError as e: + raise ImportError("pyisolate: 'torch' is required for maybe_from_dlpack but is not installed.") from e + # DLPack capsules are PyCapsule, not torch.Tensor + if not isinstance(obj, torch.Tensor) and hasattr(obj, "__dlpack__"): + return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] + # For raw PyCapsule (older PyTorch), try fallback + if type(obj).__name__ == "PyCapsule": + return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] + return obj + +def maybe_serialize_tensor(obj): + """Serialize XPU tensor for transport (requires torch).""" + try: + import torch + except ImportError as e: + raise ImportError( + "pyisolate: 'torch' is required for maybe_serialize_tensor but is not installed." + ) from e + if isinstance(obj, torch.Tensor) and hasattr(obj, "device") and obj.device.type == "xpu": + # Fallback: send as CPU buffer + metadata + arr = obj.cpu().numpy() + return ("xpu_tensor", arr.tobytes(), arr.shape, str(arr.dtype)) + return obj + +def maybe_deserialize_tensor(obj): + """Deserialize XPU tensor from transport (requires torch and numpy).""" + try: + import numpy as np + import torch + except ImportError as e: + raise ImportError( + "pyisolate: 'torch' and 'numpy' are required for maybe_deserialize_tensor but are not installed." + ) from e + if isinstance(obj, tuple) and len(obj) == 4 and obj[0] == "xpu_tensor": + _, buf, shape, dtype = obj + arr = np.frombuffer(buf, dtype=dtype).reshape(shape) + return torch.from_numpy(arr).to("xpu") + return obj diff --git a/pyisolate/_internal/host.py b/pyisolate/_internal/host.py index e3e87cc..8db5d13 100644 --- a/pyisolate/_internal/host.py +++ b/pyisolate/_internal/host.py @@ -309,7 +309,9 @@ def _create_extension_venv(self): raise RuntimeError("uv command not found in PATH") # Use the resolved, validated path - subprocess.check_call([uv_path, "venv", str(self.venv_path), "--python", "python3.12"]) # noqa: S603 + import sys + py_version = f"python{sys.version_info.major}.{sys.version_info.minor}" + subprocess.check_call([uv_path, "venv", str(self.venv_path), "--python", py_version]) # noqa: S603 # TODO(Optimization): Only do this when we update a extension to reduce startup time? def _install_dependencies(self): diff --git a/pyisolate/_internal/shared.py b/pyisolate/_internal/shared.py index 050c42a..844fcd6 100644 --- a/pyisolate/_internal/shared.py +++ b/pyisolate/_internal/shared.py @@ -22,58 +22,22 @@ # We only import this to get type hinting working. It can also be a torch.multiprocessing if TYPE_CHECKING: import multiprocessing as typehint_mp + else: import multiprocessing - typehint_mp = multiprocessing -import numpy as np -import torch -from torch.utils import dlpack as _dlpack # type: ignore[attr-defined] - -# Utility: Convert XPU tensor to DLPack capsule if needed - - -def maybe_to_dlpack(obj): - if isinstance(obj, torch.Tensor) and hasattr(obj, "device") and obj.device.type == "xpu": - # If the input is a NumPy array and not writable, make it writable before converting - if hasattr(obj, "numpy"): - arr = obj.numpy() - if not arr.flags.writeable: - arr = arr.copy() - return torch.from_numpy(arr).to("xpu") - return _dlpack.to_dlpack(obj) # type: ignore[attr-defined] - return obj - - -# Utility: Convert DLPack capsule to XPU tensor if needed - - -def maybe_from_dlpack(obj): - # DLPack capsules are PyCapsule, not torch.Tensor - if not isinstance(obj, torch.Tensor) and hasattr(obj, "__dlpack__"): - return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] - # For raw PyCapsule (older PyTorch), try fallback - if type(obj).__name__ == "PyCapsule": - return _dlpack.from_dlpack(obj) # type: ignore[attr-defined] - return obj - - -def maybe_serialize_tensor(obj): - if isinstance(obj, torch.Tensor) and hasattr(obj, "device") and obj.device.type == "xpu": - # Fallback: send as CPU buffer + metadata - arr = obj.cpu().numpy() - return ("xpu_tensor", arr.tobytes(), arr.shape, str(arr.dtype)) - return obj - - -def maybe_deserialize_tensor(obj): - if isinstance(obj, tuple) and len(obj) == 4 and obj[0] == "xpu_tensor": - _, buf, shape, dtype = obj - arr = np.frombuffer(buf, dtype=dtype).reshape(shape) - return torch.from_numpy(arr).to("xpu") - return obj - +# GPU-specific utilities have moved to pyisolate._internal.gpu_utils +if TYPE_CHECKING: + from .gpu_utils import ( + maybe_deserialize_tensor, + maybe_serialize_tensor, + ) +else: + from .gpu_utils import ( + maybe_deserialize_tensor, + maybe_serialize_tensor, + ) logger = logging.getLogger(__name__) diff --git a/pyisolate/shared.py b/pyisolate/shared.py index d39c6f5..44422ef 100644 --- a/pyisolate/shared.py +++ b/pyisolate/shared.py @@ -144,6 +144,12 @@ class ExtensionBase(ExtensionLocal): ... async def process_data(self, data: list) -> float: ... # Extension method callable from host ... import numpy as np + ... from pyisolate._internal.gpu_utils import ( + ... maybe_to_dlpack, + ... maybe_from_dlpack, + ... maybe_serialize_tensor, + ... maybe_deserialize_tensor, + ... ) ... return np.array(data).mean() Attributes: diff --git a/run_benchmarks_windows.ps1 b/run_benchmarks_windows.ps1 index 68eda64..8c1cded 100644 --- a/run_benchmarks_windows.ps1 +++ b/run_benchmarks_windows.ps1 @@ -26,6 +26,7 @@ Write-Host "" # Prompt for CUDA device index $device = Read-Host "Enter CUDA device index to use (leave blank for default GPU/CPU)" if ($device -ne "") { + # Always pass --device to both benchmark.py and memory_benchmark.py $device_args = @("--device", "$device") } else { $device_args = @() From d8dd47c3ae67d40970d6294b37a454c3b1d06e0c Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 16 Jul 2025 12:10:05 -0700 Subject: [PATCH 13/15] Ruff test removed name so I added it back and All device selection and environment setup is now handled in main(). run_benchmarks() is now device-agnostic and will not throw UnboundLocalError --- benchmarks/benchmark.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 759f0d8..8637368 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -192,20 +192,8 @@ def example_entrypoint(): except ImportError: torch_available = False - # Get device index from sys.argv or environment - import os - device_idx_env = os.environ.get("PYISOLATE_CUDA_DEVICE") - if device_idx_env is not None: - try: - device_idx = int(device_idx_env) - except Exception: - device_idx = None - # Create extensions based on torch_mode parameter extensions_to_create = [] - extension_env = os.environ.copy() - if device_idx is not None: - extension_env["PYISOLATE_CUDA_DEVICE"] = str(device_idx) if torch_mode in ["both", "standard"]: # Create extension WITHOUT share_torch (standard serialization) @@ -501,9 +489,10 @@ async def benchmark_func_shared(data=data): headers = ["Test", "Mean (ms)", "Std Dev (ms)", "Min (ms)", "Max (ms)"] table_data = [] - for result in results.values(): + for name, result in results.items(): table_data.append( [ + name, f"{result.mean * 1000:.2f}", f"{result.stdev * 1000:.2f}", f"{result.min_time * 1000:.2f}", @@ -518,9 +507,9 @@ async def benchmark_func_shared(data=data): print(f"\nFastest result: {baseline * 1000:.2f}ms") else: print("\nSuccessful Benchmarks:") - for result in results.values(): + for name, result in results.items(): print( - f" {result.name}: Mean={result.mean_time * 1000:.2f}ms, " + f" {name}: Mean={result.mean_time * 1000:.2f}ms, " f"Std={result.stdev * 1000:.2f}ms, Min={result.min_time * 1000:.2f}ms, " f"Max={result.max_time * 1000:.2f}ms" ) From 08c3b1c0aea13f4b698969105814167ab587bc34 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:21:36 -0700 Subject: [PATCH 14/15] =?UTF-8?q?=1B[200~Removed=20unnecessary=20getattr(.?= =?UTF-8?q?..,=20'manager',=20None)=20checks;=20now=20directly=20uses=20te?= =?UTF-8?q?st=5Finstance.manager.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...esults_DESKTOP-RVUH7SG_20250723_142122.txt | Bin 0 -> 232950 bytes benchmarks/benchmark.py | 79 +++++++------- benchmarks/memory_benchmark.py | 99 ++++++++++-------- pyproject.toml | 1 - tests/test_benchmarks.py | 19 ++-- 5 files changed, 106 insertions(+), 92 deletions(-) create mode 100644 benchmark_results_DESKTOP-RVUH7SG_20250723_142122.txt diff --git a/benchmark_results_DESKTOP-RVUH7SG_20250723_142122.txt b/benchmark_results_DESKTOP-RVUH7SG_20250723_142122.txt new file mode 100644 index 0000000000000000000000000000000000000000..e07cd7a8482f6422a67037daf7ce0dfaadfe80df GIT binary patch literal 232950 zcmeI5`*IXXvgY&eV|L!5tZjFjo*vYws&r$GwE=M%Y#N$H(6eLBtVSR~vyBidiEfxP zXC7i7_?-QI;T6ir%Bz$Tph~y0lCm=6>fv9whr37QfBwf$*|lsf`z{-2lkoSNu2!;9 zwwz6~>$+YJf5+Khb#E&>&UUgx{e7Li&0guw^K4I7FS4Jr{qW=$-Fv4L2fDYH9qF?a zKL48iOLpTke`v#2_7{DhWOuR$YU%y#VRkqBA$zP}H+A>Et{-M!PFp{Ue#fHYzQ$mb z{W(y5SG4_2_i5#(uJ=Xn_aW7x(md2C>?;T9Z>yJE*`L&}AF@Z;RrT`6>}mE#jrLY{ zFXX3;VyPpg+0*zPsjdT!ET!y)QGcDi(cSAI-vd2;5k~n)sb1>&eCyz!W!-%gQjL`6 zKv$&V|1Y6mJIZIZSZ`>}X}Ixy_Nz*y0zC&QBh~SCMu9h~nNmmk{a#NaYF_FW&pw7; z(bFZR`=IYAjO|mUj~Jx%W0h;;HBriC;g-MGl)A20^an0ks=)BJ>a%?G^k$&KU8R1e z_Q7#4RpUE-M}a2r=(F(qwOVzcRvWFJslSZOc0~h6D04e+~h{YZ1lDEn{v8aKRCneX(gNa2xM@jf7hHd7BJz|r(zN9}{! z)@sti1-tt02sQh9a#eXgYJMayv}R_5=kvLZmQ{WJnSG8AqZV63%Qk0ydnMW(st5Pg zM{pCl~~`a*PgsV`j0vn!fInt0D>(#%!YRrax3VjTR9O8rSV`Mv%> zRz3Tn_C3vB9|9iUDXq;D^ZmLT#toGGPhElHysdXsK62<#y#-&^FUI(F;BK2cky^|h zd1|YJzqG~p88lGF@{9*|)UIEY^HoS;+-6T#)q0!Fe$(?qm2G{b_Pe?R&skZ=f&M>- zRJ)<&Z&fDqnWZ_({(JaEujp5lw^lcN%lO540<;_i?p)H{Cz=hw;z+I85p_nP)}7D? z%eSjmyi)tPXYFCE4+B>mhFnPXJ#lzS^PEraDjE(=O5|_W7+!rEYW-n_u6FLU4r53#7K^OkW`;MsIs)U$2)+s_O)4!(E*V#mvVOKjUtCkU z6X~PmOmn3zws9vln(KVJH=@!jeLjX3EQfZ^r^)3(tdl+!>-(rOqE2v>^kzdT$4P%u z2|;P=dJc6=f*BRlG*?B%Bef2^!B^y=m3-%0@KI@zN2FN|&x>`>%QDi@6WVBHL3h4g z^rVUUh4w;g;i_D+&&yYMDVF@Gz)SP`WYi!ZsmwZ^g_6p3GK1QvL|W!DooA7^Y%D;C zX_^Z{mRaz0Th?{#f$%K3z6@Mg{@q=4R^ z^FtdRX@+^6tZE)g`rlD&Z|N`Be++A#$66O`g_cglr!}#9Qle;snAkAZq?f=sMiBfOq+qDgKb)vbR zrq0z_olW(c24cNsen(?Q8^~oET$}3?w4l5q-=0Z`Sk88kLiH=d?SPS8(Ro=CD5YYN zhDJf?M_;yuDYG45Pr#b6t^Ct~v}u@^k0l+C1D!#CaikfKM;cG~6d7BzC~RrOUn{TS z@KbOsqk>KeN1>IWlfq+AIGq!=^wd@y#Tf@(kM**(I8Sv?y?h!P zYFC(~HCXP(+5ZUZ=A0YQ#;QJ_V+H%qFi*tR(6WnL!&dP(RC%j0tmmrBv4-6Yh_qF& ztsu?9)o%5Q=&i;Km;&eZwx6Yw=iA{5J0%#dl26T&V_WiNCD??n>F+buiX4HTqm}Y% zkbT0k&+j?G{Pj3pLW)Yyj=mp4F<~nBjuWDwDMd&@VFqTcE*Kwd5^F&IEuL1^B z{VSbwQi(VYbYo`O*Q|szGRvQ>)3HNhi7bzNo(_ugWz~Z&fZZ_KPIBGgAdK^&MhAN= z7MXIHsmHA8^qh{~O_hvOH{<J`MqD$}d{x1#%usUEFLuYEE!Ag)=Sf%W0 zQMiCrxmgq2C;;z|weElh@p@6H@H$`ttMj%bA?uPNlCMZ(q2KH3`G%;FUSAvy*2HV& z`JjyCI76^vO+Acw5V0tM^R#bQ!X0MYXdOk~zY1~zn}hdyy3Vupb77RZZ#4d^qQvtG4w#Dn@l)97=fCi>T#^6vrfBsr*re~xWL)$W z37v^-QrpMG2vdwD)1dKO(O&SPCPC|dhO=t3)x zI!h`)ZKYn+3)<7?I&rG^Fb_vvCNB}*o!8#VIvToFxn1yQu2-ey=ep6KgB*xj<`r>y zu0y8ihY}A&32f_VWRV)CBSFhlLq*>=zX@{?GY*#YUz8s!2L0zbHsSd?HbnzgHar9E z<|9Q*S{8Qx-+wiIX+>$wE7&$dj9hGOPSw-oy7NM*P8&xt)!WkbO*b^Q z<-VAaY_D~Z#&Ychk*ouo5Lr)`E|_e~{i(-t(Qs)|KpQh$;!kWE%c7=tdStfeznJmt zQtu+6b5+q1QB!T!dYUNH1)uHtKmCX)FpRvY#l zG|jj5bm@Z4w%nh3EEf@%M5ftH+a^2beGlDq*}7a8Gj{d0EfW4VY5g1q^7b~FUf0!-w2FtC}yZ>q{^`c-L%B|d>1onP4z9x z*x^-A?I_JFwJM5q(G~0SeY%l5e|?eND@6M4DSF6u2H?Y|uXsa=i(r?o?Q^xAy|&|s znVS^fC~7E1DBYb!MGz_dDR+5%uX>*=5}Y^?c7hSZoSr#i(H_sj9*I4rAtJ{1PJ9(& z;%t8{(H;1#ELM;_BgIv*`#A2lA?||Dfol6d`p;Pd+p}F{?E4|2{*nH(M~DcJL)EgY z`YhJ{$}C?Vy%Nid|NnP(`p;#dz3o}zObf<`Sm^2zAWoGSC!>E(z1UA^^+MD=5|?sL z!^@D`#+wKv+mTFMPRv`Td&;NP`x&st!QIv8STvtzz^*0ZN8%GHiSr^1Tk(v5XSyGw zrP}$nOuIV%#j=s|FI3BO6DOM58B>}P4+uZgYg%S4T~dxT28-RY>;&8v^{e^#G#ni3 z|I)@|o(Al)nx83bFQY|O#<`-JQn8nCE@Q!78%~0N)_daQm*UlJ-6M9Db5xjZpnBJF z_?^asSZLc_@w@DIntg1)KliFf$!46>jMdrG(tnEq1o^Dc?x?+X76*}kdDJa4W`5t! zbz!z89+-ScCC)3M#5@;qdC(JDvWp^?!G5=W)%Z%$dPp+j4vF83yX$`rb#W>L@vXLx zgYikvgs@$Uj1+NwYTT5RJ29_%zR4MV#57&-^FSHoqTZ*mK(E z&+ID3wwF^_-id0NFPA51tUNK=o`zpz+Is%Fs?mz6>i*ZKXnImbTVzO-NU7djr0_fq z>>L}URGB`J#zw)a6!wI(1?J1;30ADjOItLqe5Yd<>dw<3(ssRqK1JHrap$ruwq@F4 z%e^Mg$>pVOYI*NM*)k0xZ8 zsGv{Ew|3Idd~RsFUMb7)c3o~+R@4)^aPB;HY`(>ZROY>uww(V{P62IKD)Ta8t1~oQ z&N-opw$+?lrmgX9)ohy5mObj#w8bl1eqyrB&~Q0vo9bX)xVB7#$hTEmKuTLXeSLmp zjBl&*GGd%rMvaR`+iI?@qAgT_2360Dx=^rbzO9<8;oJ3@25~uQTaKEvotIVGm(gjS z^Neqa?rf%Q_3F!HS(T2#h;d5s<>lKlZR_|o;=4>+I4nx^n5y_63KE@Jraly0seFQ- zus$2H%gMK?4wlDP)pKo`I>xueOjbQJ8bez+u$-cPwt(uKn@XO`N!y~H(1p5XzJlSbi313lBzHY`n*R&SQc*%W`m~Q0nF6+bWM^v!+m<6QEC* zby<}*Kwjur9v65$a`fe+EiLA}XzaGpYle@SvyF&T6P`-tW3pHf~k^-MV}AzOpn&-sUR)t=7h9^SrP7yONjw zEVBJHbcP~!mNu~sM9RPkW-)>{<~_)EmRK>GfqR^Xb)-lL;wUX13D4;hdGzgVA(YD5 zNETZ}t$b4&evWI}@IaiydsQCmy>5%~j)R=5h?9wtGmI*ukevl=vOWxiy!E*5UgwX-!PVE#0wGB5g)teG~7; z^EoLgo}~C7;UnwocZwn-1>esxpPZ&n+PI}&-_%&ZHJljzPZ(Gsor;BQLR-{w(0$u#0cO&>GST9lrO64Pv1mB?6O6(gA;ah5GB8& zFO~#;alI0*xdVUwAw=or+~DsXF&zNA0Uv+&2s2IfUL9t%W6jd6hO2Rd_Giy6A}79k zq>ZcV-(h81<+kiLyc34^({6Nl6G^eY_zn_#*Vs+r6OE9!&e7Uj3eofPfAdJOE#)(r zKULqmzIUW7Q{tGSd8eAl70oU9>5;$jtq1JeAj)_CgwQ92oa~lB7l-0zk&hZ{nj3!O zom=SUQDZ2!z_k2!nTKVAy-%Eef2`}WreQls(6HiJ(*KryUFh$&^6%;%ZMG+rH6M9) zP0ygm*Q$xQUsm;&=cf|2?9IDhtJJt3DKDLtAV-$8E@B4@yo%pGmY(W6F_+Oirie7& z)+*!qtPMtMYoEQv5eqC-w0>Jj%h|sNZ86f`cD)o&+m8DNO>bFNDU2512kLQLr=pRw zD>Ju}kb=>$6$^VsY!^*=M4sIe%U1+6C%Tyo(zTc+`SYO{}{vWp7o_OF{O!E>)+Kp_Sy; z+jRD3&5`d7B!BVU#EYnj5ees-m|+L)^Q?B>Iy$;0#ws5x0!Tx@w38(aT~%+P24zty zUq-9eIIL%fsfqFC@ot)!uSoQ~wARO(6HD<`I(`{nrJF|uZoo28?JqI=ysy*yuhZ)(=_~c2t<+Zr ztc|U7*2a9LK8%(6>VVO(>S*wldf3NfI@Y#!bfA$9(~7(1XmB(X(6ByWG^{%s91R6D zYz!C;MEiI1@LfJDZWr5t+i{0($L0MeoLu6T+;UX^eC#@C@p!9zM-!(<)cJ}$_MQ0i z@?MU*owfCpbW85gEV)ST;jY)|eOIl_t+~Un=5p@ckP*>+PlzL8D2NzxWG?&sTt>v5 zs2+_-G$QY!7_yA;_}qLau_M9|G2{qcEL$!kTt;x7W%PqD%U(5P8R2nKE+g!O*BlW; zj+EkLZI=-)BZ@L&$k9@~smBrFh$tdr$PrVV3+ae(L=+J*eyMmtp?Pp7MIa`YNk@*?W=kcz;dP@3^ zq@nFdLUIq;y6bUM9ye8to8oPpLq>$hO*taw5i#VrsjY)iB}5F_XFc(=J{%DxL=4$yJ#nA4BVrB_(?R>J>AkP4ePx}ltfzy1 zKLx#aL^vXfh#0cZdg?xF*P2RN(~y1EQ}4!g|#6iq)xv&$)(%ZhREb2OI%4Zz_jyOKQ@#*T!qHyNJZy@0Z7!dN_O^P7Oy zvOH;`rJtlZw39TE-oribgWh|jh)0T~ks>RDj-8_SjtECY5fMXr_-V_Ww!AXv^FHXkuOoaNQCvq1Id*Er zW2bx_QCdd~Id*ErW2YPuB}5E4c520AryLO_L<~80YULEMQ+|@>Fiz539kkDy-uueh zSJvsudUen~YkKd9a6}XlF=U_hs{5>c9Wl3#SRJ&_n%=vNa2ZjQ5kvM_vm?V1;fN?A zV#w!xth&$IwWgBRG-RLks{5=R5hX+n*=N1#K5Iur2@yl~S+Ba!+7VGg#E^Z~tM0RQ zM9d*#ZO}ezdhfO-w>71Y)iB}5F_XT9b=Yez&0 z5kvM_ues0K5iy5|^+Efr>AidC+(Vao=++1Av!?fs2uDN_5kvM_ue;CMwWgBRG-RLk zy8Emh5hX+n*=N1(K5Iur2@yl~S+Bd#+7VGg#E^Z~>+Z95M3fLQWS{lA`>Y)iB}5F_ zXT9z|Yez&05kvM_ue;CM5iy5|jY0dY>AidC+(Vao=r#uJv!?fs2uDN_5kvM_Z@ACe zwWgBRG-RLkhWo4?5hX+n*=N1sK5Iur2@yl~S#P+{+7VGg#E^Z~8}74qM3fLQWS{kh z`>Y)iB}5F_XT9M*Yez&05kvM_Z=Awsy(!A=WjhH69qadzevkC!UC&3^fzph!D_Wf_ z%dY!-UE9?+x;jvbQT8Hxm%Y!H!slPJf61N{avo)`v%l-kb){TZnrr(1AiFh}n|xa# z->%ZX&0gtwu?5GfjS{Yf685qWC(mDpb{r)w7%Amlm2)`jOF8u@JfUxU*=}gtzhz&a zl*s*U<=@pk+HFsu%l2HFYkCGvzE*iZi<0kDp5^(eM6tilQtqmZk4Z{Or^Xj5VJUl= zz0ohmWFmUU(sxzvuWHeYlk&fw(LJV!bl%SPv*)um7`?51Z#72yYCpY*{kE1aXaAn1 z^k+m5l=hukvZvHgXcR_eB-$@&4AxW%Iru)%^ZiibN2S^dwd||?hw9_?kb=>8oZVE) zQRp@NR2~uNJke~vq8e6p4UbTMn)_v_r_33Lst;a$sd5izoHkZjOZw&|QQE!iQT8<4 zOIv+g`Hoe_YxVdjY4t7Lw>ph~>3!ti*tVNW0gdR@_iEqHjGx-lMsBCyRcUXjq$8F4 zCQ!b<{vS_h+nqX9^%YUOubT5wHQvs#a#wAGQhO&{uo-xTQHbb$ud%Kg=M<;V@M`#s zvQ3T4ZE?wHZvLQM@a&PET+v@pMoU0qM9mH5ITYvZWz0%eB4zWv@tHo<9orsh@|&o0 ztR66rRnw+RAHj1vAKVaL-|NnfdjGk5JkPbG6U3z)HP7?(`gse#Ti?nOtB#VnJSyh_ zucw!CM&rEPB4XnFOW&&HoaMi+r;JjZiBh_ZM1y=bp_RD=ei69dWHoZPiiU8-8*$i9 z>8VjK&N-1Ei#0?ETg{nyzlS1S^YQu|GIm)U3ZL-(Qe`ZsL{7dx-P|T&GeftQmmbxF2z2@N^#?YrPx{`d(O5}WR=ZI(WTh2 zG|c&R^t=><#znoisK4gxGqbA1by?J87EJ=KxCA^W30Ox-mw+h-yRYNg>FE;i;w9iz za^+&zans%skTb@{LNS(eFqdE&)ebYRSYRUbg3Vv)8&UGxfgV-KwUg zOTn{A!7dbaIe4*h(5&GXZO`m42bbk{oCg0R)&rM>z80Lfx}p`a`o(t?eJHsk9E*3V zXK(lF)p@b&yn6MtbV+zNN!W#=E(tGI60*wTT}c+3_0nklb2(FDF%G|{1Ua_yBgg00yu zj%Y)THdW*yHj-=dJ+njXSb1XvJf6aOy}inh$gVV-1F$=NnkmwfK8C#|wkK~QDzaaXzxWhM2Q?41D~H`PcFlf+tr)k009 zP&WO=YHzi_*7bqfPrg0fTMGYqZu|S#`vwit*tTu$P@z6{>Dm6aUzB<*>N0a*(^_V% z*?U`koQA7wy4uz(Nv~OpY^c1lm;9>wW4)q2T4ac^c36un*S2bgqqkKT)VitPcp-kB z^&u9M$OP_OV z7frT|B|o<$K@TNm-%0u)p?Bt_?pHw;yCybFO>E+tnDAg)>BP|zR#}`};Q?zHjuuBtBU<{Lle=hWS#`8HS{l*P=M3FN zL(7_@#nIA;mOkg}E*e_a9W9QQMzr)fdw0>$g3s42$ab2q+r+ADVnk=SH$U2tp@t*I z{rOFHv1NI&9Wgm#+^6q8eLD@?BHYTBWa0oFF*#z~ukVOyLyUX&^LW!`#jv_o=2}^^ zR_5OQ6fy4KcNxQ!(Pk)LS_w_qs+7RR3{uD9p z?{~zsA;vxaDPr8`?}%we3}*(oKPLKPpoUwQ-MZXlT^_s7Uom$%V%+ENGNw(&xX(XD zjQji@F>Q!(pMQ!N_xU?w+7RPD{}eIq^LNCwA;x|FDPr8`?}%wbjQjjk#JJDj5z~ek z_xY!Yai6~-x1S>829<7h;g64Bc=^8?(-x1S>829<7 zh;g64Bc=^8?({0eq`T68~V&AL$H>J{!WdHrU%D=8BoUJ}pxl7rGemw|vTZy)> zl(M!$TXs*Z#&1;WVfIRQW4-h3Jyxqm*^0^@Dbl#P^1rogd!WJUQvj zlat=_XpK_U6fHZ=PiR(r>Pw_U6gy zj3;TwjnDLQ|kFt-U3|lo%w6;Yn=P#>Y(y^95%--p{H@hB*M#o|Oy{W&?LW!g~ z3h~eHvwf|TpQ(lK^tm0yyX4-I z+VVkP`ct+drkY3mguWCp9`YB>OTHg`Vh8U05PZ8tM`i_`3 z#CWuRiWra9cf_iP(+Q#1UPEiQR7ho zMbvm)fTN}zH69sIM2*J=IBMEZ6VKan8(Q40>9GNhns(H9Y(Oe&ICIZq13WgMIW}Od zz3?7UlSkA*4@ZrorVTY78&H%r9vk2qTDykkCj=Bxp>ncGP%mKoK<_8{nvEM~%k@6j9@`0gjq>)Oc(_5j7qg;HYUsO?>Z>#|Btz z0Q7Kwjr(hw{WbAjUyhoH8jlTdS<^0SJT@TJ&?X)m5Z+MqR&R(J&EDMPZ*$@uP(?2B z*noL@xQ6B$TAPOEu>nQYcx-@cXzd!B#|9Kpp>ncGP%mKoK<_8{nvEM~%k@6j9@`0gjqB)J)|I_r09?UQX!YmS?v-H(Q>k{!W2{ z{|b6IY8*9fsPWinQY zcx-^9rX4jN8&E`z#|AiR+EL@N0Y%h!Y=EPt9W@>sP(+Q#1~_WkQRA@zMbvbU4cOEh zZ};@3+nmeB*^<1Z_p)2s(RHk6Z}l$R1Em^eS7gy<-|U*6e@L#s(X#`k8)ci>)9iM3 zQzh`pJi7=sY+hTHuUR3sNKqp-rMUluiRE>)2{N`uCq5P z`%wGO_LBPNTYjwejj|QhF;Y7AXIZ~qhjt$&?HwtF)o`pU%a_;C>;W-fd!G{Qyz)LJ zm=C5;31*M(Q-ay%`jlXHiasUSnypU>roZ+n!DM-#5^P@UQ^K-%9vt>7qd#BrhVB0C z=+B$+ZM6O9(Vst;v;G{)Ox&NQ{ke2mb9Uc$#5bJu85iDf(7zr1`4lPHza9N&(Xk|6 z|914})A2d*={W1pr{i!iMzo1O!oPG@{dTW);e zAKLgJyQA+9+2ial(uZ&9*F&vX|EPO6b?;AM<@Z#-?Dt;+jW0vL=-Vx=f{(PSeG}Hj ztc4%w*M3;Jf7Gw1db$%<$=6SK=dt?zAgsROsB2mUuZDcLPE!0Ry2La*XZ_6DJaXT? z?2$?yWlvPkaaiNCmVcOir(C4ryE#j;o9m^kG$CY^waTO+HSiZ*Y+6Nt%xSr z*COAQ^V^-N($$YbPCKk>>F09OZl#tok#@|FZM4INSw9MkNjt1eb(HB&J8Uycl0W0tjC)4aHtv@?r-H6kPJ>iH>h&}!hjVPO`;MxKat$p1&$ zwas@}b=SprUF7U#loh7GnokSAkZZ8`&Z;z1*I;w>j2d*O?#NmTvr_%cAJIEJ4YqAY z=Tuf^rFPo=C9F8pky~^Pwr%CKF4_?R(60S27VWxdu+6l?qlbmCi}t@*v}@L2R|5{g zShb$U6X-S$^Ig3zoX>ZcQIn|adz%K^yb_0Y>(czu<;;3lT~DOlBJo|D1`FRcVUd_4 zcy1$DlqR+OdAVta20NZbb3Ks-@}aa?Ea15|dtdkN={HZe^cM@R*=)&c8gR7orqN)h zx7K#Io}E5r%+cNAYdP92d6WnhK-Ol67keoiJLjEOeIovf`ta5)(ydB!zO83R(y9;j z|GBK+MB-WGTfH5)Xi-jM*{W)Y5p#_3YxxB54qQ>4W`U+})ppu>-$%Dpr?DKTHFPh% zYYq9h9i3RniQvP^{z7-~JsgVS&?Amp)tDAz6Pr>Nb8`Z~l4j@T(qo=3shQ*-!PIlH8`aif1{nYnDWPHw(2A{E~E*^7C_XRmbDsk%PhOk^dT zE>D-#pLh1M)eN&um)WaZYSu=pvz6td&t5-ef0ieMNI%|Yc z#e4a0ce4BPmv3se`k1{7)`=y_s(1Pj|Gt|2RpmXB)H+bw_jye&vhhInfjDob| z|8w18*0Xr{L&<)k=85(L26>1KBu1Yo{ajk&PtIpha>d;Ml+GBkd*F?}7Jaa* zzFGV{<>99x&nWvznx*-uW7K|KsecWn#@V|HouBd^?I1egp~mCBy!Ln07xd-3L5C(z zg!@0}4kSlw@zG)Si_*Oi@4$_F Uy2>?T?(U1*xo dic """Fallback method to get GPU memory on Windows using nvidia-smi.""" current_used = self._get_gpu_memory_nvidia_smi() if current_used is not None: - memory_info["gpu_used_mb"] = float(current_used) - memory_info["total_vram_mb"] = float(current_used) + memory_info["gpu_used_mb"] = current_used + memory_info["total_vram_mb"] = current_used # Calculate delta from baseline - vram_delta = float(current_used) - float(self.baseline_gpu_memory_mb) - memory_info["host_vram_mb"] = float(max(0.0, vram_delta)) + vram_delta = current_used - self.baseline_gpu_memory_mb + memory_info["host_vram_mb"] = max(0.0, vram_delta) # Try to get total GPU memory try: @@ -215,7 +215,7 @@ def get_memory_usage(self) -> dict[str, float]: if self.nvml_initialized and self.gpu_handle and nvml is not None: try: # Get total GPU memory info - mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) # type: ignore[attr-defined] + mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) current_used_mb = float(mem_info.used or 0) / 1024 / 1024 if mem_info else 0 memory_info["gpu_used_mb"] = current_used_mb memory_info["gpu_total_mb"] = float(mem_info.total or 0) / 1024 / 1024 if mem_info else 0 @@ -260,7 +260,7 @@ def reset_baseline(self): """Reset the baseline GPU memory measurement.""" if self.nvml_initialized and self.gpu_handle and nvml is not None: try: - mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) # type: ignore[attr-defined] + mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle) old_baseline = self.baseline_gpu_memory_mb self.baseline_gpu_memory_mb = float(mem_info.used or 0) / 1024 / 1024 if mem_info else 0 print( @@ -275,7 +275,7 @@ def __del__(self): """Cleanup NVML on deletion.""" if self.nvml_initialized and nvml is not None: with contextlib.suppress(Exception): - nvml.nvmlShutdown() # type: ignore[attr-defined] + nvml.nvmlShutdown() async def create_memory_benchmark_extension() -> str: @@ -416,7 +416,7 @@ async def run_scaling_test( # Create extensions extensions = [] - extension_venv_root = getattr(self.test_base, "test_root", None) + extension_venv_root = self.test_base.test_root if extension_venv_root is not None: extension_venv_root = extension_venv_root / "extension-venvs" else: @@ -633,7 +633,7 @@ async def run_large_tensor_sharing_test( # Create extensions extensions = [] - extension_venv_root = getattr(self.test_base, "test_root", None) + extension_venv_root = self.test_base.test_root if extension_venv_root is not None: extension_venv_root = extension_venv_root / "extension-venvs" else: @@ -1020,46 +1020,50 @@ def main(): backend = "xpu" else: backend = "cpu" - if backend == "cuda" and cuda_available: - # Use getattr to avoid linter errors for torch.version.hip - torch_version = getattr(torch, 'version', None) - hip_version = getattr(torch_version, 'hip', None) if torch_version else None - if platform.system() == "Linux" and hip_version is not None: - print("[PyIsolate] ROCm (AMD) backend detected on Linux.") - elif platform.system() == "Windows": - print("[PyIsolate] ROCm is not supported on Windows. Falling back to CPU.") - backend = "cpu" - if backend == "cuda": - if args.device is not None: - if str(args.device).lower() == "cpu": - backend = "cpu" - device_str = "cpu" - device_name = "cpu" - print("[PyIsolate] Forcing CPU mode due to --device=cpu") - args.no_gpu = True - else: - try: - device_idx = int(args.device) - torch.cuda.set_device(device_idx) - device_str = f"cuda{device_idx}" - device_name = torch.cuda.get_device_name(device_idx) - except ValueError: - print(f"Invalid --device value: {args.device}. Must be integer or 'cpu'.") - sys.exit(1) + + if backend == "cuda": + if not cuda_available: + print("[PyIsolate] CUDA backend requested but not available. Exiting.") + sys.exit(1) + # Only check for ROCm on Linux + if platform.system() == "Linux": + torch_version = getattr(torch, 'version', None) + hip_version = getattr(torch_version, 'hip', None) if torch_version else None + if hip_version is not None: + print("[PyIsolate] ROCm (AMD) backend detected on Linux.") + # On Windows, just use CUDA if available + if args.device is not None: + if str(args.device).lower() == "cpu": + backend = "cpu" + device_str = "cpu" + device_name = "cpu" + print("[PyIsolate] Forcing CPU mode due to --device=cpu") else: - device_idx = torch.cuda.current_device() - device_str = f"cuda{device_idx}" - device_name = torch.cuda.get_device_name(device_idx) - backend_used = "cuda" - print(f"[PyIsolate] Using CUDA/ROCm device {device_idx}: {device_name}") - elif backend == "xpu" and xpu_available: + try: + device_idx = int(args.device) + torch.cuda.set_device(device_idx) + device_str = f"cuda{device_idx}" + device_name = torch.cuda.get_device_name(device_idx) + except ValueError: + print(f"Invalid --device value: {args.device}. Must be integer or 'cpu'.") + sys.exit(1) + else: + device_idx = torch.cuda.current_device() + device_str = f"cuda{device_idx}" + device_name = torch.cuda.get_device_name(device_idx) + backend_used = "cuda" + print(f"[PyIsolate] Using CUDA device {device_idx}: {device_name}") + + elif backend == "xpu": + if not xpu_available: + print("[PyIsolate] XPU backend requested but not available. Exiting.") + sys.exit(1) if args.device is not None: if str(args.device).lower() == "cpu": backend = "cpu" device_str = "cpu" device_name = "cpu" print("[PyIsolate] Forcing CPU mode due to --device=cpu") - args.no_gpu = True else: try: device_idx = int(args.device) @@ -1083,8 +1087,11 @@ def main(): ) backend_used = "xpu" print(f"[PyIsolate] Using Intel XPU device {device_idx}: {device_name}") + else: - print("[PyIsolate] No supported GPU backend available, using CPU only.") + print("[PyIsolate] No supported GPU backend available, exiting.") + sys.exit(1) + except Exception as e: print(f"[PyIsolate] Error setting device/backend: {e}") diff --git a/pyproject.toml b/pyproject.toml index 2c955ba..6f3741f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,6 @@ test = [ "pytest-asyncio>=0.21.0", # Required for async test fixtures "pytest>=7.0", # Required by benchmark scripts that import from tests "pyyaml>=5.4.0", # For test manifest creation - "tabulate>=0.9.0", # For nice output formatting "torch>=2.0.0", # For testing share_torch functionality ] bench = [ diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 92336f3..acdccc7 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -39,12 +39,15 @@ f"{torch.cuda.get_device_name(torch.cuda.current_device())}" ) else: - print("[PyIsolate] CUDA not available, using CPU only.") + print("[PyIsolate] CUDA not available, exiting.") + import sys + sys.exit(1) except ImportError: TORCH_AVAILABLE = False CUDA_AVAILABLE = False from .test_integration import IntegrationTestBase +from shared import ExampleExtensionBase # Add this import for type annotations class BenchmarkResults: @@ -204,7 +207,7 @@ def print_summary(self): class TestRPCBenchmarks(IntegrationTestBase): """Benchmark tests for RPC call overhead.""" - benchmark_ext_shared: Optional[object] = None + benchmark_ext_shared: Optional[ExampleExtensionBase] = None runner: Optional[BenchmarkRunner] = None @pytest.fixture(autouse=True) @@ -250,7 +253,7 @@ async def test_small_data_benchmarks(self): print("SMALL DATA BENCHMARKS") print("=" * 60) - assert self.runner is not None # type: ignore + assert self.runner is not None # Integer benchmarks test_int = 42 await self.runner.run_benchmark( @@ -276,7 +279,7 @@ async def test_large_data_benchmarks(self): print("LARGE DATA BENCHMARKS") print("=" * 60) - assert self.runner is not None # type: ignore + assert self.runner is not None # Large numpy array (10MB) large_array = np.random.random((1024, 1024)) # ~8MB float64 @@ -309,7 +312,7 @@ async def test_torch_tensor_benchmarks(self): print("TORCH TENSOR BENCHMARKS") print("=" * 60) - assert self.runner is not None # type: ignore + assert self.runner is not None # Small tensor (CPU) with torch.inference_mode(): small_tensor_cpu = torch.randn(100, 100) # ~40KB @@ -357,7 +360,7 @@ async def test_complex_call_patterns(self): print("COMPLEX CALL PATTERN BENCHMARKS") print("=" * 60) - assert self.runner is not None # type: ignore + assert self.runner is not None # Recursive calls through host singleton await self.runner.run_benchmark( "Recursive Host Calls (depth=3)", lambda: self.benchmark_ext.recursive_host_call(3) @@ -373,12 +376,12 @@ async def test_print_final_summary(self): # Small delay to ensure this runs last await asyncio.sleep(0.1) - assert self.runner is not None # type: ignore + assert self.runner is not None self.runner.print_summary() # Basic assertions to ensure benchmarks ran assert len(self.runner.results) > 0, "No benchmark results found" - assert self.runner is not None # type: ignore + assert self.runner is not None # Verify we have both local and RPC results for comparison local_results = [r for r in self.runner.results if "local" in r.name.lower()] From 514964b17f38f231aff3a6e2c26921ecfc4b86e1 Mon Sep 17 00:00:00 2001 From: billybasass <82277923+billybasass@users.noreply.github.com> Date: Sat, 26 Jul 2025 10:54:25 -0700 Subject: [PATCH 15/15] Updated device selection logic to allow tests/benchmarks to run on Intel XPU and AMD ROCm GPUs, not just CUDA.Fixed unresolved import for ExampleExtensionBase by importing from example.shared. --- ...results_DESKTOP-RVUH7SG_20250723_142122.txt | Bin 232950 -> 0 bytes benchmarks/benchmark.py | 8 +++++++- tests/test_benchmarks.py | 13 +++++++++++-- 3 files changed, 18 insertions(+), 3 deletions(-) delete mode 100644 benchmark_results_DESKTOP-RVUH7SG_20250723_142122.txt diff --git a/benchmark_results_DESKTOP-RVUH7SG_20250723_142122.txt b/benchmark_results_DESKTOP-RVUH7SG_20250723_142122.txt deleted file mode 100644 index e07cd7a8482f6422a67037daf7ce0dfaadfe80df..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 232950 zcmeI5`*IXXvgY&eV|L!5tZjFjo*vYws&r$GwE=M%Y#N$H(6eLBtVSR~vyBidiEfxP zXC7i7_?-QI;T6ir%Bz$Tph~y0lCm=6>fv9whr37QfBwf$*|lsf`z{-2lkoSNu2!;9 zwwz6~>$+YJf5+Khb#E&>&UUgx{e7Li&0guw^K4I7FS4Jr{qW=$-Fv4L2fDYH9qF?a zKL48iOLpTke`v#2_7{DhWOuR$YU%y#VRkqBA$zP}H+A>Et{-M!PFp{Ue#fHYzQ$mb z{W(y5SG4_2_i5#(uJ=Xn_aW7x(md2C>?;T9Z>yJE*`L&}AF@Z;RrT`6>}mE#jrLY{ zFXX3;VyPpg+0*zPsjdT!ET!y)QGcDi(cSAI-vd2;5k~n)sb1>&eCyz!W!-%gQjL`6 zKv$&V|1Y6mJIZIZSZ`>}X}Ixy_Nz*y0zC&QBh~SCMu9h~nNmmk{a#NaYF_FW&pw7; z(bFZR`=IYAjO|mUj~Jx%W0h;;HBriC;g-MGl)A20^an0ks=)BJ>a%?G^k$&KU8R1e z_Q7#4RpUE-M}a2r=(F(qwOVzcRvWFJslSZOc0~h6D04e+~h{YZ1lDEn{v8aKRCneX(gNa2xM@jf7hHd7BJz|r(zN9}{! z)@sti1-tt02sQh9a#eXgYJMayv}R_5=kvLZmQ{WJnSG8AqZV63%Qk0ydnMW(st5Pg zM{pCl~~`a*PgsV`j0vn!fInt0D>(#%!YRrax3VjTR9O8rSV`Mv%> zRz3Tn_C3vB9|9iUDXq;D^ZmLT#toGGPhElHysdXsK62<#y#-&^FUI(F;BK2cky^|h zd1|YJzqG~p88lGF@{9*|)UIEY^HoS;+-6T#)q0!Fe$(?qm2G{b_Pe?R&skZ=f&M>- zRJ)<&Z&fDqnWZ_({(JaEujp5lw^lcN%lO540<;_i?p)H{Cz=hw;z+I85p_nP)}7D? z%eSjmyi)tPXYFCE4+B>mhFnPXJ#lzS^PEraDjE(=O5|_W7+!rEYW-n_u6FLU4r53#7K^OkW`;MsIs)U$2)+s_O)4!(E*V#mvVOKjUtCkU z6X~PmOmn3zws9vln(KVJH=@!jeLjX3EQfZ^r^)3(tdl+!>-(rOqE2v>^kzdT$4P%u z2|;P=dJc6=f*BRlG*?B%Bef2^!B^y=m3-%0@KI@zN2FN|&x>`>%QDi@6WVBHL3h4g z^rVUUh4w;g;i_D+&&yYMDVF@Gz)SP`WYi!ZsmwZ^g_6p3GK1QvL|W!DooA7^Y%D;C zX_^Z{mRaz0Th?{#f$%K3z6@Mg{@q=4R^ z^FtdRX@+^6tZE)g`rlD&Z|N`Be++A#$66O`g_cglr!}#9Qle;snAkAZq?f=sMiBfOq+qDgKb)vbR zrq0z_olW(c24cNsen(?Q8^~oET$}3?w4l5q-=0Z`Sk88kLiH=d?SPS8(Ro=CD5YYN zhDJf?M_;yuDYG45Pr#b6t^Ct~v}u@^k0l+C1D!#CaikfKM;cG~6d7BzC~RrOUn{TS z@KbOsqk>KeN1>IWlfq+AIGq!=^wd@y#Tf@(kM**(I8Sv?y?h!P zYFC(~HCXP(+5ZUZ=A0YQ#;QJ_V+H%qFi*tR(6WnL!&dP(RC%j0tmmrBv4-6Yh_qF& ztsu?9)o%5Q=&i;Km;&eZwx6Yw=iA{5J0%#dl26T&V_WiNCD??n>F+buiX4HTqm}Y% zkbT0k&+j?G{Pj3pLW)Yyj=mp4F<~nBjuWDwDMd&@VFqTcE*Kwd5^F&IEuL1^B z{VSbwQi(VYbYo`O*Q|szGRvQ>)3HNhi7bzNo(_ugWz~Z&fZZ_KPIBGgAdK^&MhAN= z7MXIHsmHA8^qh{~O_hvOH{<J`MqD$}d{x1#%usUEFLuYEE!Ag)=Sf%W0 zQMiCrxmgq2C;;z|weElh@p@6H@H$`ttMj%bA?uPNlCMZ(q2KH3`G%;FUSAvy*2HV& z`JjyCI76^vO+Acw5V0tM^R#bQ!X0MYXdOk~zY1~zn}hdyy3Vupb77RZZ#4d^qQvtG4w#Dn@l)97=fCi>T#^6vrfBsr*re~xWL)$W z37v^-QrpMG2vdwD)1dKO(O&SPCPC|dhO=t3)x zI!h`)ZKYn+3)<7?I&rG^Fb_vvCNB}*o!8#VIvToFxn1yQu2-ey=ep6KgB*xj<`r>y zu0y8ihY}A&32f_VWRV)CBSFhlLq*>=zX@{?GY*#YUz8s!2L0zbHsSd?HbnzgHar9E z<|9Q*S{8Qx-+wiIX+>$wE7&$dj9hGOPSw-oy7NM*P8&xt)!WkbO*b^Q z<-VAaY_D~Z#&Ychk*ouo5Lr)`E|_e~{i(-t(Qs)|KpQh$;!kWE%c7=tdStfeznJmt zQtu+6b5+q1QB!T!dYUNH1)uHtKmCX)FpRvY#l zG|jj5bm@Z4w%nh3EEf@%M5ftH+a^2beGlDq*}7a8Gj{d0EfW4VY5g1q^7b~FUf0!-w2FtC}yZ>q{^`c-L%B|d>1onP4z9x z*x^-A?I_JFwJM5q(G~0SeY%l5e|?eND@6M4DSF6u2H?Y|uXsa=i(r?o?Q^xAy|&|s znVS^fC~7E1DBYb!MGz_dDR+5%uX>*=5}Y^?c7hSZoSr#i(H_sj9*I4rAtJ{1PJ9(& z;%t8{(H;1#ELM;_BgIv*`#A2lA?||Dfol6d`p;Pd+p}F{?E4|2{*nH(M~DcJL)EgY z`YhJ{$}C?Vy%Nid|NnP(`p;#dz3o}zObf<`Sm^2zAWoGSC!>E(z1UA^^+MD=5|?sL z!^@D`#+wKv+mTFMPRv`Td&;NP`x&st!QIv8STvtzz^*0ZN8%GHiSr^1Tk(v5XSyGw zrP}$nOuIV%#j=s|FI3BO6DOM58B>}P4+uZgYg%S4T~dxT28-RY>;&8v^{e^#G#ni3 z|I)@|o(Al)nx83bFQY|O#<`-JQn8nCE@Q!78%~0N)_daQm*UlJ-6M9Db5xjZpnBJF z_?^asSZLc_@w@DIntg1)KliFf$!46>jMdrG(tnEq1o^Dc?x?+X76*}kdDJa4W`5t! zbz!z89+-ScCC)3M#5@;qdC(JDvWp^?!G5=W)%Z%$dPp+j4vF83yX$`rb#W>L@vXLx zgYikvgs@$Uj1+NwYTT5RJ29_%zR4MV#57&-^FSHoqTZ*mK(E z&+ID3wwF^_-id0NFPA51tUNK=o`zpz+Is%Fs?mz6>i*ZKXnImbTVzO-NU7djr0_fq z>>L}URGB`J#zw)a6!wI(1?J1;30ADjOItLqe5Yd<>dw<3(ssRqK1JHrap$ruwq@F4 z%e^Mg$>pVOYI*NM*)k0xZ8 zsGv{Ew|3Idd~RsFUMb7)c3o~+R@4)^aPB;HY`(>ZROY>uww(V{P62IKD)Ta8t1~oQ z&N-opw$+?lrmgX9)ohy5mObj#w8bl1eqyrB&~Q0vo9bX)xVB7#$hTEmKuTLXeSLmp zjBl&*GGd%rMvaR`+iI?@qAgT_2360Dx=^rbzO9<8;oJ3@25~uQTaKEvotIVGm(gjS z^Neqa?rf%Q_3F!HS(T2#h;d5s<>lKlZR_|o;=4>+I4nx^n5y_63KE@Jraly0seFQ- zus$2H%gMK?4wlDP)pKo`I>xueOjbQJ8bez+u$-cPwt(uKn@XO`N!y~H(1p5XzJlSbi313lBzHY`n*R&SQc*%W`m~Q0nF6+bWM^v!+m<6QEC* zby<}*Kwjur9v65$a`fe+EiLA}XzaGpYle@SvyF&T6P`-tW3pHf~k^-MV}AzOpn&-sUR)t=7h9^SrP7yONjw zEVBJHbcP~!mNu~sM9RPkW-)>{<~_)EmRK>GfqR^Xb)-lL;wUX13D4;hdGzgVA(YD5 zNETZ}t$b4&evWI}@IaiydsQCmy>5%~j)R=5h?9wtGmI*ukevl=vOWxiy!E*5UgwX-!PVE#0wGB5g)teG~7; z^EoLgo}~C7;UnwocZwn-1>esxpPZ&n+PI}&-_%&ZHJljzPZ(Gsor;BQLR-{w(0$u#0cO&>GST9lrO64Pv1mB?6O6(gA;ah5GB8& zFO~#;alI0*xdVUwAw=or+~DsXF&zNA0Uv+&2s2IfUL9t%W6jd6hO2Rd_Giy6A}79k zq>ZcV-(h81<+kiLyc34^({6Nl6G^eY_zn_#*Vs+r6OE9!&e7Uj3eofPfAdJOE#)(r zKULqmzIUW7Q{tGSd8eAl70oU9>5;$jtq1JeAj)_CgwQ92oa~lB7l-0zk&hZ{nj3!O zom=SUQDZ2!z_k2!nTKVAy-%Eef2`}WreQls(6HiJ(*KryUFh$&^6%;%ZMG+rH6M9) zP0ygm*Q$xQUsm;&=cf|2?9IDhtJJt3DKDLtAV-$8E@B4@yo%pGmY(W6F_+Oirie7& z)+*!qtPMtMYoEQv5eqC-w0>Jj%h|sNZ86f`cD)o&+m8DNO>bFNDU2512kLQLr=pRw zD>Ju}kb=>$6$^VsY!^*=M4sIe%U1+6C%Tyo(zTc+`SYO{}{vWp7o_OF{O!E>)+Kp_Sy; z+jRD3&5`d7B!BVU#EYnj5ees-m|+L)^Q?B>Iy$;0#ws5x0!Tx@w38(aT~%+P24zty zUq-9eIIL%fsfqFC@ot)!uSoQ~wARO(6HD<`I(`{nrJF|uZoo28?JqI=ysy*yuhZ)(=_~c2t<+Zr ztc|U7*2a9LK8%(6>VVO(>S*wldf3NfI@Y#!bfA$9(~7(1XmB(X(6ByWG^{%s91R6D zYz!C;MEiI1@LfJDZWr5t+i{0($L0MeoLu6T+;UX^eC#@C@p!9zM-!(<)cJ}$_MQ0i z@?MU*owfCpbW85gEV)ST;jY)|eOIl_t+~Un=5p@ckP*>+PlzL8D2NzxWG?&sTt>v5 zs2+_-G$QY!7_yA;_}qLau_M9|G2{qcEL$!kTt;x7W%PqD%U(5P8R2nKE+g!O*BlW; zj+EkLZI=-)BZ@L&$k9@~smBrFh$tdr$PrVV3+ae(L=+J*eyMmtp?Pp7MIa`YNk@*?W=kcz;dP@3^ zq@nFdLUIq;y6bUM9ye8to8oPpLq>$hO*taw5i#VrsjY)iB}5F_XFc(=J{%DxL=4$yJ#nA4BVrB_(?R>J>AkP4ePx}ltfzy1 zKLx#aL^vXfh#0cZdg?xF*P2RN(~y1EQ}4!g|#6iq)xv&$)(%ZhREb2OI%4Zz_jyOKQ@#*T!qHyNJZy@0Z7!dN_O^P7Oy zvOH;`rJtlZw39TE-oribgWh|jh)0T~ks>RDj-8_SjtECY5fMXr_-V_Ww!AXv^FHXkuOoaNQCvq1Id*Er zW2bx_QCdd~Id*ErW2YPuB}5E4c520AryLO_L<~80YULEMQ+|@>Fiz539kkDy-uueh zSJvsudUen~YkKd9a6}XlF=U_hs{5>c9Wl3#SRJ&_n%=vNa2ZjQ5kvM_vm?V1;fN?A zV#w!xth&$IwWgBRG-RLks{5=R5hX+n*=N1#K5Iur2@yl~S+Ba!+7VGg#E^Z~tM0RQ zM9d*#ZO}ezdhfO-w>71Y)iB}5F_XT9b=Yez&0 z5kvM_ues0K5iy5|^+Efr>AidC+(Vao=++1Av!?fs2uDN_5kvM_ue;CMwWgBRG-RLk zy8Emh5hX+n*=N1(K5Iur2@yl~S+Bd#+7VGg#E^Z~>+Z95M3fLQWS{lA`>Y)iB}5F_ zXT9z|Yez&05kvM_ue;CM5iy5|jY0dY>AidC+(Vao=r#uJv!?fs2uDN_5kvM_Z@ACe zwWgBRG-RLkhWo4?5hX+n*=N1sK5Iur2@yl~S#P+{+7VGg#E^Z~8}74qM3fLQWS{kh z`>Y)iB}5F_XT9M*Yez&05kvM_Z=Awsy(!A=WjhH69qadzevkC!UC&3^fzph!D_Wf_ z%dY!-UE9?+x;jvbQT8Hxm%Y!H!slPJf61N{avo)`v%l-kb){TZnrr(1AiFh}n|xa# z->%ZX&0gtwu?5GfjS{Yf685qWC(mDpb{r)w7%Amlm2)`jOF8u@JfUxU*=}gtzhz&a zl*s*U<=@pk+HFsu%l2HFYkCGvzE*iZi<0kDp5^(eM6tilQtqmZk4Z{Or^Xj5VJUl= zz0ohmWFmUU(sxzvuWHeYlk&fw(LJV!bl%SPv*)um7`?51Z#72yYCpY*{kE1aXaAn1 z^k+m5l=hukvZvHgXcR_eB-$@&4AxW%Iru)%^ZiibN2S^dwd||?hw9_?kb=>8oZVE) zQRp@NR2~uNJke~vq8e6p4UbTMn)_v_r_33Lst;a$sd5izoHkZjOZw&|QQE!iQT8<4 zOIv+g`Hoe_YxVdjY4t7Lw>ph~>3!ti*tVNW0gdR@_iEqHjGx-lMsBCyRcUXjq$8F4 zCQ!b<{vS_h+nqX9^%YUOubT5wHQvs#a#wAGQhO&{uo-xTQHbb$ud%Kg=M<;V@M`#s zvQ3T4ZE?wHZvLQM@a&PET+v@pMoU0qM9mH5ITYvZWz0%eB4zWv@tHo<9orsh@|&o0 ztR66rRnw+RAHj1vAKVaL-|NnfdjGk5JkPbG6U3z)HP7?(`gse#Ti?nOtB#VnJSyh_ zucw!CM&rEPB4XnFOW&&HoaMi+r;JjZiBh_ZM1y=bp_RD=ei69dWHoZPiiU8-8*$i9 z>8VjK&N-1Ei#0?ETg{nyzlS1S^YQu|GIm)U3ZL-(Qe`ZsL{7dx-P|T&GeftQmmbxF2z2@N^#?YrPx{`d(O5}WR=ZI(WTh2 zG|c&R^t=><#znoisK4gxGqbA1by?J87EJ=KxCA^W30Ox-mw+h-yRYNg>FE;i;w9iz za^+&zans%skTb@{LNS(eFqdE&)ebYRSYRUbg3Vv)8&UGxfgV-KwUg zOTn{A!7dbaIe4*h(5&GXZO`m42bbk{oCg0R)&rM>z80Lfx}p`a`o(t?eJHsk9E*3V zXK(lF)p@b&yn6MtbV+zNN!W#=E(tGI60*wTT}c+3_0nklb2(FDF%G|{1Ua_yBgg00yu zj%Y)THdW*yHj-=dJ+njXSb1XvJf6aOy}inh$gVV-1F$=NnkmwfK8C#|wkK~QDzaaXzxWhM2Q?41D~H`PcFlf+tr)k009 zP&WO=YHzi_*7bqfPrg0fTMGYqZu|S#`vwit*tTu$P@z6{>Dm6aUzB<*>N0a*(^_V% z*?U`koQA7wy4uz(Nv~OpY^c1lm;9>wW4)q2T4ac^c36un*S2bgqqkKT)VitPcp-kB z^&u9M$OP_OV z7frT|B|o<$K@TNm-%0u)p?Bt_?pHw;yCybFO>E+tnDAg)>BP|zR#}`};Q?zHjuuBtBU<{Lle=hWS#`8HS{l*P=M3FN zL(7_@#nIA;mOkg}E*e_a9W9QQMzr)fdw0>$g3s42$ab2q+r+ADVnk=SH$U2tp@t*I z{rOFHv1NI&9Wgm#+^6q8eLD@?BHYTBWa0oFF*#z~ukVOyLyUX&^LW!`#jv_o=2}^^ zR_5OQ6fy4KcNxQ!(Pk)LS_w_qs+7RR3{uD9p z?{~zsA;vxaDPr8`?}%we3}*(oKPLKPpoUwQ-MZXlT^_s7Uom$%V%+ENGNw(&xX(XD zjQji@F>Q!(pMQ!N_xU?w+7RPD{}eIq^LNCwA;x|FDPr8`?}%wbjQjjk#JJDj5z~ek z_xY!Yai6~-x1S>829<7h;g64Bc=^8?(-x1S>829<7 zh;g64Bc=^8?({0eq`T68~V&AL$H>J{!WdHrU%D=8BoUJ}pxl7rGemw|vTZy)> zl(M!$TXs*Z#&1;WVfIRQW4-h3Jyxqm*^0^@Dbl#P^1rogd!WJUQvj zlat=_XpK_U6fHZ=PiR(r>Pw_U6gy zj3;TwjnDLQ|kFt-U3|lo%w6;Yn=P#>Y(y^95%--p{H@hB*M#o|Oy{W&?LW!g~ z3h~eHvwf|TpQ(lK^tm0yyX4-I z+VVkP`ct+drkY3mguWCp9`YB>OTHg`Vh8U05PZ8tM`i_`3 z#CWuRiWra9cf_iP(+Q#1UPEiQR7ho zMbvm)fTN}zH69sIM2*J=IBMEZ6VKan8(Q40>9GNhns(H9Y(Oe&ICIZq13WgMIW}Od zz3?7UlSkA*4@ZrorVTY78&H%r9vk2qTDykkCj=Bxp>ncGP%mKoK<_8{nvEM~%k@6j9@`0gjq>)Oc(_5j7qg;HYUsO?>Z>#|Btz z0Q7Kwjr(hw{WbAjUyhoH8jlTdS<^0SJT@TJ&?X)m5Z+MqR&R(J&EDMPZ*$@uP(?2B z*noL@xQ6B$TAPOEu>nQYcx-@cXzd!B#|9Kpp>ncGP%mKoK<_8{nvEM~%k@6j9@`0gjqB)J)|I_r09?UQX!YmS?v-H(Q>k{!W2{ z{|b6IY8*9fsPWinQY zcx-^9rX4jN8&E`z#|AiR+EL@N0Y%h!Y=EPt9W@>sP(+Q#1~_WkQRA@zMbvbU4cOEh zZ};@3+nmeB*^<1Z_p)2s(RHk6Z}l$R1Em^eS7gy<-|U*6e@L#s(X#`k8)ci>)9iM3 zQzh`pJi7=sY+hTHuUR3sNKqp-rMUluiRE>)2{N`uCq5P z`%wGO_LBPNTYjwejj|QhF;Y7AXIZ~qhjt$&?HwtF)o`pU%a_;C>;W-fd!G{Qyz)LJ zm=C5;31*M(Q-ay%`jlXHiasUSnypU>roZ+n!DM-#5^P@UQ^K-%9vt>7qd#BrhVB0C z=+B$+ZM6O9(Vst;v;G{)Ox&NQ{ke2mb9Uc$#5bJu85iDf(7zr1`4lPHza9N&(Xk|6 z|914})A2d*={W1pr{i!iMzo1O!oPG@{dTW);e zAKLgJyQA+9+2ial(uZ&9*F&vX|EPO6b?;AM<@Z#-?Dt;+jW0vL=-Vx=f{(PSeG}Hj ztc4%w*M3;Jf7Gw1db$%<$=6SK=dt?zAgsROsB2mUuZDcLPE!0Ry2La*XZ_6DJaXT? z?2$?yWlvPkaaiNCmVcOir(C4ryE#j;o9m^kG$CY^waTO+HSiZ*Y+6Nt%xSr z*COAQ^V^-N($$YbPCKk>>F09OZl#tok#@|FZM4INSw9MkNjt1eb(HB&J8Uycl0W0tjC)4aHtv@?r-H6kPJ>iH>h&}!hjVPO`;MxKat$p1&$ zwas@}b=SprUF7U#loh7GnokSAkZZ8`&Z;z1*I;w>j2d*O?#NmTvr_%cAJIEJ4YqAY z=Tuf^rFPo=C9F8pky~^Pwr%CKF4_?R(60S27VWxdu+6l?qlbmCi}t@*v}@L2R|5{g zShb$U6X-S$^Ig3zoX>ZcQIn|adz%K^yb_0Y>(czu<;;3lT~DOlBJo|D1`FRcVUd_4 zcy1$DlqR+OdAVta20NZbb3Ks-@}aa?Ea15|dtdkN={HZe^cM@R*=)&c8gR7orqN)h zx7K#Io}E5r%+cNAYdP92d6WnhK-Ol67keoiJLjEOeIovf`ta5)(ydB!zO83R(y9;j z|GBK+MB-WGTfH5)Xi-jM*{W)Y5p#_3YxxB54qQ>4W`U+})ppu>-$%Dpr?DKTHFPh% zYYq9h9i3RniQvP^{z7-~JsgVS&?Amp)tDAz6Pr>Nb8`Z~l4j@T(qo=3shQ*-!PIlH8`aif1{nYnDWPHw(2A{E~E*^7C_XRmbDsk%PhOk^dT zE>D-#pLh1M)eN&um)WaZYSu=pvz6td&t5-ef0ieMNI%|Yc z#e4a0ce4BPmv3se`k1{7)`=y_s(1Pj|Gt|2RpmXB)H+bw_jye&vhhInfjDob| z|8w18*0Xr{L&<)k=85(L26>1KBu1Yo{ajk&PtIpha>d;Ml+GBkd*F?}7Jaa* zzFGV{<>99x&nWvznx*-uW7K|KsecWn#@V|HouBd^?I1egp~mCBy!Ln07xd-3L5C(z zg!@0}4kSlw@zG)Si_*Oi@4$_F Uy2>?T?(U1*xo