From 9d8ef6fb9c8c0cc9e5f114aee217e2d277469ccc Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Thu, 25 Sep 2025 16:08:17 +0200 Subject: [PATCH 01/17] Introduction of resource request ID --- crates/hyperqueue/src/server/event/payload.rs | 2 +- crates/tako/src/control.rs | 11 +++- .../tako/src/internal/common/resources/map.rs | 55 +++++++++++++------ .../tako/src/internal/common/resources/mod.rs | 5 +- .../src/internal/common/resources/request.rs | 6 +- crates/tako/src/internal/messages/worker.rs | 9 ++- crates/tako/src/internal/server/core.rs | 25 ++++++--- crates/tako/src/internal/server/explain.rs | 10 ++-- crates/tako/src/internal/server/reactor.rs | 15 +++++ crates/tako/src/internal/server/rpc.rs | 1 + crates/tako/src/internal/server/worker.rs | 4 +- crates/tako/src/internal/server/workerload.rs | 4 +- .../tako/src/internal/tests/test_reactor.rs | 6 +- .../src/internal/tests/test_scheduler_mn.rs | 4 +- crates/tako/src/internal/tests/test_worker.rs | 5 +- .../tako/src/internal/tests/utils/schedule.rs | 6 +- .../tako/src/internal/tests/utils/shared.rs | 5 +- .../internal/worker/resources/allocator.rs | 4 +- .../tako/src/internal/worker/resources/map.rs | 4 +- crates/tako/src/internal/worker/rpc.rs | 11 +++- crates/tako/src/internal/worker/state.rs | 13 +++-- crates/tako/src/launcher.rs | 4 +- crates/tako/src/lib.rs | 2 +- 23 files changed, 139 insertions(+), 72 deletions(-) diff --git a/crates/hyperqueue/src/server/event/payload.rs b/crates/hyperqueue/src/server/event/payload.rs index 811d11e68..48245883f 100644 --- a/crates/hyperqueue/src/server/event/payload.rs +++ b/crates/hyperqueue/src/server/event/payload.rs @@ -16,7 +16,7 @@ use tako::{JobId, WorkerId}; */ #[derive(Serialize, Deserialize, Debug, Clone)] pub enum EventPayload { - /// New worker has connected to the server + /// A new worker has connected to the server WorkerConnected(WorkerId, Box), /// Worker has disconnected from the server WorkerLost(WorkerId, LostWorkerReason), diff --git 
a/crates/tako/src/control.rs b/crates/tako/src/control.rs index bcc73ae51..7404ad709 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -12,6 +12,7 @@ use crate::gateway::{ LostWorkerReason, MultiNodeAllocationResponse, TaskSubmit, WorkerRuntimeInfo, }; use crate::internal::common::error::DsError; +use crate::internal::common::resources::{ResourceId, ResourceRqId}; use crate::internal::messages::worker::ToWorkerMessage; use crate::internal::scheduler::query::compute_new_worker_query; use crate::internal::scheduler::state::{run_scheduling_now, scheduler_loop}; @@ -21,9 +22,9 @@ use crate::internal::server::core::{CoreRef, CustomConnectionHandler}; use crate::internal::server::explain::{ TaskExplanation, task_explain_for_worker, task_explain_init, }; -use crate::internal::server::reactor::on_cancel_tasks; +use crate::internal::server::reactor::{get_or_create_resource_rq_id, on_cancel_tasks}; use crate::internal::server::worker::DEFAULT_WORKER_OVERVIEW_INTERVAL; -use crate::resources::ResourceDescriptor; +use crate::resources::{ResourceDescriptor, ResourceRequest, ResourceRequestVariants}; use crate::{TaskId, WorkerId}; #[derive(Debug)] @@ -202,6 +203,12 @@ impl ServerRef { let core = self.core_ref.get(); core.dump(now) } + + pub fn get_or_create_resource_rq_id(&self, rqv: &ResourceRequestVariants) -> ResourceRqId { + let mut core = self.core_ref.get_mut(); + let mut comm = self.comm_ref.get_mut(); + get_or_create_resource_rq_id(&mut core, &mut *comm, rqv) + } } #[allow(clippy::too_many_arguments)] diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index b8d4a3836..6cf471c56 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -1,5 +1,6 @@ use crate::internal::common::Map; -use crate::internal::common::resources::ResourceId; +use crate::internal::common::resources::{ResourceId, ResourceRqId}; +use 
crate::resources::{ResourceRequest, ResourceRequestVariants}; pub const CPU_RESOURCE_ID: ResourceId = ResourceId(0); @@ -8,22 +9,30 @@ pub const NVIDIA_GPU_RESOURCE_NAME: &str = "gpus/nvidia"; pub const AMD_GPU_RESOURCE_NAME: &str = "gpus/amd"; pub const MEM_RESOURCE_NAME: &str = "mem"; +pub(crate) type ResourceRqMap = Map; + #[derive(Debug)] -pub(crate) struct ResourceIdAllocator { +pub(crate) struct GlobalResourceMapping { + resource_rq_from_id: ResourceRqMap, + resource_rq_to_id: Map, resource_names: Map, } -impl Default for ResourceIdAllocator { +impl Default for GlobalResourceMapping { fn default() -> Self { let mut resource_names = Map::new(); /* Fix id for cpus */ resource_names.insert(CPU_RESOURCE_NAME.to_string(), CPU_RESOURCE_ID); - ResourceIdAllocator { resource_names } + GlobalResourceMapping { + resource_rq_from_id: Default::default(), + resource_names, + resource_rq_to_id: Map::new(), + } } } -impl ResourceIdAllocator { - pub fn get_or_allocate_id(&mut self, name: &str) -> ResourceId { +impl GlobalResourceMapping { + pub fn get_or_allocate_resource_id(&mut self, name: &str) -> ResourceId { match self.resource_names.get(name) { Some(&id) => id, None => { @@ -37,25 +46,40 @@ impl ResourceIdAllocator { /// Create an immutable snapshot of resource name map. 
#[inline] - pub fn create_map(&self) -> ResourceMap { + pub fn create_resource_id_map(&self) -> ResourceIdMap { let mut resource_names: Vec<_> = self.resource_names.keys().cloned().collect(); resource_names.sort_unstable_by_key(|name| *self.resource_names.get(name).unwrap()); - ResourceMap { resource_names } + ResourceIdMap { resource_names } } - #[inline] - pub fn resource_count(&self) -> usize { - self.resource_names.len() + pub fn get_resource_rq_map(&self) -> &ResourceRqMap { + &self.resource_rq_from_id + } + + pub fn get_or_allocate_resource_rq_id( + &mut self, + rqv: &ResourceRequestVariants, + ) -> (ResourceRqId, bool) { + match self.resource_rq_to_id.get(rqv) { + Some(&id) => (id, false), + None => { + let id = ResourceRqId::new(self.resource_rq_to_id.len() as u32); + log::debug!("New resource request registered {rqv:?} as {id}"); + self.resource_rq_to_id.insert(rqv.clone(), id); + self.resource_rq_from_id.insert(id, rqv.clone()); + (id, true) + } + } } } #[derive(Default, Debug)] -pub struct ResourceMap { +pub struct ResourceIdMap { resource_names: Vec, } -impl ResourceMap { +impl ResourceIdMap { #[inline] pub fn from_vec(resource_names: Vec) -> Self { Self { resource_names } @@ -78,11 +102,6 @@ impl ResourceMap { self.resource_names.len() } - #[inline] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - #[inline] pub fn get_index(&self, name: &str) -> Option { self.resource_names diff --git a/crates/tako/src/internal/common/resources/mod.rs b/crates/tako/src/internal/common/resources/mod.rs index b24fcc87e..df09df096 100644 --- a/crates/tako/src/internal/common/resources/mod.rs +++ b/crates/tako/src/internal/common/resources/mod.rs @@ -24,9 +24,12 @@ pub use amount::{ResourceAmount, ResourceFractions, ResourceUnits}; pub type NumOfNodes = u32; -// Identifies a globally unique Resource request stored in Core. +// Identifies a globally unique Resource ID stored in Core. 
define_id_type!(ResourceId, u32); +// Identifies a globally unique Resource request stored in Core. +define_id_type!(ResourceRqId, u32); + // Represents an index within a single generic resource (e.g. GPU with ID 1). define_id_type!(ResourceIndex, u32); diff --git a/crates/tako/src/internal/common/resources/request.rs b/crates/tako/src/internal/common/resources/request.rs index b354ad012..734b1840e 100644 --- a/crates/tako/src/internal/common/resources/request.rs +++ b/crates/tako/src/internal/common/resources/request.rs @@ -6,7 +6,7 @@ use crate::internal::common::resources::{NumOfNodes, ResourceAmount, ResourceId} use crate::internal::server::workerload::WorkerResources; use crate::internal::worker::resources::allocator::ResourceAllocator; -use crate::resources::ResourceMap; +use crate::resources::ResourceIdMap; use smallvec::SmallVec; use std::time::Duration; @@ -165,7 +165,7 @@ impl ResourceRequest { Ok(()) } - pub fn to_gateway(&self, resource_map: &ResourceMap) -> crate::gateway::ResourceRequest { + pub fn to_gateway(&self, resource_map: &ResourceIdMap) -> crate::gateway::ResourceRequest { crate::gateway::ResourceRequest { n_nodes: self.n_nodes, resources: self @@ -289,7 +289,7 @@ impl ResourceRequestVariants { pub fn to_gateway( &self, - resource_map: &ResourceMap, + resource_map: &ResourceIdMap, ) -> crate::gateway::ResourceRequestVariants { crate::gateway::ResourceRequestVariants { variants: self diff --git a/crates/tako/src/internal/messages/worker.rs b/crates/tako/src/internal/messages/worker.rs index f2c2ee57f..06578cb14 100644 --- a/crates/tako/src/internal/messages/worker.rs +++ b/crates/tako/src/internal/messages/worker.rs @@ -1,9 +1,10 @@ use crate::datasrv::{DataObjectId, OutputId}; use crate::gateway::{EntryType, TaskDataFlags}; use crate::hwstats::WorkerHwStateMessage; -use crate::internal::common::resources::{ResourceAmount, ResourceIndex}; +use crate::internal::common::resources::map::ResourceRqMap; +use 
crate::internal::common::resources::{ResourceAmount, ResourceIndex, ResourceRqId}; use crate::internal::messages::common::TaskFailInfo; -use crate::resources::ResourceFractions; +use crate::resources::{ResourceFractions, ResourceRequest, ResourceRequestVariants}; use crate::task::SerializedTaskContext; use crate::{InstanceId, Priority, ResourceVariantId}; use crate::{TaskId, WorkerId}; @@ -21,6 +22,7 @@ pub struct WorkerRegistrationResponse { pub server_uid: String, /// Override worker overview interval, if the worker does not have it configured pub worker_overview_interval_override: Option, + pub resource_rq_map: ResourceRqMap, } #[derive(Serialize, Deserialize, Debug)] @@ -38,7 +40,7 @@ pub struct ComputeTaskSeparateData { #[derive(Serialize, Deserialize, Debug, Default, Clone)] pub struct ComputeTaskSharedData { pub user_priority: Priority, - pub resources: crate::internal::common::resources::ResourceRequestVariants, + pub resources: ResourceRequestVariants, pub time_limit: Option, pub data_flags: TaskDataFlags, pub body: Rc<[u8]>, @@ -86,6 +88,7 @@ pub enum ToWorkerMessage { SetOverviewIntervalOverride(Option), RemoveDataObjects(SmallVec<[DataObjectId; 1]>), PlacementResponse(DataObjectId, Option), + NewResourceRequest(ResourceRqId, ResourceRequestVariants), Stop, } diff --git a/crates/tako/src/internal/server/core.rs b/crates/tako/src/internal/server/core.rs index 2cb781e64..a5ea4498e 100644 --- a/crates/tako/src/internal/server/core.rs +++ b/crates/tako/src/internal/server/core.rs @@ -1,8 +1,10 @@ use std::sync::Arc; use std::time::{Duration, Instant}; -use crate::internal::common::resources::map::{ResourceIdAllocator, ResourceMap}; -use crate::internal::common::resources::{ResourceId, ResourceRequestVariants}; +use crate::internal::common::resources::map::{ + GlobalResourceMapping, ResourceIdMap, ResourceRqMap, +}; +use crate::internal::common::resources::{ResourceId, ResourceRequestVariants, ResourceRqId}; use crate::internal::common::{Set, 
WrappedRcRefCell}; use crate::internal::scheduler::multinode::MultiNodeQueue; use crate::internal::server::dataobj::{DataObjectHandle, ObjsToRemoveFromWorkers}; @@ -37,7 +39,7 @@ pub struct Core { maximal_task_id: TaskId, worker_id_counter: u32, - resource_map: ResourceIdAllocator, + resource_map: GlobalResourceMapping, worker_listen_port: u16, idle_timeout: Option, @@ -511,17 +513,24 @@ impl Core { #[inline] pub fn get_or_create_resource_id(&mut self, name: &str) -> ResourceId { - self.resource_map.get_or_allocate_id(name) + self.resource_map.get_or_allocate_resource_id(name) } #[inline] - pub fn create_resource_map(&self) -> ResourceMap { - self.resource_map.create_map() + pub fn get_or_create_resource_rq_id( + &mut self, + rqv: &ResourceRequestVariants, + ) -> (ResourceRqId, bool) { + self.resource_map.get_or_allocate_resource_rq_id(rqv) } #[inline] - pub fn resource_count(&self) -> usize { - self.resource_map.resource_count() + pub fn create_resource_map(&self) -> ResourceIdMap { + self.resource_map.create_resource_id_map() + } + + pub fn get_resource_rq_map(&self) -> &ResourceRqMap { + self.resource_map.get_resource_rq_map() } pub fn secret_key(&self) -> Option<&Arc> { diff --git a/crates/tako/src/internal/server/explain.rs b/crates/tako/src/internal/server/explain.rs index 5a0666d85..3ae4ae3fc 100644 --- a/crates/tako/src/internal/server/explain.rs +++ b/crates/tako/src/internal/server/explain.rs @@ -2,7 +2,7 @@ use crate::WorkerId; use crate::internal::server::task::{Task, TaskRuntimeState}; use crate::internal::server::worker::Worker; use crate::internal::server::workergroup::WorkerGroup; -use crate::resources::{NumOfNodes, ResourceAmount, ResourceMap}; +use crate::resources::{NumOfNodes, ResourceAmount, ResourceIdMap}; use serde::{Deserialize, Serialize}; use std::time::Duration; @@ -95,7 +95,7 @@ pub fn task_explain_init(task: &Task) -> TaskExplanation { } pub fn task_explain_for_worker( - resource_map: &ResourceMap, + resource_map: &ResourceIdMap, task: 
&Task, worker: &Worker, worker_group: &WorkerGroup, @@ -149,14 +149,14 @@ mod tests { use crate::internal::tests::utils::schedule::create_test_worker_config; use crate::internal::tests::utils::task::TaskBuilder; use crate::resources::{ - ResourceAmount, ResourceDescriptor, ResourceDescriptorItem, ResourceMap, + ResourceAmount, ResourceDescriptor, ResourceDescriptorItem, ResourceIdMap, }; use crate::{Set, WorkerId}; use std::time::{Duration, Instant}; #[test] fn explain_single_node() { - let resource_map = ResourceMap::from_vec(vec!["cpus".to_string(), "gpus".to_string()]); + let resource_map = ResourceIdMap::from_vec(vec!["cpus".to_string(), "gpus".to_string()]); let now = Instant::now(); let wcfg = create_test_worker_config(1.into(), ResourceDescriptor::simple_cpus(4)); @@ -273,7 +273,7 @@ mod tests { #[test] fn explain_multi_node() { - let resource_map = ResourceMap::from_vec(vec!["cpus".to_string(), "gpus".to_string()]); + let resource_map = ResourceIdMap::from_vec(vec!["cpus".to_string(), "gpus".to_string()]); let now = Instant::now(); let wcfg = create_test_worker_config(1.into(), ResourceDescriptor::simple_cpus(4)); diff --git a/crates/tako/src/internal/server/reactor.rs b/crates/tako/src/internal/server/reactor.rs index 601a65d40..89cdf6dd0 100644 --- a/crates/tako/src/internal/server/reactor.rs +++ b/crates/tako/src/internal/server/reactor.rs @@ -1,5 +1,6 @@ use crate::datasrv::{DataObjectId, OutputId}; use crate::gateway::{CrashLimit, LostWorkerReason}; +use crate::internal::common::resources::ResourceRqId; use crate::internal::common::{Map, Set}; use crate::internal::messages::common::TaskFailInfo; use crate::internal::messages::worker::{ @@ -13,6 +14,7 @@ use crate::internal::server::task::{ComputeTasksBuilder, WaitingInfo}; use crate::internal::server::task::{Task, TaskRuntimeState}; use crate::internal::server::worker::Worker; use crate::internal::server::workermap::WorkerMap; +use crate::resources::ResourceRequestVariants; use crate::{TaskId, 
WorkerId}; use std::fmt::Write; @@ -595,3 +597,16 @@ pub(crate) fn on_resolve_placement( &ToWorkerMessage::PlacementResponse(data_id, placement), ); } + +pub(crate) fn get_or_create_resource_rq_id( + core: &mut Core, + comm: &mut impl Comm, + rqv: &ResourceRequestVariants, +) -> ResourceRqId { + let (rq_id, is_new) = core.get_or_create_resource_rq_id(rqv); + if is_new { + let msg = ToWorkerMessage::NewResourceRequest(rq_id, rqv.clone()); + comm.broadcast_worker_message(&msg); + } + rq_id +} diff --git a/crates/tako/src/internal/server/rpc.rs b/crates/tako/src/internal/server/rpc.rs index d15cd1c71..c6d54af76 100644 --- a/crates/tako/src/internal/server/rpc.rs +++ b/crates/tako/src/internal/server/rpc.rs @@ -157,6 +157,7 @@ async fn worker_rpc_loop( WorkerRegistrationResponse { worker_id, resource_names: core.create_resource_map().into_vec(), + resource_rq_map: core.get_resource_rq_map().clone(), other_workers: core .get_workers() .filter_map(|w| { diff --git a/crates/tako/src/internal/server/worker.rs b/crates/tako/src/internal/server/worker.rs index 67f466a7a..0a98da703 100644 --- a/crates/tako/src/internal/server/worker.rs +++ b/crates/tako/src/internal/server/worker.rs @@ -3,7 +3,7 @@ use std::fmt; use crate::gateway::{LostWorkerReason, WorkerRuntimeInfo}; use crate::internal::common::Set; use crate::internal::common::resources::TimeRequest; -use crate::internal::common::resources::map::ResourceMap; +use crate::internal::common::resources::map::ResourceIdMap; use crate::internal::common::resources::{ResourceRequest, ResourceRequestVariants}; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use crate::internal::server::comm::Comm; @@ -319,7 +319,7 @@ impl Worker { pub fn new( id: WorkerId, configuration: WorkerConfiguration, - resource_map: &ResourceMap, + resource_map: &ResourceIdMap, now: Instant, ) -> Self { let resources = WorkerResources::from_description(&configuration.resources, resource_map); diff --git 
a/crates/tako/src/internal/server/workerload.rs b/crates/tako/src/internal/server/workerload.rs index 8780ecd38..6b69c035b 100644 --- a/crates/tako/src/internal/server/workerload.rs +++ b/crates/tako/src/internal/server/workerload.rs @@ -1,5 +1,5 @@ use crate::internal::common::index::IndexVec; -use crate::internal::common::resources::map::ResourceMap; +use crate::internal::common::resources::map::ResourceIdMap; use crate::internal::common::resources::request::ResourceAllocRequest; use crate::internal::common::resources::{ ResourceAmount, ResourceDescriptor, ResourceId, ResourceRequest, ResourceRequestVariants, @@ -37,7 +37,7 @@ impl WorkerResources { pub(crate) fn from_description( resource_desc: &ResourceDescriptor, - resource_map: &ResourceMap, + resource_map: &ResourceIdMap, ) -> Self { // We only take maximum needed resource id // We are doing it for normalization purposes. It is useful later diff --git a/crates/tako/src/internal/tests/test_reactor.rs b/crates/tako/src/internal/tests/test_reactor.rs index 2181ec1fb..b9cc1574f 100644 --- a/crates/tako/src/internal/tests/test_reactor.rs +++ b/crates/tako/src/internal/tests/test_reactor.rs @@ -30,7 +30,7 @@ use crate::internal::worker::configuration::{ DEFAULT_MAX_DOWNLOAD_TRIES, DEFAULT_MAX_PARALLEL_DOWNLOADS, DEFAULT_WAIT_BETWEEN_DOWNLOAD_TRIES, OverviewConfiguration, }; -use crate::resources::{ResourceAmount, ResourceDescriptorItem, ResourceMap}; +use crate::resources::{ResourceAmount, ResourceDescriptorItem, ResourceIdMap}; use crate::worker::{ServerLostPolicy, WorkerConfiguration}; use crate::{TaskId, WorkerId}; @@ -65,7 +65,7 @@ fn test_worker_add() { let worker = Worker::new( 402.into(), wcfg, - &ResourceMap::from_vec(vec!["cpus".to_string()]), + &ResourceIdMap::from_vec(vec!["cpus".to_string()]), Instant::now(), ); on_new_worker(&mut core, &mut comm, worker); @@ -124,7 +124,7 @@ fn test_worker_add() { let worker = Worker::new( 502.into(), wcfg2, - &ResourceMap::from_vec(vec![ + 
&ResourceIdMap::from_vec(vec![ "cpus".to_string(), "gpus".to_string(), "mem".to_string(), diff --git a/crates/tako/src/internal/tests/test_scheduler_mn.rs b/crates/tako/src/internal/tests/test_scheduler_mn.rs index a1748619c..9bbd5d580 100644 --- a/crates/tako/src/internal/tests/test_scheduler_mn.rs +++ b/crates/tako/src/internal/tests/test_scheduler_mn.rs @@ -9,7 +9,7 @@ use crate::internal::tests::utils::schedule::{ }; use crate::internal::tests::utils::task::TaskBuilder; -use crate::resources::{ResourceDescriptor, ResourceMap}; +use crate::resources::{ResourceDescriptor, ResourceIdMap}; use crate::{Priority, TaskId, WorkerId}; use std::time::Duration; @@ -290,7 +290,7 @@ fn test_mn_sleep_wakeup_at_once() { fn test_mn_schedule_on_groups() { let mut core = Core::default(); - let resource_map = ResourceMap::from_vec(vec!["cpus".to_string()]); + let resource_map = ResourceIdMap::from_vec(vec!["cpus".to_string()]); let worker_id = WorkerId::new(100); let mut wcfg1 = create_test_worker_config(worker_id, ResourceDescriptor::simple_cpus(1)); wcfg1.group = "group1".to_string(); diff --git a/crates/tako/src/internal/tests/test_worker.rs b/crates/tako/src/internal/tests/test_worker.rs index cc8d606f8..b5286b248 100644 --- a/crates/tako/src/internal/tests/test_worker.rs +++ b/crates/tako/src/internal/tests/test_worker.rs @@ -14,7 +14,7 @@ use crate::internal::worker::configuration::{ use crate::internal::worker::rpc::process_worker_message; use crate::internal::worker::state::WorkerStateRef; use crate::launcher::{StopReason, TaskBuildContext, TaskLaunchData, TaskLauncher}; -use crate::resources::{ResourceDescriptor, ResourceMap}; +use crate::resources::{ResourceDescriptor, ResourceIdMap}; use crate::worker::{ServerLostPolicy, WorkerConfiguration}; use crate::{Set, TaskId, WorkerId}; use smallvec::smallvec; @@ -59,7 +59,7 @@ fn create_test_worker_config() -> WorkerConfiguration { } fn create_test_worker_state(config: WorkerConfiguration) -> WorkerStateRef { - let 
resource_map = ResourceMap::from_vec( + let resource_map = ResourceIdMap::from_vec( config .resources .resources @@ -73,6 +73,7 @@ fn create_test_worker_state(config: WorkerConfiguration) -> WorkerStateRef { config, None, resource_map, + Default::default(), Box::new(TestLauncher), "testuid".to_string(), ) diff --git a/crates/tako/src/internal/tests/utils/schedule.rs b/crates/tako/src/internal/tests/utils/schedule.rs index baa908317..69e2e84be 100644 --- a/crates/tako/src/internal/tests/utils/schedule.rs +++ b/crates/tako/src/internal/tests/utils/schedule.rs @@ -13,7 +13,7 @@ use crate::internal::worker::configuration::{ DEFAULT_MAX_DOWNLOAD_TRIES, DEFAULT_MAX_PARALLEL_DOWNLOADS, DEFAULT_WAIT_BETWEEN_DOWNLOAD_TRIES, OverviewConfiguration, }; -use crate::resources::ResourceMap; +use crate::resources::ResourceIdMap; use crate::worker::{ServerLostPolicy, WorkerConfiguration}; use crate::{TaskId, WorkerId}; use std::time::{Duration, Instant}; @@ -47,7 +47,7 @@ pub fn new_test_worker( core: &mut Core, worker_id: WorkerId, configuration: WorkerConfiguration, - resource_map: &ResourceMap, + resource_map: &ResourceIdMap, ) { let worker = Worker::new(worker_id, configuration, resource_map, Instant::now()); on_new_worker(core, &mut TestComm::default(), worker); @@ -59,7 +59,7 @@ pub fn create_test_worker(core: &mut Core, worker_id: WorkerId, cpus: u32) { core, worker_id, wcfg, - &ResourceMap::from_vec(vec!["cpus".to_string()]), + &ResourceIdMap::from_vec(vec!["cpus".to_string()]), ); } diff --git a/crates/tako/src/internal/tests/utils/shared.rs b/crates/tako/src/internal/tests/utils/shared.rs index 7d8bf7f2b..f2c22b004 100644 --- a/crates/tako/src/internal/tests/utils/shared.rs +++ b/crates/tako/src/internal/tests/utils/shared.rs @@ -1,7 +1,8 @@ use crate::internal::worker::resources::allocator::ResourceAllocator; use crate::internal::worker::resources::map::ResourceLabelMap; use crate::resources::{ - ResourceAmount, ResourceDescriptor, ResourceDescriptorItem, 
ResourceDescriptorKind, ResourceMap, + ResourceAmount, ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind, + ResourceIdMap, }; pub fn res_kind_range(start: u32, end: u32) -> ResourceDescriptorKind { @@ -45,7 +46,7 @@ pub fn res_allocator_from_descriptor(descriptor: ResourceDescriptor) -> Resource names.push(item.name.clone()); } - let resource_map = ResourceMap::from_vec(names); + let resource_map = ResourceIdMap::from_vec(names); let label_resource_map = ResourceLabelMap::new(&descriptor, &resource_map); let allocator = ResourceAllocator::new(&descriptor, &resource_map, &label_resource_map); allocator.validate(); diff --git a/crates/tako/src/internal/worker/resources/allocator.rs b/crates/tako/src/internal/worker/resources/allocator.rs index aa1a103e4..19ad7f82d 100644 --- a/crates/tako/src/internal/worker/resources/allocator.rs +++ b/crates/tako/src/internal/worker/resources/allocator.rs @@ -8,7 +8,7 @@ use crate::internal::worker::resources::concise::ConciseFreeResources; use crate::internal::worker::resources::groups::{CouplingWeightItem, group_solver}; use crate::internal::worker::resources::map::ResourceLabelMap; use crate::internal::worker::resources::pool::{FAST_MAX_COUPLED_RESOURCES, ResourcePool}; -use crate::resources::{Allocation, ResourceAmount, ResourceDescriptor, ResourceMap}; +use crate::resources::{Allocation, ResourceAmount, ResourceDescriptor, ResourceIdMap}; use smallvec::SmallVec; use std::cell::RefCell; use std::rc::Rc; @@ -47,7 +47,7 @@ struct BlockedRequest { impl ResourceAllocator { pub fn new( desc: &ResourceDescriptor, - resource_map: &ResourceMap, + resource_map: &ResourceIdMap, label_map: &ResourceLabelMap, ) -> Self { let max_id = desc diff --git a/crates/tako/src/internal/worker/resources/map.rs b/crates/tako/src/internal/worker/resources/map.rs index fef1caa3f..45cdaea08 100644 --- a/crates/tako/src/internal/worker/resources/map.rs +++ b/crates/tako/src/internal/worker/resources/map.rs @@ -2,7 +2,7 @@ use crate::Map; 
use crate::internal::common::index::IndexVec; use crate::internal::common::resources::ResourceId; use crate::resources::{ - ResourceDescriptor, ResourceDescriptorKind, ResourceIndex, ResourceLabel, ResourceMap, + ResourceDescriptor, ResourceDescriptorKind, ResourceIdMap, ResourceIndex, ResourceLabel, }; use std::borrow::Cow; @@ -13,7 +13,7 @@ pub struct ResourceLabelMap { } impl ResourceLabelMap { - pub fn new(descriptor: &ResourceDescriptor, map: &ResourceMap) -> Self { + pub fn new(descriptor: &ResourceDescriptor, map: &ResourceIdMap) -> Self { let mut resources: IndexVec = vec![Default::default(); map.len()].into(); for resource in &descriptor.resources { diff --git a/crates/tako/src/internal/worker/rpc.rs b/crates/tako/src/internal/worker/rpc.rs index cab8fcbaf..0c3b3d208 100644 --- a/crates/tako/src/internal/worker/rpc.rs +++ b/crates/tako/src/internal/worker/rpc.rs @@ -17,7 +17,7 @@ use crate::comm::{ConnectionRegistration, RegisterWorker}; use crate::hwstats::{WorkerHwState, WorkerHwStateMessage}; use crate::internal::common::WrappedRcRefCell; use crate::internal::common::resources::Allocation; -use crate::internal::common::resources::map::ResourceMap; +use crate::internal::common::resources::map::ResourceIdMap; use crate::internal::datasrv::download::download_manager_process; use crate::internal::datasrv::{DownloadManagerRef, data_upload_service}; use crate::internal::messages::worker::{ @@ -135,6 +135,7 @@ pub async fn run_worker( worker_id, other_workers, resource_names, + resource_rq_map, server_idle_timeout, server_uid, worker_overview_interval_override, @@ -150,7 +151,8 @@ pub async fn run_worker( worker_id, configuration.clone(), secret_key.clone(), - ResourceMap::from_vec(resource_names), + ResourceIdMap::from_vec(resource_names), + resource_rq_map, launcher, server_uid, ); @@ -442,6 +444,9 @@ pub(crate) fn process_worker_message(state: &mut WorkerState, message: ToWorkerM ToWorkerMessage::SetOverviewIntervalOverride(r#override) => { 
state.worker_overview_interval_override = r#override; } + ToWorkerMessage::NewResourceRequest(rq_id, rqv) => { + todo!() + } } false } @@ -564,7 +569,7 @@ async fn send_overview_loop(state_ref: WorkerStateRef) -> crate::Result<()> { fn resource_allocation_to_msg( allocation: &Allocation, - resource_map: &ResourceMap, + resource_map: &ResourceIdMap, ) -> TaskResourceAllocation { TaskResourceAllocation { resources: allocation diff --git a/crates/tako/src/internal/worker/state.rs b/crates/tako/src/internal/worker/state.rs index 7d6570d65..7a8f2e0b5 100644 --- a/crates/tako/src/internal/worker/state.rs +++ b/crates/tako/src/internal/worker/state.rs @@ -1,6 +1,6 @@ use crate::datasrv::DataObjectId; -use crate::internal::common::resources::Allocation; -use crate::internal::common::resources::map::ResourceMap; +use crate::internal::common::resources::map::{ResourceIdMap, ResourceRqMap}; +use crate::internal::common::resources::{Allocation, ResourceId}; use crate::internal::common::stablemap::StableMap; use crate::internal::common::{Map, Set, WrappedRcRefCell}; use crate::internal::datasrv::{DataObjectRef, DataStorage}; @@ -61,7 +61,8 @@ pub struct WorkerState { tasks_waiting_for_data: Map>, placement_resolver: Map>>, - resource_map: ResourceMap, + resource_rq_map: ResourceRqMap, + resource_map: ResourceIdMap, resource_label_map: ResourceLabelMap, secret_key: Option>, @@ -315,7 +316,7 @@ impl WorkerState { self.remove_task(task_id, true, false); } - pub fn get_resource_map(&self) -> &ResourceMap { + pub fn get_resource_map(&self) -> &ResourceIdMap { &self.resource_map } @@ -440,7 +441,8 @@ impl WorkerStateRef { worker_id: WorkerId, configuration: WorkerConfiguration, secret_key: Option>, - resource_map: ResourceMap, + resource_map: ResourceIdMap, + resource_rq_map: ResourceRqMap, task_launcher: Box, server_uid: String, ) -> Self { @@ -465,6 +467,7 @@ impl WorkerStateRef { running_tasks: Default::default(), start_time: now, resource_map, + resource_rq_map, 
resource_label_map, worker_addresses: Default::default(), lc_state: RefCell::new(LocalCommState::new()), diff --git a/crates/tako/src/launcher.rs b/crates/tako/src/launcher.rs index 513f4f56c..9694e431b 100644 --- a/crates/tako/src/launcher.rs +++ b/crates/tako/src/launcher.rs @@ -11,7 +11,7 @@ use nix::libc; use tokio::process::Command; use crate::gateway::{EntryType, TaskDataFlags}; -use crate::internal::common::resources::map::ResourceMap; +use crate::internal::common::resources::map::ResourceIdMap; use crate::internal::worker::configuration::WorkerConfiguration; use crate::internal::worker::localcomm::Token; use crate::internal::worker::resources::map::ResourceLabelMap; @@ -126,7 +126,7 @@ impl<'a> TaskBuildContext<'a> { self.state.worker_hostname(worker_id) } - pub fn get_resource_map(&self) -> &ResourceMap { + pub fn get_resource_map(&self) -> &ResourceIdMap { self.state.get_resource_map() } diff --git a/crates/tako/src/lib.rs b/crates/tako/src/lib.rs index 21c19b2ab..337a7e2a9 100644 --- a/crates/tako/src/lib.rs +++ b/crates/tako/src/lib.rs @@ -43,7 +43,7 @@ pub mod resources { ResourceRequestEntries, ResourceRequestVariants, ResourceUnits, TimeRequest, }; - pub use crate::internal::common::resources::map::ResourceMap; + pub use crate::internal::common::resources::map::ResourceIdMap; pub use crate::internal::common::resources::descriptor::DescriptorError; From 791b9ee13ae2a848a64f64b4ddd568ce3b2f78f5 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Mon, 24 Nov 2025 13:52:55 +0100 Subject: [PATCH 02/17] ResourceRqId integrated into Tako server --- crates/tako/src/control.rs | 6 +- crates/tako/src/gateway.rs | 6 +- .../tako/src/internal/common/resources/map.rs | 82 +++++++++++- .../tako/src/internal/common/resources/mod.rs | 6 + crates/tako/src/internal/messages/worker.rs | 2 +- .../tako/src/internal/scheduler/multinode.rs | 43 ++++--- crates/tako/src/internal/scheduler/query.rs | 16 +-- crates/tako/src/internal/scheduler/state.rs | 85 ++++++++----- 
crates/tako/src/internal/server/client.rs | 39 +----- crates/tako/src/internal/server/core.rs | 64 ++++++++-- crates/tako/src/internal/server/explain.rs | 20 +-- crates/tako/src/internal/server/reactor.rs | 35 +++--- crates/tako/src/internal/server/rpc.rs | 4 +- crates/tako/src/internal/server/task.rs | 48 ++++--- crates/tako/src/internal/server/worker.rs | 38 +++--- crates/tako/src/internal/server/workerload.rs | 8 +- .../internal/tests/integration/test_basic.rs | 36 +++--- .../tests/integration/test_resources.rs | 60 ++++----- .../internal/tests/integration/test_worker.rs | 10 +- .../tests/integration/utils/server.rs | 18 ++- .../internal/tests/integration/utils/task.rs | 48 ++++--- crates/tako/src/internal/tests/test_query.rs | 54 ++++---- .../tako/src/internal/tests/test_reactor.rs | 118 ++++++++++-------- .../src/internal/tests/test_scheduler_mn.rs | 56 +++++---- .../src/internal/tests/test_scheduler_sn.rs | 20 +-- crates/tako/src/internal/tests/test_worker.rs | 49 ++++---- crates/tako/src/internal/tests/utils/env.rs | 13 +- crates/tako/src/internal/tests/utils/task.rs | 20 ++- .../src/internal/tests/utils/workflows.rs | 52 ++++---- crates/tako/src/internal/worker/rpc.rs | 7 +- crates/tako/src/internal/worker/state.rs | 21 +++- crates/tako/src/internal/worker/task.rs | 5 +- .../tako/src/internal/worker/test_rqueue.rs | 118 ++++++++++++------ crates/tako/src/internal/worker/test_util.rs | 10 +- 34 files changed, 748 insertions(+), 469 deletions(-) diff --git a/crates/tako/src/control.rs b/crates/tako/src/control.rs index 7404ad709..3d0aa4383 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -8,6 +8,7 @@ use tokio::net::TcpListener; use tokio::sync::Notify; use crate::events::EventProcessor; +use crate::gateway::ResourceRequestVariants; use crate::gateway::{ LostWorkerReason, MultiNodeAllocationResponse, TaskSubmit, WorkerRuntimeInfo, }; @@ -24,7 +25,7 @@ use crate::internal::server::explain::{ }; use 
crate::internal::server::reactor::{get_or_create_resource_rq_id, on_cancel_tasks}; use crate::internal::server::worker::DEFAULT_WORKER_OVERVIEW_INTERVAL; -use crate::resources::{ResourceDescriptor, ResourceRequest, ResourceRequestVariants}; +use crate::resources::{ResourceDescriptor, ResourceRequest}; use crate::{TaskId, WorkerId}; #[derive(Debug)] @@ -207,7 +208,8 @@ impl ServerRef { pub fn get_or_create_resource_rq_id(&self, rqv: &ResourceRequestVariants) -> ResourceRqId { let mut core = self.core_ref.get_mut(); let mut comm = self.comm_ref.get_mut(); - get_or_create_resource_rq_id(&mut core, &mut *comm, rqv) + let rqv = core.convert_client_resource_rq(rqv); + get_or_create_resource_rq_id(&mut core, &mut *comm, &rqv) } } diff --git a/crates/tako/src/gateway.rs b/crates/tako/src/gateway.rs index 488e225ea..20f9e85c0 100644 --- a/crates/tako/src/gateway.rs +++ b/crates/tako/src/gateway.rs @@ -1,4 +1,5 @@ use crate::internal::common::error::DsError; +use crate::internal::common::resources::ResourceRqId; use crate::internal::datasrv::dataobj::DataObjectId; use crate::resources::{AllocationRequest, CPU_RESOURCE_NAME, NumOfNodes, ResourceAmount}; use crate::{InstanceId, Map, Priority, TaskId}; @@ -123,8 +124,6 @@ impl Display for CrashLimit { /// It is sent out-of-band in NewTasksMessage to save bandwidth and allocations. #[derive(Debug)] pub struct SharedTaskConfiguration { - pub resources: ResourceRequestVariants, - pub time_limit: Option, pub priority: Priority, @@ -142,6 +141,9 @@ pub type EntryType = ThinVec; #[derive(Deserialize, Serialize, Clone, Debug)] pub struct TaskConfiguration { pub id: TaskId, + + pub resource_rq_id: ResourceRqId, + /// Index into NewTasksMessage::shared_data that contains the shared data for this task. 
pub shared_data_index: u32, diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index 6cf471c56..684fb68df 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -1,6 +1,9 @@ +use crate::gateway::ResourceRequestVariants as ClientResourceRequestVariants; use crate::internal::common::Map; use crate::internal::common::resources::{ResourceId, ResourceRqId}; -use crate::resources::{ResourceRequest, ResourceRequestVariants}; +use crate::internal::server::core::Core; +use crate::resources::{ResourceAllocRequest, ResourceRequest, ResourceRequestVariants}; +use serde::{Deserialize, Serialize}; pub const CPU_RESOURCE_ID: ResourceId = ResourceId(0); @@ -9,8 +12,6 @@ pub const NVIDIA_GPU_RESOURCE_NAME: &str = "gpus/nvidia"; pub const AMD_GPU_RESOURCE_NAME: &str = "gpus/amd"; pub const MEM_RESOURCE_NAME: &str = "mem"; -pub(crate) type ResourceRqMap = Map; - #[derive(Debug)] pub(crate) struct GlobalResourceMapping { resource_rq_from_id: ResourceRqMap, @@ -32,7 +33,35 @@ impl Default for GlobalResourceMapping { } impl GlobalResourceMapping { - pub fn get_or_allocate_resource_id(&mut self, name: &str) -> ResourceId { + pub(crate) fn convert_client_resource_rq( + &mut self, + resources: &ClientResourceRequestVariants, + ) -> ResourceRequestVariants { + ResourceRequestVariants::new( + resources + .variants + .iter() + .map(|rq| { + ResourceRequest::new( + rq.n_nodes, + rq.min_time, + rq.resources + .iter() + .map(|r| { + let resource_id = self.get_or_create_resource_id(&r.resource); + ResourceAllocRequest { + resource_id, + request: r.policy.clone(), + } + }) + .collect(), + ) + }) + .collect(), + ) + } + + pub fn get_or_create_resource_id(&mut self, name: &str) -> ResourceId { match self.resource_names.get(name) { Some(&id) => id, None => { @@ -53,18 +82,27 @@ impl GlobalResourceMapping { ResourceIdMap { resource_names } } + #[inline] pub fn 
get_resource_rq_map(&self) -> &ResourceRqMap { &self.resource_rq_from_id } - pub fn get_or_allocate_resource_rq_id( + #[cfg(test)] + pub fn get_resource_rq_id(&mut self, rqv: &ResourceRequestVariants) -> ResourceRqId { + *self.resource_rq_to_id.get(rqv).unwrap() + } + + pub fn get_or_create_resource_rq_id( &mut self, rqv: &ResourceRequestVariants, ) -> (ResourceRqId, bool) { match self.resource_rq_to_id.get(rqv) { Some(&id) => (id, false), None => { - let id = ResourceRqId::new(self.resource_rq_to_id.len() as u32); + let mut id = ResourceRqId::new( + self.resource_rq_to_id.len() as u32 * 2 + + if rqv.is_multi_node() { 1 } else { 0 }, + ); log::debug!("New resource request registered {rqv:?} as {id}"); self.resource_rq_to_id.insert(rqv.clone(), id); self.resource_rq_from_id.insert(id, rqv.clone()); @@ -117,3 +155,35 @@ impl ResourceIdMap { .map(|s| s.as_str()) } } + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ResourceRqMap(Map); + +impl ResourceRqMap { + pub fn insert(&mut self, rq_id: ResourceRqId, rqv: ResourceRequestVariants) { + assert!(self.0.insert(rq_id, rqv).is_none()); + } + + #[inline] + pub fn get(&self, rq_id: &ResourceRqId) -> &ResourceRequestVariants { + self.0.get(rq_id).unwrap() + } + + #[cfg(test)] + pub fn get_or_create(&mut self, rqv: ResourceRequestVariants) -> ResourceRqId { + if let Some(rq_id) = self + .0 + .iter() + .find_map(|(rq_id, rqv2)| (&rqv == rqv2).then(|| *rq_id)) + { + rq_id + } else { + let mut new_id = ResourceRqId::new( + self.0.len() as u32 * 2 + if rqv.is_multi_node() { 1 } else { 0 }, + ); + self.0.insert(new_id, rqv); + new_id + } + } +} diff --git a/crates/tako/src/internal/common/resources/mod.rs b/crates/tako/src/internal/common/resources/mod.rs index df09df096..f3e73d6a3 100644 --- a/crates/tako/src/internal/common/resources/mod.rs +++ b/crates/tako/src/internal/common/resources/mod.rs @@ -30,6 +30,12 @@ define_id_type!(ResourceId, u32); // Identifies a globally unique 
Resource request stored in Core. define_id_type!(ResourceRqId, u32); +impl ResourceRqId { + pub fn is_multi_node(&self) -> bool { + self.0 % 2 == 1 + } +} + // Represents an index within a single generic resource (e.g. GPU with ID 1). define_id_type!(ResourceIndex, u32); diff --git a/crates/tako/src/internal/messages/worker.rs b/crates/tako/src/internal/messages/worker.rs index 06578cb14..4331f0693 100644 --- a/crates/tako/src/internal/messages/worker.rs +++ b/crates/tako/src/internal/messages/worker.rs @@ -30,6 +30,7 @@ pub struct ComputeTaskSeparateData { /// Index into shared data stored in [ComputeTasksMsg]. pub shared_index: usize, pub id: TaskId, + pub resource_rq_id: ResourceRqId, pub instance_id: InstanceId, pub scheduler_priority: Priority, pub node_list: Vec, @@ -40,7 +41,6 @@ pub struct ComputeTaskSeparateData { #[derive(Serialize, Deserialize, Debug, Default, Clone)] pub struct ComputeTaskSharedData { pub user_priority: Priority, - pub resources: ResourceRequestVariants, pub time_limit: Option, pub data_flags: TaskDataFlags, pub body: Rc<[u8]>, diff --git a/crates/tako/src/internal/scheduler/multinode.rs b/crates/tako/src/internal/scheduler/multinode.rs index a2ca5e0d4..f8cdbb799 100644 --- a/crates/tako/src/internal/scheduler/multinode.rs +++ b/crates/tako/src/internal/scheduler/multinode.rs @@ -1,3 +1,5 @@ +use crate::internal::common::resources::ResourceRqId; +use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; use crate::internal::server::task::Task; use crate::internal::server::taskmap::TaskMap; use crate::internal::server::worker::Worker; @@ -25,8 +27,8 @@ impl QueueForRequest { #[derive(Default)] pub(crate) struct MultiNodeQueue { - queues: Map, - requests: Vec, + queues: Map, + requests: Vec, } fn task_priority_tuple(task: &Task) -> PriorityTuple { @@ -42,26 +44,24 @@ impl MultiNodeQueue { self.requests.shrink_to_fit(); } - pub fn get_profiles(&self) -> impl Iterator { + pub fn get_profiles(&self) -> impl 
Iterator { self.queues .iter() - .map(|(rq, qfr)| (rq, qfr.queue.len() as u32)) + .map(|(rq, qfr)| (*rq, qfr.queue.len() as u32)) } - pub fn add_task(&mut self, task: &Task) { - let queue = if let Some(qfr) = self - .queues - .get_mut(task.configuration.resources.unwrap_first()) - { + pub fn add_task(&mut self, task: &Task, resource_map: &ResourceRqMap) { + let queue = if let Some(qfr) = self.queues.get_mut(&task.resource_rq_id) { &mut qfr.queue } else { - self.requests - .push(task.configuration.resources.unwrap_first().clone()); - self.requests - .sort_unstable_by_key(|x| std::cmp::Reverse((x.n_nodes(), x.min_time()))); + self.requests.push(task.resource_rq_id); + self.requests.sort_unstable_by_key(|id| { + let rq = resource_map.get(id).trivial_request().unwrap(); + std::cmp::Reverse((rq.n_nodes(), rq.min_time())) + }); &mut self .queues - .entry(task.configuration.resources.unwrap_first().clone()) + .entry(task.resource_rq_id) .or_insert(QueueForRequest { queue: PriorityQueue::new(), sleeping: false, @@ -78,8 +78,8 @@ impl MultiNodeQueue { } #[cfg(test)] - pub fn is_sleeping(&self, rq: &ResourceRequest) -> bool { - self.queues.get(rq).unwrap().sleeping + pub fn is_sleeping(&self, rq_id: ResourceRqId) -> bool { + self.queues.get(&rq_id).unwrap().sleeping } pub fn dump(&self) -> serde_json::Value { @@ -101,6 +101,7 @@ pub(crate) struct MultiNodeAllocator<'a> { task_map: &'a mut TaskMap, worker_map: &'a mut WorkerMap, worker_groups: &'a Map, + resource_map: &'a ResourceRqMap, now: std::time::Instant, } @@ -168,6 +169,7 @@ impl<'a> MultiNodeAllocator<'a> { task_map: &'a mut TaskMap, worker_map: &'a mut WorkerMap, worker_groups: &'a Map, + resource_map: &'a ResourceRqMap, now: std::time::Instant, ) -> Self { MultiNodeAllocator { @@ -175,6 +177,7 @@ impl<'a> MultiNodeAllocator<'a> { task_map, worker_map, worker_groups, + resource_map, now, } } @@ -193,8 +196,8 @@ impl<'a> MultiNodeAllocator<'a> { } else { return None; }; - for rq in &self.mn_queue.requests { - let qfr 
= self.mn_queue.queues.get_mut(rq).unwrap(); + for rq_id in &self.mn_queue.requests { + let qfr = self.mn_queue.queues.get_mut(rq_id).unwrap(); if qfr.sleeping { continue; } @@ -206,6 +209,8 @@ impl<'a> MultiNodeAllocator<'a> { qfr.queue.pop(); continue; } + + let rq = self.resource_map.get(rq_id).unwrap_first(); match find_workers_for_task(rq, self.worker_map, self.worker_groups, self.now) { TaskFindWorkersResult::Ready(workers) => { let task_id = qfr.queue.pop().unwrap().0; @@ -214,7 +219,7 @@ impl<'a> MultiNodeAllocator<'a> { TaskFindWorkersResult::NotReady => { /* Do nothing */ } TaskFindWorkersResult::NoWorkers => { qfr.sleeping = true; - log::debug!("Multi-node task {rq:?} put into sleep",); + log::debug!("Multi-node task {rq_id:?} put into sleep",); continue 'outer; } } diff --git a/crates/tako/src/internal/scheduler/query.rs b/crates/tako/src/internal/scheduler/query.rs index cd8ec011c..d9919e047 100644 --- a/crates/tako/src/internal/scheduler/query.rs +++ b/crates/tako/src/internal/scheduler/query.rs @@ -19,21 +19,22 @@ pub(crate) fn compute_new_worker_query( core: &mut Core, queries: &[WorkerTypeQuery], ) -> NewWorkerAllocationResponse { - log::debug!("Compute new worker query: query = {queries:?}"); + todo!() + /*log::debug!("Compute new worker query: query = {queries:?}"); // Scheduler has to be performed before the query, so there should be no ready_to_assign tasks assert!(core.sn_ready_to_assign().is_empty() || !core.has_workers()); let add_task = |new_loads: &mut [WorkerTypeState], task: &Task| { - let request = &task.configuration.resources; + let request = core.get_resource_rq_map().get(&task.resource_rq_id); for ws in new_loads.iter_mut() { if !ws.w_resources.is_capable_to_run_with(request, |rq| { ws.time_limit.is_none_or(|t| rq.min_time() <= t) }) { if ws.partial && ws.w_resources.is_lowerbound_for(request, |rq| { - ws.time_limit.is_none_or(|t| rq.min_time() <= t) - }) + ws.time_limit.is_none_or(|t| rq.min_time() <= t) + }) { ws.min = 1; } @@ 
-78,7 +79,7 @@ pub(crate) fn compute_new_worker_query( let mut load = WorkerLoad::new(&worker.resources); for task_id in worker.sn_tasks() { let task = core.get_task(*task_id); - let request = &task.configuration.resources; + let request = core.get_resource_rq(task.resource_rq_id); if task.is_sn_running() || load.have_immediate_resources_for_rqv(request, &worker.resources) { @@ -127,7 +128,8 @@ pub(crate) fn compute_new_worker_query( let (queue, _map, _ws) = core.multi_node_queue_split(); let mut multi_node_allocations: Vec<_> = queue .get_profiles() - .filter_map(|(rq, count)| { + .filter_map(|(rq_id, count)| { + let rq = core.get_resource_rq(rq_id).unwrap_first(); let n_nodes = rq.n_nodes(); queries.iter().enumerate().find_map(|(i, worker_type)| { if let Some(time_limit) = worker_type.time_limit @@ -152,5 +154,5 @@ pub(crate) fn compute_new_worker_query( NewWorkerAllocationResponse { single_node_workers_per_query: single_node_allocations, multi_node_allocations, - } + }*/ } diff --git a/crates/tako/src/internal/scheduler/state.rs b/crates/tako/src/internal/scheduler/state.rs index c83bcb43f..fca5a66cc 100644 --- a/crates/tako/src/internal/scheduler/state.rs +++ b/crates/tako/src/internal/scheduler/state.rs @@ -6,6 +6,7 @@ use tokio::sync::Notify; use tokio::time::sleep; use crate::internal::common::Map; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use crate::internal::scheduler::multinode::MultiNodeAllocator; use crate::internal::server::comm::{Comm, CommSender, CommSenderRef}; @@ -15,6 +16,7 @@ use crate::internal::server::task::{ComputeTasksBuilder, Task, TaskRuntimeState} use crate::internal::server::worker::Worker; use crate::internal::server::workerload::ResourceRequestLowerBound; use crate::internal::server::workermap::WorkerMap; +use crate::resources::ResourceRequestVariants; use crate::{TaskId, WorkerId}; // Long duration - 1 year @@ -86,24 +88,21 @@ impl SchedulerState 
{ fn choose_worker_for_task<'a>( &mut self, task: &Task, + rq: &ResourceRequestVariants, workers: &'a [&'a mut Worker], dataobj_map: &DataObjectMap, try_immediate_check: bool, ) -> Option { let no_data_deps = task.data_deps.is_empty(); if no_data_deps && try_immediate_check { - if workers[self.last_idx] - .have_immediate_resources_for_rqv_now(&task.configuration.resources, self.now) - { + if workers[self.last_idx].have_immediate_resources_for_rqv_now(rq, self.now) { return Some(self.last_idx); } for (idx, worker) in workers.iter().enumerate() { if idx == self.last_idx { continue; } - if worker - .have_immediate_resources_for_rqv_now(&task.configuration.resources, self.now) - { + if worker.have_immediate_resources_for_rqv_now(rq, self.now) { self.last_idx = idx; return Some(self.last_idx); } @@ -112,13 +111,13 @@ impl SchedulerState { let start_idx = self.last_idx + 1; if no_data_deps { for (idx, worker) in workers[start_idx..].iter().enumerate() { - if worker.is_capable_to_run_rqv(&task.configuration.resources, self.now) { + if worker.is_capable_to_run_rqv(rq, self.now) { self.last_idx = idx + start_idx; return Some(self.last_idx); } } for (idx, worker) in workers[..start_idx].iter().enumerate() { - if worker.is_capable_to_run_rqv(&task.configuration.resources, self.now) { + if worker.is_capable_to_run_rqv(rq, self.now) { self.last_idx = idx; return Some(self.last_idx); } @@ -129,7 +128,7 @@ impl SchedulerState { let mut best_idx = None; for (idx, worker) in workers[start_idx..].iter().enumerate() { - if !worker.is_capable_to_run_rqv(&task.configuration.resources, self.now) { + if !worker.is_capable_to_run_rqv(rq, self.now) { continue; } let cost = compute_transfer_cost(dataobj_map, task, worker.id); @@ -140,7 +139,7 @@ impl SchedulerState { best_idx = Some(start_idx + idx); } for (idx, worker) in workers[..start_idx].iter().enumerate() { - if !worker.is_capable_to_run_rqv(&task.configuration.resources, self.now) { + if !worker.is_capable_to_run_rqv(rq, self.now) { 
continue; } let cost = compute_transfer_cost(dataobj_map, task, worker.id); @@ -281,8 +280,8 @@ impl SchedulerState { } // This function assumes that potential removal of an assigned is already done - fn assign_into(&mut self, task: &mut Task, worker: &mut Worker) { - worker.insert_sn_task(task); + fn assign_into(&mut self, task: &mut Task, rqv: &ResourceRequestVariants, worker: &mut Worker) { + worker.insert_sn_task(task, rqv); let new_state = match task.state { TaskRuntimeState::Waiting(_) => TaskRuntimeState::Assigned(worker.id), TaskRuntimeState::Assigned(old_w) => { @@ -306,9 +305,10 @@ impl SchedulerState { } pub fn assign(&mut self, core: &mut Core, task_id: TaskId, worker_id: WorkerId) { - let (tasks, workers) = core.split_tasks_workers_mut(); + let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); let task = tasks.get_task_mut(task_id); let assigned_worker = task.get_assigned_worker(); + let rqv = requests.get(&task.resource_rq_id); if let Some(w_id) = assigned_worker { log::debug!( "Changing assignment of task={} from worker={} to worker={}", @@ -317,7 +317,7 @@ impl SchedulerState { worker_id ); assert_ne!(w_id, worker_id); - workers.get_worker_mut(w_id).remove_sn_task(task); + workers.get_worker_mut(w_id).remove_sn_task(task, rqv); } else { log::debug!( "Fresh assignment of task={} to worker={}", @@ -325,7 +325,7 @@ impl SchedulerState { worker_id ); } - self.assign_into(task, workers.get_worker_mut(worker_id)); + self.assign_into(task, rqv, workers.get_worker_mut(worker_id)); } // fn assign_multi_node_task( @@ -357,9 +357,16 @@ impl SchedulerState { fn try_start_multinode_tasks(&mut self, core: &mut Core) { loop { // "while let" not used because of lifetime problems - let (mn_queue, task_map, worker_map, worker_groups) = core.multi_node_queue_split_mut(); - let allocator = - MultiNodeAllocator::new(mn_queue, task_map, worker_map, worker_groups, self.now); + let (mn_queue, task_map, worker_map, worker_groups, resource_map) = + 
core.multi_node_queue_split_mut(); + let allocator = MultiNodeAllocator::new( + mn_queue, + task_map, + worker_map, + worker_groups, + resource_map, + self.now, + ); if let Some((task_id, workers)) = allocator.try_allocate_task() { let task = task_map.get_task_mut(task_id); self.assign_multinode(worker_map, task, workers); @@ -387,13 +394,14 @@ impl SchedulerState { let Some(task) = core.find_task(*task_id) else { continue; }; - if core.check_parked_resources(&task.configuration.resources) { + let rq = core.get_resource_rq(task.resource_rq_id); + if core.check_parked_resources(rq) { core.wakeup_parked_resources(); break; } } } - let (tasks, workers, dataobjs) = core.split_tasks_workers_dataobjs_mut(); + let (tasks, workers, dataobjs, resource_map) = core.split_tasks_workers_dataobjs_mut(); let mut workers = workers .values_mut() .filter(|w| !w.is_parked()) @@ -404,13 +412,15 @@ impl SchedulerState { let Some(task) = tasks.find_task_mut(task_id) else { continue; }; + let rq = resource_map.get(&task.resource_rq_id); if let Some(worker) = self.choose_worker_for_task( task, + rq, &workers, dataobjs, idx < MAX_TASKS_FOR_IMMEDIATE_RUN_CHECK, ) { - self.assign_into(task, workers[worker]); + self.assign_into(task, rq, workers[worker]); } else { sleeping_tasks.push(task_id); } @@ -437,7 +447,7 @@ impl SchedulerState { let now = Instant::now(); { - let (tasks, workers) = core.split_tasks_workers_mut(); + let (tasks, workers, request_map) = core.split_tasks_workers_requests_mut(); for worker in workers.values() { let mut offered = 0; if !worker.is_overloaded() { @@ -448,8 +458,9 @@ impl SchedulerState { if task.is_sn_running() { continue; } + let rq = request_map.get(&task.resource_rq_id); task.set_take_flag(false); - min_resource.include_rqv(&task.configuration.resources); + min_resource.include_rqv(rq); balanced_tasks.push(task_id); offered += 1; } @@ -472,9 +483,8 @@ impl SchedulerState { log::debug!("Min resources {min_resource:?}"); let mut underload_workers = 
Vec::new(); - let task_map = core.task_map(); - let dataobj_map = core.dataobj_map(); - for worker in core.get_workers() { + let (task_map, workers, dataobj_map, requests) = core.split_tasks_workers_dataobjs_mut(); + for (_, worker) in workers.iter_mut() { // We could here also test park flag, but it is already solved in the next condition if worker.have_immediate_resources_for_lb(&min_resource) { log::debug!( @@ -492,6 +502,14 @@ impl SchedulerState { if !task.is_fresh() && task.get_assigned_worker() != Some(worker.id) { cost += 10_000_000; } + let difficulty = + *worker + .difficulty + .entry(task.resource_rq_id) + .or_insert_with(|| { + let rqv = requests.get(&task.resource_rq_id); + worker.resources.compute_difficulty_score_of_rqv(&rqv) + }); log::debug!( "Transfer cost task={} -> worker={} is {}", task.id, @@ -502,9 +520,7 @@ impl SchedulerState { u64::MAX - cost, task.configuration.user_priority, task.scheduler_priority, - worker - .resources - .difficulty_score_of_rqv(&task.configuration.resources), + difficulty, ) }); let len = ts.len(); @@ -549,10 +565,11 @@ impl SchedulerState { if task.is_taken() { continue; } - if !worker.has_time_to_run_for_rqv(&task.configuration.resources, now) { + let rq = core.get_resource_rq(task.resource_rq_id); + if !worker.has_time_to_run_for_rqv(rq, now) { continue; } - if !worker.have_immediate_resources_for_rqv(&task.configuration.resources) { + if !worker.have_immediate_resources_for_rqv(rq) { continue; } let worker2_id = task.get_assigned_worker().unwrap(); @@ -589,15 +606,15 @@ impl SchedulerState { if task.is_taken() { continue; } - let request = &task.configuration.resources; - if !worker.is_capable_to_run_rqv(request, now) { + let rq = core.get_resource_rq(task.resource_rq_id); + if !worker.is_capable_to_run_rqv(rq, now) { continue; } let worker2_id = task.get_assigned_worker().unwrap(); let worker2 = core.get_worker_by_id_or_panic(worker2_id); if !worker2.is_overloaded() - || worker.load_wrt_rqv(request) > 
worker2.load_wrt_rqv(request) + || worker.load_wrt_rqv(rq) > worker2.load_wrt_rqv(rq) { continue; } diff --git a/crates/tako/src/internal/server/client.rs b/crates/tako/src/internal/server/client.rs index c6800608a..ac684ec62 100644 --- a/crates/tako/src/internal/server/client.rs +++ b/crates/tako/src/internal/server/client.rs @@ -1,4 +1,4 @@ -use crate::internal::common::resources::{ResourceRequest, ResourceRequestVariants}; +use crate::internal::common::resources::{ResourceRequest, ResourceRequestVariants, ResourceRqId}; use crate::gateway::{ ResourceRequestVariants as ClientResourceRequestVariants, SharedTaskConfiguration, TaskSubmit, @@ -11,38 +11,8 @@ use crate::internal::server::reactor::on_new_tasks; use crate::internal::server::task::{Task, TaskConfiguration}; use std::rc::Rc; -fn convert_client_resources( - core: &mut Core, - resources: ClientResourceRequestVariants, -) -> ResourceRequestVariants { - ResourceRequestVariants::new( - resources - .variants - .into_iter() - .map(|rq| { - ResourceRequest::new( - rq.n_nodes, - rq.min_time, - rq.resources - .into_iter() - .map(|r| { - let resource_id = core.get_or_create_resource_id(&r.resource); - ResourceAllocRequest { - resource_id, - request: r.policy, - } - }) - .collect(), - ) - }) - .collect(), - ) -} - fn create_task_configuration(core: &mut Core, msg: SharedTaskConfiguration) -> TaskConfiguration { - let resources = convert_client_resources(core, msg.resources); TaskConfiguration { - resources, time_limit: msg.time_limit, user_priority: msg.priority, crash_limit: msg.crash_limit, @@ -67,12 +37,6 @@ pub(crate) fn handle_new_tasks( .map(|c| Rc::new(create_task_configuration(core, c))) .collect(); - for cfg in &configurations { - if let Err(e) = cfg.resources.validate() { - return Err(format!("Invalid task request {e:?}").into()); - } - } - let mut tasks: Vec = Vec::with_capacity(task_submit.tasks.len()); for task in task_submit.tasks { if core.is_used_task_id(task.id) { @@ -85,6 +49,7 @@ pub(crate) fn 
handle_new_tasks( let conf = &configurations[idx]; let mut task = Task::new( task.id, + task.resource_rq_id, task.task_deps, task.dataobj_deps, task.entry, diff --git a/crates/tako/src/internal/server/core.rs b/crates/tako/src/internal/server/core.rs index a5ea4498e..523a14c55 100644 --- a/crates/tako/src/internal/server/core.rs +++ b/crates/tako/src/internal/server/core.rs @@ -86,11 +86,37 @@ impl Core { (&mut self.tasks, &mut self.workers) } + #[inline] + pub fn split_tasks_workers_requests_mut( + &mut self, + ) -> (&mut TaskMap, &mut WorkerMap, &ResourceRqMap) { + ( + &mut self.tasks, + &mut self.workers, + self.resource_map.get_resource_rq_map(), + ) + } + #[inline] pub fn split_tasks_workers_dataobjs_mut( &mut self, - ) -> (&mut TaskMap, &mut WorkerMap, &mut DataObjectMap) { - (&mut self.tasks, &mut self.workers, &mut self.data_objects) + ) -> ( + &mut TaskMap, + &mut WorkerMap, + &mut DataObjectMap, + &ResourceRqMap, + ) { + ( + &mut self.tasks, + &mut self.workers, + &mut self.data_objects, + self.resource_map.get_resource_rq_map(), + ) + } + + #[cfg(test)] + pub fn split_tasks_resource_map_mut(&mut self) -> (&mut TaskMap, &mut GlobalResourceMapping) { + (&mut self.tasks, &mut self.resource_map) } #[inline] @@ -98,6 +124,11 @@ impl Core { (&mut self.tasks, &mut self.data_objects) } + #[cfg(test)] + pub fn get_resource_map_mut(&mut self) -> &mut GlobalResourceMapping { + &mut self.resource_map + } + pub fn new_worker_id(&mut self) -> WorkerId { self.worker_id_counter += 1; WorkerId::new(self.worker_id_counter) @@ -127,6 +158,7 @@ impl Core { &mut self.worker_overview_listeners } + #[inline] pub(crate) fn multi_node_queue_split_mut( &mut self, ) -> ( @@ -134,12 +166,14 @@ impl Core { &mut TaskMap, &mut WorkerMap, &Map, + &ResourceRqMap, ) { ( &mut self.multi_node_queue, &mut self.tasks, &mut self.workers, &self.worker_groups, + &self.resource_map.get_resource_rq_map(), ) } @@ -323,8 +357,9 @@ impl Core { pub fn add_ready_to_assign(&mut self, task_id: TaskId) { 
let task = self.tasks.get_task(task_id); - if task.configuration.resources.is_multi_node() { - self.multi_node_queue.add_task(task); + if task.resource_rq_id.is_multi_node() { + self.multi_node_queue + .add_task(task, self.resource_map.get_resource_rq_map()); } else { self.single_node_ready_to_assign.push(task_id); } @@ -437,7 +472,7 @@ impl Core { if worker.is_parked() { assert!(self.parked_resources.contains(&worker.resources)); } - worker.sanity_check(&self.tasks); + worker.sanity_check(&self.tasks, self.resource_map.get_resource_rq_map()); } for data in self.data_objects.iter() { @@ -513,7 +548,14 @@ impl Core { #[inline] pub fn get_or_create_resource_id(&mut self, name: &str) -> ResourceId { - self.resource_map.get_or_allocate_resource_id(name) + self.resource_map.get_or_create_resource_id(name) + } + + pub fn convert_client_resource_rq( + &mut self, + resources: &crate::gateway::ResourceRequestVariants, + ) -> ResourceRequestVariants { + self.resource_map.convert_client_resource_rq(resources) } #[inline] @@ -521,7 +563,7 @@ impl Core { &mut self, rqv: &ResourceRequestVariants, ) -> (ResourceRqId, bool) { - self.resource_map.get_or_allocate_resource_rq_id(rqv) + self.resource_map.get_or_create_resource_rq_id(rqv) } #[inline] @@ -533,6 +575,11 @@ impl Core { self.resource_map.get_resource_rq_map() } + #[inline] + pub fn get_resource_rq(&self, rq_id: ResourceRqId) -> &ResourceRequestVariants { + self.resource_map.get_resource_rq_map().get(&rq_id) + } + pub fn secret_key(&self) -> Option<&Arc> { self.secret_key.as_ref() } @@ -653,7 +700,8 @@ mod tests { #[test] fn add_remove() { let mut core = Core::default(); - let t = task::task(101); + let rmap = core.get_resource_map_mut(); + let t = task::task(101, rmap); core.add_task(t); let mut objs_to_remove = ObjsToRemoveFromWorkers::new(); assert!(matches!( diff --git a/crates/tako/src/internal/server/explain.rs b/crates/tako/src/internal/server/explain.rs index 3ae4ae3fc..bb3e9f286 100644 --- 
a/crates/tako/src/internal/server/explain.rs +++ b/crates/tako/src/internal/server/explain.rs @@ -101,7 +101,8 @@ pub fn task_explain_for_worker( worker_group: &WorkerGroup, now: std::time::Instant, ) -> TaskExplanationForWorker { - TaskExplanationForWorker { + todo!() + /*TaskExplanationForWorker { worker_id: worker.id, variants: task .configuration @@ -138,11 +139,12 @@ pub fn task_explain_for_worker( result }) .collect(), - } + }*/ } #[cfg(test)] mod tests { + use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; use crate::internal::server::explain::{TaskExplainItem, task_explain_for_worker}; use crate::internal::server::worker::Worker; use crate::internal::server::workergroup::WorkerGroup; @@ -156,6 +158,7 @@ mod tests { #[test] fn explain_single_node() { + let mut rqs = GlobalResourceMapping::default(); let resource_map = ResourceIdMap::from_vec(vec!["cpus".to_string(), "gpus".to_string()]); let now = Instant::now(); @@ -181,13 +184,15 @@ mod tests { }; let task_id = 1; - let task = TaskBuilder::new(task_id).build(); + let task = TaskBuilder::new(task_id).build(&mut rqs); let r = explain(&task, &worker1, now); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 1); assert_eq!(r.n_enabled_variants(), 1); - let task = TaskBuilder::new(task_id).time_request(20_000).build(); + let task = TaskBuilder::new(task_id) + .time_request(20_000) + .build(&mut rqs); let r = explain(&task, &worker1, now); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 2); @@ -220,7 +225,7 @@ mod tests { .time_request(20_000) .cpus_compact(30) .add_resource(1, 3) - .build(); + .build(&mut rqs); let r = explain(&task, &worker2, now); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 3); @@ -239,7 +244,7 @@ mod tests { .next_resources() .cpus_compact(2) .add_resource(1, 32) - .build(); + .build(&mut rqs); let r = explain(&task, &worker2, now2); assert_eq!(r.variants.len(), 2); assert_eq!(r.variants[0].len(), 3); @@ 
-273,12 +278,13 @@ mod tests { #[test] fn explain_multi_node() { + let mut rqs = GlobalResourceMapping::default(); let resource_map = ResourceIdMap::from_vec(vec!["cpus".to_string(), "gpus".to_string()]); let now = Instant::now(); let wcfg = create_test_worker_config(1.into(), ResourceDescriptor::simple_cpus(4)); let worker = Worker::new(1.into(), wcfg, &resource_map, now); - let task = TaskBuilder::new(1).n_nodes(4).build(); + let task = TaskBuilder::new(1).n_nodes(4).build(&mut rqs); let mut wset = Set::new(); wset.insert(WorkerId::new(1)); wset.insert(WorkerId::new(2)); diff --git a/crates/tako/src/internal/server/reactor.rs b/crates/tako/src/internal/server/reactor.rs index 89cdf6dd0..618c4dc66 100644 --- a/crates/tako/src/internal/server/reactor.rs +++ b/crates/tako/src/internal/server/reactor.rs @@ -119,10 +119,11 @@ pub(crate) fn on_remove_worker( } { - let (tasks, workers) = core.split_tasks_workers_mut(); + let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); for (w_id, task_id) in removes { let task = tasks.get_task(task_id); - workers.get_worker_mut(w_id).remove_sn_task(task) + let rqv = requests.get(&task.resource_rq_id); + workers.get_worker_mut(w_id).remove_sn_task(task, rqv); } } @@ -217,7 +218,7 @@ pub(crate) fn on_task_running( context, } = message; - let (tasks, workers) = core.split_tasks_workers_mut(); + let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); let simple_worker_list = &[worker_id]; if let Some(task) = tasks.find_task_mut(task_id) { let worker_ids = match &task.state { @@ -232,9 +233,10 @@ pub(crate) fn on_task_running( TaskRuntimeState::Stealing(w_id, Some(target_id)) => { assert_eq!(*w_id, worker_id); let worker = workers.get_worker_mut(*target_id); - worker.remove_sn_task(task); + let rqv = requests.get(&task.resource_rq_id); + worker.remove_sn_task(task, rqv); let worker = workers.get_worker_mut(*w_id); - worker.insert_sn_task(task); + worker.insert_sn_task(task, rqv); 
comm.ask_for_scheduling(); task.state = TaskRuntimeState::Running { worker_id, @@ -281,7 +283,7 @@ pub(crate) fn on_task_finished( ) { let task_id = msg.id; { - let (tasks, workers) = core.split_tasks_workers_mut(); + let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); if let Some(task) = tasks.find_task_mut(msg.id) { log::debug!( "Task id={} finished on worker={}; outputs={:?}", @@ -289,8 +291,8 @@ pub(crate) fn on_task_finished( worker_id, &msg.outputs ); - assert!(task.is_assigned_or_stolen_from(worker_id)); + let rqv = requests.get(&task.resource_rq_id); match &task.state { TaskRuntimeState::Assigned(w_id) @@ -298,7 +300,7 @@ pub(crate) fn on_task_finished( worker_id: w_id, .. } => { assert_eq!(*w_id, worker_id); - workers.get_worker_mut(worker_id).remove_sn_task(task); + workers.get_worker_mut(worker_id).remove_sn_task(task, rqv); } TaskRuntimeState::RunningMultiNode(ws) => { assert_eq!(ws[0], worker_id); @@ -306,7 +308,7 @@ pub(crate) fn on_task_finished( } TaskRuntimeState::Stealing(w_id, Some(target_w)) => { assert_eq!(*w_id, worker_id); - workers.get_worker_mut(*target_w).remove_sn_task(task); + workers.get_worker_mut(*target_w).remove_sn_task(task, rqv); } TaskRuntimeState::Stealing(w_id, None) => { assert_eq!(*w_id, worker_id); @@ -468,17 +470,18 @@ fn fail_task_helper( error_info: TaskFailInfo, ) { let consumers: Vec = { - let (tasks, workers) = core.split_tasks_workers_mut(); + let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); if let Some(task) = tasks.find_task(task_id) { log::debug!("Task task_id={task_id} failed"); if let Some(worker_id) = worker_id { - if task.configuration.resources.is_multi_node() { + if task.resource_rq_id.is_multi_node() { let ws = task.mn_placement().unwrap(); assert_eq!(ws[0], worker_id); reset_mn_task_workers(workers, ws, task_id); } else { + let rqv = requests.get(&task.resource_rq_id); assert!(task.is_assigned_or_stolen_from(worker_id)); - 
workers.get_worker_mut(worker_id).remove_sn_task(task); + workers.get_worker_mut(worker_id).remove_sn_task(task, rqv); } } else { assert!(task.is_waiting()) @@ -538,7 +541,7 @@ pub(crate) fn on_cancel_tasks(core: &mut Core, comm: &mut impl Comm, task_ids: & log::debug!("Canceling {} tasks", task_ids.len()); - let (tasks, workers) = core.split_tasks_workers_mut(); + let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); for &task_id in task_ids { log::debug!("Canceling task id={task_id}"); if let Some(task) = tasks.find_task(task_id) { @@ -550,7 +553,8 @@ pub(crate) fn on_cancel_tasks(core: &mut Core, comm: &mut impl Comm, task_ids: & | TaskRuntimeState::Running { worker_id: w_id, .. } => { - workers.get_worker_mut(w_id).remove_sn_task(task); + let rqv = requests.get(&task.resource_rq_id); + workers.get_worker_mut(w_id).remove_sn_task(task, rqv); running_ids.entry(w_id).or_default().push(task_id); } TaskRuntimeState::RunningMultiNode(ref ws) => { @@ -561,7 +565,8 @@ pub(crate) fn on_cancel_tasks(core: &mut Core, comm: &mut impl Comm, task_ids: & } TaskRuntimeState::Stealing(from_id, to_id) => { if let Some(to_id) = to_id { - workers.get_worker_mut(to_id).remove_sn_task(task); + let rqv = requests.get(&task.resource_rq_id); + workers.get_worker_mut(to_id).remove_sn_task(task, rqv); } running_ids.entry(from_id).or_default().push(task_id); } diff --git a/crates/tako/src/internal/server/rpc.rs b/crates/tako/src/internal/server/rpc.rs index c6d54af76..8d8ec92fe 100644 --- a/crates/tako/src/internal/server/rpc.rs +++ b/crates/tako/src/internal/server/rpc.rs @@ -211,7 +211,7 @@ async fn worker_rpc_loop( loop { interval.tick().await; let mut core = core_ref.get_mut(); - let (task_map, worker_map) = core.split_tasks_workers_mut(); + let (task_map, worker_map, requests) = core.split_tasks_workers_requests_mut(); let worker = worker_map.get_worker_mut(worker_id); let now = Instant::now(); let elapsed = now - worker.last_heartbeat; @@ -224,7 +224,7 @@ async 
fn worker_rpc_loop( if elapsed > retract_interval { log::debug!("Trying to retract overtime tasks, worker={}", worker.id); let mut comm = comm_ref2.get_mut(); - worker.retract_overtime_tasks(&mut *comm, task_map, now); + worker.retract_overtime_tasks(&mut *comm, task_map, requests, now); last_retract_check = now; } diff --git a/crates/tako/src/internal/server/task.rs b/crates/tako/src/internal/server/task.rs index e590aa654..7612391b9 100644 --- a/crates/tako/src/internal/server/task.rs +++ b/crates/tako/src/internal/server/task.rs @@ -8,13 +8,15 @@ use crate::internal::common::Set; use crate::internal::common::stablemap::ExtractKey; use crate::{MAX_FRAME_SIZE, Map, ResourceVariantId, WorkerId}; -use crate::gateway::{CrashLimit, EntryType, TaskDataFlags}; +use crate::gateway::{CrashLimit, EntryType, ResourceRequestVariants, TaskDataFlags}; use crate::internal::datasrv::dataobj::DataObjectId; +use crate::internal::common::resources::ResourceRqId; use crate::internal::messages::worker::{ ComputeTaskSeparateData, ComputeTaskSharedData, ComputeTasksMsg, ToWorkerMessage, }; use crate::internal::server::taskmap::TaskMap; +use crate::internal::server::workerload::ResourceRequestLowerBound; use crate::{InstanceId, Priority}; use crate::{TaskId, static_assert_size}; @@ -94,11 +96,6 @@ bitflags::bitflags! { #[derive(Debug, Eq, PartialEq, Hash)] pub struct TaskConfiguration { - // Try to keep the fields ordered in a way so that the chance for finding a different field - // between two different task configurations is as high as possible. - // In other words, task configuration fields that are the same between most tasks should be - // ordered last. 
- pub resources: crate::internal::common::resources::ResourceRequestVariants, // Use Rc to avoid cloning the data when we serialize them pub body: Rc<[u8]>, pub user_priority: Priority, @@ -110,7 +107,6 @@ pub struct TaskConfiguration { impl TaskConfiguration { pub fn dump(&self) -> serde_json::Value { json!({ - "resources": self.resources, "user_priority": self.user_priority, "time_limit": self.time_limit, "crash_limit": self.crash_limit, @@ -127,6 +123,7 @@ pub struct Task { pub task_deps: ThinVec, pub data_deps: ThinVec, pub flags: TaskFlags, + pub resource_rq_id: ResourceRqId, pub configuration: Rc, pub scheduler_priority: Priority, pub instance_id: InstanceId, @@ -135,7 +132,7 @@ pub struct Task { } // Task is a critical data structure, so we should keep its size in check -static_assert_size!(Task, 112); +static_assert_size!(Task, 120); impl fmt::Debug for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -161,15 +158,16 @@ impl Task { pub fn new( id: TaskId, + resource_rq_id: ResourceRqId, task_deps: ThinVec, dataobj_deps: ThinVec, entry: Option, configuration: Rc, ) -> Self { log::debug!( - "New task {} {:?} {:?} {:?}", + "New task {} rs={} {:?} {:?}", id, - &configuration.resources, + resource_rq_id, &task_deps, &dataobj_deps, ); @@ -182,6 +180,7 @@ impl Task { task_deps, data_deps: dataobj_deps, flags, + resource_rq_id, configuration, entry, scheduler_priority: Default::default(), @@ -424,7 +423,6 @@ impl ComputeTasksBuilder { .or_insert_with(|| { let shared = ComputeTaskSharedData { user_priority: conf.user_priority, - resources: conf.resources.clone(), time_limit: conf.time_limit, data_flags: conf.data_flags, body: conf.body.clone(), @@ -438,6 +436,7 @@ impl ComputeTasksBuilder { let task_data = ComputeTaskSeparateData { shared_index, id: task.id, + resource_rq_id: task.resource_rq_id, instance_id: task.instance_id, scheduler_priority: task.scheduler_priority, node_list, @@ -485,6 +484,7 @@ fn estimate_task_data_size(data: 
&ComputeTaskSeparateData) -> usize { let ComputeTaskSeparateData { shared_index, id, + resource_rq_id, instance_id, scheduler_priority, node_list, @@ -496,6 +496,7 @@ fn estimate_task_data_size(data: &ComputeTaskSeparateData) -> usize { // count internal field of Vecs, which are not serialized. size_of_val(shared_index) + size_of_val(id) + + size_of_val(resource_rq_id) + size_of_val(instance_id) + size_of_val(scheduler_priority) + size_of_val(node_list.as_slice()) @@ -507,27 +508,22 @@ fn estimate_task_data_size(data: &ComputeTaskSeparateData) -> usize { fn estimate_shared_data_size(data: &ComputeTaskSharedData) -> usize { let ComputeTaskSharedData { user_priority, - resources, time_limit, data_flags, body, } = data; - size_of_val(user_priority) - + size_of_val(resources.requests()) - + size_of_val(time_limit) - + size_of_val(data_flags) - + body.len() + size_of_val(user_priority) + size_of_val(time_limit) + size_of_val(data_flags) + body.len() } #[cfg(test)] mod tests { - use std::default::Default; - + use crate::internal::common::resources::map::GlobalResourceMapping; use crate::internal::server::core::Core; use crate::internal::server::task::{Task, TaskRuntimeState}; use crate::internal::tests::utils::schedule::submit_test_tasks; use crate::internal::tests::utils::task; use crate::internal::tests::utils::task::task_with_deps; + use std::default::Default; impl Task { pub fn get_unfinished_deps(&self) -> u32 { @@ -540,7 +536,8 @@ mod tests { #[test] fn task_consumers_empty() { - let a = task::task(0); + let mut rmap = GlobalResourceMapping::default(); + let a = task::task(0, &mut rmap); let mut s = crate::Set::new(); a.collect_recursive_consumers(&Default::default(), &mut s); assert!(s.is_empty()); @@ -549,11 +546,12 @@ mod tests { #[test] fn task_recursive_consumers() { let mut core = Core::default(); - let a = task::task(0); - let b = task_with_deps(1, &[&a]); - let c = task_with_deps(2, &[&b]); - let d = task_with_deps(3, &[&b]); - let e = task_with_deps(4, 
&[&c, &d]); + let rmap = core.get_resource_map_mut(); + let a = task::task(0, rmap); + let b = task_with_deps(1, &[&a], rmap); + let c = task_with_deps(2, &[&b], rmap); + let d = task_with_deps(3, &[&b], rmap); + let e = task_with_deps(4, &[&c, &d], rmap); let expected_ids = vec![b.id, c.id, d.id, e.id]; submit_test_tasks(&mut core, vec![a, b, c, d, e]); diff --git a/crates/tako/src/internal/server/worker.rs b/crates/tako/src/internal/server/worker.rs index 0a98da703..df6f3372d 100644 --- a/crates/tako/src/internal/server/worker.rs +++ b/crates/tako/src/internal/server/worker.rs @@ -2,16 +2,16 @@ use std::fmt; use crate::gateway::{LostWorkerReason, WorkerRuntimeInfo}; use crate::internal::common::Set; -use crate::internal::common::resources::TimeRequest; -use crate::internal::common::resources::map::ResourceIdMap; +use crate::internal::common::resources::map::{ResourceIdMap, ResourceRqMap}; use crate::internal::common::resources::{ResourceRequest, ResourceRequestVariants}; +use crate::internal::common::resources::{ResourceRqId, TimeRequest}; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use crate::internal::server::comm::Comm; use crate::internal::server::task::{Task, TaskRuntimeState}; use crate::internal::server::taskmap::TaskMap; use crate::internal::server::workerload::{ResourceRequestLowerBound, WorkerLoad, WorkerResources}; use crate::internal::worker::configuration::WorkerConfiguration; -use crate::{TaskId, WorkerId}; +use crate::{Map, TaskId, WorkerId}; use serde_json::json; use std::time::{Duration, Instant}; @@ -50,6 +50,7 @@ pub struct Worker { // !! In case of stealing T from W1 to W2, T is in "tasks" of W2, even T was not yet canceled from W1. 
sn_tasks: Set, pub(crate) sn_load: WorkerLoad, + pub(crate) difficulty: Map, pub(crate) resources: WorkerResources, pub(crate) flags: WorkerFlags, // When the worker will be terminated @@ -142,38 +143,29 @@ impl Worker { self.sn_tasks.is_empty() && self.mn_task.is_none() && !self.is_stopping() } - pub fn insert_sn_task(&mut self, task: &Task) { + pub fn insert_sn_task(&mut self, task: &Task, rqv: &ResourceRequestVariants) { assert!(self.sn_tasks.insert(task.id)); - self.sn_load.add_request( - task.id, - &task.configuration.resources, - task.running_variant(), - &self.resources, - ); + self.sn_load + .add_request(task.id, rqv, task.running_variant(), &self.resources); } - pub fn remove_sn_task(&mut self, task: &Task) { + pub fn remove_sn_task(&mut self, task: &Task, rqv: &ResourceRequestVariants) { assert!(self.sn_tasks.remove(&task.id)); if self.sn_tasks.is_empty() { self.idle_timestamp = Instant::now(); } - self.sn_load - .remove_request(task.id, &task.configuration.resources, &self.resources); + self.sn_load.remove_request(task.id, rqv, &self.resources); } - pub fn sanity_check(&self, task_map: &TaskMap) { + pub fn sanity_check(&self, task_map: &TaskMap, request_map: &ResourceRqMap) { assert!(self.sn_tasks.is_empty() || self.mn_task.is_none()); let mut check_load = WorkerLoad::new(&self.resources); let mut trivial = true; for &task_id in &self.sn_tasks { let task = task_map.get_task(task_id); - trivial &= task.configuration.resources.is_trivial(); - check_load.add_request( - task_id, - &task.configuration.resources, - task.running_variant(), - &self.resources, - ); + let rqv = request_map.get(&task.resource_rq_id); + trivial &= rqv.is_trivial(); + check_load.add_request(task_id, rqv, task.running_variant(), &self.resources); } if trivial { assert_eq!(self.sn_load, check_load); @@ -268,6 +260,7 @@ impl Worker { &mut self, comm: &mut impl Comm, task_map: &mut TaskMap, + request_map: &ResourceRqMap, now: Instant, ) { if self.termination_time.is_none() || 
self.mn_task.is_some() { @@ -280,7 +273,7 @@ impl Worker { .filter(|task_id| { let task = task_map.get_task_mut(*task_id); if task.is_assigned() - && !self.is_capable_to_run_rqv(&task.configuration.resources, now) + && !self.is_capable_to_run_rqv(request_map.get(&task.resource_rq_id), now) { log::debug!( "Retracting task={task_id}, time request cannot be fulfilled anymore" @@ -336,6 +329,7 @@ impl Worker { last_heartbeat: now, mn_task: None, idle_timestamp: now, + difficulty: Map::new(), } } diff --git a/crates/tako/src/internal/server/workerload.rs b/crates/tako/src/internal/server/workerload.rs index 6b69c035b..1178a7982 100644 --- a/crates/tako/src/internal/server/workerload.rs +++ b/crates/tako/src/internal/server/workerload.rs @@ -3,7 +3,7 @@ use crate::internal::common::resources::map::ResourceIdMap; use crate::internal::common::resources::request::ResourceAllocRequest; use crate::internal::common::resources::{ ResourceAmount, ResourceDescriptor, ResourceId, ResourceRequest, ResourceRequestVariants, - ResourceVec, + ResourceRqId, ResourceVec, }; use crate::internal::messages::worker::WorkerResourceCounts; use crate::{Map, ResourceVariantId, Set, TaskId}; @@ -115,7 +115,7 @@ impl WorkerResources { entry.request.amount(self.get(entry.resource_id)) } - pub fn difficulty_score(&self, request: &ResourceRequest) -> u64 { + fn compute_difficulty_score(&self, request: &ResourceRequest) -> u64 { let mut result = 0; for entry in request.entries() { let count = self @@ -131,10 +131,10 @@ impl WorkerResources { result } - pub fn difficulty_score_of_rqv(&self, rqv: &ResourceRequestVariants) -> u64 { + pub fn compute_difficulty_score_of_rqv(&self, rqv: &ResourceRequestVariants) -> u64 { rqv.requests() .iter() - .map(|r| self.difficulty_score(r)) + .map(|r| self.compute_difficulty_score(r)) .min() .unwrap_or(0) } diff --git a/crates/tako/src/internal/tests/integration/test_basic.rs b/crates/tako/src/internal/tests/integration/test_basic.rs index 9da38e674..5dcfc7ea9 100644 
--- a/crates/tako/src/internal/tests/integration/test_basic.rs +++ b/crates/tako/src/internal/tests/integration/test_basic.rs @@ -13,6 +13,7 @@ use tokio::time::sleep; #[tokio::test] async fn test_submit_simple_task_ok() { run_server_test(Default::default(), |mut handler| async move { + let rq = handler.register_default_request(); let worker = handler.start_worker(Default::default()).await.unwrap(); let stdout = worker.workdir.join("test.out"); @@ -21,10 +22,11 @@ async fn test_submit_simple_task_ok() { let ids = handler .submit( GraphBuilder::default() - .simple_task(&["uname"]) - .simple_task(&["uname"]) + .simple_task(&["uname"], rq) + .simple_task(&["uname"], rq) .task( TaskConfigBuilder::default() + .resources(rq) .args(simple_args(&["bash", "-c", "echo 'hello'"])) .stdout(StdioDef::File { path: stdout.clone(), @@ -50,11 +52,12 @@ async fn test_submit_simple_task_ok() { async fn test_submit_simple_task_fail() { run_server_test(Default::default(), |mut handler| async move { handler.start_worker(Default::default()).await.unwrap(); - + let rq = handler.register_default_request(); let ids = handler .submit(GraphBuilder::singleton(simple_task( &["/usr/bin/nonsense"], 1, + rq, ))) .await; handler.wait(&ids).await.assert_all_failed(); @@ -63,12 +66,13 @@ async fn test_submit_simple_task_fail() { .submit(GraphBuilder::singleton(simple_task( &["bash", "c", "'exit 3'"], 2, + rq, ))) .await; handler.wait(&ids).await.assert_all_failed(); let ids = handler - .submit(GraphBuilder::singleton(simple_task(&["uname"], 3))) + .submit(GraphBuilder::singleton(simple_task(&["uname"], 3, rq))) .await; handler.wait(&ids).await.assert_all_finished(); }) @@ -78,11 +82,12 @@ async fn test_submit_simple_task_fail() { #[tokio::test] async fn test_task_time_limit_fail() { run_server_test(Default::default(), |mut handle| async move { + let rq = handle.register_default_request(); handle.start_worker(Default::default()).await.unwrap(); - handle .submit(GraphBuilder::singleton( 
TaskConfigBuilder::default() + .resources(rq) .args(simple_args(&["sleep", "2"])) .time_limit(Some(Duration::from_millis(600))), )) @@ -99,11 +104,12 @@ async fn test_task_time_limit_fail() { #[tokio::test] async fn test_task_time_limit_pass() { run_server_test(Default::default(), |mut handle| async move { + let rq = handle.register_default_request(); handle.start_worker(Default::default()).await.unwrap(); - handle .submit(GraphBuilder::singleton( TaskConfigBuilder::default() + .resources(rq) .args(simple_args(&["sleep", "1"])) .time_limit(Some(Duration::from_millis(1600))), )) @@ -123,9 +129,10 @@ fn query_helper( #[tokio::test] async fn test_query_no_output_immediate_call() { run_server_test(Default::default(), |mut handler| async move { + let rq = handler.register_default_request(); handler.start_worker(Default::default()).await.unwrap(); let ids = handler - .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1))) + .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1, rq))) .await; let msg = query_helper( &mut handler, @@ -148,9 +155,10 @@ async fn test_query_no_output_immediate_call() { #[tokio::test] async fn test_query_no_output_delayed_call() { run_server_test(Default::default(), |mut handler| async move { + let rq = handler.register_default_request(); handler.start_worker(Default::default()).await.unwrap(); let ids = handler - .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1))) + .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1, rq))) .await; sleep(Duration::from_secs(1)).await; let msg = query_helper( @@ -175,11 +183,9 @@ async fn test_query_no_output_delayed_call() { async fn test_query_new_workers_delayed_call() { run_server_test(Default::default(), |mut handler| async move { handler.start_worker(Default::default()).await.unwrap(); + let rq = handler.register_request(ResourceRequestConfigBuilder::default().cpus(5)); let _ = handler - .submit(GraphBuilder::singleton( - simple_task(&["sleep", "1"], 1) - 
.resources(ResourceRequestConfigBuilder::default().cpus(5)), - )) + .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1, rq))) .await; sleep(Duration::from_secs(1)).await; let msg = query_helper( @@ -203,11 +209,9 @@ async fn test_query_new_workers_delayed_call() { async fn test_query_new_workers_immediate() { run_server_test(Default::default(), |mut handler| async move { handler.start_worker(Default::default()).await.unwrap(); + let rq = handler.register_request(ResourceRequestConfigBuilder::default().cpus(5)); let _ = handler - .submit(GraphBuilder::singleton( - simple_task(&["sleep", "1"], 1) - .resources(ResourceRequestConfigBuilder::default().cpus(5)), - )) + .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1, rq))) .await; let msg = query_helper( &mut handler, diff --git a/crates/tako/src/internal/tests/integration/test_resources.rs b/crates/tako/src/internal/tests/integration/test_resources.rs index 45319f3fa..81d4bee6d 100644 --- a/crates/tako/src/internal/tests/integration/test_resources.rs +++ b/crates/tako/src/internal/tests/integration/test_resources.rs @@ -1,8 +1,7 @@ use std::time::{Duration, Instant}; -use tokio::time::sleep; - use crate::WorkerId; +use crate::internal::common::resources::ResourceRqId; use crate::internal::tests::integration::utils::api::{ wait_for_task_start, wait_for_worker_overview, wait_for_workers_overview, }; @@ -13,15 +12,18 @@ use crate::internal::tests::integration::utils::task::{ }; use crate::internal::tests::integration::utils::worker::WorkerConfigBuilder as WC; use crate::resources::ResourceDescriptor; +use crate::tests::integration::utils::task::ResourceRequestConfigBuilder; +use tokio::time::sleep; #[tokio::test] async fn test_submit_2_sleeps_on_1() { run_server_test(Default::default(), |mut handle| async move { + let rq = handle.register_default_request(); handle .submit( GraphBuilder::default() - .task(simple_task(&["sleep", "1"], 1)) - .task(simple_task(&["sleep", "1"], 2)) + 
.task(simple_task(&["sleep", "1"], 1, rq)) + .task(simple_task(&["sleep", "1"], 2, rq)) .build(), ) .await; @@ -51,11 +53,12 @@ async fn test_submit_2_sleeps_on_1() { #[tokio::test] async fn test_submit_2_sleeps_on_2() { run_server_test(Default::default(), |mut handler| async move { + let rq = handler.register_default_request(); handler .submit( GraphBuilder::default() - .task(simple_task(&["sleep", "1"], 1)) - .task(simple_task(&["sleep", "1"], 2)) + .task(simple_task(&["sleep", "1"], 1, rq)) + .task(simple_task(&["sleep", "1"], 2, rq)) .build(), ) .await; @@ -83,11 +86,12 @@ async fn test_submit_2_sleeps_on_2() { #[tokio::test] async fn test_submit_2_sleeps_on_separated_2() { run_server_test(Default::default(), |mut handler| async move { + let rq = handler.register_default_request(); handler .submit( GraphBuilder::default() - .task(simple_task(&["sleep", "1"], 1)) - .task(simple_task(&["sleep", "1"], 2)) + .task(simple_task(&["sleep", "1"], 1, rq)) + .task(simple_task(&["sleep", "1"], 2, rq)) .build(), ) .await; @@ -120,8 +124,8 @@ async fn test_submit_2_sleeps_on_separated_2() { #[tokio::test] async fn test_submit_sleeps_more_cpus1() { run_server_test(Default::default(), |mut handler| async move { - let rq1 = RR::default().cpus(3); - let rq2 = RR::default().cpus(2); + let rq1 = handler.register_request(RR::default().cpus(3)); + let rq2 = handler.register_request(RR::default().cpus(2)); handler .submit( GB::default() @@ -133,7 +137,7 @@ async fn test_submit_sleeps_more_cpus1() { .task( TC::default() .args(simple_args(&["sleep", "1"])) - .resources(rq2.clone()), + .resources(rq2), ) .task( TC::default() @@ -171,12 +175,12 @@ async fn test_submit_sleeps_more_cpus1() { #[tokio::test] async fn test_submit_sleeps_more_cpus2() { run_server_test(Default::default(), |mut handler| async move { - let rq1 = RR::default().cpus(3); - let rq2 = RR::default().cpus(2); - let t = |rq: &RR| { + let rq1 = handler.register_request(RR::default().cpus(3)); + let rq2 = 
handler.register_request(RR::default().cpus(2)); + let t = |rq: ResourceRqId| { TC::default() .args(simple_args(&["sleep", "1"])) - .resources(rq.clone()) + .resources(rq) }; handler @@ -191,10 +195,10 @@ async fn test_submit_sleeps_more_cpus2() { let ids = handler .submit( GB::default() - .task(t(&rq1)) - .task(t(&rq2)) - .task(t(&rq2)) - .task(t(&rq1)) + .task(t(rq1)) + .task(t(rq2)) + .task(t(rq2)) + .task(t(rq1)) .build(), ) .await; @@ -209,12 +213,12 @@ async fn test_submit_sleeps_more_cpus2() { #[tokio::test] async fn test_submit_sleeps_more_cpus3() { run_server_test(Default::default(), |mut handler| async move { - let rq1 = RR::default().cpus(3); - let rq2 = RR::default().cpus(2); - let t = |rq: &RR| { + let rq1 = handler.register_request(RR::default().cpus(3)); + let rq2 = handler.register_request(RR::default().cpus(2)); + let t = |rq: ResourceRqId| { TC::default() .args(simple_args(&["sleep", "1"])) - .resources(rq.clone()) + .resources(rq) }; handler @@ -229,10 +233,10 @@ async fn test_submit_sleeps_more_cpus3() { let ids = handler .submit( GB::default() - .task(t(&rq1)) - .task(t(&rq2)) - .task(t(&rq2)) - .task(t(&rq1)) + .task(t(rq1)) + .task(t(rq2)) + .task(t(rq2)) + .task(t(rq1)) .build(), ) .await; @@ -248,7 +252,7 @@ async fn test_submit_sleeps_more_cpus3() { #[tokio::test] async fn test_force_compact() { run_server_test(Default::default(), |mut handler| async move { - let rq = RR::default().add_force_compact("cpus", 4); + let rq = handler.register_request(RR::default().add_force_compact("cpus", 4)); handler .start_workers( diff --git a/crates/tako/src/internal/tests/integration/test_worker.rs b/crates/tako/src/internal/tests/integration/test_worker.rs index 72f456c83..91c1b01fd 100644 --- a/crates/tako/src/internal/tests/integration/test_worker.rs +++ b/crates/tako/src/internal/tests/integration/test_worker.rs @@ -111,10 +111,11 @@ async fn test_worker_lost_idle_timeout() { #[tokio::test] async fn test_worker_idle_timeout_stays_alive_with_tasks() { 
run_server_test(Default::default(), |mut handle| async move { + let rq = handle.register_default_request(); handle .submit( GraphBuilder::default() - .task(simple_task(&["sleep", "1"], 1)) + .task(simple_task(&["sleep", "1"], 1, rq)) .build(), ) .await; @@ -152,9 +153,9 @@ async fn test_panic_on_worker_lost() { async fn test_lost_worker_with_tasks_continue() { run_server_test(Default::default(), |mut handler| async move { let _workers = handler.start_workers(Default::default, 2).await.unwrap(); - + let rq = handler.register_default_request(); let task_ids = handler - .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1))) + .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1, rq))) .await; let running_on = wait_for_task_start(&mut handler, task_ids[0]).await; @@ -167,8 +168,9 @@ async fn test_lost_worker_with_tasks_continue() { #[tokio::test] async fn test_lost_worker_with_tasks_restarts() { run_server_test(Default::default(), |mut handle| async move { + let rq = handle.register_default_request(); handle - .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1))) + .submit(GraphBuilder::singleton(simple_task(&["sleep", "1"], 1, rq))) .await; for _ in 0..5 { diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index fa4096eab..8fc7c8e5a 100644 --- a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -1,5 +1,6 @@ use derive_builder::Builder; use orion::auth::SecretKey; +use smallvec::smallvec; use std::future::Future; use std::net::{Ipv4Addr, SocketAddr}; use std::rc::Rc; @@ -13,7 +14,11 @@ use tokio::time::timeout; use super::worker::WorkerConfigBuilder; use crate::control::ServerRef; use crate::events::EventProcessor; -use crate::gateway::{LostWorkerReason, SharedTaskConfiguration, TaskConfiguration, TaskSubmit}; +use crate::gateway::{ + LostWorkerReason, ResourceRequest, 
ResourceRequestVariants, SharedTaskConfiguration, + TaskConfiguration, TaskSubmit, +}; +use crate::internal::common::resources::ResourceRqId; use crate::internal::common::{Map, Set}; use crate::internal::messages::common::TaskFailInfo; use crate::internal::tests::integration::utils::api::{WaitResult, wait_for_tasks}; @@ -21,6 +26,7 @@ use crate::internal::tests::integration::utils::worker::{ WorkerContext, WorkerHandle, start_worker, }; use crate::task::SerializedTaskContext; +use crate::tests::integration::utils::task::{ResourceRequestConfig, ResourceRequestConfigBuilder}; use crate::worker::{WorkerConfiguration, WorkerOverview}; use crate::{InstanceId, ResourceVariantId, TaskId, WorkerId, WrappedRcRefCell}; @@ -111,6 +117,16 @@ impl ServerHandle { .await .unwrap() } + + pub fn register_request(&self, rr_builder: ResourceRequestConfigBuilder) -> ResourceRqId { + let rqv = rr_builder.into_rqv(); + self.server_ref.get_or_create_resource_rq_id(&rqv) + } + + pub fn register_default_request(&self) -> ResourceRqId { + let config = ResourceRequestConfigBuilder::default().cpus(1); + self.register_request(config) + } } #[derive(Clone)] diff --git a/crates/tako/src/internal/tests/integration/utils/task.rs b/crates/tako/src/internal/tests/integration/utils/task.rs index 8319c997e..582a25a03 100644 --- a/crates/tako/src/internal/tests/integration/utils/task.rs +++ b/crates/tako/src/internal/tests/integration/utils/task.rs @@ -12,7 +12,7 @@ use crate::gateway::{ SharedTaskConfiguration, TaskConfiguration, TaskDataFlags, }; use crate::internal::common::Map; -use crate::internal::common::resources::NumOfNodes; +use crate::internal::common::resources::{NumOfNodes, ResourceRqId}; use crate::program::{ProgramDefinition, StdioDef}; use crate::resources::{AllocationRequest, ResourceAmount}; @@ -65,8 +65,12 @@ impl GraphBuilder { self } - pub fn simple_task(self, args: &[&'static str]) -> Self { - self.task(TaskConfigBuilder::default().args(simple_args(args))) + pub fn 
simple_task(self, args: &[&'static str], rq_id: ResourceRqId) -> Self { + self.task( + TaskConfigBuilder::default() + .resources(rq_id) + .args(simple_args(args)), + ) } fn add_task_from_config(&mut self, config: TaskConfig) { @@ -94,11 +98,6 @@ pub fn build_task_def_from_config( stderr, cwd, }: TaskConfig = config; - let ResourceRequestConfig { - n_nodes, - entries, - min_time, - }: ResourceRequestConfig = resources.build().unwrap(); let program_def = ProgramDefinition { args: args.into_iter().map(|v| v.into()).collect(), @@ -114,13 +113,6 @@ pub fn build_task_def_from_config( .unwrap(); let conf = SharedTaskConfiguration { - resources: ResourceRequestVariants { - variants: smallvec![ResourceRequest { - n_nodes, - resources: entries.into(), - min_time, - }], - }, time_limit, priority: 0, crash_limit: CrashLimit::default(), @@ -130,6 +122,7 @@ pub fn build_task_def_from_config( ( TaskConfiguration { id: TaskId::new_test(id.unwrap_or(1)), + resource_rq_id: resources, shared_data_index: 0, task_deps: ThinVec::new(), dataobj_deps: ThinVec::new(), @@ -148,8 +141,7 @@ pub struct TaskConfig { #[builder(default)] time_limit: Option, - #[builder(default = "ResourceRequestConfigBuilder::default().cpus(1)")] - resources: ResourceRequestConfigBuilder, + resources: ResourceRqId, #[builder(default)] args: Vec, @@ -199,14 +191,34 @@ impl ResourceRequestConfigBuilder { self._add(name, AllocationRequest::ForceCompact(amount.into())); self } + + pub fn into_rqv(self) -> ResourceRequestVariants { + let ResourceRequestConfig { + n_nodes, + entries, + min_time, + }: ResourceRequestConfig = self.build().unwrap(); + ResourceRequestVariants { + variants: smallvec![ResourceRequest { + n_nodes, + resources: entries.into(), + min_time, + }], + } + } } pub fn simple_args(args: &[&'static str]) -> Vec { args.iter().map(|&v| v.to_string()).collect() } -pub fn simple_task(args: &[&'static str], id: u32) -> TaskConfigBuilder { +pub fn simple_task( + args: &[&'static str], + id: u32, + 
resource_rq_id: ResourceRqId, +) -> TaskConfigBuilder { TaskConfigBuilder::default() + .resources(resource_rq_id) .args(simple_args(args)) .id(Some(id)) } diff --git a/crates/tako/src/internal/tests/test_query.rs b/crates/tako/src/internal/tests/test_query.rs index d5aa5b251..a1e31a2d3 100644 --- a/crates/tako/src/internal/tests/test_query.rs +++ b/crates/tako/src/internal/tests/test_query.rs @@ -36,9 +36,10 @@ fn test_query_enough_workers() { create_test_workers(&mut core, &[2, 3]); - let t1 = TaskBuilder::new(1).cpus_compact(3).build(); - let t2 = TaskBuilder::new(2).cpus_compact(1).build(); - let t3 = TaskBuilder::new(3).cpus_compact(1).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).cpus_compact(3).build(rmap); + let t2 = TaskBuilder::new(2).cpus_compact(1).build(rmap); + let t3 = TaskBuilder::new(3).cpus_compact(1).build(rmap); submit_test_tasks(&mut core, vec![t1, t2, t3]); let mut scheduler = create_test_scheduler(); @@ -66,9 +67,10 @@ fn test_query_no_enough_workers1() { create_test_workers(&mut core, &[2, 3]); - let t1 = TaskBuilder::new(1).cpus_compact(3).build(); - let t2 = TaskBuilder::new(2).cpus_compact(3).build(); - let t3 = TaskBuilder::new(3).cpus_compact(1).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).cpus_compact(3).build(rmap); + let t2 = TaskBuilder::new(2).cpus_compact(3).build(rmap); + let t3 = TaskBuilder::new(3).cpus_compact(1).build(rmap); submit_test_tasks(&mut core, vec![t1, t2, t3]); let mut scheduler = create_test_scheduler(); @@ -300,9 +302,10 @@ fn test_query_multi_node_time_limit() { fn test_query_min_utilization1() { let mut core = Core::default(); - let t1 = TaskBuilder::new(1).cpus_compact(3).build(); - let t2 = TaskBuilder::new(2).cpus_compact(1).build(); - let t3 = TaskBuilder::new(3).cpus_compact(1).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).cpus_compact(3).build(rmap); + let t2 = 
TaskBuilder::new(2).cpus_compact(1).build(rmap); + let t3 = TaskBuilder::new(3).cpus_compact(1).build(rmap); submit_test_tasks(&mut core, vec![t1, t2, t3]); let mut scheduler = create_test_scheduler(); @@ -337,14 +340,15 @@ fn test_query_min_utilization1() { fn test_query_min_utilization2() { let mut core = Core::default(); + let rmap = core.get_resource_map_mut(); let t1 = TaskBuilder::new(1) .cpus_compact(1) .add_resource(1, 10) - .build(); + .build(rmap); let t2 = TaskBuilder::new(2) .cpus_compact(1) .add_resource(1, 10) - .build(); + .build(rmap); submit_test_tasks(&mut core, vec![t1, t2]); let mut scheduler = create_test_scheduler(); @@ -390,8 +394,9 @@ fn test_query_min_utilization2() { fn test_query_min_utilization3() { let mut core = Core::default(); - let t1 = TaskBuilder::new(1).cpus_compact(2).build(); - let t2 = TaskBuilder::new(2).cpus_compact(2).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).cpus_compact(2).build(rmap); + let t2 = TaskBuilder::new(2).cpus_compact(2).build(rmap); submit_test_tasks(&mut core, vec![t1, t2]); let descriptor = ResourceDescriptor::new( @@ -433,18 +438,20 @@ fn test_query_min_utilization_vs_partial() { (0, 0, 0), ] { let mut core = Core::default(); + let rmap = core.get_resource_map_mut(); let tasks: Vec<_> = (1..=cpu_tasks) - .map(|task_id| TaskBuilder::new(task_id).cpus_compact(2).build()) + .map(|task_id| TaskBuilder::new(task_id).cpus_compact(2).build(rmap)) .collect(); if !tasks.is_empty() { submit_test_tasks(&mut core, tasks); } + let rmap = core.get_resource_map_mut(); let tasks: Vec<_> = (10..10 + gpu_tasks) .map(|task_id| { TaskBuilder::new(task_id) .cpus_compact(2) .add_resource(1, 1) - .build() + .build(rmap) }) .collect(); if !tasks.is_empty() { @@ -476,14 +483,14 @@ fn test_query_min_utilization_vs_partial() { #[test] fn test_query_min_time2() { let mut core = Core::default(); - + let rmap = core.get_resource_map_mut(); let t1 = TaskBuilder::new(1) .cpus_compact(1) 
.time_request(100) .next_resources() .cpus_compact(4) .time_request(50) - .build(); + .build(rmap); submit_test_tasks(&mut core, vec![t1]); let mut scheduler = create_test_scheduler(); @@ -517,15 +524,15 @@ fn test_query_min_time2() { #[test] fn test_query_min_time1() { let mut core = Core::default(); - + let rmap = core.get_resource_map_mut(); let t1 = TaskBuilder::new(1) .cpus_compact(1) .time_request(100) - .build(); + .build(rmap); let t2 = TaskBuilder::new(2) .cpus_compact(10) .time_request(100) - .build(); + .build(rmap); submit_test_tasks(&mut core, vec![t1, t2]); let mut scheduler = create_test_scheduler(); @@ -803,10 +810,9 @@ fn test_query_unknown_do_not_add_extra() { #[test] fn test_query_after_task_cancel() { let mut rt = TestEnv::new(); - submit_test_tasks( - rt.core(), - vec![TaskBuilder::new(1).cpus_compact(10).build()], - ); + let rmap = rt.core().get_resource_map_mut(); + let t1 = TaskBuilder::new(1).cpus_compact(10).build(rmap); + submit_test_tasks(rt.core(), vec![t1]); create_test_worker(rt.core(), 102.into(), 1); rt.schedule(); let mut comm = create_test_comm(); diff --git a/crates/tako/src/internal/tests/test_reactor.rs b/crates/tako/src/internal/tests/test_reactor.rs index b9cc1574f..a3cb2bd3f 100644 --- a/crates/tako/src/internal/tests/test_reactor.rs +++ b/crates/tako/src/internal/tests/test_reactor.rs @@ -151,22 +151,23 @@ fn test_worker_add() { #[test] fn test_scheduler_priority() { let mut core = Core::default(); + let rmap = core.get_resource_map_mut(); let mut comm = create_test_comm(); //new_workers(&mut core, &mut comm, vec![1]); - let t1 = task(501); - let t2 = task_with_deps(502, &[&t1]); - let t3 = task(503); - let t4 = task_with_deps(504, &[&t2]); + let t1 = task(501, rmap); + let t2 = task_with_deps(502, &[&t1], rmap); + let t3 = task(503, rmap); + let t4 = task_with_deps(504, &[&t2], rmap); let task_id5 = TaskId::new(123.into(), 1.into()); - let t5 = TaskBuilder::new(task_id5).build(); + let t5 = 
TaskBuilder::new(task_id5).build(rmap); let task_id6 = TaskId::new(122.into(), 0.into()); - let t6 = TaskBuilder::new(task_id6).build(); + let t6 = TaskBuilder::new(task_id6).build(rmap); let task_id7 = TaskId::new(123.into(), 2.into()); - let t7 = TaskBuilder::new(task_id7).task_deps(&[&t5]).build(); + let t7 = TaskBuilder::new(task_id7).task_deps(&[&t5]).build(rmap); let task_id8 = TaskId::new(123.into(), 4.into()); - let t8 = TaskBuilder::new(task_id8).build(); + let t8 = TaskBuilder::new(task_id8).build(rmap); on_new_tasks(&mut core, &mut comm, vec![t1, t2, t3, t4, t5, t6, t7, t8]); @@ -195,9 +196,9 @@ fn test_submit_jobs() { let mut core = Core::default(); let mut comm = create_test_comm(); //new_workers(&mut core, &mut comm, vec![1]); - - let t1 = task(501); - let t2 = task_with_deps(502, &[&t1]); + let rmap = core.get_resource_map_mut(); + let t1 = task(501, rmap); + let t2 = task_with_deps(502, &[&t1], rmap); on_new_tasks(&mut core, &mut comm, vec![t1, t2]); comm.check_need_scheduling(); @@ -210,10 +211,13 @@ fn test_submit_jobs() { check_task_consumers_exact(t1, &[t2]); - let t3 = task(604); - let t4 = task_with_deps(602, &[t1, &t3]); - let t5 = task_with_deps(603, &[&t3]); - let t6 = task_with_deps(601, &[&t3, &t4, &t5, t2]); + let (tasks, rmap) = core.split_tasks_resource_map_mut(); + let t1 = tasks.get_task(501.into()); + let t2 = tasks.get_task(502.into()); + let t3 = task(604, rmap); + let t4 = task_with_deps(602, &[t1, &t3], rmap); + let t5 = task_with_deps(603, &[&t3], rmap); + let t6 = task_with_deps(601, &[&t3, &t4, &t5, t2], rmap); on_new_tasks(&mut core, &mut comm, vec![t3, t4, t5, t6]); comm.check_need_scheduling(); @@ -253,12 +257,13 @@ fn test_assignments_and_finish() { t3[k] t7[k] */ - let t1 = TaskBuilder::new(11).user_priority(12).build(); - let t2 = task(12); - let t3 = task_with_deps(13, &[&t1, &t2]); - let t4 = task(14); - let t5 = task(15); - let t7 = task_with_deps(17, &[&t4]); + let rmap = core.get_resource_map_mut(); + let t1 = 
TaskBuilder::new(11).user_priority(12).build(rmap); + let t2 = task(12, rmap); + let t3 = task_with_deps(13, &[&t1, &t2], rmap); + let t4 = task(14, rmap); + let t5 = task(15, rmap); + let t7 = task_with_deps(17, &[&t4], rmap); let (id1, id2, id3, id5, id7) = (t1.id, t2.id, t3.id, t5.id, t7.id); @@ -526,7 +531,8 @@ fn finish_unassigned_task() { fn finish_task_without_outputs() { let mut core = Core::default(); create_test_workers(&mut core, &[1]); - let t1 = task_with_deps(1, &[]); + let rmap = core.get_resource_map_mut(); + let t1 = task_with_deps(1, &[], rmap); submit_test_tasks(&mut core, vec![t1]); assign_to_worker(&mut core, 1, 100); @@ -544,9 +550,10 @@ fn test_task_cancel() { create_test_workers(&mut core, &[1, 1, 1]); submit_example_1(&mut core); - let t40 = task(40); - let t41 = task(41); - let t42 = task(42); + let rmap = core.get_resource_map_mut(); + let t40 = task(40, rmap); + let t41 = task(41, rmap); + let t42 = task(42, rmap); submit_test_tasks(&mut core, vec![t40, t41, t42]); assign_to_worker(&mut core, 11, 101); @@ -592,7 +599,8 @@ fn test_task_cancel() { fn test_worker_lost_with_mn_task_non_root() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1, 1]); - let task1 = TaskBuilder::new(1).n_nodes(3).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(3).build(rmap); submit_test_tasks(&mut core, vec![task1]); start_mn_task_on_worker( &mut core, @@ -624,7 +632,8 @@ fn test_worker_lost_with_mn_task_non_root() { fn test_worker_lost_with_mn_task_root() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1, 1]); - let task1 = TaskBuilder::new(1).n_nodes(3).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(3).build(rmap); submit_test_tasks(&mut core, vec![task1]); start_mn_task_on_worker( &mut core, @@ -652,8 +661,8 @@ fn test_worker_lost_with_mn_task_root() { #[test] fn test_worker_crashing_task() { let mut core = 
Core::default(); - - let t1 = task(1); + let rmap = core.get_resource_map_mut(); + let t1 = task(1, rmap); submit_test_tasks(&mut core, vec![t1]); assert_eq!(core.get_task(TaskId::new_test(1)).crash_counter, 0); @@ -695,7 +704,8 @@ fn test_worker_crashing_task() { fn test_task_mn_fail() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1, 1]); - let task1 = TaskBuilder::new(1).n_nodes(3).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(3).build(rmap); submit_test_tasks(&mut core, vec![task1]); start_mn_task_on_worker( &mut core, @@ -732,7 +742,8 @@ fn test_task_mn_fail() { fn test_task_mn_cancel() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1, 1]); - let task1 = TaskBuilder::new(1).n_nodes(3).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(3).build(rmap); submit_test_tasks(&mut core, vec![task1]); start_mn_task_on_worker( &mut core, @@ -763,8 +774,9 @@ fn test_task_mn_cancel() { fn test_running_task() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1]); - let t1 = task(1); - let t2 = task(2); + let rmap = core.get_resource_map_mut(); + let t1 = task(1, rmap); + let t2 = task(2, rmap); submit_test_tasks(&mut core, vec![t1, t2]); assign_to_worker(&mut core, 1, 101); assign_to_worker(&mut core, 2, 101); @@ -820,7 +832,8 @@ fn test_running_task() { fn test_finished_before_steal_response() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1]); - let t1 = task(1); + let rmap = core.get_resource_map_mut(); + let t1 = task(1, rmap); submit_test_tasks(&mut core, vec![t1]); assign_to_worker(&mut core, 1, 101); start_stealing(&mut core, 1, 102); @@ -855,7 +868,8 @@ fn test_finished_before_steal_response() { fn test_running_before_steal_response() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1]); - let t1 = task(1); + let rmap = core.get_resource_map_mut(); + let 
t1 = task(1, rmap); submit_test_tasks(&mut core, vec![t1]); assign_to_worker(&mut core, 1, 101); start_stealing(&mut core, 1, 102); @@ -887,7 +901,8 @@ fn test_running_before_steal_response() { #[test] fn test_ready_to_assign_is_empty_after_cancel() { let mut core = Core::default(); - let t1 = task(1); + let rmap = core.get_resource_map_mut(); + let t1 = task(1, rmap); submit_test_tasks(&mut core, vec![t1]); cancel_tasks(&mut core, &[1]); assert!(core.take_single_node_ready_to_assign().is_empty()); @@ -897,10 +912,11 @@ fn test_ready_to_assign_is_empty_after_cancel() { fn test_after_cancel_messages() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1]); - let t1 = task(1); - let t2 = task(2); - let t3 = task(3); - let t4 = task(4); + let rmap = core.get_resource_map_mut(); + let t1 = task(1, rmap); + let t2 = task(2, rmap); + let t3 = task(3, rmap); + let t4 = task(4, rmap); submit_test_tasks(&mut core, vec![t1, t2, t3, t4]); assign_to_worker(&mut core, 1, 101); assign_to_worker(&mut core, 2, 101); @@ -954,8 +970,9 @@ fn lost_worker_with_running_and_assign_tasks() { create_test_workers(&mut core, &[1, 1, 1]); submit_example_1(&mut core); - let t40 = task(40); - let t41 = task(41); + let rmap = core.get_resource_map_mut(); + let t40 = task(40, rmap); + let t41 = task(41, rmap); submit_test_tasks(&mut core, vec![t40, t41]); assign_to_worker(&mut core, 11, 101); @@ -1141,8 +1158,9 @@ fn test_worker_groups() { fn test_data_deps_no_output() { let mut core = Core::default(); create_test_workers(&mut core, &[4]); - let t1 = TaskBuilder::new(1).build(); - let t2 = TaskBuilder::new(2).data_dep(&t1, 11).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).build(rmap); + let t2 = TaskBuilder::new(2).data_dep(&t1, 11).build(rmap); submit_test_tasks(&mut core, vec![t1, t2]); assign_to_worker(&mut core, 1, 100); core.sanity_check(); @@ -1171,13 +1189,14 @@ fn test_data_deps_no_output() { fn test_data_deps_missing_outputs() { 
let mut core = Core::default(); create_test_workers(&mut core, &[4]); - let t1 = TaskBuilder::new(1).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).build(rmap); let t2 = TaskBuilder::new(2) .data_dep(&t1, 10) .data_dep(&t1, 11) .data_dep(&t1, 100) .data_dep(&t1, 101) - .build(); + .build(rmap); submit_test_tasks(&mut core, vec![t1, t2]); assign_to_worker(&mut core, 1, 100); core.sanity_check(); @@ -1229,12 +1248,13 @@ fn test_data_deps_missing_outputs() { #[test] fn test_data_deps_basic() { let mut core = Core::default(); - let t1 = TaskBuilder::new(1).build(); - let t2 = TaskBuilder::new(2).data_dep(&t1, 0).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).build(rmap); + let t2 = TaskBuilder::new(2).data_dep(&t1, 0).build(rmap); let t3 = TaskBuilder::new(3) .data_dep(&t2, 123) .data_dep(&t2, 478) - .build(); + .build(rmap); submit_test_tasks(&mut core, vec![t1, t2, t3]); assert_eq!(core.get_task(2.into()).task_deps, [TaskId::new_test(1)]); core.assert_waiting(&[2, 3]); diff --git a/crates/tako/src/internal/tests/test_scheduler_mn.rs b/crates/tako/src/internal/tests/test_scheduler_mn.rs index 9bbd5d580..92c8b10fd 100644 --- a/crates/tako/src/internal/tests/test_scheduler_mn.rs +++ b/crates/tako/src/internal/tests/test_scheduler_mn.rs @@ -64,13 +64,13 @@ fn check_worker_status_change(s1: WorkerStatus, s2: WorkerStatus, ms: &[ToWorker fn test_schedule_mn_simple() { let mut core = Core::default(); create_test_workers(&mut core, &[5, 5, 5, 5, 5]); - + let rmap = core.get_resource_map_mut(); let tasks: Vec = (1..=4) .map(|i| { TaskBuilder::new(i) .user_priority(i as Priority) .n_nodes(2) - .build() + .build(rmap) }) .collect(); submit_test_tasks(&mut core, tasks); @@ -126,9 +126,10 @@ fn test_schedule_mn_reserve() { let mut core = Core::default(); create_test_workers(&mut core, &[1, 1, 1]); - let task1 = TaskBuilder::new(1).user_priority(10).n_nodes(3).build(); - let task2 = 
TaskBuilder::new(2).user_priority(5).n_nodes(2).build(); - let task3 = TaskBuilder::new(3).user_priority(0).n_nodes(3).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).user_priority(10).n_nodes(3).build(rmap); + let task2 = TaskBuilder::new(2).user_priority(5).n_nodes(2).build(rmap); + let task3 = TaskBuilder::new(3).user_priority(0).n_nodes(3).build(rmap); submit_test_tasks(&mut core, vec![task1, task2, task3]); core.sanity_check(); @@ -193,10 +194,11 @@ fn test_schedule_mn_fill() { &mut core, &[/* 11 workers */ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], ); - let task1 = TaskBuilder::new(1).n_nodes(3).build(); - let task2 = TaskBuilder::new(2).n_nodes(5).build(); - let task3 = TaskBuilder::new(3).n_nodes(1).build(); - let task4 = TaskBuilder::new(4).n_nodes(2).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(3).build(rmap); + let task2 = TaskBuilder::new(2).n_nodes(5).build(rmap); + let task3 = TaskBuilder::new(3).n_nodes(1).build(rmap); + let task4 = TaskBuilder::new(4).n_nodes(2).build(rmap); submit_test_tasks(&mut core, vec![task1, task2, task3, task4]); let mut scheduler = create_test_scheduler(); scheduler.run_scheduling(&mut core, &mut comm); @@ -215,10 +217,15 @@ fn test_mn_not_enough() { let mut comm = create_test_comm(); create_test_workers(&mut core, &[4]); - let task1 = TaskBuilder::new(1).n_nodes(3).build(); - let task2 = TaskBuilder::new(2).n_nodes(5).build(); - let task3 = TaskBuilder::new(3).n_nodes(11).build(); - let task4 = TaskBuilder::new(4).n_nodes(2).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(3).build(rmap); + let task2 = TaskBuilder::new(2).n_nodes(5).build(rmap); + let task3 = TaskBuilder::new(3).n_nodes(11).build(rmap); + let task4 = TaskBuilder::new(4).n_nodes(2).build(rmap); + let r1 = rmap.get_resource_rq_id(&ResBuilder::default().n_nodes(3).finish_v()); + let r2 = 
rmap.get_resource_rq_id(&ResBuilder::default().n_nodes(5).finish_v()); + let r3 = rmap.get_resource_rq_id(&ResBuilder::default().n_nodes(11).finish_v()); + let r4 = rmap.get_resource_rq_id(&ResBuilder::default().n_nodes(2).finish_v()); submit_test_tasks(&mut core, vec![task1, task2, task3, task4]); let mut scheduler = create_test_scheduler(); scheduler.run_scheduling(&mut core, &mut comm); @@ -231,11 +238,10 @@ fn test_mn_not_enough() { } let (mn_queue, _, _) = core.multi_node_queue_split(); - - assert!(mn_queue.is_sleeping(&ResBuilder::default().n_nodes(3).finish())); - assert!(mn_queue.is_sleeping(&ResBuilder::default().n_nodes(5).finish())); - assert!(mn_queue.is_sleeping(&ResBuilder::default().n_nodes(11).finish())); - assert!(mn_queue.is_sleeping(&ResBuilder::default().n_nodes(2).finish())); + assert!(mn_queue.is_sleeping(r1)); + assert!(mn_queue.is_sleeping(r2)); + assert!(mn_queue.is_sleeping(r3)); + assert!(mn_queue.is_sleeping(r4)); } #[test] @@ -243,7 +249,8 @@ fn test_mn_sleep_wakeup_one_by_one() { let mut core = Core::default(); let mut comm = create_test_comm(); - let task1 = TaskBuilder::new(1).n_nodes(4).user_priority(10).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(4).user_priority(10).build(rmap); submit_test_tasks(&mut core, vec![task1]); create_test_workers(&mut core, &[4, 1]); @@ -253,7 +260,8 @@ fn test_mn_sleep_wakeup_one_by_one() { core.sanity_check(); assert!(core.task_map().get_task(1.into()).is_waiting()); - let task2 = TaskBuilder::new(2).n_nodes(2).user_priority(1).build(); + let rmap = core.get_resource_map_mut(); + let task2 = TaskBuilder::new(2).n_nodes(2).user_priority(1).build(rmap); submit_test_tasks(&mut core, vec![task2]); scheduler.run_scheduling(&mut core, &mut comm); core.sanity_check(); @@ -275,8 +283,9 @@ fn test_mn_sleep_wakeup_at_once() { let mut comm = create_test_comm(); create_test_workers(&mut core, &[4, 1]); - let task1 = 
TaskBuilder::new(1).n_nodes(4).user_priority(10).build(); - let task2 = TaskBuilder::new(2).n_nodes(2).user_priority(1).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(4).user_priority(10).build(rmap); + let task2 = TaskBuilder::new(2).n_nodes(2).user_priority(1).build(rmap); submit_test_tasks(&mut core, vec![task1, task2]); let mut scheduler = create_test_scheduler(); @@ -302,7 +311,8 @@ fn test_mn_schedule_on_groups() { new_test_worker(&mut core, worker_id, wcfg2, &resource_map); let mut comm = create_test_comm(); - let task1 = TaskBuilder::new(1).n_nodes(2).build(); + let rmap = core.get_resource_map_mut(); + let task1 = TaskBuilder::new(1).n_nodes(2).build(rmap); submit_test_tasks(&mut core, vec![task1]); let mut scheduler = create_test_scheduler(); diff --git a/crates/tako/src/internal/tests/test_scheduler_sn.rs b/crates/tako/src/internal/tests/test_scheduler_sn.rs index c7cd23604..2fd3e47b2 100644 --- a/crates/tako/src/internal/tests/test_scheduler_sn.rs +++ b/crates/tako/src/internal/tests/test_scheduler_sn.rs @@ -29,7 +29,8 @@ fn test_no_deps_scattering_1() { let mut core = Core::default(); create_test_workers(&mut core, &[5, 5, 5]); - let tasks: Vec = (1..=4).map(task).collect(); + let rmap = core.get_resource_map_mut(); + let tasks: Vec = (1..=4).map(|id| task(id, rmap)).collect(); submit_test_tasks(&mut core, tasks); let mut scheduler = create_test_scheduler(); @@ -57,9 +58,9 @@ fn test_no_deps_scattering_2() { let mut scheduler = create_test_scheduler(); let mut comm = create_test_comm(); - let mut submit_and_check = |id, expected| { - let t = task(id); + let rmap = core.get_resource_map_mut(); + let t = task(id, rmap); submit_test_tasks(&mut core, vec![t]); scheduler.run_scheduling_without_balancing(&mut core, &mut comm); let mut counts: Vec<_> = core.get_workers().map(|w| w.sn_tasks().len()).collect(); @@ -91,7 +92,8 @@ fn test_no_deps_distribute_without_balance() { let mut core = Core::default(); 
create_test_workers(&mut core, &[10, 10, 10]); - let tasks: Vec = (1..=150).map(task).collect(); + let rmap = core.get_resource_map_mut(); + let tasks: Vec = (1..=150).map(|id| task(id, rmap)).collect(); submit_test_tasks(&mut core, tasks); let mut scheduler = create_test_scheduler(); @@ -122,7 +124,8 @@ fn test_no_deps_distribute_with_balance() { } let mut active_ids: Set = (1..301).map(|id| id.into()).collect(); - let tasks: Vec = (1..301).map(task).collect(); + let rmap = core.get_resource_map_mut(); + let tasks: Vec = (1..301).map(|id| task(id, rmap)).collect(); submit_test_tasks(&mut core, tasks); let mut scheduler = create_test_scheduler(); @@ -839,14 +842,15 @@ fn test_task_data_deps_balancing() { for odd in [0u32, 1u32] { for late_worker in [true, false] { let mut core = Core::default(); - let t1 = TaskBuilder::new(1).build(); - let t2 = TaskBuilder::new(2).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).build(rmap); + let t2 = TaskBuilder::new(2).build(rmap); let mut ts: Vec<_> = (10u32..110u32) .map(|i| { TaskBuilder::new(TaskId::new_test(i)) .data_dep(&t1, i - 10) .data_dep(&t2, i - 10) - .build() + .build(rmap) }) .collect(); ts.insert(0, t1); diff --git a/crates/tako/src/internal/tests/test_worker.rs b/crates/tako/src/internal/tests/test_worker.rs index b5286b248..a93c2c944 100644 --- a/crates/tako/src/internal/tests/test_worker.rs +++ b/crates/tako/src/internal/tests/test_worker.rs @@ -1,5 +1,6 @@ use crate::gateway::TaskDataFlags; -use crate::internal::common::resources::ResourceRequestVariants; +use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; +use crate::internal::common::resources::{ResourceRequestVariants, ResourceRqId}; use crate::internal::messages::worker::{ ComputeTaskSeparateData, ComputeTaskSharedData, ComputeTasksMsg, NewWorkerMsg, ToWorkerMessage, WorkerResourceCounts, @@ -58,7 +59,10 @@ fn create_test_worker_config() -> WorkerConfiguration { } } -fn 
create_test_worker_state(config: WorkerConfiguration) -> WorkerStateRef { +fn create_test_worker_state( + config: WorkerConfiguration, + resource_rq_map: ResourceRqMap, +) -> WorkerStateRef { let resource_map = ResourceIdMap::from_vec( config .resources @@ -73,17 +77,18 @@ fn create_test_worker_state(config: WorkerConfiguration) -> WorkerStateRef { config, None, resource_map, - Default::default(), + resource_rq_map, Box::new(TestLauncher), "testuid".to_string(), ) } -fn create_dummy_compute_msg(task_id: TaskId) -> ComputeTasksMsg { +fn create_dummy_compute_msg(task_id: TaskId, resource_rq_id: ResourceRqId) -> ComputeTasksMsg { ComputeTasksMsg { tasks: vec![ComputeTaskSeparateData { shared_index: 0, id: task_id, + resource_rq_id, instance_id: Default::default(), scheduler_priority: 0, node_list: vec![], @@ -92,7 +97,6 @@ fn create_dummy_compute_msg(task_id: TaskId) -> ComputeTasksMsg { }], shared_data: vec![ComputeTaskSharedData { user_priority: 0, - resources: Default::default(), time_limit: None, data_flags: TaskDataFlags::empty(), body: Default::default(), @@ -102,17 +106,13 @@ fn create_dummy_compute_msg(task_id: TaskId) -> ComputeTasksMsg { #[test] fn test_worker_start_task() { + let mut rmap = GlobalResourceMapping::default(); + let rqv = ResourceRequestBuilder::default().cpus(3).finish_v(); + let (rq_id, _) = rmap.get_or_create_resource_rq_id(&rqv); + let config = create_test_worker_config(); - let state_ref = create_test_worker_state(config); - let mut msg = create_dummy_compute_msg(7.into()); - /*let mut entries = ResourceRequestEntries::new(); - entries.push(ResourceRequestEntry { - resource_id: 0.into(), - request: AllocationRequest::Compact(3), - }); - let rq = ResourceRequest::new(0, TimeRequest::default(), entries);*/ - let rq = ResourceRequestBuilder::default().cpus(3).finish_v(); - msg.shared_data[0].resources = rq.clone(); + let state_ref = create_test_worker_state(config, rmap.get_resource_rq_map().clone()); + let msg = 
create_dummy_compute_msg(7.into(), rq_id); let mut state = state_ref.get_mut(); process_worker_message(&mut state, ToWorkerMessage::ComputeTasks(msg)); let comm = state.comm().test(); @@ -123,18 +123,21 @@ fn test_worker_start_task() { assert!(state.running_tasks.is_empty()); let requests = state.ready_task_queue.requests(); assert_eq!(requests.len(), 1); - assert_eq!(requests[0], rq); + assert_eq!(requests[0], rqv); } -#[test] +/*#[test] fn test_worker_start_task_resource_variants() { - let config = create_test_worker_config(); - let state_ref = create_test_worker_state(config); - let mut msg = create_dummy_compute_msg(7.into()); + let mut rmap = GlobalResourceMapping::default(); + let rqv = ResourceRequestBuilder::default().cpus(3).finish_v(); let rq1 = ResourceRequestBuilder::default().cpus(2).add(1, 1).finish(); let rq2 = ResourceRequestBuilder::default().cpus(4).finish(); let rq = ResourceRequestVariants::new(smallvec![rq1.clone(), rq2.clone()]); - msg.shared_data[0].resources = rq.clone(); + let (rq_id, _) = rmap.get_or_create_resource_rq_id(&rqv); + + let config = create_test_worker_config(); + let state_ref = create_test_worker_state(config, rmap.get_resource_rq_map().clone()); + let msg = create_dummy_compute_msg(7.into(), rq_id); let mut state = state_ref.get_mut(); process_worker_message(&mut state, ToWorkerMessage::ComputeTasks(msg)); let comm = state.comm().test(); @@ -147,10 +150,12 @@ fn test_worker_start_task_resource_variants() { assert_eq!(requests.len(), 1); assert_eq!(requests[0], rq); } + */ #[test] fn test_worker_other_workers() { - let state_ref = create_test_worker_state(create_test_worker_config()); + let rmap = ResourceRqMap::default(); + let state_ref = create_test_worker_state(create_test_worker_config(), rmap); let mut state = state_ref.get_mut(); assert!(state.worker_addresses.is_empty()); assert!(state.ready_task_queue.worker_resources().is_empty()); diff --git a/crates/tako/src/internal/tests/utils/env.rs 
b/crates/tako/src/internal/tests/utils/env.rs index 6449ac830..391e5c658 100644 --- a/crates/tako/src/internal/tests/utils/env.rs +++ b/crates/tako/src/internal/tests/utils/env.rs @@ -62,7 +62,7 @@ impl TestEnv { } pub fn new_task(&mut self, builder: TaskBuilder) -> &Task { - let task = builder.build(); + let task = builder.build(self.core.get_resource_map_mut()); let task_id = task.id; schedule::submit_test_tasks(&mut self.core, vec![task]); self.task(task_id) @@ -75,14 +75,14 @@ impl TestEnv { } pub fn new_task_assigned>(&mut self, builder: TaskBuilder, worker_id: W) { - let task = builder.build(); + let task = builder.build(self.core.get_resource_map_mut()); let task_id = task.id(); schedule::submit_test_tasks(&mut self.core, vec![task]); schedule::assign_to_worker(&mut self.core, task_id, worker_id.into()); } pub fn new_task_running>(&mut self, builder: TaskBuilder, worker_id: W) { - let task = builder.build(); + let task = builder.build(self.core.get_resource_map_mut()); let task_id = task.id(); schedule::submit_test_tasks(&mut self.core, vec![task]); schedule::start_on_worker_running(&mut self.core, task_id, worker_id.into()); @@ -146,6 +146,7 @@ impl TestEnv { } pub fn new_ready_tasks_cpus(&mut self, tasks: &[ResourceUnits]) -> Vec { + let rmap = self.core.get_resource_map_mut(); let tasks: Vec<_> = tasks .iter() .map(|n_cpus| { @@ -153,7 +154,7 @@ impl TestEnv { self.task_id_counter += 1; TaskBuilder::new(task_id) .resources(cpus_compact(*n_cpus)) - .build() + .build(rmap) }) .collect(); let task_ids: Vec<_> = tasks.iter().map(|t| t.id).collect(); @@ -233,9 +234,9 @@ impl TestEnv { "Worker {} {}", worker.id, format_comma_delimited(worker.sn_tasks().iter().map(|&task_id| format!( - "{}:{:?}", + "{} -> {}", task_id, - self.core.get_task(task_id).configuration.resources + self.core.get_task(task_id).resource_rq_id ))) ); } diff --git a/crates/tako/src/internal/tests/utils/task.rs b/crates/tako/src/internal/tests/utils/task.rs index 14467fb1c..53a214a88 100644 
--- a/crates/tako/src/internal/tests/utils/task.rs +++ b/crates/tako/src/internal/tests/utils/task.rs @@ -1,6 +1,7 @@ use super::resources::ResBuilder; use crate::datasrv::DataObjectId; use crate::gateway::{CrashLimit, TaskDataFlags}; +use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; use crate::internal::common::resources::{ NumOfNodes, ResourceAmount, ResourceId, ResourceRequestVariants, }; @@ -90,7 +91,7 @@ impl TaskBuilder { self } - pub fn build(self) -> Task { + pub fn build(self, resource_map: &mut GlobalResourceMapping) -> Task { let last_resource = self.resources_builder.finish(); let mut resources: SmallVec<[ResourceRequest; 1]> = self.finished_resources.into(); resources.push(last_resource); @@ -98,13 +99,14 @@ impl TaskBuilder { rq.validate().unwrap(); } let resources = ResourceRequestVariants::new(resources); + let (rq_id, _) = resource_map.get_or_create_resource_rq_id(&resources); Task::new( self.id, + rq_id, self.task_deps.into_iter().collect(), self.data_deps, None, Rc::new(TaskConfiguration { - resources, time_limit: None, user_priority: self.user_priority, crash_limit: self.crash_limit, @@ -115,12 +117,18 @@ impl TaskBuilder { } } -pub fn task>(id: T) -> Task { - TaskBuilder::new(id.into()).build() +pub fn task>(id: T, resource_map: &mut GlobalResourceMapping) -> Task { + TaskBuilder::new(id.into()).build(resource_map) } -pub fn task_with_deps>(id: T, deps: &[&Task]) -> Task { - TaskBuilder::new(id.into()).task_deps(deps).build() +pub fn task_with_deps>( + id: T, + deps: &[&Task], + resource_map: &mut GlobalResourceMapping, +) -> Task { + TaskBuilder::new(id.into()) + .task_deps(deps) + .build(resource_map) } pub fn task_running_msg>(task_id: T) -> TaskRunningMsg { diff --git a/crates/tako/src/internal/tests/utils/workflows.rs b/crates/tako/src/internal/tests/utils/workflows.rs index 708ed09ba..fb84e5896 100644 --- a/crates/tako/src/internal/tests/utils/workflows.rs +++ 
b/crates/tako/src/internal/tests/utils/workflows.rs @@ -14,14 +14,14 @@ pub fn submit_example_1(core: &mut Core) { | 17 */ - - let t1 = task::task(11); - let t2 = task::task(12); - let t3 = task_with_deps(13, &[&t1, &t2]); - let t4 = task_with_deps(14, &[&t2]); - let t5 = task_with_deps(15, &[&t3, &t4]); - let t6 = task_with_deps(16, &[&t3]); - let t7 = task_with_deps(17, &[&t6]); + let rmap = core.get_resource_map_mut(); + let t1 = task::task(11, rmap); + let t2 = task::task(12, rmap); + let t3 = task_with_deps(13, &[&t1, &t2], rmap); + let t4 = task_with_deps(14, &[&t2], rmap); + let t5 = task_with_deps(15, &[&t3, &t4], rmap); + let t6 = task_with_deps(16, &[&t3], rmap); + let t7 = task_with_deps(17, &[&t6], rmap); submit_test_tasks(core, vec![t1, t2, t3, t4, t5, t6, t7]); } @@ -37,13 +37,14 @@ pub fn submit_example_2(core: &mut Core) { T5 */ - let t1 = task_with_deps(1, &[]); - let t2 = task_with_deps(2, &[&t1]); - let t3 = task_with_deps(3, &[&t1]); - let t4 = task_with_deps(4, &[&t2, &t3]); - let t5 = task_with_deps(5, &[&t4]); - let t6 = task_with_deps(6, &[&t3]); - let t7 = task_with_deps(7, &[&t6]); + let rmap = core.get_resource_map_mut(); + let t1 = task_with_deps(1, &[], rmap); + let t2 = task_with_deps(2, &[&t1], rmap); + let t3 = task_with_deps(3, &[&t1], rmap); + let t4 = task_with_deps(4, &[&t2, &t3], rmap); + let t5 = task_with_deps(5, &[&t4], rmap); + let t6 = task_with_deps(6, &[&t3], rmap); + let t7 = task_with_deps(7, &[&t6], rmap); submit_test_tasks(core, vec![t1, t2, t3, t4, t5, t6, t7]); } @@ -57,13 +58,13 @@ pub fn submit_example_3(core: &mut Core) { \ / T6 */ - - let t1 = TaskBuilder::new(1).task_deps(&[]).build(); - let t2 = TaskBuilder::new(2).task_deps(&[]).build(); - let t3 = TaskBuilder::new(3).task_deps(&[&t1]).build(); - let t4 = TaskBuilder::new(4).task_deps(&[&t1, &t2]).build(); - let t5 = TaskBuilder::new(5).task_deps(&[&t2]).build(); - let t6 = TaskBuilder::new(6).task_deps(&[&t1, &t5, &t3]).build(); + let rmap = 
core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).task_deps(&[]).build(rmap); + let t2 = TaskBuilder::new(2).task_deps(&[]).build(rmap); + let t3 = TaskBuilder::new(3).task_deps(&[&t1]).build(rmap); + let t4 = TaskBuilder::new(4).task_deps(&[&t1, &t2]).build(rmap); + let t5 = TaskBuilder::new(5).task_deps(&[&t2]).build(rmap); + let t6 = TaskBuilder::new(6).task_deps(&[&t1, &t5, &t3]).build(rmap); submit_test_tasks(core, vec![t1, t2, t3, t4, t5, t6]); } @@ -77,13 +78,14 @@ pub fn submit_example_4(core: &mut Core) { T3 */ - let t1 = TaskBuilder::new(1).build(); - let t2 = TaskBuilder::new(2).build(); + let rmap = core.get_resource_map_mut(); + let t1 = TaskBuilder::new(1).build(rmap); + let t2 = TaskBuilder::new(2).build(rmap); let t3 = TaskBuilder::new(3) .data_dep(&t1, 0) .data_dep(&t2, 0) .data_dep(&t2, 1) - .build(); + .build(rmap); submit_test_tasks(core, vec![t1, t2, t3]); } diff --git a/crates/tako/src/internal/worker/rpc.rs b/crates/tako/src/internal/worker/rpc.rs index 0c3b3d208..fba4f6866 100644 --- a/crates/tako/src/internal/worker/rpc.rs +++ b/crates/tako/src/internal/worker/rpc.rs @@ -401,7 +401,8 @@ pub(crate) fn process_worker_message(state: &mut WorkerState, message: ToWorkerM } else { shared.clone() }; - state.add_task(Task::new(task, shared, task_state)); + let rqv = state.get_resource_rq(task.resource_rq_id); + state.add_task(Task::new(task, rqv.clone(), shared, task_state)); } } ToWorkerMessage::StealTasks(msg) => { @@ -444,9 +445,7 @@ pub(crate) fn process_worker_message(state: &mut WorkerState, message: ToWorkerM ToWorkerMessage::SetOverviewIntervalOverride(r#override) => { state.worker_overview_interval_override = r#override; } - ToWorkerMessage::NewResourceRequest(rq_id, rqv) => { - todo!() - } + ToWorkerMessage::NewResourceRequest(rq_id, rqv) => state.register_resource_rq(rq_id, rqv), } false } diff --git a/crates/tako/src/internal/worker/state.rs b/crates/tako/src/internal/worker/state.rs index 7a8f2e0b5..3a5a77bcc 100644 --- 
a/crates/tako/src/internal/worker/state.rs +++ b/crates/tako/src/internal/worker/state.rs @@ -1,6 +1,6 @@ use crate::datasrv::DataObjectId; use crate::internal::common::resources::map::{ResourceIdMap, ResourceRqMap}; -use crate::internal::common::resources::{Allocation, ResourceId}; +use crate::internal::common::resources::{Allocation, ResourceId, ResourceRqId}; use crate::internal::common::stablemap::StableMap; use crate::internal::common::{Map, Set, WrappedRcRefCell}; use crate::internal::datasrv::{DataObjectRef, DataStorage}; @@ -26,6 +26,7 @@ use crate::internal::worker::rqueue::ResourceWaitQueue; use crate::internal::worker::task::{RunningState, Task, TaskState}; use crate::internal::worker::task_comm::RunningTaskComm; use crate::launcher::TaskLauncher; +use crate::resources::ResourceRequestVariants; use crate::{PriorityTuple, TaskId}; use orion::aead::SecretKey; use rand::SeedableRng; @@ -62,7 +63,7 @@ pub struct WorkerState { placement_resolver: Map>>, resource_rq_map: ResourceRqMap, - resource_map: ResourceIdMap, + resource_id_map: ResourceIdMap, resource_label_map: ResourceLabelMap, secret_key: Option>, @@ -317,7 +318,11 @@ impl WorkerState { } pub fn get_resource_map(&self) -> &ResourceIdMap { - &self.resource_map + &self.resource_id_map + } + + pub fn get_resource_rq(&self, rq_id: ResourceRqId) -> &ResourceRequestVariants { + self.resource_rq_map.get(&rq_id) } pub fn get_resource_label_map(&self) -> &ResourceLabelMap { @@ -416,6 +421,14 @@ impl WorkerState { } } + pub fn register_resource_rq( + &mut self, + resource_rq_id: ResourceRqId, + rqv: ResourceRequestVariants, + ) { + self.resource_rq_map.insert(resource_rq_id, rqv) + } + pub fn download_object( &mut self, data_id: DataObjectId, @@ -466,7 +479,7 @@ impl WorkerStateRef { start_task_scheduled: false, running_tasks: Default::default(), start_time: now, - resource_map, + resource_id_map: resource_map, resource_rq_map, resource_label_map, worker_addresses: Default::default(), diff --git 
a/crates/tako/src/internal/worker/task.rs b/crates/tako/src/internal/worker/task.rs index 2e5f54cdb..323a04e58 100644 --- a/crates/tako/src/internal/worker/task.rs +++ b/crates/tako/src/internal/worker/task.rs @@ -1,11 +1,13 @@ use crate::datasrv::DataObjectId; use crate::gateway::{EntryType, TaskDataFlags}; use crate::internal::common::resources::Allocation; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::common::stablemap::ExtractKey; use crate::internal::messages::worker::{ ComputeTaskSeparateData, ComputeTaskSharedData, TaskOutput, }; use crate::internal::worker::task_comm::RunningTaskComm; +use crate::resources::ResourceRequestVariants; use crate::{InstanceId, Priority, TaskId, WorkerId}; use std::rc::Rc; use std::time::Duration; @@ -40,6 +42,7 @@ pub struct Task { impl Task { pub fn new( task: ComputeTaskSeparateData, + rqv: ResourceRequestVariants, shared: ComputeTaskSharedData, task_state: TaskState, ) -> Self { @@ -48,7 +51,7 @@ impl Task { id: task.id, priority: (shared.user_priority, task.scheduler_priority), instance_id: task.instance_id, - resources: shared.resources, + resources: rqv, time_limit: shared.time_limit, body: shared.body, entry: task.entry, diff --git a/crates/tako/src/internal/worker/test_rqueue.rs b/crates/tako/src/internal/worker/test_rqueue.rs index 31ea8e530..097b4eb3a 100644 --- a/crates/tako/src/internal/worker/test_rqueue.rs +++ b/crates/tako/src/internal/worker/test_rqueue.rs @@ -8,6 +8,7 @@ use crate::internal::worker::test_util::{WorkerTaskBuilder, worker_task}; use std::ops::Deref; use std::time::Duration; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::messages::worker::WorkerResourceCounts; use crate::internal::server::workerload::WorkerResources; use crate::internal::tests::utils::shared::{ @@ -29,6 +30,7 @@ impl ResourceWaitQueue { #[test] fn test_rqueue_resource_priority() { + let mut rqs = ResourceRqMap::default(); let mut rq = 
RB::new(wait_queue(vec![res_item( "cpus", res_kind_groups(&[vec!["0", "1", "2", "3"], vec!["7", "8"]]), @@ -38,12 +40,14 @@ fn test_rqueue_resource_priority() { 10, ResBuilder::default().add_scatter(0, 3).finish(), 1, + &mut rqs, )); - rq.add_task(worker_task(11, cpus_compact(4).finish(), 1)); + rq.add_task(worker_task(11, cpus_compact(4).finish(), 1, &mut rqs)); rq.add_task(worker_task( 12, ResBuilder::default().add_force_compact(0, 4).finish(), 1, + &mut rqs, )); let mut a = rq.start_tasks(); @@ -70,10 +74,11 @@ fn test_rqueue_resource_priority() { #[test] fn test_rqueue1() { + let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::sockets(3, 5))); - rq.add_task(worker_task(10, cpus_compact(2).finish(), 1)); - rq.add_task(worker_task(11, cpus_compact(5).finish(), 1)); - rq.add_task(worker_task(12, cpus_compact(2).finish(), 1)); + rq.add_task(worker_task(10, cpus_compact(2).finish(), 1, &mut rqs)); + rq.add_task(worker_task(11, cpus_compact(5).finish(), 1, &mut rqs)); + rq.add_task(worker_task(12, cpus_compact(2).finish(), 1, &mut rqs)); let a = rq.start_tasks(); assert_eq!(a.get(&10).unwrap().get_indices(0).len(), 2); @@ -83,11 +88,12 @@ fn test_rqueue1() { #[test] fn test_rqueue2() { + let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); - rq.add_task(worker_task(10, cpus_compact(2).finish(), 1)); - rq.add_task(worker_task(11, cpus_compact(1).finish(), 2)); - rq.add_task(worker_task(12, cpus_compact(2).finish(), 2)); + rq.add_task(worker_task(10, cpus_compact(2).finish(), 1, &mut rqs)); + rq.add_task(worker_task(11, cpus_compact(1).finish(), 2, &mut rqs)); + rq.add_task(worker_task(12, cpus_compact(2).finish(), 2, &mut rqs)); let a = rq.start_tasks(); assert!(!a.contains_key(&10)); @@ -98,11 +104,12 @@ fn test_rqueue2() { #[test] fn test_rqueue3() { + let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); - 
rq.add_task(worker_task(10, cpus_compact(2).finish(), 1)); - rq.add_task(worker_task(11, cpus_compact(1).finish(), 1)); - rq.add_task(worker_task(12, cpus_compact(2).finish(), 2)); + rq.add_task(worker_task(10, cpus_compact(2).finish(), 1, &mut rqs)); + rq.add_task(worker_task(11, cpus_compact(1).finish(), 1, &mut rqs)); + rq.add_task(worker_task(12, cpus_compact(2).finish(), 2, &mut rqs)); let a = rq.start_tasks(); assert!(a.contains_key(&10)); @@ -112,11 +119,13 @@ fn test_rqueue3() { #[test] fn test_rqueue_time_request() { + let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); rq.add_task(worker_task( 10, ResBuilder::default().add(0, 1).min_time_secs(10).finish(), 1, + &mut rqs, )); assert_eq!(rq.start_tasks_duration(Duration::new(9, 0)).len(), 0); @@ -125,26 +134,31 @@ fn test_rqueue_time_request() { #[test] fn test_rqueue_time_request_priority1() { + let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); rq.add_task(worker_task( 10, cpus_compact(2).min_time_secs(10).finish(), 1, + &mut rqs, )); rq.add_task(worker_task( 11, cpus_compact(2).min_time_secs(40).finish(), 1, + &mut rqs, )); rq.add_task(worker_task( 12, cpus_compact(2).min_time_secs(20).finish(), 1, + &mut rqs, )); rq.add_task(worker_task( 13, cpus_compact(2).min_time_secs(30).finish(), 1, + &mut rqs, )); let map = rq.start_tasks_duration(Duration::new(40, 0)); @@ -155,26 +169,31 @@ fn test_rqueue_time_request_priority1() { #[test] fn test_rqueue_time_request_priority2() { + let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); rq.add_task(worker_task( 10, cpus_compact(2).min_time_secs(10).finish(), 1, + &mut rqs, )); rq.add_task(worker_task( 11, cpus_compact(2).min_time_secs(40).finish(), 1, + &mut rqs, )); rq.add_task(worker_task( 12, cpus_compact(2).min_time_secs(20).finish(), 1, + &mut rqs, )); rq.add_task(worker_task( 13, 
cpus_compact(2).min_time_secs(30).finish(), 1, + &mut rqs, )); let map = rq.start_tasks_duration(Duration::new(30, 0)); @@ -185,6 +204,7 @@ fn test_rqueue_time_request_priority2() { #[test] fn test_rqueue_generic_resource1_priorities() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 3), ResourceDescriptorItem::range("Res0", 1, 20), @@ -195,8 +215,8 @@ fn test_rqueue_generic_resource1_priorities() { let request: ResourceRequest = cpus_compact(2).add(1, 2).finish(); - rq.add_task(worker_task(10, request, 1)); - rq.add_task(worker_task(11, cpus_compact(4).finish(), 1)); + rq.add_task(worker_task(10, request, 1, &mut rqs)); + rq.add_task(worker_task(11, cpus_compact(4).finish(), 1, &mut rqs)); let map = rq.start_tasks(); assert!(!map.contains_key(&10)); @@ -205,6 +225,7 @@ fn test_rqueue_generic_resource1_priorities() { #[test] fn test_rqueue_generic_resource2_priorities() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 3), ResourceDescriptorItem::range("Res0", 1, 20), @@ -215,13 +236,13 @@ fn test_rqueue_generic_resource2_priorities() { let mut rq = RB::new(wait_queue(resources)); let request: ResourceRequest = cpus_compact(2).add(1, 8).finish(); - rq.add_task(worker_task(10, request, 1)); + rq.add_task(worker_task(10, request, 1, &mut rqs)); let request: ResourceRequest = cpus_compact(2).add(1, 12).finish(); - rq.add_task(worker_task(11, request, 1)); + rq.add_task(worker_task(11, request, 1, &mut rqs)); let request: ResourceRequest = cpus_compact(2).add(2, 50_000_000).finish(); - rq.add_task(worker_task(12, request, 1)); + rq.add_task(worker_task(12, request, 1, &mut rqs)); let map = rq.start_tasks(); assert!(!map.contains_key(&10)); @@ -231,6 +252,7 @@ fn test_rqueue_generic_resource2_priorities() { #[test] fn test_rqueue_generic_resource3_priorities() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ 
ResourceDescriptorItem::range("cpus", 0, 3), ResourceDescriptorItem::range("Res0", 1, 20), @@ -241,13 +263,13 @@ fn test_rqueue_generic_resource3_priorities() { let mut rq = RB::new(wait_queue(resources)); let request: ResourceRequest = cpus_compact(2).add(1, 18).finish(); - rq.add_task(worker_task(10, request, 1)); + rq.add_task(worker_task(10, request, 1, &mut rqs)); let request: ResourceRequest = cpus_compact(2).add(1, 10).add(2, 60_000_000).finish(); - rq.add_task(worker_task(11, request, 1)); + rq.add_task(worker_task(11, request, 1, &mut rqs)); let request: ResourceRequest = cpus_compact(2).add(2, 99_000_000).finish(); - rq.add_task(worker_task(12, request, 1)); + rq.add_task(worker_task(12, request, 1, &mut rqs)); let map = rq.start_tasks(); assert!(!map.contains_key(&10)); @@ -319,6 +341,7 @@ fn test_worker_resource_priorities() { #[test] fn test_uniq_resource_priorities1() { + let mut requests = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 16), ResourceDescriptorItem::range("res0", 1, 10), @@ -332,11 +355,15 @@ fn test_uniq_resource_priorities1() { WorkerTaskBuilder::new(10) .resources(request) .server_priority(1) - .build(), + .build(&mut requests), ); let request: ResourceRequest = cpus_compact(16).add(2, 2).finish(); - rq.add_task(WorkerTaskBuilder::new(11).resources(request).build()); + rq.add_task( + WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut requests), + ); let map = rq.start_tasks(); assert_eq!(map.len(), 1); @@ -345,6 +372,7 @@ fn test_uniq_resource_priorities1() { #[test] fn test_uniq_resource_priorities2() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 16), ResourceDescriptorItem::range("res0", 1, 10), @@ -365,11 +393,15 @@ fn test_uniq_resource_priorities2() { WorkerTaskBuilder::new(10) .resources(request) .server_priority(1) - .build(), + .build(&mut rqs), ); let request: ResourceRequest = cpus_compact(16).add(2, 
2).finish(); - rq.add_task(WorkerTaskBuilder::new(11).resources(request).build()); + rq.add_task( + WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut rqs), + ); let map = rq.start_tasks(); assert_eq!(map.len(), 1); @@ -378,6 +410,7 @@ fn test_uniq_resource_priorities2() { #[test] fn test_uniq_resource_priorities3() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 16), ResourceDescriptorItem::range("res0", 1, 10), @@ -398,11 +431,15 @@ fn test_uniq_resource_priorities3() { WorkerTaskBuilder::new(10) .resources(request) .user_priority(1) - .build(), + .build(&mut rqs), ); let request: ResourceRequest = cpus_compact(16).add(2, 2).finish(); - rq.add_task(WorkerTaskBuilder::new(11).resources(request).build()); + rq.add_task( + WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut rqs), + ); let map = rq.start_tasks(); assert_eq!(map.len(), 1); @@ -411,6 +448,7 @@ fn test_uniq_resource_priorities3() { #[test] fn test_different_resources_and_priorities() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 63), ResourceDescriptorItem::range("gpus/nvidia", 0, 3), @@ -423,7 +461,7 @@ fn test_different_resources_and_priorities() { WorkerTaskBuilder::new(i) .resources(request) .user_priority(if i % 2 == 0 { 0 } else { -1 }) - .build(), + .build(&mut rqs), ); } for i in 0..12 { @@ -432,7 +470,7 @@ fn test_different_resources_and_priorities() { WorkerTaskBuilder::new(i + 20) .resources(request) .user_priority(-3) - .build(), + .build(&mut rqs), ); } let map = rq.start_tasks(); @@ -448,6 +486,7 @@ fn test_different_resources_and_priorities() { #[test] fn test_different_resources_and_priorities1() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 63), ResourceDescriptorItem::range("gpus/nvidia", 0, 3), @@ -460,7 +499,7 @@ fn test_different_resources_and_priorities1() { 
WorkerTaskBuilder::new(i) .resources(request) .user_priority(if i % 2 == 0 { 0 } else { -1 }) - .build(), + .build(&mut rqs), ); } for i in 0..12 { @@ -469,7 +508,7 @@ fn test_different_resources_and_priorities1() { WorkerTaskBuilder::new(i + 20) .resources(request) .user_priority(-3) - .build(), + .build(&mut rqs), ); } let map = rq.start_tasks(); @@ -485,6 +524,7 @@ fn test_different_resources_and_priorities1() { #[test] fn test_different_resources_and_priorities2() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 10), ResourceDescriptorItem::range("foo", 1, 3), @@ -493,7 +533,7 @@ fn test_different_resources_and_priorities2() { for i in 0..6 { let request: ResourceRequest = cpus_compact(1).add(1, 1).finish(); - rq.add_task(WorkerTaskBuilder::new(i).resources(request).build()); + rq.add_task(WorkerTaskBuilder::new(i).resources(request).build(&mut rqs)); } let map = rq.start_tasks(); assert_eq!(map.len(), 3); @@ -503,7 +543,7 @@ fn test_different_resources_and_priorities2() { WorkerTaskBuilder::new(i + 10) .resources(request) .user_priority(1) - .build(), + .build(&mut rqs), ); } let map = rq.start_tasks(); @@ -514,7 +554,7 @@ fn test_different_resources_and_priorities2() { WorkerTaskBuilder::new(i + 20) .resources(request) .user_priority(-3) - .build(), + .build(&mut rqs), ); } let map = rq.start_tasks(); @@ -524,6 +564,7 @@ fn test_different_resources_and_priorities2() { #[test] fn test_different_resources_and_priorities3() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 9), ResourceDescriptorItem::range("foo", 1, 3), @@ -532,7 +573,7 @@ fn test_different_resources_and_priorities3() { for i in 0..6 { let request: ResourceRequest = cpus_compact(1).add(1, 2).finish(); - rq.add_task(WorkerTaskBuilder::new(i).resources(request).build()); + rq.add_task(WorkerTaskBuilder::new(i).resources(request).build(&mut rqs)); } let map = rq.start_tasks(); 
assert_eq!(map.len(), 1); @@ -542,7 +583,7 @@ fn test_different_resources_and_priorities3() { WorkerTaskBuilder::new(i + 10) .resources(request) .user_priority(1) - .build(), + .build(&mut rqs), ); } let map = rq.start_tasks(); @@ -553,7 +594,7 @@ fn test_different_resources_and_priorities3() { WorkerTaskBuilder::new(i + 20) .resources(request) .user_priority(-3) - .build(), + .build(&mut rqs), ); } let map = rq.start_tasks(); @@ -563,6 +604,7 @@ fn test_different_resources_and_priorities3() { #[test] fn test_uniq_resource_priorities4() { + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 16), ResourceDescriptorItem::range("res0", 1, 10), @@ -583,13 +625,17 @@ fn test_uniq_resource_priorities4() { WorkerTaskBuilder::new(10) .resources(request) .server_priority(1) - .build(), + .build(&mut rqs), ); rq.queue.remove_worker(400.into()); let request: ResourceRequest = cpus_compact(16).add(2, 2).finish(); - rq.add_task(WorkerTaskBuilder::new(11).resources(request).build()); + rq.add_task( + WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut rqs), + ); let map = rq.start_tasks(); assert_eq!(map.len(), 1); diff --git a/crates/tako/src/internal/worker/test_util.rs b/crates/tako/src/internal/worker/test_util.rs index 27e3e3ce3..8573819fb 100644 --- a/crates/tako/src/internal/worker/test_util.rs +++ b/crates/tako/src/internal/worker/test_util.rs @@ -1,6 +1,7 @@ use crate::datasrv::DataObjectId; use crate::gateway::TaskDataFlags; use crate::internal::common::Map; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::common::resources::{Allocation, ResourceRequest, ResourceRequestVariants}; use crate::internal::messages::worker::{ComputeTaskSeparateData, ComputeTaskSharedData}; use crate::internal::server::workerload::WorkerResources; @@ -52,15 +53,17 @@ impl WorkerTaskBuilder { self } - pub fn build(self) -> Task { + pub fn build(self, requests: &mut ResourceRqMap) -> Task { let 
resources = ResourceRequestVariants::new(if self.resources.is_empty() { smallvec![cpus_compact(1).finish()] } else { self.resources.into() }); + let resource_rq_id = requests.get_or_create(resources.clone()); Task::new( ComputeTaskSeparateData { + resource_rq_id, shared_index: 0, id: self.task_id, instance_id: self.instance_id, @@ -69,9 +72,9 @@ impl WorkerTaskBuilder { data_deps: self.data_deps, entry: None, }, + resources, ComputeTaskSharedData { user_priority: self.user_priority, - resources, time_limit: None, data_flags: self.data_flags, body: Default::default(), @@ -85,11 +88,12 @@ pub fn worker_task>( task_id: T, resources: ResourceRequest, u_priority: Priority, + requests: &mut ResourceRqMap, ) -> Task { WorkerTaskBuilder::new(task_id) .resources(resources) .user_priority(u_priority) - .build() + .build(requests) } pub(crate) struct ResourceQueueBuilder { From 711c224a7f0536f372b2421d556d4d4ba31424a9 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Fri, 28 Nov 2025 15:44:27 +0100 Subject: [PATCH 03/17] ResourceRqId in hyperqueue --- crates/hyperqueue/src/client/commands/job.rs | 8 +- .../src/client/commands/journal/output.rs | 9 +- .../src/client/commands/journal/report.rs | 5 +- .../src/client/commands/submit/command.rs | 83 ++++++----- .../src/client/commands/submit/jobfile.rs | 60 ++++++-- .../hyperqueue/src/client/commands/worker.rs | 20 +-- crates/hyperqueue/src/client/job.rs | 23 ++- crates/hyperqueue/src/client/output/cli.rs | 139 ++++++++++-------- crates/hyperqueue/src/client/output/common.rs | 5 +- crates/hyperqueue/src/client/output/json.rs | 29 ++-- .../hyperqueue/src/client/output/outputs.rs | 10 +- crates/hyperqueue/src/client/output/quiet.rs | 12 +- crates/hyperqueue/src/client/task.rs | 18 +-- .../ui/screens/jobs/job_info_display.rs | 5 +- crates/hyperqueue/src/server/client/mod.rs | 60 +++++--- crates/hyperqueue/src/server/client/submit.rs | 85 +++++++---- crates/hyperqueue/src/server/job.rs | 43 +++--- crates/hyperqueue/src/server/restore.rs | 
9 +- crates/hyperqueue/src/server/state.rs | 47 +----- crates/hyperqueue/src/transfer/messages.rs | 38 +++-- crates/tako/src/connection.rs | 6 +- crates/tako/src/control.rs | 22 ++- .../tako/src/internal/common/resources/map.rs | 41 +++++- .../tako/src/internal/common/resources/mod.rs | 10 +- .../tako/src/internal/scheduler/multinode.rs | 6 +- crates/tako/src/internal/scheduler/state.rs | 10 +- crates/tako/src/internal/server/core.rs | 15 +- crates/tako/src/internal/server/reactor.rs | 29 ++-- crates/tako/src/internal/server/task.rs | 5 +- crates/tako/src/internal/server/worker.rs | 8 +- .../tests/integration/utils/server.rs | 14 +- crates/tako/src/internal/worker/state.rs | 15 +- crates/tako/src/lib.rs | 9 +- 33 files changed, 512 insertions(+), 386 deletions(-) diff --git a/crates/hyperqueue/src/client/commands/job.rs b/crates/hyperqueue/src/client/commands/job.rs index 3c39f127e..ea3865c06 100644 --- a/crates/hyperqueue/src/client/commands/job.rs +++ b/crates/hyperqueue/src/client/commands/job.rs @@ -1,11 +1,11 @@ use clap::Parser; use crate::client::globalsettings::GlobalSettings; -use crate::client::job::get_worker_map; +use crate::client::job::{get_remote_lists, get_worker_map}; use crate::client::output::outputs::OutputStream; use crate::client::output::resolve_task_paths; -use crate::client::status::{Status, job_status}; -use crate::common::cli::{TaskSelectorArg, parse_last_all_range, parse_last_range}; +use crate::client::status::{job_status, Status}; +use crate::common::cli::{parse_last_all_range, parse_last_range, TaskSelectorArg}; use crate::common::utils::str::pluralize; use crate::rpc_call; use crate::transfer::connection::ClientSession; @@ -196,7 +196,7 @@ pub async fn output_job_detail( .collect(); gsettings .printer() - .print_job_detail(jobs, worker_map, &response.server_uid); + .print_job_detail(jobs, &worker_map, &response.server_uid); Ok(()) } diff --git a/crates/hyperqueue/src/client/commands/journal/output.rs 
b/crates/hyperqueue/src/client/commands/journal/output.rs index 93ea1cf12..10d1d428c 100644 --- a/crates/hyperqueue/src/client/commands/journal/output.rs +++ b/crates/hyperqueue/src/client/commands/journal/output.rs @@ -192,23 +192,26 @@ impl SubmitDescFormatter<'_> { ids, entries: _, task_desc, + resource_rq, } => { let TaskDescription { kind: _, - resources, time_limit, priority, crash_limit, } = task_desc; json!({ "ids": ids, - "resources": resources, + "resources": resource_rq, "time_limit": time_limit, "priority": priority, "crash_limit": crash_limit }) } - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { + resource_rqs: _, + tasks, + } => { json!({ "n_tasks": tasks.len() }) diff --git a/crates/hyperqueue/src/client/commands/journal/report.rs b/crates/hyperqueue/src/client/commands/journal/report.rs index 2d24593d0..0d27d1f38 100644 --- a/crates/hyperqueue/src/client/commands/journal/report.rs +++ b/crates/hyperqueue/src/client/commands/journal/report.rs @@ -389,7 +389,8 @@ impl JournalStats { } fn new_submit(&mut self, job_id: JobId, submit: SubmitRequest) { - let rq = match submit.submit_desc.task_desc { + todo!() + /*let rq = match submit.submit_desc.task_desc { JobTaskDescription::Array { task_desc, .. 
} => { JobResourceRq::Array(task_desc.resources) } @@ -401,7 +402,7 @@ impl JournalStats { JobResourceRq::TaskGraph(map) } }; - self.job_requests.insert(job_id, rq); + self.job_requests.insert(job_id, rq);*/ } fn new_worker( diff --git a/crates/hyperqueue/src/client/commands/submit/command.rs b/crates/hyperqueue/src/client/commands/submit/command.rs index 031fa14d9..91ca23457 100644 --- a/crates/hyperqueue/src/client/commands/submit/command.rs +++ b/crates/hyperqueue/src/client/commands/submit/command.rs @@ -18,9 +18,9 @@ use crate::rpc_call; use crate::server::event::streamer::{EventFilter, EventFilterFlags}; use crate::transfer::connection::ClientSession; use crate::transfer::messages::{ - FromClientMessage, JobDescription, JobSubmitDescription, JobTaskDescription, PinMode, - StreamEvents, StreamEventsMode, SubmitRequest, SubmitResponse, TaskDescription, TaskKind, - TaskKindProgram, ToClientMessage, + FromClientMessage, JobDescription, JobSubmitDescription, JobTaskDescription, LocalResourceRqId, + PinMode, StreamEvents, StreamEventsMode, SubmitRequest, SubmitResponse, TaskDescription, + TaskKind, TaskKindProgram, ToClientMessage, }; use anyhow::{anyhow, bail}; use bstr::BString; @@ -43,7 +43,9 @@ use tako::gateway::{ ResourceRequestVariants, }; use tako::program::{FileOnCloseBehavior, ProgramDefinition, StdioDef}; -use tako::resources::{AllocationRequest, CPU_RESOURCE_NAME, NumOfNodes, ResourceAmount}; +use tako::resources::{ + AllocationRequest, CPU_RESOURCE_NAME, NumOfNodes, ResourceAmount, ResourceRqId, +}; use tako::{JobId, JobTaskCount, Map}; const SUBMIT_ARRAY_LIMIT: JobTaskCount = 999; @@ -609,7 +611,7 @@ pub async fn open_job( let response = rpc_call!(session.connection(), FromClientMessage::OpenJob(JobDescription { name, max_fails }), ToClientMessage::OpenJobResponse(r) => r) - .await?; + .await?; gsettings.printer().print_job_open(response.job_id); Ok(()) @@ -659,26 +661,26 @@ pub async fn submit_computation( stdin: _, directives: _, conf: - 
SubmitJobTaskConfOpts { - job_conf: SubmitJobConfOpts { name, max_fails }, - nodes: _, - cpus: _, - resource: _, - time_request: _, - pin, - task_dir, - cwd, - stdout, - stderr, - env, - each_line: _, - from_json: _, - array: _, - priority, - time_limit, - stream, - crash_limit, - }, + SubmitJobTaskConfOpts { + job_conf: SubmitJobConfOpts { name, max_fails }, + nodes: _, + cpus: _, + resource: _, + time_request: _, + pin, + task_dir, + cwd, + stdout, + stderr, + env, + each_line: _, + from_json: _, + array: _, + priority, + time_limit, + stream, + crash_limit, + }, on_notify, } = opts; @@ -691,6 +693,10 @@ pub async fn submit_computation( .unwrap_or_else(|| "job".to_string()) }; + // Force task_dir for multi node tasks (for a place where to create node file) + let task_dir = task_dir | (resources.n_nodes > 0); + let resources = ResourceRequestVariants::new(smallvec![resources]); + let args: Vec = commands.into_iter().map(|arg| arg.into()).collect(); let stdout = create_stdio(stdout, &stream, DEFAULT_STDOUT_PATH); @@ -715,21 +721,14 @@ pub async fn submit_computation( stdin: stdin.unwrap_or_default(), }; - // Force task_dir for multi node tasks (for a place where to create node file) - let task_dir = if resources.n_nodes > 0 { - true - } else { - task_dir - }; - let task_kind = TaskKind::ExternalProgram(TaskKindProgram { program: program_def, pin_mode: pin.map(|arg| arg.into()).unwrap_or(PinMode::None), task_dir, }); + let task_desc = TaskDescription { kind: task_kind, - resources: ResourceRequestVariants::new(smallvec![resources]), priority, time_limit, crash_limit, @@ -739,6 +738,7 @@ pub async fn submit_computation( ids, entries, task_desc, + resource_rq: resources, }; let request = SubmitRequest { @@ -759,9 +759,20 @@ pub async fn submit_computation( progress, on_notify.as_deref(), ) - .await + .await } +/*pub(crate) async fn get_resource_rq_ids( + session: &mut ClientSession, + rqv: Vec, +) -> crate::Result> { + let message = 
FromClientMessage::GetResourceRqId(rqv); + let response = + rpc_call!(session.connection(), message, ToClientMessage::ResourceRqIdResponse(r) => r) + .await?; + Ok(response) +}*/ + pub(crate) async fn send_submit_request( gsettings: &GlobalSettings, session: &mut ClientSession, @@ -1038,14 +1049,14 @@ impl TypedValueParser for CrashLimitParser { .map_err(|e| clap::Error::raw(ErrorKind::InvalidValue, format!("{e}\n"))) } - fn possible_values(&self) -> Option + '_>> { + fn possible_values(&self) -> Option + '_>> { Some(Box::new( [ PossibleValue::new("never-restart"), PossibleValue::new("unlimited"), PossibleValue::new(""), ] - .into_iter(), + .into_iter(), )) } } diff --git a/crates/hyperqueue/src/client/commands/submit/jobfile.rs b/crates/hyperqueue/src/client/commands/submit/jobfile.rs index 928cfb5f3..13553fca6 100644 --- a/crates/hyperqueue/src/client/commands/submit/jobfile.rs +++ b/crates/hyperqueue/src/client/commands/submit/jobfile.rs @@ -1,5 +1,5 @@ use crate::client::commands::submit::command::{ - DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, send_submit_request, + send_submit_request, DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, }; use crate::client::commands::submit::defs::{ ArrayDef, JobDef, StdioDefFull, StdioDefInput, TaskDef, @@ -10,15 +10,15 @@ use crate::common::arraydef::IntArray; use crate::common::utils::fs::get_current_dir; use crate::transfer::connection::ClientSession; use crate::transfer::messages::{ - JobDescription, JobSubmitDescription, JobTaskDescription, PinMode, SubmitRequest, - TaskDescription, TaskKind, TaskKindProgram, TaskWithDependencies, + JobDescription, JobSubmitDescription, JobTaskDescription, LocalResourceRqId, PinMode, + SubmitRequest, TaskDescription, TaskKind, TaskKindProgram, TaskWithDependencies, }; use clap::Parser; use smallvec::smallvec; use std::path::PathBuf; -use tako::Map; use tako::gateway::{EntryType, ResourceRequest, ResourceRequestVariants, TaskDataFlags}; use tako::program::{FileOnCloseBehavior, ProgramDefinition, 
StdioDef}; +use tako::Map; use tako::{JobId, JobTaskCount, JobTaskId}; #[derive(Parser)] @@ -54,6 +54,19 @@ fn create_stdio(def: Option, default: &str, has_streaming: bool) } } +fn build_resource_request(cfg: &mut TaskConfigDef) -> ResourceRequestVariants { + ResourceRequestVariants { + variants: if cfg.request.is_empty() { + smallvec![ResourceRequest::default()] + } else { + std::mem::take(&mut cfg.request) + .into_iter() + .map(|r| r.into_request()) + .collect() + }, + } +} + fn build_task_description(cfg: TaskConfigDef, has_streaming: bool) -> TaskDescription { TaskDescription { kind: TaskKind::ExternalProgram(TaskKindProgram { @@ -72,13 +85,6 @@ fn build_task_description(cfg: TaskConfigDef, has_streaming: bool) -> TaskDescri }, task_dir: cfg.task_dir, }), - resources: ResourceRequestVariants { - variants: if cfg.request.is_empty() { - smallvec![ResourceRequest::default()] - } else { - cfg.request.into_iter().map(|r| r.into_request()).collect() - }, - }, time_limit: cfg.time_limit, priority: cfg.priority, crash_limit: cfg.crash_limit, @@ -86,8 +92,9 @@ fn build_task_description(cfg: TaskConfigDef, has_streaming: bool) -> TaskDescri } fn build_task( - tdef: TaskDef, + mut tdef: TaskDef, max_id: &mut JobTaskId, + resource_map: &mut Map, data_flags: TaskDataFlags, has_streaming: bool, ) -> TaskWithDependencies { @@ -95,16 +102,23 @@ fn build_task( *max_id = JobTaskId::new(max_id.as_num() + 1); *max_id }); + let resource = build_resource_request(&mut tdef.config); + let resource_rq_id = resource_map.get(&resource).copied().unwrap_or_else(|| { + let new_id = LocalResourceRqId::new(resource_map.len() as u32); + resource_map.insert(resource, new_id); + new_id + }); TaskWithDependencies { id, data_flags, task_desc: build_task_description(tdef.config, has_streaming), + resource_rq_id, task_deps: tdef.deps, data_deps: tdef.data_deps, } } -fn build_job_desc_array(array: ArrayDef, has_streaming: bool) -> JobTaskDescription { +fn build_job_desc_array(mut array: ArrayDef, 
has_streaming: bool) -> JobTaskDescription { let ids = array .ids .unwrap_or_else(|| IntArray::from_range(0, array.entries.len() as JobTaskCount)); @@ -119,9 +133,11 @@ fn build_job_desc_array(array: ArrayDef, has_streaming: bool) -> JobTaskDescript .collect(), ) }; + let resources = build_resource_request(&mut array.config); JobTaskDescription::Array { ids, entries, + resource_rq: resources, task_desc: build_task_description(array.config, has_streaming), } } @@ -144,8 +160,15 @@ fn build_job_desc_individual_tasks( let mut unprocessed_tasks = Map::new(); let mut in_degrees = Map::new(); let mut consumers: Map> = Map::new(); + let mut resource_map: Map = Map::new(); for task in tasks { - let t = build_task(task, &mut max_id, data_flags, has_streaming); + let t = build_task( + task, + &mut max_id, + &mut resource_map, + data_flags, + has_streaming, + ); if in_degrees.insert(t.id, t.task_deps.len()).is_some() { return Err(crate::Error::GenericError(format!( "Task {} is defined multiple times", @@ -187,7 +210,14 @@ fn build_job_desc_individual_tasks( ))); } - Ok(JobTaskDescription::Graph { tasks: new_tasks }) + let mut resource_rqs_pairs: Vec<_> = resource_map.into_iter().collect(); + resource_rqs_pairs.sort_unstable_by_key(|(_, v)| *v); + let resource_rqs = resource_rqs_pairs.into_iter().map(|(k, _)| k).collect(); + + Ok(JobTaskDescription::Graph { + tasks: new_tasks, + resource_rqs, + }) } fn build_job_submit(jdef: JobDef, job_id: Option) -> crate::Result { diff --git a/crates/hyperqueue/src/client/commands/worker.rs b/crates/hyperqueue/src/client/commands/worker.rs index 02c17af95..75cd49b49 100644 --- a/crates/hyperqueue/src/client/commands/worker.rs +++ b/crates/hyperqueue/src/client/commands/worker.rs @@ -1,5 +1,5 @@ use crate::client::commands::duration_doc; -use anyhow::{Context, bail}; +use anyhow::{bail, Context}; use chrono::Utc; use clap::builder::{PossibleValue, TypedValueParser}; use std::collections::HashSet; @@ -8,11 +8,11 @@ use std::fmt::{Display, 
Formatter}; use std::path::{Path, PathBuf}; use std::process::Stdio; use std::time::Duration; -use tako::Map; use tako::resources::{ - CPU_RESOURCE_NAME, ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind, + ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind, CPU_RESOURCE_NAME, }; use tako::worker::{ServerLostPolicy, WorkerConfiguration}; +use tako::Map; use clap::error::ErrorKind; use clap::{Arg, Error, Parser, ValueEnum}; @@ -25,7 +25,7 @@ use tokio::task::JoinSet; use tokio::time::sleep; use crate::client::globalsettings::GlobalSettings; -use crate::client::utils::{PassThroughArgument, passthrough_parser}; +use crate::client::utils::{passthrough_parser, PassThroughArgument}; use crate::common::cli::DeploySshOpts; use crate::common::manager::info::{ManagerInfo, WORKER_EXTRA_MANAGER_KEY}; use crate::common::utils::fs::get_hq_binary_path; @@ -41,12 +41,12 @@ use crate::worker::bootstrap::{ finalize_configuration, initialize_worker, try_get_pbs_info, try_get_slurm_info, }; use crate::worker::hwdetect::{ - GPU_ENVIRONMENTS, detect_additional_resources, detect_cpus, prune_hyper_threading, + detect_additional_resources, detect_cpus, prune_hyper_threading, GPU_ENVIRONMENTS, }; use crate::worker::parser::{ parse_cpu_definition, parse_resource_coupling, parse_resource_definition, }; -use crate::{DEFAULT_WORKER_GROUP_NAME, rpc_call}; +use crate::{rpc_call, DEFAULT_WORKER_GROUP_NAME}; use tako::WorkerId; #[derive(clap::ValueEnum, Clone)] @@ -507,8 +507,8 @@ pub async fn get_worker_list( ) -> crate::Result> { let msg = rpc_call!( session.connection(), - FromClientMessage::WorkerList, - ToClientMessage::WorkerListResponse(r) => r + FromClientMessage::GetList { workers: true }, + ToClientMessage::GetListResponse(r) => r ) .await?; @@ -577,8 +577,8 @@ pub async fn wait_for_workers( async fn get_workers_status(session: &mut ClientSession) -> anyhow::Result<(u32, u32)> { let msg = rpc_call!( session.connection(), - FromClientMessage::WorkerList, - 
ToClientMessage::WorkerListResponse(r) => r + FromClientMessage::GetList { workers: true }, + ToClientMessage::GetListResponse(r) => r ) .await?; diff --git a/crates/hyperqueue/src/client/job.rs b/crates/hyperqueue/src/client/job.rs index 8c333119d..33f42695c 100644 --- a/crates/hyperqueue/src/client/job.rs +++ b/crates/hyperqueue/src/client/job.rs @@ -1,20 +1,27 @@ use crate::rpc_call; use crate::transfer::connection::ClientSession; -use crate::transfer::messages::{FromClientMessage, ToClientMessage}; +use crate::transfer::messages::{FromClientMessage, GetListResponse, ToClientMessage}; +use orion::kex::SessionKeys; use tako::{Map, WorkerId}; /// Maps worker IDs to hostnames. pub type WorkerMap = Map; -pub async fn get_worker_map(session: &mut ClientSession) -> anyhow::Result { - let message = FromClientMessage::WorkerList; +pub async fn get_remote_lists( + session: &mut ClientSession, + workers: bool, +) -> anyhow::Result { + let message = FromClientMessage::GetList { workers }; let response = - rpc_call!(session.connection(), message, ToClientMessage::WorkerListResponse(r) => r) - .await?; - let map = response + rpc_call!(session.connection(), message, ToClientMessage::GetListResponse(r) => r).await?; + Ok(response) +} + +pub async fn get_worker_map(session: &mut ClientSession) -> anyhow::Result { + let response = get_remote_lists(session, true).await?; + Ok(response .workers .into_iter() .map(|w| (w.id, w.configuration.hostname)) - .collect(); - Ok(map) + .collect()) } diff --git a/crates/hyperqueue/src/client/output/cli.rs b/crates/hyperqueue/src/client/output/cli.rs index 201cabbc9..5c6affc5e 100644 --- a/crates/hyperqueue/src/client/output/cli.rs +++ b/crates/hyperqueue/src/client/output/cli.rs @@ -14,8 +14,8 @@ use crate::server::job::{JobTaskCounters, JobTaskInfo, JobTaskState}; use crate::stream::reader::outputlog::Summary; use crate::transfer::messages::{ AutoAllocListQueuesResponse, JobDetail, JobInfo, JobTaskDescription, PinMode, QueueData, - 
QueueState, ServerInfo, TaskDescription, TaskKind, TaskKindProgram, WaitForJobsResponse, - WorkerExitInfo, WorkerInfo, + QueueState, ServerInfo, TaskDescription, TaskKind, TaskKindProgram, + WaitForJobsResponse, WorkerExitInfo, WorkerInfo, }; use tako::{JobId, JobTaskCount, JobTaskId, TaskId, WorkerId}; @@ -102,10 +102,10 @@ impl CliOutput { &self, rows: &mut Vec>, task_desc: &TaskDescription, + resource_rq: &ResourceRequestVariants, ) { let TaskDescription { kind, - resources, time_limit, priority, crash_limit, @@ -113,11 +113,11 @@ impl CliOutput { match kind { TaskKind::ExternalProgram(TaskKindProgram { - program, - pin_mode, - task_dir: _task_dir, - }) => { - let resources = format_resource_variants(resources); + program, + pin_mode, + task_dir: _task_dir, + }) => { + let resources = format_resource_variants(resource_rq); rows.push(vec![ "Resources".cell().bold(true), if !matches!(pin_mode, PinMode::None) { @@ -125,7 +125,7 @@ impl CliOutput { } else { resources } - .cell(), + .cell(), ]); rows.push(vec!["Priority".cell().bold(true), priority.cell()]); @@ -345,7 +345,7 @@ impl Output for CliOutput { configuration.max_download_tries, format_duration(configuration.wait_between_download_tries) ) - .cell(), + .cell(), ], vec![ "Manager".cell().bold(true), @@ -484,7 +484,7 @@ impl Output for CliOutput { } else { t.id.cell() } - .justify(Justify::Right), + .justify(Justify::Right), truncate_middle(&t.name, 50).cell(), status, t.n_tasks.cell(), @@ -529,7 +529,7 @@ impl Output for CliOutput { self.print_horizontal_table(rows, header); } - fn print_job_detail(&self, jobs: Vec, worker_map: WorkerMap, _server_uid: &str) { + fn print_job_detail(&self, jobs: Vec, worker_map: &WorkerMap, _server_uid: &str) { for job in jobs { let JobDetail { info, @@ -576,7 +576,7 @@ impl Output for CliOutput { JobTaskDescription::Array { ids, .. } => { itertools::Either::Left(ids.iter()) } - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { tasks, .. 
} => { itertools::Either::Right(tasks.iter().map(|t| t.id.as_num())) } }) @@ -595,10 +595,13 @@ impl Output for CliOutput { ]); if submit_descs.len() == 1 - && let JobTaskDescription::Array { task_desc, .. } = - &submit_descs[0].description().task_desc + && let JobTaskDescription::Array { + task_desc, + resource_rq, + .. + } = &submit_descs[0].description().task_desc { - self.print_job_shared_task_description(&mut rows, task_desc); + self.print_job_shared_task_description(&mut rows, task_desc, resource_rq); } rows.push(vec![ @@ -634,7 +637,7 @@ impl Output for CliOutput { duration: Duration, response: &WaitForJobsResponse, details: &[(JobId, Option)], - worker_map: WorkerMap, + worker_map: &WorkerMap, ) { let mut msgs = vec![]; @@ -679,7 +682,7 @@ impl Output for CliOutput { fn print_task_list( &self, mut jobs: Vec<(JobId, JobDetail)>, - worker_map: WorkerMap, + worker_map: &WorkerMap, _server_uid: &str, verbosity: Verbosity, ) { @@ -753,7 +756,7 @@ impl Output for CliOutput { &self, job: (JobId, JobDetail), tasks: &[(JobTaskId, JobTaskInfo)], - worker_map: WorkerMap, + worker_map: &WorkerMap, server_uid: &str, verbosity: Verbosity, ) { @@ -765,20 +768,29 @@ impl Output for CliOutput { let (start, end) = get_task_time(&task.state); let (cwd, stdout, stderr) = format_task_paths(&task_to_paths, *task_id); - let (task_desc, task_deps) = if let Some(x) = - job.submit_descs.iter().find_map(|submit_desc| { - match &submit_desc.description().task_desc { - JobTaskDescription::Array { - ids, - entries: _, - task_desc, - } if ids.contains(task_id.as_num()) => Some((task_desc, [].as_slice())), - JobTaskDescription::Array { .. 
} => None, - JobTaskDescription::Graph { tasks } => tasks - .iter() - .find(|t| t.id == *task_id) - .map(|task_dep| (&task_dep.task_desc, task_dep.task_deps.as_slice())), + let (task_desc, resource_rq, task_deps) = if let Some(x) = job + .submit_descs + .iter() + .find_map(|submit_desc| match &submit_desc.description().task_desc { + JobTaskDescription::Array { + ids, + entries: _, + task_desc, + resource_rq, + } if ids.contains(task_id.as_num()) => { + Some((task_desc, resource_rq, [].as_slice())) } + JobTaskDescription::Array { .. } => None, + JobTaskDescription::Graph { + tasks, + resource_rqs, + } => tasks.iter().find(|t| t.id == *task_id).map(|task_dep| { + ( + &task_dep.task_desc, + &resource_rqs[task_dep.resource_rq_id.as_usize()], + task_dep.task_deps.as_slice(), + ) + }), }) { x } else { @@ -788,10 +800,10 @@ impl Output for CliOutput { match &task_desc.kind { TaskKind::ExternalProgram(TaskKindProgram { - program, - pin_mode, - task_dir, - }) => { + program, + pin_mode, + task_dir, + }) => { let mut env_vars: Vec<(_, _)> = program.env.iter().filter(|(k, _)| !is_hq_env(k)).collect(); env_vars.sort_by_key(|item| item.0); @@ -853,10 +865,9 @@ impl Output for CliOutput { .unwrap_or_else(|| "None".to_string()) .cell(), ], - vec![ - "Resources".cell().bold(true), - format_resource_variants(&task_desc.resources).cell(), - ], + vec!["Resources".cell().bold(true), { + format_resource_variants(resource_rq).cell() + }], vec!["Priority".cell().bold(true), task_desc.priority.cell()], vec!["Pin".cell().bold(true), pin_mode.to_str().cell()], vec![ @@ -928,7 +939,7 @@ impl Output for CliOutput { human_size(summary.stdout_size), human_size(summary.stderr_size) ) - .cell(), + .cell(), ], vec![ "Superseded streams".cell().bold(true), @@ -941,7 +952,7 @@ impl Output for CliOutput { human_size(summary.superseded_stdout_size), human_size(summary.superseded_stderr_size) ) - .cell(), + .cell(), ], ]; self.print_vertical_table(rows); @@ -968,7 +979,7 @@ impl Output for CliOutput { 
QueueState::Active => "ACTIVE", QueueState::Paused => "PAUSED", } - .cell(), + .cell(), params.backlog.cell(), params.max_workers_per_alloc.cell(), params.max_worker_count.unwrap_or_default().cell(), @@ -1115,7 +1126,7 @@ impl Output for CliOutput { enabled_variants.to_string().color(colored::Color::Green), all_varints ) - .cell() + .cell() }; let mut header = vec![w.worker_id.cell(), can_run]; for (i, variant) in w.variants.iter().enumerate() { @@ -1299,25 +1310,25 @@ pub fn worker_status(worker_info: &WorkerInfo) -> CellStruct { match worker_info.ended.as_ref() { None => "RUNNING".cell().foreground_color(Some(Color::Green)), Some(WorkerExitInfo { - reason: LostWorkerReason::ConnectionLost, - .. - }) => "CONNECTION LOST".cell().foreground_color(Some(Color::Red)), + reason: LostWorkerReason::ConnectionLost, + .. + }) => "CONNECTION LOST".cell().foreground_color(Some(Color::Red)), Some(WorkerExitInfo { - reason: LostWorkerReason::HeartbeatLost, - .. - }) => "HEARTBEAT LOST".cell().foreground_color(Some(Color::Red)), + reason: LostWorkerReason::HeartbeatLost, + .. + }) => "HEARTBEAT LOST".cell().foreground_color(Some(Color::Red)), Some(WorkerExitInfo { - reason: LostWorkerReason::IdleTimeout, - .. - }) => "IDLE TIMEOUT".cell().foreground_color(Some(Color::Cyan)), + reason: LostWorkerReason::IdleTimeout, + .. + }) => "IDLE TIMEOUT".cell().foreground_color(Some(Color::Cyan)), Some(WorkerExitInfo { - reason: LostWorkerReason::Stopped, - .. - }) => "STOPPED".cell().foreground_color(Some(Color::Magenta)), + reason: LostWorkerReason::Stopped, + .. + }) => "STOPPED".cell().foreground_color(Some(Color::Magenta)), Some(WorkerExitInfo { - reason: LostWorkerReason::TimeLimitReached, - .. - }) => "TIME LIMIT REACHED" + reason: LostWorkerReason::TimeLimitReached, + .. 
+ }) => "TIME LIMIT REACHED" .cell() .foreground_color(Some(Color::Cyan)), } @@ -1349,7 +1360,7 @@ pub fn job_progress_bar(counters: JobTaskCounters, n_tasks: JobTaskCount, width: "{}", ".".repeat(width.saturating_sub(total_char_count)) ) - .unwrap(); + .unwrap(); buffer.push(']'); buffer @@ -1441,7 +1452,7 @@ fn format_resource_request(rq: &ResourceRequest) -> String { grq.resource, grq.policy ) - .unwrap(); + .unwrap(); first = false; } result @@ -1462,7 +1473,7 @@ fn format_resource_variants(rqv: &ResourceRequestVariants) -> String { format_resource_request(v), if is_last { "" } else { "\n\n" } ) - .unwrap(); + .unwrap(); } result } @@ -1607,7 +1618,7 @@ fn resources_full_describe(resources: &ResourceDescriptor) -> String { &descriptor.name, format_descriptor_kind(&descriptor.kind), ) - .unwrap(); + .unwrap(); first = false; } result @@ -1698,7 +1709,7 @@ fn resources_summary(resources: &ResourceDescriptor, multiline: bool) -> String "" } ) - .unwrap(); + .unwrap(); first = false; } result diff --git a/crates/hyperqueue/src/client/output/common.rs b/crates/hyperqueue/src/client/output/common.rs index c6e46ff98..5bcc69455 100644 --- a/crates/hyperqueue/src/client/output/common.rs +++ b/crates/hyperqueue/src/client/output/common.rs @@ -34,7 +34,10 @@ pub fn resolve_task_paths(job: &JobDetail, server_uid: &str) -> TaskToPathsMap { ); } } - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { + tasks, + resource_rqs, + } => { for t in tasks { task_to_desc_map.insert( t.id, diff --git a/crates/hyperqueue/src/client/output/json.rs b/crates/hyperqueue/src/client/output/json.rs index 03a1a5e48..841db1de8 100644 --- a/crates/hyperqueue/src/client/output/json.rs +++ b/crates/hyperqueue/src/client/output/json.rs @@ -7,18 +7,18 @@ use anyhow::Error; use chrono::{DateTime, Utc}; use serde::{Serialize, Serializer}; use serde_json; -use serde_json::{Value, json}; +use serde_json::{json, Value}; -use tako::gateway::{CrashLimit, ResourceRequest}; +use 
tako::gateway::{CrashLimit, ResourceRequest, ResourceRequestVariants}; use tako::program::{ProgramDefinition, StdioDef}; use tako::resources::{ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind}; use tako::worker::WorkerConfiguration; use tako::{Map, TaskId}; use crate::client::job::WorkerMap; -use crate::client::output::Verbosity; -use crate::client::output::common::{TaskToPathsMap, group_jobs_by_status, resolve_task_paths}; +use crate::client::output::common::{group_jobs_by_status, resolve_task_paths, TaskToPathsMap}; use crate::client::output::outputs::{Output, OutputStream}; +use crate::client::output::Verbosity; use crate::common::arraydef::IntArray; use crate::common::manager::info::{GetManagerInfo, ManagerType}; use crate::server::autoalloc::{Allocation, AllocationState, QueueId}; @@ -107,7 +107,7 @@ impl Output for JsonOutput { let statuses = group_jobs_by_status(&jobs); self.print(json!(statuses)) } - fn print_job_detail(&self, jobs: Vec, _worker_map: WorkerMap, server_uid: &str) { + fn print_job_detail(&self, jobs: Vec, _worker_map: &WorkerMap, server_uid: &str) { let job_details: Vec<_> = jobs .into_iter() .map(|job| { @@ -136,15 +136,15 @@ impl Output for JsonOutput { "finished_at": finished_at.map(format_datetime), "submits": submit_descs.iter().map(|submit_desc| match &submit_desc.description().task_desc { - JobTaskDescription::Array { task_desc, .. } => { + JobTaskDescription::Array { task_desc, resource_rq, .. 
} => { json!({ - "array": format_task_description(task_desc) + "array": format_task_description(task_desc, resource_rq) }) } - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { tasks, resource_rqs } => { let tasks: Vec = tasks .iter() - .map(|task| format_task_description(&task.task_desc)) + .map(|task| format_task_description(&task.task_desc, &resource_rqs[task.resource_rq_id.as_usize()])) .collect(); json!({ "graph": tasks @@ -164,7 +164,7 @@ impl Output for JsonOutput { duration: Duration, response: &WaitForJobsResponse, _details: &[(JobId, Option)], - _worker_map: WorkerMap, + _worker_map: &WorkerMap, ) { let WaitForJobsResponse { finished, @@ -194,7 +194,7 @@ impl Output for JsonOutput { fn print_task_list( &self, jobs: Vec<(JobId, JobDetail)>, - _worker_map: WorkerMap, + _worker_map: &WorkerMap, server_uid: &str, _verbosity: Verbosity, ) { @@ -210,7 +210,7 @@ impl Output for JsonOutput { &self, job: (JobId, JobDetail), tasks: &[(JobTaskId, JobTaskInfo)], - _worker_map: WorkerMap, + _worker_map: &WorkerMap, server_uid: &str, _verbosity: Verbosity, ) { @@ -289,10 +289,9 @@ fn format_crash_limit(limit: CrashLimit) -> Value { } } -fn format_task_description(task_desc: &TaskDescription) -> Value { +fn format_task_description(task_desc: &TaskDescription, rqv: &ResourceRequestVariants) -> Value { let TaskDescription { kind, - resources, time_limit, priority, crash_limit, @@ -320,7 +319,7 @@ fn format_task_description(task_desc: &TaskDescription) -> Value { "stderr": format_stdio_def(stderr), "stdout": format_stdio_def(stdout), }, - "resources": resources + "resources": rqv .variants .iter() .map(|v| { diff --git a/crates/hyperqueue/src/client/output/outputs.rs b/crates/hyperqueue/src/client/output/outputs.rs index fe7ad15cf..9b4b5ab0c 100644 --- a/crates/hyperqueue/src/client/output/outputs.rs +++ b/crates/hyperqueue/src/client/output/outputs.rs @@ -7,8 +7,8 @@ use crate::server::autoalloc::Allocation; use 
crate::stream::reader::outputlog::Summary; use std::path::Path; -use crate::client::output::Verbosity; use crate::client::output::common::TaskToPathsMap; +use crate::client::output::Verbosity; use crate::common::arraydef::IntArray; use crate::server::job::JobTaskInfo; use core::time::Duration; @@ -47,13 +47,13 @@ pub trait Output { fn print_job_open(&self, job_id: JobId); fn print_job_list(&self, jobs: Vec, total_jobs: usize); fn print_job_summary(&self, jobs: Vec); - fn print_job_detail(&self, jobs: Vec, worker_map: WorkerMap, server_uid: &str); + fn print_job_detail(&self, jobs: Vec, worker_map: &WorkerMap, server_uid: &str); fn print_job_wait( &self, duration: Duration, response: &WaitForJobsResponse, details: &[(JobId, Option)], - worker_map: WorkerMap, + worker_map: &WorkerMap, ); fn print_job_output( &self, @@ -67,7 +67,7 @@ pub trait Output { fn print_task_list( &self, jobs: Vec<(JobId, JobDetail)>, - worker_map: WorkerMap, + worker_map: &WorkerMap, server_uid: &str, verbosity: Verbosity, ); @@ -75,7 +75,7 @@ pub trait Output { &self, job: (JobId, JobDetail), tasks: &[(JobTaskId, JobTaskInfo)], - worker_map: WorkerMap, + worker_map: &WorkerMap, server_uid: &str, verbosity: Verbosity, ); diff --git a/crates/hyperqueue/src/client/output/quiet.rs b/crates/hyperqueue/src/client/output/quiet.rs index 9299a2ecc..77761796d 100644 --- a/crates/hyperqueue/src/client/output/quiet.rs +++ b/crates/hyperqueue/src/client/output/quiet.rs @@ -9,10 +9,10 @@ use tako::resources::ResourceDescriptor; use crate::client::job::WorkerMap; use crate::client::output::cli::print_job_output; use crate::client::output::common::{ - JOB_SUMMARY_STATUS_ORDER, TaskToPathsMap, Verbosity, group_jobs_by_status, + group_jobs_by_status, TaskToPathsMap, Verbosity, JOB_SUMMARY_STATUS_ORDER, }; use crate::client::output::outputs::{Output, OutputStream}; -use crate::client::status::{Status, job_status}; +use crate::client::status::{job_status, Status}; use crate::common::arraydef::IntArray; use 
crate::server::autoalloc::Allocation; use crate::server::job::JobTaskInfo; @@ -97,14 +97,14 @@ impl Output for Quiet { println!("{status} {count}"); } } - fn print_job_detail(&self, _jobs: Vec, _worker_map: WorkerMap, _server_uid: &str) {} + fn print_job_detail(&self, _jobs: Vec, _worker_map: &WorkerMap, _server_uid: &str) {} fn print_job_wait( &self, _duration: Duration, _response: &WaitForJobsResponse, _details: &[(JobId, Option)], - _worker_map: WorkerMap, + _worker_map: &WorkerMap, ) { } fn print_job_output( @@ -121,7 +121,7 @@ impl Output for Quiet { fn print_task_list( &self, _jobs: Vec<(JobId, JobDetail)>, - _worker_map: WorkerMap, + _worker_map: &WorkerMap, _server_uid: &str, _verbosity: Verbosity, ) { @@ -131,7 +131,7 @@ impl Output for Quiet { &self, _job: (JobId, JobDetail), _tasks: &[(JobTaskId, JobTaskInfo)], - _worker_map: WorkerMap, + _worker_map: &WorkerMap, _server_uid: &str, _verbosity: Verbosity, ) { diff --git a/crates/hyperqueue/src/client/task.rs b/crates/hyperqueue/src/client/task.rs index e4f378cf4..d31a694fd 100644 --- a/crates/hyperqueue/src/client/task.rs +++ b/crates/hyperqueue/src/client/task.rs @@ -1,9 +1,9 @@ use crate::client::commands::job::JobTaskIdsOpts; use crate::client::globalsettings::GlobalSettings; -use crate::client::job::get_worker_map; +use crate::client::job::{get_remote_lists, get_worker_map}; use crate::client::output::{Verbosity, VerbosityFlag}; use crate::common::arraydef::IntArray; -use crate::common::cli::{TaskSelectorArg, parse_last_range, parse_last_single_id}; +use crate::common::cli::{parse_last_range, parse_last_single_id, TaskSelectorArg}; use crate::common::error::HqError; use crate::rpc_call; use crate::transfer::connection::ClientSession; @@ -101,12 +101,11 @@ pub async fn output_job_task_list( }) .collect(); - gsettings.printer().print_task_list( - jobs, - get_worker_map(session).await?, - &response.server_uid, - verbosity, - ); + let worker_map = get_worker_map(session).await?; + + gsettings + .printer() 
+ .print_task_list(jobs, &worker_map, &response.server_uid, verbosity); Ok(()) } @@ -135,10 +134,11 @@ pub async fn output_job_task_info( match opt_job { None => log::error!("Cannot find job {job_id}"), Some(job) => { + let worker_map = get_worker_map(session).await?; gsettings.printer().print_task_info( (*job_id, job.clone()), &job.tasks, - get_worker_map(session).await?, + &worker_map, &response.server_uid, verbosity, ); diff --git a/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs b/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs index dc21977b6..e1e20c293 100644 --- a/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs +++ b/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs @@ -85,7 +85,10 @@ fn create_rows(info: &DashboardJobInfo) -> Vec { }; rows.push(JobInfoDataRow { label: "Resources", - data: format_resources(&task_desc.resources).into(), + data: { + let resources = todo!(); + format_resources(resources).into() + }, }); if let Some(time_limit) = task_desc.time_limit { rows.push(JobInfoDataRow { diff --git a/crates/hyperqueue/src/server/client/mod.rs b/crates/hyperqueue/src/server/client/mod.rs index d861a5ce8..c8135384d 100644 --- a/crates/hyperqueue/src/server/client/mod.rs +++ b/crates/hyperqueue/src/server/client/mod.rs @@ -17,12 +17,12 @@ use crate::server::event::Event; use crate::server::job::JobTaskState; use crate::server::state::{State, StateRef}; use crate::transfer::connection::accept_client; -use crate::transfer::messages::ForgetJobResponse; use crate::transfer::messages::{ CancelJobResponse, CloseJobResponse, FromClientMessage, IdSelector, JobDetail, JobDetailResponse, JobInfoResponse, JobSubmitDescription, StopWorkerResponse, StreamEvents, - SubmitRequest, SubmitResponse, TaskSelector, ToClientMessage, WorkerListResponse, + SubmitRequest, SubmitResponse, TaskSelector, ToClientMessage, }; +use crate::transfer::messages::{ForgetJobResponse, GetListResponse}; use 
tako::{JobId, JobTaskCount, WorkerId}; pub mod autoalloc; @@ -77,7 +77,7 @@ async fn handle_client( Ok(()) } -async fn stream_history_events + Unpin + 'static>( +async fn stream_history_events + Unpin + 'static>( tx: &mut Tx, mut history: mpsc::UnboundedReceiver, ) { @@ -98,8 +98,8 @@ async fn stream_history_events + } async fn stream_events< - Tx: Sink + Unpin + 'static, - Rx: Stream> + Unpin, + Tx: Sink + Unpin + 'static, + Rx: Stream> + Unpin, >( tx: &mut Tx, rx: &mut Rx, @@ -126,8 +126,8 @@ async fn stream_events< } async fn start_streaming< - Tx: Sink + Unpin + 'static, - Rx: Stream> + Unpin, + Tx: Sink + Unpin + 'static, + Rx: Stream> + Unpin, >( mut tx: Tx, mut rx: Rx, @@ -190,8 +190,8 @@ async fn start_streaming< } pub async fn client_rpc_loop< - Tx: Sink + Unpin + 'static, - Rx: Stream> + Unpin, + Tx: Sink + Unpin + 'static, + Rx: Stream> + Unpin, >( mut tx: Tx, mut rx: Rx, @@ -213,8 +213,8 @@ pub async fn client_rpc_loop< }; if let Some(mut stream_opts) = stream_opts && let ToClientMessage::SubmitResponse(SubmitResponse::Ok { - job, .. - }) = &response + job, .. 
+ }) = &response { if !stream_opts.filter.is_filtering_jobs() { let mut s = Set::new(); @@ -229,7 +229,7 @@ pub async fn client_rpc_loop< stream_opts, Some(response), ) - .await; + .await; break; } response @@ -239,7 +239,7 @@ pub async fn client_rpc_loop< compute_job_info(&state_ref, &msg.selector, msg.include_running_tasks); if let Some(mut stream_opts) = stream_opts && let ToClientMessage::JobInfoResponse(JobInfoResponse { jobs }) = - &response + &response { if !stream_opts.filter.is_filtering_jobs() { stream_opts @@ -254,7 +254,7 @@ pub async fn client_rpc_loop< stream_opts, Some(response), ) - .await; + .await; break; } response @@ -263,7 +263,9 @@ pub async fn client_rpc_loop< end_flag.notify_one(); break; } - FromClientMessage::WorkerList => handle_worker_list(&state_ref), + FromClientMessage::GetList { workers } => { + handle_get_list(&state_ref, workers) + } FromClientMessage::WorkerInfo(msg) => { handle_worker_info(&state_ref, senders, msg.worker_id, msg.runtime_info) } @@ -318,6 +320,20 @@ pub async fn client_rpc_loop< handle_task_explain(&state_ref, senders, request) } FromClientMessage::ServerDebugDump(path) => handle_server_dump(senders, &path), + /*FromClientMessage::GetResourceRqId(rqvs) => { + ToClientMessage::ResourceRqIdResponse( + rqvs.into_iter() + .map(|rqv| { + let (rq_id, new) = + senders.server_control.get_or_create_resource_rq_id(&rqv); + if new { + state_ref.get_mut().register_resource_rq(rq_id, rqv); + } + rq_id + }) + .collect(), + ) + }*/ }; if let Err(error) = tx.send(response).await { log::error!("Cannot reply to client: {error:?}"); @@ -758,16 +774,20 @@ fn handle_job_forget( ToClientMessage::ForgetJobResponse(ForgetJobResponse { forgotten, ignored }) } -fn handle_worker_list(state_ref: &StateRef) -> ToClientMessage { +fn handle_get_list(state_ref: &StateRef, workers: bool) -> ToClientMessage { let state = state_ref.get(); - ToClientMessage::WorkerListResponse(WorkerListResponse { - workers: state + let workers = if workers { + 
state .get_workers() .values() .map(|w| w.make_info(None)) - .collect(), - }) + .collect() + } else { + Vec::new() + }; + + ToClientMessage::GetListResponse(GetListResponse { workers }) } fn handle_worker_info( diff --git a/crates/hyperqueue/src/server/client/submit.rs b/crates/hyperqueue/src/server/client/submit.rs index e35a6be73..f0c6372f9 100644 --- a/crates/hyperqueue/src/server/client/submit.rs +++ b/crates/hyperqueue/src/server/client/submit.rs @@ -4,7 +4,8 @@ use std::fmt::{Debug, Formatter}; use std::path::PathBuf; use std::rc::Rc; use tako::gateway::{ - EntryType, SharedTaskConfiguration, TaskConfiguration, TaskDataFlags, TaskSubmit, + EntryType, ResourceRequestVariants, SharedTaskConfiguration, TaskConfiguration, TaskDataFlags, + TaskSubmit, }; use tako::{Map, Set, TaskId}; use thin_vec::ThinVec; @@ -14,9 +15,9 @@ use crate::common::format::human_duration; use crate::common::placeholders::{ fill_placeholders_after_submit, fill_placeholders_log, normalize_path, }; -use crate::server::Senders; use crate::server::job::{Job, JobTaskState, SubmittedJobDescription}; use crate::server::state::{State, StateRef}; +use crate::server::Senders; use crate::transfer::messages::{ JobDescription, JobSubmitDescription, JobTaskDescription, OpenJobResponse, SingleIdSelector, SubmitRequest, SubmitResponse, TaskBuildDescription, TaskDescription, TaskExplainRequest, @@ -24,39 +25,61 @@ use crate::transfer::messages::{ TaskStatusSelector, TaskWithDependencies, ToClientMessage, }; use tako::program::ProgramDefinition; +use tako::resources::{GlobalResourceMapping, ResourceRqAllocator, ResourceRqId}; use tako::{JobId, JobTaskCount, JobTaskId}; -fn create_task_submit(job_id: JobId, submit_desc: &mut JobSubmitDescription) -> TaskSubmit { +fn create_task_submit( + ra: &dyn ResourceRqAllocator, + job_id: JobId, + submit_desc: &mut JobSubmitDescription, +) -> TaskSubmit { match &mut submit_desc.task_desc { JobTaskDescription::Array { ids, entries, task_desc, - } => build_tasks_array( 
- job_id, - ids, - std::mem::take(entries), - task_desc, - &submit_desc.submit_dir, - submit_desc.stream_path.as_ref(), - ), - JobTaskDescription::Graph { tasks } => build_tasks_graph( - job_id, + resource_rq, + } => { + //let rqv = grm.convert_client_resource_rq(resource_rq); + let resource_rq_id = ra.get_or_create_resource_rq_id(resource_rq); + build_tasks_array( + job_id, + ids, + resource_rq_id, + std::mem::take(entries), + task_desc, + &submit_desc.submit_dir, + submit_desc.stream_path.as_ref(), + ) + } + JobTaskDescription::Graph { tasks, - &submit_desc.submit_dir, - submit_desc.stream_path.as_ref(), - ), + resource_rqs, + } => { + let resources: Vec = resource_rqs + .iter() + .map(|rqv| ra.get_or_create_resource_rq_id(rqv)) + .collect(); + build_tasks_graph( + &resources, + job_id, + tasks, + &submit_desc.submit_dir, + submit_desc.stream_path.as_ref(), + ) + } } } pub(crate) fn submit_job_desc( state: &mut State, + ra: &dyn ResourceRqAllocator, job_id: JobId, mut submit_desc: JobSubmitDescription, submitted_at: DateTime, ) -> TaskSubmit { prepare_job(job_id, &mut submit_desc, state); - let task_submit = create_task_submit(job_id, &mut submit_desc); + let task_submit = create_task_submit(ra, job_id, &mut submit_desc); submit_desc.strip_large_data(); state .get_job_mut(job_id) @@ -80,13 +103,17 @@ pub(crate) fn validate_submit( } } } - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { + tasks, + resource_rqs, + } => { if let Some(job) = job { for task in tasks { if job.tasks.contains_key(&task.id) { let id = task.id; return Some(SubmitResponse::TaskIdAlreadyExists(id)); } + assert!(task.resource_rq_id.as_usize() < resource_rqs.len()) } } let mut task_ids = Set::new(); @@ -179,7 +206,13 @@ pub(crate) fn handle_submit( state.add_job(job); } - let new_tasks = submit_job_desc(&mut state, job_id, submit_desc, Utc::now()); + let new_tasks = submit_job_desc( + &mut state, + &senders.server_control, + job_id, + submit_desc, + Utc::now(), + ); 
senders.autoalloc.on_job_submit(job_id); let job_detail = state @@ -207,6 +240,7 @@ fn log_submit_request(request: &SubmitRequest) { JobTaskDescription::Array { ids, entries, + resource_rq, task_desc: TaskDescription { kind: @@ -223,7 +257,6 @@ fn log_submit_request(request: &SubmitRequest) { pin_mode, task_dir, }), - resources, time_limit, priority, crash_limit, @@ -232,7 +265,7 @@ fn log_submit_request(request: &SubmitRequest) { .debug_struct("Array") .field("ids", ids) .field("entries", &entries.as_ref().map(|e| e.len())) - .field("resources", resources) + .field("resources", resource_rq) .field( "args", &args @@ -260,7 +293,7 @@ fn log_submit_request(request: &SubmitRequest) { .field("priority", priority) .field("crash_limit", crash_limit) .finish(), - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { tasks, .. } => { f.write_fmt(format_args!("Graph ({}) task(s)", tasks.len())) } } @@ -346,6 +379,7 @@ fn serialize_task_body( fn build_tasks_array( job_id: JobId, ids: &IntArray, + resource_rq_id: ResourceRqId, entries: Option>, task_desc: &TaskDescription, submit_dir: &PathBuf, @@ -353,6 +387,7 @@ fn build_tasks_array( ) -> TaskSubmit { let build_task_conf = |tako_id: TaskId, entry: Option| TaskConfiguration { id: tako_id, + resource_rq_id, shared_data_index: 0, task_deps: ThinVec::new(), dataobj_deps: ThinVec::new(), @@ -380,7 +415,6 @@ fn build_tasks_array( TaskSubmit { tasks, shared_data: vec![SharedTaskConfiguration { - resources: task_desc.resources.clone(), time_limit: task_desc.time_limit, priority: task_desc.priority, crash_limit: task_desc.crash_limit, @@ -392,6 +426,7 @@ fn build_tasks_array( } fn build_tasks_graph( + resources: &[ResourceRqId], job_id: JobId, tasks: &[TaskWithDependencies], submit_dir: &PathBuf, @@ -401,7 +436,6 @@ fn build_tasks_graph( let mut allocate_shared_data = |task: &TaskDescription, data_flags: TaskDataFlags| -> u32 { let index = shared_data.len(); shared_data.push(SharedTaskConfiguration { - resources: 
task.resources.clone(), time_limit: task.time_limit, priority: task.priority, crash_limit: task.crash_limit, @@ -434,6 +468,7 @@ fn build_tasks_graph( task_configs.push(TaskConfiguration { id: TaskId::new(job_id, task.id), + resource_rq_id: resources[task.resource_rq_id.as_usize()], shared_data_index, task_deps, dataobj_deps, @@ -514,7 +549,7 @@ mod tests { }; use tako::internal::tests::utils::sorted_vec; use tako::program::ProgramDefinition; - use tako::resources::{AllocationRequest, CPU_RESOURCE_NAME, ResourceAmount}; + use tako::resources::{AllocationRequest, ResourceAmount, CPU_RESOURCE_NAME}; use tako::{Priority, TaskId}; #[test] diff --git a/crates/hyperqueue/src/server/job.rs b/crates/hyperqueue/src/server/job.rs index c94aedad0..938837847 100644 --- a/crates/hyperqueue/src/server/job.rs +++ b/crates/hyperqueue/src/server/job.rs @@ -440,31 +440,32 @@ impl Job { self.tasks.reserve(ids.id_count() as usize); ids.iter().for_each(|task_id| { let task_id = JobTaskId::new(task_id); - assert!( - self.tasks - .insert( - task_id, - JobTaskInfo { - state: JobTaskState::Waiting, - }, - ) - .is_none() - ); + assert!(self + .tasks + .insert( + task_id, + JobTaskInfo { + state: JobTaskState::Waiting, + }, + ) + .is_none()); }) } - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { + tasks, + resource_rqs, + } => { self.tasks.reserve(tasks.len()); tasks.iter().for_each(|task| { - assert!( - self.tasks - .insert( - task.id, - JobTaskInfo { - state: JobTaskState::Waiting, - }, - ) - .is_none() - ); + assert!(self + .tasks + .insert( + task.id, + JobTaskInfo { + state: JobTaskState::Waiting, + }, + ) + .is_none()); }) } }; diff --git a/crates/hyperqueue/src/server/restore.rs b/crates/hyperqueue/src/server/restore.rs index ec1ee578c..8c99d0610 100644 --- a/crates/hyperqueue/src/server/restore.rs +++ b/crates/hyperqueue/src/server/restore.rs @@ -66,6 +66,7 @@ impl RestorerJob { } let mut new_tasks = submit_job_desc( state, + todo!(), job_id, 
submit.description().clone(), submit.submitted_at(), @@ -122,10 +123,10 @@ impl RestorerJob { for task in self.tasks.values_mut() { match &task.state { JobTaskState::Running { started_data } - if started_data.worker_ids.contains(&worker_id) => - { - task.crash_counter += 1; - } + if started_data.worker_ids.contains(&worker_id) => + { + task.crash_counter += 1; + } _ => {} } } diff --git a/crates/hyperqueue/src/server/state.rs b/crates/hyperqueue/src/server/state.rs index ef3cadfc4..953a80e98 100644 --- a/crates/hyperqueue/src/server/state.rs +++ b/crates/hyperqueue/src/server/state.rs @@ -1,5 +1,5 @@ use std::cmp::min; - +use std::collections::HashMap; use chrono::Utc; use smallvec::SmallVec; use tako::{InstanceId, ResourceVariantId, define_wrapped_type}; @@ -11,9 +11,10 @@ use crate::server::autoalloc::LostWorkerDetails; use crate::server::job::Job; use crate::server::restore::StateRestorer; use crate::server::worker::Worker; -use crate::transfer::messages::ServerInfo; -use tako::gateway::LostWorkerReason; +use crate::transfer::messages::{ServerInfo}; +use tako::gateway::{LostWorkerReason, ResourceRequestVariants}; use tako::internal::messages::common::TaskFailInfo; +use tako::resources::{GlobalResourceMapping, ResourceRqId}; use tako::task::SerializedTaskContext; use tako::worker::WorkerConfiguration; use tako::{JobId, Map, WorkerId}; @@ -36,7 +37,7 @@ impl State { self.jobs.get_mut(&job_id) } - pub fn jobs(&self) -> impl Iterator { + pub fn jobs(&self) -> impl Iterator { self.jobs.values() } @@ -90,7 +91,7 @@ impl State { self.job_id_counter = id.as_num(); } - pub fn last_n_ids(&self, n: u32) -> impl Iterator + use<> { + pub fn last_n_ids(&self, n: u32) -> impl Iterator + use < > { let n = min(n, self.job_id_counter - 1); ((self.job_id_counter - n)..self.job_id_counter).map(|id| id.into()) } @@ -179,42 +180,6 @@ impl State { job.set_finished_state(id.job_task_id(), now, senders); } - /* - pub fn process_task_update(&mut self, id: TaskId, state: TaskState, 
senders: &Senders) { - log::debug!("Task id={} updated {:?}", id, state); - match state { - TaskState::Running { - instance_id, - worker_ids, - context, - } => { - let job = self.get_job_mut(id.job_id()).unwrap(); - let now = Utc::now(); - job.set_running_state(id.job_task_id(), worker_ids.clone(), context, now); - for worker_id in &worker_ids { - if let Some(worker) = self.workers.get_mut(worker_id) { - worker.update_task_started(id, now); - } - } - senders - .events - .on_task_started(id, instance_id, worker_ids.clone(), now); - } - TaskState::Finished => { - let now = Utc::now(); - let job = self.get_job_mut(id.job_id()).unwrap(); - job.set_finished_state(id.job_task_id(), now, senders); - } - TaskState::Waiting => { - let job = self.get_job_mut(id.job_id()).unwrap(); - job.set_waiting_state(id.job_task_id()); - } - TaskState::Invalid => { - unreachable!() - } - }; - }*/ - pub fn process_worker_new( &mut self, senders: &Senders, diff --git a/crates/hyperqueue/src/transfer/messages.rs b/crates/hyperqueue/src/transfer/messages.rs index 094fb18fd..1bdd09ce8 100644 --- a/crates/hyperqueue/src/transfer/messages.rs +++ b/crates/hyperqueue/src/transfer/messages.rs @@ -3,14 +3,14 @@ use serde::Deserialize; use serde::Serialize; use std::borrow::Cow; -use crate::JobDataObjectId; use crate::client::status::Status; use crate::common::arraydef::IntArray; use crate::common::manager::info::ManagerType; use crate::server::autoalloc::{Allocation, AllocationId, QueueId, QueueParameters}; -use crate::server::event::Event; use crate::server::event::streamer::EventFilter; +use crate::server::event::Event; use crate::server::job::{JobTaskCounters, JobTaskInfo, SubmittedJobDescription}; +use crate::JobDataObjectId; use std::path::PathBuf; use std::time::Duration; use tako::gateway::{ @@ -18,10 +18,10 @@ use tako::gateway::{ WorkerRuntimeInfo, }; use tako::program::ProgramDefinition; -use tako::resources::ResourceDescriptor; +use tako::resources::{ResourceDescriptor, ResourceRqId}; 
use tako::server::TaskExplanation; use tako::worker::WorkerConfiguration; -use tako::{JobId, JobTaskCount, JobTaskId, Map, TaskId, WorkerId}; +use tako::{define_id_type, JobId, JobTaskCount, JobTaskId, Map, TaskId, WorkerId}; // Messages client -> server #[allow(clippy::large_enum_variant)] @@ -32,6 +32,7 @@ pub enum FromClientMessage { /// It is basically as sending Submit and StreamEvents, but it is done atomically, /// so no message is lost. Submit(SubmitRequest, Option), + //GetResourceRqId(Vec), Cancel(CancelRequest), ForgetJob(ForgetJobRequest), JobDetail(JobDetailRequest), @@ -40,7 +41,9 @@ pub enum FromClientMessage { /// It is basically as sending JobInfo and StreamEvents, but it is done atomically, /// so no message is lost. JobInfo(JobInfoRequest, Option), - WorkerList, + GetList { + workers: bool, + }, WorkerInfo(WorkerInfoRequest), StopWorker(StopWorkerMessage), Stop, @@ -139,10 +142,11 @@ pub enum TaskKind { ExternalProgram(TaskKindProgram), } +define_id_type!(LocalResourceRqId, u32); + #[derive(Serialize, Deserialize, Debug, Clone)] pub struct TaskDescription { pub kind: TaskKind, - pub resources: ResourceRequestVariants, pub time_limit: Option, pub priority: tako::Priority, pub crash_limit: CrashLimit, @@ -161,6 +165,7 @@ impl TaskDescription { #[derive(Serialize, Deserialize, Debug, Clone)] pub struct TaskWithDependencies { pub id: JobTaskId, + pub resource_rq_id: LocalResourceRqId, pub task_desc: TaskDescription, pub task_deps: Vec, pub data_deps: Vec, @@ -180,17 +185,21 @@ pub enum JobTaskDescription { Array { ids: IntArray, entries: Option>, + resource_rq: ResourceRequestVariants, task_desc: TaskDescription, }, - /// Generic DAG of tasks usually submitted through the Python binding. - Graph { tasks: Vec }, + /// Generic DAG of tasks usually submitted through the Python binding or job file. 
+ Graph { + resource_rqs: Vec, + tasks: Vec, + }, } impl JobTaskDescription { pub fn task_count(&self) -> JobTaskCount { match self { JobTaskDescription::Array { ids, .. } => ids.id_count() as JobTaskCount, - JobTaskDescription::Graph { tasks } => tasks.len() as JobTaskCount, + JobTaskDescription::Graph { tasks, .. } => tasks.len() as JobTaskCount, } } @@ -200,11 +209,15 @@ impl JobTaskDescription { ids: _, entries, task_desc, + resource_rq: _, } => { *entries = None; task_desc.strip_large_data(); } - JobTaskDescription::Graph { tasks } => { + JobTaskDescription::Graph { + resource_rqs: _, + tasks, + } => { for task in tasks { task.strip_large_data() } @@ -381,7 +394,8 @@ pub enum ToClientMessage { JobInfoResponse(JobInfoResponse), JobDetailResponse(JobDetailResponse), SubmitResponse(SubmitResponse), - WorkerListResponse(WorkerListResponse), + ResourceRqIdResponse(Vec), + GetListResponse(GetListResponse), WorkerInfoResponse(Option), StopWorkerResponse(Vec<(WorkerId, StopWorkerResponse)>), CancelJobResponse(Vec<(JobId, CancelJobResponse)>), @@ -506,7 +520,7 @@ pub struct JobDetail { } #[derive(Serialize, Deserialize, Debug)] -pub struct WorkerListResponse { +pub struct GetListResponse { pub workers: Vec, } diff --git a/crates/tako/src/connection.rs b/crates/tako/src/connection.rs index de6960ecb..c726c2d7f 100644 --- a/crates/tako/src/connection.rs +++ b/crates/tako/src/connection.rs @@ -6,8 +6,8 @@ use futures::stream::{SplitSink, SplitStream}; use futures::{Sink, SinkExt, Stream, StreamExt}; use orion::aead::streaming::{StreamOpener, StreamSealer}; use orion::kdf::SecretKey; -use serde::Serialize; use serde::de::DeserializeOwned; +use serde::Serialize; use std::marker::PhantomData; use std::sync::Arc; use tokio::net::TcpStream; @@ -41,9 +41,7 @@ impl Connection { self.send(item).await?; match self.receive().await { Some(msg) => msg, - None => Err(crate::Error::GenericError( - "Expected response was not received".into(), - )), + None => 
Err(crate::Error::GenericError("Connection closed".into())), } } diff --git a/crates/tako/src/control.rs b/crates/tako/src/control.rs index 3d0aa4383..801164f29 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -13,6 +13,7 @@ use crate::gateway::{ LostWorkerReason, MultiNodeAllocationResponse, TaskSubmit, WorkerRuntimeInfo, }; use crate::internal::common::error::DsError; +use crate::internal::common::resources::map::ResourceRqAllocator; use crate::internal::common::resources::{ResourceId, ResourceRqId}; use crate::internal::messages::worker::ToWorkerMessage; use crate::internal::scheduler::query::compute_new_worker_query; @@ -21,7 +22,7 @@ use crate::internal::server::client::handle_new_tasks; use crate::internal::server::comm::{Comm, CommSenderRef}; use crate::internal::server::core::{CoreRef, CustomConnectionHandler}; use crate::internal::server::explain::{ - TaskExplanation, task_explain_for_worker, task_explain_init, + task_explain_for_worker, task_explain_init, TaskExplanation, }; use crate::internal::server::reactor::{get_or_create_resource_rq_id, on_cancel_tasks}; use crate::internal::server::worker::DEFAULT_WORKER_OVERVIEW_INTERVAL; @@ -204,13 +205,6 @@ impl ServerRef { let core = self.core_ref.get(); core.dump(now) } - - pub fn get_or_create_resource_rq_id(&self, rqv: &ResourceRequestVariants) -> ResourceRqId { - let mut core = self.core_ref.get_mut(); - let mut comm = self.comm_ref.get_mut(); - let rqv = core.convert_client_resource_rq(rqv); - get_or_create_resource_rq_id(&mut core, &mut *comm, &rqv) - } } #[allow(clippy::too_many_arguments)] @@ -258,3 +252,15 @@ pub fn server_start( Ok((ServerRef { core_ref, comm_ref }, future)) } + +impl ResourceRqAllocator for ServerRef { + fn get_or_create_resource_rq_id( + &self, + rqv: &crate::gateway::ResourceRequestVariants, + ) -> ResourceRqId { + let mut core = self.core_ref.get_mut(); + let mut comm = self.comm_ref.get_mut(); + let (rq_id, _) = get_or_create_resource_rq_id(&mut core, 
&mut *comm, &rqv); + rq_id + } +} diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index 684fb68df..7a81b90ca 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -1,6 +1,6 @@ use crate::gateway::ResourceRequestVariants as ClientResourceRequestVariants; -use crate::internal::common::Map; use crate::internal::common::resources::{ResourceId, ResourceRqId}; +use crate::internal::common::Map; use crate::internal::server::core::Core; use crate::resources::{ResourceAllocRequest, ResourceRequest, ResourceRequestVariants}; use serde::{Deserialize, Serialize}; @@ -13,7 +13,7 @@ pub const AMD_GPU_RESOURCE_NAME: &str = "gpus/amd"; pub const MEM_RESOURCE_NAME: &str = "mem"; #[derive(Debug)] -pub(crate) struct GlobalResourceMapping { +pub struct GlobalResourceMapping { resource_rq_from_id: ResourceRqMap, resource_rq_to_id: Map, resource_names: Map, @@ -33,7 +33,7 @@ impl Default for GlobalResourceMapping { } impl GlobalResourceMapping { - pub(crate) fn convert_client_resource_rq( + pub fn convert_client_resource_rq( &mut self, resources: &ClientResourceRequestVariants, ) -> ResourceRequestVariants { @@ -93,6 +93,23 @@ impl GlobalResourceMapping { } pub fn get_or_create_resource_rq_id( + &mut self, + rq: &ClientResourceRequestVariants, + ) -> (ResourceRqId, bool) { + let rqv = self.convert_client_resource_rq(rq); + match self.resource_rq_to_id.get(&rqv) { + Some(&id) => (id, false), + None => { + let mut id = ResourceRqId::new(self.resource_rq_to_id.len() as u32); + log::debug!("New resource request registered {rqv:?} as {id}"); + self.resource_rq_to_id.insert(rqv.clone(), id); + self.resource_rq_from_id.insert(id, rqv); + (id, true) + } + } + } + + /* pub fn get_or_create_resource_rq_id( &mut self, rqv: &ResourceRequestVariants, ) -> (ResourceRqId, bool) { @@ -109,7 +126,7 @@ impl GlobalResourceMapping { (id, true) } } - } + }*/ } #[derive(Default, 
Debug)] @@ -158,16 +175,17 @@ impl ResourceIdMap { #[derive(Default, Debug, Clone, Serialize, Deserialize)] #[serde(transparent)] -pub struct ResourceRqMap(Map); +pub struct ResourceRqMap(Vec); impl ResourceRqMap { pub fn insert(&mut self, rq_id: ResourceRqId, rqv: ResourceRequestVariants) { - assert!(self.0.insert(rq_id, rqv).is_none()); + assert_eq!(rq_id.as_usize(), self.0.len()); + self.0.push(rqv); } #[inline] - pub fn get(&self, rq_id: &ResourceRqId) -> &ResourceRequestVariants { - self.0.get(rq_id).unwrap() + pub fn get(&self, rq_id: ResourceRqId) -> &ResourceRequestVariants { + self.0.get(rq_id.as_usize()).unwrap() } #[cfg(test)] @@ -187,3 +205,10 @@ impl ResourceRqMap { } } } + +pub trait ResourceRqAllocator { + fn get_or_create_resource_rq_id( + &self, + rqv: &crate::gateway::ResourceRequestVariants, + ) -> ResourceRqId; +} diff --git a/crates/tako/src/internal/common/resources/mod.rs b/crates/tako/src/internal/common/resources/mod.rs index f3e73d6a3..23b1efd19 100644 --- a/crates/tako/src/internal/common/resources/mod.rs +++ b/crates/tako/src/internal/common/resources/mod.rs @@ -12,8 +12,8 @@ pub use descriptor::{ ResourceDescriptorItem, ResourceDescriptorKind, }; pub use map::{ - AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, MEM_RESOURCE_NAME, - NVIDIA_GPU_RESOURCE_NAME, + GlobalResourceMapping, ResourceRqAllocator, AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, + CPU_RESOURCE_NAME, MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, }; pub use request::{ AllocationRequest, ResourceAllocRequest, ResourceRequest, ResourceRequestEntries, @@ -30,12 +30,6 @@ define_id_type!(ResourceId, u32); // Identifies a globally unique Resource request stored in Core. define_id_type!(ResourceRqId, u32); -impl ResourceRqId { - pub fn is_multi_node(&self) -> bool { - self.0 % 2 == 1 - } -} - // Represents an index within a single generic resource (e.g. GPU with ID 1). 
define_id_type!(ResourceIndex, u32); diff --git a/crates/tako/src/internal/scheduler/multinode.rs b/crates/tako/src/internal/scheduler/multinode.rs index f8cdbb799..07039c504 100644 --- a/crates/tako/src/internal/scheduler/multinode.rs +++ b/crates/tako/src/internal/scheduler/multinode.rs @@ -1,5 +1,5 @@ -use crate::internal::common::resources::ResourceRqId; use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; +use crate::internal::common::resources::ResourceRqId; use crate::internal::server::task::Task; use crate::internal::server::taskmap::TaskMap; use crate::internal::server::worker::Worker; @@ -56,7 +56,7 @@ impl MultiNodeQueue { } else { self.requests.push(task.resource_rq_id); self.requests.sort_unstable_by_key(|id| { - let rq = resource_map.get(id).trivial_request().unwrap(); + let rq = resource_map.get(*id).trivial_request().unwrap(); std::cmp::Reverse((rq.n_nodes(), rq.min_time())) }); &mut self @@ -210,7 +210,7 @@ impl<'a> MultiNodeAllocator<'a> { continue; } - let rq = self.resource_map.get(rq_id).unwrap_first(); + let rq = self.resource_map.get(*rq_id).unwrap_first(); match find_workers_for_task(rq, self.worker_map, self.worker_groups, self.now) { TaskFindWorkersResult::Ready(workers) => { let task_id = qfr.queue.pop().unwrap().0; diff --git a/crates/tako/src/internal/scheduler/state.rs b/crates/tako/src/internal/scheduler/state.rs index fca5a66cc..9611f7625 100644 --- a/crates/tako/src/internal/scheduler/state.rs +++ b/crates/tako/src/internal/scheduler/state.rs @@ -5,8 +5,8 @@ use std::time::{Duration, Instant}; use tokio::sync::Notify; use tokio::time::sleep; -use crate::internal::common::Map; use crate::internal::common::resources::map::ResourceRqMap; +use crate::internal::common::Map; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use crate::internal::scheduler::multinode::MultiNodeAllocator; use crate::internal::server::comm::{Comm, CommSender, CommSenderRef}; @@ -308,7 +308,7 @@ impl 
SchedulerState { let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); let task = tasks.get_task_mut(task_id); let assigned_worker = task.get_assigned_worker(); - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); if let Some(w_id) = assigned_worker { log::debug!( "Changing assignment of task={} from worker={} to worker={}", @@ -412,7 +412,7 @@ impl SchedulerState { let Some(task) = tasks.find_task_mut(task_id) else { continue; }; - let rq = resource_map.get(&task.resource_rq_id); + let rq = resource_map.get(task.resource_rq_id); if let Some(worker) = self.choose_worker_for_task( task, rq, @@ -458,7 +458,7 @@ impl SchedulerState { if task.is_sn_running() { continue; } - let rq = request_map.get(&task.resource_rq_id); + let rq = request_map.get(task.resource_rq_id); task.set_take_flag(false); min_resource.include_rqv(rq); balanced_tasks.push(task_id); @@ -507,7 +507,7 @@ impl SchedulerState { .difficulty .entry(task.resource_rq_id) .or_insert_with(|| { - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); worker.resources.compute_difficulty_score_of_rqv(&rqv) }); log::debug!( diff --git a/crates/tako/src/internal/server/core.rs b/crates/tako/src/internal/server/core.rs index 523a14c55..ff695ce13 100644 --- a/crates/tako/src/internal/server/core.rs +++ b/crates/tako/src/internal/server/core.rs @@ -357,7 +357,11 @@ impl Core { pub fn add_ready_to_assign(&mut self, task_id: TaskId) { let task = self.tasks.get_task(task_id); - if task.resource_rq_id.is_multi_node() { + if self + .get_resource_rq_map() + .get(task.resource_rq_id) + .is_multi_node() + { self.multi_node_queue .add_task(task, self.resource_map.get_resource_rq_map()); } else { @@ -559,11 +563,8 @@ impl Core { } #[inline] - pub fn get_or_create_resource_rq_id( - &mut self, - rqv: &ResourceRequestVariants, - ) -> (ResourceRqId, bool) { - self.resource_map.get_or_create_resource_rq_id(rqv) + pub fn 
resource_map_mut(&mut self) -> &mut GlobalResourceMapping { + &mut self.resource_map } #[inline] @@ -577,7 +578,7 @@ impl Core { #[inline] pub fn get_resource_rq(&self, rq_id: ResourceRqId) -> &ResourceRequestVariants { - self.resource_map.get_resource_rq_map().get(&rq_id) + self.resource_map.get_resource_rq_map().get(rq_id) } pub fn secret_key(&self) -> Option<&Arc> { diff --git a/crates/tako/src/internal/server/reactor.rs b/crates/tako/src/internal/server/reactor.rs index 618c4dc66..08b88c2b2 100644 --- a/crates/tako/src/internal/server/reactor.rs +++ b/crates/tako/src/internal/server/reactor.rs @@ -1,4 +1,5 @@ use crate::datasrv::{DataObjectId, OutputId}; +use crate::gateway::ResourceRequestVariants as ClientResourceRequestVariants; use crate::gateway::{CrashLimit, LostWorkerReason}; use crate::internal::common::resources::ResourceRqId; use crate::internal::common::{Map, Set}; @@ -122,7 +123,7 @@ pub(crate) fn on_remove_worker( let (tasks, workers, requests) = core.split_tasks_workers_requests_mut(); for (w_id, task_id) in removes { let task = tasks.get_task(task_id); - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); workers.get_worker_mut(w_id).remove_sn_task(task, rqv); } } @@ -233,7 +234,7 @@ pub(crate) fn on_task_running( TaskRuntimeState::Stealing(w_id, Some(target_id)) => { assert_eq!(*w_id, worker_id); let worker = workers.get_worker_mut(*target_id); - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); worker.remove_sn_task(task, rqv); let worker = workers.get_worker_mut(*w_id); worker.insert_sn_task(task, rqv); @@ -292,7 +293,7 @@ pub(crate) fn on_task_finished( &msg.outputs ); assert!(task.is_assigned_or_stolen_from(worker_id)); - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); match &task.state { TaskRuntimeState::Assigned(w_id) @@ -474,12 +475,12 @@ fn fail_task_helper( if let Some(task) = tasks.find_task(task_id) { 
log::debug!("Task task_id={task_id} failed"); if let Some(worker_id) = worker_id { - if task.resource_rq_id.is_multi_node() { + if requests.get(task.resource_rq_id).is_multi_node() { let ws = task.mn_placement().unwrap(); assert_eq!(ws[0], worker_id); reset_mn_task_workers(workers, ws, task_id); } else { - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); assert!(task.is_assigned_or_stolen_from(worker_id)); workers.get_worker_mut(worker_id).remove_sn_task(task, rqv); } @@ -553,7 +554,7 @@ pub(crate) fn on_cancel_tasks(core: &mut Core, comm: &mut impl Comm, task_ids: & | TaskRuntimeState::Running { worker_id: w_id, .. } => { - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); workers.get_worker_mut(w_id).remove_sn_task(task, rqv); running_ids.entry(w_id).or_default().push(task_id); } @@ -565,7 +566,7 @@ pub(crate) fn on_cancel_tasks(core: &mut Core, comm: &mut impl Comm, task_ids: & } TaskRuntimeState::Stealing(from_id, to_id) => { if let Some(to_id) = to_id { - let rqv = requests.get(&task.resource_rq_id); + let rqv = requests.get(task.resource_rq_id); workers.get_worker_mut(to_id).remove_sn_task(task, rqv); } running_ids.entry(from_id).or_default().push(task_id); @@ -606,12 +607,16 @@ pub(crate) fn on_resolve_placement( pub(crate) fn get_or_create_resource_rq_id( core: &mut Core, comm: &mut impl Comm, - rqv: &ResourceRequestVariants, -) -> ResourceRqId { - let (rq_id, is_new) = core.get_or_create_resource_rq_id(rqv); + rqv: &ClientResourceRequestVariants, +) -> (ResourceRqId, bool) { + let map = core.resource_map_mut(); + let (rq_id, is_new) = map.get_or_create_resource_rq_id(rqv); if is_new { - let msg = ToWorkerMessage::NewResourceRequest(rq_id, rqv.clone()); + let msg = ToWorkerMessage::NewResourceRequest( + rq_id, + map.get_resource_rq_map().get(rq_id).clone(), + ); comm.broadcast_worker_message(&msg); } - rq_id + (rq_id, is_new) } diff --git 
a/crates/tako/src/internal/server/task.rs b/crates/tako/src/internal/server/task.rs index 7612391b9..f3262f1f1 100644 --- a/crates/tako/src/internal/server/task.rs +++ b/crates/tako/src/internal/server/task.rs @@ -137,7 +137,10 @@ static_assert_size!(Task, 120); impl fmt::Debug for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { //let task_ids : Vec<_> = self.tasks.iter().map(|r| r.get().id.to_string()).collect(); - f.debug_struct("Task").field("id", &self.id).finish() + f.debug_struct("Task") + .field("id", &self.id) + .field("state", &self.state) + .finish() } } diff --git a/crates/tako/src/internal/server/worker.rs b/crates/tako/src/internal/server/worker.rs index df6f3372d..ff17de27d 100644 --- a/crates/tako/src/internal/server/worker.rs +++ b/crates/tako/src/internal/server/worker.rs @@ -1,10 +1,10 @@ use std::fmt; use crate::gateway::{LostWorkerReason, WorkerRuntimeInfo}; -use crate::internal::common::Set; use crate::internal::common::resources::map::{ResourceIdMap, ResourceRqMap}; use crate::internal::common::resources::{ResourceRequest, ResourceRequestVariants}; use crate::internal::common::resources::{ResourceRqId, TimeRequest}; +use crate::internal::common::Set; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use crate::internal::server::comm::Comm; use crate::internal::server::task::{Task, TaskRuntimeState}; @@ -73,7 +73,7 @@ impl fmt::Debug for Worker { .field("id", &self.id) .field("resources", &self.configuration.resources) .field("load", &self.sn_load) - .field("tasks", &self.sn_tasks.len()) + .field("tasks", &self.sn_tasks) .finish() } } @@ -163,7 +163,7 @@ impl Worker { let mut trivial = true; for &task_id in &self.sn_tasks { let task = task_map.get_task(task_id); - let rqv = request_map.get(&task.resource_rq_id); + let rqv = request_map.get(task.resource_rq_id); trivial &= rqv.is_trivial(); check_load.add_request(task_id, rqv, task.running_variant(), &self.resources); } @@ -273,7 +273,7 @@ impl Worker { 
.filter(|task_id| { let task = task_map.get_task_mut(*task_id); if task.is_assigned() - && !self.is_capable_to_run_rqv(request_map.get(&task.resource_rq_id), now) + && !self.is_capable_to_run_rqv(request_map.get(task.resource_rq_id), now) { log::debug!( "Retracting task={task_id}, time request cannot be fulfilled anymore" diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index 8fc7c8e5a..3d26df88a 100644 --- a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -21,9 +21,9 @@ use crate::gateway::{ use crate::internal::common::resources::ResourceRqId; use crate::internal::common::{Map, Set}; use crate::internal::messages::common::TaskFailInfo; -use crate::internal::tests::integration::utils::api::{WaitResult, wait_for_tasks}; +use crate::internal::tests::integration::utils::api::{wait_for_tasks, WaitResult}; use crate::internal::tests::integration::utils::worker::{ - WorkerContext, WorkerHandle, start_worker, + start_worker, WorkerContext, WorkerHandle, }; use crate::task::SerializedTaskContext; use crate::tests::integration::utils::task::{ResourceRequestConfig, ResourceRequestConfigBuilder}; @@ -117,16 +117,6 @@ impl ServerHandle { .await .unwrap() } - - pub fn register_request(&self, rr_builder: ResourceRequestConfigBuilder) -> ResourceRqId { - let rqv = rr_builder.into_rqv(); - self.server_ref.get_or_create_resource_rq_id(&rqv) - } - - pub fn register_default_request(&self) -> ResourceRqId { - let config = ResourceRequestConfigBuilder::default().cpus(1); - self.register_request(config) - } } #[derive(Clone)] diff --git a/crates/tako/src/internal/worker/state.rs b/crates/tako/src/internal/worker/state.rs index 3a5a77bcc..64cfe159c 100644 --- a/crates/tako/src/internal/worker/state.rs +++ b/crates/tako/src/internal/worker/state.rs @@ -17,7 +17,6 @@ use std::rc::Rc; use std::sync::Arc; use std::time::{Duration, 
Instant}; -use crate::WorkerId; use crate::internal::worker::data::download::WorkerDownloadManagerRef; use crate::internal::worker::localcomm::LocalCommState; use crate::internal::worker::resources::allocator::ResourceAllocator; @@ -27,11 +26,12 @@ use crate::internal::worker::task::{RunningState, Task, TaskState}; use crate::internal::worker::task_comm::RunningTaskComm; use crate::launcher::TaskLauncher; use crate::resources::ResourceRequestVariants; +use crate::WorkerId; use crate::{PriorityTuple, TaskId}; use orion::aead::SecretKey; -use rand::SeedableRng; use rand::prelude::IndexedRandom; use rand::rngs::SmallRng; +use rand::SeedableRng; use tokio::sync::oneshot; pub type TaskMap = StableMap; @@ -322,7 +322,7 @@ impl WorkerState { } pub fn get_resource_rq(&self, rq_id: ResourceRqId) -> &ResourceRequestVariants { - self.resource_rq_map.get(&rq_id) + self.resource_rq_map.get(rq_id) } pub fn get_resource_label_map(&self) -> &ResourceLabelMap { @@ -345,11 +345,10 @@ impl WorkerState { &other_worker.address ); assert_ne!(self.worker_id, other_worker.worker_id); // We should not receive message about ourselves - assert!( - self.worker_addresses - .insert(other_worker.worker_id, other_worker.address) - .is_none() - ); + assert!(self + .worker_addresses + .insert(other_worker.worker_id, other_worker.address) + .is_none()); let resources = WorkerResources::from_transport(other_worker.resources); self.ready_task_queue diff --git a/crates/tako/src/lib.rs b/crates/tako/src/lib.rs index 337a7e2a9..21382cd72 100644 --- a/crates/tako/src/lib.rs +++ b/crates/tako/src/lib.rs @@ -12,10 +12,10 @@ pub mod hwstats; pub mod launcher; pub mod program; -pub use crate::internal::common::WrappedRcRefCell; pub use crate::internal::common::index::{AsIdVec, ItemId}; pub use crate::internal::common::taskgroup::TaskGroup; pub use crate::internal::common::utils::format_comma_delimited; +pub use crate::internal::common::WrappedRcRefCell; pub use crate::internal::common::{Map, Set}; pub use 
crate::internal::common::ids::{ @@ -35,12 +35,13 @@ pub const MAX_FRAME_SIZE: usize = 128 * 1024 * 1024; pub mod resources { pub use crate::internal::common::resources::{ - AMD_GPU_RESOURCE_NAME, Allocation, AllocationRequest, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, - MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, NumOfNodes, ResourceAllocRequest, + Allocation, AllocationRequest, GlobalResourceMapping, NumOfNodes, ResourceAllocRequest, ResourceAllocation, ResourceAmount, ResourceDescriptor, ResourceDescriptorCoupling, ResourceDescriptorCouplingItem, ResourceDescriptorItem, ResourceDescriptorKind, ResourceFractions, ResourceGroupIdx, ResourceIndex, ResourceLabel, ResourceRequest, - ResourceRequestEntries, ResourceRequestVariants, ResourceUnits, TimeRequest, + ResourceRequestEntries, ResourceRequestVariants, ResourceRqAllocator, ResourceRqId, + ResourceUnits, TimeRequest, AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, + MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, }; pub use crate::internal::common::resources::map::ResourceIdMap; From d3cab965880b7b6fa50168dca1165b151633c85e Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Wed, 17 Dec 2025 15:47:10 +0100 Subject: [PATCH 04/17] Tests updated --- crates/tako/src/control.rs | 12 ++++++++++ .../tako/src/internal/common/resources/map.rs | 23 +++++++++++-------- crates/tako/src/internal/server/reactor.rs | 18 +++++++++++++++ .../internal/tests/integration/test_basic.rs | 6 ++--- .../tests/integration/utils/server.rs | 13 +++++++++++ crates/tako/src/internal/tests/test_worker.rs | 20 ++++++++-------- crates/tako/src/internal/tests/utils/task.rs | 2 +- 7 files changed, 69 insertions(+), 25 deletions(-) diff --git a/crates/tako/src/control.rs b/crates/tako/src/control.rs index 801164f29..18dceedb3 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -205,6 +205,18 @@ impl ServerRef { let core = self.core_ref.get(); core.dump(now) } + + #[cfg(test)] + pub fn get_or_create_raw_rq_id( + &self, + rqv: 
crate::resources::ResourceRequestVariants, + ) -> ResourceRqId { + use crate::internal::server::reactor::get_or_create_raw_resource_rq_id; + let mut core = self.core_ref.get_mut(); + let mut comm = self.comm_ref.get_mut(); + let (rq_id, _) = get_or_create_raw_resource_rq_id(&mut core, &mut *comm, rqv); + rq_id + } } #[allow(clippy::too_many_arguments)] diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index 7a81b90ca..32af4eaed 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -92,11 +92,7 @@ impl GlobalResourceMapping { *self.resource_rq_to_id.get(rqv).unwrap() } - pub fn get_or_create_resource_rq_id( - &mut self, - rq: &ClientResourceRequestVariants, - ) -> (ResourceRqId, bool) { - let rqv = self.convert_client_resource_rq(rq); + pub fn get_or_create_rq_id(&mut self, rqv: ResourceRequestVariants) -> (ResourceRqId, bool) { match self.resource_rq_to_id.get(&rqv) { Some(&id) => (id, false), None => { @@ -109,6 +105,14 @@ impl GlobalResourceMapping { } } + pub fn get_or_create_resource_rq_id( + &mut self, + rq: &ClientResourceRequestVariants, + ) -> (ResourceRqId, bool) { + let rqv = self.convert_client_resource_rq(rq); + self.get_or_create_rq_id(rqv) + } + /* pub fn get_or_create_resource_rq_id( &mut self, rqv: &ResourceRequestVariants, @@ -193,14 +197,13 @@ impl ResourceRqMap { if let Some(rq_id) = self .0 .iter() - .find_map(|(rq_id, rqv2)| (&rqv == rqv2).then(|| *rq_id)) + .enumerate() + .find_map(|(rq_id, rqv2)| (&rqv == rqv2).then(|| ResourceRqId::new(rq_id as u32))) { rq_id } else { - let mut new_id = ResourceRqId::new( - self.0.len() as u32 * 2 + if rqv.is_multi_node() { 1 } else { 0 }, - ); - self.0.insert(new_id, rqv); + let mut new_id = ResourceRqId::new(self.0.len() as u32); + self.0.push(rqv); new_id } } diff --git a/crates/tako/src/internal/server/reactor.rs b/crates/tako/src/internal/server/reactor.rs index 
08b88c2b2..d30e36fcb 100644 --- a/crates/tako/src/internal/server/reactor.rs +++ b/crates/tako/src/internal/server/reactor.rs @@ -620,3 +620,21 @@ pub(crate) fn get_or_create_resource_rq_id( } (rq_id, is_new) } + +#[cfg(test)] +pub(crate) fn get_or_create_raw_resource_rq_id( + core: &mut Core, + comm: &mut impl Comm, + rqv: ResourceRequestVariants, +) -> (ResourceRqId, bool) { + let map = core.resource_map_mut(); + let (rq_id, is_new) = map.get_or_create_rq_id(rqv); + if is_new { + let msg = ToWorkerMessage::NewResourceRequest( + rq_id, + map.get_resource_rq_map().get(rq_id).clone(), + ); + comm.broadcast_worker_message(&msg); + } + (rq_id, is_new) +} diff --git a/crates/tako/src/internal/tests/integration/test_basic.rs b/crates/tako/src/internal/tests/integration/test_basic.rs index 5dcfc7ea9..59fc03f67 100644 --- a/crates/tako/src/internal/tests/integration/test_basic.rs +++ b/crates/tako/src/internal/tests/integration/test_basic.rs @@ -1,9 +1,9 @@ use crate::control::{NewWorkerAllocationResponse, WorkerTypeQuery}; use crate::internal::tests::integration::utils::check_file_contents; -use crate::internal::tests::integration::utils::server::{ServerHandle, run_server_test}; +use crate::internal::tests::integration::utils::server::{run_server_test, ServerHandle}; use crate::internal::tests::integration::utils::task::ResourceRequestConfigBuilder; use crate::internal::tests::integration::utils::task::{ - GraphBuilder, TaskConfigBuilder, simple_args, simple_task, + simple_args, simple_task, GraphBuilder, TaskConfigBuilder, }; use crate::program::StdioDef; use crate::resources::ResourceDescriptor; @@ -13,7 +13,7 @@ use tokio::time::sleep; #[tokio::test] async fn test_submit_simple_task_ok() { run_server_test(Default::default(), |mut handler| async move { - let rq = handler.register_default_request(); + let rq = todo!(); // handler.register_default_request(); let worker = handler.start_worker(Default::default()).await.unwrap(); let stdout = 
worker.workdir.join("test.out"); diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index 3d26df88a..4843d949e 100644 --- a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -25,6 +25,7 @@ use crate::internal::tests::integration::utils::api::{wait_for_tasks, WaitResult use crate::internal::tests::integration::utils::worker::{ start_worker, WorkerContext, WorkerHandle, }; +use crate::resources::ResourceRqAllocator; use crate::task::SerializedTaskContext; use crate::tests::integration::utils::task::{ResourceRequestConfig, ResourceRequestConfigBuilder}; use crate::worker::{WorkerConfiguration, WorkerOverview}; @@ -117,6 +118,18 @@ impl ServerHandle { .await .unwrap() } + + #[cfg(test)] + pub fn register_default_request(&self) -> ResourceRqId { + self.server_ref + .get_or_create_raw_rq_id(crate::resources::ResourceRequestVariants::default()) + } + + #[cfg(test)] + pub fn register_request(&self, rbuilder: ResourceRequestConfigBuilder) -> ResourceRqId { + let rqv = rbuilder.into_rqv(); + self.server_ref.get_or_create_resource_rq_id(&rqv) + } } #[derive(Clone)] diff --git a/crates/tako/src/internal/tests/test_worker.rs b/crates/tako/src/internal/tests/test_worker.rs index a93c2c944..0d3531999 100644 --- a/crates/tako/src/internal/tests/test_worker.rs +++ b/crates/tako/src/internal/tests/test_worker.rs @@ -6,11 +6,11 @@ use crate::internal::messages::worker::{ WorkerResourceCounts, }; use crate::internal::server::workerload::WorkerResources; -use crate::internal::tests::utils::resources::{ResourceRequestBuilder, ra_builder}; +use crate::internal::tests::utils::resources::{ra_builder, ResourceRequestBuilder}; use crate::internal::worker::comm::WorkerComm; use crate::internal::worker::configuration::{ - DEFAULT_MAX_DOWNLOAD_TRIES, DEFAULT_MAX_PARALLEL_DOWNLOADS, - DEFAULT_WAIT_BETWEEN_DOWNLOAD_TRIES, 
OverviewConfiguration, + OverviewConfiguration, DEFAULT_MAX_DOWNLOAD_TRIES, DEFAULT_MAX_PARALLEL_DOWNLOADS, + DEFAULT_WAIT_BETWEEN_DOWNLOAD_TRIES, }; use crate::internal::worker::rpc::process_worker_message; use crate::internal::worker::state::WorkerStateRef; @@ -108,7 +108,7 @@ fn create_dummy_compute_msg(task_id: TaskId, resource_rq_id: ResourceRqId) -> Co fn test_worker_start_task() { let mut rmap = GlobalResourceMapping::default(); let rqv = ResourceRequestBuilder::default().cpus(3).finish_v(); - let (rq_id, _) = rmap.get_or_create_resource_rq_id(&rqv); + let (rq_id, _) = rmap.get_or_create_rq_id(rqv.clone()); let config = create_test_worker_config(); let state_ref = create_test_worker_state(config, rmap.get_resource_rq_map().clone()); @@ -232,12 +232,10 @@ fn test_worker_other_workers() { assert_eq!(state.ready_task_queue.worker_resources()[&wr2], t); process_worker_message(&mut state, ToWorkerMessage::LostWorker(30.into())); - assert!( - state - .ready_task_queue - .worker_resources() - .get(&wr1) - .is_none() - ); + assert!(state + .ready_task_queue + .worker_resources() + .get(&wr1) + .is_none()); assert_eq!(state.ready_task_queue.worker_resources()[&wr2], t); } diff --git a/crates/tako/src/internal/tests/utils/task.rs b/crates/tako/src/internal/tests/utils/task.rs index 53a214a88..02c46797f 100644 --- a/crates/tako/src/internal/tests/utils/task.rs +++ b/crates/tako/src/internal/tests/utils/task.rs @@ -99,7 +99,7 @@ impl TaskBuilder { rq.validate().unwrap(); } let resources = ResourceRequestVariants::new(resources); - let (rq_id, _) = resource_map.get_or_create_resource_rq_id(&resources); + let (rq_id, _) = resource_map.get_or_create_rq_id(resources); Task::new( self.id, rq_id, From 4627602e2553e2d9c15f2678cd438957799b56b3 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Thu, 18 Dec 2025 13:14:55 +0100 Subject: [PATCH 05/17] Explain command migrated to ResourceRqId --- crates/hyperqueue/src/client/commands/job.rs | 4 +- 
.../src/client/commands/submit/command.rs | 48 +++++------ .../src/client/commands/submit/jobfile.rs | 4 +- .../hyperqueue/src/client/commands/worker.rs | 12 +-- crates/hyperqueue/src/client/output/cli.rs | 82 +++++++++---------- crates/hyperqueue/src/client/output/json.rs | 6 +- .../hyperqueue/src/client/output/outputs.rs | 2 +- crates/hyperqueue/src/client/output/quiet.rs | 4 +- crates/hyperqueue/src/client/task.rs | 2 +- crates/hyperqueue/src/server/client/mod.rs | 28 +++---- crates/hyperqueue/src/server/client/submit.rs | 4 +- crates/hyperqueue/src/server/job.rs | 38 +++++---- crates/hyperqueue/src/server/restore.rs | 8 +- crates/hyperqueue/src/server/state.rs | 10 +-- crates/hyperqueue/src/transfer/messages.rs | 6 +- crates/tako/src/connection.rs | 2 +- crates/tako/src/control.rs | 4 +- .../tako/src/internal/common/resources/map.rs | 2 +- .../tako/src/internal/common/resources/mod.rs | 4 +- .../tako/src/internal/scheduler/multinode.rs | 2 +- crates/tako/src/internal/scheduler/state.rs | 2 +- crates/tako/src/internal/server/explain.rs | 55 +++++++++---- crates/tako/src/internal/server/worker.rs | 2 +- .../internal/tests/integration/test_basic.rs | 4 +- .../tests/integration/utils/server.rs | 6 +- crates/tako/src/internal/tests/test_worker.rs | 18 ++-- crates/tako/src/internal/worker/state.rs | 13 +-- crates/tako/src/lib.rs | 16 ++-- 28 files changed, 207 insertions(+), 181 deletions(-) diff --git a/crates/hyperqueue/src/client/commands/job.rs b/crates/hyperqueue/src/client/commands/job.rs index ea3865c06..5073af7e8 100644 --- a/crates/hyperqueue/src/client/commands/job.rs +++ b/crates/hyperqueue/src/client/commands/job.rs @@ -4,8 +4,8 @@ use crate::client::globalsettings::GlobalSettings; use crate::client::job::{get_remote_lists, get_worker_map}; use crate::client::output::outputs::OutputStream; use crate::client::output::resolve_task_paths; -use crate::client::status::{job_status, Status}; -use crate::common::cli::{parse_last_all_range, parse_last_range, 
TaskSelectorArg}; +use crate::client::status::{Status, job_status}; +use crate::common::cli::{TaskSelectorArg, parse_last_all_range, parse_last_range}; use crate::common::utils::str::pluralize; use crate::rpc_call; use crate::transfer::connection::ClientSession; diff --git a/crates/hyperqueue/src/client/commands/submit/command.rs b/crates/hyperqueue/src/client/commands/submit/command.rs index 91ca23457..77d9c0ebc 100644 --- a/crates/hyperqueue/src/client/commands/submit/command.rs +++ b/crates/hyperqueue/src/client/commands/submit/command.rs @@ -611,7 +611,7 @@ pub async fn open_job( let response = rpc_call!(session.connection(), FromClientMessage::OpenJob(JobDescription { name, max_fails }), ToClientMessage::OpenJobResponse(r) => r) - .await?; + .await?; gsettings.printer().print_job_open(response.job_id); Ok(()) @@ -661,26 +661,26 @@ pub async fn submit_computation( stdin: _, directives: _, conf: - SubmitJobTaskConfOpts { - job_conf: SubmitJobConfOpts { name, max_fails }, - nodes: _, - cpus: _, - resource: _, - time_request: _, - pin, - task_dir, - cwd, - stdout, - stderr, - env, - each_line: _, - from_json: _, - array: _, - priority, - time_limit, - stream, - crash_limit, - }, + SubmitJobTaskConfOpts { + job_conf: SubmitJobConfOpts { name, max_fails }, + nodes: _, + cpus: _, + resource: _, + time_request: _, + pin, + task_dir, + cwd, + stdout, + stderr, + env, + each_line: _, + from_json: _, + array: _, + priority, + time_limit, + stream, + crash_limit, + }, on_notify, } = opts; @@ -759,7 +759,7 @@ pub async fn submit_computation( progress, on_notify.as_deref(), ) - .await + .await } /*pub(crate) async fn get_resource_rq_ids( @@ -1049,14 +1049,14 @@ impl TypedValueParser for CrashLimitParser { .map_err(|e| clap::Error::raw(ErrorKind::InvalidValue, format!("{e}\n"))) } - fn possible_values(&self) -> Option + '_>> { + fn possible_values(&self) -> Option + '_>> { Some(Box::new( [ PossibleValue::new("never-restart"), PossibleValue::new("unlimited"), 
PossibleValue::new(""), ] - .into_iter(), + .into_iter(), )) } } diff --git a/crates/hyperqueue/src/client/commands/submit/jobfile.rs b/crates/hyperqueue/src/client/commands/submit/jobfile.rs index 13553fca6..f83b9c76f 100644 --- a/crates/hyperqueue/src/client/commands/submit/jobfile.rs +++ b/crates/hyperqueue/src/client/commands/submit/jobfile.rs @@ -1,5 +1,5 @@ use crate::client::commands::submit::command::{ - send_submit_request, DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, + DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, send_submit_request, }; use crate::client::commands::submit::defs::{ ArrayDef, JobDef, StdioDefFull, StdioDefInput, TaskDef, @@ -16,9 +16,9 @@ use crate::transfer::messages::{ use clap::Parser; use smallvec::smallvec; use std::path::PathBuf; +use tako::Map; use tako::gateway::{EntryType, ResourceRequest, ResourceRequestVariants, TaskDataFlags}; use tako::program::{FileOnCloseBehavior, ProgramDefinition, StdioDef}; -use tako::Map; use tako::{JobId, JobTaskCount, JobTaskId}; #[derive(Parser)] diff --git a/crates/hyperqueue/src/client/commands/worker.rs b/crates/hyperqueue/src/client/commands/worker.rs index 75cd49b49..9f95414a0 100644 --- a/crates/hyperqueue/src/client/commands/worker.rs +++ b/crates/hyperqueue/src/client/commands/worker.rs @@ -1,5 +1,5 @@ use crate::client::commands::duration_doc; -use anyhow::{bail, Context}; +use anyhow::{Context, bail}; use chrono::Utc; use clap::builder::{PossibleValue, TypedValueParser}; use std::collections::HashSet; @@ -8,11 +8,11 @@ use std::fmt::{Display, Formatter}; use std::path::{Path, PathBuf}; use std::process::Stdio; use std::time::Duration; +use tako::Map; use tako::resources::{ - ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind, CPU_RESOURCE_NAME, + CPU_RESOURCE_NAME, ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind, }; use tako::worker::{ServerLostPolicy, WorkerConfiguration}; -use tako::Map; use clap::error::ErrorKind; use clap::{Arg, Error, Parser, ValueEnum}; @@ 
-25,7 +25,7 @@ use tokio::task::JoinSet; use tokio::time::sleep; use crate::client::globalsettings::GlobalSettings; -use crate::client::utils::{passthrough_parser, PassThroughArgument}; +use crate::client::utils::{PassThroughArgument, passthrough_parser}; use crate::common::cli::DeploySshOpts; use crate::common::manager::info::{ManagerInfo, WORKER_EXTRA_MANAGER_KEY}; use crate::common::utils::fs::get_hq_binary_path; @@ -41,12 +41,12 @@ use crate::worker::bootstrap::{ finalize_configuration, initialize_worker, try_get_pbs_info, try_get_slurm_info, }; use crate::worker::hwdetect::{ - detect_additional_resources, detect_cpus, prune_hyper_threading, GPU_ENVIRONMENTS, + GPU_ENVIRONMENTS, detect_additional_resources, detect_cpus, prune_hyper_threading, }; use crate::worker::parser::{ parse_cpu_definition, parse_resource_coupling, parse_resource_definition, }; -use crate::{rpc_call, DEFAULT_WORKER_GROUP_NAME}; +use crate::{DEFAULT_WORKER_GROUP_NAME, rpc_call}; use tako::WorkerId; #[derive(clap::ValueEnum, Clone)] diff --git a/crates/hyperqueue/src/client/output/cli.rs b/crates/hyperqueue/src/client/output/cli.rs index 5c6affc5e..c50fea607 100644 --- a/crates/hyperqueue/src/client/output/cli.rs +++ b/crates/hyperqueue/src/client/output/cli.rs @@ -14,8 +14,8 @@ use crate::server::job::{JobTaskCounters, JobTaskInfo, JobTaskState}; use crate::stream::reader::outputlog::Summary; use crate::transfer::messages::{ AutoAllocListQueuesResponse, JobDetail, JobInfo, JobTaskDescription, PinMode, QueueData, - QueueState, ServerInfo, TaskDescription, TaskKind, TaskKindProgram, - WaitForJobsResponse, WorkerExitInfo, WorkerInfo, + QueueState, ServerInfo, TaskDescription, TaskKind, TaskKindProgram, WaitForJobsResponse, + WorkerExitInfo, WorkerInfo, }; use tako::{JobId, JobTaskCount, JobTaskId, TaskId, WorkerId}; @@ -113,10 +113,10 @@ impl CliOutput { match kind { TaskKind::ExternalProgram(TaskKindProgram { - program, - pin_mode, - task_dir: _task_dir, - }) => { + program, + pin_mode, + 
task_dir: _task_dir, + }) => { let resources = format_resource_variants(resource_rq); rows.push(vec![ "Resources".cell().bold(true), @@ -125,7 +125,7 @@ impl CliOutput { } else { resources } - .cell(), + .cell(), ]); rows.push(vec!["Priority".cell().bold(true), priority.cell()]); @@ -345,7 +345,7 @@ impl Output for CliOutput { configuration.max_download_tries, format_duration(configuration.wait_between_download_tries) ) - .cell(), + .cell(), ], vec![ "Manager".cell().bold(true), @@ -484,7 +484,7 @@ impl Output for CliOutput { } else { t.id.cell() } - .justify(Justify::Right), + .justify(Justify::Right), truncate_middle(&t.name, 50).cell(), status, t.n_tasks.cell(), @@ -596,10 +596,10 @@ impl Output for CliOutput { if submit_descs.len() == 1 && let JobTaskDescription::Array { - task_desc, - resource_rq, - .. - } = &submit_descs[0].description().task_desc + task_desc, + resource_rq, + .. + } = &submit_descs[0].description().task_desc { self.print_job_shared_task_description(&mut rows, task_desc, resource_rq); } @@ -800,10 +800,10 @@ impl Output for CliOutput { match &task_desc.kind { TaskKind::ExternalProgram(TaskKindProgram { - program, - pin_mode, - task_dir, - }) => { + program, + pin_mode, + task_dir, + }) => { let mut env_vars: Vec<(_, _)> = program.env.iter().filter(|(k, _)| !is_hq_env(k)).collect(); env_vars.sort_by_key(|item| item.0); @@ -939,7 +939,7 @@ impl Output for CliOutput { human_size(summary.stdout_size), human_size(summary.stderr_size) ) - .cell(), + .cell(), ], vec![ "Superseded streams".cell().bold(true), @@ -952,7 +952,7 @@ impl Output for CliOutput { human_size(summary.superseded_stdout_size), human_size(summary.superseded_stderr_size) ) - .cell(), + .cell(), ], ]; self.print_vertical_table(rows); @@ -979,7 +979,7 @@ impl Output for CliOutput { QueueState::Active => "ACTIVE", QueueState::Paused => "PAUSED", } - .cell(), + .cell(), params.backlog.cell(), params.max_workers_per_alloc.cell(), params.max_worker_count.unwrap_or_default().cell(), @@ 
-1126,7 +1126,7 @@ impl Output for CliOutput { enabled_variants.to_string().color(colored::Color::Green), all_varints ) - .cell() + .cell() }; let mut header = vec![w.worker_id.cell(), can_run]; for (i, variant) in w.variants.iter().enumerate() { @@ -1310,25 +1310,25 @@ pub fn worker_status(worker_info: &WorkerInfo) -> CellStruct { match worker_info.ended.as_ref() { None => "RUNNING".cell().foreground_color(Some(Color::Green)), Some(WorkerExitInfo { - reason: LostWorkerReason::ConnectionLost, - .. - }) => "CONNECTION LOST".cell().foreground_color(Some(Color::Red)), + reason: LostWorkerReason::ConnectionLost, + .. + }) => "CONNECTION LOST".cell().foreground_color(Some(Color::Red)), Some(WorkerExitInfo { - reason: LostWorkerReason::HeartbeatLost, - .. - }) => "HEARTBEAT LOST".cell().foreground_color(Some(Color::Red)), + reason: LostWorkerReason::HeartbeatLost, + .. + }) => "HEARTBEAT LOST".cell().foreground_color(Some(Color::Red)), Some(WorkerExitInfo { - reason: LostWorkerReason::IdleTimeout, - .. - }) => "IDLE TIMEOUT".cell().foreground_color(Some(Color::Cyan)), + reason: LostWorkerReason::IdleTimeout, + .. + }) => "IDLE TIMEOUT".cell().foreground_color(Some(Color::Cyan)), Some(WorkerExitInfo { - reason: LostWorkerReason::Stopped, - .. - }) => "STOPPED".cell().foreground_color(Some(Color::Magenta)), + reason: LostWorkerReason::Stopped, + .. + }) => "STOPPED".cell().foreground_color(Some(Color::Magenta)), Some(WorkerExitInfo { - reason: LostWorkerReason::TimeLimitReached, - .. - }) => "TIME LIMIT REACHED" + reason: LostWorkerReason::TimeLimitReached, + .. 
+ }) => "TIME LIMIT REACHED" .cell() .foreground_color(Some(Color::Cyan)), } @@ -1360,7 +1360,7 @@ pub fn job_progress_bar(counters: JobTaskCounters, n_tasks: JobTaskCount, width: "{}", ".".repeat(width.saturating_sub(total_char_count)) ) - .unwrap(); + .unwrap(); buffer.push(']'); buffer @@ -1452,7 +1452,7 @@ fn format_resource_request(rq: &ResourceRequest) -> String { grq.resource, grq.policy ) - .unwrap(); + .unwrap(); first = false; } result @@ -1473,7 +1473,7 @@ fn format_resource_variants(rqv: &ResourceRequestVariants) -> String { format_resource_request(v), if is_last { "" } else { "\n\n" } ) - .unwrap(); + .unwrap(); } result } @@ -1618,7 +1618,7 @@ fn resources_full_describe(resources: &ResourceDescriptor) -> String { &descriptor.name, format_descriptor_kind(&descriptor.kind), ) - .unwrap(); + .unwrap(); first = false; } result @@ -1709,7 +1709,7 @@ fn resources_summary(resources: &ResourceDescriptor, multiline: bool) -> String "" } ) - .unwrap(); + .unwrap(); first = false; } result diff --git a/crates/hyperqueue/src/client/output/json.rs b/crates/hyperqueue/src/client/output/json.rs index 841db1de8..816f6094e 100644 --- a/crates/hyperqueue/src/client/output/json.rs +++ b/crates/hyperqueue/src/client/output/json.rs @@ -7,7 +7,7 @@ use anyhow::Error; use chrono::{DateTime, Utc}; use serde::{Serialize, Serializer}; use serde_json; -use serde_json::{json, Value}; +use serde_json::{Value, json}; use tako::gateway::{CrashLimit, ResourceRequest, ResourceRequestVariants}; use tako::program::{ProgramDefinition, StdioDef}; @@ -16,9 +16,9 @@ use tako::worker::WorkerConfiguration; use tako::{Map, TaskId}; use crate::client::job::WorkerMap; -use crate::client::output::common::{group_jobs_by_status, resolve_task_paths, TaskToPathsMap}; -use crate::client::output::outputs::{Output, OutputStream}; use crate::client::output::Verbosity; +use crate::client::output::common::{TaskToPathsMap, group_jobs_by_status, resolve_task_paths}; +use 
crate::client::output::outputs::{Output, OutputStream}; use crate::common::arraydef::IntArray; use crate::common::manager::info::{GetManagerInfo, ManagerType}; use crate::server::autoalloc::{Allocation, AllocationState, QueueId}; diff --git a/crates/hyperqueue/src/client/output/outputs.rs b/crates/hyperqueue/src/client/output/outputs.rs index 9b4b5ab0c..9debc813b 100644 --- a/crates/hyperqueue/src/client/output/outputs.rs +++ b/crates/hyperqueue/src/client/output/outputs.rs @@ -7,8 +7,8 @@ use crate::server::autoalloc::Allocation; use crate::stream::reader::outputlog::Summary; use std::path::Path; -use crate::client::output::common::TaskToPathsMap; use crate::client::output::Verbosity; +use crate::client::output::common::TaskToPathsMap; use crate::common::arraydef::IntArray; use crate::server::job::JobTaskInfo; use core::time::Duration; diff --git a/crates/hyperqueue/src/client/output/quiet.rs b/crates/hyperqueue/src/client/output/quiet.rs index 77761796d..0317e5717 100644 --- a/crates/hyperqueue/src/client/output/quiet.rs +++ b/crates/hyperqueue/src/client/output/quiet.rs @@ -9,10 +9,10 @@ use tako::resources::ResourceDescriptor; use crate::client::job::WorkerMap; use crate::client::output::cli::print_job_output; use crate::client::output::common::{ - group_jobs_by_status, TaskToPathsMap, Verbosity, JOB_SUMMARY_STATUS_ORDER, + JOB_SUMMARY_STATUS_ORDER, TaskToPathsMap, Verbosity, group_jobs_by_status, }; use crate::client::output::outputs::{Output, OutputStream}; -use crate::client::status::{job_status, Status}; +use crate::client::status::{Status, job_status}; use crate::common::arraydef::IntArray; use crate::server::autoalloc::Allocation; use crate::server::job::JobTaskInfo; diff --git a/crates/hyperqueue/src/client/task.rs b/crates/hyperqueue/src/client/task.rs index d31a694fd..f3fa2d7c9 100644 --- a/crates/hyperqueue/src/client/task.rs +++ b/crates/hyperqueue/src/client/task.rs @@ -3,7 +3,7 @@ use crate::client::globalsettings::GlobalSettings; use 
crate::client::job::{get_remote_lists, get_worker_map}; use crate::client::output::{Verbosity, VerbosityFlag}; use crate::common::arraydef::IntArray; -use crate::common::cli::{parse_last_range, parse_last_single_id, TaskSelectorArg}; +use crate::common::cli::{TaskSelectorArg, parse_last_range, parse_last_single_id}; use crate::common::error::HqError; use crate::rpc_call; use crate::transfer::connection::ClientSession; diff --git a/crates/hyperqueue/src/server/client/mod.rs b/crates/hyperqueue/src/server/client/mod.rs index c8135384d..521b0d7f9 100644 --- a/crates/hyperqueue/src/server/client/mod.rs +++ b/crates/hyperqueue/src/server/client/mod.rs @@ -77,7 +77,7 @@ async fn handle_client( Ok(()) } -async fn stream_history_events + Unpin + 'static>( +async fn stream_history_events + Unpin + 'static>( tx: &mut Tx, mut history: mpsc::UnboundedReceiver, ) { @@ -98,8 +98,8 @@ async fn stream_history_events + Un } async fn stream_events< - Tx: Sink + Unpin + 'static, - Rx: Stream> + Unpin, + Tx: Sink + Unpin + 'static, + Rx: Stream> + Unpin, >( tx: &mut Tx, rx: &mut Rx, @@ -126,8 +126,8 @@ async fn stream_events< } async fn start_streaming< - Tx: Sink + Unpin + 'static, - Rx: Stream> + Unpin, + Tx: Sink + Unpin + 'static, + Rx: Stream> + Unpin, >( mut tx: Tx, mut rx: Rx, @@ -190,8 +190,8 @@ async fn start_streaming< } pub async fn client_rpc_loop< - Tx: Sink + Unpin + 'static, - Rx: Stream> + Unpin, + Tx: Sink + Unpin + 'static, + Rx: Stream> + Unpin, >( mut tx: Tx, mut rx: Rx, @@ -213,8 +213,8 @@ pub async fn client_rpc_loop< }; if let Some(mut stream_opts) = stream_opts && let ToClientMessage::SubmitResponse(SubmitResponse::Ok { - job, .. - }) = &response + job, .. 
+ }) = &response { if !stream_opts.filter.is_filtering_jobs() { let mut s = Set::new(); @@ -229,7 +229,7 @@ pub async fn client_rpc_loop< stream_opts, Some(response), ) - .await; + .await; break; } response @@ -239,7 +239,7 @@ pub async fn client_rpc_loop< compute_job_info(&state_ref, &msg.selector, msg.include_running_tasks); if let Some(mut stream_opts) = stream_opts && let ToClientMessage::JobInfoResponse(JobInfoResponse { jobs }) = - &response + &response { if !stream_opts.filter.is_filtering_jobs() { stream_opts @@ -254,7 +254,7 @@ pub async fn client_rpc_loop< stream_opts, Some(response), ) - .await; + .await; break; } response @@ -263,9 +263,7 @@ pub async fn client_rpc_loop< end_flag.notify_one(); break; } - FromClientMessage::GetList { workers } => { - handle_get_list(&state_ref, workers) - } + FromClientMessage::GetList { workers } => handle_get_list(&state_ref, workers), FromClientMessage::WorkerInfo(msg) => { handle_worker_info(&state_ref, senders, msg.worker_id, msg.runtime_info) } diff --git a/crates/hyperqueue/src/server/client/submit.rs b/crates/hyperqueue/src/server/client/submit.rs index f0c6372f9..bdbfe447a 100644 --- a/crates/hyperqueue/src/server/client/submit.rs +++ b/crates/hyperqueue/src/server/client/submit.rs @@ -15,9 +15,9 @@ use crate::common::format::human_duration; use crate::common::placeholders::{ fill_placeholders_after_submit, fill_placeholders_log, normalize_path, }; +use crate::server::Senders; use crate::server::job::{Job, JobTaskState, SubmittedJobDescription}; use crate::server::state::{State, StateRef}; -use crate::server::Senders; use crate::transfer::messages::{ JobDescription, JobSubmitDescription, JobTaskDescription, OpenJobResponse, SingleIdSelector, SubmitRequest, SubmitResponse, TaskBuildDescription, TaskDescription, TaskExplainRequest, @@ -549,7 +549,7 @@ mod tests { }; use tako::internal::tests::utils::sorted_vec; use tako::program::ProgramDefinition; - use tako::resources::{AllocationRequest, ResourceAmount, 
CPU_RESOURCE_NAME}; + use tako::resources::{AllocationRequest, CPU_RESOURCE_NAME, ResourceAmount}; use tako::{Priority, TaskId}; #[test] diff --git a/crates/hyperqueue/src/server/job.rs b/crates/hyperqueue/src/server/job.rs index 938837847..d8582e6ea 100644 --- a/crates/hyperqueue/src/server/job.rs +++ b/crates/hyperqueue/src/server/job.rs @@ -440,15 +440,16 @@ impl Job { self.tasks.reserve(ids.id_count() as usize); ids.iter().for_each(|task_id| { let task_id = JobTaskId::new(task_id); - assert!(self - .tasks - .insert( - task_id, - JobTaskInfo { - state: JobTaskState::Waiting, - }, - ) - .is_none()); + assert!( + self.tasks + .insert( + task_id, + JobTaskInfo { + state: JobTaskState::Waiting, + }, + ) + .is_none() + ); }) } JobTaskDescription::Graph { @@ -457,15 +458,16 @@ impl Job { } => { self.tasks.reserve(tasks.len()); tasks.iter().for_each(|task| { - assert!(self - .tasks - .insert( - task.id, - JobTaskInfo { - state: JobTaskState::Waiting, - }, - ) - .is_none()); + assert!( + self.tasks + .insert( + task.id, + JobTaskInfo { + state: JobTaskState::Waiting, + }, + ) + .is_none() + ); }) } }; diff --git a/crates/hyperqueue/src/server/restore.rs b/crates/hyperqueue/src/server/restore.rs index 8c99d0610..b9189f8fe 100644 --- a/crates/hyperqueue/src/server/restore.rs +++ b/crates/hyperqueue/src/server/restore.rs @@ -123,10 +123,10 @@ impl RestorerJob { for task in self.tasks.values_mut() { match &task.state { JobTaskState::Running { started_data } - if started_data.worker_ids.contains(&worker_id) => - { - task.crash_counter += 1; - } + if started_data.worker_ids.contains(&worker_id) => + { + task.crash_counter += 1; + } _ => {} } } diff --git a/crates/hyperqueue/src/server/state.rs b/crates/hyperqueue/src/server/state.rs index 953a80e98..fa32fd7ae 100644 --- a/crates/hyperqueue/src/server/state.rs +++ b/crates/hyperqueue/src/server/state.rs @@ -1,7 +1,7 @@ -use std::cmp::min; -use std::collections::HashMap; use chrono::Utc; use smallvec::SmallVec; +use 
std::cmp::min; +use std::collections::HashMap; use tako::{InstanceId, ResourceVariantId, define_wrapped_type}; use tako::{ItemId, TaskId}; @@ -11,7 +11,7 @@ use crate::server::autoalloc::LostWorkerDetails; use crate::server::job::Job; use crate::server::restore::StateRestorer; use crate::server::worker::Worker; -use crate::transfer::messages::{ServerInfo}; +use crate::transfer::messages::ServerInfo; use tako::gateway::{LostWorkerReason, ResourceRequestVariants}; use tako::internal::messages::common::TaskFailInfo; use tako::resources::{GlobalResourceMapping, ResourceRqId}; @@ -37,7 +37,7 @@ impl State { self.jobs.get_mut(&job_id) } - pub fn jobs(&self) -> impl Iterator { + pub fn jobs(&self) -> impl Iterator { self.jobs.values() } @@ -91,7 +91,7 @@ impl State { self.job_id_counter = id.as_num(); } - pub fn last_n_ids(&self, n: u32) -> impl Iterator + use < > { + pub fn last_n_ids(&self, n: u32) -> impl Iterator + use<> { let n = min(n, self.job_id_counter - 1); ((self.job_id_counter - n)..self.job_id_counter).map(|id| id.into()) } diff --git a/crates/hyperqueue/src/transfer/messages.rs b/crates/hyperqueue/src/transfer/messages.rs index 1bdd09ce8..cc84f1853 100644 --- a/crates/hyperqueue/src/transfer/messages.rs +++ b/crates/hyperqueue/src/transfer/messages.rs @@ -3,14 +3,14 @@ use serde::Deserialize; use serde::Serialize; use std::borrow::Cow; +use crate::JobDataObjectId; use crate::client::status::Status; use crate::common::arraydef::IntArray; use crate::common::manager::info::ManagerType; use crate::server::autoalloc::{Allocation, AllocationId, QueueId, QueueParameters}; -use crate::server::event::streamer::EventFilter; use crate::server::event::Event; +use crate::server::event::streamer::EventFilter; use crate::server::job::{JobTaskCounters, JobTaskInfo, SubmittedJobDescription}; -use crate::JobDataObjectId; use std::path::PathBuf; use std::time::Duration; use tako::gateway::{ @@ -21,7 +21,7 @@ use tako::program::ProgramDefinition; use 
tako::resources::{ResourceDescriptor, ResourceRqId}; use tako::server::TaskExplanation; use tako::worker::WorkerConfiguration; -use tako::{define_id_type, JobId, JobTaskCount, JobTaskId, Map, TaskId, WorkerId}; +use tako::{JobId, JobTaskCount, JobTaskId, Map, TaskId, WorkerId, define_id_type}; // Messages client -> server #[allow(clippy::large_enum_variant)] diff --git a/crates/tako/src/connection.rs b/crates/tako/src/connection.rs index c726c2d7f..f2b2baf49 100644 --- a/crates/tako/src/connection.rs +++ b/crates/tako/src/connection.rs @@ -6,8 +6,8 @@ use futures::stream::{SplitSink, SplitStream}; use futures::{Sink, SinkExt, Stream, StreamExt}; use orion::aead::streaming::{StreamOpener, StreamSealer}; use orion::kdf::SecretKey; -use serde::de::DeserializeOwned; use serde::Serialize; +use serde::de::DeserializeOwned; use std::marker::PhantomData; use std::sync::Arc; use tokio::net::TcpStream; diff --git a/crates/tako/src/control.rs b/crates/tako/src/control.rs index 18dceedb3..df298e6d6 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -22,7 +22,7 @@ use crate::internal::server::client::handle_new_tasks; use crate::internal::server::comm::{Comm, CommSenderRef}; use crate::internal::server::core::{CoreRef, CustomConnectionHandler}; use crate::internal::server::explain::{ - task_explain_for_worker, task_explain_init, TaskExplanation, + TaskExplanation, task_explain_for_worker, task_explain_init, }; use crate::internal::server::reactor::{get_or_create_resource_rq_id, on_cancel_tasks}; use crate::internal::server::worker::DEFAULT_WORKER_OVERVIEW_INTERVAL; @@ -147,6 +147,7 @@ impl ServerRef { return Err(DsError::from("Task not found")); }; let resource_map = core.create_resource_map(); + let resource_rq_map = core.get_resource_rq_map(); let now = Instant::now(); let mut explanation = task_explain_init(task); explanation.workers = core @@ -158,6 +159,7 @@ impl ServerRef { .unwrap(); Ok(task_explain_for_worker( &resource_map, + resource_rq_map, 
task, worker, group, diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index 32af4eaed..f63e7e356 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -1,6 +1,6 @@ use crate::gateway::ResourceRequestVariants as ClientResourceRequestVariants; -use crate::internal::common::resources::{ResourceId, ResourceRqId}; use crate::internal::common::Map; +use crate::internal::common::resources::{ResourceId, ResourceRqId}; use crate::internal::server::core::Core; use crate::resources::{ResourceAllocRequest, ResourceRequest, ResourceRequestVariants}; use serde::{Deserialize, Serialize}; diff --git a/crates/tako/src/internal/common/resources/mod.rs b/crates/tako/src/internal/common/resources/mod.rs index 23b1efd19..e6a885cd7 100644 --- a/crates/tako/src/internal/common/resources/mod.rs +++ b/crates/tako/src/internal/common/resources/mod.rs @@ -12,8 +12,8 @@ pub use descriptor::{ ResourceDescriptorItem, ResourceDescriptorKind, }; pub use map::{ - GlobalResourceMapping, ResourceRqAllocator, AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, - CPU_RESOURCE_NAME, MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, + AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, GlobalResourceMapping, + MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, ResourceRqAllocator, }; pub use request::{ AllocationRequest, ResourceAllocRequest, ResourceRequest, ResourceRequestEntries, diff --git a/crates/tako/src/internal/scheduler/multinode.rs b/crates/tako/src/internal/scheduler/multinode.rs index 07039c504..10062b57f 100644 --- a/crates/tako/src/internal/scheduler/multinode.rs +++ b/crates/tako/src/internal/scheduler/multinode.rs @@ -1,5 +1,5 @@ -use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; use crate::internal::common::resources::ResourceRqId; +use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; use 
crate::internal::server::task::Task; use crate::internal::server::taskmap::TaskMap; use crate::internal::server::worker::Worker; diff --git a/crates/tako/src/internal/scheduler/state.rs b/crates/tako/src/internal/scheduler/state.rs index 9611f7625..f1c65b7f0 100644 --- a/crates/tako/src/internal/scheduler/state.rs +++ b/crates/tako/src/internal/scheduler/state.rs @@ -5,8 +5,8 @@ use std::time::{Duration, Instant}; use tokio::sync::Notify; use tokio::time::sleep; -use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::common::Map; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use crate::internal::scheduler::multinode::MultiNodeAllocator; use crate::internal::server::comm::{Comm, CommSender, CommSenderRef}; diff --git a/crates/tako/src/internal/server/explain.rs b/crates/tako/src/internal/server/explain.rs index bb3e9f286..3fc8c6b85 100644 --- a/crates/tako/src/internal/server/explain.rs +++ b/crates/tako/src/internal/server/explain.rs @@ -1,4 +1,5 @@ use crate::WorkerId; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::server::task::{Task, TaskRuntimeState}; use crate::internal::server::worker::Worker; use crate::internal::server::workergroup::WorkerGroup; @@ -96,17 +97,16 @@ pub fn task_explain_init(task: &Task) -> TaskExplanation { pub fn task_explain_for_worker( resource_map: &ResourceIdMap, + resource_rq_map: &ResourceRqMap, task: &Task, worker: &Worker, worker_group: &WorkerGroup, now: std::time::Instant, ) -> TaskExplanationForWorker { - todo!() - /*TaskExplanationForWorker { + let rqv = resource_rq_map.get(task.resource_rq_id); + TaskExplanationForWorker { worker_id: worker.id, - variants: task - .configuration - .resources + variants: rqv .requests() .iter() .map(|rq| { @@ -139,7 +139,7 @@ pub fn task_explain_for_worker( result }) .collect(), - }*/ + } } #[cfg(test)] @@ -178,14 +178,21 @@ mod tests { wcfg.time_limit 
= Some(Duration::from_secs(40_000)); let worker2 = Worker::new(2.into(), wcfg, &resource_map, now); - let explain = |task, worker, now| { + let explain = |task, rqs: &GlobalResourceMapping, worker, now| { let group = WorkerGroup::new(Set::new()); - task_explain_for_worker(&resource_map, task, worker, &group, now) + task_explain_for_worker( + &resource_map, + rqs.get_resource_rq_map(), + task, + worker, + &group, + now, + ) }; let task_id = 1; let task = TaskBuilder::new(task_id).build(&mut rqs); - let r = explain(&task, &worker1, now); + let r = explain(&task, &rqs, &worker1, now); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 1); assert_eq!(r.n_enabled_variants(), 1); @@ -193,23 +200,23 @@ mod tests { let task = TaskBuilder::new(task_id) .time_request(20_000) .build(&mut rqs); - let r = explain(&task, &worker1, now); + let r = explain(&task, &rqs, &worker1, now); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 2); assert_eq!(r.n_enabled_variants(), 1); - let r = explain(&task, &worker2, now); + let r = explain(&task, &rqs, &worker2, now); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 2); assert_eq!(r.n_enabled_variants(), 1); let now2 = now + Duration::from_secs(21_000); - let r = explain(&task, &worker1, now2); + let r = explain(&task, &rqs, &worker1, now2); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 2); assert_eq!(r.n_enabled_variants(), 1); - let r = explain(&task, &worker2, now2); + let r = explain(&task, &rqs, &worker2, now2); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 2); assert!(matches!( @@ -226,7 +233,7 @@ mod tests { .cpus_compact(30) .add_resource(1, 3) .build(&mut rqs); - let r = explain(&task, &worker2, now); + let r = explain(&task, &rqs, &worker2, now); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 3); assert!(matches!( @@ -245,7 +252,7 @@ mod tests { .cpus_compact(2) .add_resource(1, 32) .build(&mut rqs); - let r = explain(&task, 
&worker2, now2); + let r = explain(&task, &rqs, &worker2, now2); assert_eq!(r.variants.len(), 2); assert_eq!(r.variants[0].len(), 3); assert_eq!(r.variants[1].len(), 2); @@ -291,7 +298,14 @@ mod tests { wset.insert(WorkerId::new(3)); wset.insert(WorkerId::new(132)); let group = WorkerGroup::new(wset); - let r = task_explain_for_worker(&resource_map, &task, &worker, &group, now); + let r = task_explain_for_worker( + &resource_map, + rqs.get_resource_rq_map(), + &task, + &worker, + &group, + now, + ); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 1); assert_eq!(r.n_enabled_variants(), 1); @@ -300,7 +314,14 @@ mod tests { wset.insert(WorkerId::new(1)); wset.insert(WorkerId::new(132)); let group = WorkerGroup::new(wset); - let r = task_explain_for_worker(&resource_map, &task, &worker, &group, now); + let r = task_explain_for_worker( + &resource_map, + rqs.get_resource_rq_map(), + &task, + &worker, + &group, + now, + ); assert_eq!(r.variants.len(), 1); assert_eq!(r.variants[0].len(), 1); assert!(matches!( diff --git a/crates/tako/src/internal/server/worker.rs b/crates/tako/src/internal/server/worker.rs index ff17de27d..04ba6d28c 100644 --- a/crates/tako/src/internal/server/worker.rs +++ b/crates/tako/src/internal/server/worker.rs @@ -1,10 +1,10 @@ use std::fmt; use crate::gateway::{LostWorkerReason, WorkerRuntimeInfo}; +use crate::internal::common::Set; use crate::internal::common::resources::map::{ResourceIdMap, ResourceRqMap}; use crate::internal::common::resources::{ResourceRequest, ResourceRequestVariants}; use crate::internal::common::resources::{ResourceRqId, TimeRequest}; -use crate::internal::common::Set; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use crate::internal::server::comm::Comm; use crate::internal::server::task::{Task, TaskRuntimeState}; diff --git a/crates/tako/src/internal/tests/integration/test_basic.rs b/crates/tako/src/internal/tests/integration/test_basic.rs index 59fc03f67..f2773242f 100644 --- 
a/crates/tako/src/internal/tests/integration/test_basic.rs +++ b/crates/tako/src/internal/tests/integration/test_basic.rs @@ -1,9 +1,9 @@ use crate::control::{NewWorkerAllocationResponse, WorkerTypeQuery}; use crate::internal::tests::integration::utils::check_file_contents; -use crate::internal::tests::integration::utils::server::{run_server_test, ServerHandle}; +use crate::internal::tests::integration::utils::server::{ServerHandle, run_server_test}; use crate::internal::tests::integration::utils::task::ResourceRequestConfigBuilder; use crate::internal::tests::integration::utils::task::{ - simple_args, simple_task, GraphBuilder, TaskConfigBuilder, + GraphBuilder, TaskConfigBuilder, simple_args, simple_task, }; use crate::program::StdioDef; use crate::resources::ResourceDescriptor; diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index 4843d949e..e8b18e5b5 100644 --- a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -21,9 +21,9 @@ use crate::gateway::{ use crate::internal::common::resources::ResourceRqId; use crate::internal::common::{Map, Set}; use crate::internal::messages::common::TaskFailInfo; -use crate::internal::tests::integration::utils::api::{wait_for_tasks, WaitResult}; +use crate::internal::tests::integration::utils::api::{WaitResult, wait_for_tasks}; use crate::internal::tests::integration::utils::worker::{ - start_worker, WorkerContext, WorkerHandle, + WorkerContext, WorkerHandle, start_worker, }; use crate::resources::ResourceRqAllocator; use crate::task::SerializedTaskContext; @@ -122,7 +122,7 @@ impl ServerHandle { #[cfg(test)] pub fn register_default_request(&self) -> ResourceRqId { self.server_ref - .get_or_create_raw_rq_id(crate::resources::ResourceRequestVariants::default()) + .get_or_create_resource_rq_id(&ResourceRequestVariants::default()) } #[cfg(test)] diff --git 
a/crates/tako/src/internal/tests/test_worker.rs b/crates/tako/src/internal/tests/test_worker.rs index 0d3531999..ef133ac9f 100644 --- a/crates/tako/src/internal/tests/test_worker.rs +++ b/crates/tako/src/internal/tests/test_worker.rs @@ -6,11 +6,11 @@ use crate::internal::messages::worker::{ WorkerResourceCounts, }; use crate::internal::server::workerload::WorkerResources; -use crate::internal::tests::utils::resources::{ra_builder, ResourceRequestBuilder}; +use crate::internal::tests::utils::resources::{ResourceRequestBuilder, ra_builder}; use crate::internal::worker::comm::WorkerComm; use crate::internal::worker::configuration::{ - OverviewConfiguration, DEFAULT_MAX_DOWNLOAD_TRIES, DEFAULT_MAX_PARALLEL_DOWNLOADS, - DEFAULT_WAIT_BETWEEN_DOWNLOAD_TRIES, + DEFAULT_MAX_DOWNLOAD_TRIES, DEFAULT_MAX_PARALLEL_DOWNLOADS, + DEFAULT_WAIT_BETWEEN_DOWNLOAD_TRIES, OverviewConfiguration, }; use crate::internal::worker::rpc::process_worker_message; use crate::internal::worker::state::WorkerStateRef; @@ -232,10 +232,12 @@ fn test_worker_other_workers() { assert_eq!(state.ready_task_queue.worker_resources()[&wr2], t); process_worker_message(&mut state, ToWorkerMessage::LostWorker(30.into())); - assert!(state - .ready_task_queue - .worker_resources() - .get(&wr1) - .is_none()); + assert!( + state + .ready_task_queue + .worker_resources() + .get(&wr1) + .is_none() + ); assert_eq!(state.ready_task_queue.worker_resources()[&wr2], t); } diff --git a/crates/tako/src/internal/worker/state.rs b/crates/tako/src/internal/worker/state.rs index 64cfe159c..834df5eba 100644 --- a/crates/tako/src/internal/worker/state.rs +++ b/crates/tako/src/internal/worker/state.rs @@ -17,6 +17,7 @@ use std::rc::Rc; use std::sync::Arc; use std::time::{Duration, Instant}; +use crate::WorkerId; use crate::internal::worker::data::download::WorkerDownloadManagerRef; use crate::internal::worker::localcomm::LocalCommState; use crate::internal::worker::resources::allocator::ResourceAllocator; @@ -26,12 +27,11 @@ use 
crate::internal::worker::task::{RunningState, Task, TaskState}; use crate::internal::worker::task_comm::RunningTaskComm; use crate::launcher::TaskLauncher; use crate::resources::ResourceRequestVariants; -use crate::WorkerId; use crate::{PriorityTuple, TaskId}; use orion::aead::SecretKey; +use rand::SeedableRng; use rand::prelude::IndexedRandom; use rand::rngs::SmallRng; -use rand::SeedableRng; use tokio::sync::oneshot; pub type TaskMap = StableMap; @@ -345,10 +345,11 @@ impl WorkerState { &other_worker.address ); assert_ne!(self.worker_id, other_worker.worker_id); // We should not receive message about ourselves - assert!(self - .worker_addresses - .insert(other_worker.worker_id, other_worker.address) - .is_none()); + assert!( + self.worker_addresses + .insert(other_worker.worker_id, other_worker.address) + .is_none() + ); let resources = WorkerResources::from_transport(other_worker.resources); self.ready_task_queue diff --git a/crates/tako/src/lib.rs b/crates/tako/src/lib.rs index 21382cd72..75f0c5bbb 100644 --- a/crates/tako/src/lib.rs +++ b/crates/tako/src/lib.rs @@ -12,10 +12,10 @@ pub mod hwstats; pub mod launcher; pub mod program; +pub use crate::internal::common::WrappedRcRefCell; pub use crate::internal::common::index::{AsIdVec, ItemId}; pub use crate::internal::common::taskgroup::TaskGroup; pub use crate::internal::common::utils::format_comma_delimited; -pub use crate::internal::common::WrappedRcRefCell; pub use crate::internal::common::{Map, Set}; pub use crate::internal::common::ids::{ @@ -35,13 +35,13 @@ pub const MAX_FRAME_SIZE: usize = 128 * 1024 * 1024; pub mod resources { pub use crate::internal::common::resources::{ - Allocation, AllocationRequest, GlobalResourceMapping, NumOfNodes, ResourceAllocRequest, - ResourceAllocation, ResourceAmount, ResourceDescriptor, ResourceDescriptorCoupling, - ResourceDescriptorCouplingItem, ResourceDescriptorItem, ResourceDescriptorKind, - ResourceFractions, ResourceGroupIdx, ResourceIndex, ResourceLabel, 
ResourceRequest, - ResourceRequestEntries, ResourceRequestVariants, ResourceRqAllocator, ResourceRqId, - ResourceUnits, TimeRequest, AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, - MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, + AMD_GPU_RESOURCE_NAME, Allocation, AllocationRequest, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, + GlobalResourceMapping, MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, NumOfNodes, + ResourceAllocRequest, ResourceAllocation, ResourceAmount, ResourceDescriptor, + ResourceDescriptorCoupling, ResourceDescriptorCouplingItem, ResourceDescriptorItem, + ResourceDescriptorKind, ResourceFractions, ResourceGroupIdx, ResourceIndex, ResourceLabel, + ResourceRequest, ResourceRequestEntries, ResourceRequestVariants, ResourceRqAllocator, + ResourceRqId, ResourceUnits, TimeRequest, }; pub use crate::internal::common::resources::map::ResourceIdMap; From 3e71189f1620417eba3bce093c3c1142cef18f52 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Thu, 18 Dec 2025 13:23:17 +0100 Subject: [PATCH 06/17] ServerQuery migrated to ResourceRqId --- crates/tako/src/internal/scheduler/query.rs | 19 +++++++++---------- .../internal/tests/integration/test_basic.rs | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/crates/tako/src/internal/scheduler/query.rs b/crates/tako/src/internal/scheduler/query.rs index d9919e047..652f80821 100644 --- a/crates/tako/src/internal/scheduler/query.rs +++ b/crates/tako/src/internal/scheduler/query.rs @@ -19,22 +19,21 @@ pub(crate) fn compute_new_worker_query( core: &mut Core, queries: &[WorkerTypeQuery], ) -> NewWorkerAllocationResponse { - todo!() - /*log::debug!("Compute new worker query: query = {queries:?}"); + log::debug!("Compute new worker query: query = {queries:?}"); // Scheduler has to be performed before the query, so there should be no ready_to_assign tasks assert!(core.sn_ready_to_assign().is_empty() || !core.has_workers()); - let add_task = |new_loads: &mut [WorkerTypeState], task: &Task| { - let request = 
core.get_resource_rq_map().get(&task.resource_rq_id); + let add_task = |core: &Core, new_loads: &mut [WorkerTypeState], task: &Task| { + let request = core.get_resource_rq_map().get(task.resource_rq_id); for ws in new_loads.iter_mut() { if !ws.w_resources.is_capable_to_run_with(request, |rq| { ws.time_limit.is_none_or(|t| rq.min_time() <= t) }) { if ws.partial && ws.w_resources.is_lowerbound_for(request, |rq| { - ws.time_limit.is_none_or(|t| rq.min_time() <= t) - }) + ws.time_limit.is_none_or(|t| rq.min_time() <= t) + }) { ws.min = 1; } @@ -86,14 +85,14 @@ pub(crate) fn compute_new_worker_query( load.add_request(task.id, request, task.running_variant(), &worker.resources); continue; } - add_task(&mut new_loads, task); + add_task(core, &mut new_loads, task); } } for task_id in core.sleeping_sn_tasks() { let Some(task) = core.find_task(*task_id) else { continue; }; - add_task(&mut new_loads, task); + add_task(core, &mut new_loads, task); } // `compute_new_worker_query` should be called immediately after scheduling was performed, @@ -104,7 +103,7 @@ pub(crate) fn compute_new_worker_query( let Some(task) = core.find_task(*task_id) else { continue; }; - add_task(&mut new_loads, task); + add_task(core, &mut new_loads, task); } let single_node_allocations: Vec = new_loads @@ -154,5 +153,5 @@ pub(crate) fn compute_new_worker_query( NewWorkerAllocationResponse { single_node_workers_per_query: single_node_allocations, multi_node_allocations, - }*/ + } } diff --git a/crates/tako/src/internal/tests/integration/test_basic.rs b/crates/tako/src/internal/tests/integration/test_basic.rs index f2773242f..5dcfc7ea9 100644 --- a/crates/tako/src/internal/tests/integration/test_basic.rs +++ b/crates/tako/src/internal/tests/integration/test_basic.rs @@ -13,7 +13,7 @@ use tokio::time::sleep; #[tokio::test] async fn test_submit_simple_task_ok() { run_server_test(Default::default(), |mut handler| async move { - let rq = todo!(); // handler.register_default_request(); + let rq = 
handler.register_default_request(); let worker = handler.start_worker(Default::default()).await.unwrap(); let stdout = worker.workdir.join("test.out"); From 1014aceb396f65e30cad247a91019541e4707e05 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Thu, 18 Dec 2025 14:07:54 +0100 Subject: [PATCH 07/17] HQ tests fixed --- .../src/server/autoalloc/process.rs | 100 +++++++++++------- crates/hyperqueue/src/server/client/submit.rs | 69 ++++++------ .../tests/integration/utils/server.rs | 1 - 3 files changed, 102 insertions(+), 68 deletions(-) diff --git a/crates/hyperqueue/src/server/autoalloc/process.rs b/crates/hyperqueue/src/server/autoalloc/process.rs index aee4d7f6b..a604840e8 100644 --- a/crates/hyperqueue/src/server/autoalloc/process.rs +++ b/crates/hyperqueue/src/server/autoalloc/process.rs @@ -1276,8 +1276,8 @@ mod tests { use derive_builder::Builder; use log::LevelFilter; use tako::WorkerId; - use tako::gateway::LostWorkerReason; - use tako::resources::ResourceDescriptor; + use tako::gateway::{LostWorkerReason, ResourceRequestVariants}; + use tako::resources::{ResourceDescriptor, ResourceRqAllocator, ResourceRqId}; use tako::tests::integration::utils::api::wait_for_worker_connected; use tako::tests::integration::utils::server::{ ServerConfigBuilder, ServerHandle, run_server_test, @@ -1502,7 +1502,8 @@ mod tests { ) .await; - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); ctx.try_submit().await; @@ -1529,7 +1530,8 @@ mod tests { ) .await; - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; let allocations = ctx.get_allocations(queue_id); @@ -1558,7 +1560,8 @@ mod tests { // Note: we currently create an allocation per queue even if the task count is smaller // than the queue count. Could be improved in the future. 
- ctx.create_simple_tasks(1).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(1, rq_id).await; ctx.try_submit().await; for queue_id in queues { @@ -1581,14 +1584,14 @@ mod tests { ) .await; + let rq_id = ctx + .handle + .register_request(ResourceRequestConfigBuilder::default().cpus(4)); // Create 4 CPU core tasks ctx.handle .submit( GraphBuilder::default() - .task( - TaskConfigBuilder::default() - .resources(ResourceRequestConfigBuilder::default().cpus(4)), - ) + .task(TaskConfigBuilder::default().resources(rq_id)) .build(), ) .await; @@ -1607,8 +1610,8 @@ mod tests { let queue_id = ctx .add_queue(always_queued_handler(), QueueBuilder::default().backlog(4)) .await; - - ctx.create_simple_tasks(1000).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(1000, rq_id).await; // Create a single allocation ctx.try_submit().await; @@ -1637,7 +1640,8 @@ mod tests { .await; ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; for _ in 0..5 { ctx.try_submit().await; @@ -1658,7 +1662,8 @@ mod tests { ) .await; - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; // Worker from an unknown allocation ctx.start_worker(WorkerConfigBuilder::default(), "foo") @@ -1683,7 +1688,8 @@ mod tests { ) .await; - ctx.create_simple_tasks(1).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(1, rq_id).await; ctx.try_submit().await; let allocations = ctx.get_allocations(queue_id); @@ -1706,7 +1712,8 @@ mod tests { ) .await; - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; let allocations = ctx.get_allocations(queue_id); @@ -1732,7 +1739,8 @@ mod tests { ) .await; - ctx.create_simple_tasks(100).await; + let rq_id = 
ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; let w0 = ctx @@ -1762,7 +1770,8 @@ mod tests { .await; ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; let allocations = ctx.get_allocations(queue_id); @@ -1816,7 +1825,8 @@ mod tests { .await; ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); - ctx.create_simple_tasks(5).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(5, rq_id).await; ctx.try_submit().await; // 5 tasks, 3 * 2 workers -> last two allocations should be ignored @@ -1837,7 +1847,8 @@ mod tests { .await; ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); - ctx.create_simple_tasks(5).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(5, rq_id).await; handler_state.get_mut().allocation_will_fail = true; @@ -1863,16 +1874,17 @@ mod tests { QueueBuilder::default().timelimit(Duration::from_secs(60 * 30)), ) .await; + + let rq_id = ctx.handle.register_request( + ResourceRequestConfigBuilder::default() + .cpus(1) + .min_time(Duration::from_secs(60 * 60)), + ); + ctx.handle .submit( GraphBuilder::default() - .task( - TaskConfigBuilder::default().resources( - ResourceRequestConfigBuilder::default() - .cpus(1) - .min_time(Duration::from_secs(60 * 60)), - ), - ) + .task(TaskConfigBuilder::default().resources(rq_id)) .build(), ) .await; @@ -1896,7 +1908,8 @@ mod tests { .await; ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; // Put 4 allocations into the queue. 
ctx.try_submit().await; @@ -1937,8 +1950,9 @@ mod tests { ) .await; + let rq_id = ctx.default_rq_id(); ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); - ctx.create_simple_tasks(100).await; + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; let allocations = ctx.get_allocations(queue_id); @@ -1961,7 +1975,8 @@ mod tests { ctx.state.set_max_kept_directories(max_kept); ctx.add_queue(fails_submit_handler(), QueueBuilder::default()) .await; - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; let dirs = [make_dir(), make_dir()]; ctx.state @@ -1992,7 +2007,8 @@ mod tests { .limiter_max_submit_fails(2), ) .await; - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; ctx.check_queue_status(queue_id, AllocationQueueState::Active); @@ -2015,7 +2031,8 @@ mod tests { .await; ctx.assign_worker_resource(queue_id, WorkerConfigBuilder::default()); - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; ctx.try_submit().await; let allocations = ctx.get_allocations(queue_id); @@ -2051,7 +2068,8 @@ mod tests { ]), ) .await; - ctx.create_simple_tasks(100).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(100, rq_id).await; shared.get_mut().allocation_will_fail = true; @@ -2112,7 +2130,8 @@ mod tests { QueueBuilder::default().backlog(1).max_workers_per_alloc(1), ) .await; - ctx.create_simple_tasks(1000).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(1000, rq_id).await; ctx.try_submit().await; let allocations = ctx.get_allocations(queue_id); @@ -2139,7 +2158,8 @@ mod tests { QueueBuilder::default().backlog(1).max_workers_per_alloc(1), ) .await; - ctx.create_simple_tasks(1000).await; + let rq_id = ctx.default_rq_id(); + ctx.create_simple_tasks(1000, rq_id).await; ctx.try_submit().await; let 
allocations = ctx.get_allocations(queue_id); @@ -2170,6 +2190,12 @@ mod tests { } impl TestCtx { + fn default_rq_id(&self) -> ResourceRqId { + self.senders + .server + .get_or_create_resource_rq_id(&ResourceRequestVariants::default()) + } + async fn add_queue( &mut self, handler: Box, @@ -2208,12 +2234,14 @@ mod tests { .unwrap(); } - async fn create_simple_tasks(&mut self, count: u64) { + async fn create_simple_tasks(&mut self, count: u64, resource_rq_id: ResourceRqId) { self.handle .submit( GraphBuilder::default() .task_copied( - TaskConfigBuilder::default().args(simple_args(&["ls"])), + TaskConfigBuilder::default() + .resources(resource_rq_id) + .args(simple_args(&["ls"])), count, ) .build(), diff --git a/crates/hyperqueue/src/server/client/submit.rs b/crates/hyperqueue/src/server/client/submit.rs index bdbfe447a..b593c246d 100644 --- a/crates/hyperqueue/src/server/client/submit.rs +++ b/crates/hyperqueue/src/server/client/submit.rs @@ -537,8 +537,8 @@ mod tests { use crate::server::client::validate_submit; use crate::server::job::{Job, SubmittedJobDescription}; use crate::transfer::messages::{ - JobDescription, JobSubmitDescription, JobTaskDescription, PinMode, SubmitResponse, - TaskDescription, TaskKind, TaskKindProgram, TaskWithDependencies, + JobDescription, JobSubmitDescription, JobTaskDescription, LocalResourceRqId, PinMode, + SubmitResponse, TaskDescription, TaskKind, TaskKindProgram, TaskWithDependencies, }; use chrono::Utc; use smallvec::smallvec; @@ -549,7 +549,7 @@ mod tests { }; use tako::internal::tests::utils::sorted_vec; use tako::program::ProgramDefinition; - use tako::resources::{AllocationRequest, CPU_RESOURCE_NAME, ResourceAmount}; + use tako::resources::{AllocationRequest, CPU_RESOURCE_NAME, ResourceAmount, ResourceRqId}; use tako::{Priority, TaskId}; #[test] @@ -568,7 +568,8 @@ mod tests { task_desc: JobTaskDescription::Array { ids: IntArray::from_range(100, 10), entries: None, - task_desc: task_desc(None, 0, 1), + task_desc: task_desc(None, 
0), + resource_rq: ResourceRequestVariants::default(), }, submit_dir: Default::default(), stream_path: None, @@ -578,17 +579,21 @@ mod tests { let job_task_desc = JobTaskDescription::Array { ids: IntArray::from_range(109, 2), entries: None, - task_desc: task_desc(None, 0, 1), + task_desc: task_desc(None, 0), + resource_rq: ResourceRequestVariants::default(), }; assert!(validate_submit(None, &job_task_desc).is_none()); assert!(matches!( validate_submit(Some(&job), &job_task_desc), Some(SubmitResponse::TaskIdAlreadyExists(x)) if x.as_num() == 109 )); + let rqs = vec![ResourceRequestVariants::default()]; let job_task_desc = JobTaskDescription::Graph { + resource_rqs: rqs, tasks: vec![TaskWithDependencies { id: 102.into(), - task_desc: task_desc(None, 0, 1), + resource_rq_id: LocalResourceRqId::new(0), + task_desc: task_desc(None, 0), task_deps: vec![], data_deps: vec![], data_flags: TaskDataFlags::empty(), @@ -600,17 +605,20 @@ mod tests { Some(SubmitResponse::TaskIdAlreadyExists(x)) if x.as_num() == 102 )); let job_task_desc = JobTaskDescription::Graph { + resource_rqs: vec![ResourceRequestVariants::default()], tasks: vec![ TaskWithDependencies { id: 2.into(), - task_desc: task_desc(None, 0, 1), + resource_rq_id: LocalResourceRqId::new(0), + task_desc: task_desc(None, 0), task_deps: vec![], data_deps: vec![], data_flags: TaskDataFlags::empty(), }, TaskWithDependencies { id: 2.into(), - task_desc: task_desc(None, 0, 1), + resource_rq_id: LocalResourceRqId::new(0), + task_desc: task_desc(None, 0), task_deps: vec![], data_deps: vec![], data_flags: TaskDataFlags::empty(), @@ -622,9 +630,11 @@ mod tests { Some(SubmitResponse::NonUniqueTaskId(x)) if x.as_num() == 2 )); let job_task_desc = JobTaskDescription::Graph { + resource_rqs: vec![ResourceRequestVariants::default()], tasks: vec![TaskWithDependencies { id: 2.into(), - task_desc: task_desc(None, 0, 1), + resource_rq_id: LocalResourceRqId::new(0), + task_desc: task_desc(None, 0), task_deps: vec![3.into()], data_deps: 
vec![], data_flags: TaskDataFlags::empty(), @@ -635,9 +645,11 @@ mod tests { Some(SubmitResponse::InvalidDependencies(x)) if x.as_num() == 3 )); let job_task_desc = JobTaskDescription::Graph { + resource_rqs: vec![ResourceRequestVariants::default()], tasks: vec![TaskWithDependencies { id: 2.into(), - task_desc: task_desc(None, 0, 1), + resource_rq_id: LocalResourceRqId::new(0), + task_desc: task_desc(None, 0), task_deps: vec![2.into()], data_deps: vec![], data_flags: TaskDataFlags::empty(), @@ -651,16 +663,17 @@ mod tests { #[test] fn test_build_graph_with_dependencies() { - let desc = || task_desc(None, 0, 1); + let desc = || task_desc(None, 0); let tasks = vec![ - task(0, desc(), vec![2, 1]), - task(1, desc(), vec![0]), - task(2, desc(), vec![3, 4]), - task(3, desc(), vec![]), - task(4, desc(), vec![0]), + task(0, 0, desc(), vec![2, 1]), + task(1, 0, desc(), vec![0]), + task(2, 0, desc(), vec![3, 4]), + task(3, 0, desc(), vec![]), + task(4, 0, desc(), vec![0]), ]; - let msg = build_tasks_graph(1.into(), &tasks, &PathBuf::from("foo"), None); + let rqs = vec![ResourceRqId::new(0)]; + let msg = build_tasks_graph(&rqs, 1.into(), &tasks, &PathBuf::from("foo"), None); assert_eq!( sorted_vec(msg.tasks[0].task_deps.to_vec()), vec![ @@ -686,11 +699,7 @@ mod tests { ); } - fn task_desc( - time_limit: Option, - priority: Priority, - cpu_count: u32, - ) -> TaskDescription { + fn task_desc(time_limit: Option, priority: Priority) -> TaskDescription { TaskDescription { kind: TaskKind::ExternalProgram(TaskKindProgram { program: ProgramDefinition { @@ -704,23 +713,21 @@ mod tests { pin_mode: PinMode::None, task_dir: false, }), - resources: ResourceRequestVariants::new_simple(ResourceRequest { - n_nodes: 0, - min_time: Duration::from_secs(2), - resources: smallvec![ResourceRequestEntry { - resource: CPU_RESOURCE_NAME.to_string(), - policy: AllocationRequest::Compact(ResourceAmount::new_units(cpu_count)), - }], - }), time_limit, priority, crash_limit: CrashLimit::default(), } } - 
fn task(id: u32, task_desc: TaskDescription, dependencies: Vec) -> TaskWithDependencies { + fn task( + id: u32, + resource_rq_id: u32, + task_desc: TaskDescription, + dependencies: Vec, + ) -> TaskWithDependencies { TaskWithDependencies { id: id.into(), + resource_rq_id: LocalResourceRqId::new(resource_rq_id), task_desc, task_deps: dependencies.into_iter().map(|id| id.into()).collect(), data_deps: vec![], diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index e8b18e5b5..8318fe933 100644 --- a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -125,7 +125,6 @@ impl ServerHandle { .get_or_create_resource_rq_id(&ResourceRequestVariants::default()) } - #[cfg(test)] pub fn register_request(&self, rbuilder: ResourceRequestConfigBuilder) -> ResourceRqId { let rqv = rbuilder.into_rqv(); self.server_ref.get_or_create_resource_rq_id(&rqv) From 2f49a83ae83e28615a59aa530d21fbea9f00c95b Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Fri, 19 Dec 2025 15:46:06 +0100 Subject: [PATCH 08/17] State restoring migrated to request ids --- .../src/server/autoalloc/process.rs | 2 +- crates/hyperqueue/src/server/bootstrap.rs | 5 ++--- crates/hyperqueue/src/server/client/submit.rs | 13 ++++++------ crates/hyperqueue/src/server/restore.rs | 7 +++++-- crates/tako/src/control.rs | 20 +++++++------------ .../tako/src/internal/common/resources/map.rs | 7 ------- .../tako/src/internal/common/resources/mod.rs | 2 +- .../tests/integration/utils/server.rs | 1 - crates/tako/src/lib.rs | 4 ++-- 9 files changed, 25 insertions(+), 36 deletions(-) diff --git a/crates/hyperqueue/src/server/autoalloc/process.rs b/crates/hyperqueue/src/server/autoalloc/process.rs index a604840e8..bc2bb6f23 100644 --- a/crates/hyperqueue/src/server/autoalloc/process.rs +++ b/crates/hyperqueue/src/server/autoalloc/process.rs @@ -1277,7 +1277,7 @@ mod tests { use 
log::LevelFilter; use tako::WorkerId; use tako::gateway::{LostWorkerReason, ResourceRequestVariants}; - use tako::resources::{ResourceDescriptor, ResourceRqAllocator, ResourceRqId}; + use tako::resources::{ResourceDescriptor, ResourceRqId}; use tako::tests::integration::utils::api::wait_for_worker_connected; use tako::tests::integration::utils::server::{ ServerConfigBuilder, ServerHandle, run_server_test, diff --git a/crates/hyperqueue/src/server/bootstrap.rs b/crates/hyperqueue/src/server/bootstrap.rs index 78de53e0e..b2ece36a9 100644 --- a/crates/hyperqueue/src/server/bootstrap.rs +++ b/crates/hyperqueue/src/server/bootstrap.rs @@ -358,11 +358,10 @@ async fn start_server( ) .await?; let new_tasks_and_queues = if let Some(restorer) = restorer { - // This is early state recovery, we restore jobs later as we start futures because restoring - // jobs already needs a running Tako let mut state = state_ref.get_mut(); + let ra = &senders.server_control; state.restore_state(&restorer); - Some(restorer.restore_jobs_and_queues(&mut state)?) + Some(restorer.restore_jobs_and_queues(&mut state, ra)?) 
} else { None }; diff --git a/crates/hyperqueue/src/server/client/submit.rs b/crates/hyperqueue/src/server/client/submit.rs index b593c246d..2ab2b6d35 100644 --- a/crates/hyperqueue/src/server/client/submit.rs +++ b/crates/hyperqueue/src/server/client/submit.rs @@ -24,12 +24,13 @@ use crate::transfer::messages::{ TaskExplainResponse, TaskIdSelector, TaskKind, TaskKindProgram, TaskSelector, TaskStatusSelector, TaskWithDependencies, ToClientMessage, }; +use tako::control::ServerRef; use tako::program::ProgramDefinition; -use tako::resources::{GlobalResourceMapping, ResourceRqAllocator, ResourceRqId}; +use tako::resources::{GlobalResourceMapping, ResourceRqId}; use tako::{JobId, JobTaskCount, JobTaskId}; fn create_task_submit( - ra: &dyn ResourceRqAllocator, + server_ref: &ServerRef, job_id: JobId, submit_desc: &mut JobSubmitDescription, ) -> TaskSubmit { @@ -41,7 +42,7 @@ fn create_task_submit( resource_rq, } => { //let rqv = grm.convert_client_resource_rq(resource_rq); - let resource_rq_id = ra.get_or_create_resource_rq_id(resource_rq); + let resource_rq_id = server_ref.get_or_create_resource_rq_id(resource_rq); build_tasks_array( job_id, ids, @@ -58,7 +59,7 @@ fn create_task_submit( } => { let resources: Vec = resource_rqs .iter() - .map(|rqv| ra.get_or_create_resource_rq_id(rqv)) + .map(|rqv| server_ref.get_or_create_resource_rq_id(rqv)) .collect(); build_tasks_graph( &resources, @@ -73,13 +74,13 @@ fn create_task_submit( pub(crate) fn submit_job_desc( state: &mut State, - ra: &dyn ResourceRqAllocator, + server_ref: &ServerRef, job_id: JobId, mut submit_desc: JobSubmitDescription, submitted_at: DateTime, ) -> TaskSubmit { prepare_job(job_id, &mut submit_desc, state); - let task_submit = create_task_submit(ra, job_id, &mut submit_desc); + let task_submit = create_task_submit(server_ref, job_id, &mut submit_desc); submit_desc.strip_large_data(); state .get_job_mut(job_id) diff --git a/crates/hyperqueue/src/server/restore.rs b/crates/hyperqueue/src/server/restore.rs 
index b9189f8fe..733d7ceb3 100644 --- a/crates/hyperqueue/src/server/restore.rs +++ b/crates/hyperqueue/src/server/restore.rs @@ -9,6 +9,7 @@ use crate::server::state::State; use crate::transfer::messages::{JobDescription, SubmitRequest}; use crate::worker::start::RunningTaskContext; use std::path::Path; +use tako::control::ServerRef; use tako::gateway::TaskSubmit; use tako::resources::ResourceDescriptor; use tako::{InstanceId, ItemId, JobId, JobTaskId, Map, TaskId, WorkerId}; @@ -52,6 +53,7 @@ impl RestorerJob { mut self, job_id: JobId, state: &mut State, + server_ref: &ServerRef, ) -> crate::Result> { log::debug!("Restoring job {job_id}"); let job = Job::new(job_id, self.job_desc, self.is_open); @@ -66,7 +68,7 @@ impl RestorerJob { } let mut new_tasks = submit_job_desc( state, - todo!(), + server_ref, job_id, submit.description().clone(), submit.submitted_at(), @@ -167,10 +169,11 @@ impl StateRestorer { pub fn restore_jobs_and_queues( mut self, state: &mut State, + server_ref: &ServerRef, ) -> crate::Result<(Vec, Vec)> { let mut jobs = Vec::new(); for (job_id, job) in self.jobs { - let mut new_jobs = job.restore_job(job_id, state)?; + let mut new_jobs = job.restore_job(job_id, state, server_ref)?; jobs.append(&mut new_jobs); } let queues: Vec = self diff --git a/crates/tako/src/control.rs b/crates/tako/src/control.rs index df298e6d6..7769a033f 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -13,7 +13,6 @@ use crate::gateway::{ LostWorkerReason, MultiNodeAllocationResponse, TaskSubmit, WorkerRuntimeInfo, }; use crate::internal::common::error::DsError; -use crate::internal::common::resources::map::ResourceRqAllocator; use crate::internal::common::resources::{ResourceId, ResourceRqId}; use crate::internal::messages::worker::ToWorkerMessage; use crate::internal::scheduler::query::compute_new_worker_query; @@ -219,6 +218,13 @@ impl ServerRef { let (rq_id, _) = get_or_create_raw_resource_rq_id(&mut core, &mut *comm, rqv); rq_id } + + pub fn 
get_or_create_resource_rq_id(&self, rqv: &ResourceRequestVariants) -> ResourceRqId { + let mut core = self.core_ref.get_mut(); + let mut comm = self.comm_ref.get_mut(); + let (rq_id, _) = get_or_create_resource_rq_id(&mut core, &mut *comm, &rqv); + rq_id + } } #[allow(clippy::too_many_arguments)] @@ -266,15 +272,3 @@ pub fn server_start( Ok((ServerRef { core_ref, comm_ref }, future)) } - -impl ResourceRqAllocator for ServerRef { - fn get_or_create_resource_rq_id( - &self, - rqv: &crate::gateway::ResourceRequestVariants, - ) -> ResourceRqId { - let mut core = self.core_ref.get_mut(); - let mut comm = self.comm_ref.get_mut(); - let (rq_id, _) = get_or_create_resource_rq_id(&mut core, &mut *comm, &rqv); - rq_id - } -} diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index f63e7e356..64a4d55ee 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -208,10 +208,3 @@ impl ResourceRqMap { } } } - -pub trait ResourceRqAllocator { - fn get_or_create_resource_rq_id( - &self, - rqv: &crate::gateway::ResourceRequestVariants, - ) -> ResourceRqId; -} diff --git a/crates/tako/src/internal/common/resources/mod.rs b/crates/tako/src/internal/common/resources/mod.rs index e6a885cd7..703df1513 100644 --- a/crates/tako/src/internal/common/resources/mod.rs +++ b/crates/tako/src/internal/common/resources/mod.rs @@ -13,7 +13,7 @@ pub use descriptor::{ }; pub use map::{ AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, GlobalResourceMapping, - MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, ResourceRqAllocator, + MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, }; pub use request::{ AllocationRequest, ResourceAllocRequest, ResourceRequest, ResourceRequestEntries, diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index 8318fe933..db636ecb8 100644 --- 
a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -25,7 +25,6 @@ use crate::internal::tests::integration::utils::api::{WaitResult, wait_for_tasks use crate::internal::tests::integration::utils::worker::{ WorkerContext, WorkerHandle, start_worker, }; -use crate::resources::ResourceRqAllocator; use crate::task::SerializedTaskContext; use crate::tests::integration::utils::task::{ResourceRequestConfig, ResourceRequestConfigBuilder}; use crate::worker::{WorkerConfiguration, WorkerOverview}; diff --git a/crates/tako/src/lib.rs b/crates/tako/src/lib.rs index 75f0c5bbb..146649e79 100644 --- a/crates/tako/src/lib.rs +++ b/crates/tako/src/lib.rs @@ -40,8 +40,8 @@ pub mod resources { ResourceAllocRequest, ResourceAllocation, ResourceAmount, ResourceDescriptor, ResourceDescriptorCoupling, ResourceDescriptorCouplingItem, ResourceDescriptorItem, ResourceDescriptorKind, ResourceFractions, ResourceGroupIdx, ResourceIndex, ResourceLabel, - ResourceRequest, ResourceRequestEntries, ResourceRequestVariants, ResourceRqAllocator, - ResourceRqId, ResourceUnits, TimeRequest, + ResourceRequest, ResourceRequestEntries, ResourceRequestVariants, ResourceRqId, + ResourceUnits, TimeRequest, }; pub use crate::internal::common::resources::map::ResourceIdMap; From 1653c0e3bf7c1eca08f29aeaab6370720a0293fc Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Fri, 19 Dec 2025 16:14:44 +0100 Subject: [PATCH 09/17] PyAPI updated to ResourceRqId --- .../src/client/commands/submit/jobfile.rs | 20 +++-- .../src/client/commands/submit/mod.rs | 4 +- crates/pyhq/src/client/job.rs | 87 +++++++++++-------- crates/tako/src/control.rs | 2 +- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/crates/hyperqueue/src/client/commands/submit/jobfile.rs b/crates/hyperqueue/src/client/commands/submit/jobfile.rs index f83b9c76f..eaa272722 100644 --- a/crates/hyperqueue/src/client/commands/submit/jobfile.rs +++ 
b/crates/hyperqueue/src/client/commands/submit/jobfile.rs @@ -1,5 +1,5 @@ use crate::client::commands::submit::command::{ - DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, send_submit_request, + send_submit_request, DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, }; use crate::client::commands::submit::defs::{ ArrayDef, JobDef, StdioDefFull, StdioDefInput, TaskDef, @@ -16,9 +16,9 @@ use crate::transfer::messages::{ use clap::Parser; use smallvec::smallvec; use std::path::PathBuf; -use tako::Map; use tako::gateway::{EntryType, ResourceRequest, ResourceRequestVariants, TaskDataFlags}; use tako::program::{FileOnCloseBehavior, ProgramDefinition, StdioDef}; +use tako::Map; use tako::{JobId, JobTaskCount, JobTaskId}; #[derive(Parser)] @@ -210,13 +210,9 @@ fn build_job_desc_individual_tasks( ))); } - let mut resource_rqs_pairs: Vec<_> = resource_map.into_iter().collect(); - resource_rqs_pairs.sort_unstable_by_key(|(_, v)| *v); - let resource_rqs = resource_rqs_pairs.into_iter().map(|(k, _)| k).collect(); - Ok(JobTaskDescription::Graph { tasks: new_tasks, - resource_rqs, + resource_rqs: resource_rq_map_to_vec(resource_map), }) } @@ -258,3 +254,13 @@ pub async fn submit_computation_from_job_file( let request = build_job_submit(jdef, opts.job)?; send_submit_request(gsettings, session, request, false, false, None).await } + +pub fn resource_rq_map_to_vec( + map: Map, +) -> Vec { + let mut result = vec![None; map.len()]; + for (rq, id) in map.into_iter() { + result[id.as_num() as usize] = Some(rq); + } + result.into_iter().map(|x| x.unwrap()).collect() +} diff --git a/crates/hyperqueue/src/client/commands/submit/mod.rs b/crates/hyperqueue/src/client/commands/submit/mod.rs index 5a20abe15..ea8450188 100644 --- a/crates/hyperqueue/src/client/commands/submit/mod.rs +++ b/crates/hyperqueue/src/client/commands/submit/mod.rs @@ -4,6 +4,6 @@ pub mod directives; mod jobfile; pub use command::SubmitJobTaskConfOpts; -pub use command::{JobSubmitOpts, submit_computation}; +pub use 
command::{submit_computation, JobSubmitOpts}; -pub use jobfile::{JobSubmitFileOpts, submit_computation_from_job_file}; +pub use jobfile::{resource_rq_map_to_vec, submit_computation_from_job_file, JobSubmitFileOpts}; diff --git a/crates/pyhq/src/client/job.rs b/crates/pyhq/src/client/job.rs index 1c6759713..b1c631cbe 100644 --- a/crates/pyhq/src/client/job.rs +++ b/crates/pyhq/src/client/job.rs @@ -1,10 +1,11 @@ use crate::marshal::FromPy; use crate::utils::error::ToPyResult; -use crate::{ClientContextPtr, FromPyObject, PyJobId, PyTaskId, borrow_mut, run_future}; +use crate::{borrow_mut, run_future, ClientContextPtr, FromPyObject, PyJobId, PyTaskId}; use hyperqueue::client::commands::submit::command::{DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH}; +use hyperqueue::client::commands::submit::resource_rq_map_to_vec; use hyperqueue::client::output::resolve_task_paths; use hyperqueue::client::resources::parse_allocation_request; -use hyperqueue::client::status::{Status, is_terminated}; +use hyperqueue::client::status::{is_terminated, Status}; use hyperqueue::common::arraydef::IntArray; use hyperqueue::common::utils::fs::get_current_dir; use hyperqueue::rpc_call; @@ -12,9 +13,9 @@ use hyperqueue::server::job::JobTaskState; use hyperqueue::transfer::messages::{ ForgetJobRequest, FromClientMessage, IdSelector, JobDescription, JobDetailRequest, JobInfoRequest, JobInfoResponse, JobSubmitDescription, JobTaskDescription as HqJobDescription, - PinMode, SubmitRequest, SubmitResponse, TaskDescription as HqTaskDescription, TaskIdSelector, - TaskKind, TaskKindProgram, TaskSelector, TaskStatusSelector, TaskWithDependencies, - ToClientMessage, + LocalResourceRqId, PinMode, SubmitRequest, SubmitResponse, + TaskDescription as HqTaskDescription, TaskIdSelector, TaskKind, TaskKindProgram, TaskSelector, + TaskStatusSelector, TaskWithDependencies, ToClientMessage, }; use pyo3::exceptions::PyException; use pyo3::prelude::PyAnyMethods; @@ -23,13 +24,13 @@ use pyo3::{Bound, IntoPyObject, PyAny, 
PyResult, Python}; use std::collections::{BTreeSet, HashMap}; use std::path::{Path, PathBuf}; use std::time::Duration; -use tako::JobTaskCount; use tako::gateway::{ CrashLimit, ResourceRequestEntries, ResourceRequestEntry, ResourceRequestVariants, TaskDataFlags, }; use tako::program::{FileOnCloseBehavior, ProgramDefinition, StdioDef}; -use tako::resources::{AllocationRequest, NumOfNodes, ResourceAmount}; +use tako::resources::{AllocationRequest, NumOfNodes, ResourceAmount, ResourceRqId}; +use tako::{JobTaskCount, Map}; #[derive(Debug, FromPyObject)] enum AllocationValue { @@ -77,8 +78,13 @@ pub struct PyJobDescription { pub fn submit_job_impl(py: Python, ctx: ClientContextPtr, job: PyJobDescription) -> PyResult { run_future(async move { let submit_dir = get_current_dir(); - let tasks = build_tasks(job.tasks, &submit_dir)?; - let task_desc = HqJobDescription::Graph { tasks }; + let mut resource_map = Map::new(); + let tasks = build_tasks(&mut resource_map, job.tasks, &submit_dir)?; + + let task_desc = HqJobDescription::Graph { + tasks, + resource_rqs: resource_rq_map_to_vec(resource_map), + }; let message = FromClientMessage::Submit( SubmitRequest { @@ -140,14 +146,17 @@ pub fn forget_job_impl(py: Python, ctx: ClientContextPtr, job_id: PyJobId) -> Py } fn build_tasks( + resource_map: &mut Map, tasks: Vec, submit_dir: &Path, ) -> anyhow::Result> { tasks .into_iter() .map(|mut task| { + let resource_rq_id = build_task_resources(&mut task, resource_map)?; Ok(TaskWithDependencies { id: task.id.into(), + resource_rq_id, task_deps: std::mem::take(&mut task.dependencies) .into_iter() .map(|id| id.into()) @@ -160,33 +169,13 @@ fn build_tasks( .collect() } -fn build_task_desc(desc: TaskDescription, submit_dir: &Path) -> anyhow::Result { - let args = desc.args.into_iter().map(|arg| arg.into()).collect(); - let env = desc - .env - .into_iter() - .map(|(k, v)| (k.into(), v.into())) - .collect(); - let stdout = desc - .stdout - .map(|stdio| StdioDef::File { - path: 
stdio.path.unwrap_or(PathBuf::from(DEFAULT_STDOUT_PATH)), - on_close: stdio.on_close.extract(), - }) - .unwrap_or_default(); - let stderr = desc - .stderr - .map(|stdio| StdioDef::File { - path: stdio.path.unwrap_or(PathBuf::from(DEFAULT_STDERR_PATH)), - on_close: stdio.on_close.extract(), - }) - .unwrap_or_default(); - let stdin = desc.stdin.unwrap_or_default(); - let cwd = desc.cwd.unwrap_or_else(|| submit_dir.to_path_buf()); - - let resources = if !desc.resource_request.is_empty() { +fn build_task_resources( + desc: &mut TaskDescription, + resource_map: &mut Map, +) -> anyhow::Result { + let rqv = if !desc.resource_request.is_empty() { ResourceRequestVariants::new( - desc.resource_request + std::mem::take(&mut desc.resource_request) .into_iter() .map(|rq| { anyhow::Ok(tako::gateway::ResourceRequest { @@ -221,6 +210,33 @@ fn build_task_desc(desc: TaskDescription, submit_dir: &Path) -> anyhow::Result anyhow::Result { + let args = desc.args.into_iter().map(|arg| arg.into()).collect(); + let env = desc + .env + .into_iter() + .map(|(k, v)| (k.into(), v.into())) + .collect(); + let stdout = desc + .stdout + .map(|stdio| StdioDef::File { + path: stdio.path.unwrap_or(PathBuf::from(DEFAULT_STDOUT_PATH)), + on_close: stdio.on_close.extract(), + }) + .unwrap_or_default(); + let stderr = desc + .stderr + .map(|stdio| StdioDef::File { + path: stdio.path.unwrap_or(PathBuf::from(DEFAULT_STDERR_PATH)), + on_close: stdio.on_close.extract(), + }) + .unwrap_or_default(); + let stdin = desc.stdin.unwrap_or_default(); + let cwd = desc.cwd.unwrap_or_else(|| submit_dir.to_path_buf()); Ok(HqTaskDescription { kind: TaskKind::ExternalProgram(TaskKindProgram { @@ -235,7 +251,6 @@ fn build_task_desc(desc: TaskDescription, submit_dir: &Path) -> anyhow::Result Date: Mon, 22 Dec 2025 14:19:14 +0100 Subject: [PATCH 10/17] Journal report migrated to ResourceRqId --- .../src/client/commands/journal/report.rs | 41 +++++++++++++------ .../src/client/commands/submit/jobfile.rs | 4 +- 
.../src/client/commands/submit/mod.rs | 4 +- crates/pyhq/src/client/job.rs | 4 +- crates/tako/src/control.rs | 2 +- 5 files changed, 35 insertions(+), 20 deletions(-) diff --git a/crates/hyperqueue/src/client/commands/journal/report.rs b/crates/hyperqueue/src/client/commands/journal/report.rs index 0d27d1f38..139086c9f 100644 --- a/crates/hyperqueue/src/client/commands/journal/report.rs +++ b/crates/hyperqueue/src/client/commands/journal/report.rs @@ -4,7 +4,7 @@ use crate::common::utils::time::parse_hms_or_human_time; use crate::server::autoalloc::AllocationId; use crate::server::event::journal::JournalReader; use crate::server::event::payload::EventPayload; -use crate::transfer::messages::{JobTaskDescription, SubmitRequest}; +use crate::transfer::messages::{JobTaskDescription, LocalResourceRqId, SubmitRequest}; use anyhow::anyhow; use chrono::{DateTime, Duration, TimeDelta, Utc}; use clap::{Parser, ValueHint}; @@ -15,7 +15,7 @@ use std::path::PathBuf; use tako::gateway::{ResourceRequest, ResourceRequestVariants}; use tako::resources::ResourceAmount; use tako::worker::WorkerConfiguration; -use tako::{JobId, JobTaskId, ResourceVariantId, TaskId, WorkerId}; +use tako::{JobId, JobTaskId, Map, ResourceVariantId, TaskId, WorkerId}; #[derive(Parser)] pub(crate) struct JournalReportOpts { @@ -113,7 +113,10 @@ impl ResCount { enum JobResourceRq { Array(ResourceRequestVariants), - TaskGraph(HashMap), + TaskGraph { + resource_rqs: Vec, + task_rqs: Map, + }, } struct TaskDuration { @@ -335,7 +338,12 @@ impl JournalStats { let jrq = self.job_requests.get(&task_id.job_id()).unwrap(); let rq = match jrq { JobResourceRq::Array(rq) => rq, - JobResourceRq::TaskGraph(map) => map.get(&task_id.job_task_id()).unwrap(), + JobResourceRq::TaskGraph { + resource_rqs, + task_rqs, + } => resource_rqs + .get(task_rqs.get(&task_id.job_task_id()).unwrap().as_usize()) + .unwrap(), }; let rq = &rq.variants[rv_id.as_usize()]; if rq.n_nodes > 0 { @@ -389,20 +397,27 @@ impl JournalStats { } fn 
new_submit(&mut self, job_id: JobId, submit: SubmitRequest) { - todo!() - /*let rq = match submit.submit_desc.task_desc { - JobTaskDescription::Array { task_desc, .. } => { - JobResourceRq::Array(task_desc.resources) - } - JobTaskDescription::Graph { tasks } => { + let rq = match submit.submit_desc.task_desc { + JobTaskDescription::Array { + task_desc, + resource_rq, + .. + } => JobResourceRq::Array(resource_rq), + JobTaskDescription::Graph { + tasks, + resource_rqs, + } => { let map = tasks .into_iter() - .map(|t| (t.id, t.task_desc.resources)) + .map(|t| (t.id, t.resource_rq_id)) .collect(); - JobResourceRq::TaskGraph(map) + JobResourceRq::TaskGraph { + task_rqs: map, + resource_rqs: resource_rqs.clone(), + } } }; - self.job_requests.insert(job_id, rq);*/ + self.job_requests.insert(job_id, rq); } fn new_worker( diff --git a/crates/hyperqueue/src/client/commands/submit/jobfile.rs b/crates/hyperqueue/src/client/commands/submit/jobfile.rs index eaa272722..97f46e182 100644 --- a/crates/hyperqueue/src/client/commands/submit/jobfile.rs +++ b/crates/hyperqueue/src/client/commands/submit/jobfile.rs @@ -1,5 +1,5 @@ use crate::client::commands::submit::command::{ - send_submit_request, DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, + DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH, send_submit_request, }; use crate::client::commands::submit::defs::{ ArrayDef, JobDef, StdioDefFull, StdioDefInput, TaskDef, @@ -16,9 +16,9 @@ use crate::transfer::messages::{ use clap::Parser; use smallvec::smallvec; use std::path::PathBuf; +use tako::Map; use tako::gateway::{EntryType, ResourceRequest, ResourceRequestVariants, TaskDataFlags}; use tako::program::{FileOnCloseBehavior, ProgramDefinition, StdioDef}; -use tako::Map; use tako::{JobId, JobTaskCount, JobTaskId}; #[derive(Parser)] diff --git a/crates/hyperqueue/src/client/commands/submit/mod.rs b/crates/hyperqueue/src/client/commands/submit/mod.rs index ea8450188..132d31442 100644 --- a/crates/hyperqueue/src/client/commands/submit/mod.rs +++ 
b/crates/hyperqueue/src/client/commands/submit/mod.rs @@ -4,6 +4,6 @@ pub mod directives; mod jobfile; pub use command::SubmitJobTaskConfOpts; -pub use command::{submit_computation, JobSubmitOpts}; +pub use command::{JobSubmitOpts, submit_computation}; -pub use jobfile::{resource_rq_map_to_vec, submit_computation_from_job_file, JobSubmitFileOpts}; +pub use jobfile::{JobSubmitFileOpts, resource_rq_map_to_vec, submit_computation_from_job_file}; diff --git a/crates/pyhq/src/client/job.rs b/crates/pyhq/src/client/job.rs index b1c631cbe..5adb09873 100644 --- a/crates/pyhq/src/client/job.rs +++ b/crates/pyhq/src/client/job.rs @@ -1,11 +1,11 @@ use crate::marshal::FromPy; use crate::utils::error::ToPyResult; -use crate::{borrow_mut, run_future, ClientContextPtr, FromPyObject, PyJobId, PyTaskId}; +use crate::{ClientContextPtr, FromPyObject, PyJobId, PyTaskId, borrow_mut, run_future}; use hyperqueue::client::commands::submit::command::{DEFAULT_STDERR_PATH, DEFAULT_STDOUT_PATH}; use hyperqueue::client::commands::submit::resource_rq_map_to_vec; use hyperqueue::client::output::resolve_task_paths; use hyperqueue::client::resources::parse_allocation_request; -use hyperqueue::client::status::{is_terminated, Status}; +use hyperqueue::client::status::{Status, is_terminated}; use hyperqueue::common::arraydef::IntArray; use hyperqueue::common::utils::fs::get_current_dir; use hyperqueue::rpc_call; diff --git a/crates/tako/src/control.rs b/crates/tako/src/control.rs index 936c5d30d..7769a033f 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -21,7 +21,7 @@ use crate::internal::server::client::handle_new_tasks; use crate::internal::server::comm::{Comm, CommSenderRef}; use crate::internal::server::core::{CoreRef, CustomConnectionHandler}; use crate::internal::server::explain::{ - task_explain_for_worker, task_explain_init, TaskExplanation, + TaskExplanation, task_explain_for_worker, task_explain_init, }; use 
crate::internal::server::reactor::{get_or_create_resource_rq_id, on_cancel_tasks}; use crate::internal::server::worker::DEFAULT_WORKER_OVERVIEW_INTERVAL; From da4880ed63f5a9d937eed9af35fb23da0df69c66 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Mon, 22 Dec 2025 14:25:12 +0100 Subject: [PATCH 11/17] Dashboard migrated to ResourceRqId --- .../dashboard/ui/screens/jobs/job_info_display.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs b/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs index e1e20c293..5a6cf2f7b 100644 --- a/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs +++ b/crates/hyperqueue/src/dashboard/ui/screens/jobs/job_info_display.rs @@ -57,7 +57,12 @@ fn create_rows(info: &DashboardJobInfo) -> Vec { JobTaskDescription::Graph { .. } => "Graph".into(), }, }]; - if let JobTaskDescription::Array { task_desc, .. } = &info.submit_data.task_desc { + if let JobTaskDescription::Array { + task_desc, + resource_rq, + .. 
+ } = &info.submit_data.task_desc + { match &task_desc.kind { TaskKind::ExternalProgram(program) => { // TODO: wrap text @@ -85,10 +90,7 @@ fn create_rows(info: &DashboardJobInfo) -> Vec { }; rows.push(JobInfoDataRow { label: "Resources", - data: { - let resources = todo!(); - format_resources(resources).into() - }, + data: { format_resources(resource_rq).into() }, }); if let Some(time_limit) = task_desc.time_limit { rows.push(JobInfoDataRow { From 11383053c9833e410b27578767b3b3ff7db0c8d8 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Mon, 22 Dec 2025 14:30:50 +0100 Subject: [PATCH 12/17] Clippy fixed --- crates/hyperqueue/src/client/commands/job.rs | 2 +- .../src/client/commands/journal/report.rs | 2 +- .../src/client/commands/submit/command.rs | 21 ++++--------------- crates/hyperqueue/src/client/job.rs | 1 - crates/hyperqueue/src/client/output/cli.rs | 10 ++++----- crates/hyperqueue/src/client/output/common.rs | 2 +- crates/hyperqueue/src/client/task.rs | 2 +- crates/hyperqueue/src/server/client/submit.rs | 5 ++--- crates/hyperqueue/src/server/job.rs | 2 +- crates/hyperqueue/src/server/state.rs | 4 +--- crates/tako/src/control.rs | 6 +++--- .../tako/src/internal/common/resources/map.rs | 5 ++--- crates/tako/src/internal/messages/worker.rs | 2 +- .../tako/src/internal/scheduler/multinode.rs | 2 +- crates/tako/src/internal/scheduler/state.rs | 3 +-- crates/tako/src/internal/server/client.rs | 9 ++------ crates/tako/src/internal/server/core.rs | 2 +- crates/tako/src/internal/server/reactor.rs | 1 - crates/tako/src/internal/server/task.rs | 3 +-- crates/tako/src/internal/server/workerload.rs | 2 +- .../tests/integration/utils/server.rs | 8 ++----- .../tako/src/internal/worker/resources/map.rs | 2 +- crates/tako/src/internal/worker/state.rs | 2 +- crates/tako/src/internal/worker/task.rs | 1 - 24 files changed, 34 insertions(+), 65 deletions(-) diff --git a/crates/hyperqueue/src/client/commands/job.rs b/crates/hyperqueue/src/client/commands/job.rs index 
5073af7e8..4f5abe8b8 100644 --- a/crates/hyperqueue/src/client/commands/job.rs +++ b/crates/hyperqueue/src/client/commands/job.rs @@ -1,7 +1,7 @@ use clap::Parser; use crate::client::globalsettings::GlobalSettings; -use crate::client::job::{get_remote_lists, get_worker_map}; +use crate::client::job::get_worker_map; use crate::client::output::outputs::OutputStream; use crate::client::output::resolve_task_paths; use crate::client::status::{Status, job_status}; diff --git a/crates/hyperqueue/src/client/commands/journal/report.rs b/crates/hyperqueue/src/client/commands/journal/report.rs index 139086c9f..317658161 100644 --- a/crates/hyperqueue/src/client/commands/journal/report.rs +++ b/crates/hyperqueue/src/client/commands/journal/report.rs @@ -399,7 +399,7 @@ impl JournalStats { fn new_submit(&mut self, job_id: JobId, submit: SubmitRequest) { let rq = match submit.submit_desc.task_desc { JobTaskDescription::Array { - task_desc, + task_desc: _, resource_rq, .. } => JobResourceRq::Array(resource_rq), diff --git a/crates/hyperqueue/src/client/commands/submit/command.rs b/crates/hyperqueue/src/client/commands/submit/command.rs index 77d9c0ebc..48b7e4c50 100644 --- a/crates/hyperqueue/src/client/commands/submit/command.rs +++ b/crates/hyperqueue/src/client/commands/submit/command.rs @@ -18,9 +18,9 @@ use crate::rpc_call; use crate::server::event::streamer::{EventFilter, EventFilterFlags}; use crate::transfer::connection::ClientSession; use crate::transfer::messages::{ - FromClientMessage, JobDescription, JobSubmitDescription, JobTaskDescription, LocalResourceRqId, - PinMode, StreamEvents, StreamEventsMode, SubmitRequest, SubmitResponse, TaskDescription, - TaskKind, TaskKindProgram, ToClientMessage, + FromClientMessage, JobDescription, JobSubmitDescription, JobTaskDescription, PinMode, + StreamEvents, StreamEventsMode, SubmitRequest, SubmitResponse, TaskDescription, TaskKind, + TaskKindProgram, ToClientMessage, }; use anyhow::{anyhow, bail}; use bstr::BString; @@ -43,9 
+43,7 @@ use tako::gateway::{ ResourceRequestVariants, }; use tako::program::{FileOnCloseBehavior, ProgramDefinition, StdioDef}; -use tako::resources::{ - AllocationRequest, CPU_RESOURCE_NAME, NumOfNodes, ResourceAmount, ResourceRqId, -}; +use tako::resources::{AllocationRequest, CPU_RESOURCE_NAME, NumOfNodes, ResourceAmount}; use tako::{JobId, JobTaskCount, Map}; const SUBMIT_ARRAY_LIMIT: JobTaskCount = 999; @@ -762,17 +760,6 @@ pub async fn submit_computation( .await } -/*pub(crate) async fn get_resource_rq_ids( - session: &mut ClientSession, - rqv: Vec, -) -> crate::Result> { - let message = FromClientMessage::GetResourceRqId(rqv); - let response = - rpc_call!(session.connection(), message, ToClientMessage::ResourceRqIdResponse(r) => r) - .await?; - Ok(response) -}*/ - pub(crate) async fn send_submit_request( gsettings: &GlobalSettings, session: &mut ClientSession, diff --git a/crates/hyperqueue/src/client/job.rs b/crates/hyperqueue/src/client/job.rs index 33f42695c..3f3b36864 100644 --- a/crates/hyperqueue/src/client/job.rs +++ b/crates/hyperqueue/src/client/job.rs @@ -1,7 +1,6 @@ use crate::rpc_call; use crate::transfer::connection::ClientSession; use crate::transfer::messages::{FromClientMessage, GetListResponse, ToClientMessage}; -use orion::kex::SessionKeys; use tako::{Map, WorkerId}; /// Maps worker IDs to hostnames. 
diff --git a/crates/hyperqueue/src/client/output/cli.rs b/crates/hyperqueue/src/client/output/cli.rs index c50fea607..9c623dab8 100644 --- a/crates/hyperqueue/src/client/output/cli.rs +++ b/crates/hyperqueue/src/client/output/cli.rs @@ -591,7 +591,7 @@ impl Output for CliOutput { rows.push(vec!["Tasks".cell().bold(true), n_tasks.cell()]); rows.push(vec![ "Workers".cell().bold(true), - format_job_workers(&tasks, &worker_map).cell(), + format_job_workers(&tasks, worker_map).cell(), ]); if submit_descs.len() == 1 @@ -628,7 +628,7 @@ impl Output for CliOutput { self.print_vertical_table(rows); tasks.sort_unstable_by_key(|t| t.0); - self.print_task_summary(&tasks, &info, &worker_map); + self.print_task_summary(&tasks, &info, worker_map); } } @@ -657,7 +657,7 @@ impl Output for CliOutput { match detail { (id, None) => log::warn!("Job {id} not found"), (_id, Some(detail)) => { - self.print_task_summary(&detail.tasks, &detail.info, &worker_map) + self.print_task_summary(&detail.tasks, &detail.info, worker_map) } } } @@ -708,7 +708,7 @@ impl Output for CliOutput { job_rows.append(&mut vec![ task_id.cell().justify(Justify::Right), status_to_cell(&get_task_status(&task.state)), - format_workers(task.state.get_workers(), &worker_map).cell(), + format_workers(task.state.get_workers(), worker_map).cell(), format_task_duration(start, end).cell(), match (verbosity, &task.state) { (Verbosity::Normal, JobTaskState::Failed { error, .. 
}) => { @@ -835,7 +835,7 @@ impl Output for CliOutput { ], vec![ "Worker".cell().bold(true), - format_workers(task.state.get_workers(), &worker_map).cell(), + format_workers(task.state.get_workers(), worker_map).cell(), ], vec![ "Start".cell().bold(true), diff --git a/crates/hyperqueue/src/client/output/common.rs b/crates/hyperqueue/src/client/output/common.rs index 5bcc69455..b3894fbb9 100644 --- a/crates/hyperqueue/src/client/output/common.rs +++ b/crates/hyperqueue/src/client/output/common.rs @@ -36,7 +36,7 @@ pub fn resolve_task_paths(job: &JobDetail, server_uid: &str) -> TaskToPathsMap { } JobTaskDescription::Graph { tasks, - resource_rqs, + resource_rqs: _, } => { for t in tasks { task_to_desc_map.insert( diff --git a/crates/hyperqueue/src/client/task.rs b/crates/hyperqueue/src/client/task.rs index f3fa2d7c9..1b16d1a27 100644 --- a/crates/hyperqueue/src/client/task.rs +++ b/crates/hyperqueue/src/client/task.rs @@ -1,6 +1,6 @@ use crate::client::commands::job::JobTaskIdsOpts; use crate::client::globalsettings::GlobalSettings; -use crate::client::job::{get_remote_lists, get_worker_map}; +use crate::client::job::get_worker_map; use crate::client::output::{Verbosity, VerbosityFlag}; use crate::common::arraydef::IntArray; use crate::common::cli::{TaskSelectorArg, parse_last_range, parse_last_single_id}; diff --git a/crates/hyperqueue/src/server/client/submit.rs b/crates/hyperqueue/src/server/client/submit.rs index 2ab2b6d35..a7fcf48ab 100644 --- a/crates/hyperqueue/src/server/client/submit.rs +++ b/crates/hyperqueue/src/server/client/submit.rs @@ -4,8 +4,7 @@ use std::fmt::{Debug, Formatter}; use std::path::PathBuf; use std::rc::Rc; use tako::gateway::{ - EntryType, ResourceRequestVariants, SharedTaskConfiguration, TaskConfiguration, TaskDataFlags, - TaskSubmit, + EntryType, SharedTaskConfiguration, TaskConfiguration, TaskDataFlags, TaskSubmit, }; use tako::{Map, Set, TaskId}; use thin_vec::ThinVec; @@ -26,7 +25,7 @@ use crate::transfer::messages::{ }; use 
tako::control::ServerRef; use tako::program::ProgramDefinition; -use tako::resources::{GlobalResourceMapping, ResourceRqId}; +use tako::resources::ResourceRqId; use tako::{JobId, JobTaskCount, JobTaskId}; fn create_task_submit( diff --git a/crates/hyperqueue/src/server/job.rs b/crates/hyperqueue/src/server/job.rs index d8582e6ea..d85ae8c51 100644 --- a/crates/hyperqueue/src/server/job.rs +++ b/crates/hyperqueue/src/server/job.rs @@ -454,7 +454,7 @@ impl Job { } JobTaskDescription::Graph { tasks, - resource_rqs, + resource_rqs: _, } => { self.tasks.reserve(tasks.len()); tasks.iter().for_each(|task| { diff --git a/crates/hyperqueue/src/server/state.rs b/crates/hyperqueue/src/server/state.rs index fa32fd7ae..cf9add088 100644 --- a/crates/hyperqueue/src/server/state.rs +++ b/crates/hyperqueue/src/server/state.rs @@ -1,7 +1,6 @@ use chrono::Utc; use smallvec::SmallVec; use std::cmp::min; -use std::collections::HashMap; use tako::{InstanceId, ResourceVariantId, define_wrapped_type}; use tako::{ItemId, TaskId}; @@ -12,9 +11,8 @@ use crate::server::job::Job; use crate::server::restore::StateRestorer; use crate::server::worker::Worker; use crate::transfer::messages::ServerInfo; -use tako::gateway::{LostWorkerReason, ResourceRequestVariants}; +use tako::gateway::LostWorkerReason; use tako::internal::messages::common::TaskFailInfo; -use tako::resources::{GlobalResourceMapping, ResourceRqId}; use tako::task::SerializedTaskContext; use tako::worker::WorkerConfiguration; use tako::{JobId, Map, WorkerId}; diff --git a/crates/tako/src/control.rs b/crates/tako/src/control.rs index 7769a033f..4f68dc795 100644 --- a/crates/tako/src/control.rs +++ b/crates/tako/src/control.rs @@ -13,7 +13,7 @@ use crate::gateway::{ LostWorkerReason, MultiNodeAllocationResponse, TaskSubmit, WorkerRuntimeInfo, }; use crate::internal::common::error::DsError; -use crate::internal::common::resources::{ResourceId, ResourceRqId}; +use crate::internal::common::resources::ResourceRqId; use 
crate::internal::messages::worker::ToWorkerMessage; use crate::internal::scheduler::query::compute_new_worker_query; use crate::internal::scheduler::state::{run_scheduling_now, scheduler_loop}; @@ -25,7 +25,7 @@ use crate::internal::server::explain::{ }; use crate::internal::server::reactor::{get_or_create_resource_rq_id, on_cancel_tasks}; use crate::internal::server::worker::DEFAULT_WORKER_OVERVIEW_INTERVAL; -use crate::resources::{ResourceDescriptor, ResourceRequest}; +use crate::resources::ResourceDescriptor; use crate::{TaskId, WorkerId}; #[derive(Debug)] @@ -222,7 +222,7 @@ impl ServerRef { pub fn get_or_create_resource_rq_id(&self, rqv: &ResourceRequestVariants) -> ResourceRqId { let mut core = self.core_ref.get_mut(); let mut comm = self.comm_ref.get_mut(); - let (rq_id, _) = get_or_create_resource_rq_id(&mut core, &mut *comm, &rqv); + let (rq_id, _) = get_or_create_resource_rq_id(&mut core, &mut *comm, rqv); rq_id } } diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index 64a4d55ee..da324cb32 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -1,7 +1,6 @@ use crate::gateway::ResourceRequestVariants as ClientResourceRequestVariants; use crate::internal::common::Map; use crate::internal::common::resources::{ResourceId, ResourceRqId}; -use crate::internal::server::core::Core; use crate::resources::{ResourceAllocRequest, ResourceRequest, ResourceRequestVariants}; use serde::{Deserialize, Serialize}; @@ -96,7 +95,7 @@ impl GlobalResourceMapping { match self.resource_rq_to_id.get(&rqv) { Some(&id) => (id, false), None => { - let mut id = ResourceRqId::new(self.resource_rq_to_id.len() as u32); + let id = ResourceRqId::new(self.resource_rq_to_id.len() as u32); log::debug!("New resource request registered {rqv:?} as {id}"); self.resource_rq_to_id.insert(rqv.clone(), id); self.resource_rq_from_id.insert(id, rqv); @@ -157,7 +156,7 @@ 
impl ResourceIdMap { } #[inline] - pub fn len(&self) -> usize { + pub fn size(&self) -> usize { self.resource_names.len() } diff --git a/crates/tako/src/internal/messages/worker.rs b/crates/tako/src/internal/messages/worker.rs index 4331f0693..c58fbacf2 100644 --- a/crates/tako/src/internal/messages/worker.rs +++ b/crates/tako/src/internal/messages/worker.rs @@ -4,7 +4,7 @@ use crate::hwstats::WorkerHwStateMessage; use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::common::resources::{ResourceAmount, ResourceIndex, ResourceRqId}; use crate::internal::messages::common::TaskFailInfo; -use crate::resources::{ResourceFractions, ResourceRequest, ResourceRequestVariants}; +use crate::resources::{ResourceFractions, ResourceRequestVariants}; use crate::task::SerializedTaskContext; use crate::{InstanceId, Priority, ResourceVariantId}; use crate::{TaskId, WorkerId}; diff --git a/crates/tako/src/internal/scheduler/multinode.rs b/crates/tako/src/internal/scheduler/multinode.rs index 10062b57f..484c441f3 100644 --- a/crates/tako/src/internal/scheduler/multinode.rs +++ b/crates/tako/src/internal/scheduler/multinode.rs @@ -1,5 +1,5 @@ use crate::internal::common::resources::ResourceRqId; -use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::server::task::Task; use crate::internal::server::taskmap::TaskMap; use crate::internal::server::worker::Worker; diff --git a/crates/tako/src/internal/scheduler/state.rs b/crates/tako/src/internal/scheduler/state.rs index f1c65b7f0..fca141538 100644 --- a/crates/tako/src/internal/scheduler/state.rs +++ b/crates/tako/src/internal/scheduler/state.rs @@ -6,7 +6,6 @@ use tokio::sync::Notify; use tokio::time::sleep; use crate::internal::common::Map; -use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::messages::worker::{TaskIdsMsg, ToWorkerMessage}; use 
crate::internal::scheduler::multinode::MultiNodeAllocator; use crate::internal::server::comm::{Comm, CommSender, CommSenderRef}; @@ -508,7 +507,7 @@ impl SchedulerState { .entry(task.resource_rq_id) .or_insert_with(|| { let rqv = requests.get(task.resource_rq_id); - worker.resources.compute_difficulty_score_of_rqv(&rqv) + worker.resources.compute_difficulty_score_of_rqv(rqv) }); log::debug!( "Transfer cost task={} -> worker={} is {}", diff --git a/crates/tako/src/internal/server/client.rs b/crates/tako/src/internal/server/client.rs index ac684ec62..ebf1cb0b4 100644 --- a/crates/tako/src/internal/server/client.rs +++ b/crates/tako/src/internal/server/client.rs @@ -1,17 +1,12 @@ -use crate::internal::common::resources::{ResourceRequest, ResourceRequestVariants, ResourceRqId}; +use crate::gateway::{SharedTaskConfiguration, TaskSubmit}; -use crate::gateway::{ - ResourceRequestVariants as ClientResourceRequestVariants, SharedTaskConfiguration, TaskSubmit, -}; - -use crate::internal::common::resources::request::ResourceAllocRequest; use crate::internal::server::comm::CommSender; use crate::internal::server::core::Core; use crate::internal::server::reactor::on_new_tasks; use crate::internal::server::task::{Task, TaskConfiguration}; use std::rc::Rc; -fn create_task_configuration(core: &mut Core, msg: SharedTaskConfiguration) -> TaskConfiguration { +fn create_task_configuration(_core: &mut Core, msg: SharedTaskConfiguration) -> TaskConfiguration { TaskConfiguration { time_limit: msg.time_limit, user_priority: msg.priority, diff --git a/crates/tako/src/internal/server/core.rs b/crates/tako/src/internal/server/core.rs index ff695ce13..76431fb8a 100644 --- a/crates/tako/src/internal/server/core.rs +++ b/crates/tako/src/internal/server/core.rs @@ -173,7 +173,7 @@ impl Core { &mut self.tasks, &mut self.workers, &self.worker_groups, - &self.resource_map.get_resource_rq_map(), + self.resource_map.get_resource_rq_map(), ) } diff --git a/crates/tako/src/internal/server/reactor.rs 
b/crates/tako/src/internal/server/reactor.rs index d30e36fcb..693651d1e 100644 --- a/crates/tako/src/internal/server/reactor.rs +++ b/crates/tako/src/internal/server/reactor.rs @@ -15,7 +15,6 @@ use crate::internal::server::task::{ComputeTasksBuilder, WaitingInfo}; use crate::internal::server::task::{Task, TaskRuntimeState}; use crate::internal::server::worker::Worker; use crate::internal::server::workermap::WorkerMap; -use crate::resources::ResourceRequestVariants; use crate::{TaskId, WorkerId}; use std::fmt::Write; diff --git a/crates/tako/src/internal/server/task.rs b/crates/tako/src/internal/server/task.rs index f3262f1f1..8f5edba0b 100644 --- a/crates/tako/src/internal/server/task.rs +++ b/crates/tako/src/internal/server/task.rs @@ -8,7 +8,7 @@ use crate::internal::common::Set; use crate::internal::common::stablemap::ExtractKey; use crate::{MAX_FRAME_SIZE, Map, ResourceVariantId, WorkerId}; -use crate::gateway::{CrashLimit, EntryType, ResourceRequestVariants, TaskDataFlags}; +use crate::gateway::{CrashLimit, EntryType, TaskDataFlags}; use crate::internal::datasrv::dataobj::DataObjectId; use crate::internal::common::resources::ResourceRqId; @@ -16,7 +16,6 @@ use crate::internal::messages::worker::{ ComputeTaskSeparateData, ComputeTaskSharedData, ComputeTasksMsg, ToWorkerMessage, }; use crate::internal::server::taskmap::TaskMap; -use crate::internal::server::workerload::ResourceRequestLowerBound; use crate::{InstanceId, Priority}; use crate::{TaskId, static_assert_size}; diff --git a/crates/tako/src/internal/server/workerload.rs b/crates/tako/src/internal/server/workerload.rs index 1178a7982..ba830f2c2 100644 --- a/crates/tako/src/internal/server/workerload.rs +++ b/crates/tako/src/internal/server/workerload.rs @@ -3,7 +3,7 @@ use crate::internal::common::resources::map::ResourceIdMap; use crate::internal::common::resources::request::ResourceAllocRequest; use crate::internal::common::resources::{ ResourceAmount, ResourceDescriptor, ResourceId, ResourceRequest, 
ResourceRequestVariants, - ResourceRqId, ResourceVec, + ResourceVec, }; use crate::internal::messages::worker::WorkerResourceCounts; use crate::{Map, ResourceVariantId, Set, TaskId}; diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index db636ecb8..6724b3479 100644 --- a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -1,6 +1,5 @@ use derive_builder::Builder; use orion::auth::SecretKey; -use smallvec::smallvec; use std::future::Future; use std::net::{Ipv4Addr, SocketAddr}; use std::rc::Rc; @@ -14,10 +13,7 @@ use tokio::time::timeout; use super::worker::WorkerConfigBuilder; use crate::control::ServerRef; use crate::events::EventProcessor; -use crate::gateway::{ - LostWorkerReason, ResourceRequest, ResourceRequestVariants, SharedTaskConfiguration, - TaskConfiguration, TaskSubmit, -}; +use crate::gateway::{LostWorkerReason, SharedTaskConfiguration, TaskConfiguration, TaskSubmit}; use crate::internal::common::resources::ResourceRqId; use crate::internal::common::{Map, Set}; use crate::internal::messages::common::TaskFailInfo; @@ -26,7 +22,7 @@ use crate::internal::tests::integration::utils::worker::{ WorkerContext, WorkerHandle, start_worker, }; use crate::task::SerializedTaskContext; -use crate::tests::integration::utils::task::{ResourceRequestConfig, ResourceRequestConfigBuilder}; +use crate::tests::integration::utils::task::ResourceRequestConfigBuilder; use crate::worker::{WorkerConfiguration, WorkerOverview}; use crate::{InstanceId, ResourceVariantId, TaskId, WorkerId, WrappedRcRefCell}; diff --git a/crates/tako/src/internal/worker/resources/map.rs b/crates/tako/src/internal/worker/resources/map.rs index 45cdaea08..57c16b6a0 100644 --- a/crates/tako/src/internal/worker/resources/map.rs +++ b/crates/tako/src/internal/worker/resources/map.rs @@ -14,7 +14,7 @@ pub struct ResourceLabelMap { impl 
ResourceLabelMap { pub fn new(descriptor: &ResourceDescriptor, map: &ResourceIdMap) -> Self { - let mut resources: IndexVec = vec![Default::default(); map.len()].into(); + let mut resources: IndexVec = vec![Default::default(); map.size()].into(); for resource in &descriptor.resources { let index = map.get_index(&resource.name).unwrap(); diff --git a/crates/tako/src/internal/worker/state.rs b/crates/tako/src/internal/worker/state.rs index 834df5eba..7a3d26e2a 100644 --- a/crates/tako/src/internal/worker/state.rs +++ b/crates/tako/src/internal/worker/state.rs @@ -1,6 +1,6 @@ use crate::datasrv::DataObjectId; use crate::internal::common::resources::map::{ResourceIdMap, ResourceRqMap}; -use crate::internal::common::resources::{Allocation, ResourceId, ResourceRqId}; +use crate::internal::common::resources::{Allocation, ResourceRqId}; use crate::internal::common::stablemap::StableMap; use crate::internal::common::{Map, Set, WrappedRcRefCell}; use crate::internal::datasrv::{DataObjectRef, DataStorage}; diff --git a/crates/tako/src/internal/worker/task.rs b/crates/tako/src/internal/worker/task.rs index 323a04e58..cbdb31f1c 100644 --- a/crates/tako/src/internal/worker/task.rs +++ b/crates/tako/src/internal/worker/task.rs @@ -1,7 +1,6 @@ use crate::datasrv::DataObjectId; use crate::gateway::{EntryType, TaskDataFlags}; use crate::internal::common::resources::Allocation; -use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::common::stablemap::ExtractKey; use crate::internal::messages::worker::{ ComputeTaskSeparateData, ComputeTaskSharedData, TaskOutput, From ab54dc4f8746ed2f4d4fdcdd19c32ac3d186e554 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Mon, 22 Dec 2025 15:08:41 +0100 Subject: [PATCH 13/17] Worker utilizes ResourceRqId --- crates/hyperqueue/src/worker/start/mod.rs | 5 +- crates/hyperqueue/src/worker/start/program.rs | 7 +-- .../tests/integration/utils/worker.rs | 5 +- crates/tako/src/internal/worker/rpc.rs | 9 ++-- 
crates/tako/src/internal/worker/rqueue.rs | 49 ++++++++++++------- crates/tako/src/internal/worker/state.rs | 32 +++++++++--- crates/tako/src/internal/worker/task.rs | 7 ++- crates/tako/src/launcher.rs | 17 +++---- 8 files changed, 80 insertions(+), 51 deletions(-) diff --git a/crates/hyperqueue/src/worker/start/mod.rs b/crates/hyperqueue/src/worker/start/mod.rs index 5e2014e1b..0c97d7ff8 100644 --- a/crates/hyperqueue/src/worker/start/mod.rs +++ b/crates/hyperqueue/src/worker/start/mod.rs @@ -37,9 +37,10 @@ impl TaskLauncher for HqTaskLauncher { stop_receiver: Receiver, ) -> tako::Result { log::debug!( - "Starting task launcher task_id={} res={:?} alloc={:?} body_len={}", + "Starting task launcher task_id={} res={} variant={} alloc={:?} body_len={}", build_ctx.task_id(), - build_ctx.resources(), + build_ctx.resource_rq_id(), + build_ctx.resource_variant(), build_ctx.allocation(), build_ctx.body().len(), ); diff --git a/crates/hyperqueue/src/worker/start/program.rs b/crates/hyperqueue/src/worker/start/program.rs index a8efd25bb..39392cbde 100644 --- a/crates/hyperqueue/src/worker/start/program.rs +++ b/crates/hyperqueue/src/worker/start/program.rs @@ -268,16 +268,17 @@ fn write_node_file(ctx: &TaskBuildContext, path: &Path, short_names: bool) -> st } fn insert_resources_into_env(ctx: &TaskBuildContext, program: &mut ProgramDefinition) { - let resource_map = ctx.get_resource_map(); + let (resource_map, resource_rq_map) = ctx.get_resource_maps(); + let rqv = resource_rq_map.get(ctx.resource_rq_id()); - if ctx.n_resource_variants() > 1 { + if rqv.requests().len() > 1 { program.env.insert( "HQ_RESOURCE_VARIANT".into(), ctx.resource_variant().to_string().into(), ); } - for entry in ctx.resources().entries() { + for entry in rqv.requests()[ctx.resource_variant().as_usize()].entries() { let resource_name = resource_map.get_name(entry.resource_id).unwrap(); program.env.insert( resource_env_var_name("HQ_RESOURCE_REQUEST_", resource_name), diff --git 
a/crates/tako/src/internal/tests/integration/utils/worker.rs b/crates/tako/src/internal/tests/integration/utils/worker.rs index 7b7691f32..99c9e4a98 100644 --- a/crates/tako/src/internal/tests/integration/utils/worker.rs +++ b/crates/tako/src/internal/tests/integration/utils/worker.rs @@ -260,9 +260,10 @@ impl TaskLauncher for TestTaskLauncher { ) -> crate::Result { let program: ProgramDefinition = { log::debug!( - "Starting program launcher task_id={} res={:?} alloc={:?} body_len={}", + "Starting program launcher task_id={} res={} variant={} alloc={:?} body_len={}", ctx.task_id(), - ctx.resources(), + ctx.resource_rq_id(), + ctx.resource_variant(), ctx.allocation(), ctx.body().len(), ); diff --git a/crates/tako/src/internal/worker/rpc.rs b/crates/tako/src/internal/worker/rpc.rs index fba4f6866..c53efc0e2 100644 --- a/crates/tako/src/internal/worker/rpc.rs +++ b/crates/tako/src/internal/worker/rpc.rs @@ -344,8 +344,9 @@ async fn task_starter_process(state_ref: WrappedRcRefCell, notify: None }; loop { - let (task_map, ready_task_queue) = state.borrow_tasks_and_queue(); - let allocations = ready_task_queue.try_start_tasks(task_map, remaining_time); + let (task_map, resource_rq_map, ready_task_queue) = state.borrow_tasks_and_queue(); + let allocations = + ready_task_queue.try_start_tasks(task_map, resource_rq_map, remaining_time); if allocations.is_empty() { break; } @@ -401,8 +402,8 @@ pub(crate) fn process_worker_message(state: &mut WorkerState, message: ToWorkerM } else { shared.clone() }; - let rqv = state.get_resource_rq(task.resource_rq_id); - state.add_task(Task::new(task, rqv.clone(), shared, task_state)); + let new_task = Task::new(task, shared, task_state); + state.add_task(new_task); } } ToWorkerMessage::StealTasks(msg) => { diff --git a/crates/tako/src/internal/worker/rqueue.rs b/crates/tako/src/internal/worker/rqueue.rs index c161f94bc..1bb2b39cf 100644 --- a/crates/tako/src/internal/worker/rqueue.rs +++ b/crates/tako/src/internal/worker/rqueue.rs @@ -1,9 
+1,11 @@ use crate::internal::common::Map; +use crate::internal::common::resources::map::ResourceRqMap; use crate::internal::common::resources::{Allocation, ResourceRequestVariants}; use crate::internal::server::workerload::WorkerResources; use crate::internal::worker::resources::allocator::ResourceAllocator; use crate::internal::worker::state::TaskMap; use crate::internal::worker::task::Task; +use crate::resources::ResourceRqId; use crate::{Priority, PriorityTuple, ResourceVariantId, Set, TaskId, WorkerId}; use priority_queue::PriorityQueue; use std::rc::Rc; @@ -63,8 +65,8 @@ impl QueueForRequest { } pub struct ResourceWaitQueue { - pub(super) queues: Map, - pub(super) requests: Vec, + pub(super) queues: Map, + pub(super) requests: Vec, pub(super) allocator: ResourceAllocator, pub(super) worker_resources: Map>, } @@ -79,22 +81,27 @@ impl ResourceWaitQueue { } } - pub fn new_worker(&mut self, worker_id: WorkerId, resources: WorkerResources) { + pub fn new_worker( + &mut self, + worker_id: WorkerId, + resources: WorkerResources, + resource_rq_map: &ResourceRqMap, + ) { assert!( self.worker_resources .entry(resources) .or_default() .insert(worker_id) ); - self.recompute_resource_priorities(); + self.recompute_resource_priorities(resource_rq_map); } - pub fn remove_worker(&mut self, worker_id: WorkerId) { + pub fn remove_worker(&mut self, worker_id: WorkerId, resource_rq_map: &ResourceRqMap) { self.worker_resources.retain(|_, value| { let is_empty = value.remove(&worker_id) && value.is_empty(); !is_empty }); - self.recompute_resource_priorities(); + self.recompute_resource_priorities(resource_rq_map); } pub fn resource_priority(&self, rqv: &ResourceRequestVariants) -> Priority { @@ -111,32 +118,35 @@ impl ResourceWaitQueue { self.allocator.release_allocation(allocation); } - pub fn add_task(&mut self, task: &Task) { + pub fn add_task(&mut self, resource_rq_map: &ResourceRqMap, task: &Task) { let priority = task.priority; let (queue, priority, task_id) = { ( - if let 
Some(qfr) = self.queues.get_mut(&task.resources) { + if let Some(qfr) = self.queues.get_mut(&task.resource_rq_id) { &mut qfr.queue } else { log::debug!( "Creating new request queue for {:?} (task {})", - task.resources, + task.resource_rq_id, task.id ); - self.requests.push(task.resources.clone()); + self.requests.push(task.resource_rq_id); let mut requests = std::mem::take(&mut self.requests); // Sort bigger values first requests.sort_unstable_by(|x, y| { - y.sort_key(&self.allocator) - .partial_cmp(&x.sort_key(&self.allocator)) + let rx = resource_rq_map.get(*x); + let ry = resource_rq_map.get(*y); + ry.sort_key(&self.allocator) + .partial_cmp(&rx.sort_key(&self.allocator)) .unwrap() }); self.requests = requests; - let resource_priority = self.resource_priority(&task.resources); + let rq = resource_rq_map.get(task.resource_rq_id); + let resource_priority = self.resource_priority(rq); &mut self .queues - .entry(task.resources.clone()) + .entry(task.resource_rq_id) .or_insert(QueueForRequest { resource_priority, queue: PriorityQueue::new(), @@ -160,11 +170,11 @@ impl ResourceWaitQueue { panic!("Removing unknown task"); } - pub fn recompute_resource_priorities(&mut self) { + pub fn recompute_resource_priorities(&mut self, resource_rq_map: &ResourceRqMap) { log::debug!("Recomputing resource priorities"); let mut queues = std::mem::take(&mut self.queues); for (rq, qfr) in queues.iter_mut() { - qfr.resource_priority = self.resource_priority(rq); + qfr.resource_priority = self.resource_priority(resource_rq_map.get(*rq)); } self.queues = queues; } @@ -172,6 +182,7 @@ impl ResourceWaitQueue { pub fn try_start_tasks( &mut self, task_map: &TaskMap, + resource_rq_map: &ResourceRqMap, remaining_time: Option, ) -> Vec<(TaskId, Rc, ResourceVariantId)> { for qfr in self.queues.values_mut() { @@ -179,7 +190,7 @@ impl ResourceWaitQueue { } self.allocator.reset_temporaries(remaining_time); let mut out = Vec::new(); - while !self.try_start_tasks_helper(task_map, &mut out) { + while 
!self.try_start_tasks_helper(task_map, resource_rq_map, &mut out) { self.allocator.close_priority_level() } out @@ -203,6 +214,7 @@ impl ResourceWaitQueue { fn try_start_tasks_helper( &mut self, _task_map: &TaskMap, + resource_rq_map: &ResourceRqMap, out: &mut Vec<(TaskId, Rc, ResourceVariantId)>, ) -> bool { let current_priority: QueuePriorityTuple = if let Some(Some(priority)) = @@ -219,7 +231,8 @@ impl ResourceWaitQueue { break; } let (allocation, rv_id) = { - if let Some(x) = self.allocator.try_allocate(rqv) { + let rq = resource_rq_map.get(*rqv); + if let Some(x) = self.allocator.try_allocate(rq) { x } else { qfr.set_blocked(); diff --git a/crates/tako/src/internal/worker/state.rs b/crates/tako/src/internal/worker/state.rs index 7a3d26e2a..de4e13cf3 100644 --- a/crates/tako/src/internal/worker/state.rs +++ b/crates/tako/src/internal/worker/state.rs @@ -118,8 +118,12 @@ impl WorkerState { } #[inline] - pub fn borrow_tasks_and_queue(&mut self) -> (&TaskMap, &mut ResourceWaitQueue) { - (&self.tasks, &mut self.ready_task_queue) + pub fn borrow_tasks_and_queue(&mut self) -> (&TaskMap, &ResourceRqMap, &mut ResourceWaitQueue) { + ( + &self.tasks, + &self.resource_rq_map, + &mut self.ready_task_queue, + ) } pub fn is_empty(&self) -> bool { @@ -127,13 +131,13 @@ impl WorkerState { } pub fn add_ready_task(&mut self, task: &Task) { - self.ready_task_queue.add_task(task); + self.ready_task_queue.add_task(&self.resource_rq_map, task); self.schedule_task_start(); } - pub fn add_ready_tasks(&mut self, tasks: &[Task]) { + pub fn add_ready_tasks(&mut self, resource_rq_map: &ResourceRqMap, tasks: &[Task]) { for task in tasks { - self.ready_task_queue.add_task(task); + self.ready_task_queue.add_task(resource_rq_map, task); } self.schedule_task_start(); } @@ -317,10 +321,21 @@ impl WorkerState { self.remove_task(task_id, true, false); } + #[inline] pub fn get_resource_map(&self) -> &ResourceIdMap { &self.resource_id_map } + pub fn get_resource_maps(&self) -> (&ResourceIdMap, 
&ResourceRqMap) { + (&self.resource_id_map, &self.resource_rq_map) + } + + #[inline] + pub fn get_resource_rq_map(&self) -> &ResourceRqMap { + &self.resource_rq_map + } + + #[inline] pub fn get_resource_rq(&self, rq_id: ResourceRqId) -> &ResourceRequestVariants { self.resource_rq_map.get(rq_id) } @@ -353,13 +368,14 @@ impl WorkerState { let resources = WorkerResources::from_transport(other_worker.resources); self.ready_task_queue - .new_worker(other_worker.worker_id, resources); + .new_worker(other_worker.worker_id, resources, &self.resource_rq_map); } pub fn remove_worker(&mut self, worker_id: WorkerId) { log::debug!("Lost worker={worker_id} announced"); assert!(self.worker_addresses.remove(&worker_id).is_some()); - self.ready_task_queue.remove_worker(worker_id); + self.ready_task_queue + .remove_worker(worker_id, &self.resource_rq_map); } pub fn send_notify(&mut self, task_id: TaskId, message: Box<[u8]>) { @@ -381,7 +397,7 @@ impl WorkerState { if let Some(task) = self.tasks.find_mut(&task_id) { log::debug!("Task {} is directly ready", task.id); if task.decrease_waiting_count() { - self.ready_task_queue.add_task(task); + self.ready_task_queue.add_task(&self.resource_rq_map, task); new_ready = true; } } diff --git a/crates/tako/src/internal/worker/task.rs b/crates/tako/src/internal/worker/task.rs index cbdb31f1c..99446febc 100644 --- a/crates/tako/src/internal/worker/task.rs +++ b/crates/tako/src/internal/worker/task.rs @@ -6,7 +6,7 @@ use crate::internal::messages::worker::{ ComputeTaskSeparateData, ComputeTaskSharedData, TaskOutput, }; use crate::internal::worker::task_comm::RunningTaskComm; -use crate::resources::ResourceRequestVariants; +use crate::resources::ResourceRqId; use crate::{InstanceId, Priority, TaskId, WorkerId}; use std::rc::Rc; use std::time::Duration; @@ -28,7 +28,7 @@ pub struct Task { pub priority: (Priority, Priority), pub instance_id: InstanceId, - pub resources: crate::internal::common::resources::ResourceRequestVariants, + pub 
resource_rq_id: ResourceRqId, pub time_limit: Option, pub body: Rc<[u8]>, pub entry: Option, @@ -41,7 +41,6 @@ pub struct Task { impl Task { pub fn new( task: ComputeTaskSeparateData, - rqv: ResourceRequestVariants, shared: ComputeTaskSharedData, task_state: TaskState, ) -> Self { @@ -50,7 +49,7 @@ impl Task { id: task.id, priority: (shared.user_priority, task.scheduler_priority), instance_id: task.instance_id, - resources: rqv, + resource_rq_id: task.resource_rq_id, time_limit: shared.time_limit, body: shared.body, entry: task.entry, diff --git a/crates/tako/src/launcher.rs b/crates/tako/src/launcher.rs index 9694e431b..db8b3a39c 100644 --- a/crates/tako/src/launcher.rs +++ b/crates/tako/src/launcher.rs @@ -5,19 +5,20 @@ use std::pin::Pin; use std::process::Stdio; use crate::internal::common::error::DsError::GenericError; -use crate::internal::common::resources::{Allocation, ResourceRequest}; +use crate::internal::common::resources::Allocation; use bstr::{BString, ByteSlice}; use nix::libc; use tokio::process::Command; use crate::gateway::{EntryType, TaskDataFlags}; -use crate::internal::common::resources::map::ResourceIdMap; +use crate::internal::common::resources::map::{ResourceIdMap, ResourceRqMap}; use crate::internal::worker::configuration::WorkerConfiguration; use crate::internal::worker::localcomm::Token; use crate::internal::worker::resources::map::ResourceLabelMap; use crate::internal::worker::state::WorkerState; use crate::internal::worker::task::Task; use crate::program::{ProgramDefinition, StdioDef}; +use crate::resources::ResourceRqId; use crate::task::SerializedTaskContext; use crate::{InstanceId, ResourceVariantId, TaskId, WorkerId}; @@ -86,18 +87,14 @@ impl<'a> TaskBuildContext<'a> { self.task.entry.as_ref() } - pub fn resources(&self) -> &'a ResourceRequest { - &self.task.resources.requests()[self.rv_id.as_usize()] + pub fn resource_rq_id(&self) -> ResourceRqId { + self.task.resource_rq_id } pub fn data_flags(&self) -> TaskDataFlags { 
self.task.data_flags } - pub fn n_resource_variants(&self) -> usize { - self.task.resources.requests().len() - } - pub fn resource_variant(&self) -> ResourceVariantId { self.rv_id } @@ -126,8 +123,8 @@ impl<'a> TaskBuildContext<'a> { self.state.worker_hostname(worker_id) } - pub fn get_resource_map(&self) -> &ResourceIdMap { - self.state.get_resource_map() + pub fn get_resource_maps(&self) -> (&ResourceIdMap, &ResourceRqMap) { + self.state.get_resource_maps() } pub fn get_resource_label_map(&self) -> &ResourceLabelMap { From 39d1d6c6682357e871367de8fc7f7eede285f4b0 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Tue, 23 Dec 2025 13:44:34 +0100 Subject: [PATCH 14/17] Fixed benchmarks --- crates/tako/benches/benchmarks/core.rs | 25 +++- crates/tako/benches/benchmarks/scheduler.rs | 5 +- crates/tako/benches/benchmarks/worker.rs | 119 ++++++++++-------- crates/tako/benches/utils/mod.rs | 9 +- .../tako/src/internal/common/resources/map.rs | 10 +- .../tako/src/internal/common/resources/mod.rs | 2 +- .../src/internal/common/resources/request.rs | 22 +++- crates/tako/src/internal/server/reactor.rs | 2 +- crates/tako/src/internal/worker/rpc.rs | 5 +- crates/tako/src/internal/worker/state.rs | 8 +- crates/tako/src/lib.rs | 2 +- 11 files changed, 123 insertions(+), 86 deletions(-) diff --git a/crates/tako/benches/benchmarks/core.rs b/crates/tako/benches/benchmarks/core.rs index dc579c46c..ce09f7b3f 100644 --- a/crates/tako/benches/benchmarks/core.rs +++ b/crates/tako/benches/benchmarks/core.rs @@ -4,11 +4,14 @@ use std::hint::black_box; use tako::Set; use tako::TaskId; use tako::internal::server::core::Core; +use tako::resources::{ResourceRequestVariants, ResourceRqMap}; use tako::server::ObjsToRemoveFromWorkers; use crate::{add_tasks, create_task}; fn bench_remove_single_task(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 100_000] { 
c.bench_with_input( BenchmarkId::new("remove a single task", task_count), @@ -17,7 +20,7 @@ fn bench_remove_single_task(c: &mut BenchmarkGroup) { b.iter_batched_ref( || { let mut core = Core::default(); - add_tasks(&mut core, task_count); + add_tasks(&mut core, task_count, rq_id); (core, TaskId::new_test(0)) }, |(core, task_id)| { @@ -32,6 +35,8 @@ fn bench_remove_single_task(c: &mut BenchmarkGroup) { } fn bench_remove_all_tasks(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 100_000] { c.bench_with_input( BenchmarkId::new("remove all tasks", task_count), @@ -40,7 +45,9 @@ fn bench_remove_all_tasks(c: &mut BenchmarkGroup) { b.iter_batched_ref( || { let mut core = Core::default(); - let tasks: Set<_> = add_tasks(&mut core, task_count).into_iter().collect(); + let tasks: Set<_> = add_tasks(&mut core, task_count, rq_id) + .into_iter() + .collect(); (core, tasks) }, |(core, tasks)| { @@ -55,6 +62,8 @@ fn bench_remove_all_tasks(c: &mut BenchmarkGroup) { } fn bench_add_task(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 100_000] { c.bench_with_input( BenchmarkId::new("add task", task_count), @@ -63,9 +72,9 @@ fn bench_add_task(c: &mut BenchmarkGroup) { b.iter_batched_ref( || { let mut core = Core::default(); - add_tasks(&mut core, task_count); + add_tasks(&mut core, task_count, rq_id); - let task = create_task(TaskId::new_test(task_count + 1)); + let task = create_task(TaskId::new_test(task_count + 1), rq_id); (core, Some(task)) }, |(core, task)| { @@ -79,6 +88,8 @@ fn bench_add_task(c: &mut BenchmarkGroup) { } fn bench_add_tasks(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 
100_000] { c.bench_with_input( BenchmarkId::new("add tasks", task_count), @@ -88,7 +99,7 @@ fn bench_add_tasks(c: &mut BenchmarkGroup) { || { let core = Core::default(); let tasks: Vec<_> = (0..task_count) - .map(|id| create_task(TaskId::new_test(id as u32))) + .map(|id| create_task(TaskId::new_test(id as u32), rq_id)) .collect(); (core, tasks) }, @@ -105,6 +116,8 @@ fn bench_add_tasks(c: &mut BenchmarkGroup) { } fn bench_iterate_tasks(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 100_000] { c.bench_with_input( BenchmarkId::new("iterate tasks", task_count), @@ -113,7 +126,7 @@ fn bench_iterate_tasks(c: &mut BenchmarkGroup) { b.iter_batched_ref( || { let mut core = Core::default(); - add_tasks(&mut core, task_count); + add_tasks(&mut core, task_count, rq_id); core }, |ref mut core| { diff --git a/crates/tako/benches/benchmarks/scheduler.rs b/crates/tako/benches/benchmarks/scheduler.rs index a75d581db..be15a3baf 100644 --- a/crates/tako/benches/benchmarks/scheduler.rs +++ b/crates/tako/benches/benchmarks/scheduler.rs @@ -9,8 +9,11 @@ use tako::internal::messages::worker::ToWorkerMessage; use tako::internal::scheduler::state::SchedulerState; use tako::internal::server::comm::Comm; use tako::internal::server::core::Core; +use tako::resources::{ResourceRequestVariants, ResourceRqMap}; fn bench_schedule(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 100_000] { for worker_count in [1, 8, 16, 32] { c.bench_with_input( @@ -23,7 +26,7 @@ fn bench_schedule(c: &mut BenchmarkGroup) { b.iter_batched_ref( || { let mut core = Core::default(); - add_tasks(&mut core, task_count); + add_tasks(&mut core, task_count, rq_id); for worker_id in 0..worker_count { core.new_worker(create_worker(worker_id as u64)); diff --git 
a/crates/tako/benches/benchmarks/worker.rs b/crates/tako/benches/benchmarks/worker.rs index e4ff23023..d1a89dbb5 100644 --- a/crates/tako/benches/benchmarks/worker.rs +++ b/crates/tako/benches/benchmarks/worker.rs @@ -13,12 +13,12 @@ use tako::internal::worker::rqueue::ResourceWaitQueue; use tako::internal::worker::state::{TaskMap, WorkerStateRef}; use tako::internal::worker::task::{Task, TaskState}; use tako::launcher::{StopReason, TaskBuildContext, TaskLaunchData, TaskLauncher, TaskResult}; -use tako::resources::ResourceAmount; use tako::resources::{ AllocationRequest, CPU_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, ResourceAllocRequest, ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind, ResourceRequest, - ResourceRequestVariants, TimeRequest, + ResourceRequestVariants, ResourceRqMap, TimeRequest, }; +use tako::resources::{ResourceAmount, ResourceRqId}; use tokio::sync::Notify; use tokio::sync::mpsc::unbounded_channel; @@ -38,7 +38,7 @@ impl TaskLauncher for BenchmarkTaskLauncher { } } -fn create_worker_state() -> WorkerStateRef { +fn create_worker_state(resource_rq_map: tako::resources::ResourceRqMap) -> WorkerStateRef { let worker = create_worker(1); let (tx, _) = unbounded_channel(); @@ -51,16 +51,18 @@ fn create_worker_state() -> WorkerStateRef { worker.configuration().clone(), None, Default::default(), + resource_rq_map, Box::new(BenchmarkTaskLauncher), "testuid".to_string(), ) } -fn create_worker_task(id: u32) -> Task { +fn create_worker_task(id: u32, resource_rq_id: ResourceRqId) -> Task { Task::new( ComputeTaskSeparateData { shared_index: 0, id: TaskId::new_test(id), + resource_rq_id, instance_id: Default::default(), scheduler_priority: 0, node_list: vec![], @@ -69,7 +71,6 @@ fn create_worker_task(id: u32) -> Task { }, ComputeTaskSharedData { user_priority: 0, - resources: Default::default(), time_limit: None, data_flags: TaskDataFlags::empty(), body: Default::default(), @@ -94,15 +95,16 @@ fn bench_add_task(c: &mut BenchmarkGroup) { |b, 
&task_count| { b.iter_custom(|iters| { let mut total = Duration::new(0, 0); - + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for _ in 0..iters { - let state = create_worker_state(); + let state = create_worker_state(resource_map.clone()); let mut state = state.get_mut(); for id in 0..task_count { - state.add_task(create_worker_task(id)); + state.add_task(create_worker_task(id, rq_id)); } - let task = create_worker_task(task_count); + let task = create_worker_task(task_count, rq_id); let duration = measure_time!({ state.add_task(task); @@ -118,6 +120,8 @@ fn bench_add_task(c: &mut BenchmarkGroup) { } fn bench_add_tasks(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 100_000] { c.bench_with_input( BenchmarkId::new("add tasks", task_count), @@ -125,8 +129,10 @@ fn bench_add_tasks(c: &mut BenchmarkGroup) { |b, &task_count| { b.iter_batched( || { - let state = create_worker_state(); - let tasks: Vec<_> = (0..task_count).map(create_worker_task).collect(); + let state = create_worker_state(resource_map.clone()); + let tasks: Vec<_> = (0..task_count) + .map(|x| create_worker_task(x, rq_id)) + .collect(); (state, tasks) }, |(state, tasks)| { @@ -143,6 +149,8 @@ fn bench_add_tasks(c: &mut BenchmarkGroup) { } fn bench_cancel_waiting_task(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); for task_count in [10, 1_000, 100_000] { c.bench_with_input( BenchmarkId::new("cancel waiting task", task_count), @@ -150,12 +158,12 @@ fn bench_cancel_waiting_task(c: &mut BenchmarkGroup) { |b, &task_count| { b.iter_batched_ref( || { - let state = create_worker_state(); + let state = create_worker_state(resource_map.clone()); { let mut state = state.get_mut(); for id in 0..task_count { - 
state.add_task(create_worker_task(id)); + state.add_task(create_worker_task(id, rq_id)); } } (state, TaskId::new_test(0)) @@ -189,41 +197,46 @@ fn create_resource_queue(num_cpus: u32) -> ResourceWaitQueue { } fn bench_resource_queue_add_task(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new_cpu1()); c.bench_function("add task to resource queue", |b| { b.iter_batched_ref( - || (create_resource_queue(64), create_worker_task(0)), - |(queue, task)| queue.add_task(task), + || (create_resource_queue(64), create_worker_task(0, rq_id)), + |(queue, task)| queue.add_task(&resource_map, task), BatchSize::SmallInput, ); }); } fn bench_resource_queue_release_allocation(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new(smallvec![ + ResourceRequest::new( + 0, + TimeRequest::new(0, 0), + smallvec![ + ResourceAllocRequest { + resource_id: 0.into(), + request: AllocationRequest::Compact(ResourceAmount::new_units(64)), + }, + ResourceAllocRequest { + resource_id: 1.into(), + request: AllocationRequest::Compact(ResourceAmount::new_units(2)), + }, + ], + ) + ])); c.bench_function("release allocation from resource queue", |b| { b.iter_batched_ref( || { let mut queue = create_resource_queue(64); - let mut task = create_worker_task(0); - task.resources = ResourceRequestVariants::new(smallvec![ResourceRequest::new( - 0, - TimeRequest::new(0, 0), - smallvec![ - ResourceAllocRequest { - resource_id: 0.into(), - request: AllocationRequest::Compact(ResourceAmount::new_units(64)), - }, - ResourceAllocRequest { - resource_id: 1.into(), - request: AllocationRequest::Compact(ResourceAmount::new_units(2)), - }, - ], - )]); - queue.add_task(&task); + let task = create_worker_task(0, rq_id); + queue.add_task(&resource_map, &task); let mut map = TaskMap::default(); map.insert(task); - let mut started = 
queue.try_start_tasks(&map, None); + let mut started = queue.try_start_tasks(&map, &resource_map, None); (queue, Some(started.pop().unwrap().1)) }, |(queue, allocation)| queue.release_allocation(allocation.take().unwrap()), @@ -233,6 +246,23 @@ fn bench_resource_queue_release_allocation(c: &mut BenchmarkGroup) { } fn bench_resource_queue_start_tasks(c: &mut BenchmarkGroup) { + let mut resource_map = ResourceRqMap::default(); + let rq_id = resource_map.insert(ResourceRequestVariants::new(smallvec![ + ResourceRequest::new( + 0, + TimeRequest::new(0, 0), + smallvec![ + ResourceAllocRequest { + resource_id: 0.into(), + request: AllocationRequest::Compact(ResourceAmount::new_units(64)), + }, + ResourceAllocRequest { + resource_id: 1.into(), + request: AllocationRequest::Compact(ResourceAmount::new_units(2)), + }, + ], + ) + ])); for task_count in [1, 10, 1_000, 100_000] { c.bench_with_input( BenchmarkId::new("start tasks in resource queue", task_count), @@ -244,33 +274,14 @@ fn bench_resource_queue_start_tasks(c: &mut BenchmarkGroup) { let mut map = TaskMap::default(); for id in 0..task_count { - let mut task = create_worker_task(id); - task.resources = - ResourceRequestVariants::new(smallvec![ResourceRequest::new( - 0, - TimeRequest::new(0, 0), - smallvec![ - ResourceAllocRequest { - resource_id: 0.into(), - request: AllocationRequest::Compact( - ResourceAmount::new_units(64) - ), - }, - ResourceAllocRequest { - resource_id: 1.into(), - request: AllocationRequest::Compact( - ResourceAmount::new_units(2) - ), - }, - ], - )]); - queue.add_task(&task); + let task = create_worker_task(id, rq_id); + queue.add_task(&resource_map, &task); map.insert(task); } (queue, map) }, - |(queue, map)| queue.try_start_tasks(map, None), + |(queue, map)| queue.try_start_tasks(map, &resource_map, None), BatchSize::SmallInput, ); }, diff --git a/crates/tako/benches/utils/mod.rs b/crates/tako/benches/utils/mod.rs index f0e3a0d68..648e4c643 100644 --- a/crates/tako/benches/utils/mod.rs +++ 
b/crates/tako/benches/utils/mod.rs @@ -7,14 +7,14 @@ use tako::internal::server::worker::Worker; use tako::internal::worker::configuration::OverviewConfiguration; use tako::resources::{ CPU_RESOURCE_NAME, ResourceDescriptor, ResourceDescriptorItem, ResourceDescriptorKind, + ResourceRqId, }; use tako::worker::ServerLostPolicy; use tako::worker::WorkerConfiguration; use tako::{TaskId, WorkerId}; -pub fn create_task(id: TaskId) -> Task { +pub fn create_task(id: TaskId, resource_rq_id: ResourceRqId) -> Task { let conf = TaskConfiguration { - resources: Default::default(), user_priority: 0, time_limit: None, crash_limit: CrashLimit::default(), @@ -23,6 +23,7 @@ pub fn create_task(id: TaskId) -> Task { }; Task::new( id, + resource_rq_id, Default::default(), Default::default(), None, @@ -59,11 +60,11 @@ pub fn create_worker(id: u64) -> Worker { ) } -pub fn add_tasks(core: &mut Core, count: u32) -> Vec { +pub fn add_tasks(core: &mut Core, count: u32, resource_rq_id: ResourceRqId) -> Vec { let mut tasks = Vec::with_capacity(count as usize); for id in 0..count { let task_id = TaskId::new_test(id); - let task = create_task(task_id); + let task = create_task(task_id, resource_rq_id); core.add_task(task); tasks.push(task_id); } diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index da324cb32..bb7adce70 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -95,10 +95,9 @@ impl GlobalResourceMapping { match self.resource_rq_to_id.get(&rqv) { Some(&id) => (id, false), None => { - let id = ResourceRqId::new(self.resource_rq_to_id.len() as u32); + let id = self.resource_rq_from_id.insert(rqv.clone()); log::debug!("New resource request registered {rqv:?} as {id}"); - self.resource_rq_to_id.insert(rqv.clone(), id); - self.resource_rq_from_id.insert(id, rqv); + self.resource_rq_to_id.insert(rqv, id); (id, true) } } @@ -181,9 +180,10 @@ impl ResourceIdMap { 
pub struct ResourceRqMap(Vec); impl ResourceRqMap { - pub fn insert(&mut self, rq_id: ResourceRqId, rqv: ResourceRequestVariants) { - assert_eq!(rq_id.as_usize(), self.0.len()); + pub fn insert(&mut self, rqv: ResourceRequestVariants) -> ResourceRqId { + let id = ResourceRqId::new(self.0.len() as u32); self.0.push(rqv); + id } #[inline] diff --git a/crates/tako/src/internal/common/resources/mod.rs b/crates/tako/src/internal/common/resources/mod.rs index 703df1513..09046a499 100644 --- a/crates/tako/src/internal/common/resources/mod.rs +++ b/crates/tako/src/internal/common/resources/mod.rs @@ -13,7 +13,7 @@ pub use descriptor::{ }; pub use map::{ AMD_GPU_RESOURCE_NAME, CPU_RESOURCE_ID, CPU_RESOURCE_NAME, GlobalResourceMapping, - MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, + MEM_RESOURCE_NAME, NVIDIA_GPU_RESOURCE_NAME, ResourceRqMap, }; pub use request::{ AllocationRequest, ResourceAllocRequest, ResourceRequest, ResourceRequestEntries, diff --git a/crates/tako/src/internal/common/resources/request.rs b/crates/tako/src/internal/common/resources/request.rs index 734b1840e..18ec4e278 100644 --- a/crates/tako/src/internal/common/resources/request.rs +++ b/crates/tako/src/internal/common/resources/request.rs @@ -7,7 +7,7 @@ use crate::internal::common::resources::{NumOfNodes, ResourceAmount, ResourceId} use crate::internal::server::workerload::WorkerResources; use crate::internal::worker::resources::allocator::ResourceAllocator; use crate::resources::ResourceIdMap; -use smallvec::SmallVec; +use smallvec::{SmallVec, smallvec}; use std::time::Duration; #[derive(Serialize, Deserialize, Debug, Clone, Hash, Eq, PartialEq)] @@ -191,6 +191,21 @@ impl ResourceRequestVariants { ResourceRequestVariants { variants } } + pub fn new_simple(rq: ResourceRequest) -> ResourceRequestVariants { + ResourceRequestVariants::new(smallvec![rq]) + } + + pub fn new_cpu1() -> ResourceRequestVariants { + Self::new_simple(ResourceRequest::new( + 0, + TimeRequest::new(0, 0), + 
smallvec![ResourceAllocRequest { + resource_id: crate::resources::CPU_RESOURCE_ID, + request: AllocationRequest::Compact(ResourceAmount::ONE), + }], + )) + } + pub fn sort_key(&self, allocator: &ResourceAllocator) -> (f32, TimeRequest) { /* The following unwrap is ok since there has to be always at least at least one @@ -307,11 +322,6 @@ mod tests { use crate::internal::tests::utils::resources::ResBuilder; use crate::resources::ResourceRequest; use smallvec::smallvec; - impl ResourceRequestVariants { - pub fn new_simple(rq: ResourceRequest) -> ResourceRequestVariants { - ResourceRequestVariants::new(smallvec![rq]) - } - } #[test] fn test_resource_request_validate() { diff --git a/crates/tako/src/internal/server/reactor.rs b/crates/tako/src/internal/server/reactor.rs index 693651d1e..02585c5b5 100644 --- a/crates/tako/src/internal/server/reactor.rs +++ b/crates/tako/src/internal/server/reactor.rs @@ -624,7 +624,7 @@ pub(crate) fn get_or_create_resource_rq_id( pub(crate) fn get_or_create_raw_resource_rq_id( core: &mut Core, comm: &mut impl Comm, - rqv: ResourceRequestVariants, + rqv: crate::resources::ResourceRequestVariants, ) -> (ResourceRqId, bool) { let map = core.resource_map_mut(); let (rq_id, is_new) = map.get_or_create_rq_id(rqv); diff --git a/crates/tako/src/internal/worker/rpc.rs b/crates/tako/src/internal/worker/rpc.rs index c53efc0e2..e102cd53b 100644 --- a/crates/tako/src/internal/worker/rpc.rs +++ b/crates/tako/src/internal/worker/rpc.rs @@ -446,7 +446,10 @@ pub(crate) fn process_worker_message(state: &mut WorkerState, message: ToWorkerM ToWorkerMessage::SetOverviewIntervalOverride(r#override) => { state.worker_overview_interval_override = r#override; } - ToWorkerMessage::NewResourceRequest(rq_id, rqv) => state.register_resource_rq(rq_id, rqv), + ToWorkerMessage::NewResourceRequest(rq_id, rqv) => { + let new_id = state.register_resource_rq(rqv); + assert_eq!(rq_id, new_id); + } } false } diff --git a/crates/tako/src/internal/worker/state.rs 
b/crates/tako/src/internal/worker/state.rs index de4e13cf3..e1e2b36af 100644 --- a/crates/tako/src/internal/worker/state.rs +++ b/crates/tako/src/internal/worker/state.rs @@ -437,12 +437,8 @@ impl WorkerState { } } - pub fn register_resource_rq( - &mut self, - resource_rq_id: ResourceRqId, - rqv: ResourceRequestVariants, - ) { - self.resource_rq_map.insert(resource_rq_id, rqv) + pub fn register_resource_rq(&mut self, rqv: ResourceRequestVariants) -> ResourceRqId { + self.resource_rq_map.insert(rqv) } pub fn download_object( diff --git a/crates/tako/src/lib.rs b/crates/tako/src/lib.rs index 146649e79..560f18096 100644 --- a/crates/tako/src/lib.rs +++ b/crates/tako/src/lib.rs @@ -41,7 +41,7 @@ pub mod resources { ResourceDescriptorCoupling, ResourceDescriptorCouplingItem, ResourceDescriptorItem, ResourceDescriptorKind, ResourceFractions, ResourceGroupIdx, ResourceIndex, ResourceLabel, ResourceRequest, ResourceRequestEntries, ResourceRequestVariants, ResourceRqId, - ResourceUnits, TimeRequest, + ResourceRqMap, ResourceUnits, TimeRequest, }; pub use crate::internal::common::resources::map::ResourceIdMap; From a9669d221d1785dc676479d476ed89aa059e6f82 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Tue, 23 Dec 2025 14:53:02 +0100 Subject: [PATCH 15/17] Worker tests migrated --- crates/hyperqueue/src/server/client/submit.rs | 7 +- .../tako/src/internal/common/resources/map.rs | 2 +- .../src/internal/common/resources/request.rs | 3 - crates/tako/src/internal/server/explain.rs | 2 +- .../tests/integration/test_resources.rs | 1 - .../tests/integration/utils/server.rs | 2 +- crates/tako/src/internal/tests/test_worker.rs | 7 +- crates/tako/src/internal/tests/utils/task.rs | 2 +- .../tako/src/internal/worker/test_rqueue.rs | 355 +++++++++--------- crates/tako/src/internal/worker/test_util.rs | 37 +- 10 files changed, 217 insertions(+), 201 deletions(-) diff --git a/crates/hyperqueue/src/server/client/submit.rs b/crates/hyperqueue/src/server/client/submit.rs index 
a7fcf48ab..42d348709 100644 --- a/crates/hyperqueue/src/server/client/submit.rs +++ b/crates/hyperqueue/src/server/client/submit.rs @@ -541,15 +541,12 @@ mod tests { SubmitResponse, TaskDescription, TaskKind, TaskKindProgram, TaskWithDependencies, }; use chrono::Utc; - use smallvec::smallvec; use std::path::PathBuf; use std::time::Duration; - use tako::gateway::{ - CrashLimit, ResourceRequest, ResourceRequestEntry, ResourceRequestVariants, TaskDataFlags, - }; + use tako::gateway::{CrashLimit, ResourceRequestVariants, TaskDataFlags}; use tako::internal::tests::utils::sorted_vec; use tako::program::ProgramDefinition; - use tako::resources::{AllocationRequest, CPU_RESOURCE_NAME, ResourceAmount, ResourceRqId}; + use tako::resources::ResourceRqId; use tako::{Priority, TaskId}; #[test] diff --git a/crates/tako/src/internal/common/resources/map.rs b/crates/tako/src/internal/common/resources/map.rs index bb7adce70..299bc435f 100644 --- a/crates/tako/src/internal/common/resources/map.rs +++ b/crates/tako/src/internal/common/resources/map.rs @@ -201,7 +201,7 @@ impl ResourceRqMap { { rq_id } else { - let mut new_id = ResourceRqId::new(self.0.len() as u32); + let new_id = ResourceRqId::new(self.0.len() as u32); self.0.push(rqv); new_id } diff --git a/crates/tako/src/internal/common/resources/request.rs b/crates/tako/src/internal/common/resources/request.rs index 18ec4e278..503f00619 100644 --- a/crates/tako/src/internal/common/resources/request.rs +++ b/crates/tako/src/internal/common/resources/request.rs @@ -318,10 +318,7 @@ impl ResourceRequestVariants { #[cfg(test)] mod tests { - use crate::internal::common::resources::ResourceRequestVariants; use crate::internal::tests::utils::resources::ResBuilder; - use crate::resources::ResourceRequest; - use smallvec::smallvec; #[test] fn test_resource_request_validate() { diff --git a/crates/tako/src/internal/server/explain.rs b/crates/tako/src/internal/server/explain.rs index 3fc8c6b85..d28027c1a 100644 --- 
a/crates/tako/src/internal/server/explain.rs +++ b/crates/tako/src/internal/server/explain.rs @@ -144,7 +144,7 @@ pub fn task_explain_for_worker( #[cfg(test)] mod tests { - use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; + use crate::internal::common::resources::map::GlobalResourceMapping; use crate::internal::server::explain::{TaskExplainItem, task_explain_for_worker}; use crate::internal::server::worker::Worker; use crate::internal::server::workergroup::WorkerGroup; diff --git a/crates/tako/src/internal/tests/integration/test_resources.rs b/crates/tako/src/internal/tests/integration/test_resources.rs index 81d4bee6d..8ea456f72 100644 --- a/crates/tako/src/internal/tests/integration/test_resources.rs +++ b/crates/tako/src/internal/tests/integration/test_resources.rs @@ -12,7 +12,6 @@ use crate::internal::tests::integration::utils::task::{ }; use crate::internal::tests::integration::utils::worker::WorkerConfigBuilder as WC; use crate::resources::ResourceDescriptor; -use crate::tests::integration::utils::task::ResourceRequestConfigBuilder; use tokio::time::sleep; #[tokio::test] diff --git a/crates/tako/src/internal/tests/integration/utils/server.rs b/crates/tako/src/internal/tests/integration/utils/server.rs index 6724b3479..58a669430 100644 --- a/crates/tako/src/internal/tests/integration/utils/server.rs +++ b/crates/tako/src/internal/tests/integration/utils/server.rs @@ -117,7 +117,7 @@ impl ServerHandle { #[cfg(test)] pub fn register_default_request(&self) -> ResourceRqId { self.server_ref - .get_or_create_resource_rq_id(&ResourceRequestVariants::default()) + .get_or_create_resource_rq_id(&crate::gateway::ResourceRequestVariants::default()) } pub fn register_request(&self, rbuilder: ResourceRequestConfigBuilder) -> ResourceRqId { diff --git a/crates/tako/src/internal/tests/test_worker.rs b/crates/tako/src/internal/tests/test_worker.rs index ef133ac9f..406c5aed5 100644 --- a/crates/tako/src/internal/tests/test_worker.rs +++ 
b/crates/tako/src/internal/tests/test_worker.rs @@ -1,6 +1,6 @@ use crate::gateway::TaskDataFlags; +use crate::internal::common::resources::ResourceRqId; use crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; -use crate::internal::common::resources::{ResourceRequestVariants, ResourceRqId}; use crate::internal::messages::worker::{ ComputeTaskSeparateData, ComputeTaskSharedData, ComputeTasksMsg, NewWorkerMsg, ToWorkerMessage, WorkerResourceCounts, @@ -18,7 +18,6 @@ use crate::launcher::{StopReason, TaskBuildContext, TaskLaunchData, TaskLauncher use crate::resources::{ResourceDescriptor, ResourceIdMap}; use crate::worker::{ServerLostPolicy, WorkerConfiguration}; use crate::{Set, TaskId, WorkerId}; -use smallvec::smallvec; use std::ops::Deref; use std::time::Duration; use tokio::sync::oneshot::Receiver; @@ -108,7 +107,7 @@ fn create_dummy_compute_msg(task_id: TaskId, resource_rq_id: ResourceRqId) -> Co fn test_worker_start_task() { let mut rmap = GlobalResourceMapping::default(); let rqv = ResourceRequestBuilder::default().cpus(3).finish_v(); - let (rq_id, _) = rmap.get_or_create_rq_id(rqv.clone()); + let (rq_id, _) = rmap.get_or_create_rq_id(rqv); let config = create_test_worker_config(); let state_ref = create_test_worker_state(config, rmap.get_resource_rq_map().clone()); @@ -123,7 +122,7 @@ fn test_worker_start_task() { assert!(state.running_tasks.is_empty()); let requests = state.ready_task_queue.requests(); assert_eq!(requests.len(), 1); - assert_eq!(requests[0], rqv); + assert_eq!(requests[0], rq_id); } /*#[test] diff --git a/crates/tako/src/internal/tests/utils/task.rs b/crates/tako/src/internal/tests/utils/task.rs index 02c46797f..81a58bf27 100644 --- a/crates/tako/src/internal/tests/utils/task.rs +++ b/crates/tako/src/internal/tests/utils/task.rs @@ -1,7 +1,7 @@ use super::resources::ResBuilder; use crate::datasrv::DataObjectId; use crate::gateway::{CrashLimit, TaskDataFlags}; -use 
crate::internal::common::resources::map::{GlobalResourceMapping, ResourceRqMap}; +use crate::internal::common::resources::map::GlobalResourceMapping; use crate::internal::common::resources::{ NumOfNodes, ResourceAmount, ResourceId, ResourceRequestVariants, }; diff --git a/crates/tako/src/internal/worker/test_rqueue.rs b/crates/tako/src/internal/worker/test_rqueue.rs index 097b4eb3a..2c17d41e9 100644 --- a/crates/tako/src/internal/worker/test_rqueue.rs +++ b/crates/tako/src/internal/worker/test_rqueue.rs @@ -1,10 +1,8 @@ -use crate::internal::common::resources::{ - ResourceDescriptor, ResourceRequest, ResourceRequestVariants, -}; +use crate::internal::common::resources::{ResourceDescriptor, ResourceRequest}; use crate::internal::tests::utils::resources::{ResBuilder, ra_builder}; use crate::internal::tests::utils::resources::{ResourceRequestBuilder, cpus_compact}; use crate::internal::worker::rqueue::ResourceWaitQueue; -use crate::internal::worker::test_util::{WorkerTaskBuilder, worker_task}; +use crate::internal::worker::test_util::{WorkerTaskBuilder, worker_task, worker_task_add}; use std::ops::Deref; use std::time::Duration; @@ -15,11 +13,11 @@ use crate::internal::tests::utils::shared::{ res_allocator_from_descriptor, res_item, res_kind_groups, res_kind_list, res_kind_range, }; use crate::internal::worker::test_util::ResourceQueueBuilder as RB; -use crate::resources::ResourceDescriptorItem; +use crate::resources::{ResourceDescriptorItem, ResourceRqId}; use crate::{Map, Set, WorkerId}; impl ResourceWaitQueue { - pub fn requests(&self) -> &[ResourceRequestVariants] { + pub fn requests(&self) -> &[ResourceRqId] { &self.requests } @@ -36,39 +34,42 @@ fn test_rqueue_resource_priority() { res_kind_groups(&[vec!["0", "1", "2", "3"], vec!["7", "8"]]), )])); - rq.add_task(worker_task( + let w = worker_task( 10, ResBuilder::default().add_scatter(0, 3).finish(), 1, &mut rqs, - )); - rq.add_task(worker_task(11, cpus_compact(4).finish(), 1, &mut rqs)); - 
rq.add_task(worker_task( + ); + rq.add_task(&rqs, w); + let w = worker_task(11, cpus_compact(4).finish(), 1, &mut rqs); + rq.add_task(&rqs, w); + let w = worker_task( 12, ResBuilder::default().add_force_compact(0, 4).finish(), 1, &mut rqs, - )); + ); + rq.add_task(&rqs, w); - let mut a = rq.start_tasks(); + let mut a = rq.start_tasks(&rqs); assert!(!a.contains_key(&10)); assert!(!a.contains_key(&11)); assert!(a.contains_key(&12)); - let tasks = rq.start_tasks(); + let tasks = rq.start_tasks(&rqs); assert!(tasks.is_empty()); rq.queue.release_allocation(a.remove(&12).unwrap()); - let mut tasks = rq.start_tasks(); + let mut tasks = rq.start_tasks(&rqs); assert_eq!(tasks.len(), 1); assert!(tasks.contains_key(&11)); - assert!(rq.start_tasks().is_empty()); + assert!(rq.start_tasks(&rqs).is_empty()); rq.queue.release_allocation(tasks.remove(&11).unwrap()); - let mut tasks = rq.start_tasks(); + let mut tasks = rq.start_tasks(&rqs); assert_eq!(tasks.len(), 1); assert!(tasks.contains_key(&10)); - assert!(rq.start_tasks().is_empty()); + assert!(rq.start_tasks(&rqs).is_empty()); rq.queue.release_allocation(tasks.remove(&10).unwrap()); } @@ -76,11 +77,11 @@ fn test_rqueue_resource_priority() { fn test_rqueue1() { let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::sockets(3, 5))); - rq.add_task(worker_task(10, cpus_compact(2).finish(), 1, &mut rqs)); - rq.add_task(worker_task(11, cpus_compact(5).finish(), 1, &mut rqs)); - rq.add_task(worker_task(12, cpus_compact(2).finish(), 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 10, cpus_compact(2).finish(), 1); + worker_task_add(&mut rq, &mut rqs, 11, cpus_compact(5).finish(), 1); + worker_task_add(&mut rq, &mut rqs, 12, cpus_compact(2).finish(), 1); - let a = rq.start_tasks(); + let a = rq.start_tasks(&rqs); assert_eq!(a.get(&10).unwrap().get_indices(0).len(), 2); assert_eq!(a.get(&11).unwrap().get_indices(0).len(), 5); assert_eq!(a.get(&12).unwrap().get_indices(0).len(), 2); @@ -91,15 
+92,15 @@ fn test_rqueue2() { let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); - rq.add_task(worker_task(10, cpus_compact(2).finish(), 1, &mut rqs)); - rq.add_task(worker_task(11, cpus_compact(1).finish(), 2, &mut rqs)); - rq.add_task(worker_task(12, cpus_compact(2).finish(), 2, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 10, cpus_compact(2).finish(), 1); + worker_task_add(&mut rq, &mut rqs, 11, cpus_compact(1).finish(), 2); + worker_task_add(&mut rq, &mut rqs, 12, cpus_compact(2).finish(), 2); - let a = rq.start_tasks(); + let a = rq.start_tasks(&rqs); assert!(!a.contains_key(&10)); assert!(a.contains_key(&11)); assert!(a.contains_key(&12)); - assert!(rq.start_tasks().is_empty()); + assert!(rq.start_tasks(&rqs).is_empty()); } #[test] @@ -107,11 +108,11 @@ fn test_rqueue3() { let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); - rq.add_task(worker_task(10, cpus_compact(2).finish(), 1, &mut rqs)); - rq.add_task(worker_task(11, cpus_compact(1).finish(), 1, &mut rqs)); - rq.add_task(worker_task(12, cpus_compact(2).finish(), 2, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 10, cpus_compact(2).finish(), 1); + worker_task_add(&mut rq, &mut rqs, 11, cpus_compact(1).finish(), 1); + worker_task_add(&mut rq, &mut rqs, 12, cpus_compact(2).finish(), 2); - let a = rq.start_tasks(); + let a = rq.start_tasks(&rqs); assert!(a.contains_key(&10)); assert!(!a.contains_key(&11)); assert!(a.contains_key(&12)); @@ -121,47 +122,52 @@ fn test_rqueue3() { fn test_rqueue_time_request() { let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); - rq.add_task(worker_task( + worker_task_add( + &mut rq, + &mut rqs, 10, ResBuilder::default().add(0, 1).min_time_secs(10).finish(), 1, - &mut rqs, - )); + ); - assert_eq!(rq.start_tasks_duration(Duration::new(9, 0)).len(), 0); - 
assert_eq!(rq.start_tasks_duration(Duration::new(11, 0)).len(), 1); + assert_eq!(rq.start_tasks_duration(&rqs, Duration::new(9, 0)).len(), 0); + assert_eq!(rq.start_tasks_duration(&rqs, Duration::new(11, 0)).len(), 1); } #[test] fn test_rqueue_time_request_priority1() { let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); - rq.add_task(worker_task( + worker_task_add( + &mut rq, + &mut rqs, 10, cpus_compact(2).min_time_secs(10).finish(), 1, + ); + worker_task_add( + &mut rq, &mut rqs, - )); - rq.add_task(worker_task( 11, cpus_compact(2).min_time_secs(40).finish(), 1, + ); + worker_task_add( + &mut rq, &mut rqs, - )); - rq.add_task(worker_task( 12, cpus_compact(2).min_time_secs(20).finish(), 1, + ); + worker_task_add( + &mut rq, &mut rqs, - )); - rq.add_task(worker_task( 13, cpus_compact(2).min_time_secs(30).finish(), 1, - &mut rqs, - )); + ); - let map = rq.start_tasks_duration(Duration::new(40, 0)); + let map = rq.start_tasks_duration(&rqs, Duration::new(40, 0)); assert_eq!(map.len(), 2); assert!(map.contains_key(&11)); assert!(map.contains_key(&13)); @@ -171,32 +177,36 @@ fn test_rqueue_time_request_priority1() { fn test_rqueue_time_request_priority2() { let mut rqs = ResourceRqMap::default(); let mut rq = RB::new(wait_queue(ResourceDescriptor::simple_cpus(4))); - rq.add_task(worker_task( + worker_task_add( + &mut rq, + &mut rqs, 10, cpus_compact(2).min_time_secs(10).finish(), 1, + ); + worker_task_add( + &mut rq, &mut rqs, - )); - rq.add_task(worker_task( 11, cpus_compact(2).min_time_secs(40).finish(), 1, + ); + worker_task_add( + &mut rq, &mut rqs, - )); - rq.add_task(worker_task( 12, cpus_compact(2).min_time_secs(20).finish(), 1, + ); + worker_task_add( + &mut rq, &mut rqs, - )); - rq.add_task(worker_task( 13, cpus_compact(2).min_time_secs(30).finish(), 1, - &mut rqs, - )); + ); - let map = rq.start_tasks_duration(Duration::new(30, 0)); + let map = rq.start_tasks_duration(&rqs, Duration::new(30, 0)); 
assert_eq!(map.len(), 2); assert!(map.contains_key(&12)); assert!(map.contains_key(&13)); @@ -215,10 +225,10 @@ fn test_rqueue_generic_resource1_priorities() { let request: ResourceRequest = cpus_compact(2).add(1, 2).finish(); - rq.add_task(worker_task(10, request, 1, &mut rqs)); - rq.add_task(worker_task(11, cpus_compact(4).finish(), 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 10, request, 1); + worker_task_add(&mut rq, &mut rqs, 11, cpus_compact(4).finish(), 1); - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert!(!map.contains_key(&10)); assert!(map.contains_key(&11)); } @@ -236,15 +246,15 @@ fn test_rqueue_generic_resource2_priorities() { let mut rq = RB::new(wait_queue(resources)); let request: ResourceRequest = cpus_compact(2).add(1, 8).finish(); - rq.add_task(worker_task(10, request, 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 10, request, 1); let request: ResourceRequest = cpus_compact(2).add(1, 12).finish(); - rq.add_task(worker_task(11, request, 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 11, request, 1); let request: ResourceRequest = cpus_compact(2).add(2, 50_000_000).finish(); - rq.add_task(worker_task(12, request, 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 12, request, 1); - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert!(!map.contains_key(&10)); assert!(map.contains_key(&11)); assert!(map.contains_key(&12)); @@ -263,15 +273,15 @@ fn test_rqueue_generic_resource3_priorities() { let mut rq = RB::new(wait_queue(resources)); let request: ResourceRequest = cpus_compact(2).add(1, 18).finish(); - rq.add_task(worker_task(10, request, 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 10, request, 1); let request: ResourceRequest = cpus_compact(2).add(1, 10).add(2, 60_000_000).finish(); - rq.add_task(worker_task(11, request, 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 11, request, 1); let request: ResourceRequest = cpus_compact(2).add(2, 99_000_000).finish(); - 
rq.add_task(worker_task(12, request, 1, &mut rqs)); + worker_task_add(&mut rq, &mut rqs, 12, request, 1); - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert!(!map.contains_key(&10)); assert!(map.contains_key(&11)); assert!(!map.contains_key(&12)); @@ -300,11 +310,14 @@ fn test_worker_resource_priorities() { assert_eq!(rq.resource_priority(&rq2), 0); assert_eq!(rq.resource_priority(&rq3), 0); + let resource_map = ResourceRqMap::default(); + rq.new_worker( 400.into(), WorkerResources::from_transport(WorkerResourceCounts { n_resources: ra_builder(&[2, 0]).deref().clone(), }), + &resource_map, ); assert_eq!(rq.resource_priority(&rq1), 0); @@ -316,6 +329,7 @@ fn test_worker_resource_priorities() { WorkerResources::from_transport(WorkerResourceCounts { n_resources: ra_builder(&[2, 2]).deref().clone(), }), + &resource_map, ); assert_eq!(rq.resource_priority(&rq1), 0); assert_eq!(rq.resource_priority(&rq2), 2); @@ -327,13 +341,15 @@ fn test_worker_resource_priorities() { WorkerResources::from_transport(WorkerResourceCounts { n_resources: ra_builder(&[3, 0]).deref().clone(), }), + &resource_map, ); } assert_eq!(rq.resource_priority(&rq1), 0); assert_eq!(rq.resource_priority(&rq2), 2); assert_eq!(rq.resource_priority(&rq3), 41); - rq.remove_worker(504.into()); + rq.remove_worker(504.into(), &resource_map); + assert_eq!(rq.resource_priority(&rq1), 0); assert_eq!(rq.resource_priority(&rq2), 2); assert_eq!(rq.resource_priority(&rq3), 40); @@ -341,7 +357,7 @@ fn test_worker_resource_priorities() { #[test] fn test_uniq_resource_priorities1() { - let mut requests = ResourceRqMap::default(); + let mut rqs = ResourceRqMap::default(); let resources = vec![ ResourceDescriptorItem::range("cpus", 0, 16), ResourceDescriptorItem::range("res0", 1, 10), @@ -351,21 +367,19 @@ fn test_uniq_resource_priorities1() { let mut rq = RB::new(wait_queue(resources)); let request: ResourceRequest = cpus_compact(16).finish(); - rq.add_task( - WorkerTaskBuilder::new(10) - 
.resources(request) - .server_priority(1) - .build(&mut requests), - ); + let wt = WorkerTaskBuilder::new(10) + .resources(request) + .server_priority(1) + .build(&mut rqs); + rq.add_task(&rqs, wt); let request: ResourceRequest = cpus_compact(16).add(2, 2).finish(); - rq.add_task( - WorkerTaskBuilder::new(11) - .resources(request) - .build(&mut requests), - ); + let wt = WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut rqs); + rq.add_task(&rqs, wt); - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 1); assert!(map.contains_key(&10)); } @@ -386,24 +400,23 @@ fn test_uniq_resource_priorities2() { WorkerResources::from_transport(WorkerResourceCounts { n_resources: ra_builder(&[16, 2, 0, 1]).deref().clone(), }), + &rqs, ); let request: ResourceRequest = cpus_compact(16).finish(); - rq.add_task( - WorkerTaskBuilder::new(10) - .resources(request) - .server_priority(1) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(10) + .resources(request) + .server_priority(1) + .build(&mut rqs); + rq.add_task(&rqs, wt); let request: ResourceRequest = cpus_compact(16).add(2, 2).finish(); - rq.add_task( - WorkerTaskBuilder::new(11) - .resources(request) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut rqs); + rq.add_task(&rqs, wt); - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 1); assert!(map.contains_key(&11)); } @@ -424,24 +437,23 @@ fn test_uniq_resource_priorities3() { WorkerResources::from_transport(WorkerResourceCounts { n_resources: ra_builder(&[16, 2, 0, 1]).deref().clone(), }), + &rqs, ); let request: ResourceRequest = cpus_compact(16).finish(); - rq.add_task( - WorkerTaskBuilder::new(10) - .resources(request) - .user_priority(1) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(10) + .resources(request) + .user_priority(1) + .build(&mut rqs); + rq.add_task(&rqs, wt); let request: ResourceRequest = 
cpus_compact(16).add(2, 2).finish(); - rq.add_task( - WorkerTaskBuilder::new(11) - .resources(request) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut rqs); + rq.add_task(&rqs, wt); - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 1); assert!(map.contains_key(&10)); } @@ -457,23 +469,21 @@ fn test_different_resources_and_priorities() { for i in 0..20 { let request: ResourceRequest = cpus_compact(1).add(1, 1).finish(); - rq.add_task( - WorkerTaskBuilder::new(i) - .resources(request) - .user_priority(if i % 2 == 0 { 0 } else { -1 }) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i) + .resources(request) + .user_priority(if i % 2 == 0 { 0 } else { -1 }) + .build(&mut rqs); + rq.add_task(&rqs, wt); } for i in 0..12 { let request: ResourceRequest = cpus_compact(16).finish(); - rq.add_task( - WorkerTaskBuilder::new(i + 20) - .resources(request) - .user_priority(-3) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i + 20) + .resources(request) + .user_priority(-3) + .build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 7); let ids = map.keys().copied().collect::>(); assert_eq!( @@ -495,23 +505,21 @@ fn test_different_resources_and_priorities1() { for i in 0..20 { let request: ResourceRequest = cpus_compact(1).add(1, 1).finish(); - rq.add_task( - WorkerTaskBuilder::new(i) - .resources(request) - .user_priority(if i % 2 == 0 { 0 } else { -1 }) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i) + .resources(request) + .user_priority(if i % 2 == 0 { 0 } else { -1 }) + .build(&mut rqs); + rq.add_task(&rqs, wt); } for i in 0..12 { let request: ResourceRequest = cpus_compact(16).finish(); - rq.add_task( - WorkerTaskBuilder::new(i + 20) - .resources(request) - .user_priority(-3) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i + 20) + .resources(request) + 
.user_priority(-3) + .build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 7); let ids = map.keys().copied().collect::>(); assert_eq!( @@ -533,31 +541,30 @@ fn test_different_resources_and_priorities2() { for i in 0..6 { let request: ResourceRequest = cpus_compact(1).add(1, 1).finish(); - rq.add_task(WorkerTaskBuilder::new(i).resources(request).build(&mut rqs)); + let wt = WorkerTaskBuilder::new(i).resources(request).build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 3); for i in 0..6 { let request: ResourceRequest = cpus_compact(1).add(1, 1).finish(); - rq.add_task( - WorkerTaskBuilder::new(i + 10) - .resources(request) - .user_priority(1) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i + 10) + .resources(request) + .user_priority(1) + .build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert!(map.is_empty()); for i in 0..6 { let request: ResourceRequest = cpus_compact(5).finish(); - rq.add_task( - WorkerTaskBuilder::new(i + 20) - .resources(request) - .user_priority(-3) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i + 20) + .resources(request) + .user_priority(-3) + .build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 1); assert!(map.keys().all(|id| *id >= 20)); } @@ -573,31 +580,30 @@ fn test_different_resources_and_priorities3() { for i in 0..6 { let request: ResourceRequest = cpus_compact(1).add(1, 2).finish(); - rq.add_task(WorkerTaskBuilder::new(i).resources(request).build(&mut rqs)); + let wt = WorkerTaskBuilder::new(i).resources(request).build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 1); for i in 0..6 { let request: ResourceRequest = cpus_compact(1).add(1, 
3).finish(); - rq.add_task( - WorkerTaskBuilder::new(i + 10) - .resources(request) - .user_priority(1) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i + 10) + .resources(request) + .user_priority(1) + .build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert!(map.is_empty()); for i in 0..6 { let request: ResourceRequest = cpus_compact(2).finish(); - rq.add_task( - WorkerTaskBuilder::new(i + 20) - .resources(request) - .user_priority(-3) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(i + 20) + .resources(request) + .user_priority(-3) + .build(&mut rqs); + rq.add_task(&rqs, wt); } - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 4); assert!(map.keys().all(|id| *id >= 20)); } @@ -618,26 +624,25 @@ fn test_uniq_resource_priorities4() { WorkerResources::from_transport(WorkerResourceCounts { n_resources: ra_builder(&[16, 2, 0, 1]).deref().clone(), }), + &rqs, ); let request: ResourceRequest = cpus_compact(16).finish(); - rq.add_task( - WorkerTaskBuilder::new(10) - .resources(request) - .server_priority(1) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(10) + .resources(request) + .server_priority(1) + .build(&mut rqs); + rq.add_task(&rqs, wt); - rq.queue.remove_worker(400.into()); + rq.queue.remove_worker(400.into(), &rqs); let request: ResourceRequest = cpus_compact(16).add(2, 2).finish(); - rq.add_task( - WorkerTaskBuilder::new(11) - .resources(request) - .build(&mut rqs), - ); + let wt = WorkerTaskBuilder::new(11) + .resources(request) + .build(&mut rqs); + rq.add_task(&rqs, wt); - let map = rq.start_tasks(); + let map = rq.start_tasks(&rqs); assert_eq!(map.len(), 1); assert!(map.contains_key(&10)); } diff --git a/crates/tako/src/internal/worker/test_util.rs b/crates/tako/src/internal/worker/test_util.rs index 8573819fb..705148c33 100644 --- a/crates/tako/src/internal/worker/test_util.rs +++ b/crates/tako/src/internal/worker/test_util.rs 
@@ -72,7 +72,6 @@ impl WorkerTaskBuilder { data_deps: self.data_deps, entry: None, }, - resources, ComputeTaskSharedData { user_priority: self.user_priority, time_limit: None, @@ -84,6 +83,17 @@ impl WorkerTaskBuilder { } } +pub fn worker_task_add>( + rbuilder: &mut ResourceQueueBuilder, + resource_map: &mut ResourceRqMap, + task_id: T, + resources: ResourceRequest, + u_priority: Priority, +) { + let w = worker_task(task_id, resources, u_priority, resource_map); + rbuilder.add_task(resource_map, w); +} + pub fn worker_task>( task_id: T, resources: ResourceRequest, @@ -109,26 +119,35 @@ impl ResourceQueueBuilder { } } - pub fn add_task(&mut self, task: Task) { - self.queue.add_task(&task); + pub fn add_task(&mut self, resource_map: &ResourceRqMap, task: Task) { + self.queue.add_task(resource_map, &task); self.task_map.insert(task); } - pub fn new_worker(&mut self, worker_id: WorkerId, wr: WorkerResources) { - self.queue.new_worker(worker_id, wr); + pub fn new_worker( + &mut self, + worker_id: WorkerId, + wr: WorkerResources, + resource_map: &ResourceRqMap, + ) { + self.queue.new_worker(worker_id, wr, resource_map); } - pub fn start_tasks(&mut self) -> Map> { + pub fn start_tasks(&mut self, rqs: &ResourceRqMap) -> Map> { self.queue - .try_start_tasks(&self.task_map, None) + .try_start_tasks(&self.task_map, rqs, None) .into_iter() .map(|(t, a, _)| (t.job_task_id().as_num(), a)) .collect() } - pub fn start_tasks_duration(&mut self, duration: Duration) -> Map> { + pub fn start_tasks_duration( + &mut self, + rqs: &ResourceRqMap, + duration: Duration, + ) -> Map> { self.queue - .try_start_tasks(&self.task_map, Some(duration)) + .try_start_tasks(&self.task_map, rqs, Some(duration)) .into_iter() .map(|(t, a, _)| (t.job_task_id().as_num(), a)) .collect() From 9183dd94ac3001c67c4a31b7e8f5bb4b855a924d Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Fri, 2 Jan 2026 14:57:58 +0100 Subject: [PATCH 16/17] PyHQ lint fixed --- crates/pyhq/src/client/job.rs | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/crates/pyhq/src/client/job.rs b/crates/pyhq/src/client/job.rs index 5adb09873..9135261b4 100644 --- a/crates/pyhq/src/client/job.rs +++ b/crates/pyhq/src/client/job.rs @@ -29,7 +29,7 @@ use tako::gateway::{ TaskDataFlags, }; use tako::program::{FileOnCloseBehavior, ProgramDefinition, StdioDef}; -use tako::resources::{AllocationRequest, NumOfNodes, ResourceAmount, ResourceRqId}; +use tako::resources::{AllocationRequest, NumOfNodes, ResourceAmount}; use tako::{JobTaskCount, Map}; #[derive(Debug, FromPyObject)] From 5e6bdc3014ddaf579ae61d061f3b65c9f329ab97 Mon Sep 17 00:00:00 2001 From: Ada Bohm Date: Fri, 2 Jan 2026 15:12:23 +0100 Subject: [PATCH 17/17] Added comment for GetList --- crates/hyperqueue/src/transfer/messages.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/hyperqueue/src/transfer/messages.rs b/crates/hyperqueue/src/transfer/messages.rs index cc84f1853..0bc17b391 100644 --- a/crates/hyperqueue/src/transfer/messages.rs +++ b/crates/hyperqueue/src/transfer/messages.rs @@ -41,6 +41,10 @@ pub enum FromClientMessage { /// It is basically as sending JobInfo and StreamEvents, but it is done atomically, /// so no message is lost. JobInfo(JobInfoRequest, Option), + /// Get a list of items from the server. The response is sent as GetListResponse. + /// The request contains boolean flags selecting which information you want to get. + /// The current implementation only allows asking for workers; the flags + /// are kept so that the request can be extended in the future. GetList { workers: bool, },