Skip to content

Commit 32a0b90

Browse files
committed
tfscheduler: run fine grained scheduling by default
1 parent 6e27d5f commit 32a0b90

File tree

1 file changed

+7
-11
lines changed

1 file changed

+7
-11
lines changed

src/common/discovery/DataDistributionOptions.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ static constexpr bool DataDistMonitorRpcDurationDefault = false;
3434

3535
// Enable the token manager for stf transfers
3636
static constexpr std::string_view DataDistEnableStfTransferTokensKey = "DataDistEnableStfTransferTokens";
37-
static constexpr bool DataDistEnableStfTransferTokensDefault = false;
37+
static constexpr bool DataDistEnableStfTransferTokensDefault = true;
3838

3939

4040
////////////////////////////////////////////////////////////////////////////////
@@ -64,7 +64,7 @@ static constexpr std::uint64_t StaleStfTimeoutMsDefault = 60000;
6464

6565
// Standalone: Chance the stf will be deleted on arrival
6666
static constexpr std::string_view StandaloneStfDeleteChanceKey = "StandaloneStfDeleteChance";
67-
static constexpr std::uint64_t StandaloneStfDeleteChanceDefault = 50;
67+
static constexpr std::uint64_t StandaloneStfDeleteChanceDefault = 25;
6868

6969
// Standalone: Amount of data to keep while running
7070
static constexpr std::string_view StandaloneStfDataBufferSizeMBKey = "StandaloneStfDataBufferSizeMB";
@@ -105,15 +105,15 @@ static constexpr std::uint64_t StfSenderGrpcThreadPoolSizeDefault = 8;
105105
/// UCX transport
106106
// Size of receiver thread pool. Default 1, works best. Should not be set over 2, to avoid congestion on the receiver.
107107
static constexpr std::string_view UcxTfBuilderThreadPoolSizeKey = "UcxTfBuilderThreadPoolSize";
108-
static constexpr std::uint64_t UcxTfBuilderThreadPoolSizeDefault = 2;
108+
static constexpr std::uint64_t UcxTfBuilderThreadPoolSizeDefault = 1;
109109

110110
// Use polling or blocking waiting method for RDMA completion.
111111
static constexpr std::string_view UcxPollForRDMACompletionKey = "UcxPollForRDMACompletion";
112112
static constexpr bool UcxPollForRDMACompletionDefault = false;
113113

114114
// Allow smaller STFs to be fetched concurrently to improve TF building time
115115
static constexpr std::string_view UcxMaxStfSizeForConcurrentFetchBKey = "UcxMaxStfSizeForConcurrentFetchB";
116-
static constexpr std::uint64_t UcxMaxStfSizeForConcurrentFetchBDefault = std::uint64_t(4) << 20;
116+
static constexpr std::uint64_t UcxMaxStfSizeForConcurrentFetchBDefault = std::uint64_t(2) << 20;
117117

118118

119119
////////////////////////////////////////////////////////////////////////////////
@@ -122,23 +122,19 @@ static constexpr std::uint64_t UcxMaxStfSizeForConcurrentFetchBDefault = std::ui
122122

123123
// Define maximum number of concurrent TFs in building per TfBuilder
124124
static constexpr std::string_view MaxNumTfsInBuildingKey = "MaxNumTfsInBuilding";
125-
static constexpr std::uint64_t MaxNumTfsInBuildingDevault = 25;
125+
static constexpr std::uint64_t MaxNumTfsInBuildingDevault = 5;
126126

127127
// Decision whether to build or drop incomplete (stale) TFs
128128
static constexpr std::string_view BuildIncompleteTfsKey = "BuildIncompleteTfs";
129129
static constexpr bool BuildIncompleteTfsValue = true;
130130

131131
// An incomplete TF is considered stale when the following timeout expires after the last STF is reported.
132132
static constexpr std::string_view StaleTfTimeoutMsKey = "StaleTfTimeoutMs";
133-
static constexpr std::uint64_t StaleTfTimeoutMsDefault = 1000;
134-
135-
// Max number of incomplete TFs to keep before considering them stale
136-
static constexpr std::string_view IncompleteTfsMaxCntKey = "IncompleteTfsMaxCnt";
137-
static constexpr std::uint64_t IncompleteTfsMaxCntValue = 100;
133+
static constexpr std::uint64_t StaleTfTimeoutMsDefault = 750;
138134

139135
// Token reset timeout. All tokens are returned to the scheduler, as a protection from failed EPNs.
140136
static constexpr std::string_view TokenResetTimeoutMsKey = "TokenResetTimeoutMs";
141-
static constexpr std::uint64_t TokenResetTimeoutMsDefault = 1000;
137+
static constexpr std::uint64_t TokenResetTimeoutMsDefault = 5000;
142138

143139
// Number of tokens assigned to each FLP. Increasing the cnt will allow multiple receivers in parallel.
144140
static constexpr std::string_view TokensPerStfSenderCntKey = "TokensPerStfSenderCnt";

0 commit comments

Comments
 (0)