forked from ClickHouse/ClickHouse
-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathStorageObjectStorageStableTaskDistributor.cpp
More file actions
323 lines (274 loc) · 11.3 KB
/
StorageObjectStorageStableTaskDistributor.cpp
File metadata and controls
323 lines (274 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
#include <Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h>
#include <Storages/ObjectStorage/Utils.h>
#include <Storages/ObjectStorage/DataLakes/Iceberg/Utils.h>
#include <Common/SipHash.h>
#include <consistent_hashing.h>
#include <optional>
namespace ProfileEvents
{
/// Counters distinguishing whether a task was handed to the replica selected
/// by rendezvous hashing (matched) or to a different replica (non-matched,
/// e.g. work stealing after the iterator is exhausted).
extern const Event ObjectStorageClusterSentToMatchedReplica;
extern const Event ObjectStorageClusterSentToNonMatchedReplica;
};
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int CANNOT_READ_ALL_DATA;
};
/// Sets up the distributor: takes ownership of the shared file iterator and
/// seeds per-replica bookkeeping — every known node starts with an empty
/// work list and an activity timestamp of "now".
StorageObjectStorageStableTaskDistributor::StorageObjectStorageStableTaskDistributor(
    std::shared_ptr<IObjectIterator> iterator_,
    std::vector<std::string> && ids_of_nodes_,
    bool send_over_whole_archive_,
    uint64_t lock_object_storage_task_distribution_ms_,
    bool iceberg_read_optimization_enabled_)
    : iterator(std::move(iterator_))
    , send_over_whole_archive(send_over_whole_archive_)
    , connection_to_files(ids_of_nodes_.size())
    , ids_of_nodes(std::move(ids_of_nodes_))
    , lock_object_storage_task_distribution_us(lock_object_storage_task_distribution_ms_ * 1000)
    , iterator_exhausted(false)
    , iceberg_read_optimization_enabled(iceberg_read_optimization_enabled_)
{
    const Poco::Timestamp start_time;
    const size_t node_count = ids_of_nodes.size();
    for (size_t node_idx = 0; node_idx != node_count; ++node_idx)
    {
        replica_to_files_to_be_processed[node_idx] = {};
        last_node_activity[node_idx] = start_time;
    }
}
/// Hands the next file to a requesting replica. Resolution order:
/// 1) files pre-queued for this replica, 2) a fresh iterator file that hashes
/// to this replica, 3) stealing an unprocessed file once the iterator is drained.
/// The chosen file is remembered against the replica so it can be rescheduled
/// if the replica is later declared lost.
ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getNextTask(size_t number_of_current_replica)
{
    LOG_TRACE(log, "Received request from replica {} looking for a file", number_of_current_replica);

    saveLastNodeActivity(number_of_current_replica);

    auto replica_queue_it = replica_to_files_to_be_processed.find(number_of_current_replica);
    if (replica_queue_it == replica_to_files_to_be_processed.end())
        throw Exception(
            ErrorCodes::LOGICAL_ERROR,
            "Replica number {} was marked as lost, can't set task for it anymore",
            number_of_current_replica
        );

    ObjectInfoPtr task = getPreQueuedFile(number_of_current_replica);
    if (!task)
        task = getMatchingFileFromIterator(number_of_current_replica);
    if (!task)
        task = getAnyUnprocessedFile(number_of_current_replica);

    if (task)
        replica_queue_it->second.push_back(task);

    return task;
}
/// Picks the replica responsible for a file via rendezvous (highest-random-weight)
/// hashing over the currently-alive replicas: each replica's id concatenated
/// with the file path is hashed, and the highest hash wins. Ties keep the
/// earliest replica in map order, matching the original scan.
size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String & file_path)
{
    /// Trivial case
    if (ids_of_nodes.size() < 2)
        return 0;

    /// Rendezvous hashing
    if (replica_to_files_to_be_processed.empty())
        throw Exception(
            ErrorCodes::LOGICAL_ERROR,
            "No active replicas, can't find best replica for file {}",
            file_path
        );

    std::optional<size_t> best_id;
    UInt64 best_weight = 0;
    for (const auto & [candidate_id, files] : replica_to_files_to_be_processed)
    {
        const UInt64 weight = sipHash64(ids_of_nodes[candidate_id] + file_path);
        if (!best_id.has_value() || weight > best_weight)
        {
            best_weight = weight;
            best_id = candidate_id;
        }
    }
    return *best_id;
}
/// Pops files previously queued for this replica until one is found that is
/// still pending in `unprocessed_files`; stale entries (already handed out,
/// e.g. stolen by another replica) are discarded. Returns nullptr when the
/// replica's queue holds no live candidate.
ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getPreQueuedFile(size_t number_of_current_replica)
{
    std::lock_guard lock(mutex);

    if (connection_to_files.size() <= number_of_current_replica)
        throw Exception(
            ErrorCodes::LOGICAL_ERROR,
            "Replica number {} is out of range. Expected range: [0, {})",
            number_of_current_replica,
            connection_to_files.size()
        );

    auto & queued = connection_to_files[number_of_current_replica];
    while (!queued.empty())
    {
        auto candidate = queued.back();
        queued.pop_back();

        /// Same identifier scheme as used when the file was queued.
        const auto identifier = send_over_whole_archive
            ? candidate->getPathOrPathToArchiveIfArchive()
            : getAbsolutePathFromObjectInfo(candidate).value_or(candidate->getIdentifier());

        auto pending_it = unprocessed_files.find(identifier);
        if (pending_it == unprocessed_files.end())
            continue;

        unprocessed_files.erase(pending_it);

        LOG_TRACE(
            log,
            "Assigning pre-queued file {} to replica {}",
            identifier,
            number_of_current_replica
        );

        ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToMatchedReplica);
        return candidate;
    }

    return {};
}
/// Pulls files from the shared iterator until one hashes to the requesting
/// replica (returned immediately). Files that belong to other replicas are
/// recorded in `unprocessed_files` and queued on the owner's list instead.
/// Returns nullptr once the iterator is exhausted.
ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIterator(size_t number_of_current_replica)
{
{
std::lock_guard lock(mutex);
if (iterator_exhausted)
return {};
}
while (true)
{
ObjectInfoPtr object_info;
{
/// The iterator is shared by all replicas; advancing it must be serialized.
std::lock_guard lock(mutex);
object_info = iterator->next(0);
if (!object_info)
{
LOG_TEST(log, "Iterator is exhausted");
iterator_exhausted = true;
break;
}
}
/// The identifier is the key used both for rendezvous hashing and for the
/// `unprocessed_files` bookkeeping; for archives sent whole it is the archive path.
String file_identifier;
if (send_over_whole_archive && object_info->isArchive())
{
file_identifier = object_info->getPathOrPathToArchiveIfArchive();
LOG_TEST(log, "Will send over the whole archive {} to replicas. "
"This will be suboptimal, consider turning on "
"cluster_function_process_archive_on_multiple_nodes setting", file_identifier);
}
else
{
file_identifier = getAbsolutePathFromObjectInfo(object_info).value_or(object_info->getIdentifier());
}
if (iceberg_read_optimization_enabled)
{
/// Copy per-file Iceberg metadata into the task command, if present,
/// so it travels with the task to the remote replica.
auto file_meta_info = object_info->relative_path_with_metadata.getFileMetaInfo();
if (file_meta_info.has_value())
{
auto file_path = send_over_whole_archive ? object_info->getPathOrPathToArchiveIfArchive() : object_info->getPath();
object_info->relative_path_with_metadata.command.setFilePath(file_path);
object_info->relative_path_with_metadata.command.setFileMetaInfo(file_meta_info.value());
}
}
/// NOTE(review): getReplicaForFile() iterates replica_to_files_to_be_processed
/// without holding `mutex`, while rescheduleTasksFromReplica() mutates it under
/// the lock — confirm this is safe for concurrent replica loss.
size_t file_replica_idx = getReplicaForFile(file_identifier);
if (file_replica_idx == number_of_current_replica)
{
LOG_TRACE(
log, "Found file {} for replica {}",
file_identifier, number_of_current_replica
);
ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToMatchedReplica);
return object_info;
}
LOG_TEST(
log,
"Found file {} for replica {} (number of current replica: {})",
file_identifier,
file_replica_idx,
number_of_current_replica
);
// Queue file for its assigned replica
{
std::lock_guard lock(mutex);
unprocessed_files.emplace(file_identifier, std::make_pair(object_info, file_replica_idx));
connection_to_files[file_replica_idx].push_back(object_info);
}
}
return {};
}
/// Fallback used once the iterator is exhausted: hand out a file queued for
/// another replica ("work stealing"). A file may only be stolen if deferring
/// is disabled, it already matches this replica, or its owner replica has
/// been silent longer than the configured lock period. If nothing can be
/// stolen yet, a special retry-after command object is returned instead.
ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getAnyUnprocessedFile(size_t number_of_current_replica)
{
/// Limit time of node activity to keep task in queue
Poco::Timestamp activity_limit;
Poco::Timestamp oldest_activity;
if (lock_object_storage_task_distribution_us > 0)
activity_limit -= lock_object_storage_task_distribution_us;
std::lock_guard lock(mutex);
if (!unprocessed_files.empty())
{
auto it = unprocessed_files.begin();
while (it != unprocessed_files.end())
{
auto number_of_matched_replica = it->second.second;
auto last_activity = last_node_activity.find(number_of_matched_replica);
if (lock_object_storage_task_distribution_us <= 0 // file deferring is turned off
|| it->second.second == number_of_current_replica // file matches the current replica
|| last_activity == last_node_activity.end() // must never happen: last_activity is filled for each replica on start
|| activity_limit > last_activity->second) // matched replica did not ask for new files for a while
{
auto next_file = it->second.first;
unprocessed_files.erase(it);
auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : getAbsolutePathFromObjectInfo(next_file).value_or(next_file->getPath());
LOG_TRACE(
log,
"Iterator exhausted. Assigning unprocessed file {} to replica {} from matched replica {}",
file_path,
number_of_current_replica,
number_of_matched_replica
);
ProfileEvents::increment(ProfileEvents::ObjectStorageClusterSentToNonMatchedReplica);
return next_file;
}
/// Track the stalest owner seen so the retry hint below is as small as possible.
oldest_activity = std::min(oldest_activity, last_activity->second);
++it;
}
LOG_TRACE(
log,
"No unprocessed file for replica {}, need to retry after {} us",
number_of_current_replica,
oldest_activity - activity_limit
);
/// All unprocessed files are owned by alive replicas with recent activity.
/// Need to retry after (oldest_activity - activity_limit) microseconds
RelativePathWithMetadata::CommandInTaskResponse response;
response.setRetryAfterUs(oldest_activity - activity_limit);
return std::make_shared<ObjectInfo>(response.toString());
}
return {};
}
/// Records "now" as the latest moment this replica asked for work; used by
/// getAnyUnprocessedFile() to decide when a replica's files may be stolen.
void StorageObjectStorageStableTaskDistributor::saveLastNodeActivity(size_t number_of_current_replica)
{
    const Poco::Timestamp current_time;
    std::lock_guard lock(mutex);
    last_node_activity[number_of_current_replica] = current_time;
}
/// Marks a replica as lost: removes it from the active set and returns every
/// file it had been handed back into the unprocessed pool, re-assigning each
/// to a surviving replica via rendezvous hashing.
/// Throws LOGICAL_ERROR if the replica was already removed, and
/// CANNOT_READ_ALL_DATA if it is the last replica standing.
void StorageObjectStorageStableTaskDistributor::rescheduleTasksFromReplica(size_t number_of_current_replica)
{
LOG_INFO(log, "Replica {} is marked as lost, tasks are returned to queue", number_of_current_replica);
std::lock_guard lock(mutex);
auto processed_file_list_ptr = replica_to_files_to_be_processed.find(number_of_current_replica);
if (processed_file_list_ptr == replica_to_files_to_be_processed.end())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Replica number {} was marked as lost already",
number_of_current_replica
);
if (replica_to_files_to_be_processed.size() < 2)
throw Exception(
ErrorCodes::CANNOT_READ_ALL_DATA,
"All replicas were marked as lost"
);
/// Erase the replica first so getReplicaForFile() only considers survivors.
auto files = std::move(processed_file_list_ptr->second);
replica_to_files_to_be_processed.erase(number_of_current_replica);
for (const auto & file : files)
{
/// NOTE(review): files are re-keyed by getPath() here, while the assignment
/// paths (getPreQueuedFile / getMatchingFileFromIterator) key by the
/// archive-aware or absolute identifier — confirm these agree for archive
/// objects, otherwise a rescheduled archive entry may never be matched.
auto file_replica_idx = getReplicaForFile(file->getPath());
unprocessed_files.emplace(file->getPath(), std::make_pair(file, file_replica_idx));
connection_to_files[file_replica_idx].push_back(file);
}
}
}