Skip to content

Commit f84a80a

Browse files
authored
Serialization in tokio thread instead of blocking thread, 50% reduction in latency for small models (#767)
1 parent 106d25f commit f84a80a

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

core/src/infer.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -550,8 +550,8 @@ async fn backend_task(backend: Backend, mut embed_receiver: mpsc::Receiver<NextB
550550
ModelType::Classifier => {
551551
let results = backend.predict(batch.1).await;
552552

553-
// Handle sending responses in another thread to avoid starving the backend
554-
std::thread::spawn(move || match results {
553+
// Handle sending responses in a blocking task to avoid starving the backend
554+
tokio::task::spawn_blocking(move || match results {
555555
Ok((mut predictions, inference_duration)) => {
556556
batch.0.into_iter().enumerate().for_each(|(i, m)| {
557557
let infer_metadata = InferMetadata {
@@ -581,8 +581,8 @@ async fn backend_task(backend: Backend, mut embed_receiver: mpsc::Receiver<NextB
581581
ModelType::Embedding(_) => {
582582
let results = backend.embed(batch.1).await;
583583

584-
// Handle sending responses in another thread to avoid starving the backend
585-
std::thread::spawn(move || match results {
584+
// Handle sending responses in a blocking task to avoid starving the backend
585+
tokio::task::spawn_blocking(move || match results {
586586
Ok((mut embeddings, inference_duration)) => {
587587
batch.0.into_iter().enumerate().for_each(|(i, m)| {
588588
let metadata = InferMetadata {

0 commit comments

Comments
 (0)