I get a segfault in roughly 1 out of 10 benchmark runs. Each benchmark runs the same set of requests 10 times (mean_input_token_count = 1024, n = 10240). After the segfault, the prefill hangs. I couldn't find any related errors or warnings in the logs.
replica-0/engine-prefill | !!!!!!! Segfault encountered !!!!!!!
replica-0/engine-prefill | File "<unknown>", line 0, in PyCallable_Check
replica-0/engine-prefill | File "/src/tensorrt_llm/cpp/build_RelWithDebInfo/_deps/nanobind-src/include/nanobind/nb_types.h", line 752, in nanobind::callable::check_(nanobind::handle)
replica-0/engine-prefill | File "/src/tensorrt_llm/cpp/build_RelWithDebInfo/_deps/nanobind-src/include/nanobind/nb_types.h", line 676, in bool nanobind::isinstance<nanobind::callable>(nanobind::handle)
replica-0/engine-prefill | File "/src/tensorrt_llm/cpp/build_RelWithDebInfo/_deps/nanobind-src/include/nanobind/nb_cast.h", line 429, in nanobind::detail::type_caster<nanobind::callable, int>::from_python(nanobind::handle, unsigned char, nanobind::detail::cleanup_list*)
replica-0/engine-prefill | File "/src/tensorrt_llm/cpp/build_RelWithDebInfo/_deps/nanobind-src/include/nanobind/nb_func.h", line 254, in nanobind::detail::func_create<false, false, nanobind::detail::func_create<false, true, std::optional<unsigned long> (*&)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::optional<unsigned long>, unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs, 0ul, 1ul, 2ul, 3ul, 4ul, nanobind::scope, nanobind::name, char [47], nanobind::call_guard<nanobind::gil_scoped_release> >(std::optional<unsigned long> (*&)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::optional<unsigned long> (*)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::integer_sequence<unsigned long, 0ul, 1ul, 2ul, 3ul, 4ul>, nanobind::scope const&, nanobind::name const&, char const (&) [47], nanobind::call_guard<nanobind::gil_scoped_release> const&)::{lambda(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs)#1}, std::optional<unsigned long>, unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs, 0ul, 1ul, 2ul, 3ul, 4ul, nanobind::scope, nanobind::name, char [47], nanobind::call_guard<nanobind::gil_scoped_release> >(std::optional<unsigned long> (*&)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::optional<unsigned long> (*)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::integer_sequence<unsigned long, 0ul, 1ul, 2ul, 3ul, 4ul>, nanobind::scope const&, nanobind::name const&, char const (&) [47], nanobind::call_guard<nanobind::gil_scoped_release> const&)::{lambda(void*, _object**, unsigned char*, nanobind::rv_policy, nanobind::detail::cleanup_list*)#1}::operator()(void*, _object**, unsigned char*, nanobind::rv_policy, nanobind::detail::cleanup_list*) const
replica-0/engine-prefill | File "/src/tensorrt_llm/cpp/build_RelWithDebInfo/_deps/nanobind-src/include/nanobind/nb_func.h", line 219, in nanobind::detail::func_create<false, false, nanobind::detail::func_create<false, true, std::optional<unsigned long> (*&)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::optional<unsigned long>, unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs, 0ul, 1ul, 2ul, 3ul, 4ul, nanobind::scope, nanobind::name, char [47], nanobind::call_guard<nanobind::gil_scoped_release> >(std::optional<unsigned long> (*&)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::optional<unsigned long> (*)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::integer_sequence<unsigned long, 0ul, 1ul, 2ul, 3ul, 4ul>, nanobind::scope const&, nanobind::name const&, char const (&) [47], nanobind::call_guard<nanobind::gil_scoped_release> const&)::{lambda(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs)#1}, std::optional<unsigned long>, unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs, 0ul, 1ul, 2ul, 3ul, 4ul, nanobind::scope, nanobind::name, char [47], nanobind::call_guard<nanobind::gil_scoped_release> >(std::optional<unsigned long> (*&)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::optional<unsigned long> (*)(unsigned long, bool, nanobind::callable, nanobind::args, nanobind::kwargs), std::integer_sequence<unsigned long, 0ul, 1ul, 2ul, 3ul, 4ul>, nanobind::scope const&, nanobind::name const&, char const (&) [47], nanobind::call_guard<nanobind::gil_scoped_release> const&)::{lambda(void*, _object**, unsigned char*, nanobind::rv_policy, nanobind::detail::cleanup_list*)#1}::_FUN(void*, _object**, unsigned char*, nanobind::rv_policy, nanobind::detail::cleanup_list*)
replica-0/engine-prefill | File "/src/tensorrt_llm/cpp/build_RelWithDebInfo/_deps/nanobind-src/src/nb_func.cpp", line 839, in nb_func_vectorcall_complex
replica-0/engine-prefill | File "<unknown>", line 0, in PyObject_Call
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in PyObject_Call
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in PyObject_Call
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in _PyObject_Call_Prepend
replica-0/engine-prefill | File "<unknown>", line 0, in _PyObject_MakeTpCall
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in PyObject_Call
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in PyObject_Call
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in PyObject_Call
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in _PyEval_EvalFrameDefault
replica-0/engine-prefill | File "<unknown>", line 0, in 0xffffffffffffffff
Run a set of 10240 requests with mean_input_tokens = 1024 over and over until a segfault occurs.
System Info
Hardware: AWS P5en, 8xH200
Software:
Runtime configuration:
Who can help?
I get a segfault in roughly 1 out of 10 benchmark runs. Each benchmark runs the same set of requests 10 times (mean_input_token_count = 1024, n = 10240). After the segfault, the prefill hangs. I couldn't find any related errors or warnings in the logs.
Information
Tasks
examples folder (such as GLUE/SQuAD, ...)
Reproduction
Run a set of 10240 requests with mean_input_tokens = 1024 over and over until a segfault occurs.
Expected behavior
There is no segfault
actual behavior
There is a segfault
additional notes
I haven't narrowed it down yet.
Before submitting a new issue...