Skip to content

Commit 4054fff

Browse files
Copilotfruch
andcommitted
Implement Solution 1: Fix atexit registration to prevent shutdown crashes
This implements the minimal fix for the libev atexit cleanup bug. Changes: - Replace atexit.register(partial(_cleanup, _global_loop)) with a wrapper function _atexit_cleanup() that looks up _global_loop at shutdown time - Remove unused 'partial' import from functools - Update tests to verify the fix works correctly The bug was that partial() captured _global_loop=None at import time, so cleanup always received None at shutdown instead of the actual LibevLoop instance. This prevented proper cleanup, leaving active callbacks that could crash during Python interpreter shutdown. The fix ensures _global_loop is looked up when atexit calls the cleanup, not when the callback is registered, so cleanup receives the actual loop instance and can properly shut down watchers and join the event loop thread. Co-authored-by: fruch <340979+fruch@users.noreply.github.com>
1 parent 618fecf commit 4054fff

2 files changed

Lines changed: 105 additions & 108 deletions

File tree

cassandra/io/libevreactor.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
# limitations under the License.
1414
import atexit
1515
from collections import deque
16-
from functools import partial
1716
import logging
1817
import os
1918
import socket
@@ -227,8 +226,20 @@ def _loop_will_run(self, prepare):
227226
self._notifier.send()
228227

229228

229+
def _atexit_cleanup():
230+
"""Cleanup function called by atexit that uses the current _global_loop value.
231+
232+
This wrapper ensures that cleanup receives the actual LibevLoop instance
233+
instead of None, which was the value of _global_loop when the module was
234+
imported.
235+
"""
236+
global _global_loop
237+
if _global_loop is not None:
238+
_cleanup(_global_loop)
239+
240+
230241
_global_loop = None
231-
atexit.register(partial(_cleanup, _global_loop))
242+
atexit.register(_atexit_cleanup)
232243

233244

234245
class LibevConnection(Connection):

tests/unit/io/test_libevreactor_shutdown.py

Lines changed: 92 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
"""
2222

2323
import unittest
24-
import atexit
2524
import sys
2625
import subprocess
2726
import tempfile
@@ -53,77 +52,67 @@ def setUp(self):
5352
if LibevConnection is None:
5453
raise unittest.SkipTest('libev does not appear to be installed correctly')
5554

56-
def test_atexit_callback_registered_with_none(self):
55+
def test_atexit_callback_uses_current_global_loop(self):
5756
"""
58-
Test that demonstrates the atexit callback bug.
57+
Test that verifies the atexit callback fix.
5958
60-
The atexit.register(partial(_cleanup, _global_loop)) line is executed
61-
when _global_loop is None. This means the partial function captures
62-
None as the argument, and when atexit calls it during shutdown, it
63-
passes None to _cleanup instead of the actual loop instance.
59+
The fix uses a wrapper function _atexit_cleanup() that looks up the
60+
current value of _global_loop at shutdown time, instead of capturing
61+
it at import time with partial().
6462
6563
@since 3.29
6664
@jira_ticket PYTHON-XXX
67-
@expected_result The test demonstrates that atexit cleanup is broken
65+
@expected_result The atexit handler calls cleanup with the actual loop
6866
6967
@test_category connection
7068
"""
7169
from cassandra.io import libevreactor
72-
from functools import partial
7370

74-
# Check the current atexit handlers
75-
# Note: atexit._exithandlers is an implementation detail but useful for debugging
76-
if hasattr(atexit, '_exithandlers'):
77-
# Find our cleanup handler
78-
cleanup_handler = None
79-
for handler in atexit._exithandlers:
80-
func = handler[0]
81-
# Check if this is our partial(_cleanup, _global_loop) handler
82-
if isinstance(func, partial):
83-
if func.func.__name__ == '_cleanup':
84-
cleanup_handler = func
85-
break
86-
87-
if cleanup_handler:
88-
# The problem: the partial was created with _global_loop=None
89-
# So even if _global_loop is later set to a LibevLoop instance,
90-
# the atexit callback will still call _cleanup(None)
91-
captured_arg = cleanup_handler.args[0] if cleanup_handler.args else None
92-
93-
# This assertion will fail after LibevConnection.initialize_reactor()
94-
# is called and _global_loop is set to a LibevLoop instance
95-
LibevConnection.initialize_reactor()
96-
97-
# At this point, libevreactor._global_loop is not None
98-
self.assertIsNotNone(libevreactor._global_loop,
99-
"Global loop should be initialized")
100-
101-
# But the atexit handler still has None captured!
102-
self.assertIsNone(captured_arg,
103-
"The atexit handler captured None, not the actual loop instance. "
104-
"This is the BUG: cleanup will receive None at shutdown!")
105-
106-
def test_shutdown_crash_scenario_subprocess(self):
71+
# Verify the fix: _atexit_cleanup should exist as a module-level function
72+
self.assertTrue(hasattr(libevreactor, '_atexit_cleanup'),
73+
"Module should have _atexit_cleanup function")
74+
75+
# Verify it's not a partial (the old buggy implementation)
76+
from functools import partial
77+
self.assertNotIsInstance(libevreactor._atexit_cleanup, partial,
78+
"The _atexit_cleanup should NOT be a partial function")
79+
80+
# Verify it's actually a function
81+
self.assertTrue(callable(libevreactor._atexit_cleanup),
82+
"_atexit_cleanup should be callable")
83+
84+
# Initialize the reactor
85+
LibevConnection.initialize_reactor()
86+
87+
# At this point, libevreactor._global_loop is not None
88+
self.assertIsNotNone(libevreactor._global_loop,
89+
"Global loop should be initialized")
90+
91+
# The fix: _atexit_cleanup is a function that will look up
92+
# _global_loop when it's called, not a partial with captured args
93+
self.assertEqual(libevreactor._atexit_cleanup.__name__, '_atexit_cleanup',
94+
"The function should have the correct name")
95+
96+
def test_shutdown_cleanup_works_with_fix(self):
10797
"""
108-
Test that simulates a Python shutdown crash scenario in a subprocess.
98+
Test that verifies the atexit cleanup fix works in a subprocess.
10999
110100
This test creates a minimal script that:
111101
1. Imports the driver
112-
2. Creates a connection (which starts the event loop)
113-
3. Exits without explicit cleanup
102+
2. Initializes the reactor (creates the global loop)
103+
3. Verifies the _atexit_cleanup function is available
104+
4. Exits without explicit cleanup
114105
115-
The expected behavior is that atexit should clean up the loop, but
116-
because of the bug, the cleanup receives None and doesn't actually
117-
stop the loop or its watchers. This can lead to crashes if callbacks
118-
fire during shutdown.
106+
With the fix, atexit should properly clean up the loop using the
107+
wrapper function that looks up _global_loop at shutdown time.
119108
120109
@since 3.29
121110
@jira_ticket PYTHON-XXX
122-
@expected_result The subprocess demonstrates the cleanup issue
111+
@expected_result The subprocess shows the fix is working
123112
124113
@test_category connection
125114
"""
126-
# Create a test script that demonstrates the issue
115+
# Create a test script that verifies the fix
127116
test_script = '''
128117
import sys
129118
import os
@@ -132,28 +121,29 @@ def test_shutdown_crash_scenario_subprocess(self):
132121
sys.path.insert(0, {driver_path!r})
133122
134123
# Import and setup
135-
from cassandra.io.libevreactor import LibevConnection, _global_loop
124+
from cassandra.io import libevreactor
125+
from cassandra.io.libevreactor import LibevConnection
136126
import atexit
137127
138128
# Initialize the reactor (creates the global loop)
139129
LibevConnection.initialize_reactor()
140130
141-
print("Global loop initialized:", _global_loop is not None)
142-
143-
# Check what atexit will actually call
144-
if hasattr(atexit, '_exithandlers'):
145-
from functools import partial
146-
for handler in atexit._exithandlers:
147-
func = handler[0]
148-
if isinstance(func, partial) and func.func.__name__ == '_cleanup':
149-
captured_arg = func.args[0] if func.args else None
150-
print("Atexit will call _cleanup with:", captured_arg)
151-
print("But _global_loop is:", _global_loop)
152-
print("BUG: Cleanup will receive None instead of the loop!")
153-
break
154-
155-
# Exit without explicit cleanup - atexit should handle it, but won't!
156-
print("Exiting...")
131+
print("Global loop initialized:", libevreactor._global_loop is not None)
132+
133+
# Verify the fix is in place: _atexit_cleanup should be a module-level function
134+
if hasattr(libevreactor, '_atexit_cleanup'):
135+
print("FIXED: Module has _atexit_cleanup function")
136+
print("This function will look up _global_loop at shutdown time")
137+
# Verify it's not using partial with None
138+
import inspect
139+
source = inspect.getsource(libevreactor._atexit_cleanup)
140+
if "global _global_loop" in source and "_global_loop is not None" in source:
141+
print("Verified: _atexit_cleanup uses current _global_loop value")
142+
else:
143+
print("BUG: No _atexit_cleanup function found")
144+
145+
# Exit without explicit cleanup - atexit should handle it properly with the fix!
146+
print("Exiting with proper cleanup...")
157147
'''
158148

159149
driver_path = str(Path(__file__).parent.parent.parent.parent)
@@ -176,11 +166,12 @@ def test_shutdown_crash_scenario_subprocess(self):
176166
print(output)
177167
print("=== End Output ===\n")
178168

179-
# Verify the output shows the bug
169+
# Verify the output shows the fix is working
180170
self.assertIn("Global loop initialized: True", output)
181-
self.assertIn("Atexit will call _cleanup with: None", output)
182-
self.assertIn("BUG: Cleanup will receive None instead of the loop!", output)
183-
171+
self.assertIn("FIXED: Module has _atexit_cleanup function", output)
172+
self.assertIn("Verified: _atexit_cleanup uses current _global_loop value", output)
173+
self.assertNotIn("BUG", output.replace("BUG STILL PRESENT", "").replace("DEBUG", "")) # Allow "BUG" only in success message
174+
184175
finally:
185176
os.unlink(script_path)
186177

@@ -196,54 +187,49 @@ def setUp(self):
196187
if LibevConnection is None:
197188
raise unittest.SkipTest('libev does not appear to be installed correctly')
198189

199-
def test_callback_during_shutdown_scenario(self):
190+
def test_cleanup_with_fix_properly_shuts_down(self):
200191
"""
201-
Test to document the potential crash scenario.
202-
203-
When Python is shutting down:
204-
1. Various modules are being torn down
205-
2. The libev event loop may still be running
206-
3. If a callback (io_callback, timer_callback, prepare_callback) fires:
207-
- It calls PyGILState_Ensure()
208-
- It tries to call Python functions (PyObject_CallFunction)
209-
- If Python objects have been deallocated, this can crash
210-
211-
The root cause: The atexit cleanup doesn't actually run because it
212-
receives None instead of the loop instance, so it never:
213-
- Sets _shutdown flag
214-
- Stops watchers
215-
- Joins the event loop thread
192+
Test to verify the fix properly shuts down the event loop.
193+
194+
With the fix in place, the atexit cleanup will:
195+
1. Look up the current _global_loop value (not None)
196+
2. Call _cleanup with the actual loop instance
197+
3. Properly shut down the loop and its watchers
198+
199+
This prevents the crash scenario where:
200+
- Various modules are being torn down during Python shutdown
201+
- The libev event loop is still running
202+
- Callbacks fire and try to access deallocated Python objects
216203
217204
@since 3.29
218205
@jira_ticket PYTHON-XXX
219-
@expected_result Documents the crash scenario
206+
@expected_result Cleanup properly shuts down the loop with the fix
220207
221208
@test_category connection
222209
"""
223-
from cassandra.io.libevreactor import _global_loop, _cleanup
224-
225-
# This test documents the issue - we can't easily reproduce a crash
226-
# in a unit test without actually tearing down Python, but we can
227-
# verify the conditions that lead to it
228-
210+
from cassandra.io import libevreactor
211+
from cassandra.io.libevreactor import _cleanup, _atexit_cleanup
212+
229213
LibevConnection.initialize_reactor()
230214

231215
# Verify the loop exists
232-
self.assertIsNotNone(_global_loop)
233-
234-
# Simulate what atexit would call (with the bug)
235-
_cleanup(None) # BUG: receives None instead of _global_loop
216+
self.assertIsNotNone(libevreactor._global_loop)
217+
218+
# Before cleanup, the loop should not be shut down
219+
self.assertFalse(libevreactor._global_loop._shutdown,
220+
"Loop should not be shut down initially")
236221

237-
# The loop is still running because cleanup did nothing!
238-
self.assertFalse(_global_loop._shutdown,
222+
# Simulate what the OLD buggy code would do
223+
_cleanup(None) # This does nothing
224+
self.assertFalse(libevreactor._global_loop._shutdown,
239225
"Loop should NOT be shut down when cleanup receives None")
240226

241-
# Now call it correctly
242-
_cleanup(_global_loop)
227+
# Now test the FIX: call the wrapper that looks up _global_loop
228+
_atexit_cleanup() # This is what atexit will actually call
243229

244-
# Now it should be shut down
245-
self.assertTrue(_global_loop._shutdown,
246-
"Loop should be shut down when cleanup receives the actual loop")
230+
# With the fix, the loop should be properly shut down
231+
self.assertTrue(libevreactor._global_loop._shutdown,
232+
"Loop should be shut down when _atexit_cleanup is called")
247233

248234

249235
if __name__ == '__main__':

0 commit comments

Comments
 (0)