
Commit 788c565

Speed up general case
1 parent b90e37c commit 788c565

7 files changed: +234 -156 lines changed

Lib/profiling/sampling/collector.py

Lines changed: 11 additions & 0 deletions
@@ -86,6 +86,17 @@ def _iter_async_frames(self, awaited_info_list):
         # Phase 3: Build linear stacks from each leaf to root (optimized - no sorting!)
         yield from self._build_linear_stacks(leaf_task_ids, task_map, child_to_parent)
 
+    def _iter_stacks(self, stack_frames, skip_idle=False):
+        """Yield (frames, thread_id) for all stacks, handling both sync and async modes."""
+        if stack_frames and hasattr(stack_frames[0], "awaited_by"):
+            for frames, thread_id, _ in self._iter_async_frames(stack_frames):
+                if frames:
+                    yield frames, thread_id
+        else:
+            for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle):
+                if frames:
+                    yield frames, thread_id
+
     def _build_task_graph(self, awaited_info_list):
         task_map = {}
         child_to_parent = {}  # Maps child_id -> (selected_parent_id, parent_count)
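
To make the new helper concrete, here is a self-contained sketch of the dispatch pattern `_iter_stacks` centralizes. The `DemoCollector`, `FakeInterp`, and `FakeThread` names and the sample data are illustrative stand-ins, not part of this commit or of the real collector classes.

# Illustrative stand-ins only: the real collectors inherit _iter_stacks
# from the shared base class in collector.py.
from collections import Counter, namedtuple

FakeThread = namedtuple("FakeThread", "thread_id frame_info")
FakeInterp = namedtuple("FakeInterp", "threads")


class DemoCollector:
    def _iter_all_frames(self, stack_frames, skip_idle=False):
        # Sync path: one (frames, thread_id) pair per thread.
        for interp in stack_frames:
            for thread in interp.threads:
                yield thread.frame_info, thread.thread_id

    def _iter_async_frames(self, stack_frames):
        # Async path would walk the awaited_by task graph; omitted here.
        return iter(())

    def _iter_stacks(self, stack_frames, skip_idle=False):
        """Yield (frames, thread_id), choosing the sync or async path."""
        if stack_frames and hasattr(stack_frames[0], "awaited_by"):
            for frames, thread_id, _ in self._iter_async_frames(stack_frames):
                if frames:
                    yield frames, thread_id
        else:
            for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=skip_idle):
                if frames:
                    yield frames, thread_id


sample = [FakeInterp(threads=[FakeThread(1, [("app.py", 3, "work", None)]),
                              FakeThread(2, [])])]
print(Counter(tid for _, tid in DemoCollector()._iter_stacks(sample)))
# Counter({1: 1}) -- the empty stack on thread 2 is filtered out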

Lib/profiling/sampling/gecko_collector.py

Lines changed: 58 additions & 59 deletions
@@ -141,27 +141,35 @@ def _track_state_transition(self, tid, condition, active_dict, inactive_dict,
             self._add_marker(tid, active_name, active_dict.pop(tid),
                              current_time, category)
 
-    def collect(self, stack_frames, timestamp_us=None):
-        """Collect a sample from stack frames."""
-        if timestamp_us is not None:
-            # Use provided timestamp (from binary replay)
-            # Track first timestamp as base for relative time calculation
-            if self._replay_base_timestamp_us is None:
-                self._replay_base_timestamp_us = timestamp_us
-            # Convert to milliseconds relative to first sample
-            current_time = (timestamp_us - self._replay_base_timestamp_us) / 1000
-        else:
-            # Live sampling - use monotonic clock
+    def collect(self, stack_frames, timestamps_us=None):
+        """Collect samples from stack frames.
+
+        Args:
+            stack_frames: List of interpreter/thread frame info
+            timestamps_us: List of timestamps in microseconds (None for live sampling)
+        """
+        # Handle live sampling (no timestamps provided)
+        if timestamps_us is None:
             current_time = (time.monotonic() * 1000) - self.start_time
+            times = [current_time]
+        else:
+            if not timestamps_us:
+                return
+            # Initialize base timestamp if needed
+            if self._replay_base_timestamp_us is None:
+                self._replay_base_timestamp_us = timestamps_us[0]
+            # Convert all timestamps to times (ms relative to first sample)
+            base = self._replay_base_timestamp_us
+            times = [(ts - base) / 1000 for ts in timestamps_us]
+
+        first_time = times[0]
 
         # Update interval calculation
         if self.sample_count > 0 and self.last_sample_time > 0:
-            self.interval = (
-                current_time - self.last_sample_time
-            ) / self.sample_count
-        self.last_sample_time = current_time
+            self.interval = (times[-1] - self.last_sample_time) / self.sample_count
+        self.last_sample_time = times[-1]
 
-        # Process threads and track GC per thread
+        # Process threads
         for interpreter_info in stack_frames:
             for thread_info in interpreter_info.threads:
                 frames = thread_info.frame_info
@@ -179,92 +187,86 @@ def collect(self, stack_frames, timestamp_us=None):
                 on_cpu = bool(status_flags & THREAD_STATUS_ON_CPU)
                 gil_requested = bool(status_flags & THREAD_STATUS_GIL_REQUESTED)
 
-                # Track GIL possession (Has GIL / No GIL)
+                # Track state transitions using first timestamp
                 self._track_state_transition(
                     tid, has_gil, self.has_gil_start, self.no_gil_start,
-                    "Has GIL", "No GIL", CATEGORY_GIL, current_time
+                    "Has GIL", "No GIL", CATEGORY_GIL, first_time
                 )
-
-                # Track CPU state (On CPU / Off CPU)
                 self._track_state_transition(
                     tid, on_cpu, self.on_cpu_start, self.off_cpu_start,
-                    "On CPU", "Off CPU", CATEGORY_CPU, current_time
+                    "On CPU", "Off CPU", CATEGORY_CPU, first_time
                 )
 
-                # Track code type (Python Code / Native Code)
-                # This is tri-state: Python (has_gil), Native (on_cpu without gil), or Neither
+                # Track code type
                 if has_gil:
                     self._track_state_transition(
                         tid, True, self.python_code_start, self.native_code_start,
-                        "Python Code", "Native Code", CATEGORY_CODE_TYPE, current_time
+                        "Python Code", "Native Code", CATEGORY_CODE_TYPE, first_time
                     )
                 elif on_cpu:
                     self._track_state_transition(
                         tid, True, self.native_code_start, self.python_code_start,
-                        "Native Code", "Python Code", CATEGORY_CODE_TYPE, current_time
+                        "Native Code", "Python Code", CATEGORY_CODE_TYPE, first_time
                     )
                 else:
-                    # Thread is idle (neither has GIL nor on CPU) - close any open code markers
-                    # This handles the third state that _track_state_transition doesn't cover
                     if tid in self.initialized_threads:
                         if tid in self.python_code_start:
                             self._add_marker(tid, "Python Code", self.python_code_start.pop(tid),
-                                             current_time, CATEGORY_CODE_TYPE)
+                                             first_time, CATEGORY_CODE_TYPE)
                         if tid in self.native_code_start:
                             self._add_marker(tid, "Native Code", self.native_code_start.pop(tid),
-                                             current_time, CATEGORY_CODE_TYPE)
+                                             first_time, CATEGORY_CODE_TYPE)
 
-                # Track "Waiting for GIL" intervals (one-sided tracking)
+                # Track GIL wait
                 if gil_requested:
-                    self.gil_wait_start.setdefault(tid, current_time)
+                    self.gil_wait_start.setdefault(tid, first_time)
                 elif tid in self.gil_wait_start:
                     self._add_marker(tid, "Waiting for GIL", self.gil_wait_start.pop(tid),
-                                     current_time, CATEGORY_GIL)
+                                     first_time, CATEGORY_GIL)
 
-                # Track exception state (Has Exception / No Exception)
+                # Track exception state
                 has_exception = bool(status_flags & THREAD_STATUS_HAS_EXCEPTION)
                 self._track_state_transition(
                     tid, has_exception, self.exception_start, self.no_exception_start,
-                    "Has Exception", "No Exception", CATEGORY_EXCEPTION, current_time
+                    "Has Exception", "No Exception", CATEGORY_EXCEPTION, first_time
                 )
 
-                # Track GC events by detecting <GC> frames in the stack trace
-                # This leverages the improved GC frame tracking from commit 336366fd7ca
-                # which precisely identifies the thread that initiated GC collection
+                # Track GC events
                 has_gc_frame = any(frame[2] == "<GC>" for frame in frames)
                 if has_gc_frame:
-                    # This thread initiated GC collection
                     if tid not in self.gc_start_per_thread:
-                        self.gc_start_per_thread[tid] = current_time
+                        self.gc_start_per_thread[tid] = first_time
                 elif tid in self.gc_start_per_thread:
-                    # End GC marker when no more GC frames are detected
                     self._add_marker(tid, "GC Collecting", self.gc_start_per_thread.pop(tid),
-                                     current_time, CATEGORY_GC)
+                                     first_time, CATEGORY_GC)
 
-                # Mark thread as initialized after processing all state transitions
+                # Mark thread as initialized
                 self.initialized_threads.add(tid)
 
-                # Categorize: idle if neither has GIL nor on CPU
+                # Skip idle threads if requested
                 is_idle = not has_gil and not on_cpu
-
-                # Skip idle threads if skip_idle is enabled
                 if self.skip_idle and is_idle:
                     continue
 
                 if not frames:
                     continue
 
-                # Process the stack
+                # Process stack once to get stack_index
                 stack_index = self._process_stack(thread_data, frames)
 
-                # Add sample - cache references to avoid dictionary lookups
+                # Add samples with timestamps
                 samples = thread_data["samples"]
-                samples["stack"].append(stack_index)
-                samples["time"].append(current_time)
-                samples["eventDelay"].append(None)
+                samples_stack = samples["stack"]
+                samples_time = samples["time"]
+                samples_delay = samples["eventDelay"]
+
+                for t in times:
+                    samples_stack.append(stack_index)
+                    samples_time.append(t)
+                    samples_delay.append(None)
 
-                # Track opcode state changes for interval markers (leaf frame only)
-                if self.opcodes_enabled:
+                # Handle opcodes
+                if self.opcodes_enabled and frames:
                     leaf_frame = frames[0]
                     filename, location, funcname, opcode = leaf_frame
                     if isinstance(location, tuple):
@@ -276,18 +278,15 @@ def collect(self, stack_frames, timestamp_us=None):
                     current_state = (opcode, lineno, col_offset, funcname, filename)
 
                     if tid not in self.opcode_state:
-                        # First observation - start tracking
-                        self.opcode_state[tid] = (*current_state, current_time)
+                        self.opcode_state[tid] = (*current_state, first_time)
                     elif self.opcode_state[tid][:5] != current_state:
-                        # State changed - emit marker for previous state
                        prev_opcode, prev_lineno, prev_col, prev_funcname, prev_filename, prev_start = self.opcode_state[tid]
                        self._add_opcode_interval_marker(
-                            tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, current_time
+                            tid, prev_opcode, prev_lineno, prev_col, prev_funcname, prev_start, first_time
                        )
-                        # Start tracking new state
-                        self.opcode_state[tid] = (*current_state, current_time)
+                        self.opcode_state[tid] = (*current_state, first_time)
 
-        self.sample_count += 1
+        self.sample_count += len(times)
 
     def _create_thread(self, tid):
         """Create a new thread structure with processed profile format."""

Lib/profiling/sampling/heatmap_collector.py

Lines changed: 16 additions & 12 deletions
@@ -518,16 +518,17 @@ def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=
         }
         self.stats.update(kwargs)
 
-    def process_frames(self, frames, thread_id):
+    def process_frames(self, frames, thread_id, weight=1):
         """Process stack frames and count samples per line.
 
         Args:
             frames: List of (filename, location, funcname, opcode) tuples in
                 leaf-to-root order. location is (lineno, end_lineno, col_offset, end_col_offset).
                 opcode is None if not gathered.
             thread_id: Thread ID for this stack trace
+            weight: Number of samples this stack represents (for batched RLE)
         """
-        self._total_samples += 1
+        self._total_samples += weight
         self._seen_lines.clear()
 
         for i, (filename, location, funcname, opcode) in enumerate(frames):
@@ -545,15 +546,16 @@ def process_frames(self, frames, thread_id):
             self._seen_lines.add(line_key)
 
             self._record_line_sample(filename, lineno, funcname, is_leaf=is_leaf,
-                                     count_cumulative=count_cumulative)
+                                     count_cumulative=count_cumulative, weight=weight)
 
             if opcode is not None:
                 # Set opcodes_enabled flag when we first encounter opcode data
                 self.opcodes_enabled = True
                 self._record_bytecode_sample(filename, lineno, opcode,
-                                             end_lineno, col_offset, end_col_offset)
+                                             end_lineno, col_offset, end_col_offset,
+                                             weight=weight)
 
-            # Build call graph for adjacent frames
+            # Build call graph for adjacent frames (relationships are deduplicated anyway)
             if i + 1 < len(frames):
                 next_frame = frames[i + 1]
                 next_lineno = extract_lineno(next_frame[1])
@@ -575,24 +577,25 @@ def _is_valid_frame(self, filename, lineno):
         return True
 
     def _record_line_sample(self, filename, lineno, funcname, is_leaf=False,
-                            count_cumulative=True):
+                            count_cumulative=True, weight=1):
         """Record a sample for a specific line."""
         # Track cumulative samples (all occurrences in stack)
         if count_cumulative:
-            self.line_samples[(filename, lineno)] += 1
-            self.file_samples[filename][lineno] += 1
+            self.line_samples[(filename, lineno)] += weight
+            self.file_samples[filename][lineno] += weight
 
         # Track self/leaf samples (only when at top of stack)
         if is_leaf:
-            self.line_self_samples[(filename, lineno)] += 1
-            self.file_self_samples[filename][lineno] += 1
+            self.line_self_samples[(filename, lineno)] += weight
+            self.file_self_samples[filename][lineno] += weight
 
         # Record function definition location
         if funcname and (filename, funcname) not in self.function_definitions:
            self.function_definitions[(filename, funcname)] = lineno
 
    def _record_bytecode_sample(self, filename, lineno, opcode,
-                                end_lineno=None, col_offset=None, end_col_offset=None):
+                                end_lineno=None, col_offset=None, end_col_offset=None,
+                                weight=1):
        """Record a sample for a specific bytecode instruction.
 
        Args:
@@ -602,14 +605,15 @@ def _record_bytecode_sample(self, filename, lineno, opcode,
            end_lineno: End line number (may be -1 if not available)
            col_offset: Column offset in UTF-8 bytes (may be -1 if not available)
            end_col_offset: End column offset in UTF-8 bytes (may be -1 if not available)
+            weight: Number of samples this represents (for batched RLE)
        """
        key = (filename, lineno)
 
        # Initialize opcode entry if needed - use set for location deduplication
        if opcode not in self.line_opcodes[key]:
            self.line_opcodes[key][opcode] = {'count': 0, 'locations': set()}
 
-        self.line_opcodes[key][opcode]['count'] += 1
+        self.line_opcodes[key][opcode]['count'] += weight
 
        # Store unique location info if column offset is available (not -1)
        if col_offset is not None and col_offset >= 0:
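
The `weight` parameter simply folds a run of identical samples into one bookkeeping call; a tiny sketch of that equivalence (the `record_line` helper and the bare counter are stand-ins, not the collector's real attributes):

from collections import Counter

line_samples = Counter()

def record_line(filename, lineno, weight=1):
    # Adding `weight` once is the same as recording the line `weight` times.
    line_samples[(filename, lineno)] += weight

record_line("app.py", 42, weight=3)        # one batched call...
for _ in range(3):
    record_line("other.py", 7)             # ...equals three unweighted calls
print(line_samples[("app.py", 42)] == line_samples[("other.py", 7)])  # True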

Lib/profiling/sampling/pstats_collector.py

Lines changed: 8 additions & 13 deletions
@@ -18,7 +18,7 @@ def __init__(self, sample_interval_usec, *, skip_idle=False):
         self.skip_idle = skip_idle
         self._seen_locations = set()
 
-    def _process_frames(self, frames):
+    def _process_frames(self, frames, weight=1):
         """Process a single thread's frame stack."""
         if not frames:
             return
@@ -32,12 +32,12 @@ def _process_frames(self, frames):
             location = (frame.filename, lineno, frame.funcname)
             if location not in self._seen_locations:
                 self._seen_locations.add(location)
-                self.result[location]["cumulative_calls"] += 1
+                self.result[location]["cumulative_calls"] += weight
 
         # The top frame gets counted as an inline call (directly executing)
         top_lineno = extract_lineno(frames[0].location)
         top_location = (frames[0].filename, top_lineno, frames[0].funcname)
-        self.result[top_location]["direct_calls"] += 1
+        self.result[top_location]["direct_calls"] += weight
 
         # Track caller-callee relationships for call graph
         for i in range(1, len(frames)):
@@ -49,17 +49,12 @@ def _process_frames(self, frames):
             callee = (callee_frame.filename, callee_lineno, callee_frame.funcname)
             caller = (caller_frame.filename, caller_lineno, caller_frame.funcname)
 
-            self.callers[callee][caller] += 1
+            self.callers[callee][caller] += weight
 
-    def collect(self, stack_frames, timestamp_us=None):
-        if stack_frames and hasattr(stack_frames[0], "awaited_by"):
-            # Async frame processing
-            for frames, thread_id, task_id in self._iter_async_frames(stack_frames):
-                self._process_frames(frames)
-        else:
-            # Regular frame processing
-            for frames, thread_id in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle):
-                self._process_frames(frames)
+    def collect(self, stack_frames, timestamps_us=None):
+        weight = len(timestamps_us) if timestamps_us else 1
+        for frames, _ in self._iter_stacks(stack_frames, skip_idle=self.skip_idle):
+            self._process_frames(frames, weight=weight)
 
     def export(self, filename):
         self.create_stats()
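
For context, the collectors now accept a list of timestamps so that a replay driver can batch consecutive identical stacks (run-length encoding) into a single `collect()` call; the sketch below illustrates that idea with a stub collector. The `replay` helper, its argument shapes, and `_PrintCollector` are assumptions for illustration, not APIs added by this commit.

from itertools import groupby

def replay(collector, samples):
    """samples: iterable of (timestamp_us, stack_frames) pairs, in capture order."""
    for stack_frames, group in groupby(samples, key=lambda s: s[1]):
        timestamps = [ts for ts, _ in group]
        # One call per run of identical stacks; collectors weight by len().
        collector.collect(stack_frames, timestamps_us=timestamps)

class _PrintCollector:
    def collect(self, stack_frames, timestamps_us=None):
        print(stack_frames, "weight =", len(timestamps_us))

replay(_PrintCollector(), [(0, "A"), (10, "A"), (20, "B")])
# A weight = 2
# B weight = 1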
