From e047cea2804c987c32450279a9b8fd78655cfa9d Mon Sep 17 00:00:00 2001 From: Stan Lo Date: Mon, 20 Oct 2025 21:10:25 +0100 Subject: [PATCH 1/3] ZJIT: Optimize send with block into CCallWithFrame (#14863) Since `Send` has a block iseq, I updated `CCallWithFrame` to take an optional `blockiseq` as well, and then generate `CCallWithFrame` for `Send` when the condition is right. ## Stats `liquid-render` Benchmark | Metric | Before | After | Change | |----------------------|--------------------|--------------------|--------------------- | | send_no_profiles | 3,209,418 (34.1%) | 4,119 (0.1%) | -3,205,299 (-99.9%) | | dynamic_send_count | 9,410,758 (23.1%) | 6,459,678 (15.9%) | -2,951,080 (-31.4%) | | optimized_send_count | 31,269,388 (76.9%) | 34,220,474 (84.1%) | +2,951,086 (+9.4%) | `lobsters` Benchmark | Metric | Before | After | Change | |----------------------|------------|------------|---------------------| | send_no_profiles | 10,769,052 | 2,902,865 | -7,866,187 (-73.0%) | | dynamic_send_count | 45,673,185 | 42,880,160 | -2,793,025 (-6.1%) | | optimized_send_count | 75,142,407 | 78,378,514 | +3,236,107 (+4.3%) | ### `liquid-render` Before
``` Average of last 22, non-warmup iters: 262ms ***ZJIT: Printing ZJIT statistics on exit*** Top-20 not inlined C methods (96.9% of total 10,370,809): Kernel#respond_to?: 5,069,204 (48.9%) Hash#key?: 2,394,488 (23.1%) Set#include?: 778,429 ( 7.5%) String#===: 326,134 ( 3.1%) String#<<: 203,231 ( 2.0%) Integer#<<: 166,768 ( 1.6%) Kernel#is_a?: 164,272 ( 1.6%) Kernel#format: 124,262 ( 1.2%) Integer#/: 124,262 ( 1.2%) Array#<<: 115,325 ( 1.1%) Regexp.last_match: 94,862 ( 0.9%) Hash#[]=: 88,485 ( 0.9%) String#start_with?: 55,933 ( 0.5%) CGI::EscapeExt#escapeHTML: 55,471 ( 0.5%) Array#shift: 55,298 ( 0.5%) Regexp#===: 48,928 ( 0.5%) String#=~: 48,477 ( 0.5%) Array#unshift: 47,331 ( 0.5%) String#empty?: 42,870 ( 0.4%) Array#push: 41,215 ( 0.4%) Top-20 not annotated C methods (97.1% of total 10,394,421): Kernel#respond_to?: 5,069,204 (48.8%) Hash#key?: 2,394,488 (23.0%) Set#include?: 778,429 ( 7.5%) String#===: 326,134 ( 3.1%) Kernel#is_a?: 208,664 ( 2.0%) String#<<: 203,231 ( 2.0%) Integer#<<: 166,768 ( 1.6%) Integer#/: 124,262 ( 1.2%) Kernel#format: 124,262 ( 1.2%) Array#<<: 115,325 ( 1.1%) Regexp.last_match: 94,862 ( 0.9%) Hash#[]=: 88,485 ( 0.9%) String#start_with?: 55,933 ( 0.5%) CGI::EscapeExt#escapeHTML: 55,471 ( 0.5%) Array#shift: 55,298 ( 0.5%) Regexp#===: 48,928 ( 0.5%) String#=~: 48,477 ( 0.5%) Array#unshift: 47,331 ( 0.5%) String#empty?: 42,870 ( 0.4%) Array#push: 41,215 ( 0.4%) Top-2 not optimized method types for send (100.0% of total 2,382): cfunc: 1,196 (50.2%) iseq: 1,186 (49.8%) Top-4 not optimized method types for send_without_block (100.0% of total 2,561,006): iseq: 2,442,091 (95.4%) optimized: 118,882 ( 4.6%) alias: 20 ( 0.0%) null: 13 ( 0.0%) Top-9 not optimized instructions (100.0% of total 685,128): invokeblock: 227,376 (33.2%) opt_neq: 166,471 (24.3%) opt_and: 166,471 (24.3%) opt_eq: 66,721 ( 9.7%) invokesuper: 39,363 ( 5.7%) opt_le: 16,278 ( 2.4%) opt_minus: 1,574 ( 0.2%) opt_send_without_block: 772 ( 0.1%) opt_or: 102 ( 0.0%) Top-8 send fallback reasons (100.0% of total 9,410,758): send_no_profiles: 3,209,418 (34.1%) send_without_block_polymorphic: 2,858,558 (30.4%) send_without_block_not_optimized_method_type: 2,561,006 (27.2%) not_optimized_instruction: 685,128 ( 7.3%) send_without_block_no_profiles: 91,913 ( 1.0%) send_not_optimized_method_type: 2,382 ( 0.0%) obj_to_string_not_string: 2,352 ( 0.0%) send_without_block_cfunc_array_variadic: 1 ( 0.0%) Top-3 unhandled YARV insns (100.0% of total 83,682): getclassvariable: 83,431 (99.7%) once: 137 ( 0.2%) getconstant: 114 ( 0.1%) Top-3 compile error reasons (100.0% of total 5,431,910): register_spill_on_alloc: 4,665,393 (85.9%) exception_handler: 766,347 (14.1%) register_spill_on_ccall: 170 ( 0.0%) Top-11 side exit reasons (100.0% of total 14,635,508): compile_error: 5,431,910 (37.1%) guard_shape_failure: 3,436,341 (23.5%) guard_type_failure: 2,545,791 (17.4%) unhandled_splat: 2,162,907 (14.8%) unhandled_kwarg: 952,568 ( 6.5%) unhandled_yarv_insn: 83,682 ( 0.6%) unhandled_hir_insn: 19,112 ( 0.1%) patchpoint_stable_constant_names: 1,608 ( 0.0%) obj_to_string_fallback: 902 ( 0.0%) patchpoint_method_redefined: 599 ( 0.0%) block_param_proxy_not_iseq_or_ifunc: 88 ( 0.0%) send_count: 40,680,153 dynamic_send_count: 9,410,758 (23.1%) optimized_send_count: 31,269,395 (76.9%) iseq_optimized_send_count: 13,886,902 (34.1%) inline_cfunc_optimized_send_count: 7,011,684 (17.2%) non_variadic_cfunc_optimized_send_count: 4,670,333 (11.5%) variadic_cfunc_optimized_send_count: 5,700,476 (14.0%) dynamic_getivar_count: 1,144,613 dynamic_setivar_count: 950,830 compiled_iseq_count: 402 failed_iseq_count: 48 compile_time: 976ms profile_time: 3,223ms gc_time: 22ms invalidation_time: 0ms vm_write_pc_count: 37,744,491 vm_write_sp_count: 37,511,865 vm_write_locals_count: 37,511,865 vm_write_stack_count: 37,511,865 vm_write_to_parent_iseq_local_count: 558,177 vm_read_from_parent_iseq_local_count: 14,317,032 code_region_bytes: 2,211,840 side_exit_count: 14,635,508 total_insn_count: 476,097,972 vm_insn_count: 253,795,154 zjit_insn_count: 222,302,818 ratio_in_zjit: 46.7% ```
### `liquid-render` After
``` Average of last 21, non-warmup iters: 272ms ***ZJIT: Printing ZJIT statistics on exit*** Top-20 not inlined C methods (96.8% of total 10,093,966): Kernel#respond_to?: 4,932,224 (48.9%) Hash#key?: 2,329,928 (23.1%) Set#include?: 757,389 ( 7.5%) String#===: 317,494 ( 3.1%) String#<<: 197,831 ( 2.0%) Integer#<<: 162,268 ( 1.6%) Kernel#is_a?: 159,892 ( 1.6%) Kernel#format: 120,902 ( 1.2%) Integer#/: 120,902 ( 1.2%) Array#<<: 112,225 ( 1.1%) Regexp.last_match: 92,382 ( 0.9%) Hash#[]=: 86,145 ( 0.9%) String#start_with?: 54,953 ( 0.5%) Array#shift: 54,038 ( 0.5%) CGI::EscapeExt#escapeHTML: 53,971 ( 0.5%) Regexp#===: 47,848 ( 0.5%) String#=~: 47,237 ( 0.5%) Array#unshift: 46,051 ( 0.5%) String#empty?: 41,750 ( 0.4%) Array#push: 40,115 ( 0.4%) Top-20 not annotated C methods (97.1% of total 10,116,938): Kernel#respond_to?: 4,932,224 (48.8%) Hash#key?: 2,329,928 (23.0%) Set#include?: 757,389 ( 7.5%) String#===: 317,494 ( 3.1%) Kernel#is_a?: 203,084 ( 2.0%) String#<<: 197,831 ( 2.0%) Integer#<<: 162,268 ( 1.6%) Kernel#format: 120,902 ( 1.2%) Integer#/: 120,902 ( 1.2%) Array#<<: 112,225 ( 1.1%) Regexp.last_match: 92,382 ( 0.9%) Hash#[]=: 86,145 ( 0.9%) String#start_with?: 54,953 ( 0.5%) Array#shift: 54,038 ( 0.5%) CGI::EscapeExt#escapeHTML: 53,971 ( 0.5%) Regexp#===: 47,848 ( 0.5%) String#=~: 47,237 ( 0.5%) Array#unshift: 46,051 ( 0.5%) String#empty?: 41,750 ( 0.4%) Array#push: 40,115 ( 0.4%) Top-2 not optimized method types for send (100.0% of total 182,938): iseq: 178,414 (97.5%) cfunc: 4,524 ( 2.5%) Top-4 not optimized method types for send_without_block (100.0% of total 2,492,246): iseq: 2,376,511 (95.4%) optimized: 115,702 ( 4.6%) alias: 20 ( 0.0%) null: 13 ( 0.0%) Top-9 not optimized instructions (100.0% of total 667,727): invokeblock: 221,375 (33.2%) opt_neq: 161,971 (24.3%) opt_and: 161,971 (24.3%) opt_eq: 64,921 ( 9.7%) invokesuper: 39,243 ( 5.9%) opt_le: 15,838 ( 2.4%) opt_minus: 1,534 ( 0.2%) opt_send_without_block: 772 ( 0.1%) opt_or: 102 ( 0.0%) Top-9 send fallback reasons (100.0% of total 6,287,956): send_without_block_polymorphic: 2,782,058 (44.2%) send_without_block_not_optimized_method_type: 2,492,246 (39.6%) not_optimized_instruction: 667,727 (10.6%) send_not_optimized_method_type: 182,938 ( 2.9%) send_without_block_no_profiles: 89,613 ( 1.4%) send_polymorphic: 66,962 ( 1.1%) send_no_profiles: 4,059 ( 0.1%) obj_to_string_not_string: 2,352 ( 0.0%) send_without_block_cfunc_array_variadic: 1 ( 0.0%) Top-3 unhandled YARV insns (100.0% of total 81,482): getclassvariable: 81,231 (99.7%) once: 137 ( 0.2%) getconstant: 114 ( 0.1%) Top-3 compile error reasons (100.0% of total 5,286,310): register_spill_on_alloc: 4,540,413 (85.9%) exception_handler: 745,727 (14.1%) register_spill_on_ccall: 170 ( 0.0%) Top-12 side exit reasons (100.0% of total 14,244,881): compile_error: 5,286,310 (37.1%) guard_shape_failure: 3,346,873 (23.5%) guard_type_failure: 2,477,071 (17.4%) unhandled_splat: 2,104,447 (14.8%) unhandled_kwarg: 926,828 ( 6.5%) unhandled_yarv_insn: 81,482 ( 0.6%) unhandled_hir_insn: 18,672 ( 0.1%) patchpoint_stable_constant_names: 1,608 ( 0.0%) obj_to_string_fallback: 902 ( 0.0%) patchpoint_method_redefined: 599 ( 0.0%) block_param_proxy_not_iseq_or_ifunc: 88 ( 0.0%) interrupt: 1 ( 0.0%) send_count: 39,591,410 dynamic_send_count: 6,287,956 (15.9%) optimized_send_count: 33,303,454 (84.1%) iseq_optimized_send_count: 13,514,283 (34.1%) inline_cfunc_optimized_send_count: 6,823,745 (17.2%) non_variadic_cfunc_optimized_send_count: 7,417,432 (18.7%) variadic_cfunc_optimized_send_count: 5,547,994 (14.0%) dynamic_getivar_count: 1,110,647 dynamic_setivar_count: 927,309 compiled_iseq_count: 403 failed_iseq_count: 48 compile_time: 968ms profile_time: 3,547ms gc_time: 22ms invalidation_time: 0ms vm_write_pc_count: 36,735,108 vm_write_sp_count: 36,508,262 vm_write_locals_count: 36,508,262 vm_write_stack_count: 36,508,262 vm_write_to_parent_iseq_local_count: 543,097 vm_read_from_parent_iseq_local_count: 13,930,672 code_region_bytes: 2,228,224 side_exit_count: 14,244,881 total_insn_count: 463,357,969 vm_insn_count: 247,003,727 zjit_insn_count: 216,354,242 ratio_in_zjit: 46.7% ```
### `lobsters` Before
``` Average of last 10, non-warmup iters: 898ms ***ZJIT: Printing ZJIT statistics on exit*** Top-20 not inlined C methods (61.3% of total 19,495,906): String#<<: 1,764,437 ( 9.1%) Kernel#is_a?: 1,615,120 ( 8.3%) Hash#[]=: 1,159,455 ( 5.9%) Regexp#match?: 777,496 ( 4.0%) String#empty?: 722,953 ( 3.7%) Hash#key?: 685,258 ( 3.5%) Kernel#respond_to?: 602,017 ( 3.1%) TrueClass#===: 447,671 ( 2.3%) FalseClass#===: 439,276 ( 2.3%) Array#include?: 426,758 ( 2.2%) Kernel#block_given?: 405,271 ( 2.1%) Hash#fetch: 382,302 ( 2.0%) ObjectSpace::WeakKeyMap#[]: 356,654 ( 1.8%) String#start_with?: 353,793 ( 1.8%) Kernel#kind_of?: 340,341 ( 1.7%) Kernel#dup: 328,162 ( 1.7%) String.new: 306,667 ( 1.6%) String#==: 287,549 ( 1.5%) BasicObject#!=: 284,642 ( 1.5%) String#length: 256,070 ( 1.3%) Top-20 not annotated C methods (62.4% of total 19,796,172): Kernel#is_a?: 1,993,676 (10.1%) String#<<: 1,764,437 ( 8.9%) Hash#[]=: 1,159,634 ( 5.9%) Regexp#match?: 777,496 ( 3.9%) String#empty?: 738,030 ( 3.7%) Hash#key?: 685,258 ( 3.5%) Kernel#respond_to?: 602,017 ( 3.0%) TrueClass#===: 447,671 ( 2.3%) FalseClass#===: 439,276 ( 2.2%) Array#include?: 426,758 ( 2.2%) Kernel#block_given?: 425,813 ( 2.2%) Hash#fetch: 382,302 ( 1.9%) ObjectSpace::WeakKeyMap#[]: 356,654 ( 1.8%) String#start_with?: 353,793 ( 1.8%) Kernel#kind_of?: 340,375 ( 1.7%) Kernel#dup: 328,169 ( 1.7%) String.new: 306,667 ( 1.5%) String#==: 293,520 ( 1.5%) BasicObject#!=: 284,825 ( 1.4%) String#length: 256,070 ( 1.3%) Top-2 not optimized method types for send (100.0% of total 115,007): cfunc: 76,172 (66.2%) iseq: 38,835 (33.8%) Top-6 not optimized method types for send_without_block (100.0% of total 8,003,641): iseq: 3,999,211 (50.0%) bmethod: 1,750,271 (21.9%) optimized: 1,653,426 (20.7%) alias: 591,342 ( 7.4%) null: 8,174 ( 0.1%) cfunc: 1,217 ( 0.0%) Top-13 not optimized instructions (100.0% of total 7,590,826): invokesuper: 4,335,446 (57.1%) invokeblock: 1,329,215 (17.5%) sendforward: 841,463 (11.1%) opt_eq: 810,614 (10.7%) opt_plus: 141,773 ( 1.9%) opt_minus: 52,270 ( 0.7%) opt_send_without_block: 43,248 ( 0.6%) opt_neq: 15,047 ( 0.2%) opt_mult: 13,824 ( 0.2%) opt_or: 7,451 ( 0.1%) opt_lt: 348 ( 0.0%) opt_ge: 91 ( 0.0%) opt_gt: 36 ( 0.0%) Top-9 send fallback reasons (100.0% of total 45,673,212): send_without_block_polymorphic: 17,390,335 (38.1%) send_no_profiles: 10,769,053 (23.6%) send_without_block_not_optimized_method_type: 8,003,641 (17.5%) not_optimized_instruction: 7,590,826 (16.6%) send_without_block_no_profiles: 1,757,109 ( 3.8%) send_not_optimized_method_type: 115,007 ( 0.3%) send_without_block_cfunc_array_variadic: 31,149 ( 0.1%) obj_to_string_not_string: 15,518 ( 0.0%) send_without_block_direct_too_many_args: 574 ( 0.0%) Top-9 unhandled YARV insns (100.0% of total 1,242,228): expandarray: 622,203 (50.1%) checkkeyword: 316,111 (25.4%) getclassvariable: 120,540 ( 9.7%) getblockparam: 88,480 ( 7.1%) invokesuperforward: 78,842 ( 6.3%) opt_duparray_send: 14,149 ( 1.1%) getconstant: 1,588 ( 0.1%) checkmatch: 288 ( 0.0%) once: 27 ( 0.0%) Top-3 compile error reasons (100.0% of total 6,769,693): register_spill_on_alloc: 6,188,305 (91.4%) register_spill_on_ccall: 347,108 ( 5.1%) exception_handler: 234,280 ( 3.5%) Top-17 side exit reasons (100.0% of total 20,142,827): compile_error: 6,769,693 (33.6%) guard_type_failure: 5,169,050 (25.7%) guard_shape_failure: 3,726,362 (18.5%) unhandled_yarv_insn: 1,242,228 ( 6.2%) block_param_proxy_not_iseq_or_ifunc: 984,480 ( 4.9%) unhandled_kwarg: 800,154 ( 4.0%) unknown_newarray_send: 539,317 ( 2.7%) patchpoint_stable_constant_names: 340,283 ( 1.7%) unhandled_splat: 229,440 ( 1.1%) unhandled_hir_insn: 147,351 ( 0.7%) patchpoint_no_singleton_class: 128,856 ( 0.6%) patchpoint_method_redefined: 32,718 ( 0.2%) block_param_proxy_modified: 25,274 ( 0.1%) patchpoint_no_ep_escape: 7,559 ( 0.0%) obj_to_string_fallback: 24 ( 0.0%) guard_type_not_failure: 22 ( 0.0%) interrupt: 16 ( 0.0%) send_count: 120,815,640 dynamic_send_count: 45,673,212 (37.8%) optimized_send_count: 75,142,428 (62.2%) iseq_optimized_send_count: 32,188,039 (26.6%) inline_cfunc_optimized_send_count: 23,458,483 (19.4%) non_variadic_cfunc_optimized_send_count: 14,809,797 (12.3%) variadic_cfunc_optimized_send_count: 4,686,109 ( 3.9%) dynamic_getivar_count: 13,023,437 dynamic_setivar_count: 12,311,158 compiled_iseq_count: 4,806 failed_iseq_count: 466 compile_time: 8,943ms profile_time: 99ms gc_time: 45ms invalidation_time: 239ms vm_write_pc_count: 113,652,291 vm_write_sp_count: 111,209,623 vm_write_locals_count: 111,209,623 vm_write_stack_count: 111,209,623 vm_write_to_parent_iseq_local_count: 516,800 vm_read_from_parent_iseq_local_count: 11,225,587 code_region_bytes: 22,609,920 side_exit_count: 20,142,827 total_insn_count: 926,088,942 vm_insn_count: 297,636,255 zjit_insn_count: 628,452,687 ratio_in_zjit: 67.9% ```
### `lobsters` After
``` Average of last 10, non-warmup iters: 919ms ***ZJIT: Printing ZJIT statistics on exit*** Top-20 not inlined C methods (61.3% of total 19,495,868): String#<<: 1,764,437 ( 9.1%) Kernel#is_a?: 1,615,110 ( 8.3%) Hash#[]=: 1,159,455 ( 5.9%) Regexp#match?: 777,496 ( 4.0%) String#empty?: 722,953 ( 3.7%) Hash#key?: 685,258 ( 3.5%) Kernel#respond_to?: 602,016 ( 3.1%) TrueClass#===: 447,671 ( 2.3%) FalseClass#===: 439,276 ( 2.3%) Array#include?: 426,758 ( 2.2%) Kernel#block_given?: 405,271 ( 2.1%) Hash#fetch: 382,302 ( 2.0%) ObjectSpace::WeakKeyMap#[]: 356,654 ( 1.8%) String#start_with?: 353,793 ( 1.8%) Kernel#kind_of?: 340,341 ( 1.7%) Kernel#dup: 328,162 ( 1.7%) String.new: 306,667 ( 1.6%) String#==: 287,545 ( 1.5%) BasicObject#!=: 284,642 ( 1.5%) String#length: 256,070 ( 1.3%) Top-20 not annotated C methods (62.4% of total 19,796,134): Kernel#is_a?: 1,993,666 (10.1%) String#<<: 1,764,437 ( 8.9%) Hash#[]=: 1,159,634 ( 5.9%) Regexp#match?: 777,496 ( 3.9%) String#empty?: 738,030 ( 3.7%) Hash#key?: 685,258 ( 3.5%) Kernel#respond_to?: 602,016 ( 3.0%) TrueClass#===: 447,671 ( 2.3%) FalseClass#===: 439,276 ( 2.2%) Array#include?: 426,758 ( 2.2%) Kernel#block_given?: 425,813 ( 2.2%) Hash#fetch: 382,302 ( 1.9%) ObjectSpace::WeakKeyMap#[]: 356,654 ( 1.8%) String#start_with?: 353,793 ( 1.8%) Kernel#kind_of?: 340,375 ( 1.7%) Kernel#dup: 328,169 ( 1.7%) String.new: 306,667 ( 1.5%) String#==: 293,516 ( 1.5%) BasicObject#!=: 284,825 ( 1.4%) String#length: 256,070 ( 1.3%) Top-4 not optimized method types for send (100.0% of total 4,749,678): iseq: 2,563,391 (54.0%) cfunc: 2,064,888 (43.5%) alias: 118,577 ( 2.5%) null: 2,822 ( 0.1%) Top-6 not optimized method types for send_without_block (100.0% of total 8,003,641): iseq: 3,999,211 (50.0%) bmethod: 1,750,271 (21.9%) optimized: 1,653,426 (20.7%) alias: 591,342 ( 7.4%) null: 8,174 ( 0.1%) cfunc: 1,217 ( 0.0%) Top-13 not optimized instructions (100.0% of total 7,590,818): invokesuper: 4,335,442 (57.1%) invokeblock: 1,329,215 (17.5%) sendforward: 841,463 (11.1%) opt_eq: 810,610 (10.7%) opt_plus: 141,773 ( 1.9%) opt_minus: 52,270 ( 0.7%) opt_send_without_block: 43,248 ( 0.6%) opt_neq: 15,047 ( 0.2%) opt_mult: 13,824 ( 0.2%) opt_or: 7,451 ( 0.1%) opt_lt: 348 ( 0.0%) opt_ge: 91 ( 0.0%) opt_gt: 36 ( 0.0%) Top-10 send fallback reasons (100.0% of total 43,152,037): send_without_block_polymorphic: 17,390,322 (40.3%) send_without_block_not_optimized_method_type: 8,003,641 (18.5%) not_optimized_instruction: 7,590,818 (17.6%) send_not_optimized_method_type: 4,749,678 (11.0%) send_no_profiles: 2,893,666 ( 6.7%) send_without_block_no_profiles: 1,757,109 ( 4.1%) send_polymorphic: 719,562 ( 1.7%) send_without_block_cfunc_array_variadic: 31,149 ( 0.1%) obj_to_string_not_string: 15,518 ( 0.0%) send_without_block_direct_too_many_args: 574 ( 0.0%) Top-9 unhandled YARV insns (100.0% of total 1,242,215): expandarray: 622,203 (50.1%) checkkeyword: 316,111 (25.4%) getclassvariable: 120,540 ( 9.7%) getblockparam: 88,467 ( 7.1%) invokesuperforward: 78,842 ( 6.3%) opt_duparray_send: 14,149 ( 1.1%) getconstant: 1,588 ( 0.1%) checkmatch: 288 ( 0.0%) once: 27 ( 0.0%) Top-3 compile error reasons (100.0% of total 6,769,688): register_spill_on_alloc: 6,188,305 (91.4%) register_spill_on_ccall: 347,108 ( 5.1%) exception_handler: 234,275 ( 3.5%) Top-17 side exit reasons (100.0% of total 20,144,372): compile_error: 6,769,688 (33.6%) guard_type_failure: 5,169,204 (25.7%) guard_shape_failure: 3,726,374 (18.5%) unhandled_yarv_insn: 1,242,215 ( 6.2%) block_param_proxy_not_iseq_or_ifunc: 984,480 ( 4.9%) unhandled_kwarg: 800,154 ( 4.0%) unknown_newarray_send: 539,317 ( 2.7%) patchpoint_stable_constant_names: 340,283 ( 1.7%) unhandled_splat: 229,440 ( 1.1%) unhandled_hir_insn: 147,351 ( 0.7%) patchpoint_no_singleton_class: 130,252 ( 0.6%) patchpoint_method_redefined: 32,716 ( 0.2%) block_param_proxy_modified: 25,274 ( 0.1%) patchpoint_no_ep_escape: 7,559 ( 0.0%) obj_to_string_fallback: 24 ( 0.0%) guard_type_not_failure: 22 ( 0.0%) interrupt: 19 ( 0.0%) send_count: 120,812,030 dynamic_send_count: 43,152,037 (35.7%) optimized_send_count: 77,659,993 (64.3%) iseq_optimized_send_count: 32,187,900 (26.6%) inline_cfunc_optimized_send_count: 23,458,491 (19.4%) non_variadic_cfunc_optimized_send_count: 17,327,499 (14.3%) variadic_cfunc_optimized_send_count: 4,686,103 ( 3.9%) dynamic_getivar_count: 13,023,424 dynamic_setivar_count: 12,310,991 compiled_iseq_count: 4,806 failed_iseq_count: 466 compile_time: 9,012ms profile_time: 104ms gc_time: 44ms invalidation_time: 239ms vm_write_pc_count: 113,648,665 vm_write_sp_count: 111,205,997 vm_write_locals_count: 111,205,997 vm_write_stack_count: 111,205,997 vm_write_to_parent_iseq_local_count: 516,800 vm_read_from_parent_iseq_local_count: 11,225,587 code_region_bytes: 23,052,288 side_exit_count: 20,144,372 total_insn_count: 926,090,214 vm_insn_count: 297,647,811 zjit_insn_count: 628,442,403 ratio_in_zjit: 67.9% ```
--- insns.def | 1 + zjit/src/codegen.rs | 43 +++++- zjit/src/cruby_bindings.inc.rs | 53 +++---- zjit/src/hir.rs | 259 +++++++++++++++++++++++++++++++-- zjit/src/profile.rs | 2 +- 5 files changed, 315 insertions(+), 43 deletions(-) diff --git a/insns.def b/insns.def index 8225d1cceaf97e..ce358da28575ed 100644 --- a/insns.def +++ b/insns.def @@ -846,6 +846,7 @@ send (CALL_DATA cd, ISEQ blockiseq) (...) (VALUE val) +// attr bool zjit_profile = true; // attr rb_snum_t sp_inc = sp_inc_of_sendish(cd->ci); // attr rb_snum_t comptime_sp_inc = sp_inc_of_sendish(ci); { diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index 1f04e61dbc9757..87e0ed907a044d 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -411,7 +411,8 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio // Give up CCallWithFrame for 7+ args since asm.ccall() doesn't support it. Insn::CCallWithFrame { cd, state, args, .. } if args.len() > C_ARG_OPNDS.len() => gen_send_without_block(jit, asm, *cd, &function.frame_state(*state), SendFallbackReason::CCallWithFrameTooManyArgs), - Insn::CCallWithFrame { cfunc, args, cme, state, .. } => gen_ccall_with_frame(jit, asm, *cfunc, opnds!(args), *cme, &function.frame_state(*state)), + Insn::CCallWithFrame { cfunc, args, cme, state, blockiseq, .. } => + gen_ccall_with_frame(jit, asm, *cfunc, opnds!(args), *cme, *blockiseq, &function.frame_state(*state)), Insn::CCallVariadic { cfunc, recv, args, name: _, cme, state, return_type: _, elidable: _ } => { gen_ccall_variadic(jit, asm, *cfunc, opnd!(recv), opnds!(args), *cme, &function.frame_state(*state)) } @@ -673,20 +674,36 @@ fn gen_patch_point(jit: &mut JITState, asm: &mut Assembler, invariant: &Invarian } /// Generate code for a C function call that pushes a frame -fn gen_ccall_with_frame(jit: &mut JITState, asm: &mut Assembler, cfunc: *const u8, args: Vec, cme: *const rb_callable_method_entry_t, state: &FrameState) -> lir::Opnd { +fn gen_ccall_with_frame( + jit: &mut JITState, + asm: &mut Assembler, + cfunc: *const u8, + args: Vec, + cme: *const rb_callable_method_entry_t, + blockiseq: Option, + state: &FrameState, +) -> lir::Opnd { gen_incr_counter(asm, Counter::non_variadic_cfunc_optimized_send_count); - gen_prepare_non_leaf_call(jit, asm, state); + let caller_stack_size = state.stack_size() - args.len(); + + // Can't use gen_prepare_non_leaf_call() because we need to adjust the SP + // to account for the receiver and arguments (and block arguments if any) + gen_prepare_call_with_gc(asm, state, false); + gen_save_sp(asm, caller_stack_size); + gen_spill_stack(jit, asm, state); + gen_spill_locals(jit, asm, state); gen_push_frame(asm, args.len(), state, ControlFrame { recv: args[0], iseq: None, cme, frame_type: VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL, + block_iseq: blockiseq, }); asm_comment!(asm, "switch to new SP register"); - let sp_offset = (state.stack().len() - args.len() + VM_ENV_DATA_SIZE.as_usize()) * SIZEOF_VALUE; + let sp_offset = (caller_stack_size + VM_ENV_DATA_SIZE.as_usize()) * SIZEOF_VALUE; let new_sp = asm.add(SP, sp_offset.into()); asm.mov(SP, new_sp); @@ -738,6 +755,7 @@ fn gen_ccall_variadic( iseq: None, cme, frame_type: VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL, + block_iseq: None, }); asm_comment!(asm, "switch to new SP register"); @@ -1130,6 +1148,7 @@ fn gen_send_without_block_direct( iseq: Some(iseq), cme, frame_type: VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL, + block_iseq: None, }); asm_comment!(asm, "switch to new SP register"); @@ -1719,6 +1738,7 @@ struct ControlFrame { iseq: Option, cme: *const rb_callable_method_entry_t, frame_type: u32, + block_iseq: Option, } /// Compile an interpreter frame @@ -1735,9 +1755,20 @@ fn gen_push_frame(asm: &mut Assembler, argc: usize, state: &FrameState, frame: C }; let ep_offset = state.stack().len() as i32 + local_size - argc as i32 + VM_ENV_DATA_SIZE as i32 - 1; asm.store(Opnd::mem(64, SP, (ep_offset - 2) * SIZEOF_VALUE_I32), VALUE::from(frame.cme).into()); + + let block_handler_opnd = if let Some(block_iseq) = frame.block_iseq { + // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block(). + // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self, rb_captured_block->code.iseq aliases + // with cfp->block_code. + asm.store(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_BLOCK_CODE), VALUE::from(block_iseq).into()); + let cfp_self_addr = asm.lea(Opnd::mem(64, CFP, RUBY_OFFSET_CFP_SELF)); + asm.or(cfp_self_addr, Opnd::Imm(1)) + } else { + VM_BLOCK_HANDLER_NONE.into() + }; + // ep[-1]: block_handler or prev EP - // block_handler is not supported for now - asm.store(Opnd::mem(64, SP, (ep_offset - 1) * SIZEOF_VALUE_I32), VM_BLOCK_HANDLER_NONE.into()); + asm.store(Opnd::mem(64, SP, (ep_offset - 1) * SIZEOF_VALUE_I32), block_handler_opnd); // ep[0]: ENV_FLAGS asm.store(Opnd::mem(64, SP, ep_offset * SIZEOF_VALUE_I32), frame.frame_type.into()); diff --git a/zjit/src/cruby_bindings.inc.rs b/zjit/src/cruby_bindings.inc.rs index c67e229a8009e7..af604661b299b3 100644 --- a/zjit/src/cruby_bindings.inc.rs +++ b/zjit/src/cruby_bindings.inc.rs @@ -680,32 +680,33 @@ pub const YARVINSN_trace_setlocal_WC_1: ruby_vminsn_type = 215; pub const YARVINSN_trace_putobject_INT2FIX_0_: ruby_vminsn_type = 216; pub const YARVINSN_trace_putobject_INT2FIX_1_: ruby_vminsn_type = 217; pub const YARVINSN_zjit_getinstancevariable: ruby_vminsn_type = 218; -pub const YARVINSN_zjit_opt_send_without_block: ruby_vminsn_type = 219; -pub const YARVINSN_zjit_objtostring: ruby_vminsn_type = 220; -pub const YARVINSN_zjit_opt_nil_p: ruby_vminsn_type = 221; -pub const YARVINSN_zjit_opt_plus: ruby_vminsn_type = 222; -pub const YARVINSN_zjit_opt_minus: ruby_vminsn_type = 223; -pub const YARVINSN_zjit_opt_mult: ruby_vminsn_type = 224; -pub const YARVINSN_zjit_opt_div: ruby_vminsn_type = 225; -pub const YARVINSN_zjit_opt_mod: ruby_vminsn_type = 226; -pub const YARVINSN_zjit_opt_eq: ruby_vminsn_type = 227; -pub const YARVINSN_zjit_opt_neq: ruby_vminsn_type = 228; -pub const YARVINSN_zjit_opt_lt: ruby_vminsn_type = 229; -pub const YARVINSN_zjit_opt_le: ruby_vminsn_type = 230; -pub const YARVINSN_zjit_opt_gt: ruby_vminsn_type = 231; -pub const YARVINSN_zjit_opt_ge: ruby_vminsn_type = 232; -pub const YARVINSN_zjit_opt_ltlt: ruby_vminsn_type = 233; -pub const YARVINSN_zjit_opt_and: ruby_vminsn_type = 234; -pub const YARVINSN_zjit_opt_or: ruby_vminsn_type = 235; -pub const YARVINSN_zjit_opt_aref: ruby_vminsn_type = 236; -pub const YARVINSN_zjit_opt_aset: ruby_vminsn_type = 237; -pub const YARVINSN_zjit_opt_length: ruby_vminsn_type = 238; -pub const YARVINSN_zjit_opt_size: ruby_vminsn_type = 239; -pub const YARVINSN_zjit_opt_empty_p: ruby_vminsn_type = 240; -pub const YARVINSN_zjit_opt_succ: ruby_vminsn_type = 241; -pub const YARVINSN_zjit_opt_not: ruby_vminsn_type = 242; -pub const YARVINSN_zjit_opt_regexpmatch2: ruby_vminsn_type = 243; -pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 244; +pub const YARVINSN_zjit_send: ruby_vminsn_type = 219; +pub const YARVINSN_zjit_opt_send_without_block: ruby_vminsn_type = 220; +pub const YARVINSN_zjit_objtostring: ruby_vminsn_type = 221; +pub const YARVINSN_zjit_opt_nil_p: ruby_vminsn_type = 222; +pub const YARVINSN_zjit_opt_plus: ruby_vminsn_type = 223; +pub const YARVINSN_zjit_opt_minus: ruby_vminsn_type = 224; +pub const YARVINSN_zjit_opt_mult: ruby_vminsn_type = 225; +pub const YARVINSN_zjit_opt_div: ruby_vminsn_type = 226; +pub const YARVINSN_zjit_opt_mod: ruby_vminsn_type = 227; +pub const YARVINSN_zjit_opt_eq: ruby_vminsn_type = 228; +pub const YARVINSN_zjit_opt_neq: ruby_vminsn_type = 229; +pub const YARVINSN_zjit_opt_lt: ruby_vminsn_type = 230; +pub const YARVINSN_zjit_opt_le: ruby_vminsn_type = 231; +pub const YARVINSN_zjit_opt_gt: ruby_vminsn_type = 232; +pub const YARVINSN_zjit_opt_ge: ruby_vminsn_type = 233; +pub const YARVINSN_zjit_opt_ltlt: ruby_vminsn_type = 234; +pub const YARVINSN_zjit_opt_and: ruby_vminsn_type = 235; +pub const YARVINSN_zjit_opt_or: ruby_vminsn_type = 236; +pub const YARVINSN_zjit_opt_aref: ruby_vminsn_type = 237; +pub const YARVINSN_zjit_opt_aset: ruby_vminsn_type = 238; +pub const YARVINSN_zjit_opt_length: ruby_vminsn_type = 239; +pub const YARVINSN_zjit_opt_size: ruby_vminsn_type = 240; +pub const YARVINSN_zjit_opt_empty_p: ruby_vminsn_type = 241; +pub const YARVINSN_zjit_opt_succ: ruby_vminsn_type = 242; +pub const YARVINSN_zjit_opt_not: ruby_vminsn_type = 243; +pub const YARVINSN_zjit_opt_regexpmatch2: ruby_vminsn_type = 244; +pub const VM_INSTRUCTION_SIZE: ruby_vminsn_type = 245; pub type ruby_vminsn_type = u32; pub type rb_iseq_callback = ::std::option::Option< unsafe extern "C" fn(arg1: *const rb_iseq_t, arg2: *mut ::std::os::raw::c_void), diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index 370ed568579e0c..1f77f38dc8872b 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -668,6 +668,7 @@ pub enum Insn { state: InsnId, return_type: Type, elidable: bool, + blockiseq: Option, }, /// Call a variadic C function with signature: func(int argc, VALUE *argv, VALUE recv) @@ -1063,11 +1064,14 @@ impl<'a> std::fmt::Display for InsnPrinter<'a> { } Ok(()) }, - Insn::CCallWithFrame { cfunc, args, name, .. } => { + Insn::CCallWithFrame { cfunc, args, name, blockiseq, .. } => { write!(f, "CCallWithFrame {}@{:p}", name.contents_lossy(), self.ptr_map.map_ptr(cfunc))?; for arg in args { write!(f, ", {arg}")?; } + if let Some(blockiseq) = blockiseq { + write!(f, ", block={:p}", self.ptr_map.map_ptr(blockiseq))?; + } Ok(()) }, Insn::CCallVariadic { cfunc, recv, args, name, .. } => { @@ -1598,7 +1602,17 @@ impl Function { &ObjectAlloc { val, state } => ObjectAlloc { val: find!(val), state }, &ObjectAllocClass { class, state } => ObjectAllocClass { class, state: find!(state) }, &CCall { cfunc, ref args, name, return_type, elidable } => CCall { cfunc, args: find_vec!(args), name, return_type, elidable }, - &CCallWithFrame { cd, cfunc, ref args, cme, name, state, return_type, elidable } => CCallWithFrame { cd, cfunc, args: find_vec!(args), cme, name, state: find!(state), return_type, elidable }, + &CCallWithFrame { cd, cfunc, ref args, cme, name, state, return_type, elidable, blockiseq } => CCallWithFrame { + cd, + cfunc, + args: find_vec!(args), + cme, + name, + state: find!(state), + return_type, + elidable, + blockiseq, + }, &CCallVariadic { cfunc, recv, ref args, cme, name, state, return_type, elidable } => CCallVariadic { cfunc, recv: find!(recv), args: find_vec!(args), cme, name, state, return_type, elidable }, @@ -2134,7 +2148,7 @@ impl Function { } } // This doesn't actually optimize Send yet, just replaces the fallback reason to be more precise. - // TODO: Optimize Send + // The actual optimization is done in reduce_send_to_ccall. Insn::Send { recv, cd, state, .. } => { let frame_state = self.frame_state(state); let klass = if let Some(klass) = self.type_of(recv).runtime_exact_ruby_class() { @@ -2338,8 +2352,111 @@ impl Function { fun.push_insn(block, Insn::PatchPoint { invariant: Invariant::MethodRedefined { klass: recv_class, method: method_id, cme: method }, state }); } - // Try to reduce one SendWithoutBlock to a CCall - fn reduce_to_ccall( + // Try to reduce a Send insn to a CCallWithFrame + fn reduce_send_to_ccall( + fun: &mut Function, + block: BlockId, + self_type: Type, + send: Insn, + send_insn_id: InsnId, + ) -> Result<(), ()> { + let Insn::Send { mut recv, cd, blockiseq, mut args, state, .. } = send else { + return Err(()); + }; + + let call_info = unsafe { (*cd).ci }; + let argc = unsafe { vm_ci_argc(call_info) }; + let method_id = unsafe { rb_vm_ci_mid(call_info) }; + + // If we have info about the class of the receiver + let (recv_class, profiled_type) = if let Some(class) = self_type.runtime_exact_ruby_class() { + (class, None) + } else { + let iseq_insn_idx = fun.frame_state(state).insn_idx; + let Some(recv_type) = fun.profiled_type_of_at(recv, iseq_insn_idx) else { return Err(()) }; + (recv_type.class(), Some(recv_type)) + }; + + // Do method lookup + let method: *const rb_callable_method_entry_struct = unsafe { rb_callable_method_entry(recv_class, method_id) }; + if method.is_null() { + return Err(()); + } + + // Filter for C methods + let def_type = unsafe { get_cme_def_type(method) }; + if def_type != VM_METHOD_TYPE_CFUNC { + return Err(()); + } + + // Find the `argc` (arity) of the C method, which describes the parameters it expects + let cfunc = unsafe { get_cme_def_body_cfunc(method) }; + let cfunc_argc = unsafe { get_mct_argc(cfunc) }; + match cfunc_argc { + 0.. => { + // (self, arg0, arg1, ..., argc) form + // + // Bail on argc mismatch + if argc != cfunc_argc as u32 { + return Err(()); + } + + let ci_flags = unsafe { vm_ci_flag(call_info) }; + + // When seeing &block argument, fall back to dynamic dispatch for now + // TODO: Support block forwarding + if ci_flags & VM_CALL_ARGS_BLOCKARG != 0 { + return Err(()); + } + + // Commit to the replacement. Put PatchPoint. + gen_patch_points_for_optimized_ccall(fun, block, recv_class, method_id, method, state); + if recv_class.instance_can_have_singleton_class() { + fun.push_insn(block, Insn::PatchPoint { invariant: Invariant::NoSingletonClass { klass: recv_class }, state }); + } + + if let Some(profiled_type) = profiled_type { + // Guard receiver class + recv = fun.push_insn(block, Insn::GuardType { val: recv, guard_type: Type::from_profiled_type(profiled_type), state }); + fun.insn_types[recv.0] = fun.infer_type(recv); + } + + let blockiseq = if blockiseq.is_null() { None } else { Some(blockiseq) }; + + // Emit a call + let cfunc = unsafe { get_mct_func(cfunc) }.cast(); + let mut cfunc_args = vec![recv]; + cfunc_args.append(&mut args); + + let ccall = fun.push_insn(block, Insn::CCallWithFrame { + cd, + cfunc, + args: cfunc_args, + cme: method, + name: method_id, + state, + return_type: types::BasicObject, + elidable: false, + blockiseq, + }); + fun.make_equal_to(send_insn_id, ccall); + return Ok(()); + } + // Variadic method + -1 => { + // func(int argc, VALUE *argv, VALUE recv) + return Err(()); + } + -2 => { + // (self, args_ruby_array) + return Err(()); + } + _ => unreachable!("unknown cfunc kind: argc={argc}") + } + } + + // Try to reduce a SendWithoutBlock insn to a CCall/CCallWithFrame + fn reduce_send_without_block_to_ccall( fun: &mut Function, block: BlockId, self_type: Type, @@ -2440,7 +2557,17 @@ impl Function { if get_option!(stats) { count_not_inlined_cfunc(fun, block, method); } - let ccall = fun.push_insn(block, Insn::CCallWithFrame { cd, cfunc, args: cfunc_args, cme: method, name: method_id, state, return_type, elidable }); + let ccall = fun.push_insn(block, Insn::CCallWithFrame { + cd, + cfunc, + args: cfunc_args, + cme: method, + name: method_id, + state, + return_type, + elidable, + blockiseq: None, + }); fun.make_equal_to(send_insn_id, ccall); } @@ -2555,11 +2682,21 @@ impl Function { let old_insns = std::mem::take(&mut self.blocks[block.0].insns); assert!(self.blocks[block.0].insns.is_empty()); for insn_id in old_insns { - if let send @ Insn::SendWithoutBlock { recv, .. } = self.find(insn_id) { - let recv_type = self.type_of(recv); - if reduce_to_ccall(self, block, recv_type, send, insn_id).is_ok() { - continue; + let send = self.find(insn_id); + match send { + send @ Insn::SendWithoutBlock { recv, .. } => { + let recv_type = self.type_of(recv); + if reduce_send_without_block_to_ccall(self, block, recv_type, send, insn_id).is_ok() { + continue; + } + } + send @ Insn::Send { recv, .. } => { + let recv_type = self.type_of(recv); + if reduce_send_to_ccall(self, block, recv_type, send, insn_id).is_ok() { + continue; + } } + _ => {} } self.push_insn_id(block, insn_id); } @@ -12583,6 +12720,108 @@ mod opt_tests { "); } + #[test] + fn test_optimize_send_with_block() { + eval(r#" + def test = [1, 2, 3].map { |x| x * 2 } + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + Jump bb2(v1) + bb1(v4:BasicObject): + EntryPoint JIT(0) + Jump bb2(v4) + bb2(v6:BasicObject): + v10:ArrayExact[VALUE(0x1000)] = Const Value(VALUE(0x1000)) + v12:ArrayExact = ArrayDup v10 + PatchPoint MethodRedefined(Array@0x1008, map@0x1010, cme:0x1018) + PatchPoint NoSingletonClass(Array@0x1008) + v23:BasicObject = CCallWithFrame map@0x1040, v12, block=0x1048 + CheckInterrupts + Return v23 + "); + } + + #[test] + fn test_do_not_optimize_send_variadic_with_block() { + eval(r#" + def test = [1, 2, 3].index { |x| x == 2 } + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + Jump bb2(v1) + bb1(v4:BasicObject): + EntryPoint JIT(0) + Jump bb2(v4) + bb2(v6:BasicObject): + v10:ArrayExact[VALUE(0x1000)] = Const Value(VALUE(0x1000)) + v12:ArrayExact = ArrayDup v10 + v14:BasicObject = Send v12, 0x1008, :index + CheckInterrupts + Return v14 + "); + } + + #[test] + fn test_do_not_optimize_send_with_block_forwarding() { + eval(r#" + def test(&block) = [].map(&block) + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:2: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + v2:BasicObject = GetLocal l0, SP@4 + Jump bb2(v1, v2) + bb1(v5:BasicObject, v6:BasicObject): + EntryPoint JIT(0) + Jump bb2(v5, v6) + bb2(v8:BasicObject, v9:BasicObject): + v14:ArrayExact = NewArray + GuardBlockParamProxy l0 + v17:HeapObject[BlockParamProxy] = Const Value(VALUE(0x1000)) + v19:BasicObject = Send v14, 0x1008, :map, v17 + CheckInterrupts + Return v19 + "); + } + + #[test] + fn test_do_not_optimize_send_to_iseq_method_with_block() { + eval(r#" + def foo + yield 1 + end + + def test = foo {} + test; test + "#); + assert_snapshot!(hir_string("test"), @r" + fn test@:6: + bb0(): + EntryPoint interpreter + v1:BasicObject = LoadSelf + Jump bb2(v1) + bb1(v4:BasicObject): + EntryPoint JIT(0) + Jump bb2(v4) + bb2(v6:BasicObject): + v11:BasicObject = Send v6, 0x1000, :foo + CheckInterrupts + Return v11 + "); + } + #[test] fn test_inline_attr_reader_constant() { eval(" diff --git a/zjit/src/profile.rs b/zjit/src/profile.rs index e935ec9731f383..a6c837df5a48ff 100644 --- a/zjit/src/profile.rs +++ b/zjit/src/profile.rs @@ -83,7 +83,7 @@ fn profile_insn(bare_opcode: ruby_vminsn_type, ec: EcPtr) { YARVINSN_opt_length => profile_operands(profiler, profile, 1), YARVINSN_opt_size => profile_operands(profiler, profile, 1), YARVINSN_opt_succ => profile_operands(profiler, profile, 1), - YARVINSN_opt_send_without_block => { + YARVINSN_opt_send_without_block | YARVINSN_send => { let cd: *const rb_call_data = profiler.insn_opnd(0).as_ptr(); let argc = unsafe { vm_ci_argc((*cd).ci) }; // Profile all the arguments and self (+1). From 6e9f7974df5608dd74b5a107658ac944ec05f8d0 Mon Sep 17 00:00:00 2001 From: Luke Gruber Date: Wed, 17 Sep 2025 16:30:41 -0400 Subject: [PATCH 2/3] [DOC] Create doc/contributing/concurrency_guide.md This guide is for those that want to contribute to ruby but don't understand where they need to use locks or other concurrency mechanisms. It teaches them how to use these locks safely and what is prohibited in certain circumstances. --- doc/contributing/concurrency_guide.md | 154 ++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 doc/contributing/concurrency_guide.md diff --git a/doc/contributing/concurrency_guide.md b/doc/contributing/concurrency_guide.md new file mode 100644 index 00000000000000..1fb58f7203ad8f --- /dev/null +++ b/doc/contributing/concurrency_guide.md @@ -0,0 +1,154 @@ +# Concurrency Guide + +This is a guide to thinking about concurrency in the cruby source code, whether that's contributing to Ruby +by writing C or by contributing to one of the JITs. This does not touch on native extensions, only the core +language. It will go over: + +* What needs synchronizing? +* How to use the VM lock, and what you can and can't do when you've acquired this lock. +* What you can and can't do when you've acquired other native locks. +* The difference between the VM lock and the GVL. +* What a VM barrier is and when to use it. +* The lock ordering of some important locks. +* How ruby interrupt handling works. +* The timer thread and what it's responsible for. + +## What needs synchronizing? + +Before ractors, only one ruby thread could run at once. That didn't mean you could forget about concurrency issues, though. The timer thread +is a native thread that interacts with other ruby threads and changes some VM internals, so if these changes can be done in parallel by both the timer +thread and a ruby thread, they need to be synchronized. + +When you add ractors to the mix, it gets more complicated. However, ractors allow you to forget about synchronization for non-shareable objects because +they aren't used across ractors. Only one ruby thread can touch the object at once. For shareable objects, they are deeply frozen so there isn't any +mutation on the objects themselves. However, something like reading/writing constants across ractors does need to be synchronized. In this case, ruby threads need to +see a consistent view of the VM. If publishing the update takes 2 steps or even two separate instructions, like in this case, synchronization is required. + +Most synchronization is to protect VM internals. These internals include structures for the thread scheduler on each ractor, the global ractor scheduler, the +coordination between ruby threads and ractors, global tables (for `fstrings`, encodings, symbols and global vars), etc. Anything that can be mutated by a ractor +that can also be read or mutated by another ractor at the same time requires proper synchronization. + +## The VM Lock + +There's only one VM lock and it is for critical sections that can only be entered by one ractor at a time. +Without ractors, the VM lock is useless. It does not stop all ractors from running, as ractors can run +without trying to acquire this lock. If you're updating global (shared) data between ractors and aren't using +atomics, you need to use a lock and this is a convenient one to use. Unlike other locks, you can allocate ruby-managed +memory with it held. When you take the VM lock, there are things you can and can't do during your critical section: + +You can (as long as no other locks are also held before the VM lock): + +* Create ruby objects, call `ruby_xmalloc`, etc. + +You can't: + +* Context switch to another ruby thread or ractor. This is important, as many things can cause ruby-level context switches including: + + * Calling any ruby method through, for example, `rb_funcall`. If you execute ruby code, a context switch could happen. + This also applies to ruby methods defined in C, as they can be redefined in Ruby. Things that call ruby methods such as + `rb_obj_respond_to` are also disallowed. + + * Calling `rb_raise`. This will call `initialize` on the new exception object. With the VM lock + held, nothing you call should be able to raise an exception. `NoMemoryError` is allowed, however. + + * Calling `rb_nogvl` or a ruby-level mechanism that can context switch like `rb_mutex_lock`. + + * Enter any blocking operation managed by ruby. This will context switch to another ruby thread using `rb_nogvl` or + something equivalent. A blocking operation is one that blocks the thread's progress, such as `sleep` or `IO#read`. + +Internally, the VM lock is the `vm->ractor.sync.lock`. + +You need to be on a ruby thread to take the VM lock. You also can't take it inside any functions that could be called during sweeping, as MMTK sweeps +on another thread and you need a valid `ec` to grab the lock. For this same reason (among others), you can't take it from the timer thread either. + +## Other Locks + +All native locks that aren't the VM lock share a more strict set of rules for what's allowed during the critical section. By native locks, we mean +anything that uses `rb_native_mutex_lock`. Some important locks include the `interrupt_lock`, the ractor scheduling lock (protects global scheduling data structures), +the thread scheduling lock (local to each ractor, protects per-ractor scheduling data structures) and the ractor lock (local to each ractor, protects ractor data structures). + +When you acquire one of these locks, + +You can: + +* Allocate memory though non-ruby allocation such as raw `malloc` or the standard library. But be careful, some functions like `strdup` use +ruby allocation through the use of macros! + +* Use `ccan` lists, as they don't allocate. + +* Do the usual things like set variables or struct fields, manipulate linked lists, signal condition variables etc. + +You can't: + +* Allocate ruby-managed memory. This includes creating ruby objects or using `ruby_xmalloc` or `st_insert`. The reason this +is disallowed is if that allocation causes a GC, then all other ruby threads must join a VM barrier as soon as possible +(when they next check interrupts or acquire the VM lock). This is so that no other ractors are running during GC. If a ruby thread +is waiting (blocked) on this same native lock, it can't join the barrier and a deadlock occurs because the barrier will never finish. + +* Raise exceptions. You also can't use `EC_JUMP_TAG` if it jumps out of the critical section. + +* Context switch. See the `VM Lock` section for more info. + +## Difference Between VM Lock and GVL + +The VM Lock is a particular lock in the source code. There is only one VM Lock. The GVL, on the other hand, is more of a combination of locks. +It is "acquired" when a ruby thread is about to run or is running. Since many ruby threads can run at the same time if they're in different ractors, +there are many GVLs (1 per `SNT` + 1 for the main ractor). It can no longer be thought of as a "Global VM Lock" like it once was before ractors. + +## VM Barriers + +Sometimes, taking the VM Lock isn't enough and you need a guarantee that all ractors have stopped. This happens when running `GC`, for instance. +To get a barrier, you take the VM Lock and call `rb_vm_barrier()`. For the duration that the VM lock is held, no other ractors will be running. It's not used +often as taking a barrier slows ractor performance down considerably, but it's useful to know about and is sometimes the only solution. + +## Lock Orderings + +It's a good idea to not hold more than 2 locks at once on the same thread. Locking multiple locks can introduce deadlocks, so do it with care. When locking +multiple locks at once, follow an ordering that is consistent across the program, otherwise you can introduce deadlocks. Here are the orderings of some important locks: + +* VM lock before ractor_sched_lock +* thread_sched_lock before ractor_sched_lock +* interrupt_lock before timer_th.waiting_lock +* timer_th.waiting_lock before ractor_sched_lock + +These orderings are subject to change, so check the source if you're not sure. On top of this: + +* During each `ubf` (unblock) function, the VM lock can be taken around it in some circumstances. This happens during VM shutdown, for example. +See the "Interrupt Handling" section for more details. + +## Ruby Interrupt Handling + +When the VM runs ruby code, ruby's threads intermittently check ruby-level interrupts. These software interrupts +are for various things in ruby and they can be set by other ruby threads or the timer thread. + +* Ruby threads check when they should give up their timeslice. The native thread switches to another ruby thread when their time is up. +* The timer thread sends a "trap" interrupt to the main thread if any ruby-level signal handlers are pending. +* Ruby threads can have other ruby threads run tasks for them by sending them an interrupt. For instance, ractors send +the main thread an interrupt when they need to `require` a file so that it's done on the main thread. They wait for the +main thread's result. +* During VM shutdown, a "terminate" interrupt is sent to all ractor main threads top stop them asap. +* When calling `Thread#raise`, the caller sends an interrupt to that thread telling it which exception to raise. +* Unlocking a mutex sends the next waiter (if any) an interrupt telling it to grab the lock. +* Signalling or broadcasting on a condition variable tells the waiter(s) to wake up. + +This isn't a complete list. + +When sending an interrupt to a ruby thread, the ruby thread can be blocked. For example, it could be in the middle of a `TCPSocket#read` call. If so, +the receiving thread's `ubf` (unblock function) gets called from the thread (ruby thread or timer thread) that sent the interrupt. +Each ruby thread has a `ubf` that is set when it enters a blocking operation and is unset after returning from it. By default, this `ubf` function sends a +`SIGVTALRM` to the receiving thread to try to unblock it from the kernel so it can check its interrupts. There are other `ubfs` that +aren't associated with a syscall, such as when calling `Ractor#join` or `sleep`. All `ubfs` are called with the `interrupt_lock` held, +so take that into account when using locks inside `ubfs`. + +Remember, `ubfs` can be called from the timer thread so you cannot assume an `ec` inside them. The `ec` (execution context) is only set on ruby threads. + +## The Timer Thread + +The timer thread has a few functions. They are: + +* Send interrupts to ruby threads that have run for their whole timeslice. +* Wake up M:N ruby threads (threads in non-main ractors) blocked on IO or after a specified timeout. This +uses `kqueue` or `epoll`, depending on the OS, to receive IO events on behalf of the threads. +* Continue calling the `SIGVTARLM` signal if a thread is still blocked on a syscall after the first `ubf` call. +* Signal native threads (`SNT`) waiting on a ractor if there are ractors waiting in the global run queue. +* Create more `SNT`s if some are blocked, like on IO or on `Ractor#join`. From 17368234bfd0b37c58a4b694d764d42e2cad8880 Mon Sep 17 00:00:00 2001 From: Max Bernstein Date: Mon, 20 Oct 2025 17:30:48 -0400 Subject: [PATCH 3/3] ZJIT: Implement codegen for FixnumMod (#14857) This is mostly to see what happens to the loops-times benchmark. --- depend | 2 ++ jit.c | 7 +++++++ yjit.c | 6 ------ yjit/bindgen/src/main.rs | 2 +- yjit/src/cruby.rs | 2 +- yjit/src/cruby_bindings.inc.rs | 2 +- zjit/src/codegen.rs | 9 ++++++++- zjit/src/cruby.rs | 2 ++ zjit/src/hir.rs | 1 + zjit/src/stats.rs | 2 ++ 10 files changed, 25 insertions(+), 10 deletions(-) diff --git a/depend b/depend index 63e73a56390056..1f9f0c31eba695 100644 --- a/depend +++ b/depend @@ -7382,8 +7382,10 @@ jit.$(OBJEXT): $(CCAN_DIR)/str/str.h jit.$(OBJEXT): $(hdrdir)/ruby/ruby.h jit.$(OBJEXT): $(top_srcdir)/internal/array.h jit.$(OBJEXT): $(top_srcdir)/internal/basic_operators.h +jit.$(OBJEXT): $(top_srcdir)/internal/bits.h jit.$(OBJEXT): $(top_srcdir)/internal/class.h jit.$(OBJEXT): $(top_srcdir)/internal/compilers.h +jit.$(OBJEXT): $(top_srcdir)/internal/fixnum.h jit.$(OBJEXT): $(top_srcdir)/internal/gc.h jit.$(OBJEXT): $(top_srcdir)/internal/imemo.h jit.$(OBJEXT): $(top_srcdir)/internal/namespace.h diff --git a/jit.c b/jit.c index 0b491f0481d875..b7cb05d1c34efd 100644 --- a/jit.c +++ b/jit.c @@ -14,6 +14,7 @@ #include "iseq.h" #include "internal/gc.h" #include "vm_sync.h" +#include "internal/fixnum.h" // Field offsets for the RObject struct enum robject_offsets { @@ -720,3 +721,9 @@ rb_jit_icache_invalidate(void *start, void *end) #error No instruction cache clear available with this compiler on Aarch64! #endif } + +VALUE +rb_jit_fix_mod_fix(VALUE recv, VALUE obj) +{ + return rb_fix_mod_fix(recv, obj); +} diff --git a/yjit.c b/yjit.c index 598fe5716704d0..d0ab367b1c7bb1 100644 --- a/yjit.c +++ b/yjit.c @@ -332,12 +332,6 @@ rb_yjit_fix_div_fix(VALUE recv, VALUE obj) return rb_fix_div_fix(recv, obj); } -VALUE -rb_yjit_fix_mod_fix(VALUE recv, VALUE obj) -{ - return rb_fix_mod_fix(recv, obj); -} - // Return non-zero when `obj` is an array and its last item is a // `ruby2_keywords` hash. We don't support this kind of splat. size_t diff --git a/yjit/bindgen/src/main.rs b/yjit/bindgen/src/main.rs index 0d4d57e0695941..2b4f48d73ec4bd 100644 --- a/yjit/bindgen/src/main.rs +++ b/yjit/bindgen/src/main.rs @@ -367,7 +367,7 @@ fn main() { .allowlist_function("rb_yarv_ary_entry_internal") .allowlist_function("rb_yjit_ruby2_keywords_splat_p") .allowlist_function("rb_yjit_fix_div_fix") - .allowlist_function("rb_yjit_fix_mod_fix") + .allowlist_function("rb_jit_fix_mod_fix") .allowlist_function("rb_FL_TEST") .allowlist_function("rb_FL_TEST_RAW") .allowlist_function("rb_RB_TYPE_P") diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index 36baecd5358031..0d9e3b74dad874 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -199,7 +199,7 @@ pub use rb_get_call_data_ci as get_call_data_ci; pub use rb_yarv_str_eql_internal as rb_str_eql_internal; pub use rb_yarv_ary_entry_internal as rb_ary_entry_internal; pub use rb_yjit_fix_div_fix as rb_fix_div_fix; -pub use rb_yjit_fix_mod_fix as rb_fix_mod_fix; +pub use rb_jit_fix_mod_fix as rb_fix_mod_fix; pub use rb_FL_TEST as FL_TEST; pub use rb_FL_TEST_RAW as FL_TEST_RAW; pub use rb_RB_TYPE_P as RB_TYPE_P; diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs index 0a14a699284268..74661e7ade9bf8 100644 --- a/yjit/src/cruby_bindings.inc.rs +++ b/yjit/src/cruby_bindings.inc.rs @@ -1142,7 +1142,6 @@ extern "C" { pub fn rb_ary_unshift_m(argc: ::std::os::raw::c_int, argv: *mut VALUE, ary: VALUE) -> VALUE; pub fn rb_yjit_rb_ary_subseq_length(ary: VALUE, beg: ::std::os::raw::c_long) -> VALUE; pub fn rb_yjit_fix_div_fix(recv: VALUE, obj: VALUE) -> VALUE; - pub fn rb_yjit_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; pub fn rb_yjit_ruby2_keywords_splat_p(obj: VALUE) -> usize; pub fn rb_yjit_splat_varg_checks( sp: *mut VALUE, @@ -1275,4 +1274,5 @@ extern "C" { start: *mut ::std::os::raw::c_void, end: *mut ::std::os::raw::c_void, ); + pub fn rb_jit_fix_mod_fix(recv: VALUE, obj: VALUE) -> VALUE; } diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index 87e0ed907a044d..f7b335f1bfce89 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -398,6 +398,7 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio Insn::FixnumGe { left, right } => gen_fixnum_ge(asm, opnd!(left), opnd!(right)), Insn::FixnumAnd { left, right } => gen_fixnum_and(asm, opnd!(left), opnd!(right)), Insn::FixnumOr { left, right } => gen_fixnum_or(asm, opnd!(left), opnd!(right)), + &Insn::FixnumMod { left, right, state } => gen_fixnum_mod(jit, asm, opnd!(left), opnd!(right), &function.frame_state(state)), Insn::IsNil { val } => gen_isnil(asm, opnd!(val)), &Insn::IsMethodCfunc { val, cd, cfunc, state: _ } => gen_is_method_cfunc(jit, asm, opnd!(val), cd, cfunc), &Insn::IsBitEqual { left, right } => gen_is_bit_equal(asm, opnd!(left), opnd!(right)), @@ -447,7 +448,6 @@ fn gen_insn(cb: &mut CodeBlock, jit: &mut JITState, asm: &mut Assembler, functio &Insn::LoadIvarExtended { self_val, id, index } => gen_load_ivar_extended(asm, opnd!(self_val), id, index), &Insn::ArrayMax { state, .. } | &Insn::FixnumDiv { state, .. } - | &Insn::FixnumMod { state, .. } | &Insn::Throw { state, .. } => return Err(state), }; @@ -1460,6 +1460,13 @@ fn gen_fixnum_or(asm: &mut Assembler, left: lir::Opnd, right: lir::Opnd) -> lir: asm.or(left, right) } +fn gen_fixnum_mod(jit: &mut JITState, asm: &mut Assembler, left: lir::Opnd, right: lir::Opnd, state: &FrameState) -> lir::Opnd { + // Check for left % 0, which raises ZeroDivisionError + asm.cmp(right, Opnd::from(VALUE::fixnum_from_usize(0))); + asm.je(side_exit(jit, state, FixnumModByZero)); + asm_ccall!(asm, rb_fix_mod_fix, left, right) +} + // Compile val == nil fn gen_isnil(asm: &mut Assembler, val: lir::Opnd) -> lir::Opnd { asm.cmp(val, Qnil.into()); diff --git a/zjit/src/cruby.rs b/zjit/src/cruby.rs index 5f4eac1db5ed9e..a84e408861fc54 100644 --- a/zjit/src/cruby.rs +++ b/zjit/src/cruby.rs @@ -134,6 +134,7 @@ unsafe extern "C" { pub fn rb_str_setbyte(str: VALUE, index: VALUE, value: VALUE) -> VALUE; pub fn rb_str_getbyte(str: VALUE, index: VALUE) -> VALUE; pub fn rb_vm_splat_array(flag: VALUE, ary: VALUE) -> VALUE; + pub fn rb_jit_fix_mod_fix(x: VALUE, y: VALUE) -> VALUE; pub fn rb_vm_concat_array(ary1: VALUE, ary2st: VALUE) -> VALUE; pub fn rb_vm_get_special_object(reg_ep: *const VALUE, value_type: vm_special_object_type) -> VALUE; pub fn rb_vm_concat_to_array(ary1: VALUE, ary2st: VALUE) -> VALUE; @@ -219,6 +220,7 @@ pub use rb_vm_ci_kwarg as vm_ci_kwarg; pub use rb_METHOD_ENTRY_VISI as METHOD_ENTRY_VISI; pub use rb_RCLASS_ORIGIN as RCLASS_ORIGIN; pub use rb_vm_get_special_object as vm_get_special_object; +pub use rb_jit_fix_mod_fix as rb_fix_mod_fix; /// Helper so we can get a Rust string for insn_name() pub fn insn_name(opcode: usize) -> String { diff --git a/zjit/src/hir.rs b/zjit/src/hir.rs index 1f77f38dc8872b..7083a082fba1a8 100644 --- a/zjit/src/hir.rs +++ b/zjit/src/hir.rs @@ -468,6 +468,7 @@ pub enum SideExitReason { BlockParamProxyModified, BlockParamProxyNotIseqOrIfunc, StackOverflow, + FixnumModByZero, } #[derive(Debug, Clone, Copy)] diff --git a/zjit/src/stats.rs b/zjit/src/stats.rs index 50f6e61f5c242e..33f29fb3aaed22 100644 --- a/zjit/src/stats.rs +++ b/zjit/src/stats.rs @@ -137,6 +137,7 @@ make_counters! { exit_fixnum_add_overflow, exit_fixnum_sub_overflow, exit_fixnum_mult_overflow, + exit_fixnum_mod_by_zero, exit_guard_type_failure, exit_guard_type_not_failure, exit_guard_bit_equals_failure, @@ -332,6 +333,7 @@ pub fn side_exit_counter(reason: crate::hir::SideExitReason) -> Counter { FixnumAddOverflow => exit_fixnum_add_overflow, FixnumSubOverflow => exit_fixnum_sub_overflow, FixnumMultOverflow => exit_fixnum_mult_overflow, + FixnumModByZero => exit_fixnum_mod_by_zero, GuardType(_) => exit_guard_type_failure, GuardTypeNot(_) => exit_guard_type_not_failure, GuardBitEquals(_) => exit_guard_bit_equals_failure,