From 467992ee35b59577ddb53f1323b19af3f3e3d134 Mon Sep 17 00:00:00 2001 From: Jimmy Miller Date: Tue, 11 Oct 2022 16:37:05 -0400 Subject: [PATCH] Implement optimize send in yjit (#6488) * Implement optimize send in yjit This successfully makes all our benchmarks exit way less for optimize send reasons. It makes some benchmarks faster, but not by as much as I'd like. I think this implementation works, but there are definitely more optimial arrangements. For example, what if we compiled send to a jump table? That seems like perhaps the most optimal we could do, but not obvious (to me) how to implement give our current setup. Co-authored-by: Alan Wu * Attempt at fixing the issues raised by @XrXr * fix allowlist * returns 0 instead of nil when not found * remove comment about encoding exception * Fix up c changes * Update assert Co-authored-by: Alan Wu * get rid of unneeded code and fix the flags * Apply suggestions from code review Co-authored-by: Alan Wu * rename and fix typo Co-authored-by: Alan Wu --- symbol.c | 22 ++++ yjit.c | 2 + yjit/bindgen/src/main.rs | 1 + yjit/src/codegen.rs | 217 ++++++++++++++++++++++++++++++--- yjit/src/core.rs | 15 +++ yjit/src/cruby.rs | 5 + yjit/src/cruby_bindings.inc.rs | 3 + yjit/src/stats.rs | 14 ++- 8 files changed, 262 insertions(+), 17 deletions(-) diff --git a/symbol.c b/symbol.c index 6aceab72c1..adcc1275e3 100644 --- a/symbol.c +++ b/symbol.c @@ -1112,6 +1112,28 @@ rb_check_id(volatile VALUE *namep) return lookup_str_id(name); } +// Used by yjit for handling .send without throwing exceptions +ID +rb_get_symbol_id(VALUE name) +{ + if (STATIC_SYM_P(name)) { + return STATIC_SYM2ID(name); + } + else if (DYNAMIC_SYM_P(name)) { + if (SYMBOL_PINNED_P(name)) { + return RSYMBOL(name)->id; + } + else { + return 0; + } + } + else { + RUBY_ASSERT_ALWAYS(RB_TYPE_P(name, T_STRING)); + return lookup_str_id(name); + } +} + + VALUE rb_check_symbol(volatile VALUE *namep) { diff --git a/yjit.c b/yjit.c index cc64cccac3..838956f7b4 100644 --- a/yjit.c +++ b/yjit.c @@ -509,6 +509,8 @@ rb_get_cme_def_body_attr_id(const rb_callable_method_entry_t *cme) return cme->def->body.attr.id; } +ID rb_get_symbol_id(VALUE namep); + enum method_optimized_type rb_get_cme_def_body_optimized_type(const rb_callable_method_entry_t *cme) { diff --git a/yjit/bindgen/src/main.rs b/yjit/bindgen/src/main.rs index 7ae9f06adc..b945e9b106 100644 --- a/yjit/bindgen/src/main.rs +++ b/yjit/bindgen/src/main.rs @@ -332,6 +332,7 @@ fn main() { .allowlist_function("rb_get_cfp_ep_level") .allowlist_function("rb_get_cme_def_type") .allowlist_function("rb_get_cme_def_body_attr_id") + .allowlist_function("rb_get_symbol_id") .allowlist_function("rb_get_cme_def_body_optimized_type") .allowlist_function("rb_get_cme_def_body_optimized_index") .allowlist_function("rb_get_cme_def_body_cfunc") diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index 56877c1721..8b0409649f 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -1918,6 +1918,9 @@ pub const OPT_AREF_MAX_CHAIN_DEPTH: i32 = 2; // up to 5 different classes pub const SEND_MAX_DEPTH: i32 = 5; +// up to 20 different methods for send +pub const SEND_MAX_CHAIN_DEPTH: i32 = 20; + // Codegen for setting an instance variable. // Preconditions: // - receiver is in REG0 @@ -1929,7 +1932,15 @@ fn gen_set_ivar( asm: &mut Assembler, recv: VALUE, ivar_name: ID, + flags: u32, + argc: i32, ) -> CodegenStatus { + + // This is a .send call and we need to adjust the stack + if flags & VM_CALL_OPT_SEND != 0 { + handle_opt_send_shift_stack(asm, argc as i32, ctx); + } + // Save the PC and SP because the callee may allocate // Note that this modifies REG_SP, which is why we do it first jit_prepare_routine_call(jit, ctx, asm); @@ -4148,8 +4159,9 @@ fn gen_send_cfunc( ci: *const rb_callinfo, cme: *const rb_callable_method_entry_t, block: Option, - argc: i32, recv_known_klass: *const VALUE, + flags: u32, + argc: i32, ) -> CodegenStatus { let cfunc = unsafe { get_cme_def_body_cfunc(cme) }; let cfunc_argc = unsafe { get_mct_argc(cfunc) }; @@ -4158,8 +4170,6 @@ fn gen_send_cfunc( // Create a side-exit to fall back to the interpreter let side_exit = get_side_exit(jit, ocb, ctx); - let flags = unsafe { vm_ci_flag(ci) }; - // If the function expects a Ruby array of arguments if cfunc_argc < 0 && cfunc_argc != -1 { gen_counter_incr!(asm, send_cfunc_ruby_array_varg); @@ -4228,6 +4238,11 @@ fn gen_send_cfunc( return CantCompile; } + // This is a .send call and we need to adjust the stack + if flags & VM_CALL_OPT_SEND != 0 { + handle_opt_send_shift_stack(asm, argc as i32, ctx); + } + // Points to the receiver operand on the stack let recv = ctx.stack_opnd(argc); @@ -4440,6 +4455,7 @@ fn gen_send_bmethod( ci: *const rb_callinfo, cme: *const rb_callable_method_entry_t, block: Option, + flags: u32, argc: i32, ) -> CodegenStatus { let procv = unsafe { rb_get_def_bmethod_proc((*cme).def) }; @@ -4469,7 +4485,7 @@ fn gen_send_bmethod( } let frame_type = VM_FRAME_MAGIC_BLOCK | VM_FRAME_FLAG_BMETHOD | VM_FRAME_FLAG_LAMBDA; - gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, Some(capture.ep), cme, block, argc) + gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, Some(capture.ep), cme, block, flags, argc) } fn gen_send_iseq( @@ -4483,11 +4499,11 @@ fn gen_send_iseq( prev_ep: Option<*const VALUE>, cme: *const rb_callable_method_entry_t, block: Option, + flags: u32, argc: i32, ) -> CodegenStatus { let mut argc = argc; - let flags = unsafe { vm_ci_flag(ci) }; // Create a side-exit to fall back to the interpreter let side_exit = get_side_exit(jit, ocb, ctx); @@ -4711,6 +4727,13 @@ fn gen_send_iseq( Some(leaf_builtin_raw) }; if let (None, Some(builtin_info)) = (block, leaf_builtin) { + + // this is a .send call not currently supported for builtins + if flags & VM_CALL_OPT_SEND != 0 { + gen_counter_incr!(asm, send_send_builtin); + return CantCompile; + } + let builtin_argc = unsafe { (*builtin_info).argc }; if builtin_argc + 1 < (C_ARG_OPNDS.len() as i32) { asm.comment("inlined leaf builtin"); @@ -4758,6 +4781,11 @@ fn gen_send_iseq( push_splat_args(required_args, ctx, asm, ocb, side_exit) } + // This is a .send call and we need to adjust the stack + if flags & VM_CALL_OPT_SEND != 0 { + handle_opt_send_shift_stack(asm, argc as i32, ctx); + } + if doing_kw_call { // Here we're calling a method with keyword arguments and specifying // keyword arguments at this call site. @@ -5012,7 +5040,10 @@ fn gen_struct_aref( cme: *const rb_callable_method_entry_t, comptime_recv: VALUE, _comptime_recv_klass: VALUE, + flags: u32, + argc: i32, ) -> CodegenStatus { + if unsafe { vm_ci_argc(ci) } != 0 { return CantCompile; } @@ -5034,6 +5065,11 @@ fn gen_struct_aref( } } + // This is a .send call and we need to adjust the stack + if flags & VM_CALL_OPT_SEND != 0 { + handle_opt_send_shift_stack(asm, argc as i32, ctx); + } + // All structs from the same Struct class should have the same // length. So if our comptime_recv is embedded all runtime // structs of the same class should be as well, and the same is @@ -5067,11 +5103,18 @@ fn gen_struct_aset( cme: *const rb_callable_method_entry_t, comptime_recv: VALUE, _comptime_recv_klass: VALUE, + flags: u32, + argc: i32, ) -> CodegenStatus { if unsafe { vm_ci_argc(ci) } != 1 { return CantCompile; } + // This is a .send call and we need to adjust the stack + if flags & VM_CALL_OPT_SEND != 0 { + handle_opt_send_shift_stack(asm, argc, ctx); + } + let off: i32 = unsafe { get_cme_def_body_optimized_index(cme) } .try_into() .unwrap(); @@ -5113,9 +5156,9 @@ fn gen_send_general( // see vm_call_method(). let ci = unsafe { get_call_data_ci(cd) }; // info about the call site - let argc: i32 = unsafe { vm_ci_argc(ci) }.try_into().unwrap(); - let mid = unsafe { vm_ci_mid(ci) }; - let flags = unsafe { vm_ci_flag(ci) }; + let mut argc: i32 = unsafe { vm_ci_argc(ci) }.try_into().unwrap(); + let mut mid = unsafe { vm_ci_mid(ci) }; + let mut flags = unsafe { vm_ci_flag(ci) }; // Don't JIT calls with keyword splat if flags & VM_CALL_KW_SPLAT != 0 { @@ -5205,7 +5248,7 @@ fn gen_send_general( VM_METHOD_TYPE_ISEQ => { let iseq = unsafe { get_def_iseq_ptr((*cme).def) }; let frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL; - return gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, argc); + return gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, flags, argc); } VM_METHOD_TYPE_CFUNC => { return gen_send_cfunc( @@ -5216,8 +5259,9 @@ fn gen_send_general( ci, cme, block, - argc, &comptime_recv_klass, + flags, + argc, ); } VM_METHOD_TYPE_IVAR => { @@ -5227,6 +5271,12 @@ fn gen_send_general( return CantCompile; } + // This is a .send call not supported right now for getters + if flags & VM_CALL_OPT_SEND != 0 { + gen_counter_incr!(asm, send_send_getter); + return CantCompile; + } + if c_method_tracing_currently_enabled(jit) { // Can't generate code for firing c_call and c_return events // :attr-tracing: @@ -5270,12 +5320,12 @@ fn gen_send_general( return CantCompile; } else { let ivar_name = unsafe { get_cme_def_body_attr_id(cme) }; - return gen_set_ivar(jit, ctx, asm, comptime_recv, ivar_name); + return gen_set_ivar(jit, ctx, asm, comptime_recv, ivar_name, flags, argc); } } // Block method, e.g. define_method(:foo) { :my_block } VM_METHOD_TYPE_BMETHOD => { - return gen_send_bmethod(jit, ctx, asm, ocb, ci, cme, block, argc); + return gen_send_bmethod(jit, ctx, asm, ocb, ci, cme, block, flags, argc); } VM_METHOD_TYPE_ZSUPER => { gen_counter_incr!(asm, send_zsuper_method); @@ -5296,11 +5346,116 @@ fn gen_send_general( } // Send family of methods, e.g. call/apply VM_METHOD_TYPE_OPTIMIZED => { + let opt_type = unsafe { get_cme_def_body_optimized_type(cme) }; match opt_type { OPTIMIZED_METHOD_TYPE_SEND => { - gen_counter_incr!(asm, send_optimized_method_send); - return CantCompile; + + // This is for method calls like `foo.send(:bar)` + // The `send` method does not get its own stack frame. + // instead we look up the method and call it, + // doing some stack shifting based on the VM_CALL_OPT_SEND flag + + let starting_context = *ctx; + + if argc == 0 { + gen_counter_incr!(asm, send_send_wrong_args); + return CantCompile; + } + + argc -= 1; + + let compile_time_name = jit_peek_at_stack(jit, ctx, argc as isize); + + if !compile_time_name.string_p() && !compile_time_name.static_sym_p() { + gen_counter_incr!(asm, send_send_chain_not_string_or_sym); + return CantCompile; + } + + mid = unsafe { rb_get_symbol_id(compile_time_name) }; + if mid == 0 { + gen_counter_incr!(asm, send_send_null_mid); + return CantCompile; + } + + cme = unsafe { rb_callable_method_entry(comptime_recv_klass, mid) }; + if cme.is_null() { + gen_counter_incr!(asm, send_send_null_cme); + return CantCompile; + } + + // We aren't going to handle `send(send(:foo))`. We would need to + // do some stack manipulation here or keep track of how many levels + // deep we need to stack manipulate + // Because of how exits currently work, we can't do stack manipulation + // until we will no longer side exit. + let def_type = unsafe { get_cme_def_type(cme) }; + if let VM_METHOD_TYPE_OPTIMIZED = def_type { + let opt_type = unsafe { get_cme_def_body_optimized_type(cme) }; + if let OPTIMIZED_METHOD_TYPE_SEND = opt_type { + gen_counter_incr!(asm, send_send_nested); + return CantCompile; + } + } + + flags |= VM_CALL_FCALL | VM_CALL_OPT_SEND; + + assume_method_lookup_stable(jit, ocb, comptime_recv_klass, cme); + + let (known_class, type_mismatch_exit) = { + if compile_time_name.string_p() { + ( + unsafe { rb_cString }, + counted_exit!(ocb, side_exit, send_send_chain_not_string), + + ) + } else { + ( + unsafe { rb_cSymbol }, + counted_exit!(ocb, side_exit, send_send_chain_not_sym), + ) + } + }; + + jit_guard_known_klass( + jit, + ctx, + asm, + ocb, + known_class, + ctx.stack_opnd(argc), + StackOpnd(argc as u16), + compile_time_name, + 2, // We have string or symbol, so max depth is 2 + type_mismatch_exit + ); + + // Need to do this here so we don't have too many live + // values for the register allocator. + let name_opnd = asm.load(ctx.stack_opnd(argc)); + + let symbol_id_opnd = asm.ccall(rb_get_symbol_id as *const u8, vec![name_opnd]); + + asm.comment("chain_guard_send"); + let chain_exit = counted_exit!(ocb, side_exit, send_send_chain); + asm.cmp(symbol_id_opnd, 0.into()); + asm.jbe(chain_exit.into()); + + asm.cmp(symbol_id_opnd, mid.into()); + jit_chain_guard( + JCC_JNE, + jit, + &starting_context, + asm, + ocb, + SEND_MAX_CHAIN_DEPTH as i32, + chain_exit, + ); + + // We have changed the argc, flags, mid, and cme, so we need to re-enter the match + // and compile whatever method we found from send. + continue; + } OPTIMIZED_METHOD_TYPE_CALL => { gen_counter_incr!(asm, send_optimized_method_call); @@ -5320,6 +5475,8 @@ fn gen_send_general( cme, comptime_recv, comptime_recv_klass, + flags, + argc, ); } OPTIMIZED_METHOD_TYPE_STRUCT_ASET => { @@ -5332,6 +5489,8 @@ fn gen_send_general( cme, comptime_recv, comptime_recv_klass, + flags, + argc, ); } _ => { @@ -5354,6 +5513,32 @@ fn gen_send_general( } } + +/// Shifts the stack for send in order to remove the name of the method +/// Comment below borrow from vm_call_opt_send in vm_insnhelper.c +/// E.g. when argc == 2 +/// | | | | TOPN +/// +------+ | | +/// | arg1 | ---+ | | 0 +/// +------+ | +------+ +/// | arg0 | -+ +-> | arg1 | 1 +/// +------+ | +------+ +/// | sym | +---> | arg0 | 2 +/// +------+ +------+ +/// | recv | | recv | 3 +///--+------+--------+------+------ +/// +/// We do this for our compiletime context and the actual stack +fn handle_opt_send_shift_stack(asm: &mut Assembler, argc: i32, ctx: &mut Context) { + asm.comment("shift_stack"); + for j in (0..argc).rev() { + let opnd = ctx.stack_opnd(j); + let opnd2 = ctx.stack_opnd(j + 1); + asm.mov(opnd2, opnd); + } + ctx.shift_stack(argc as usize); +} + fn gen_opt_send_without_block( jit: &mut JITState, ctx: &mut Context, @@ -5515,10 +5700,10 @@ fn gen_invokesuper( VM_METHOD_TYPE_ISEQ => { let iseq = unsafe { get_def_iseq_ptr((*cme).def) }; let frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL; - gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, argc) + gen_send_iseq(jit, ctx, asm, ocb, iseq, ci, frame_type, None, cme, block, ci_flags, argc) } VM_METHOD_TYPE_CFUNC => { - gen_send_cfunc(jit, ctx, asm, ocb, ci, cme, block, argc, ptr::null()) + gen_send_cfunc(jit, ctx, asm, ocb, ci, cme, block, ptr::null(), ci_flags, argc) } _ => unreachable!(), } diff --git a/yjit/src/core.rs b/yjit/src/core.rs index 66c1df083f..ec9384c403 100644 --- a/yjit/src/core.rs +++ b/yjit/src/core.rs @@ -1061,6 +1061,21 @@ impl Context { return top; } + pub fn shift_stack(&mut self, argc: usize) { + assert!(argc < self.stack_size.into()); + + let method_name_index = (self.stack_size - argc as u16 - 1) as usize; + + for i in method_name_index..(self.stack_size - 1) as usize { + + if i + 1 < MAX_TEMP_TYPES { + self.temp_types[i] = self.temp_types[i + 1]; + self.temp_mapping[i] = self.temp_mapping[i + 1]; + } + } + self.stack_pop(1); + } + /// Get an operand pointing to a slot on the temp stack pub fn stack_opnd(&self, idx: i32) -> Opnd { // SP points just above the topmost value diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index 4307937707..5ddc31a06d 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -352,6 +352,10 @@ impl VALUE { self == Qnil } + pub fn string_p(self) -> bool { + self.class_of() == unsafe { rb_cString } + } + /// Read the flags bits from the RBasic object, then return a Ruby type enum (e.g. RUBY_T_ARRAY) pub fn builtin_type(self) -> ruby_value_type { (self.builtin_flags() & (RUBY_T_MASK as usize)) as ruby_value_type @@ -629,6 +633,7 @@ mod manual_defs { pub const VM_CALL_KW_SPLAT: u32 = 1 << VM_CALL_KW_SPLAT_bit; pub const VM_CALL_TAILCALL: u32 = 1 << VM_CALL_TAILCALL_bit; pub const VM_CALL_ZSUPER : u32 = 1 << VM_CALL_ZSUPER_bit; + pub const VM_CALL_OPT_SEND : u32 = 1 << VM_CALL_OPT_SEND_bit; // From internal/struct.h - in anonymous enum, so we can't easily import it pub const RSTRUCT_EMBED_LEN_MASK: usize = (RUBY_FL_USER2 | RUBY_FL_USER1) as usize; diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs index cd29f883e5..db14f9ca02 100644 --- a/yjit/src/cruby_bindings.inc.rs +++ b/yjit/src/cruby_bindings.inc.rs @@ -1362,6 +1362,9 @@ extern "C" { extern "C" { pub fn rb_get_cme_def_body_attr_id(cme: *const rb_callable_method_entry_t) -> ID; } +extern "C" { + pub fn rb_get_symbol_id(namep: VALUE) -> ID; +} extern "C" { pub fn rb_get_cme_def_body_optimized_type( cme: *const rb_callable_method_entry_t, diff --git a/yjit/src/stats.rs b/yjit/src/stats.rs index 74319ec4ed..bf968ee563 100644 --- a/yjit/src/stats.rs +++ b/yjit/src/stats.rs @@ -171,7 +171,6 @@ make_counters! { send_zsuper_method, send_undef_method, send_optimized_method, - send_optimized_method_send, send_optimized_method_call, send_optimized_method_block_call, send_missing_method, @@ -197,6 +196,19 @@ make_counters! { send_args_splat_non_iseq, send_args_splat_cfunc, send_iseq_ruby2_keywords, + send_send_not_imm, + send_send_wrong_args, + send_send_null_mid, + send_send_null_cme, + send_send_nested, + send_send_chain, + send_send_chain_string, + send_send_chain_not_string, + send_send_chain_not_sym, + send_send_chain_not_string_or_sym, + send_send_getter, + send_send_builtin, + send_bmethod_ractor, send_bmethod_block_arg,