From b626dd7211b9d45f1dab6f82057445781f34f20f Mon Sep 17 00:00:00 2001
From: Alan Wu
Date: Wed, 24 Mar 2021 18:07:26 -0400
Subject: [PATCH] YJIT: Fancier opt_getinlinecache

Make sure `opt_getinlinecache` is in a block all on its own, and
invalidate it from the interpreter when `opt_setinlinecache` runs. It
will recompile with a filled cache the second time around. This lets
YJIT run well even when the inline cache for a constant is cold.

---
 compile.c       |  34 +++++++++++++++-
 vm_core.h       |   3 ++
 vm_insnhelper.c |   3 ++
 yjit.h          |   5 +--
 yjit_codegen.c  | 102 +++++++++++++++++++++++++-----------------------
 yjit_core.c     |  19 ++++-----
 yjit_core.h     |   1 +
 yjit_iface.c    |  44 ++++++++++++++++++---
 yjit_iface.h    |   8 ++--
 9 files changed, 146 insertions(+), 73 deletions(-)

diff --git a/compile.c b/compile.c
index 1e88dc242d..b11650c885 100644
--- a/compile.c
+++ b/compile.c
@@ -2259,6 +2259,7 @@ iseq_set_sequence(rb_iseq_t *iseq, LINK_ANCHOR *const anchor)
     VALUE *generated_iseq;
     rb_event_flag_t events = 0;
     long data = 0;
+    long getinlinecache_idx = -1;
 
     int insn_num, code_index, insns_info_index, sp = 0;
     int stack_max = fix_sp_depth(iseq, anchor);
@@ -2362,6 +2363,11 @@ iseq_set_sequence(rb_iseq_t *iseq, LINK_ANCHOR *const anchor)
             types = insn_op_types(insn);
             len = insn_len(insn);
 
+            if (insn == BIN(opt_getinlinecache)) {
+                assert(getinlinecache_idx < 0 && "one get per set, no nesting");
+                getinlinecache_idx = code_index;
+            }
+
             for (j = 0; types[j]; j++) {
                 char type = types[j];
                 /* printf("--> [%c - (%d-%d)]\n", type, k, j); */
@@ -2419,6 +2425,13 @@ iseq_set_sequence(rb_iseq_t *iseq, LINK_ANCHOR *const anchor)
                     }
                     generated_iseq[code_index + 1 + j] = (VALUE)ic;
                     FL_SET(iseqv, ISEQ_MARKABLE_ISEQ);
+
+                    if (insn == BIN(opt_setinlinecache) && type == TS_IC) {
+                        assert(getinlinecache_idx >= 0);
+                        // Store index to the matching opt_getinlinecache on the IC for YJIT
+                        ic->get_insn_idx = (unsigned)getinlinecache_idx;
+                        getinlinecache_idx = -1;
+                    }
                     break;
                 }
               case TS_CALLDATA:
@@ -11107,6 +11120,7 @@ ibf_load_code(const struct ibf_load *load, rb_iseq_t *iseq, ibf_offset_t bytecod
     unsigned int code_index;
     ibf_offset_t reading_pos = bytecode_offset;
     VALUE *code = ALLOC_N(VALUE, iseq_size);
+    long getinlinecache_idx = -1;
 
     struct rb_iseq_constant_body *load_body = iseq->body;
     struct rb_call_data *cd_entries = load_body->call_data;
@@ -11114,13 +11128,22 @@ ibf_load_code(const struct ibf_load *load, rb_iseq_t *iseq, ibf_offset_t bytecod
     for (code_index=0; code_index= 0);
+                    // Store index to the matching opt_getinlinecache on the IC for YJIT
+                    is_entries[op].ic_cache.get_insn_idx = (unsigned)getinlinecache_idx;
+                    getinlinecache_idx = -1;
+                }
             }
             FL_SET(iseqv, ISEQ_MARKABLE_ISEQ);
             break;
diff --git a/vm_core.h b/vm_core.h
index cd8a01d1ca..6fd24e962f 100644
--- a/vm_core.h
+++ b/vm_core.h
@@ -236,6 +236,9 @@ STATIC_ASSERT(sizeof_iseq_inline_constant_cache_entry,
 
 struct iseq_inline_constant_cache {
     struct iseq_inline_constant_cache_entry *entry;
+    // For YJIT: the index to the opt_getinlinecache instruction in the same iseq.
+    // It's set during compile time and constant once set.
+    unsigned get_insn_idx;
 };
 
 struct iseq_inline_iv_cache_entry {
diff --git a/vm_insnhelper.c b/vm_insnhelper.c
index 16f46e50d3..00b352df3d 100644
--- a/vm_insnhelper.c
+++ b/vm_insnhelper.c
@@ -4743,6 +4743,9 @@ vm_ic_update(const rb_iseq_t *iseq, IC ic, VALUE val, const VALUE *reg_ep)
     if (rb_ractor_shareable_p(val)) ice->flags |= IMEMO_CONST_CACHE_SHAREABLE;
     ruby_vm_const_missing_count = 0;
     RB_OBJ_WRITE(iseq, &ic->entry, ice);
+#ifndef MJIT_HEADER
+    yjit_constant_ic_update(iseq, ic);
+#endif
 }
 
 static VALUE
diff --git a/yjit.h b/yjit.h
index cfb25a529d..00ed486054 100644
--- a/yjit.h
+++ b/yjit.h
@@ -5,9 +5,7 @@
 #ifndef YJIT_H
 #define YJIT_H 1
 
-#include "stddef.h"
-#include "stdint.h"
-#include "stdbool.h"
+#include "vm_core.h"
 #include "method.h"
 
 #ifdef _WIN32
@@ -61,5 +59,6 @@ void rb_yjit_iseq_mark(const struct rb_iseq_constant_body *body);
 void rb_yjit_iseq_update_references(const struct rb_iseq_constant_body *body);
 void rb_yjit_iseq_free(const struct rb_iseq_constant_body *body);
 void rb_yjit_before_ractor_spawn(void);
+void yjit_constant_ic_update(const rb_iseq_t *iseq, IC ic);
 
 #endif // #ifndef YJIT_H
diff --git a/yjit_codegen.c b/yjit_codegen.c
index 7ff59b94a1..ce9e56a157 100644
--- a/yjit_codegen.c
+++ b/yjit_codegen.c
@@ -43,7 +43,7 @@ jit_print_loc(jitstate_t* jit, const char* msg)
 static int
 jit_get_opcode(jitstate_t* jit)
 {
-    return opcode_at_pc(jit->iseq, jit->pc);
+    return yjit_opcode_at_pc(jit->iseq, jit->pc);
 }
 
 // Get the index of the next instruction
@@ -147,7 +147,7 @@ yjit_gen_exit(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
     // Write back the old instruction at the exit PC
     // Otherwise the interpreter may jump right back to the
     // JITted code we're trying to exit
-    int exit_opcode = opcode_at_pc(jit->iseq, exit_pc);
+    int exit_opcode = yjit_opcode_at_pc(jit->iseq, exit_pc);
     void* handler_addr = (void*)handler_table[exit_opcode];
     mov(cb, REG0, const_ptr_opnd(exit_pc));
     mov(cb, REG1, const_ptr_opnd(handler_addr));
@@ -255,9 +255,8 @@ yjit_entry_prologue(void)
     return code_ptr;
 }
 
-/*
-Generate code to check for interrupts and take a side-exit
-*/
+
+// Generate code to check for interrupts and take a side-exit
 static void
 yjit_check_ints(codeblock_t* cb, uint8_t* side_exit)
 {
@@ -269,17 +268,36 @@ yjit_check_ints(codeblock_t* cb, uint8_t* side_exit)
     jnz_ptr(cb, side_exit);
 }
 
-/*
-Compile a sequence of bytecode instructions for a given basic block version
-*/
+// Generate a stubbed unconditional jump to the next bytecode instruction.
+// Blocks that are part of a guard chain can use this to share the same successor.
+static void
+jit_jump_to_next_insn(jitstate_t *jit, const ctx_t *current_context)
+{
+    // Reset the depth since in current usages we only ever jump to
+    // chain_depth > 0 from the same instruction.
+    ctx_t reset_depth = *current_context;
+    reset_depth.chain_depth = 0;
+
+    blockid_t jump_block = { jit->iseq, jit_next_insn_idx(jit) };
+
+    // Generate the jump instruction
+    gen_direct_jump(
+        &reset_depth,
+        jump_block
+    );
+}
+
+
+// Compile a sequence of bytecode instructions for a given basic block version
 void
-yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
+yjit_gen_block(ctx_t *ctx, block_t *block, rb_execution_context_t *ec)
 {
     RUBY_ASSERT(cb != NULL);
     RUBY_ASSERT(block != NULL);
 
     const rb_iseq_t *iseq = block->blockid.iseq;
     uint32_t insn_idx = block->blockid.idx;
+    const uint32_t starting_insn_idx = insn_idx;
 
     // NOTE: if we are ever deployed in production, we
     // should probably just log an error and return NULL here,
@@ -305,13 +323,21 @@ yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
 
     // For each instruction to compile
     for (;;) {
+        // Get the current pc and opcode
+        VALUE *pc = yjit_iseq_pc_at_idx(iseq, insn_idx);
+        int opcode = yjit_opcode_at_pc(iseq, pc);
+        RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
+
+        // opt_getinlinecache wants to be in a block all on its own. Cut the block short
+        // if we run into it. See gen_opt_getinlinecache for details.
+        if (opcode == BIN(opt_getinlinecache) && insn_idx > starting_insn_idx) {
+            jit_jump_to_next_insn(&jit, ctx);
+            break;
+        }
+
         // Set the current instruction
         jit.insn_idx = insn_idx;
-        jit.pc = iseq_pc_at_idx(iseq, insn_idx);
-
-        // Get the current opcode
-        int opcode = jit_get_opcode(&jit);
-        RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
+        jit.pc = pc;
 
         // Lookup the codegen function for this instruction
         codegen_fn gen_fn = gen_fns[opcode];
@@ -322,8 +348,10 @@ yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
             break;
         }
 
-        //fprintf(stderr, "compiling %d: %s\n", insn_idx, insn_name(opcode));
-        //print_str(cb, insn_name(opcode));
+        if (0) {
+            fprintf(stderr, "compiling %d: %s\n", insn_idx, insn_name(opcode));
+            print_str(cb, insn_name(opcode));
+        }
 
         // :count-placement:
         // Count bytecode instructions that execute in generated code.
@@ -366,9 +394,8 @@ yjit_gen_block(ctx_t* ctx, block_t* block, rb_execution_context_t* ec)
     if (YJIT_DUMP_MODE >= 2) {
         // Dump list of compiled instrutions
        fprintf(stderr, "Compiled the following for iseq=%p:\n", (void *)iseq);
-        for (uint32_t idx = block->blockid.idx; idx < insn_idx;)
-        {
-            int opcode = opcode_at_pc(iseq, iseq_pc_at_idx(iseq, idx));
+        for (uint32_t idx = block->blockid.idx; idx < insn_idx; ) {
+            int opcode = yjit_opcode_at_pc(iseq, yjit_iseq_pc_at_idx(iseq, idx));
             fprintf(stderr, "  %04d %s\n", idx, insn_name(opcode));
             idx += insn_len(opcode);
         }
@@ -605,25 +632,6 @@ guard_self_is_heap(codeblock_t *cb, x86opnd_t self_opnd, uint8_t *side_exit, ctx
     }
 }
 
-// Generate a stubbed unconditional jump to the next bytecode instruction.
-// Blocks that are part of a guard chain can use this to share the same successor.
-static void
-jit_jump_to_next_insn(jitstate_t *jit, const ctx_t *current_context)
-{
-    // Reset the depth since in current usages we only ever jump to to
-    // chain_depth > 0 from the same instruction.
-    ctx_t reset_depth = *current_context;
-    reset_depth.chain_depth = 0;
-
-    blockid_t jump_block = { jit->iseq, jit_next_insn_idx(jit) };
-
-    // Generate the jump instruction
-    gen_direct_jump(
-        &reset_depth,
-        jump_block
-    );
-}
-
 static void
 gen_jnz_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
 {
@@ -1918,6 +1926,7 @@ gen_leave(jitstate_t* jit, ctx_t* ctx)
 }
 
 RUBY_EXTERN rb_serial_t ruby_vm_global_constant_state;
+
 static codegen_status_t
 gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx)
 {
@@ -1927,16 +1936,11 @@ gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx)
 
     // See vm_ic_hit_p().
     struct iseq_inline_constant_cache_entry *ice = ic->entry;
-    if (!ice) {
-        // Cache not filled
-        return YJIT_CANT_COMPILE;
-    }
-    if (ice->ic_serial != ruby_vm_global_constant_state) {
-        // Cache miss at compile time.
-        return YJIT_CANT_COMPILE;
-    }
-    if (ice->ic_cref) {
-        // Only compile for caches that don't care about lexical scope.
+    if (!ice || // cache not filled
+        ice->ic_serial != ruby_vm_global_constant_state || // cache out of date
+        ice->ic_cref /* cache only valid for certain lexical scopes */) {
+        // In these cases, leave a block that unconditionally side exits
+        // for the interpreter to invalidate.
         return YJIT_CANT_COMPILE;
     }
 
@@ -1946,7 +1950,7 @@ gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx)
 
     // Invalidate output code on any and all constant writes
     // FIXME: This leaks when st_insert raises NoMemoryError
-    if (!assume_stable_global_constant_state(jit->block)) return YJIT_CANT_COMPILE;
+    assume_stable_global_constant_state(jit->block);
 
     x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
     jit_mov_gc_ptr(jit, cb, REG0, ice->value);
diff --git a/yjit_core.c b/yjit_core.c
index 521db462fa..3be45c12e5 100644
--- a/yjit_core.c
+++ b/yjit_core.c
@@ -289,8 +289,8 @@ int ctx_diff(const ctx_t* src, const ctx_t* dst)
 }
 
 // Get all blocks for a particular place in an iseq.
-static rb_yjit_block_array_t
-get_version_array(const rb_iseq_t *iseq, unsigned idx)
+rb_yjit_block_array_t
+yjit_get_version_array(const rb_iseq_t *iseq, unsigned idx)
 {
     struct rb_iseq_constant_body *body = iseq->body;
 
@@ -305,7 +305,7 @@ get_version_array(const rb_iseq_t *iseq, unsigned idx)
 // Count the number of block versions matching a given blockid
 static size_t get_num_versions(blockid_t blockid)
 {
-    return rb_darray_size(get_version_array(blockid.iseq, blockid.idx));
+    return rb_darray_size(yjit_get_version_array(blockid.iseq, blockid.idx));
 }
 
 // Keep track of a block version. Block should be fully constructed.
@@ -364,7 +364,7 @@ add_block_version(blockid_t blockid, block_t* block)
 // Retrieve a basic block version for an (iseq, idx) tuple
 block_t* find_block_version(blockid_t blockid, const ctx_t* ctx)
 {
-    rb_yjit_block_array_t versions = get_version_array(blockid.iseq, blockid.idx);
+    rb_yjit_block_array_t versions = yjit_get_version_array(blockid.iseq, blockid.idx);
 
     // Best match found
     block_t* best_version = NULL;
@@ -522,7 +522,7 @@ branch_stub_hit(const uint32_t branch_idx, const uint32_t target_idx, rb_executi
 
     // Update the PC in the current CFP, because it
     // may be out of sync in JITted code
-    ec->cfp->pc = iseq_pc_at_idx(target.iseq, target.idx);
+    ec->cfp->pc = yjit_iseq_pc_at_idx(target.iseq, target.idx);
 
     // Try to find an existing compiled version of this block
     block_t* p_block = find_block_version(target, target_ctx);
@@ -846,7 +846,8 @@ void
 invalidate_block_version(block_t* block)
 {
     ASSERT_vm_locking();
-    rb_vm_barrier(); // Stop other ractors since we are going to patch machine code.
+    // TODO: want to assert that all other ractors are stopped here. Can't patch
+    // machine code that some other thread is running.
 
     const rb_iseq_t *iseq = block->blockid.iseq;
 
@@ -854,7 +855,7 @@ invalidate_block_version(block_t* block)
     // fprintf(stderr, "block=%p\n", block);
 
     // Remove this block from the version array
-    rb_yjit_block_array_t versions = get_version_array(iseq, block->blockid.idx);
+    rb_yjit_block_array_t versions = yjit_get_version_array(iseq, block->blockid.idx);
     RB_UNUSED_VAR(bool removed);
     removed = block_array_remove(versions, block);
     RUBY_ASSERT(removed);
@@ -909,8 +910,8 @@ invalidate_block_version(block_t* block)
     uint32_t idx = block->blockid.idx;
     // FIXME: the following says "if", but it's unconditional.
     // If the block is an entry point, it needs to be unmapped from its iseq
-    VALUE* entry_pc = iseq_pc_at_idx(iseq, idx);
-    int entry_opcode = opcode_at_pc(iseq, entry_pc);
+    VALUE* entry_pc = yjit_iseq_pc_at_idx(iseq, idx);
+    int entry_opcode = yjit_opcode_at_pc(iseq, entry_pc);
 
     // TODO: unmap_addr2insn in yjit_iface.c? Maybe we can write a function to encompass this logic?
     // Should check how it's used in exit and side-exit
diff --git a/yjit_core.h b/yjit_core.h
index aa2d3fd008..e264d89ffa 100644
--- a/yjit_core.h
+++ b/yjit_core.h
@@ -235,6 +235,7 @@ block_t* gen_block_version(blockid_t blockid, const ctx_t* ctx, rb_execution_con
 uint8_t* gen_entry_point(const rb_iseq_t *iseq, uint32_t insn_idx, rb_execution_context_t *ec);
 void yjit_free_block(block_t *block);
 void yjit_branches_update_references(void);
+rb_yjit_block_array_t yjit_get_version_array(const rb_iseq_t *iseq, unsigned idx);
 
 void gen_branch(
     const ctx_t* src_ctx,
diff --git a/yjit_iface.c b/yjit_iface.c
index 4116e526d8..56274c4993 100644
--- a/yjit_iface.c
+++ b/yjit_iface.c
@@ -65,7 +65,8 @@ cb_write_post_call_bytes(codeblock_t* cb)
 }
 
 // Get the PC for a given index in an iseq
-VALUE *iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx)
+VALUE *
+yjit_iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx)
 {
     RUBY_ASSERT(iseq != NULL);
     RUBY_ASSERT(insn_idx < iseq->body->iseq_size);
@@ -91,7 +92,7 @@ map_addr2insn(void *code_ptr, int insn)
 }
 
 int
-opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc)
+yjit_opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc)
 {
     const VALUE at_pc = *pc;
     if (FL_TEST_RAW((VALUE)iseq, ISEQ_TRANSLATED)) {
@@ -269,11 +270,9 @@ static st_table *blocks_assuming_stable_global_constant_state;
 // Assume that the global constant state has not changed since call to this function.
 // Can raise NoMemoryError.
-RBIMPL_ATTR_NODISCARD()
-bool
+void
 assume_stable_global_constant_state(block_t *block)
 {
     st_insert(blocks_assuming_stable_global_constant_state, (st_data_t)block, 1);
-    return true;
 }
 
 static int
@@ -491,7 +490,7 @@ rb_yjit_compile_iseq(const rb_iseq_t *iseq, rb_execution_context_t *ec)
 
     if (code_ptr) {
         // Map the code address to the corresponding opcode
-        int first_opcode = opcode_at_pc(iseq, &encoded[0]);
+        int first_opcode = yjit_opcode_at_pc(iseq, &encoded[0]);
         map_addr2insn(code_ptr, first_opcode);
         encoded[0] = (VALUE)code_ptr;
     }
@@ -601,6 +600,39 @@ rb_yjit_constant_state_changed(void)
     }
 }
 
+// Callback from the opt_setinlinecache instruction in the interpreter
+void
+yjit_constant_ic_update(const rb_iseq_t *iseq, IC ic)
+{
+    RB_VM_LOCK_ENTER();
+    rb_vm_barrier(); // Stop other ractors since we are going to patch machine code.
+    {
+
+        const struct rb_iseq_constant_body *const body = iseq->body;
+        VALUE *code = body->iseq_encoded;
+
+        // This should come from a running iseq, so direct threading translation
+        // should have been done
+        RUBY_ASSERT(FL_TEST((VALUE)iseq, ISEQ_TRANSLATED));
+        RUBY_ASSERT(ic->get_insn_idx < body->iseq_size);
+        RUBY_ASSERT(rb_vm_insn_addr2insn((const void *)code[ic->get_insn_idx]) == BIN(opt_getinlinecache));
+
+        // Find the matching opt_getinlinecache and invalidate all the blocks there
+        RUBY_ASSERT(insn_op_type(BIN(opt_getinlinecache), 1) == TS_IC);
+        if (ic == (IC)code[ic->get_insn_idx + 1 + 1]) {
+            rb_yjit_block_array_t getinlinecache_blocks = yjit_get_version_array(iseq, ic->get_insn_idx);
+            rb_darray_for(getinlinecache_blocks, i) {
+                block_t *block = rb_darray_get(getinlinecache_blocks, i);
+                invalidate_block_version(block);
+            }
+        }
+        else {
+            RUBY_ASSERT(false && "ic->get_insn_idx not set properly");
+        }
+    }
+    RB_VM_LOCK_LEAVE();
+}
+
 void
 rb_yjit_before_ractor_spawn(void)
 {
diff --git a/yjit_iface.h b/yjit_iface.h
index 63871d24c2..b7af40cfd7 100644
--- a/yjit_iface.h
+++ b/yjit_iface.h
@@ -85,9 +85,9 @@ RUBY_EXTERN struct rb_yjit_runtime_counters yjit_runtime_counters;
 
 void cb_write_pre_call_bytes(codeblock_t* cb);
 void cb_write_post_call_bytes(codeblock_t* cb);
-VALUE *iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx);
-void map_addr2insn(void *code_ptr, int insn);
-int opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc);
+void yjit_map_addr2insn(void *code_ptr, int insn);
+VALUE *yjit_iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx);
+int yjit_opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc);
 
 void check_cfunc_dispatch(VALUE receiver, struct rb_callinfo *ci, void *callee, rb_callable_method_entry_t *compile_time_cme);
 bool cfunc_needs_frame(const rb_method_cfunc_t *cfunc);
@@ -95,7 +95,7 @@ bool cfunc_needs_frame(const rb_method_cfunc_t *cfunc);
 RBIMPL_ATTR_NODISCARD() bool assume_bop_not_redefined(block_t *block, int redefined_flag, enum ruby_basic_operators bop);
 void assume_method_lookup_stable(VALUE receiver_klass, const rb_callable_method_entry_t *cme, block_t *block);
 RBIMPL_ATTR_NODISCARD() bool assume_single_ractor_mode(block_t *block);
-RBIMPL_ATTR_NODISCARD() bool assume_stable_global_constant_state(block_t *block);
+void assume_stable_global_constant_state(block_t *block);
 
 // this function *must* return passed exit_pc
 const VALUE *rb_yjit_count_side_exit_op(const VALUE *exit_pc);
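
Illustrative note, not part of the patch: a minimal Ruby sketch of the scenario
the commit message describes, assuming a YJIT-enabled build with a call
threshold low enough that the method is compiled early. The constant and
method names below are made up for illustration.

    LIMIT = 100

    # The constant reference compiles to roughly:
    #   opt_getinlinecache ... getconstant :LIMIT ... opt_setinlinecache
    def under_limit?(n)
      n < LIMIT
    end

    # If YJIT compiles under_limit? while the inline cache is still empty,
    # gen_opt_getinlinecache returns YJIT_CANT_COMPILE, leaving a block that
    # unconditionally side-exits. The interpreter then executes
    # opt_setinlinecache, whose vm_ic_update now calls yjit_constant_ic_update()
    # and invalidates that block, so the next call recompiles against the
    # filled cache.
    1_000.times { |i| under_limit?(i) }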