mjit.c: introduce JIT compaction [experimental]

When all compilation finishes or the number of JIT-ed code reaches --jit-max-cache, this compacts all generated code to a single .so file and re-loads all methods from it. In the future, it may trigger compaction more frequently and/or limit the maximum times of compaction to prevent unlimited memory usage. So the current behavior is experimental, but at least the performance improvement in this commit won't be removed. === Benchmark === In this benchmark, I'll compare following four conditions: * trunk: r64082 * trunk JIT: r64082 w/ --jit * single-so JIT: This commit w/ --jit * objfcn JIT: This branch https://github.com/k0kubun/ruby/tree/objfcn w/ --jit, which is shinh's objfcn https://github.com/shinh/ruby/tree/objfcn rebased from this commit ``` $ uname -a Linux bionic 4.15.0-29-generic #31-Ubuntu SMP Tue Jul 17 15:39:52 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux ``` * Micro benchmark Using this script https://gist.github.com/k0kubun/10e6d3387c9ab1b134622b2c9d76ef51, calls some amount of different methods that just return `nil`. The following tables are its average duration seconds of 3 measurements. Smaller is better. ** 1 method (seconds) | | trunk | trunk JIT | single-so JIT | objfcn JIT | |:------|:------------------|:------------------|:------------------|:------------------| | Time | 5.576067774333296 | 5.915551971666446 | 5.833641665666619 | 5.845915191666639 | | Ratio | 1.00x | 1.06x | 1.05x | 1.05x | ** 50 methods (seconds) | | trunk | trunk JIT | single-so JIT | objfcn JIT | |:------|:------------------|:------------------|:------------------|:------------------| | Time | 3.1661167996666677| 6.125825928333342 | 4.135432743666665 | 3.750358728333348 | | Ratio | 1.00x | 1.93x | 1.31x | 1.18x | ** 1500 methods (seconds) | | trunk | trunk JIT | single-so JIT | objfcn JIT | |:------|:------------------|:------------------|:------------------|:------------------| | Time | 5.971650823666664 | 19.579182102999994| 10.511108153999961| 10.854653588999932| | Ratio | 1.00x | 3.28x | 1.76x | 1.82x | * Discourse Using the same benchmark strategy as https://bugs.ruby-lang.org/issues/14490 with this branch https://github.com/k0kubun/discourse/commits/benchmark2 forked from discourse v1.8.11 to support running trunk. 1. Run ruby script/bench.rb to warm up profiling database 2. Run RUBYOPT='--jit-verbose=1 --jit-max-cache=10000' RAILS_ENV=profile bin/puma -e production 3. WAIT 5-15 or so minutes for all jitting to stop so we have no cross talk 4. Run ab -n 100 http://localhost:9292/ 5. Wait for all new jitting to finish 6. Run ab -n 100 http://localhost:9292/ ** Response time (ms) Here is the response time milliseconds for each percentile. Skipping 99%ile because it's the same as 100%ile in 100 calls. | | trunk| trunk|single|objfcn| | | | JIT|so JIT| JIT| |:----|:-----|:-----|:-----|:-----| | 50% | 38 | 45 | 41 | 43 | | 66% | 39 | 50 | 44 | 44 | | 75% | 47 | 51 | 46 | 45 | | 80% | 49 | 52 | 47 | 47 | | 90% | 50 | 63 | 50 | 52 | | 95% | 60 | 79 | 52 | 55 | | 98% | 91 | 114 | 91 | 91 | |100% | 97 | 133 | 96 | 99 | ** Ratio (smaller is better) Here is the response time increase ratio against no-JIT trunk's one. | | trunk| trunk|single|objfcn| | | | JIT|so JIT| JIT| |:----|:-----|:-----|:-----|:-----| | 50% | 1.00x| 1.18x| 1.08x| 1.13x| | 66% | 1.00x| 1.28x| 1.13x| 1.13x| | 75% | 1.00x| 1.09x| 0.98x| 0.96x| | 80% | 1.00x| 1.06x| 0.96x| 0.96x| | 90% | 1.00x| 1.26x| 1.00x| 1.04x| | 95% | 1.00x| 1.32x| 0.87x| 0.92x| | 98% | 1.00x| 1.25x| 1.00x| 1.00x| |100% | 1.00x| 1.37x| 0.99x| 1.02x| While 50 and 60 %ile are still worse than no-JIT trunk, 75, 80, 90, 95, 98 and 100% are not slower than that. So now it's a little harder to say "MJIT slows down Rails applications". Probably I can close [Bug #14490] now. Let's start improving it. Close https://github.com/ruby/ruby/pull/1921 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@64094 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2022-11-09 12:17:21 -05:00 · 2018-07-28 16:14:56 +00:00 · 2018-07-28 16:14:56 +00:00 · 443f4d583c
commit 443f4d583c
parent 2a9cae3328
2 changed files with 121 additions and 15 deletions
--- a/mjit.c
+++ b/mjit.c
@ -198,6 +198,8 @@ int mjit_call_p = FALSE;
 static struct rb_mjit_unit_list unit_queue;
 /* List of units which are successfully compiled. */
 static struct rb_mjit_unit_list active_units;
 /* List of compacted so files which will be deleted in `mjit_finish()`. */
 static struct rb_mjit_unit_list compact_units;
 /* The number of so far processed ISEQs, used to generate unique id.  */
 static int current_unit_num;
 /* A mutex for conitionals and critical sections.  */
@ -774,6 +776,8 @@ make_pch(void)
 #define append_str(p, str) append_str2(p, str, sizeof(str)-1)
 #define append_lit(p, str) append_str2(p, str, rb_strlen_lit(str))
 #define MJIT_TMP_PREFIX "_ruby_mjit_"
 #ifdef _MSC_VER
 /* Compile C file to so. It returns 1 if it succeeds. (mswin) */
@ -808,7 +812,7 @@ compile_c_to_so(const char *c_file, const char *so_file)
    return exit_code == 0;
 }
-#else
+#else /* _MSC_VER */
 /* Compile .c file to .o file. It returns 1 if it succeeds. (non-mswin) */
 static int
@ -841,13 +845,13 @@ compile_c_to_o(const char *c_file, const char *o_file)
    return exit_code == 0;
 }
-/* Link .o file to .so file. It returns 1 if it succeeds. (non-mswin) */
+/* Link .o files to .so file. It returns 1 if it succeeds. (non-mswin) */
 static int
-link_o_to_so(const char *o_file, const char *so_file)
+link_o_to_so(const char **o_files, const char *so_file)
 {
    int exit_code;
-    const char *files[] = {
+    const char *options[] = {
-        "-o", NULL, NULL,
+        "-o", NULL,
 # ifdef _WIN32
        libruby_pathflag,
 # endif
@ -855,10 +859,9 @@ link_o_to_so(const char *o_file, const char *so_file)
    };
    char **args;
-    files[1] = so_file;
+    options[1] = so_file;
-    files[2] = o_file;
+    args = form_args(6, CC_LDSHARED_ARGS, CC_CODEFLAG_ARGS,
-    args = form_args(5, CC_LDSHARED_ARGS, CC_CODEFLAG_ARGS,
+                     options, o_files, CC_LIBS, CC_DLDFLAGS_ARGS);
                     files, CC_LIBS, CC_DLDFLAGS_ARGS);
    if (args == NULL)
        return FALSE;
@ -870,7 +873,94 @@ link_o_to_so(const char *o_file, const char *so_file)
    return exit_code == 0;
 }
 /* Link all cached .o files and build a .so file. Reload all JIT func from it. This
   allows to avoid JIT code fragmentation and improve performance to call JIT-ed code.  */
 static void
 compact_all_jit_code(void)
 {
    struct rb_mjit_unit *unit;
    struct rb_mjit_unit_node *node;
    double start_time, end_time;
    static const char so_ext[] = DLEXT;
    char so_file[MAXPATHLEN];
    const char **o_files;
    int i = 0, success;
    /* Abnormal use case of rb_mjit_unit that doesn't have ISeq */
    unit = (struct rb_mjit_unit *)calloc(1, sizeof(struct rb_mjit_unit)); /* To prevent GC, don't use ZALLOC */
    if (unit == NULL) return;
    unit->id = current_unit_num++;
    sprint_uniq_filename(so_file, (int)sizeof(so_file), unit->id, MJIT_TMP_PREFIX, so_ext);
    /* NULL-ending for form_args */
    o_files = (const char **)alloca(sizeof(char *) * (active_units.length + 1));
    o_files[active_units.length] = NULL;
    CRITICAL_SECTION_START(3, "in compact_all_jit_code to keep .o files");
    for (node = active_units.head; node != NULL; node = node->next) {
        o_files[i] = node->unit->o_file;
        i++;
    }
    start_time = real_ms_time();
    success = link_o_to_so(o_files, so_file);
    end_time = real_ms_time();
    /* TODO: Shrink this big critical section. For now, this is needed to prevent failure by missing .o files.
       This assumes that o -> so link doesn't take long time because the bottleneck, which is compiler optimization,
       is already done. But actually it takes about 500ms for 5,000 methods on my Linux machine, so it's better to
       finish this critical section before link_o_to_so by disabling unload_units. */
    CRITICAL_SECTION_FINISH(3, "in compact_all_jit_code to keep .o files");
    if (success) {
        void *handle = dlopen(so_file, RTLD_NOW);
        if (handle == NULL) {
            if (mjit_opts.warnings || mjit_opts.verbose)
                fprintf(stderr, "MJIT warning: failure in loading code from compacted '%s': %s\n", so_file, dlerror());
            free(unit);
            return;
        }
        unit->handle = handle;
        /* lazily dlclose handle (and .so file for win32) on `mjit_finish()`. */
        node = (struct rb_mjit_unit_node *)calloc(1, sizeof(struct rb_mjit_unit_node)); /* To prevent GC, don't use ZALLOC */
        node->unit = unit;
        add_to_list(node, &compact_units);
        if (!mjit_opts.save_temps) {
 #ifdef _WIN32
            unit->so_file = strdup(so_file); /* lazily delete on `clean_object_files()` */
 #else
            remove_file(so_file);
 #endif
        }
        CRITICAL_SECTION_START(3, "in compact_all_jit_code to read list");
        for (node = active_units.head; node != NULL; node = node->next) {
            void *func;
            char funcname[35]; /* TODO: reconsider `35` */
            sprintf(funcname, "_mjit%d", node->unit->id);
            if ((func = dlsym(handle, funcname)) == NULL) {
                if (mjit_opts.warnings || mjit_opts.verbose)
                    fprintf(stderr, "MJIT warning: skipping to reload '%s' from '%s': %s\n", funcname, so_file, dlerror());
                continue;
            }
            if (node->unit->iseq) { /* Check whether GCed or not */
                /* Usage of jit_code might be not in a critical section.  */
                MJIT_ATOMIC_SET(node->unit->iseq->body->jit_func, (mjit_func_t)func);
            }
        }
        CRITICAL_SECTION_FINISH(3, "in compact_all_jit_code to read list");
        verbose(1, "JIT compaction (%.1fms): Compacted %d methods -> %s", end_time - start_time, active_units.length, so_file);
    }
    else {
        free(unit);
        verbose(1, "JIT compaction failure (%.1fms): Failed to compact methods", end_time - start_time);
    }
 }
 #endif /* _MSC_VER */
 static void *
 load_func_from_so(const char *so_file, const char *funcname, struct rb_mjit_unit *unit)
@ -889,8 +979,6 @@ load_func_from_so(const char *so_file, const char *funcname, struct rb_mjit_unit
    return func;
 }
 #define MJIT_TMP_PREFIX "_ruby_mjit_"
 #ifndef __clang__
 static const char *
 header_name_end(const char *s)
@ -930,7 +1018,7 @@ remove_file(const char *filename)
 static mjit_func_t
 convert_unit_to_func(struct rb_mjit_unit *unit)
 {
-    char c_file_buff[70], *c_file = c_file_buff, *so_file, funcname[35];
+    char c_file_buff[MAXPATHLEN], *c_file = c_file_buff, *so_file, funcname[35]; /* TODO: reconsider `35` */
    int success;
    int fd;
    FILE *f;
@ -1045,7 +1133,8 @@ convert_unit_to_func(struct rb_mjit_unit *unit)
 #else
    /* splitting .c -> .o step and .o -> .so step, to cache .o files in the future */
    if (success = compile_c_to_o(c_file, o_file)) {
-        success = link_o_to_so(o_file, so_file);
+        const char *o_files[] = { o_file, NULL };
        success = link_o_to_so(o_files, so_file);
        if (!mjit_opts.save_temps)
            unit->o_file = strdup(o_file); /* lazily delete on `clean_object_files()` */
@ -1127,6 +1216,13 @@ worker(void)
            }
            remove_from_list(node, &unit_queue);
            CRITICAL_SECTION_FINISH(3, "in jit func replace");
 #ifndef _MSC_VER
            /* Combine .o files to one .so and reload all jit_func to improve memory locality */
            if ((unit_queue.length == 0 && active_units.length > 1) || active_units.length == mjit_opts.max_cache_size) {
                compact_all_jit_code();
            }
 #endif
        }
    }
@ -1699,6 +1795,7 @@ mjit_finish(void)
    mjit_call_p = FALSE;
    free_list(&unit_queue);
    free_list(&active_units);
    free_list(&compact_units);
    finish_conts();
    mjit_enabled = FALSE;
--- a/test/ruby/test_jit.rb
+++ b/test/ruby/test_jit.rb
@ -7,6 +7,11 @@ require_relative '../lib/jit_support'
 class TestJIT < Test::Unit::TestCase
  include JITSupport
  IGNORABLE_PATTERNS = [
    /\ASuccessful MJIT finish\n\z/,
    /\AJIT compaction \(\d+\.\dms\): Compacted \d+ methods ->/,
  ]
  # trace_* insns are not compiled for now...
  TEST_PENDING_INSNS = RubyVM::INSTRUCTION_NAMES.select { |n| n.start_with?('trace_') }.map(&:to_sym) + [
    # not supported yet
@ -544,7 +549,9 @@ class TestJIT < Test::Unit::TestCase
        end
      end;
      assert_equal('0123456789', out)
-      errs = err.lines
+      errs = err.lines.reject do |l|
        IGNORABLE_PATTERNS.any? { |pat| pat.match?(l) }
      end
      assert_match(/\A#{JIT_SUCCESS_PREFIX}: block in <main>@-e:/, errs[0])
      9.times do |i|
        assert_match(/\A#{JIT_SUCCESS_PREFIX}: mjit#{i}@\(eval\):/, errs[i + 1])
@ -776,7 +783,9 @@ class TestJIT < Test::Unit::TestCase
    if stdout
      assert_equal(stdout, out, "Expected stdout #{out.inspect} to match #{stdout.inspect} with script:\n#{code_block(script)}")
    end
-    err_lines = err.lines.reject! { |l| l.chomp.empty? || l.match?(/\A#{JIT_SUCCESS_PREFIX}/) || l == "Successful MJIT finish\n" }
+    err_lines = err.lines.reject! do |l|
      l.chomp.empty? || l.match?(/\A#{JIT_SUCCESS_PREFIX}/) || IGNORABLE_PATTERNS.any? { |pat| pat.match?(l) }
    end
    unless err_lines.empty?
      warn err_lines.join(''), uplevel: uplevel
    end