mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
mjit.c: introduce JIT compaction [experimental]
When all compilation finishes or the number of JIT-ed code reaches --jit-max-cache, this compacts all generated code to a single .so file and re-loads all methods from it. In the future, it may trigger compaction more frequently and/or limit the maximum times of compaction to prevent unlimited memory usage. So the current behavior is experimental, but at least the performance improvement in this commit won't be removed. === Benchmark === In this benchmark, I'll compare following four conditions: * trunk: r64082 * trunk JIT: r64082 w/ --jit * single-so JIT: This commit w/ --jit * objfcn JIT: This branch https://github.com/k0kubun/ruby/tree/objfcn w/ --jit, which is shinh's objfcn https://github.com/shinh/ruby/tree/objfcn rebased from this commit ``` $ uname -a Linux bionic 4.15.0-29-generic #31-Ubuntu SMP Tue Jul 17 15:39:52 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux ``` * Micro benchmark Using this script https://gist.github.com/k0kubun/10e6d3387c9ab1b134622b2c9d76ef51, calls some amount of different methods that just return `nil`. The following tables are its average duration seconds of 3 measurements. Smaller is better. ** 1 method (seconds) | | trunk | trunk JIT | single-so JIT | objfcn JIT | |:------|:------------------|:------------------|:------------------|:------------------| | Time | 5.576067774333296 | 5.915551971666446 | 5.833641665666619 | 5.845915191666639 | | Ratio | 1.00x | 1.06x | 1.05x | 1.05x | ** 50 methods (seconds) | | trunk | trunk JIT | single-so JIT | objfcn JIT | |:------|:------------------|:------------------|:------------------|:------------------| | Time | 3.1661167996666677| 6.125825928333342 | 4.135432743666665 | 3.750358728333348 | | Ratio | 1.00x | 1.93x | 1.31x | 1.18x | ** 1500 methods (seconds) | | trunk | trunk JIT | single-so JIT | objfcn JIT | |:------|:------------------|:------------------|:------------------|:------------------| | Time | 5.971650823666664 | 19.579182102999994| 10.511108153999961| 10.854653588999932| | Ratio | 1.00x | 3.28x | 1.76x | 1.82x | * Discourse Using the same benchmark strategy as https://bugs.ruby-lang.org/issues/14490 with this branch https://github.com/k0kubun/discourse/commits/benchmark2 forked from discourse v1.8.11 to support running trunk. 1. Run ruby script/bench.rb to warm up profiling database 2. Run RUBYOPT='--jit-verbose=1 --jit-max-cache=10000' RAILS_ENV=profile bin/puma -e production 3. WAIT 5-15 or so minutes for all jitting to stop so we have no cross talk 4. Run ab -n 100 http://localhost:9292/ 5. Wait for all new jitting to finish 6. Run ab -n 100 http://localhost:9292/ ** Response time (ms) Here is the response time milliseconds for each percentile. Skipping 99%ile because it's the same as 100%ile in 100 calls. | | trunk| trunk|single|objfcn| | | | JIT|so JIT| JIT| |:----|:-----|:-----|:-----|:-----| | 50% | 38 | 45 | 41 | 43 | | 66% | 39 | 50 | 44 | 44 | | 75% | 47 | 51 | 46 | 45 | | 80% | 49 | 52 | 47 | 47 | | 90% | 50 | 63 | 50 | 52 | | 95% | 60 | 79 | 52 | 55 | | 98% | 91 | 114 | 91 | 91 | |100% | 97 | 133 | 96 | 99 | ** Ratio (smaller is better) Here is the response time increase ratio against no-JIT trunk's one. | | trunk| trunk|single|objfcn| | | | JIT|so JIT| JIT| |:----|:-----|:-----|:-----|:-----| | 50% | 1.00x| 1.18x| 1.08x| 1.13x| | 66% | 1.00x| 1.28x| 1.13x| 1.13x| | 75% | 1.00x| 1.09x| 0.98x| 0.96x| | 80% | 1.00x| 1.06x| 0.96x| 0.96x| | 90% | 1.00x| 1.26x| 1.00x| 1.04x| | 95% | 1.00x| 1.32x| 0.87x| 0.92x| | 98% | 1.00x| 1.25x| 1.00x| 1.00x| |100% | 1.00x| 1.37x| 0.99x| 1.02x| While 50 and 60 %ile are still worse than no-JIT trunk, 75, 80, 90, 95, 98 and 100% are not slower than that. So now it's a little harder to say "MJIT slows down Rails applications". Probably I can close [Bug #14490] now. Let's start improving it. Close https://github.com/ruby/ruby/pull/1921 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@64094 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
2a9cae3328
commit
443f4d583c
2 changed files with 121 additions and 15 deletions
123
mjit.c
123
mjit.c
|
@ -198,6 +198,8 @@ int mjit_call_p = FALSE;
|
|||
static struct rb_mjit_unit_list unit_queue;
|
||||
/* List of units which are successfully compiled. */
|
||||
static struct rb_mjit_unit_list active_units;
|
||||
/* List of compacted so files which will be deleted in `mjit_finish()`. */
|
||||
static struct rb_mjit_unit_list compact_units;
|
||||
/* The number of so far processed ISEQs, used to generate unique id. */
|
||||
static int current_unit_num;
|
||||
/* A mutex for conitionals and critical sections. */
|
||||
|
@ -774,6 +776,8 @@ make_pch(void)
|
|||
#define append_str(p, str) append_str2(p, str, sizeof(str)-1)
|
||||
#define append_lit(p, str) append_str2(p, str, rb_strlen_lit(str))
|
||||
|
||||
#define MJIT_TMP_PREFIX "_ruby_mjit_"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
/* Compile C file to so. It returns 1 if it succeeds. (mswin) */
|
||||
|
@ -808,7 +812,7 @@ compile_c_to_so(const char *c_file, const char *so_file)
|
|||
return exit_code == 0;
|
||||
}
|
||||
|
||||
#else
|
||||
#else /* _MSC_VER */
|
||||
|
||||
/* Compile .c file to .o file. It returns 1 if it succeeds. (non-mswin) */
|
||||
static int
|
||||
|
@ -841,13 +845,13 @@ compile_c_to_o(const char *c_file, const char *o_file)
|
|||
return exit_code == 0;
|
||||
}
|
||||
|
||||
/* Link .o file to .so file. It returns 1 if it succeeds. (non-mswin) */
|
||||
/* Link .o files to .so file. It returns 1 if it succeeds. (non-mswin) */
|
||||
static int
|
||||
link_o_to_so(const char *o_file, const char *so_file)
|
||||
link_o_to_so(const char **o_files, const char *so_file)
|
||||
{
|
||||
int exit_code;
|
||||
const char *files[] = {
|
||||
"-o", NULL, NULL,
|
||||
const char *options[] = {
|
||||
"-o", NULL,
|
||||
# ifdef _WIN32
|
||||
libruby_pathflag,
|
||||
# endif
|
||||
|
@ -855,10 +859,9 @@ link_o_to_so(const char *o_file, const char *so_file)
|
|||
};
|
||||
char **args;
|
||||
|
||||
files[1] = so_file;
|
||||
files[2] = o_file;
|
||||
args = form_args(5, CC_LDSHARED_ARGS, CC_CODEFLAG_ARGS,
|
||||
files, CC_LIBS, CC_DLDFLAGS_ARGS);
|
||||
options[1] = so_file;
|
||||
args = form_args(6, CC_LDSHARED_ARGS, CC_CODEFLAG_ARGS,
|
||||
options, o_files, CC_LIBS, CC_DLDFLAGS_ARGS);
|
||||
if (args == NULL)
|
||||
return FALSE;
|
||||
|
||||
|
@ -870,7 +873,94 @@ link_o_to_so(const char *o_file, const char *so_file)
|
|||
return exit_code == 0;
|
||||
}
|
||||
|
||||
/* Link all cached .o files and build a .so file. Reload all JIT func from it. This
|
||||
allows to avoid JIT code fragmentation and improve performance to call JIT-ed code. */
|
||||
static void
|
||||
compact_all_jit_code(void)
|
||||
{
|
||||
struct rb_mjit_unit *unit;
|
||||
struct rb_mjit_unit_node *node;
|
||||
double start_time, end_time;
|
||||
static const char so_ext[] = DLEXT;
|
||||
char so_file[MAXPATHLEN];
|
||||
const char **o_files;
|
||||
int i = 0, success;
|
||||
|
||||
/* Abnormal use case of rb_mjit_unit that doesn't have ISeq */
|
||||
unit = (struct rb_mjit_unit *)calloc(1, sizeof(struct rb_mjit_unit)); /* To prevent GC, don't use ZALLOC */
|
||||
if (unit == NULL) return;
|
||||
unit->id = current_unit_num++;
|
||||
sprint_uniq_filename(so_file, (int)sizeof(so_file), unit->id, MJIT_TMP_PREFIX, so_ext);
|
||||
|
||||
/* NULL-ending for form_args */
|
||||
o_files = (const char **)alloca(sizeof(char *) * (active_units.length + 1));
|
||||
o_files[active_units.length] = NULL;
|
||||
CRITICAL_SECTION_START(3, "in compact_all_jit_code to keep .o files");
|
||||
for (node = active_units.head; node != NULL; node = node->next) {
|
||||
o_files[i] = node->unit->o_file;
|
||||
i++;
|
||||
}
|
||||
|
||||
start_time = real_ms_time();
|
||||
success = link_o_to_so(o_files, so_file);
|
||||
end_time = real_ms_time();
|
||||
|
||||
/* TODO: Shrink this big critical section. For now, this is needed to prevent failure by missing .o files.
|
||||
This assumes that o -> so link doesn't take long time because the bottleneck, which is compiler optimization,
|
||||
is already done. But actually it takes about 500ms for 5,000 methods on my Linux machine, so it's better to
|
||||
finish this critical section before link_o_to_so by disabling unload_units. */
|
||||
CRITICAL_SECTION_FINISH(3, "in compact_all_jit_code to keep .o files");
|
||||
|
||||
if (success) {
|
||||
void *handle = dlopen(so_file, RTLD_NOW);
|
||||
if (handle == NULL) {
|
||||
if (mjit_opts.warnings || mjit_opts.verbose)
|
||||
fprintf(stderr, "MJIT warning: failure in loading code from compacted '%s': %s\n", so_file, dlerror());
|
||||
free(unit);
|
||||
return;
|
||||
}
|
||||
unit->handle = handle;
|
||||
|
||||
/* lazily dlclose handle (and .so file for win32) on `mjit_finish()`. */
|
||||
node = (struct rb_mjit_unit_node *)calloc(1, sizeof(struct rb_mjit_unit_node)); /* To prevent GC, don't use ZALLOC */
|
||||
node->unit = unit;
|
||||
add_to_list(node, &compact_units);
|
||||
|
||||
if (!mjit_opts.save_temps) {
|
||||
#ifdef _WIN32
|
||||
unit->so_file = strdup(so_file); /* lazily delete on `clean_object_files()` */
|
||||
#else
|
||||
remove_file(so_file);
|
||||
#endif
|
||||
}
|
||||
|
||||
CRITICAL_SECTION_START(3, "in compact_all_jit_code to read list");
|
||||
for (node = active_units.head; node != NULL; node = node->next) {
|
||||
void *func;
|
||||
char funcname[35]; /* TODO: reconsider `35` */
|
||||
sprintf(funcname, "_mjit%d", node->unit->id);
|
||||
|
||||
if ((func = dlsym(handle, funcname)) == NULL) {
|
||||
if (mjit_opts.warnings || mjit_opts.verbose)
|
||||
fprintf(stderr, "MJIT warning: skipping to reload '%s' from '%s': %s\n", funcname, so_file, dlerror());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node->unit->iseq) { /* Check whether GCed or not */
|
||||
/* Usage of jit_code might be not in a critical section. */
|
||||
MJIT_ATOMIC_SET(node->unit->iseq->body->jit_func, (mjit_func_t)func);
|
||||
}
|
||||
}
|
||||
CRITICAL_SECTION_FINISH(3, "in compact_all_jit_code to read list");
|
||||
verbose(1, "JIT compaction (%.1fms): Compacted %d methods -> %s", end_time - start_time, active_units.length, so_file);
|
||||
}
|
||||
else {
|
||||
free(unit);
|
||||
verbose(1, "JIT compaction failure (%.1fms): Failed to compact methods", end_time - start_time);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* _MSC_VER */
|
||||
|
||||
static void *
|
||||
load_func_from_so(const char *so_file, const char *funcname, struct rb_mjit_unit *unit)
|
||||
|
@ -889,8 +979,6 @@ load_func_from_so(const char *so_file, const char *funcname, struct rb_mjit_unit
|
|||
return func;
|
||||
}
|
||||
|
||||
#define MJIT_TMP_PREFIX "_ruby_mjit_"
|
||||
|
||||
#ifndef __clang__
|
||||
static const char *
|
||||
header_name_end(const char *s)
|
||||
|
@ -930,7 +1018,7 @@ remove_file(const char *filename)
|
|||
static mjit_func_t
|
||||
convert_unit_to_func(struct rb_mjit_unit *unit)
|
||||
{
|
||||
char c_file_buff[70], *c_file = c_file_buff, *so_file, funcname[35];
|
||||
char c_file_buff[MAXPATHLEN], *c_file = c_file_buff, *so_file, funcname[35]; /* TODO: reconsider `35` */
|
||||
int success;
|
||||
int fd;
|
||||
FILE *f;
|
||||
|
@ -1045,7 +1133,8 @@ convert_unit_to_func(struct rb_mjit_unit *unit)
|
|||
#else
|
||||
/* splitting .c -> .o step and .o -> .so step, to cache .o files in the future */
|
||||
if (success = compile_c_to_o(c_file, o_file)) {
|
||||
success = link_o_to_so(o_file, so_file);
|
||||
const char *o_files[] = { o_file, NULL };
|
||||
success = link_o_to_so(o_files, so_file);
|
||||
|
||||
if (!mjit_opts.save_temps)
|
||||
unit->o_file = strdup(o_file); /* lazily delete on `clean_object_files()` */
|
||||
|
@ -1127,6 +1216,13 @@ worker(void)
|
|||
}
|
||||
remove_from_list(node, &unit_queue);
|
||||
CRITICAL_SECTION_FINISH(3, "in jit func replace");
|
||||
|
||||
#ifndef _MSC_VER
|
||||
/* Combine .o files to one .so and reload all jit_func to improve memory locality */
|
||||
if ((unit_queue.length == 0 && active_units.length > 1) || active_units.length == mjit_opts.max_cache_size) {
|
||||
compact_all_jit_code();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1699,6 +1795,7 @@ mjit_finish(void)
|
|||
mjit_call_p = FALSE;
|
||||
free_list(&unit_queue);
|
||||
free_list(&active_units);
|
||||
free_list(&compact_units);
|
||||
finish_conts();
|
||||
|
||||
mjit_enabled = FALSE;
|
||||
|
|
|
@ -7,6 +7,11 @@ require_relative '../lib/jit_support'
|
|||
class TestJIT < Test::Unit::TestCase
|
||||
include JITSupport
|
||||
|
||||
IGNORABLE_PATTERNS = [
|
||||
/\ASuccessful MJIT finish\n\z/,
|
||||
/\AJIT compaction \(\d+\.\dms\): Compacted \d+ methods ->/,
|
||||
]
|
||||
|
||||
# trace_* insns are not compiled for now...
|
||||
TEST_PENDING_INSNS = RubyVM::INSTRUCTION_NAMES.select { |n| n.start_with?('trace_') }.map(&:to_sym) + [
|
||||
# not supported yet
|
||||
|
@ -544,7 +549,9 @@ class TestJIT < Test::Unit::TestCase
|
|||
end
|
||||
end;
|
||||
assert_equal('0123456789', out)
|
||||
errs = err.lines
|
||||
errs = err.lines.reject do |l|
|
||||
IGNORABLE_PATTERNS.any? { |pat| pat.match?(l) }
|
||||
end
|
||||
assert_match(/\A#{JIT_SUCCESS_PREFIX}: block in <main>@-e:/, errs[0])
|
||||
9.times do |i|
|
||||
assert_match(/\A#{JIT_SUCCESS_PREFIX}: mjit#{i}@\(eval\):/, errs[i + 1])
|
||||
|
@ -776,7 +783,9 @@ class TestJIT < Test::Unit::TestCase
|
|||
if stdout
|
||||
assert_equal(stdout, out, "Expected stdout #{out.inspect} to match #{stdout.inspect} with script:\n#{code_block(script)}")
|
||||
end
|
||||
err_lines = err.lines.reject! { |l| l.chomp.empty? || l.match?(/\A#{JIT_SUCCESS_PREFIX}/) || l == "Successful MJIT finish\n" }
|
||||
err_lines = err.lines.reject! do |l|
|
||||
l.chomp.empty? || l.match?(/\A#{JIT_SUCCESS_PREFIX}/) || IGNORABLE_PATTERNS.any? { |pat| pat.match?(l) }
|
||||
end
|
||||
unless err_lines.empty?
|
||||
warn err_lines.join(''), uplevel: uplevel
|
||||
end
|
||||
|
|
Loading…
Add table
Reference in a new issue