1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

mjit.c: introduce JIT compaction [experimental]

When all compilation finishes or the number of JIT-ed code reaches
--jit-max-cache, this compacts all generated code to a single .so file
and re-loads all methods from it.

In the future, it may trigger compaction more frequently and/or limit
the maximum number of compactions to prevent unlimited memory usage.
So the current behavior is experimental, but at least the performance
improvement in this commit won't be removed.

=== Benchmark ===
In this benchmark, I'll compare the following four conditions:

* trunk: r64082
* trunk JIT: r64082 w/ --jit
* single-so JIT: This commit w/ --jit
* objfcn JIT: This branch https://github.com/k0kubun/ruby/tree/objfcn w/ --jit,
  which is shinh's objfcn https://github.com/shinh/ruby/tree/objfcn rebased from this commit

```
$ uname -a
Linux bionic 4.15.0-29-generic #31-Ubuntu SMP Tue Jul 17 15:39:52 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
```

* Micro benchmark
Using this script https://gist.github.com/k0kubun/10e6d3387c9ab1b134622b2c9d76ef51,
this calls a given number of different methods that just return `nil`. The following tables
show the average duration in seconds over 3 measurements.

Smaller is better.

** 1 method (seconds)
|       | trunk             | trunk JIT         | single-so JIT     | objfcn JIT        |
|:------|:------------------|:------------------|:------------------|:------------------|
| Time  | 5.576067774333296 | 5.915551971666446 | 5.833641665666619 | 5.845915191666639 |
| Ratio | 1.00x             | 1.06x             | 1.05x             | 1.05x             |

** 50 methods (seconds)
|       | trunk             | trunk JIT         | single-so JIT     | objfcn JIT        |
|:------|:------------------|:------------------|:------------------|:------------------|
| Time  | 3.1661167996666677| 6.125825928333342 | 4.135432743666665 | 3.750358728333348 |
| Ratio | 1.00x             | 1.93x             | 1.31x             | 1.18x             |

** 1500 methods (seconds)
|       | trunk             | trunk JIT         | single-so JIT     | objfcn JIT        |
|:------|:------------------|:------------------|:------------------|:------------------|
| Time  | 5.971650823666664 | 19.579182102999994| 10.511108153999961| 10.854653588999932|
| Ratio | 1.00x             | 3.28x             | 1.76x             | 1.82x             |

* Discourse
Using the same benchmark strategy as https://bugs.ruby-lang.org/issues/14490 with
this branch https://github.com/k0kubun/discourse/commits/benchmark2 forked from discourse
v1.8.11 to support running trunk.

1. Run ruby script/bench.rb to warm up profiling database
2. Run RUBYOPT='--jit-verbose=1 --jit-max-cache=10000' RAILS_ENV=profile bin/puma -e production
3. WAIT 5-15 or so minutes for all jitting to stop so we have no cross talk
4. Run ab -n 100 http://localhost:9292/
5. Wait for all new jitting to finish
6. Run ab -n 100 http://localhost:9292/

** Response time (ms)
Here is the response time in milliseconds for each percentile.
Skipping 99%ile because it's the same as 100%ile in 100 calls.

|     | trunk| trunk|single|objfcn|
|     |      |   JIT|so JIT|   JIT|
|:----|:-----|:-----|:-----|:-----|
| 50% |   38 |   45 |   41 |   43 |
| 66% |   39 |   50 |   44 |   44 |
| 75% |   47 |   51 |   46 |   45 |
| 80% |   49 |   52 |   47 |   47 |
| 90% |   50 |   63 |   50 |   52 |
| 95% |   60 |   79 |   52 |   55 |
| 98% |   91 |  114 |   91 |   91 |
|100% |   97 |  133 |   96 |   99 |

** Ratio (smaller is better)
Here is the ratio of each response time to that of no-JIT trunk.

|     | trunk| trunk|single|objfcn|
|     |      |   JIT|so JIT|   JIT|
|:----|:-----|:-----|:-----|:-----|
| 50% | 1.00x| 1.18x| 1.08x| 1.13x|
| 66% | 1.00x| 1.28x| 1.13x| 1.13x|
| 75% | 1.00x| 1.09x| 0.98x| 0.96x|
| 80% | 1.00x| 1.06x| 0.96x| 0.96x|
| 90% | 1.00x| 1.26x| 1.00x| 1.04x|
| 95% | 1.00x| 1.32x| 0.87x| 0.92x|
| 98% | 1.00x| 1.25x| 1.00x| 1.00x|
|100% | 1.00x| 1.37x| 0.99x| 1.02x|

While 50 and 66 %ile are still worse than no-JIT trunk, 75, 80, 90, 95,
98 and 100 %ile are not slower than that.

So now it's a little harder to say "MJIT slows down Rails applications".
Probably I can close [Bug #14490] now. Let's start improving it.

Close https://github.com/ruby/ruby/pull/1921

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@64094 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
k0kubun 2018-07-28 16:14:56 +00:00
parent 2a9cae3328
commit 443f4d583c
2 changed files with 121 additions and 15 deletions

123
mjit.c
View file

@ -198,6 +198,8 @@ int mjit_call_p = FALSE;
static struct rb_mjit_unit_list unit_queue; static struct rb_mjit_unit_list unit_queue;
/* List of units which are successfully compiled. */ /* List of units which are successfully compiled. */
static struct rb_mjit_unit_list active_units; static struct rb_mjit_unit_list active_units;
/* List of compacted so files which will be deleted in `mjit_finish()`. */
static struct rb_mjit_unit_list compact_units;
/* The number of so far processed ISEQs, used to generate unique id. */ /* The number of so far processed ISEQs, used to generate unique id. */
static int current_unit_num; static int current_unit_num;
/* A mutex for conditionals and critical sections. */ /* A mutex for conditionals and critical sections. */
@ -774,6 +776,8 @@ make_pch(void)
#define append_str(p, str) append_str2(p, str, sizeof(str)-1) #define append_str(p, str) append_str2(p, str, sizeof(str)-1)
#define append_lit(p, str) append_str2(p, str, rb_strlen_lit(str)) #define append_lit(p, str) append_str2(p, str, rb_strlen_lit(str))
#define MJIT_TMP_PREFIX "_ruby_mjit_"
#ifdef _MSC_VER #ifdef _MSC_VER
/* Compile C file to so. It returns 1 if it succeeds. (mswin) */ /* Compile C file to so. It returns 1 if it succeeds. (mswin) */
@ -808,7 +812,7 @@ compile_c_to_so(const char *c_file, const char *so_file)
return exit_code == 0; return exit_code == 0;
} }
#else #else /* _MSC_VER */
/* Compile .c file to .o file. It returns 1 if it succeeds. (non-mswin) */ /* Compile .c file to .o file. It returns 1 if it succeeds. (non-mswin) */
static int static int
@ -841,13 +845,13 @@ compile_c_to_o(const char *c_file, const char *o_file)
return exit_code == 0; return exit_code == 0;
} }
/* Link .o file to .so file. It returns 1 if it succeeds. (non-mswin) */ /* Link .o files to .so file. It returns 1 if it succeeds. (non-mswin) */
static int static int
link_o_to_so(const char *o_file, const char *so_file) link_o_to_so(const char **o_files, const char *so_file)
{ {
int exit_code; int exit_code;
const char *files[] = { const char *options[] = {
"-o", NULL, NULL, "-o", NULL,
# ifdef _WIN32 # ifdef _WIN32
libruby_pathflag, libruby_pathflag,
# endif # endif
@ -855,10 +859,9 @@ link_o_to_so(const char *o_file, const char *so_file)
}; };
char **args; char **args;
files[1] = so_file; options[1] = so_file;
files[2] = o_file; args = form_args(6, CC_LDSHARED_ARGS, CC_CODEFLAG_ARGS,
args = form_args(5, CC_LDSHARED_ARGS, CC_CODEFLAG_ARGS, options, o_files, CC_LIBS, CC_DLDFLAGS_ARGS);
files, CC_LIBS, CC_DLDFLAGS_ARGS);
if (args == NULL) if (args == NULL)
return FALSE; return FALSE;
@ -870,7 +873,94 @@ link_o_to_so(const char *o_file, const char *so_file)
return exit_code == 0; return exit_code == 0;
} }
/* Link all cached .o files and build a .so file. Reload all JIT func from it. This
allows to avoid JIT code fragmentation and improve performance to call JIT-ed code. */
static void
compact_all_jit_code(void)
{
struct rb_mjit_unit *unit;
struct rb_mjit_unit_node *node;
double start_time, end_time;
static const char so_ext[] = DLEXT;
char so_file[MAXPATHLEN];
const char **o_files;
int i = 0, success;
/* Abnormal use case of rb_mjit_unit that doesn't have ISeq */
unit = (struct rb_mjit_unit *)calloc(1, sizeof(struct rb_mjit_unit)); /* To prevent GC, don't use ZALLOC */
if (unit == NULL) return;
unit->id = current_unit_num++;
sprint_uniq_filename(so_file, (int)sizeof(so_file), unit->id, MJIT_TMP_PREFIX, so_ext);
/* NULL-ending for form_args */
o_files = (const char **)alloca(sizeof(char *) * (active_units.length + 1));
o_files[active_units.length] = NULL;
CRITICAL_SECTION_START(3, "in compact_all_jit_code to keep .o files");
for (node = active_units.head; node != NULL; node = node->next) {
o_files[i] = node->unit->o_file;
i++;
}
start_time = real_ms_time();
success = link_o_to_so(o_files, so_file);
end_time = real_ms_time();
/* TODO: Shrink this big critical section. For now, this is needed to prevent failure by missing .o files.
This assumes that o -> so link doesn't take long time because the bottleneck, which is compiler optimization,
is already done. But actually it takes about 500ms for 5,000 methods on my Linux machine, so it's better to
finish this critical section before link_o_to_so by disabling unload_units. */
CRITICAL_SECTION_FINISH(3, "in compact_all_jit_code to keep .o files");
if (success) {
void *handle = dlopen(so_file, RTLD_NOW);
if (handle == NULL) {
if (mjit_opts.warnings || mjit_opts.verbose)
fprintf(stderr, "MJIT warning: failure in loading code from compacted '%s': %s\n", so_file, dlerror());
free(unit);
return;
}
unit->handle = handle;
/* lazily dlclose handle (and .so file for win32) on `mjit_finish()`. */
node = (struct rb_mjit_unit_node *)calloc(1, sizeof(struct rb_mjit_unit_node)); /* To prevent GC, don't use ZALLOC */
node->unit = unit;
add_to_list(node, &compact_units);
if (!mjit_opts.save_temps) {
#ifdef _WIN32
unit->so_file = strdup(so_file); /* lazily delete on `clean_object_files()` */
#else
remove_file(so_file);
#endif #endif
}
CRITICAL_SECTION_START(3, "in compact_all_jit_code to read list");
for (node = active_units.head; node != NULL; node = node->next) {
void *func;
char funcname[35]; /* TODO: reconsider `35` */
sprintf(funcname, "_mjit%d", node->unit->id);
if ((func = dlsym(handle, funcname)) == NULL) {
if (mjit_opts.warnings || mjit_opts.verbose)
fprintf(stderr, "MJIT warning: skipping to reload '%s' from '%s': %s\n", funcname, so_file, dlerror());
continue;
}
if (node->unit->iseq) { /* Check whether GCed or not */
/* Usage of jit_code might be not in a critical section. */
MJIT_ATOMIC_SET(node->unit->iseq->body->jit_func, (mjit_func_t)func);
}
}
CRITICAL_SECTION_FINISH(3, "in compact_all_jit_code to read list");
verbose(1, "JIT compaction (%.1fms): Compacted %d methods -> %s", end_time - start_time, active_units.length, so_file);
}
else {
free(unit);
verbose(1, "JIT compaction failure (%.1fms): Failed to compact methods", end_time - start_time);
}
}
#endif /* _MSC_VER */
static void * static void *
load_func_from_so(const char *so_file, const char *funcname, struct rb_mjit_unit *unit) load_func_from_so(const char *so_file, const char *funcname, struct rb_mjit_unit *unit)
@ -889,8 +979,6 @@ load_func_from_so(const char *so_file, const char *funcname, struct rb_mjit_unit
return func; return func;
} }
#define MJIT_TMP_PREFIX "_ruby_mjit_"
#ifndef __clang__ #ifndef __clang__
static const char * static const char *
header_name_end(const char *s) header_name_end(const char *s)
@ -930,7 +1018,7 @@ remove_file(const char *filename)
static mjit_func_t static mjit_func_t
convert_unit_to_func(struct rb_mjit_unit *unit) convert_unit_to_func(struct rb_mjit_unit *unit)
{ {
char c_file_buff[70], *c_file = c_file_buff, *so_file, funcname[35]; char c_file_buff[MAXPATHLEN], *c_file = c_file_buff, *so_file, funcname[35]; /* TODO: reconsider `35` */
int success; int success;
int fd; int fd;
FILE *f; FILE *f;
@ -1045,7 +1133,8 @@ convert_unit_to_func(struct rb_mjit_unit *unit)
#else #else
/* splitting .c -> .o step and .o -> .so step, to cache .o files in the future */ /* splitting .c -> .o step and .o -> .so step, to cache .o files in the future */
if (success = compile_c_to_o(c_file, o_file)) { if (success = compile_c_to_o(c_file, o_file)) {
success = link_o_to_so(o_file, so_file); const char *o_files[] = { o_file, NULL };
success = link_o_to_so(o_files, so_file);
if (!mjit_opts.save_temps) if (!mjit_opts.save_temps)
unit->o_file = strdup(o_file); /* lazily delete on `clean_object_files()` */ unit->o_file = strdup(o_file); /* lazily delete on `clean_object_files()` */
@ -1127,6 +1216,13 @@ worker(void)
} }
remove_from_list(node, &unit_queue); remove_from_list(node, &unit_queue);
CRITICAL_SECTION_FINISH(3, "in jit func replace"); CRITICAL_SECTION_FINISH(3, "in jit func replace");
#ifndef _MSC_VER
/* Combine .o files to one .so and reload all jit_func to improve memory locality */
if ((unit_queue.length == 0 && active_units.length > 1) || active_units.length == mjit_opts.max_cache_size) {
compact_all_jit_code();
}
#endif
} }
} }
@ -1699,6 +1795,7 @@ mjit_finish(void)
mjit_call_p = FALSE; mjit_call_p = FALSE;
free_list(&unit_queue); free_list(&unit_queue);
free_list(&active_units); free_list(&active_units);
free_list(&compact_units);
finish_conts(); finish_conts();
mjit_enabled = FALSE; mjit_enabled = FALSE;

View file

@ -7,6 +7,11 @@ require_relative '../lib/jit_support'
class TestJIT < Test::Unit::TestCase class TestJIT < Test::Unit::TestCase
include JITSupport include JITSupport
IGNORABLE_PATTERNS = [
/\ASuccessful MJIT finish\n\z/,
/\AJIT compaction \(\d+\.\dms\): Compacted \d+ methods ->/,
]
# trace_* insns are not compiled for now... # trace_* insns are not compiled for now...
TEST_PENDING_INSNS = RubyVM::INSTRUCTION_NAMES.select { |n| n.start_with?('trace_') }.map(&:to_sym) + [ TEST_PENDING_INSNS = RubyVM::INSTRUCTION_NAMES.select { |n| n.start_with?('trace_') }.map(&:to_sym) + [
# not supported yet # not supported yet
@ -544,7 +549,9 @@ class TestJIT < Test::Unit::TestCase
end end
end; end;
assert_equal('0123456789', out) assert_equal('0123456789', out)
errs = err.lines errs = err.lines.reject do |l|
IGNORABLE_PATTERNS.any? { |pat| pat.match?(l) }
end
assert_match(/\A#{JIT_SUCCESS_PREFIX}: block in <main>@-e:/, errs[0]) assert_match(/\A#{JIT_SUCCESS_PREFIX}: block in <main>@-e:/, errs[0])
9.times do |i| 9.times do |i|
assert_match(/\A#{JIT_SUCCESS_PREFIX}: mjit#{i}@\(eval\):/, errs[i + 1]) assert_match(/\A#{JIT_SUCCESS_PREFIX}: mjit#{i}@\(eval\):/, errs[i + 1])
@ -776,7 +783,9 @@ class TestJIT < Test::Unit::TestCase
if stdout if stdout
assert_equal(stdout, out, "Expected stdout #{out.inspect} to match #{stdout.inspect} with script:\n#{code_block(script)}") assert_equal(stdout, out, "Expected stdout #{out.inspect} to match #{stdout.inspect} with script:\n#{code_block(script)}")
end end
err_lines = err.lines.reject! { |l| l.chomp.empty? || l.match?(/\A#{JIT_SUCCESS_PREFIX}/) || l == "Successful MJIT finish\n" } err_lines = err.lines.reject! do |l|
l.chomp.empty? || l.match?(/\A#{JIT_SUCCESS_PREFIX}/) || IGNORABLE_PATTERNS.any? { |pat| pat.match?(l) }
end
unless err_lines.empty? unless err_lines.empty?
warn err_lines.join(''), uplevel: uplevel warn err_lines.join(''), uplevel: uplevel
end end