1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00
ruby--ruby/insns.def

1518 lines
31 KiB
Modula-2
Raw Normal View History

/* -*- C -*-
insns.def - YARV instruction definitions
$Author: $
created at: 04/01/01 01:17:55 JST
Copyright (C) 2004-2007 Koichi Sasada
Massive rewrite by @shyouhei in 2017.
*/
/* Some comments about this file's contents:
- The new format aims to be editable by C editor of your choice;
your mileage might vary of course.
- Each instruction is written in the following format:
DEFINE_INSN
instruction_name
(type operand, type operand, ..)
(pop_values, ..)
(return values ..)
// attr type name contents..
{
.. // insn body
}
- Unlike the old format which was line-oriented, you can now place
newlines and comments at liberal positions.
- `DEFINE_INSN` is a keyword.
- An instruction name must be a valid C identifier.
- Operands, pop values, return values are series of either variable
declarations, keyword `void`, or keyword `...`. They are much
like C function declarations.
- Attribute pragmas are optional, and can include arbitrary C
expressions. You can write anything there but as of writing,
supported attributes are:
* sp_inc: Used to dynamically calculate sp increase in
`insn_stack_increase`.
* handles_sp: If it is true, VM deals with sp in the insn.
The default is true if the instruction takes an ISEQ operand,
and false otherwise.
* leaf: indicates that the instruction is "leaf" i.e. it does
not introduce new stack frame on top of it.
If an instruction handles sp, that can never be a leaf.
- Attributes can access operands, but not stack (push/pop) variables.
- An instruction's body is a pure C block, copied verbatim into
the generated C source code.
*/
/* nop: an instruction that does nothing.
It declares no operands, pops no values and pushes no values. */
DEFINE_INSN
nop
()
()
()
{
/* none */
}
/**********************************************************/
/* deal with variables */
/**********************************************************/
/* Get local variable (pointed by `idx' and `level') and push it as `val`.
   'level' indicates the nesting depth from the current block.
 */
DEFINE_INSN
getlocal
(lindex_t idx, rb_num_t level)
()
(VALUE val)
{
    /* Resolve the environment pointer for `level`, then read the slot
     * located `idx` entries below it. */
    val = *(vm_get_ep(GET_EP(), level) - idx);
    RB_DEBUG_COUNTER_INC(lvar_get);
    /* Reads that reach into an outer scope are counted separately. */
    (void)RB_DEBUG_COUNTER_INC_IF(lvar_get_dynamic, level > 0);
}
/* Set a local variable (pointed to by 'idx') as val.
   'level' indicates the nesting depth from the current block.
   The value to store is popped from the stack.
 */
DEFINE_INSN
setlocal
(lindex_t idx, rb_num_t level)
(VALUE val)
()
{
    /* Go through vm_env_write() instead of a raw store; the slot is
     * addressed as a negative offset from the resolved environment. */
    vm_env_write(vm_get_ep(GET_EP(), level), -(int)idx, val);
    RB_DEBUG_COUNTER_INC(lvar_set);
    /* Writes that reach into an outer scope are counted separately. */
    (void)RB_DEBUG_COUNTER_INC_IF(lvar_set_dynamic, level > 0);
}
/* Get a block parameter.
   While VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM is clear, the value is obtained
   from the environment's block handler via rb_vm_bh_to_procval(), stored
   into the local slot, and the flag is set.  Once the flag is set the slot
   is read directly.
 */
DEFINE_INSN
getblockparam
(lindex_t idx, rb_num_t level)
()
(VALUE val)
{
    const VALUE *local_ep = vm_get_ep(GET_EP(), level);
    VM_ASSERT(VM_ENV_LOCAL_P(local_ep));

    if (VM_ENV_FLAGS(local_ep, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM)) {
        /* The slot has already been filled in; this is a plain read. */
        val = *(local_ep - idx);
        RB_DEBUG_COUNTER_INC(lvar_get);
        (void)RB_DEBUG_COUNTER_INC_IF(lvar_get_dynamic, level > 0);
    }
    else {
        /* First access: build the value from the block handler and
         * remember it in the slot for subsequent reads. */
        val = rb_vm_bh_to_procval(ec, VM_ENV_BLOCK_HANDLER(local_ep));
        vm_env_write(local_ep, -(int)idx, val);
        VM_ENV_FLAGS_SET(local_ep, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM);
    }
}
/* Set block parameter.
   Writes the popped `val` into the local slot and sets
   VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM on the environment.
 */
DEFINE_INSN
setblockparam
(lindex_t idx, rb_num_t level)
(VALUE val)
()
{
    const VALUE *local_ep = vm_get_ep(GET_EP(), level);
    VM_ASSERT(VM_ENV_LOCAL_P(local_ep));

    /* Store the value, then raise the flag that getblockparam and
     * getblockparamproxy test before reading the slot directly. */
    vm_env_write(local_ep, -(int)idx, val);
    VM_ENV_FLAGS_SET(local_ep, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM);

    /* Debug counters only; they do not depend on the writes above. */
    RB_DEBUG_COUNTER_INC(lvar_set);
    (void)RB_DEBUG_COUNTER_INC_IF(lvar_set_dynamic, level > 0);
}
/* Get special proxy object which only responds to `call` method if the block parameter
represents an iseq/ifunc block. Otherwise, same as `getblockparam`.
Summary of the cases handled below, while VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM is clear:
- iseq/ifunc handler: push rb_block_param_proxy; the slot and the flag are left untouched.
- symbol/proc handler: convert it, then store it in the slot and set the flag.
- no handler: push nil, store it in the slot and set the flag.
When the flag is already set, the slot is read directly.
*/
DEFINE_INSN
getblockparamproxy
(lindex_t idx, rb_num_t level)
()
(VALUE val)
{
const VALUE *ep = vm_get_ep(GET_EP(), level);
VM_ASSERT(VM_ENV_LOCAL_P(ep));
/* Flag clear: the slot has not been written yet, so look at the block handler. */
if (!VM_ENV_FLAGS(ep, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM)) {
VALUE block_handler = VM_ENV_BLOCK_HANDLER(ep);
if (block_handler) {
switch (vm_block_handler_type(block_handler)) {
case block_handler_type_iseq:
case block_handler_type_ifunc:
/* `break` leaves the switch and skips the else branch below,
 * so nothing is written back for this case. */
val = rb_block_param_proxy;
break;
case block_handler_type_symbol:
val = rb_sym_to_proc(VM_BH_TO_SYMBOL(block_handler));
goto INSN_LABEL(set);
case block_handler_type_proc:
val = VM_BH_TO_PROC(block_handler);
goto INSN_LABEL(set);
default:
VM_UNREACHABLE(getblockparamproxy);
}
}
else {
val = Qnil;
/* Shared tail: the symbol and proc cases above jump here, into the
 * middle of this else branch, to store `val` and set the flag. */
INSN_LABEL(set):
vm_env_write(ep, -(int)idx, val);
VM_ENV_FLAGS_SET(ep, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM);
}
}
else {
/* Flag set: the slot already holds the value; plain local read. */
val = *(ep - idx);
RB_DEBUG_COUNTER_INC(lvar_get);
(void)RB_DEBUG_COUNTER_INC_IF(lvar_get_dynamic, level > 0);
}
}
/* Get value of special local variable ($~, $_, ..).
`key` and `type` are passed to vm_getspecial() together with the
local environment pointer (GET_LEP()); the result is pushed as `val`. */
DEFINE_INSN
getspecial
(rb_num_t key, rb_num_t type)
()
(VALUE val)
/* `$~ = MatchData.allocate; $&` can raise, so the instruction is
 * declared a leaf only when `type` is 0. */
// attr bool leaf = (type == 0) ? true : false;
{
val = vm_getspecial(ec, GET_LEP(), key, type);
}
/* Set value of special local variable ($~, $_, ...) to obj.
`obj` is popped from the stack and stored through lep_svar_set()
using the local environment pointer (GET_LEP()). */
DEFINE_INSN
setspecial
(rb_num_t key)
(VALUE obj)
()
{
lep_svar_set(ec, GET_LEP(), key, obj);
}
/* Get value of instance variable id of self.
`ic` is the cache operand handed to vm_getinstancevariable(). */
DEFINE_INSN
getinstancevariable
(ID id, IVC ic)
()
(VALUE val)
/* Ractor crashes when it accesses class/module-level instance variables. */
// attr bool leaf = false; /* has IVAR_ACCESSOR_SHOULD_BE_MAIN_RACTOR() */
{
val = vm_getinstancevariable(GET_ISEQ(), GET_SELF(), id, ic);
}
/* Set value of instance variable id of self to val.
`val` is popped from the stack; `ic` is the cache operand handed to
vm_setinstancevariable(). */
DEFINE_INSN
setinstancevariable
(ID id, IVC ic)
(VALUE val)
()
// attr bool leaf = false; /* has rb_check_frozen_internal() */
{
vm_setinstancevariable(GET_ISEQ(), GET_SELF(), id, val, ic);
}
/* Get value of class variable id of klass as val. */
DEFINE_INSN
getclassvariable
Add a cache for class variables Redo of 34a2acdac788602c14bf05fb616215187badd504 and 931138b00696419945dc03e10f033b1f53cd50f3 which were reverted. GitHub PR #4340. This change implements a cache for class variables. Previously there was no cache for cvars. Cvar access is slow due to needing to travel all the way up th ancestor tree before returning the cvar value. The deeper the ancestor tree the slower cvar access will be. The benefits of the cache are more visible with a higher number of included modules due to the way Ruby looks up class variables. The benchmark here includes 26 modules and shows with the cache, this branch is 6.5x faster when accessing class variables. ``` compare-ruby: ruby 3.1.0dev (2021-03-15T06:22:34Z master 9e5105c) [x86_64-darwin19] built-ruby: ruby 3.1.0dev (2021-03-15T12:12:44Z add-cache-for-clas.. c6be009) [x86_64-darwin19] | |compare-ruby|built-ruby| |:--------|-----------:|---------:| |vm_cvar | 5.681M| 36.980M| | | -| 6.51x| ``` Benchmark.ips calling `ActiveRecord::Base.logger` from within a Rails application. ActiveRecord::Base.logger has 71 ancestors. The more ancestors a tree has, the more clear the speed increase. IE if Base had only one ancestor we'd see no improvement. This benchmark is run on a vanilla Rails application. Benchmark code: ```ruby require "benchmark/ips" require_relative "config/environment" Benchmark.ips do |x| x.report "logger" do ActiveRecord::Base.logger end end ``` Ruby 3.0 master / Rails 6.1: ``` Warming up -------------------------------------- logger 155.251k i/100ms Calculating ------------------------------------- ``` Ruby 3.0 with cvar cache / Rails 6.1: ``` Warming up -------------------------------------- logger 1.546M i/100ms Calculating ------------------------------------- logger 14.857M (± 4.8%) i/s - 74.198M in 5.006202s ``` Lastly we ran a benchmark to demonstate the difference between master and our cache when the number of modules increases. 
This benchmark measures 1 ancestor, 30 ancestors, and 100 ancestors. Ruby 3.0 master: ``` Warming up -------------------------------------- 1 module 1.231M i/100ms 30 modules 432.020k i/100ms 100 modules 145.399k i/100ms Calculating ------------------------------------- 1 module 12.210M (± 2.1%) i/s - 61.553M in 5.043400s 30 modules 4.354M (± 2.7%) i/s - 22.033M in 5.063839s 100 modules 1.434M (± 2.9%) i/s - 7.270M in 5.072531s Comparison: 1 module: 12209958.3 i/s 30 modules: 4354217.8 i/s - 2.80x (± 0.00) slower 100 modules: 1434447.3 i/s - 8.51x (± 0.00) slower ``` Ruby 3.0 with cvar cache: ``` Warming up -------------------------------------- 1 module 1.641M i/100ms 30 modules 1.655M i/100ms 100 modules 1.620M i/100ms Calculating ------------------------------------- 1 module 16.279M (± 3.8%) i/s - 82.038M in 5.046923s 30 modules 15.891M (± 3.9%) i/s - 79.459M in 5.007958s 100 modules 16.087M (± 3.6%) i/s - 81.005M in 5.041931s Comparison: 1 module: 16279458.0 i/s 100 modules: 16087484.6 i/s - same-ish: difference falls within error 30 modules: 15891406.2 i/s - same-ish: difference falls within error ``` Co-authored-by: Aaron Patterson <tenderlove@ruby-lang.org>
2021-06-01 13:34:06 -04:00
(ID id, IVC ic)
()
(VALUE val)
/* "class variable access from toplevel" warning can be hooked. */
// attr bool leaf = false; /* has rb_warning() */
{
Add a cache for class variables Redo of 34a2acdac788602c14bf05fb616215187badd504 and 931138b00696419945dc03e10f033b1f53cd50f3 which were reverted. GitHub PR #4340. This change implements a cache for class variables. Previously there was no cache for cvars. Cvar access is slow due to needing to travel all the way up th ancestor tree before returning the cvar value. The deeper the ancestor tree the slower cvar access will be. The benefits of the cache are more visible with a higher number of included modules due to the way Ruby looks up class variables. The benchmark here includes 26 modules and shows with the cache, this branch is 6.5x faster when accessing class variables. ``` compare-ruby: ruby 3.1.0dev (2021-03-15T06:22:34Z master 9e5105c) [x86_64-darwin19] built-ruby: ruby 3.1.0dev (2021-03-15T12:12:44Z add-cache-for-clas.. c6be009) [x86_64-darwin19] | |compare-ruby|built-ruby| |:--------|-----------:|---------:| |vm_cvar | 5.681M| 36.980M| | | -| 6.51x| ``` Benchmark.ips calling `ActiveRecord::Base.logger` from within a Rails application. ActiveRecord::Base.logger has 71 ancestors. The more ancestors a tree has, the more clear the speed increase. IE if Base had only one ancestor we'd see no improvement. This benchmark is run on a vanilla Rails application. Benchmark code: ```ruby require "benchmark/ips" require_relative "config/environment" Benchmark.ips do |x| x.report "logger" do ActiveRecord::Base.logger end end ``` Ruby 3.0 master / Rails 6.1: ``` Warming up -------------------------------------- logger 155.251k i/100ms Calculating ------------------------------------- ``` Ruby 3.0 with cvar cache / Rails 6.1: ``` Warming up -------------------------------------- logger 1.546M i/100ms Calculating ------------------------------------- logger 14.857M (± 4.8%) i/s - 74.198M in 5.006202s ``` Lastly we ran a benchmark to demonstate the difference between master and our cache when the number of modules increases. 
This benchmark measures 1 ancestor, 30 ancestors, and 100 ancestors. Ruby 3.0 master: ``` Warming up -------------------------------------- 1 module 1.231M i/100ms 30 modules 432.020k i/100ms 100 modules 145.399k i/100ms Calculating ------------------------------------- 1 module 12.210M (± 2.1%) i/s - 61.553M in 5.043400s 30 modules 4.354M (± 2.7%) i/s - 22.033M in 5.063839s 100 modules 1.434M (± 2.9%) i/s - 7.270M in 5.072531s Comparison: 1 module: 12209958.3 i/s 30 modules: 4354217.8 i/s - 2.80x (± 0.00) slower 100 modules: 1434447.3 i/s - 8.51x (± 0.00) slower ``` Ruby 3.0 with cvar cache: ``` Warming up -------------------------------------- 1 module 1.641M i/100ms 30 modules 1.655M i/100ms 100 modules 1.620M i/100ms Calculating ------------------------------------- 1 module 16.279M (± 3.8%) i/s - 82.038M in 5.046923s 30 modules 15.891M (± 3.9%) i/s - 79.459M in 5.007958s 100 modules 16.087M (± 3.6%) i/s - 81.005M in 5.041931s Comparison: 1 module: 16279458.0 i/s 100 modules: 16087484.6 i/s - same-ish: difference falls within error 30 modules: 15891406.2 i/s - same-ish: difference falls within error ``` Co-authored-by: Aaron Patterson <tenderlove@ruby-lang.org>
2021-06-01 13:34:06 -04:00
rb_cref_t * cref = vm_get_cref(GET_EP());
rb_control_frame_t *cfp = GET_CFP();
val = vm_getclassvariable(GET_ISEQ(), cref, cfp, id, (ICVARC)ic);
}
/* Set value of class variable id of klass as val. */
DEFINE_INSN
setclassvariable
Add a cache for class variables Redo of 34a2acdac788602c14bf05fb616215187badd504 and 931138b00696419945dc03e10f033b1f53cd50f3 which were reverted. GitHub PR #4340. This change implements a cache for class variables. Previously there was no cache for cvars. Cvar access is slow due to needing to travel all the way up th ancestor tree before returning the cvar value. The deeper the ancestor tree the slower cvar access will be. The benefits of the cache are more visible with a higher number of included modules due to the way Ruby looks up class variables. The benchmark here includes 26 modules and shows with the cache, this branch is 6.5x faster when accessing class variables. ``` compare-ruby: ruby 3.1.0dev (2021-03-15T06:22:34Z master 9e5105c) [x86_64-darwin19] built-ruby: ruby 3.1.0dev (2021-03-15T12:12:44Z add-cache-for-clas.. c6be009) [x86_64-darwin19] | |compare-ruby|built-ruby| |:--------|-----------:|---------:| |vm_cvar | 5.681M| 36.980M| | | -| 6.51x| ``` Benchmark.ips calling `ActiveRecord::Base.logger` from within a Rails application. ActiveRecord::Base.logger has 71 ancestors. The more ancestors a tree has, the more clear the speed increase. IE if Base had only one ancestor we'd see no improvement. This benchmark is run on a vanilla Rails application. Benchmark code: ```ruby require "benchmark/ips" require_relative "config/environment" Benchmark.ips do |x| x.report "logger" do ActiveRecord::Base.logger end end ``` Ruby 3.0 master / Rails 6.1: ``` Warming up -------------------------------------- logger 155.251k i/100ms Calculating ------------------------------------- ``` Ruby 3.0 with cvar cache / Rails 6.1: ``` Warming up -------------------------------------- logger 1.546M i/100ms Calculating ------------------------------------- logger 14.857M (± 4.8%) i/s - 74.198M in 5.006202s ``` Lastly we ran a benchmark to demonstate the difference between master and our cache when the number of modules increases. 
This benchmark measures 1 ancestor, 30 ancestors, and 100 ancestors. Ruby 3.0 master: ``` Warming up -------------------------------------- 1 module 1.231M i/100ms 30 modules 432.020k i/100ms 100 modules 145.399k i/100ms Calculating ------------------------------------- 1 module 12.210M (± 2.1%) i/s - 61.553M in 5.043400s 30 modules 4.354M (± 2.7%) i/s - 22.033M in 5.063839s 100 modules 1.434M (± 2.9%) i/s - 7.270M in 5.072531s Comparison: 1 module: 12209958.3 i/s 30 modules: 4354217.8 i/s - 2.80x (± 0.00) slower 100 modules: 1434447.3 i/s - 8.51x (± 0.00) slower ``` Ruby 3.0 with cvar cache: ``` Warming up -------------------------------------- 1 module 1.641M i/100ms 30 modules 1.655M i/100ms 100 modules 1.620M i/100ms Calculating ------------------------------------- 1 module 16.279M (± 3.8%) i/s - 82.038M in 5.046923s 30 modules 15.891M (± 3.9%) i/s - 79.459M in 5.007958s 100 modules 16.087M (± 3.6%) i/s - 81.005M in 5.041931s Comparison: 1 module: 16279458.0 i/s 100 modules: 16087484.6 i/s - same-ish: difference falls within error 30 modules: 15891406.2 i/s - same-ish: difference falls within error ``` Co-authored-by: Aaron Patterson <tenderlove@ruby-lang.org>
2021-06-01 13:34:06 -04:00
(ID id, IVC ic)
(VALUE val)
()
/* "class variable access from toplevel" warning can be hooked. */
// attr bool leaf = false; /* has rb_warning() */
{
vm_ensure_not_refinement_module(GET_SELF());
Add a cache for class variables Redo of 34a2acdac788602c14bf05fb616215187badd504 and 931138b00696419945dc03e10f033b1f53cd50f3 which were reverted. GitHub PR #4340. This change implements a cache for class variables. Previously there was no cache for cvars. Cvar access is slow due to needing to travel all the way up th ancestor tree before returning the cvar value. The deeper the ancestor tree the slower cvar access will be. The benefits of the cache are more visible with a higher number of included modules due to the way Ruby looks up class variables. The benchmark here includes 26 modules and shows with the cache, this branch is 6.5x faster when accessing class variables. ``` compare-ruby: ruby 3.1.0dev (2021-03-15T06:22:34Z master 9e5105c) [x86_64-darwin19] built-ruby: ruby 3.1.0dev (2021-03-15T12:12:44Z add-cache-for-clas.. c6be009) [x86_64-darwin19] | |compare-ruby|built-ruby| |:--------|-----------:|---------:| |vm_cvar | 5.681M| 36.980M| | | -| 6.51x| ``` Benchmark.ips calling `ActiveRecord::Base.logger` from within a Rails application. ActiveRecord::Base.logger has 71 ancestors. The more ancestors a tree has, the more clear the speed increase. IE if Base had only one ancestor we'd see no improvement. This benchmark is run on a vanilla Rails application. Benchmark code: ```ruby require "benchmark/ips" require_relative "config/environment" Benchmark.ips do |x| x.report "logger" do ActiveRecord::Base.logger end end ``` Ruby 3.0 master / Rails 6.1: ``` Warming up -------------------------------------- logger 155.251k i/100ms Calculating ------------------------------------- ``` Ruby 3.0 with cvar cache / Rails 6.1: ``` Warming up -------------------------------------- logger 1.546M i/100ms Calculating ------------------------------------- logger 14.857M (± 4.8%) i/s - 74.198M in 5.006202s ``` Lastly we ran a benchmark to demonstate the difference between master and our cache when the number of modules increases. 
This benchmark measures 1 ancestor, 30 ancestors, and 100 ancestors. Ruby 3.0 master: ``` Warming up -------------------------------------- 1 module 1.231M i/100ms 30 modules 432.020k i/100ms 100 modules 145.399k i/100ms Calculating ------------------------------------- 1 module 12.210M (± 2.1%) i/s - 61.553M in 5.043400s 30 modules 4.354M (± 2.7%) i/s - 22.033M in 5.063839s 100 modules 1.434M (± 2.9%) i/s - 7.270M in 5.072531s Comparison: 1 module: 12209958.3 i/s 30 modules: 4354217.8 i/s - 2.80x (± 0.00) slower 100 modules: 1434447.3 i/s - 8.51x (± 0.00) slower ``` Ruby 3.0 with cvar cache: ``` Warming up -------------------------------------- 1 module 1.641M i/100ms 30 modules 1.655M i/100ms 100 modules 1.620M i/100ms Calculating ------------------------------------- 1 module 16.279M (± 3.8%) i/s - 82.038M in 5.046923s 30 modules 15.891M (± 3.9%) i/s - 79.459M in 5.007958s 100 modules 16.087M (± 3.6%) i/s - 81.005M in 5.041931s Comparison: 1 module: 16279458.0 i/s 100 modules: 16087484.6 i/s - same-ish: difference falls within error 30 modules: 15891406.2 i/s - same-ish: difference falls within error ``` Co-authored-by: Aaron Patterson <tenderlove@ruby-lang.org>
2021-06-01 13:34:06 -04:00
vm_setclassvariable(GET_ISEQ(), vm_get_cref(GET_EP()), GET_CFP(), id, val, (ICVARC)ic);
}
/* Get constant variable id. If klass is Qnil and allow_nil is Qtrue, constants
are searched in the current scope. Otherwise, get constant under klass
class or module.
Both klass and allow_nil are popped from the stack; the result is pushed as val.
*/
DEFINE_INSN
getconstant
(ID id)
(VALUE klass, VALUE allow_nil)
(VALUE val)
/* getconstant can kick autoload */
// attr bool leaf = false; /* has rb_autoload_load() */
{
/* allow_nil is reduced to a C boolean: only exactly Qtrue enables it. */
val = vm_get_ev_const(ec, klass, id, allow_nil == Qtrue, 0);
}
/* Set constant variable id under cbase class or module.
Both val and cbase are popped from the stack.
*/
DEFINE_INSN
setconstant
(ID id)
(VALUE val, VALUE cbase)
()
/* Assigning an object to a constant is basically a leaf operation.
* The problem is, assigning a Module instance to a constant _names_
* that module. Naming involves string manipulations, which are
* method calls. */
// attr bool leaf = false; /* has StringValue() */
{
/* Both checks run before the constant is actually defined. */
vm_check_if_namespace(cbase);
vm_ensure_not_refinement_module(GET_SELF());
rb_const_set(cbase, id, val);
}
/* Get global variable gid and push its value as val.
Declared non-leaf. */
DEFINE_INSN
getglobal
(ID gid)
()
(VALUE val)
// attr bool leaf = false;
{
val = rb_gvar_get(gid);
}
/* Set global variable gid to val, which is popped from the stack.
Declared non-leaf. */
DEFINE_INSN
setglobal
(ID gid)
(VALUE val)
()
// attr bool leaf = false;
{
rb_gvar_set(gid, val);
}
/**********************************************************/
/* deal with values */
/**********************************************************/
/* Put nil onto the stack. Takes no operands and pops nothing. */
DEFINE_INSN
putnil
()
()
(VALUE val)
{
val = Qnil;
}
/* Put self (as returned by GET_SELF()) onto the stack.
Takes no operands and pops nothing. */
DEFINE_INSN
putself
()
()
(VALUE val)
{
val = GET_SELF();
}
/* put some object.
i.e. Fixnum, true, false, nil, and so on.
The operand and the return value are both named `val`, so the body
has nothing to compute.
*/
DEFINE_INSN
putobject
(VALUE val)
()
(VALUE val)
{
/* */
}
/* put special object.  "value_type" is for expansion.
   The numeric operand is converted to enum vm_special_object_type and the
   matching object is fetched with vm_get_special_object().
 */
DEFINE_INSN
putspecialobject
(rb_num_t value_type)
()
(VALUE val)
{
    /* Operands are plain numbers; convert to the enum the helper expects. */
    enum vm_special_object_type type = (enum vm_special_object_type)value_type;

    val = vm_get_special_object(GET_EP(), type);
}
/* Put string onto the stack. The string operand `str` is not pushed
itself; the value returned by rb_ec_str_resurrect() is pushed instead
(the string will be copied). */
DEFINE_INSN
putstring
(VALUE str)
()
(VALUE val)
{
val = rb_ec_str_resurrect(ec, str);
}
/* Concatenate the top `num` stack entries and put the resulting string.
   The stack shrinks by `num` entries and grows by one (see sp_inc).
 */
DEFINE_INSN
concatstrings
(rb_num_t num)
(...)
(VALUE val)
/* This instruction can concat UTF-8 and binary strings, resulting in
 * Encoding::CompatibilityError. */
// attr bool leaf = false; /* has rb_enc_cr_str_buf_cat() */
// attr rb_snum_t sp_inc = 1 - (rb_snum_t)num;
{
    /* STACK_ADDR_FROM_TOP(num) addresses the first of the `num` entries. */
    val = rb_str_concat_literals(num, STACK_ADDR_FROM_TOP(num));
}
/* push the result of to_s.
Pops `val` and `str` and pushes whatever rb_obj_as_string_result(str, val)
returns. */
DEFINE_INSN
tostring
()
(VALUE val, VALUE str)
(VALUE val)
{
val = rb_obj_as_string_result(str, val);
}
/* Build a Regexp out of the top `cnt` stack entries and push it.
   `opt` is the option for the Regexp.
   The stack shrinks by `cnt` entries and grows by one (see sp_inc).
 */
DEFINE_INSN
toregexp
(rb_num_t opt, rb_num_t cnt)
(...)
(VALUE val)
/* This instruction can raise RegexpError, thus can call
 * RegexpError#initialize */
// attr bool leaf = false;
// attr rb_snum_t sp_inc = 1 - (rb_snum_t)cnt;
{
    /* Gather the source fragments into a temporary array ... */
    VALUE *const first_fragment = STACK_ADDR_FROM_TOP(cnt);
    const VALUE fragments = rb_ary_tmp_new_from_values(0, cnt, first_fragment);

    /* ... compile them, and empty the temporary once it is no longer needed. */
    val = rb_reg_new_ary(fragments, (int)opt);
    rb_ary_clear(fragments);
}
/* Intern str to Symbol and push it.
`str` is popped from the stack and converted with rb_str_intern(). */
DEFINE_INSN
intern
()
(VALUE str)
(VALUE sym)
{
sym = rb_str_intern(str);
}
/* Put new array initialized with num values on the stack.
The stack shrinks by `num` entries and grows by one (see sp_inc). */
DEFINE_INSN
newarray
(rb_num_t num)
(...)
(VALUE val)
// attr rb_snum_t sp_inc = 1 - (rb_snum_t)num;
{
/* STACK_ADDR_FROM_TOP(num) addresses the first of the `num` entries. */
val = rb_ec_ary_new_from_values(ec, num, STACK_ADDR_FROM_TOP(num));
}
/* Put a new array built from the top `num` stack entries.  There
   should be at least one element on the stack, and the top element
   should be a hash.  If that hash is empty, it is left out of the
   array, which then has `num - 1` elements.
 */
DEFINE_INSN
newarraykwsplat
(rb_num_t num)
(...)
(VALUE val)
// attr rb_snum_t sp_inc = 1 - (rb_snum_t)num;
{
    /* The elements always start at the same address; only the length
     * depends on whether the trailing hash is empty. */
    const rb_num_t len = RHASH_EMPTY_P(*STACK_ADDR_FROM_TOP(1)) ? num-1 : num;

    val = rb_ary_new4(len, STACK_ADDR_FROM_TOP(num));
}
/* dup array
   Pushes the value returned by rb_ary_resurrect(ary), where ary is the
   instruction's operand.  Nothing is popped. */
DEFINE_INSN
duparray
(VALUE ary)
()
(VALUE val)
{
/* fire the ARRAY creation hook with the operand's length first */
RUBY_DTRACE_CREATE_HOOK(ARRAY, RARRAY_LEN(ary));
val = rb_ary_resurrect(ary);
}
Speed up hash literals by duping This commit replaces the `newhashfromarray` instruction with a `duphash` instruction. Instead of allocating a new hash from an array stored in the Instruction Sequences, store a hash directly in the instruction sequences and dup it on execution. == Instruction sequence changes == ```ruby code = <<-eorby { "foo" => "bar", "baz" => "lol" } eorby insns = RubyVM::InstructionSequence.compile(code, __FILE__, nil, 0, frozen_string_literal: true) puts insns.disasm ``` On Ruby 2.5: ``` == disasm: #<ISeq:<compiled>@test.rb:0 (0,0)-(0,36)>==================== 0000 putobject "foo" 0002 putobject "bar" 0004 putobject "baz" 0006 putobject "lol" 0008 newhash 4 0010 leave ``` Ruby 2.6@r66174 3b6321083a2e3525da3b34d08a0b68bac094bd7f: ``` $ ./ruby test.rb == disasm: #<ISeq:<compiled>@test.rb:0 (0,0)-(0,36)> (catch: FALSE) 0000 newhashfromarray 2, ["foo", "bar", "baz", "lol"] 0003 leave ``` Ruby 2.6 + This commit: ``` $ ./ruby test.rb == disasm: #<ISeq:<compiled>@test.rb:0 (0,0)-(0,36)> (catch: FALSE) 0000 duphash {"foo"=>"bar", "baz"=>"lol"} 0002 leave ``` == Benchmark Results == Compared to 2.5.3: ``` $ make benchmark ITEM=hash_literal_small COMPARE_RUBY=/Users/aaron/.rbenv/versions/2.5.3/bin/ruby generating known_errors.inc known_errors.inc unchanged ./revision.h unchanged /Users/aaron/.rbenv/shims/ruby --disable=gems -rrubygems -I./benchmark/lib ./benchmark/benchmark-driver/exe/benchmark-driver \ --executables="compare-ruby::/Users/aaron/.rbenv/versions/2.5.3/bin/ruby -I.ext/common --disable-gem" \ --executables="built-ruby::./miniruby -I./lib -I. 
-I.ext/common -r./prelude --disable-gem" \ $(find ./benchmark -maxdepth 1 -name '*hash_literal_small*.yml' -o -name '*hash_literal_small*.rb' | sort) Calculating ------------------------------------- compare-ruby built-ruby hash_literal_small2 1.498 1.877 i/s - 1.000 times in 0.667581s 0.532656s hash_literal_small4 1.197 1.642 i/s - 1.000 times in 0.835375s 0.609160s hash_literal_small8 0.620 1.215 i/s - 1.000 times in 1.611638s 0.823090s Comparison: hash_literal_small2 built-ruby: 1.9 i/s compare-ruby: 1.5 i/s - 1.25x slower hash_literal_small4 built-ruby: 1.6 i/s compare-ruby: 1.2 i/s - 1.37x slower hash_literal_small8 built-ruby: 1.2 i/s compare-ruby: 0.6 i/s - 1.96x slower ``` Compared to r66255 ``` $ make benchmark ITEM=hash_literal_small COMPARE_RUBY=/Users/aaron/.rbenv/versions/ruby-trunk/bin/ruby generating known_errors.inc known_errors.inc unchanged ./revision.h unchanged /Users/aaron/.rbenv/shims/ruby --disable=gems -rrubygems -I./benchmark/lib ./benchmark/benchmark-driver/exe/benchmark-driver \ --executables="compare-ruby::/Users/aaron/.rbenv/versions/ruby-trunk/bin/ruby -I.ext/common --disable-gem" \ --executables="built-ruby::./miniruby -I./lib -I. -I.ext/common -r./prelude --disable-gem" \ $(find ./benchmark -maxdepth 1 -name '*hash_literal_small*.yml' -o -name '*hash_literal_small*.rb' | sort) Calculating ------------------------------------- compare-ruby built-ruby hash_literal_small2 1.567 1.831 i/s - 1.000 times in 0.638056s 0.546039s hash_literal_small4 1.298 1.652 i/s - 1.000 times in 0.770214s 0.605182s hash_literal_small8 0.873 1.216 i/s - 1.000 times in 1.145304s 0.822047s Comparison: hash_literal_small2 built-ruby: 1.8 i/s compare-ruby: 1.6 i/s - 1.17x slower hash_literal_small4 built-ruby: 1.7 i/s compare-ruby: 1.3 i/s - 1.27x slower hash_literal_small8 built-ruby: 1.2 i/s compare-ruby: 0.9 i/s - 1.39x slower ``` git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@66258 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2018-12-06 13:28:21 -05:00
/* dup hash
   Pushes the value returned by rb_hash_resurrect(hash), where hash is
   the instruction's operand.  Nothing is popped. */
DEFINE_INSN
duphash
(VALUE hash)
()
(VALUE val)
{
    /* The reported size is doubled, presumably so that it counts keys
       and values separately as the newhash hook does. */
    RUBY_DTRACE_CREATE_HOOK(HASH, RHASH_SIZE(hash) << 1);
    val = rb_hash_resurrect(hash);
}
/* if TOS is an array, expand it into num objects.
   if the array has fewer than num elements, push nils to fill.
   if it has more than num elements, the exceeding elements are dropped.
   unless TOS is an array, push num - 1 nils.
   if flag & 0x01 is set, also push the array of the rest elements
   (this is the extra slot accounted for in sp_inc).
   flag: 0x01 - rest args array
   flag: 0x02 - for postarg
   flag: 0x04 - reverse?
 */
DEFINE_INSN
expandarray
(rb_num_t num, rb_num_t flag)
(..., VALUE ary)
(...)
// attr bool leaf = false; /* has rb_check_array_type() */
// attr rb_snum_t sp_inc = (rb_snum_t)num - 1 + (flag & 1 ? 1 : 0);
{
    /* the helper writes the expanded values starting at the current sp */
    vm_expandarray(GET_SP(), ary, num, (int)flag);
}
/* concat two arrays */
DEFINE_INSN
concatarray
()
split insns.def into functions Contemporary C compilers are good at function inlining. They fold multiple functions into one. However they are not yet smart enough to unfold a function into several ones. So generally speaking, it is wiser for a C programmer to manually split C functions whenever possible. That should make rooms for compilers to optimize at will. Before this changeset insns.def was converted into single HUGE function called vm_exec_core(). By moving each instruction's core into individual functions, generated C source code is reduced from 3,428 lines to 2,847 lines. Looking at the generated assembly however, it seems my compiler (gcc 6.2) is extraordinary smart so that it inlines almost all functions I introduced in this changeset back into that vm_exec_core. On my machine compiled machine binary of the function does not shrink very much in size (28,432 bytes to 26,816 bytes, according to nm(1)). I believe this change is zero-cost. Several benchmarks I exercised showed no significant difference beyond error mergin. For instance 3 repeated runs of optcarrot benchmark on my machine resulted in: before this: 28.330329285707490, 27.513378371065920, 29.40420215754537 after this: 27.107195867280414, 25.549324021385907, 30.31581919050884 in fps (greater==faster). ---- * internal.h (rb_obj_not_equal): used from vm_insnhelper.c * insns.def: move vast majority of lines into vm_insnhelper.c * vm_insnhelper.c: moved here. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@58390 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2017-04-18 06:58:49 -04:00
(VALUE ary1, VALUE ary2)
(VALUE ary)
// attr bool leaf = false; /* has rb_check_array_type() */
{
split insns.def into functions Contemporary C compilers are good at function inlining. They fold multiple functions into one. However they are not yet smart enough to unfold a function into several ones. So generally speaking, it is wiser for a C programmer to manually split C functions whenever possible. That should make rooms for compilers to optimize at will. Before this changeset insns.def was converted into single HUGE function called vm_exec_core(). By moving each instruction's core into individual functions, generated C source code is reduced from 3,428 lines to 2,847 lines. Looking at the generated assembly however, it seems my compiler (gcc 6.2) is extraordinary smart so that it inlines almost all functions I introduced in this changeset back into that vm_exec_core. On my machine compiled machine binary of the function does not shrink very much in size (28,432 bytes to 26,816 bytes, according to nm(1)). I believe this change is zero-cost. Several benchmarks I exercised showed no significant difference beyond error mergin. For instance 3 repeated runs of optcarrot benchmark on my machine resulted in: before this: 28.330329285707490, 27.513378371065920, 29.40420215754537 after this: 27.107195867280414, 25.549324021385907, 30.31581919050884 in fps (greater==faster). ---- * internal.h (rb_obj_not_equal): used from vm_insnhelper.c * insns.def: move vast majority of lines into vm_insnhelper.c * vm_insnhelper.c: moved here. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@58390 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2017-04-18 06:58:49 -04:00
ary = vm_concat_array(ary1, ary2);
}
/* call to_a on array ary to splat
   Pops ary and pushes the result of vm_splat_array(flag, ary).  The
   meaning of the flag operand is defined by that helper. */
DEFINE_INSN
splatarray
(VALUE flag)
(VALUE ary)
(VALUE obj)
// attr bool leaf = false; /* has rb_check_array_type() */
{
    obj = vm_splat_array(flag, ary);
}
/* put new Hash from num elements. num must be an even number
   (presumably alternating keys and values -- see rb_hash_bulk_insert).
   The num values are consumed and one Hash is pushed, hence
   sp_inc = 1 - num. */
DEFINE_INSN
newhash
(rb_num_t num)
(...)
(VALUE val)
// attr bool leaf = false; /* has rb_hash_key_str() */
// attr rb_snum_t sp_inc = 1 - (rb_snum_t)num;
{
    RUBY_DTRACE_CREATE_HOOK(HASH, num);

    if (num) {
        /* size the table for num / 2 entries, then insert in bulk */
        val = rb_hash_new_with_size(num / 2);
        rb_hash_bulk_insert(num, STACK_ADDR_FROM_TOP(num), val);
    }
    else {
        val = rb_hash_new();
    }
}
/* put new Range object.(Range.new(low, high, flag))
   Pops low and high (high on top) and pushes the value returned by
   rb_range_new(low, high, flag), with flag narrowed to int. */
DEFINE_INSN
newrange
(rb_num_t flag)
(VALUE low, VALUE high)
(VALUE val)
/* rb_range_new() exercises "bad value for range" check. */
// attr bool leaf = false; /* see also: range.c:range_init() */
{
val = rb_range_new(low, high, (int)flag);
}
/**********************************************************/
/* deal with stack operation */
/**********************************************************/
/* pop from stack.
   Pops one value and pushes nothing; the value is simply discarded. */
DEFINE_INSN
pop
()
(VALUE val)
()
{
/* val is deliberately unused; the cast presumably silences an
   unused-variable warning */
(void)val;
/* none */
}
/* Duplicate the stack top: pops val and pushes it back twice. */
DEFINE_INSN
dup
()
(VALUE val)
(VALUE val1, VALUE val2)
{
    val2 = val;
    val1 = val;
}
/* duplicate stack top n elements
   Copies the n values at the top of the stack, in order, into the n
   slots starting at the current stack pointer.  Nothing is popped, so
   the stack grows by n (sp_inc = n). */
DEFINE_INSN
dupn
(rb_num_t n)
(...)
(...)
// attr rb_snum_t sp_inc = n;
{
    void *dst = GET_SP();
    void *src = STACK_ADDR_FROM_TOP(n);

    MEMCPY(dst, src, VALUE, n);
}
/* swap top 2 vals
   Pops (val, obj) and pushes them back as (obj, val).  The exchange is
   expressed entirely by the pop and return lists, so the body is
   empty. */
DEFINE_INSN
swap
()
(VALUE val, VALUE obj)
(VALUE obj, VALUE val)
{
/* none */
}
/* for stack caching. */
2019-09-02 13:51:48 -04:00
DEFINE_INSN_IF(STACK_CACHING)
reput
()
(..., VALUE val)
(VALUE val)
// attr rb_snum_t sp_inc = 0;
{
/* none */
}
/* get nth stack value from stack top
   Pushes the value read through TOPN(n).  Nothing is popped, so the
   stack grows by one (sp_inc = 1). */
DEFINE_INSN
topn
(rb_num_t n)
(...)
(VALUE val)
// attr rb_snum_t sp_inc = 1;
{
val = TOPN(n);
}
/* set Nth stack entry to stack top
   Stores the stack-top value val into the entry addressed by TOPN(n).
   val is pushed back afterwards, so the stack depth does not change
   (sp_inc = 0). */
DEFINE_INSN
setn
(rb_num_t n)
(..., VALUE val)
(VALUE val)
// attr rb_snum_t sp_inc = 0;
{
    TOPN(n) = val;
}
/* discard the top n values of the current stack.
   The body is empty: the stack pointer moves only through the sp_inc
   attribute, which is -n. */
DEFINE_INSN
adjuststack
(rb_num_t n)
(...)
(...)
// attr rb_snum_t sp_inc = -(rb_snum_t)n;
{
    /* none */
}
/**********************************************************/
/* deal with setting */
/**********************************************************/
/* defined?
   Pops v and pushes the pushval operand when
   vm_defined(ec, GET_CFP(), op_type, obj, v) is true, nil otherwise.
   Whether the instruction is a leaf depends on op_type. */
DEFINE_INSN
defined
(rb_num_t op_type, VALUE obj, VALUE pushval)
(VALUE v)
(VALUE val)
// attr bool leaf = leafness_of_defined(op_type);
{
    val = vm_defined(ec, GET_CFP(), op_type, obj, v) ? pushval : Qnil;
}
/* check `target' matches `pattern'.
     `flag & VM_CHECKMATCH_TYPE_MASK' describes how to check pattern.
      VM_CHECKMATCH_TYPE_WHEN: ignore target and check pattern is truthy.
      VM_CHECKMATCH_TYPE_CASE: check `pattern === target'.
      VM_CHECKMATCH_TYPE_RESCUE: check `pattern.kind_of?(Module) && pattern === target'.
   if `flag & VM_CHECKMATCH_ARRAY' is not 0, then `pattern' is array of patterns.
   Pops target and pattern (pattern on top) and pushes the result of
   vm_check_match().  Whether the instruction is a leaf depends on flag.
 */
DEFINE_INSN
checkmatch
(rb_num_t flag)
(VALUE target, VALUE pattern)
(VALUE result)
// attr bool leaf = leafness_of_checkmatch(flag);
{
    result = vm_check_match(ec, target, pattern, flag);
}
/* check keywords are specified or not. */
* rewrite method/block parameter fitting logic to optimize keyword arguments/parameters and a splat argument. [Feature #10440] (Details are described in this ticket) Most of complex part is moved to vm_args.c. Now, ISeq#to_a does not catch up new instruction format. * vm_core.h: change iseq data structures. * introduce rb_call_info_kw_arg_t to represent keyword arguments. * add rb_call_info_t::kw_arg. * rename rb_iseq_t::arg_post_len to rb_iseq_t::arg_post_num. * rename rb_iseq_t::arg_keywords to arg_keyword_num. * rename rb_iseq_t::arg_keyword to rb_iseq_t::arg_keyword_bits. to represent keyword bitmap parameter index. This bitmap parameter shows that which keyword parameters are given or not given (0 for given). It is refered by `checkkeyword' instruction described bellow. * rename rb_iseq_t::arg_keyword_check to rb_iseq_t::arg_keyword_rest to represent keyword rest parameter index. * add rb_iseq_t::arg_keyword_default_values to represent default keyword values. * rename VM_CALL_ARGS_SKIP_SETUP to VM_CALL_ARGS_SIMPLE to represent (ci->flag & (SPLAT|BLOCKARG)) && ci->blockiseq == NULL && ci->kw_arg == NULL. * vm_insnhelper.c, vm_args.c: rewrite with refactoring. * rewrite splat argument code. * rewrite keyword arguments/parameters code. * merge method and block parameter fitting code into one code base. * vm.c, vm_eval.c: catch up these changes. * compile.c (new_callinfo): callinfo requires kw_arg parameter. * compile.c (compile_array_): check the last argument Hash object or not. If Hash object and all keys are Symbol literals, they are compiled to keyword arguments. * insns.def (checkkeyword): add new instruction. This instruction check the availability of corresponding keyword. For example, a method "def foo k1: 'v1'; end" is cimpiled to the following instructions. 0000 checkkeyword 2, 0 # check k1 is given. 
0003 branchif 9 # if given, jump to address #9 0005 putstring "v1" 0007 setlocal_OP__WC__0 3 # k1 = 'v1' 0009 trace 8 0011 putnil 0012 trace 16 0014 leave * insns.def (opt_send_simple): removed and add new instruction "opt_send_without_block". * parse.y (new_args_tail_gen): reorder variables. Before this patch, a method "def foo(k1: 1, kr1:, k2: 2, **krest, &b)" has parameter variables "k1, kr1, k2, &b, internal_id, krest", but this patch reorders to "kr1, k1, k2, internal_id, krest, &b". (locate a block variable at last) * parse.y (vtable_pop): added. This function remove latest `n' variables from vtable. * iseq.c: catch up iseq data changes. * proc.c: ditto. * class.c (keyword_error): export as rb_keyword_error(). * common.mk: depend vm_args.c for vm.o. * hash.c (rb_hash_has_key): export. * internal.h: ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@48239 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2014-11-02 13:02:55 -05:00
DEFINE_INSN
checkkeyword
(lindex_t kw_bits_index, lindex_t keyword_index)
* rewrite method/block parameter fitting logic to optimize keyword arguments/parameters and a splat argument. [Feature #10440] (Details are described in this ticket) Most of complex part is moved to vm_args.c. Now, ISeq#to_a does not catch up new instruction format. * vm_core.h: change iseq data structures. * introduce rb_call_info_kw_arg_t to represent keyword arguments. * add rb_call_info_t::kw_arg. * rename rb_iseq_t::arg_post_len to rb_iseq_t::arg_post_num. * rename rb_iseq_t::arg_keywords to arg_keyword_num. * rename rb_iseq_t::arg_keyword to rb_iseq_t::arg_keyword_bits. to represent keyword bitmap parameter index. This bitmap parameter shows that which keyword parameters are given or not given (0 for given). It is refered by `checkkeyword' instruction described bellow. * rename rb_iseq_t::arg_keyword_check to rb_iseq_t::arg_keyword_rest to represent keyword rest parameter index. * add rb_iseq_t::arg_keyword_default_values to represent default keyword values. * rename VM_CALL_ARGS_SKIP_SETUP to VM_CALL_ARGS_SIMPLE to represent (ci->flag & (SPLAT|BLOCKARG)) && ci->blockiseq == NULL && ci->kw_arg == NULL. * vm_insnhelper.c, vm_args.c: rewrite with refactoring. * rewrite splat argument code. * rewrite keyword arguments/parameters code. * merge method and block parameter fitting code into one code base. * vm.c, vm_eval.c: catch up these changes. * compile.c (new_callinfo): callinfo requires kw_arg parameter. * compile.c (compile_array_): check the last argument Hash object or not. If Hash object and all keys are Symbol literals, they are compiled to keyword arguments. * insns.def (checkkeyword): add new instruction. This instruction check the availability of corresponding keyword. For example, a method "def foo k1: 'v1'; end" is cimpiled to the following instructions. 0000 checkkeyword 2, 0 # check k1 is given. 
0003 branchif 9 # if given, jump to address #9 0005 putstring "v1" 0007 setlocal_OP__WC__0 3 # k1 = 'v1' 0009 trace 8 0011 putnil 0012 trace 16 0014 leave * insns.def (opt_send_simple): removed and add new instruction "opt_send_without_block". * parse.y (new_args_tail_gen): reorder variables. Before this patch, a method "def foo(k1: 1, kr1:, k2: 2, **krest, &b)" has parameter variables "k1, kr1, k2, &b, internal_id, krest", but this patch reorders to "kr1, k1, k2, internal_id, krest, &b". (locate a block variable at last) * parse.y (vtable_pop): added. This function remove latest `n' variables from vtable. * iseq.c: catch up iseq data changes. * proc.c: ditto. * class.c (keyword_error): export as rb_keyword_error(). * common.mk: depend vm_args.c for vm.o. * hash.c (rb_hash_has_key): export. * internal.h: ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@48239 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2014-11-02 13:02:55 -05:00
()
(VALUE ret)
{
split insns.def into functions Contemporary C compilers are good at function inlining. They fold multiple functions into one. However they are not yet smart enough to unfold a function into several ones. So generally speaking, it is wiser for a C programmer to manually split C functions whenever possible. That should make rooms for compilers to optimize at will. Before this changeset insns.def was converted into single HUGE function called vm_exec_core(). By moving each instruction's core into individual functions, generated C source code is reduced from 3,428 lines to 2,847 lines. Looking at the generated assembly however, it seems my compiler (gcc 6.2) is extraordinary smart so that it inlines almost all functions I introduced in this changeset back into that vm_exec_core. On my machine compiled machine binary of the function does not shrink very much in size (28,432 bytes to 26,816 bytes, according to nm(1)). I believe this change is zero-cost. Several benchmarks I exercised showed no significant difference beyond error mergin. For instance 3 repeated runs of optcarrot benchmark on my machine resulted in: before this: 28.330329285707490, 27.513378371065920, 29.40420215754537 after this: 27.107195867280414, 25.549324021385907, 30.31581919050884 in fps (greater==faster). ---- * internal.h (rb_obj_not_equal): used from vm_insnhelper.c * insns.def: move vast majority of lines into vm_insnhelper.c * vm_insnhelper.c: moved here. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@58390 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2017-04-18 06:58:49 -04:00
ret = vm_check_keyword(kw_bits_index, keyword_index, GET_EP());
* rewrite method/block parameter fitting logic to optimize keyword arguments/parameters and a splat argument. [Feature #10440] (Details are described in this ticket) Most of complex part is moved to vm_args.c. Now, ISeq#to_a does not catch up new instruction format. * vm_core.h: change iseq data structures. * introduce rb_call_info_kw_arg_t to represent keyword arguments. * add rb_call_info_t::kw_arg. * rename rb_iseq_t::arg_post_len to rb_iseq_t::arg_post_num. * rename rb_iseq_t::arg_keywords to arg_keyword_num. * rename rb_iseq_t::arg_keyword to rb_iseq_t::arg_keyword_bits. to represent keyword bitmap parameter index. This bitmap parameter shows that which keyword parameters are given or not given (0 for given). It is refered by `checkkeyword' instruction described bellow. * rename rb_iseq_t::arg_keyword_check to rb_iseq_t::arg_keyword_rest to represent keyword rest parameter index. * add rb_iseq_t::arg_keyword_default_values to represent default keyword values. * rename VM_CALL_ARGS_SKIP_SETUP to VM_CALL_ARGS_SIMPLE to represent (ci->flag & (SPLAT|BLOCKARG)) && ci->blockiseq == NULL && ci->kw_arg == NULL. * vm_insnhelper.c, vm_args.c: rewrite with refactoring. * rewrite splat argument code. * rewrite keyword arguments/parameters code. * merge method and block parameter fitting code into one code base. * vm.c, vm_eval.c: catch up these changes. * compile.c (new_callinfo): callinfo requires kw_arg parameter. * compile.c (compile_array_): check the last argument Hash object or not. If Hash object and all keys are Symbol literals, they are compiled to keyword arguments. * insns.def (checkkeyword): add new instruction. This instruction check the availability of corresponding keyword. For example, a method "def foo k1: 'v1'; end" is cimpiled to the following instructions. 0000 checkkeyword 2, 0 # check k1 is given. 
0003 branchif 9 # if given, jump to address #9 0005 putstring "v1" 0007 setlocal_OP__WC__0 3 # k1 = 'v1' 0009 trace 8 0011 putnil 0012 trace 16 0014 leave * insns.def (opt_send_simple): removed and add new instruction "opt_send_without_block". * parse.y (new_args_tail_gen): reorder variables. Before this patch, a method "def foo(k1: 1, kr1:, k2: 2, **krest, &b)" has parameter variables "k1, kr1, k2, &b, internal_id, krest", but this patch reorders to "kr1, k1, k2, internal_id, krest, &b". (locate a block variable at last) * parse.y (vtable_pop): added. This function remove latest `n' variables from vtable. * iseq.c: catch up iseq data changes. * proc.c: ditto. * class.c (keyword_error): export as rb_keyword_error(). * common.mk: depend vm_args.c for vm.o. * hash.c (rb_hash_has_key): export. * internal.h: ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@48239 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2014-11-02 13:02:55 -05:00
}
/* Check if val is of the given type.
   Pops val and pushes Qtrue when TYPE(val) equals the type operand,
   Qfalse otherwise. */
DEFINE_INSN
checktype
(rb_num_t type)
(VALUE val)
(VALUE ret)
{
    /* The operand is an rb_num_t; it is narrowed to int for the
       comparison with TYPE(). */
    if (TYPE(val) == (int)type) {
        ret = Qtrue;
    }
    else {
        ret = Qfalse;
    }
}
/**********************************************************/
/* deal with control flow 1: class/module */
/**********************************************************/
/* Enter class definition scope. If super is Qfalse, and class
   "klass" is defined, it's redefined. Otherwise, define "klass" class.

   Pops cbase and super, resolves the class through
   vm_find_or_create_class_by_id, then pushes a VM_FRAME_MAGIC_CLASS frame
   for class_iseq and continues execution there.
*/
DEFINE_INSN
defineclass
(ID id, ISEQ class_iseq, rb_num_t flags)
(VALUE cbase, VALUE super)
(VALUE val)
{
    VALUE klass = vm_find_or_create_class_by_id(id, flags, cbase, super);

    /* presumably makes sure a lazily loaded iseq is usable -- confirm
       against rb_iseq_check */
    rb_iseq_check(class_iseq);

    /* enter scope */
    vm_push_frame(ec, class_iseq, VM_FRAME_MAGIC_CLASS | VM_ENV_FLAG_LOCAL, klass,
                  GET_BLOCK_HANDLER(),
                  (VALUE)vm_cref_push(ec, klass, NULL, FALSE),
                  class_iseq->body->iseq_encoded, GET_SP(),
                  class_iseq->body->local_table_size,
                  class_iseq->body->stack_max);

    /* val is not assigned here: registers are reloaded for the frame just
       pushed and the next instruction is dispatched directly. */
    RESTORE_REGS();
    NEXT_INSN();
}
/* Define a method named id whose body is iseq.
   Delegates to vm_define_method with Qnil as the object and FALSE as the
   last flag; presumably the non-singleton counterpart of definesmethod.
   Pops and pushes nothing. */
DEFINE_INSN
definemethod
(ID id, ISEQ iseq)
()
()
{
vm_define_method(ec, Qnil, id, (VALUE)iseq, FALSE);
}
/* Pop obj and define a method named id, with body iseq, for it.
   Delegates to vm_define_method with obj and TRUE as the last flag;
   presumably a singleton method definition, going by the instruction
   name -- confirm in vm_define_method. Pushes nothing. */
DEFINE_INSN
definesmethod
(ID id, ISEQ iseq)
(VALUE obj)
()
{
vm_define_method(ec, obj, id, (VALUE)iseq, TRUE);
}
/**********************************************************/
/* deal with control flow 2: method/iterator */
/**********************************************************/
/* Invoke method.
   cd is the call data; its ci determines the stack effect (see sp_inc).
   blockiseq is handed to vm_caller_setup_arg_block to obtain the block
   handler, and the call itself is performed by vm_sendish using
   mexp_search_method. */
DEFINE_INSN
send
(CALL_DATA cd, ISEQ blockiseq)
(...)
(VALUE val)
// attr rb_snum_t sp_inc = sp_inc_of_sendish(cd->ci);
// attr rb_snum_t comptime_sp_inc = sp_inc_of_sendish(ci);
{
    VALUE bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, false);
    val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_method);

    /* Qundef means there is no value to push from here: reload the VM
       registers and dispatch the next instruction (presumably inside a
       frame set up by vm_sendish -- confirm there). */
    if (val == Qundef) {
        RESTORE_REGS();
        NEXT_INSN();
    }
}
/* Invoke method without block.
   Same flow as a block-taking call through vm_sendish, except that the
   block handler is always VM_BLOCK_HANDLER_NONE. cd is the call data;
   its ci determines the stack effect (see sp_inc). */
DEFINE_INSN
opt_send_without_block
(CALL_DATA cd)
(...)
(VALUE val)
// attr bool handles_sp = true;
// attr rb_snum_t sp_inc = sp_inc_of_sendish(cd->ci);
// attr rb_snum_t comptime_sp_inc = sp_inc_of_sendish(ci);
{
    VALUE bh = VM_BLOCK_HANDLER_NONE;
    val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_method);

    /* Qundef means there is no value to push from here: reload the VM
       registers and dispatch the next instruction (presumably inside a
       frame set up by vm_sendish -- confirm there). */
    if (val == Qundef) {
        RESTORE_REGS();
        NEXT_INSN();
    }
}
/* Optimized freeze call on the string operand str.
   vm_opt_str_freeze is tried first with BOP_FREEZE and idFreeze. When it
   answers Qundef, a string obtained from rb_str_resurrect(str) is pushed
   and the instruction falls back to CALL_SIMPLE_METHOD with cd
   (presumably an ordinary method call on the pushed string). */
DEFINE_INSN
opt_str_freeze
(VALUE str, CALL_DATA cd)
()
(VALUE val)
{
    val = vm_opt_str_freeze(str, BOP_FREEZE, idFreeze);

    if (val == Qundef) {
        /* The fast path produced nothing: push the fallback receiver
           before dispatching. */
        PUSH(rb_str_resurrect(str));
        CALL_SIMPLE_METHOD();
    }
}
/* optimized nil?
   Pops recv and asks vm_opt_nil_p for the answer; when that returns
   Qundef the instruction falls back to CALL_SIMPLE_METHOD (presumably an
   ordinary method call described by cd). */
DEFINE_INSN
opt_nil_p
(CALL_DATA cd)
(VALUE recv)
(VALUE val)
{
val = vm_opt_nil_p(GET_ISEQ(), cd, recv);
/* Qundef: the fast path produced no value, take the fallback. */
if (val == Qundef) {
CALL_SIMPLE_METHOD();
}
}
/* Optimized unary minus call on the string operand str.
   vm_opt_str_freeze is tried first with BOP_UMINUS and idUMinus. When it
   answers Qundef, a string obtained from rb_str_resurrect(str) is pushed
   and the instruction falls back to CALL_SIMPLE_METHOD with cd
   (presumably an ordinary method call on the pushed string). */
DEFINE_INSN
opt_str_uminus
(VALUE str, CALL_DATA cd)
()
(VALUE val)
{
    val = vm_opt_str_freeze(str, BOP_UMINUS, idUMinus);

    if (val == Qundef) {
        /* The fast path produced nothing: push the fallback receiver
           before dispatching. */
        PUSH(rb_str_resurrect(str));
        CALL_SIMPLE_METHOD();
    }
}
/* Replace the top num stack values with a single result computed by
   vm_opt_newarray_max (net stack change: 1 - num). Presumably the
   optimized form of taking the max of an array literal -- confirm in
   vm_opt_newarray_max. */
DEFINE_INSN
opt_newarray_max
(rb_num_t num)
(...)
(VALUE val)
/* This instruction typically has no funcalls. But by nature it compares
 * the array contents with each other, and that part could call methods
 * when necessary. There is no way to detect such method calls
 * beforehand, so we have no choice but to mark it as not being leaf. */
// attr bool leaf = false; /* has rb_funcall() */
// attr rb_snum_t sp_inc = 1 - (rb_snum_t)num;
{
/* STACK_ADDR_FROM_TOP(num) is passed as the start of the num values. */
val = vm_opt_newarray_max(ec, num, STACK_ADDR_FROM_TOP(num));
}
/* Replace the top num stack values with a single result computed by
   vm_opt_newarray_min (net stack change: 1 - num). Presumably the
   optimized form of taking the min of an array literal -- confirm in
   vm_opt_newarray_min. */
DEFINE_INSN
opt_newarray_min
(rb_num_t num)
(...)
(VALUE val)
/* Same discussion as opt_newarray_max. */
// attr bool leaf = false; /* has rb_funcall() */
// attr rb_snum_t sp_inc = 1 - (rb_snum_t)num;
{
/* STACK_ADDR_FROM_TOP(num) is passed as the start of the num values. */
val = vm_opt_newarray_min(ec, num, STACK_ADDR_FROM_TOP(num));
}
/* super(args) # args.size => num
 *
 * Operands: `cd` is the call data for this call site (its `ci` drives the
 * stack-effect computation below); `blockiseq` is the iseq of the block
 * passed to super, if any.  The arguments are taken from the stack (`...`)
 * and the call's result is pushed as `val`. */
DEFINE_INSN
invokesuper
(CALL_DATA cd, ISEQ blockiseq)
(...)
(VALUE val)
// attr rb_snum_t sp_inc = sp_inc_of_sendish(cd->ci);
// attr rb_snum_t comptime_sp_inc = sp_inc_of_sendish(ci);
{
    /* Resolve the block handler for this call from the call info and the
     * block iseq, then dispatch using the super-method search strategy. */
    VALUE bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, true);
    val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_super);

    /* Qundef means vm_sendish() produced no value to push here.
     * NOTE(review): presumably a new frame was pushed for the callee and
     * execution must continue there -- confirm in vm_sendish().  Reload the
     * VM registers and dispatch the next instruction instead of falling
     * through to the normal "push val" epilogue. */
    if (val == Qundef) {
        RESTORE_REGS();
        NEXT_INSN();
    }
}
/* yield(args)
 *
 * Operand `cd` is the call data for this yield site.  The arguments are
 * taken from the stack (`...`) and the block's result is pushed as `val`. */
DEFINE_INSN
invokeblock
(CALL_DATA cd)
(...)
(VALUE val)
// attr bool handles_sp = true;
// attr rb_snum_t sp_inc = sp_inc_of_invokeblock(cd->ci);
// attr rb_snum_t comptime_sp_inc = sp_inc_of_invokeblock(ci);
{
    /* A yield never passes a block of its own, hence no block handler. */
    VALUE bh = VM_BLOCK_HANDLER_NONE;
    val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_invokeblock);

    /* Qundef means vm_sendish() produced no value to push here.
     * NOTE(review): presumably a new frame was pushed for the block and
     * execution must continue there -- confirm in vm_sendish().  Reload the
     * VM registers and dispatch the next instruction instead of falling
     * through to the normal "push val" epilogue. */
    if (val == Qundef) {
        RESTORE_REGS();
        NEXT_INSN();
    }
}
/* return from this scope.
 *
 * Pops the current frame.  `val` is declared as both popped and returned:
 * it is the value being returned from the scope that is left. */
DEFINE_INSN
leave
()
(VALUE val)
(VALUE val)
/* This is super surprising but when leaving from a frame, we check
 * for interrupts.  If any, that should be executed on top of the
 * current execution context.  This is a method call. */
// attr bool leaf = false; /* has rb_threadptr_execute_interrupts() */
// attr bool handles_sp = true;
{
    /* Checked builds only: at this point sp must equal the frame's base
     * pointer; anything else is reported as a stack consistency error. */
    if (OPT_CHECKED_RUN) {
        const VALUE *const bp = vm_base_ptr(GET_CFP());
        if (GET_SP() != bp) {
            vm_stack_consistency_error(ec, GET_CFP(), bp);
        }
    }

    if (vm_pop_frame(ec, GET_CFP(), GET_EP())) {
        /* vm_pop_frame() returned true: return from the enclosing C
         * function with `val` instead of continuing with the caller's
         * instructions.  NOTE(review): presumably this is the case where
         * the popped frame carried the "finish" flag -- confirm in
         * vm_pop_frame(). */
#if OPT_CALL_THREADED_CODE
        /* In this build the value is handed back through the thread
         * structure and 0 is returned instead. */
        rb_ec_thread_ptr(ec)->retval = val;
        return 0;
#else
        return val;
#endif
    }
    else {
        /* Ordinary return: reload the VM registers after the frame pop
         * and let the normal epilogue push `val`. */
        RESTORE_REGS();
    }
}
/**********************************************************/
/* deal with control flow 3: exception */
/**********************************************************/
/* longjump
 *
 * Pops `throwobj`, lets vm_throw() combine it with the `throw_state`
 * operand into the value to be thrown, and hands that value to
 * THROW_EXCEPTION(). */
DEFINE_INSN
throw
(rb_num_t throw_state)
(VALUE throwobj)
(VALUE val)
/* Same discussion as leave: interrupts may be executed here, which is a
 * method call, so this instruction is not a leaf. */
// attr bool leaf = false; /* has rb_threadptr_execute_interrupts() */
{
val = vm_throw(ec, GET_CFP(), throw_state, throwobj);
THROW_EXCEPTION(val);
/* unreachable */
}
/**********************************************************/
/* deal with control flow 4: local jump */
/**********************************************************/
/* set PC to (PC + dst).
 *
 * Unconditional jump.  Takes no values from the stack and pushes none. */
DEFINE_INSN
jump
(OFFSET dst)
()
()
/* Same discussion as leave: the interrupt check below may execute
 * interrupts, so leafness is delegated to leafness_of_check_ints. */
// attr bool leaf = leafness_of_check_ints; /* has rb_threadptr_execute_interrupts() */
{
/* Check for pending interrupts before the jump is taken. */
RUBY_VM_CHECK_INTS(ec);
JUMP(dst);
}
/* if val is not false or nil, set PC to (PC + dst).
 *
 * Pops `val` and pushes nothing.  When `val` is false or nil the body does
 * nothing. */
DEFINE_INSN
branchif
(OFFSET dst)
(VALUE val)
()
/* Same discussion as jump. */
// attr bool leaf = leafness_of_check_ints; /* has rb_threadptr_execute_interrupts() */
{
if (RTEST(val)) {
/* Interrupts are checked only on the taken path, right before jumping. */
RUBY_VM_CHECK_INTS(ec);
JUMP(dst);
}
}
/* if val is false or nil, set PC to (PC + dst).
 *
 * Pops `val` and pushes nothing.  When `val` is truthy the body does
 * nothing. */
DEFINE_INSN
branchunless
(OFFSET dst)
(VALUE val)
()
/* Same discussion as jump. */
// attr bool leaf = leafness_of_check_ints; /* has rb_threadptr_execute_interrupts() */
{
if (!RTEST(val)) {
/* Interrupts are checked only on the taken path, right before jumping. */
RUBY_VM_CHECK_INTS(ec);
JUMP(dst);
}
}
/* if val is nil, set PC to (PC + dst).
 *
 * Pops `val` and pushes nothing.  Unlike branchunless, only nil takes the
 * branch: the test is NIL_P(), not !RTEST(). */
DEFINE_INSN
branchnil
(OFFSET dst)
(VALUE val)
()
/* Same discussion as jump. */
// attr bool leaf = leafness_of_check_ints; /* has rb_threadptr_execute_interrupts() */
{
if (NIL_P(val)) {
/* Interrupts are checked only on the taken path, right before jumping. */
RUBY_VM_CHECK_INTS(ec);
JUMP(dst);
}
}
/**********************************************************/
/* optimizations */
/**********************************************************/
/* push inline-cached value and go to dst if it is valid
 *
 * Operands: `dst` is the jump offset used on a cache hit; `ic` is the
 * inline cache whose `entry` is consulted.  Always pushes one value:
 * the cached value on a hit, nil on a miss. */
DEFINE_INSN
opt_getinlinecache
(OFFSET dst, IC ic)
()
(VALUE val)
{
struct iseq_inline_constant_cache_entry *ice = ic->entry;
/* A hit needs both an existing entry and vm_ic_hit_p() accepting that
 * entry for the current environment pointer. */
if (ice && vm_ic_hit_p(ice, GET_EP())) {
val = ice->value;
JUMP(dst);
}
else {
/* Miss: push nil and do not jump. */
val = Qnil;
}
}
/* set inline cache
 *
 * Records `val` in the inline cache `ic` through vm_ic_update(), passing
 * the current iseq and environment pointer along.  `val` is declared as
 * both popped and returned, so it is pushed back after the update. */
DEFINE_INSN
opt_setinlinecache
(IC ic)
(VALUE val)
(VALUE val)
/* NOTE(review): the reason for non-leafness is not stated here; presumably
 * something inside vm_ic_update() -- confirm before changing it. */
// attr bool leaf = false;
{
vm_ic_update(GET_ISEQ(), ic, val, GET_EP());
}
/* run iseq only once
 *
 * Delegates to vm_once_dispatch() with the iseq to run and the `ise`
 * operand, and pushes whatever it returns.  NOTE(review): presumably `ise`
 * is where the "already run" state and the first result are kept -- confirm
 * in vm_once_dispatch(). */
DEFINE_INSN
once
(ISEQ iseq, ISE ise)
()
(VALUE val)
{
val = vm_once_dispatch(ec, iseq, ise);
}
/* case dispatcher, jump by table if possible
 *
 * Operands: `hash` is the dispatch table and `else_offset` the offset for
 * the else branch; both are passed to vm_case_dispatch() together with the
 * popped `key`.  Nothing is pushed, so the stack shrinks by one (sp_inc). */
DEFINE_INSN
opt_case_dispatch
(CDHASH hash, OFFSET else_offset)
(..., VALUE key)
()
// attr rb_snum_t sp_inc = -1;
{
    OFFSET dst = vm_case_dispatch(hash, else_offset, key);

    /* A zero offset means "no table jump": the body does nothing more.
     * NOTE(review): presumably the instructions that follow then perform
     * the ordinary sequential comparisons -- confirm against the compiler. */
    if (dst) {
        JUMP(dst);
    }
}
/** simple functions */
/* optimized X+Y.
 *
 * Pops `recv` and `obj` and pushes the result of vm_opt_plus().  `cd` is
 * the call data of the call site; this body does not touch it directly. */
DEFINE_INSN
opt_plus
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_plus(recv, obj);

    /* Qundef means the helper produced no result for these operands
     * (presumably no fast path applies -- confirm in vm_opt_plus()), so
     * hand over to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X-Y.
 *
 * Pops `recv` and `obj` and pushes the result of vm_opt_minus().  `cd` is
 * the call data of the call site; this body does not touch it directly. */
DEFINE_INSN
opt_minus
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_minus(recv, obj);

    /* Qundef means the helper produced no result for these operands
     * (presumably no fast path applies -- confirm in vm_opt_minus()), so
     * hand over to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X*Y.
 *
 * Pops `recv` and `obj` and pushes the result of vm_opt_mult().  `cd` is
 * the call data of the call site; this body does not touch it directly. */
DEFINE_INSN
opt_mult
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_mult(recv, obj);

    /* Qundef means the helper produced no result for these operands
     * (presumably no fast path applies -- confirm in vm_opt_mult()), so
     * hand over to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X/Y.
 *
 * Operand: cd (CALL_DATA). Not referenced directly in the body; presumably
 *          consumed by CALL_SIMPLE_METHOD() -- confirm against its definition.
 * Pops:    recv, obj.
 * Pushes:  val.
 */
DEFINE_INSN
opt_div
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
/* In case of division by zero, it raises. Thus
 * ZeroDivisionError#initialize is called. */
// attr bool leaf = false;
{
    val = vm_opt_div(recv, obj);

    /* Qundef from the helper (presumably: no specialized path applies to
     * these operands) routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X%Y.
 *
 * Operand: cd (CALL_DATA). Not referenced directly in the body; presumably
 *          consumed by CALL_SIMPLE_METHOD() -- confirm against its definition.
 * Pops:    recv, obj.
 * Pushes:  val.
 */
DEFINE_INSN
opt_mod
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
/* Same discussion as opt_div. */
// attr bool leaf = false;
{
    val = vm_opt_mod(recv, obj);

    /* Qundef from the helper (presumably: no specialized path applies to
     * these operands) routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X==Y.
 *
 * Operand: cd (CALL_DATA), handed to opt_equality() together with the
 *          current iseq.
 * Pops:    recv, obj.
 * Pushes:  val.
 */
DEFINE_INSN
opt_eq
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = opt_equality(GET_ISEQ(), recv, obj, cd);

    /* Qundef from the helper (presumably: no specialized comparison applies)
     * routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X!=Y.
 *
 * Operands: cd_eq, cd (both CALL_DATA). Note that vm_opt_neq() receives them
 *           in the order (cd, cd_eq). cd_eq is presumably the call data for
 *           the matching == call -- confirm against vm_opt_neq().
 * Pops:     recv, obj.
 * Pushes:   val.
 */
DEFINE_INSN
opt_neq
(CALL_DATA cd_eq, CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_neq(GET_ISEQ(), cd, cd_eq, recv, obj);

    /* Qundef from the helper (presumably: no specialized comparison applies)
     * routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X<Y.
 *
 * Operand: cd (CALL_DATA). Not referenced directly in the body; presumably
 *          consumed by CALL_SIMPLE_METHOD() -- confirm against its definition.
 * Pops:    recv, obj.
 * Pushes:  val.
 */
DEFINE_INSN
opt_lt
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_lt(recv, obj);

    /* Qundef from the helper (presumably: no specialized path applies to
     * these operands) routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X<=Y.
 *
 * Operand: cd (CALL_DATA). Not referenced directly in the body; presumably
 *          consumed by CALL_SIMPLE_METHOD() -- confirm against its definition.
 * Pops:    recv, obj.
 * Pushes:  val.
 */
DEFINE_INSN
opt_le
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_le(recv, obj);

    /* Qundef from the helper (presumably: no specialized path applies to
     * these operands) routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X>Y.
 *
 * Operand: cd (CALL_DATA). Not referenced directly in the body; presumably
 *          consumed by CALL_SIMPLE_METHOD() -- confirm against its definition.
 * Pops:    recv, obj.
 * Pushes:  val.
 */
DEFINE_INSN
opt_gt
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_gt(recv, obj);

    /* Qundef from the helper (presumably: no specialized path applies to
     * these operands) routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X>=Y.
 *
 * Operand: cd (CALL_DATA). Not referenced directly in the body; presumably
 *          consumed by CALL_SIMPLE_METHOD() -- confirm against its definition.
 * Pops:    recv, obj.
 * Pushes:  val.
 */
DEFINE_INSN
opt_ge
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
    val = vm_opt_ge(recv, obj);

    /* Qundef from the helper (presumably: no specialized path applies to
     * these operands) routes the instruction to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized X<<Y. */
DEFINE_INSN
opt_ltlt
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
/* This instruction can append an integer, as a codepoint, into a
* string. Then what happens if that codepoint does not exist in the
* string's encoding? Of course an exception. That's not a leaf. */
// attr bool leaf = false; /* has "invalid codepoint" exception */
{
split insns.def into functions Contemporary C compilers are good at function inlining. They fold multiple functions into one. However they are not yet smart enough to unfold a function into several ones. So generally speaking, it is wiser for a C programmer to manually split C functions whenever possible. That should make rooms for compilers to optimize at will. Before this changeset insns.def was converted into single HUGE function called vm_exec_core(). By moving each instruction's core into individual functions, generated C source code is reduced from 3,428 lines to 2,847 lines. Looking at the generated assembly however, it seems my compiler (gcc 6.2) is extraordinary smart so that it inlines almost all functions I introduced in this changeset back into that vm_exec_core. On my machine compiled machine binary of the function does not shrink very much in size (28,432 bytes to 26,816 bytes, according to nm(1)). I believe this change is zero-cost. Several benchmarks I exercised showed no significant difference beyond error mergin. For instance 3 repeated runs of optcarrot benchmark on my machine resulted in: before this: 28.330329285707490, 27.513378371065920, 29.40420215754537 after this: 27.107195867280414, 25.549324021385907, 30.31581919050884 in fps (greater==faster). ---- * internal.h (rb_obj_not_equal): used from vm_insnhelper.c * insns.def: move vast majority of lines into vm_insnhelper.c * vm_insnhelper.c: moved here. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@58390 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2017-04-18 06:58:49 -04:00
val = vm_opt_ltlt(recv, obj);
if (val == Qundef) {
move ADD_PC around (take 2) Now that we can say for sure if an instruction calls a method or not internally, it is now possible to reroute the bugs that forced us to revert the "move PC around" optimization. First try: r62051 Reverted: r63763 See also: r63999 ---- trunk: ruby 2.6.0dev (2018-09-13 trunk 64736) [x86_64-darwin15] ours: ruby 2.6.0dev (2018-09-13 trunk 64736) [x86_64-darwin15] last_commit=move ADD_PC around (take 2) Calculating ------------------------------------- trunk ours so_ackermann 1.884 2.278 i/s - 1.000 times in 0.530926s 0.438935s so_array 1.178 1.157 i/s - 1.000 times in 0.848786s 0.864467s so_binary_trees 0.176 0.177 i/s - 1.000 times in 5.683895s 5.657707s so_concatenate 0.220 0.221 i/s - 1.000 times in 4.546896s 4.518949s so_count_words 6.729 6.470 i/s - 1.000 times in 0.148602s 0.154561s so_exception 3.324 3.688 i/s - 1.000 times in 0.300872s 0.271147s so_fannkuch 0.546 0.968 i/s - 1.000 times in 1.831328s 1.033376s so_fasta 0.541 0.547 i/s - 1.000 times in 1.849923s 1.827091s so_k_nucleotide 0.800 0.777 i/s - 1.000 times in 1.250635s 1.286295s so_lists 2.101 1.848 i/s - 1.000 times in 0.475954s 0.541095s so_mandelbrot 0.435 0.408 i/s - 1.000 times in 2.299328s 2.450535s so_matrix 1.946 1.912 i/s - 1.000 times in 0.513872s 0.523076s so_meteor_contest 0.311 0.317 i/s - 1.000 times in 3.219297s 3.152052s so_nbody 0.746 0.703 i/s - 1.000 times in 1.339815s 1.423441s so_nested_loop 0.899 0.901 i/s - 1.000 times in 1.111767s 1.109555s so_nsieve 0.559 0.579 i/s - 1.000 times in 1.787763s 1.726552s so_nsieve_bits 0.435 0.428 i/s - 1.000 times in 2.296282s 2.333852s so_object 1.368 1.442 i/s - 1.000 times in 0.731237s 0.693684s so_partial_sums 0.616 0.546 i/s - 1.000 times in 1.623592s 1.833097s so_pidigits 0.831 0.832 i/s - 1.000 times in 1.203117s 1.202334s so_random 2.934 2.724 i/s - 1.000 times in 0.340791s 0.367150s so_reverse_complement 0.583 0.866 i/s - 1.000 times in 1.714144s 1.154615s so_sieve 1.829 2.081 i/s - 1.000 times in 0.546607s 
0.480562s so_spectralnorm 0.524 0.558 i/s - 1.000 times in 1.908716s 1.792382s git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@64737 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2018-09-14 03:44:44 -04:00
CALL_SIMPLE_METHOD();
}
}
/* optimized X&Y.
 *
 * Takes the call data `cd` as its operand, pops `recv` and `obj`, and
 * pushes `val`. */
DEFINE_INSN
opt_and
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
/* Try the dedicated helper first. */
val = vm_opt_and(recv, obj);
/* Qundef means the helper produced no result (presumably because no
 * fast path applies to these operands -- confirm in vm_opt_and()).
 * Hand the call over to CALL_SIMPLE_METHOD(). */
if (val == Qundef) {
CALL_SIMPLE_METHOD();
}
}
/* optimized X|Y.
 *
 * Takes the call data `cd` as its operand, pops `recv` and `obj`, and
 * pushes `val`. */
DEFINE_INSN
opt_or
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
{
/* Try the dedicated helper first. */
val = vm_opt_or(recv, obj);
/* Qundef means the helper produced no result (presumably because no
 * fast path applies to these operands -- confirm in vm_opt_or()).
 * Hand the call over to CALL_SIMPLE_METHOD(). */
if (val == Qundef) {
CALL_SIMPLE_METHOD();
}
}
/* []
 *
 * Optimized `recv[obj]`.  Takes the call data `cd` as its operand,
 * pops `recv` and `obj`, and pushes `val`.
 */
DEFINE_INSN
opt_aref
(CALL_DATA cd)
(VALUE recv, VALUE obj)
(VALUE val)
/* This is complicated.  In case of hash, vm_opt_aref() resorts to
 * rb_hash_aref().  If `recv` has no entry for `obj`, that function
 * then yields to default_proc.  This is a method call.  So opt_aref
 * is (surprisingly) not leaf. */
// attr bool leaf = false; /* has rb_funcall() */ /* calls #yield */
{
    val = vm_opt_aref(recv, obj);

    /* Qundef means the helper produced no result (presumably because
     * no fast path applies to these operands -- confirm in
     * vm_opt_aref()).  Hand the call over to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* recv[obj] = set
 *
 * Takes the call data `cd` as its operand, pops `recv`, `obj` and
 * `set`, and pushes `val`.
 */
DEFINE_INSN
opt_aset
(CALL_DATA cd)
(VALUE recv, VALUE obj, VALUE set)
(VALUE val)
/* This is a different story from opt_aref.  vm_opt_aset() may resort
 * to rb_hash_aset(), which should call #hash for `obj`. */
// attr bool leaf = false; /* has rb_funcall() */ /* calls #hash */
{
    val = vm_opt_aset(recv, obj, set);

    /* Qundef means the helper produced no result (presumably because
     * no fast path applies to these operands -- confirm in
     * vm_opt_aset()).  Hand the call over to CALL_SIMPLE_METHOD(). */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* recv[str] = set
 *
 * Takes `key` and the call data `cd` as operands, pops `recv` and
 * `val`, and pushes `val`.  Note that the popped value and the
 * pushed value share the name `val`.
 */
DEFINE_INSN
opt_aset_with
(VALUE key, CALL_DATA cd)
(VALUE recv, VALUE val)
(VALUE val)
/* Same discussion as opt_aset. */
// attr bool leaf = false; /* has rb_funcall() */ /* calls #hash */
{
    /* Keep the helper's result in a temporary so that the incoming
     * `val` is still intact when the fallback below needs it. */
    VALUE tmp = vm_opt_aset_with(recv, key, val);

    if (tmp != Qundef) {
        val = tmp;
    }
    else {
        /* The helper produced no result.  Before handing over to
         * CALL_SIMPLE_METHOD(), non-MJIT_HEADER builds rewrite the
         * stack: the slot at TOPN(0) receives rb_str_resurrect(key)
         * and `val` is pushed on top of it.  Presumably this lays out
         * the key and value as arguments of the generic call --
         * confirm against CALL_SIMPLE_METHOD(). */
#ifndef MJIT_HEADER
        TOPN(0) = rb_str_resurrect(key);
        PUSH(val);
#endif
        CALL_SIMPLE_METHOD();
    }
}
/* recv[str]
 *
 * Takes `key` and the call data `cd` as operands, pops `recv`, and
 * pushes `val`.
 */
DEFINE_INSN
opt_aref_with
(VALUE key, CALL_DATA cd)
(VALUE recv)
(VALUE val)
/* Same discussion as opt_aref. */
// attr bool leaf = false; /* has rb_funcall() */ /* calls #yield */
{
    val = vm_opt_aref_with(recv, key);

    if (val == Qundef) {
        /* The helper produced no result.  Before handing over to
         * CALL_SIMPLE_METHOD(), non-MJIT_HEADER builds push
         * rb_str_resurrect(key) onto the stack.  Presumably this
         * supplies the key as the argument of the generic call --
         * confirm against CALL_SIMPLE_METHOD(). */
#ifndef MJIT_HEADER
        PUSH(rb_str_resurrect(key));
#endif
        CALL_SIMPLE_METHOD();
    }
}
/* optimized length
 *
 * Operand: cd (CALL_DATA).  Pops: recv.  Pushes: val.
 * Asks vm_opt_length() for the result using BOP_LENGTH and only goes
 * through CALL_SIMPLE_METHOD() when that helper answers Qundef.
 */
DEFINE_INSN
opt_length
(CALL_DATA cd)
(VALUE recv)
(VALUE val)
{
    val = vm_opt_length(recv, BOP_LENGTH);

    /* Qundef presumably means the fast path declined -- confirm in the
     * helper's definition. */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized size
 *
 * Operand: cd (CALL_DATA).  Pops: recv.  Pushes: val.
 * Same shape as opt_length, but vm_opt_length() is called with BOP_SIZE.
 * CALL_SIMPLE_METHOD() is used only when the helper answers Qundef.
 */
DEFINE_INSN
opt_size
(CALL_DATA cd)
(VALUE recv)
(VALUE val)
{
    val = vm_opt_length(recv, BOP_SIZE);

    /* Qundef presumably means the fast path declined -- confirm in the
     * helper's definition. */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized empty?
 *
 * Operand: cd (CALL_DATA).  Pops: recv.  Pushes: val.
 * Asks vm_opt_empty_p() for the result and only goes through
 * CALL_SIMPLE_METHOD() when that helper answers Qundef.
 */
DEFINE_INSN
opt_empty_p
(CALL_DATA cd)
(VALUE recv)
(VALUE val)
{
    val = vm_opt_empty_p(recv);

    /* Qundef presumably means the fast path declined -- confirm in the
     * helper's definition. */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized succ
 *
 * Operand: cd (CALL_DATA).  Pops: recv.  Pushes: val.
 * Asks vm_opt_succ() for the result and only goes through
 * CALL_SIMPLE_METHOD() when that helper answers Qundef.
 */
DEFINE_INSN
opt_succ
(CALL_DATA cd)
(VALUE recv)
(VALUE val)
{
    val = vm_opt_succ(recv);

    /* Qundef presumably means the fast path declined -- confirm in the
     * helper's definition. */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized not
 *
 * Operand: cd (CALL_DATA).  Pops: recv.  Pushes: val.
 * Unlike its neighbours, the helper here also receives the current
 * iseq (GET_ISEQ()) and cd.  CALL_SIMPLE_METHOD() is used only when
 * vm_opt_not() answers Qundef.
 */
DEFINE_INSN
opt_not
(CALL_DATA cd)
(VALUE recv)
(VALUE val)
{
    val = vm_opt_not(GET_ISEQ(), cd, recv);

    /* Qundef presumably means the fast path declined -- confirm in the
     * helper's definition. */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* optimized regexp match 2
 *
 * Operand: cd (CALL_DATA).  Pops: obj2, obj1.  Pushes: val.
 * Declared non-leaf through the attribute pragma below.
 * Asks vm_opt_regexpmatch2() for the result and only goes through
 * CALL_SIMPLE_METHOD() when that helper answers Qundef.
 */
DEFINE_INSN
opt_regexpmatch2
(CALL_DATA cd)
(VALUE obj2, VALUE obj1)
(VALUE val)
// attr bool leaf = false; /* match_at() has rb_thread_check_ints() */
{
    val = vm_opt_regexpmatch2(obj2, obj1);

    /* Qundef presumably means the fast path declined -- confirm in the
     * helper's definition. */
    if (val == Qundef) {
        CALL_SIMPLE_METHOD();
    }
}
/* call native compiled method */
DEFINE_INSN_IF(SUPPORT_CALL_C_FUNCTION)
opt_call_c_function
(rb_insn_func_t funcptr)
()
()
// attr bool leaf = false; /* anything can happen inside */
// attr bool handles_sp = true;
{
reg_cfp = (funcptr)(ec, reg_cfp);
if (reg_cfp == 0) {
VALUE err = ec->errinfo;
ec->errinfo = Qnil;
THROW_EXCEPTION(err);
}
mjit_compile.c: use local variables for stack if catch_except_p is FALSE. If catch_except_p is TRUE, stack values should be on VM's stack when exception is thrown and the JIT-ed frame is re-executed by VM's exception handler. If it's FALSE, the JIT-ed frame won't be re-executed and don't need to keep values on VM's stack. Using local variables allows us to reduce cfp->sp motion. Moving cfp->sp is needed only for insns whose handles_frame? is false. So it improves performance. _mjit_compile_insn.erb: Prepare `stack_size` variable for GET_SP, STACK_ADDR_FROM_TOP, TOPN macros. Share pc and sp motion partial view. Use cancel handler created in mjit_compile.c. _mjit_compile_send.erb: ditto. Also, when iseq->body->catch_except_p is TRUE, this stops to call mjit_exec directly. I described the reason in vm_insnhelper.h's comment for EXEC_EC_CFP. _mjit_compile_pc_and_sp.erb: Shared logic for moving sp and pc. As you can see from thsi file, when status->local_stack_p is TRUE and insn.handles_frame? is false, moving sp is skipped. But if insn.handles_frame? is true, values should be rolled back to VM's stack. common.mk: add dependency for the file _mjit_compile_insn_body.erb: Set sp value before canceling JIT on DISPATCH_ORIGINAL_INSN. Replace GET_SP, STACK_ADDR_FROM_TOP, TOPN macros for the case ocal_stack_p is TRUE and insn.handles_frame? is false. In that case, values are not available on VM's stack and those macros should be replaced. mjit_compile.inc.erb: updated comments of macros which are supported by JIT compiler. All references to `cfp->sp` should be replaced and thus INC_SP, SET_SV, PUSH are no longer supported for now, because they are not used now. vm_exec.h: moved EXEC_EC_CFP definition to vm_insnhelper.h because it's tighly coupled to CALL_METHOD. vm_insnhelper.h: Have revised EXEC_EC_CFP definition moved from vm_exec.h. Now it triggers mjit_exec for VM, and has the guard for catch_except_p on JIT-ed code. See comments for details. 
CALL_METHOD delegates triggering mjit_exec to EXEC_EC_CFP. insns.def: Stopped using EXEC_EC_CFP for the case we don't want to trigger mjit_exec. Those insns (defineclass, opt_call_c_function) are not supported by JIT and it's safe to use RESTORE_REGS(), NEXT_INSN(). expandarray is changed to pass GET_SP() to replace the macro in _mjit_compile_insn_body.erb. vm_insnhelper.c: change to take sp for the above reason. [close https://github.com/ruby/ruby/pull/1828] This patch resurrects the performance which was attached in [Feature #14235]. * Benchmark Optcarrot (with configuration for benchmark_driver.gem) https://github.com/benchmark-driver/optcarrot $ benchmark-driver benchmark.yml --verbose 1 --rbenv 'before;before+JIT::before,--jit;after;after+JIT::after,--jit' --repeat-count 10 before: ruby 2.6.0dev (2018-03-04 trunk 62652) [x86_64-linux] before+JIT: ruby 2.6.0dev (2018-03-04 trunk 62652) +JIT [x86_64-linux] after: ruby 2.6.0dev (2018-03-04 local-variable.. 62652) [x86_64-linux] last_commit=mjit_compile.c: use local variables for stack after+JIT: ruby 2.6.0dev (2018-03-04 local-variable.. 62652) +JIT [x86_64-linux] last_commit=mjit_compile.c: use local variables for stack Calculating ------------------------------------- before before+JIT after after+JIT optcarrot 53.552 59.680 53.697 63.358 fps Comparison: optcarrot after+JIT: 63.4 fps before+JIT: 59.7 fps - 1.06x slower after: 53.7 fps - 1.18x slower before: 53.6 fps - 1.18x slower git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@62655 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2018-03-04 02:04:40 -05:00
RESTORE_REGS();
NEXT_INSN();
}
2019-11-07 02:58:00 -05:00
/* call specific function with args
 *
 * Operand: bf (RB_BUILTIN).  Pops: a variable number of values (...).
 * Pushes: val.  The stack pointer change is declared as 1 - bf->argc.
 * The arguments are handed to vm_invoke_builtin() as the address
 * bf->argc slots below the stack top.
 */
DEFINE_INSN
invokebuiltin
(RB_BUILTIN bf)
(...)
(VALUE val)
// attr bool leaf = false; /* anything can happen inside */
// attr rb_snum_t sp_inc = 1 - bf->argc;
{
    val = vm_invoke_builtin(ec, reg_cfp, bf, STACK_ADDR_FROM_TOP(bf->argc));
}
/* call specific function with args (same parameters)
 *
 * Operands: bf (RB_BUILTIN), index (rb_num_t).  Pops nothing.
 * Pushes: val.  index is narrowed to unsigned int before it is passed
 * to vm_invoke_builtin_delegate().
 */
DEFINE_INSN
opt_invokebuiltin_delegate
(RB_BUILTIN bf, rb_num_t index)
()
(VALUE val)
// attr bool leaf = false; /* anything can happen inside */
{
    val = vm_invoke_builtin_delegate(ec, reg_cfp, bf, (unsigned int)index);
}
/* call specific function with args (same parameters) and leave
 *
 * Operands: bf (RB_BUILTIN), index (rb_num_t).  Pops nothing.
 * Pushes: val.  Performs the same call as opt_invokebuiltin_delegate
 * and then pops the current frame right away.
 */
DEFINE_INSN
opt_invokebuiltin_delegate_leave
(RB_BUILTIN bf, rb_num_t index)
()
(VALUE val)
// attr bool leaf = false; /* anything can happen inside */
{
    val = vm_invoke_builtin_delegate(ec, reg_cfp, bf, (unsigned int)index);

    /* leave fastpath */
    /* TracePoint/return makes this insn fall back to
     * opt_invokebuiltin_delegate */
    if (vm_pop_frame(ec, GET_CFP(), GET_EP())) {
        /* A true result from vm_pop_frame() makes the instruction return
         * to its caller; with call-threaded code the value is parked in
         * the thread's retval slot and 0 is returned instead. */
#if OPT_CALL_THREADED_CODE
        rb_ec_thread_ptr(ec)->retval = val;
        return 0;
#else
        return val;
#endif
    }
    else {
        RESTORE_REGS();
    }
}
/* BLT */
2019-09-02 13:51:48 -04:00
DEFINE_INSN_IF(SUPPORT_JOKE)
bitblt
()
()
(VALUE ret)
{
ret = rb_str_new2("a bit of bacon, lettuce and tomato");
}
/* The Answer to Life, the Universe, and Everything */
2019-09-02 13:51:48 -04:00
DEFINE_INSN_IF(SUPPORT_JOKE)
answer
()
()
(VALUE ret)
{
ret = INT2FIX(42);
}