mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
Introduce table improvement by Vladimir Makarov <vmakarov@redhat.com>.
[Feature #12142] See header of st.c for improvement details. You can see the full code history here: <https://github.com/vnmakarov/ruby/tree/hash_tables_with_open_addressing> This improvement is discussed at <https://bugs.ruby-lang.org/issues/12142> with many people, especially with Yura Sokolov. * st.c: improve st_table. * include/ruby/st.h: ditto. * internal.h, numeric.c, hash.c (rb_dbl_long_hash): extract a function. * ext/-test-/st/foreach/foreach.c: catch up with this change. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@56650 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
parent
2bb96b87a7
commit
75775157ea
7 changed files with 1589 additions and 1207 deletions
2
NEWS
2
NEWS
|
@ -324,3 +324,5 @@ with all sufficient information, see the ChangeLog file or Redmine
|
|||
to `Math.max(x, y)`.
|
||||
|
||||
* Thread deadlock detection now shows their backtrace and dependency. [Feature #8214]
|
||||
|
||||
* Table (st.c) internal data structure is improved. [Feature #12142]
|
||||
|
|
|
@ -14,13 +14,13 @@ force_unpack_check(struct checker *c, st_data_t key, st_data_t val)
|
|||
if (c->nr == 0) {
|
||||
st_data_t i;
|
||||
|
||||
if (!c->tbl->entries_packed) rb_bug("should be packed\n");
|
||||
if (c->tbl->bins != NULL) rb_bug("should be packed\n");
|
||||
|
||||
/* force unpacking during iteration: */
|
||||
for (i = 1; i < expect_size; i++)
|
||||
st_add_direct(c->tbl, i, i);
|
||||
|
||||
if (c->tbl->entries_packed) rb_bug("should be unpacked\n");
|
||||
if (c->tbl->bins == NULL) rb_bug("should be unpacked\n");
|
||||
}
|
||||
|
||||
if (key != c->nr) {
|
||||
|
@ -84,7 +84,7 @@ unp_fec(VALUE self, VALUE test)
|
|||
|
||||
st_add_direct(tbl, 0, 0);
|
||||
|
||||
if (!tbl->entries_packed) rb_bug("should still be packed\n");
|
||||
if (tbl->bins != NULL) rb_bug("should still be packed\n");
|
||||
|
||||
st_foreach_check(tbl, unp_fec_i, (st_data_t)&c, -1);
|
||||
|
||||
|
@ -98,7 +98,7 @@ unp_fec(VALUE self, VALUE test)
|
|||
(VALUE)c.nr, (VALUE)expect_size);
|
||||
}
|
||||
|
||||
if (tbl->entries_packed) rb_bug("should be unpacked\n");
|
||||
if (tbl->bins == NULL) rb_bug("should be unpacked\n");
|
||||
|
||||
st_free_table(tbl);
|
||||
|
||||
|
@ -145,7 +145,7 @@ unp_fe(VALUE self, VALUE test)
|
|||
|
||||
st_add_direct(tbl, 0, 0);
|
||||
|
||||
if (!tbl->entries_packed) rb_bug("should still be packed\n");
|
||||
if (tbl->bins != NULL) rb_bug("should still be packed\n");
|
||||
|
||||
st_foreach(tbl, unp_fe_i, (st_data_t)&c);
|
||||
|
||||
|
@ -159,7 +159,7 @@ unp_fe(VALUE self, VALUE test)
|
|||
(VALUE)c.nr, (VALUE)expect_size);
|
||||
}
|
||||
|
||||
if (tbl->entries_packed) rb_bug("should be unpacked\n");
|
||||
if (tbl->bins == NULL) rb_bug("should be unpacked\n");
|
||||
|
||||
st_free_table(tbl);
|
||||
|
||||
|
|
110
hash.c
110
hash.c
|
@ -145,14 +145,36 @@ rb_hash(VALUE obj)
|
|||
|
||||
long rb_objid_hash(st_index_t index);
|
||||
|
||||
static st_index_t
|
||||
any_hash(VALUE a, st_index_t (*other_func)(VALUE))
|
||||
long
|
||||
rb_dbl_long_hash(double d)
|
||||
{
|
||||
/* normalize -0.0 to 0.0 */
|
||||
if (d == 0.0) d = 0.0;
|
||||
#if SIZEOF_INT == SIZEOF_VOIDP
|
||||
return rb_memhash(&d, sizeof(d));
|
||||
#else
|
||||
{
|
||||
union {double d; uint64_t i;} u;
|
||||
|
||||
u.d = d;
|
||||
return rb_objid_hash(u.i);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if SIZEOF_INT == SIZEOF_VOIDP
|
||||
static const st_index_t str_seed = 0xfa835867;
|
||||
#else
|
||||
static const st_index_t str_seed = 0xc42b5e2e6480b23bULL;
|
||||
#endif
|
||||
|
||||
static inline st_index_t
|
||||
any_hash_general(VALUE a, int strong_p, st_index_t (*other_func)(VALUE))
|
||||
{
|
||||
VALUE hval;
|
||||
st_index_t hnum;
|
||||
|
||||
if (SPECIAL_CONST_P(a)) {
|
||||
if (a == Qundef) return 0;
|
||||
if (STATIC_SYM_P(a)) {
|
||||
hnum = a >> (RUBY_SPECIAL_SHIFT + ID_SCOPE_SHIFT);
|
||||
goto out;
|
||||
|
@ -164,7 +186,9 @@ any_hash(VALUE a, st_index_t (*other_func)(VALUE))
|
|||
hnum = rb_objid_hash((st_index_t)a);
|
||||
}
|
||||
else if (BUILTIN_TYPE(a) == T_STRING) {
|
||||
hnum = rb_str_hash(a);
|
||||
hnum = (strong_p
|
||||
? rb_str_hash(a)
|
||||
: st_hash(RSTRING_PTR(a), RSTRING_LEN(a), str_seed));
|
||||
}
|
||||
else if (BUILTIN_TYPE(a) == T_SYMBOL) {
|
||||
hnum = RSYMBOL(a)->hashval;
|
||||
|
@ -175,8 +199,7 @@ any_hash(VALUE a, st_index_t (*other_func)(VALUE))
|
|||
}
|
||||
else if (BUILTIN_TYPE(a) == T_FLOAT) {
|
||||
flt:
|
||||
hval = rb_dbl_hash(rb_float_value(a));
|
||||
hnum = FIX2LONG(hval);
|
||||
hnum = rb_dbl_long_hash(rb_float_value(a));
|
||||
}
|
||||
else {
|
||||
hnum = other_func(a);
|
||||
|
@ -193,40 +216,62 @@ obj_any_hash(VALUE obj)
|
|||
return FIX2LONG(obj);
|
||||
}
|
||||
|
||||
static st_index_t
|
||||
rb_any_hash(VALUE a)
|
||||
{
|
||||
return any_hash(a, obj_any_hash);
|
||||
static inline st_index_t
|
||||
any_hash_weak(VALUE a, st_index_t (*other_func)(VALUE)) {
|
||||
return any_hash_general(a, FALSE, other_func);
|
||||
}
|
||||
|
||||
static st_index_t
|
||||
rb_num_hash_start(st_index_t n)
|
||||
rb_any_hash_weak(VALUE a) {
|
||||
return any_hash_weak(a, obj_any_hash);
|
||||
}
|
||||
|
||||
static inline st_index_t
|
||||
any_hash(VALUE a, st_index_t (*other_func)(VALUE)) {
|
||||
return any_hash_general(a, TRUE, other_func);
|
||||
}
|
||||
|
||||
static st_index_t
|
||||
rb_any_hash(VALUE a) {
|
||||
return any_hash(a, obj_any_hash);
|
||||
}
|
||||
|
||||
/* Here is a hash function for 64-bit keys. It is about 5 times faster
|
||||
(2 times faster when uint128 type is absent) on Haswell than
|
||||
tailored Spooky or City hash function can be. */
|
||||
|
||||
/* Here we have two primes with random bit generation. */
|
||||
static const uint64_t prime1 = 0x2e0bb864e9ea7df5ULL;
|
||||
static const uint64_t prime2 = 0xcdb32970830fcaa1ULL;
|
||||
|
||||
|
||||
static inline uint64_t
|
||||
mult_and_mix (uint64_t m1, uint64_t m2)
|
||||
{
|
||||
/*
|
||||
* This hash function is lightly-tuned for Ruby. Further tuning
|
||||
* should be possible. Notes:
|
||||
*
|
||||
* - (n >> 3) alone is great for heap objects and OK for fixnum,
|
||||
* however symbols perform poorly.
|
||||
* - (n >> (RUBY_SPECIAL_SHIFT+3)) was added to make symbols hash well,
|
||||
* n.b.: +3 to remove most ID scope, +1 worked well initially, too
|
||||
* n.b.: +1 (instead of 3) worked well initially, too
|
||||
* - (n << 16) was finally added to avoid losing bits for fixnums
|
||||
* - avoid expensive modulo instructions, it is currently only
|
||||
* shifts and bitmask operations.
|
||||
*/
|
||||
return (n >> (RUBY_SPECIAL_SHIFT + 3) ^ (n << 16)) ^ (n >> 3);
|
||||
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
|
||||
__uint128_t r = (__uint128_t) m1 * (__uint128_t) m2;
|
||||
return (uint64_t) (r >> 64) ^ (uint64_t) r;
|
||||
#else
|
||||
uint64_t hm1 = m1 >> 32, hm2 = m2 >> 32;
|
||||
uint64_t lm1 = m1, lm2 = m2;
|
||||
uint64_t v64_128 = hm1 * hm2;
|
||||
uint64_t v32_96 = hm1 * lm2 + lm1 * hm2;
|
||||
uint64_t v1_32 = lm1 * lm2;
|
||||
|
||||
return (v64_128 + (v32_96 >> 32)) ^ ((v32_96 << 32) + v1_32);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint64_t
|
||||
key64_hash (uint64_t key, uint32_t seed)
|
||||
{
|
||||
return mult_and_mix(key + seed, prime1);
|
||||
}
|
||||
|
||||
long
|
||||
rb_objid_hash(st_index_t index)
|
||||
{
|
||||
st_index_t hnum = rb_num_hash_start(index);
|
||||
|
||||
hnum = rb_hash_start(hnum);
|
||||
hnum = rb_hash_uint(hnum, (st_index_t)rb_any_hash);
|
||||
hnum = rb_hash_end(hnum);
|
||||
return hnum;
|
||||
return key64_hash(index, (uint32_t) prime2);
|
||||
}
|
||||
|
||||
static st_index_t
|
||||
|
@ -250,6 +295,7 @@ rb_hash_iter_lev(VALUE h)
|
|||
|
||||
static const struct st_hash_type objhash = {
|
||||
rb_any_cmp,
|
||||
rb_any_hash_weak,
|
||||
rb_any_hash,
|
||||
};
|
||||
|
||||
|
@ -269,7 +315,7 @@ rb_ident_hash(st_data_t n)
|
|||
}
|
||||
#endif
|
||||
|
||||
return (st_index_t)rb_num_hash_start((st_index_t)n);
|
||||
return (st_index_t) key64_hash((st_index_t)n, (uint32_t) prime2);
|
||||
}
|
||||
|
||||
static const struct st_hash_type identhash = {
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
/* This is a public domain general purpose hash table package written by Peter Moore @ UCB. */
|
||||
/* This is a public domain general purpose hash table package
|
||||
originally written by Peter Moore @ UCB.
|
||||
|
||||
/* @(#) st.h 5.1 89/12/14 */
|
||||
The hash table data structures were redesigned and the package was
|
||||
rewritten by Vladimir Makarov <vmakarov@redhat.com>. */
|
||||
|
||||
#ifndef RUBY_ST_H
|
||||
#define RUBY_ST_H 1
|
||||
|
@ -46,6 +48,10 @@ typedef unsigned LONG_LONG st_data_t;
|
|||
typedef struct st_table st_table;
|
||||
|
||||
typedef st_data_t st_index_t;
|
||||
|
||||
/* Maximal value of unsigned integer type st_index_t. */
|
||||
#define MAX_ST_INDEX_VAL (~(st_index_t) 0)
|
||||
|
||||
typedef int st_compare_func(st_data_t, st_data_t);
|
||||
typedef st_index_t st_hash_func(st_data_t);
|
||||
|
||||
|
@ -55,10 +61,13 @@ typedef char st_check_for_sizeof_st_index_t[SIZEOF_VOIDP == (int)sizeof(st_index
|
|||
struct st_hash_type {
|
||||
int (*compare)(ANYARGS /*st_data_t, st_data_t*/); /* st_compare_func* */
|
||||
st_index_t (*hash)(ANYARGS /*st_data_t*/); /* st_hash_func* */
|
||||
/* The following is an optional func for stronger hash. When we
|
||||
have many different keys with the same hash we can switch to
|
||||
use it to prevent a denial-of-service attack that exploits hash table
|
||||
collisions. */
|
||||
st_index_t (*strong_hash)(ANYARGS /*st_data_t*/);
|
||||
};
|
||||
|
||||
#define ST_INDEX_BITS (sizeof(st_index_t) * CHAR_BIT)
|
||||
|
||||
#if defined(HAVE_BUILTIN___BUILTIN_CHOOSE_EXPR) && defined(HAVE_BUILTIN___BUILTIN_TYPES_COMPATIBLE_P)
|
||||
# define ST_DATA_COMPATIBLE_P(type) \
|
||||
__builtin_choose_expr(__builtin_types_compatible_p(type, st_data_t), 1, 0)
|
||||
|
@ -66,33 +75,30 @@ struct st_hash_type {
|
|||
# define ST_DATA_COMPATIBLE_P(type) 0
|
||||
#endif
|
||||
|
||||
typedef struct st_table_entry st_table_entry;
|
||||
|
||||
struct st_table_entry; /* defined in st.c */
|
||||
|
||||
struct st_table {
|
||||
/* Cached features of the table -- see st.c for more details. */
|
||||
unsigned char entry_power, bin_power, size_ind;
|
||||
/* True when we are rebuilding the table. */
|
||||
unsigned char inside_rebuild_p;
|
||||
/* How many times the table was rebuilt. */
|
||||
unsigned int rebuilds_num;
|
||||
/* Currently used hash function. */
|
||||
st_index_t (*curr_hash)(ANYARGS /*st_data_t*/);
|
||||
const struct st_hash_type *type;
|
||||
st_index_t num_bins;
|
||||
unsigned int entries_packed : 1;
|
||||
#ifdef __GNUC__
|
||||
/*
|
||||
* C spec says,
|
||||
* A bit-field shall have a type that is a qualified or unqualified
|
||||
* version of _Bool, signed int, unsigned int, or some other
|
||||
* implementation-defined type. It is implementation-defined whether
|
||||
* atomic types are permitted.
|
||||
* In short, long and long long bit-field are implementation-defined
|
||||
* feature. Therefore we want to suppress a warning explicitly.
|
||||
*/
|
||||
__extension__
|
||||
#endif
|
||||
st_index_t num_entries : ST_INDEX_BITS - 1;
|
||||
union {
|
||||
struct {
|
||||
struct st_table_entry **bins;
|
||||
void *private_list_head[2];
|
||||
} big;
|
||||
struct {
|
||||
struct st_packed_entry *entries;
|
||||
st_index_t real_entries;
|
||||
} packed;
|
||||
} as;
|
||||
/* Number of entries currently in the table. */
|
||||
st_index_t num_entries;
|
||||
/* Array of bins used for access by keys. */
|
||||
st_index_t *bins;
|
||||
/* Start and bound index of entries in array entries.
|
||||
entries_start and entries_bound are in the interval
|
||||
[0,allocated_entries]. */
|
||||
st_index_t entries_start, entries_bound;
|
||||
/* Array of size 2^entry_power. */
|
||||
st_table_entry *entries;
|
||||
};
|
||||
|
||||
#define st_is_member(table,key) st_lookup((table),(key),(st_data_t *)0)
|
||||
|
@ -121,7 +127,6 @@ typedef int st_update_callback_func(st_data_t *key, st_data_t *value, st_data_t
|
|||
int st_update(st_table *table, st_data_t key, st_update_callback_func *func, st_data_t arg);
|
||||
int st_foreach(st_table *, int (*)(ANYARGS), st_data_t);
|
||||
int st_foreach_check(st_table *, int (*)(ANYARGS), st_data_t, st_data_t);
|
||||
int st_reverse_foreach(st_table *, int (*)(ANYARGS), st_data_t);
|
||||
st_index_t st_keys(st_table *table, st_data_t *keys, st_index_t size);
|
||||
st_index_t st_keys_check(st_table *table, st_data_t *keys, st_index_t size, st_data_t never);
|
||||
st_index_t st_values(st_table *table, st_data_t *values, st_index_t size);
|
||||
|
|
|
@ -1085,6 +1085,7 @@ VALUE rb_hash_has_key(VALUE hash, VALUE key);
|
|||
VALUE rb_hash_default_value(VALUE hash, VALUE key);
|
||||
VALUE rb_hash_set_default_proc(VALUE hash, VALUE proc);
|
||||
long rb_objid_hash(st_index_t index);
|
||||
long rb_dbl_long_hash(double d);
|
||||
st_table *rb_init_identtable(void);
|
||||
st_table *rb_init_identtable_with_size(st_index_t size);
|
||||
|
||||
|
|
|
@ -1421,12 +1421,7 @@ flo_hash(VALUE num)
|
|||
VALUE
|
||||
rb_dbl_hash(double d)
|
||||
{
|
||||
st_index_t hash;
|
||||
|
||||
/* normalize -0.0 to 0.0 */
|
||||
if (d == 0.0) d = 0.0;
|
||||
hash = rb_memhash(&d, sizeof(d));
|
||||
return ST2FIX(hash);
|
||||
return LONG2FIX(rb_dbl_long_hash (d));
|
||||
}
|
||||
|
||||
VALUE
|
||||
|
|
Loading…
Reference in a new issue