Bob Jenkins' hashing implementation in Ruby
From:
Mauricio Fern疣dez <batsman.geo@...>
Date:
2003-02-28 20:39:48 UTC
List:
ruby-core #889
Just hacked it.
I am using the Jenkins' implementation from Perl 5.8. Moreover, I got
rid of the modulus operations (and the prime numbers) by using bitwise
AND and ensuring that num_bins is 2^n - 1 (this implies that I've had
to add 1 in a number of places, but I believe it is more important to
make FIND_ENTRY as fast as possible).
I'm sorry I didn't put the HASH_JENKINS magic into configure.in; I have
little experience with autoconf. I'll try harder next time :)
Here's the result of an informal benchmark comparing 1.6.8/1.8.0 (CVS
head as of Feb. 28 ~17H) with Jenkins against the vanilla versions:
10000 30000 100000 300000 500000 1000000
ruby-1.6.8.jenkins 0.0910 0.3198 1.2080 10.0227 7.7709 18.0764
ruby-1.6.8.jenkins.orig 0.0907 0.3254 1.2281 10.2327 8.0457 18.3917
ruby.jenkins 0.0857 0.2942 1.3588 4.2214 8.0908 17.9354
ruby.jenkins.orig 0.0902 0.3133 1.4469 4.5248 8.5991 18.7564
(.orig means the version without Jenkins)
[BTW it seems 1.8's GC has been improved quite a bit as shown by the
great difference for 300000 iterations; great work! :) ]
The script used is a minimal modification of the one at The Great Computer
Language Shootout:
#!/usr/local/bin/ruby
# -*- mode: ruby -*-
# $Id: hash.ruby,v 1.3 2001/10/09 23:25:12 doug Exp $
# http://www.bagley.org/~doug/shootout/
# with help from Aristarkh A Zagorodnikov
start = Time.new
n = (ARGV.shift || 1).to_i
hash = {}
for i in 1..n
hash['%x' % i] = 1
end
c = 0
n.downto 1 do |i|
c += 1 if hash.has_key? i.to_s
end
puts(Time.new - start)
And here's the code that drives the benchmarks:
batsman@tux-chan:~/src/cvs$ cat benchmark.rb
dirs = [ "ruby-1.6.8.jenkins", "ruby-1.6.8.jenkins.orig",
"ruby.jenkins", "ruby.jenkins.orig" ]
sizes = [ 10000 , 30000, 100000, 300000, 500000, 1000000 ]
res = {}
dirs.each do |x|
res[x] = {}
sizes.each do |size|
total = 0.0
6.times do
out = `#{x}/ruby /tmp/hash.rb #{size}`
total += Float(out)
end
#puts "Total: #{total}"
res[x][size] = total / 6
end
end
print " " * 23
sizes.each { |x| print "%9d " % x }
puts
dirs.each do |d|
print "%23s" % d
sizes.each { |s| printf "%10.4 f" % res[d][s] }
puts
end
Hope this is useful. It's not a huge performance increase but even 5% in
this particular test is OK as this is all very simple.
--
_ _
| |__ __ _| |_ ___ _ __ ___ __ _ _ __
| '_ \ / _` | __/ __| '_ ` _ \ / _` | '_ \
| |_) | (_| | |_\__ \ | | | | | (_| | | | |
|_.__/ \__,_|\__|___/_| |_| |_|\__,_|_| |_|
Running Debian GNU/Linux Sid (unstable)
batsman dot geo at yahoo dot com
A Linux machine! Because a 486 is a terrible thing to waste!
-- Joe Sloan, jjs@wintermute.ucr.edu
Attachments (2)
ruby-1.6.8-jenkins-0.0.1.patch
(5.55 KB, text/x-diff)
--- ruby-1.6.8.jenkins.orig/st.c 2002-02-27 08:46:50.000000000 +0100
+++ ruby-1.6.8.jenkins/st.c 2003-02-28 19:41:29.000000000 +0100
@@ -10,6 +10,7 @@
#include <malloc.h>
#endif
+#define HASH_JENKINS 1
typedef struct st_table_entry st_table_entry;
struct st_table_entry {
@@ -65,7 +66,11 @@
#define EQUAL(table,x,y) ((x)==(y) || (*table->type->compare)((x),(y)) == 0)
#define do_hash(key,table) (unsigned int)(*(table)->type->hash)((key))
+#ifdef HASH_JENKINS
+#define do_hash_bin(key,table) (do_hash(key, table)&(table)->num_bins)
+#else
#define do_hash_bin(key,table) (do_hash(key, table)%(table)->num_bins)
+#endif
/*
* MINSIZE is the minimum size of a dictionary.
@@ -73,6 +78,7 @@
#define MINSIZE 8
+#ifndef HASH_JENKINS
/*
Table of prime numbers 2^n+a, 2<=n<=30.
*/
@@ -107,6 +113,7 @@
1073741824 + 85,
0
};
+#endif
static int
new_size(size)
@@ -114,9 +121,9 @@
{
int i, newsize;
-#if 0
+#ifdef HASH_JENKINS
for (i=3; i<31; i++) {
- if ((1<<i) > size) return 1<<i;
+ if ((1<<i) > size) return ((1<<i) - 1);
}
return -1;
#else
@@ -164,7 +171,11 @@
tbl->type = type;
tbl->num_entries = 0;
tbl->num_bins = size;
+#ifdef HASH_JENKINS
+ tbl->bins = (st_table_entry **)Calloc(size+1, sizeof(st_table_entry*));
+#else
tbl->bins = (st_table_entry **)Calloc(size, sizeof(st_table_entry*));
+#endif
return tbl;
}
@@ -208,8 +219,11 @@
{
register st_table_entry *ptr, *next;
int i;
-
+#ifdef HASH_JENKINS
+ for(i = 0; i < table->num_bins+1; i++) {
+#else
for(i = 0; i < table->num_bins; i++) {
+#endif
ptr = table->bins[i];
while (ptr != 0) {
next = ptr->next;
@@ -230,6 +244,22 @@
#define COLLISION
#endif
+
+#ifdef HASH_JENKINS
+
+#define FIND_ENTRY(table, ptr, hash_val, bin_pos) \
+bin_pos = hash_val&(table)->num_bins;\
+ptr = (table)->bins[bin_pos];\
+if (PTR_NOT_EQUAL(table, ptr, hash_val, key)) {\
+ COLLISION;\
+ while (PTR_NOT_EQUAL(table, ptr->next, hash_val, key)) {\
+ ptr = ptr->next;\
+ }\
+ ptr = ptr->next;\
+}
+
+#else /* HASH_JENKINS */
+
#define FIND_ENTRY(table, ptr, hash_val, bin_pos) \
bin_pos = hash_val%(table)->num_bins;\
ptr = (table)->bins[bin_pos];\
@@ -241,6 +271,8 @@
ptr = ptr->next;\
}
+#endif /* HASH_JENKINS */
+
int
st_lookup(table, key, value)
st_table *table;
@@ -261,6 +293,28 @@
}
}
+#ifdef HASH_JENKINS
+
+#define ADD_DIRECT(table, key, value, hash_val, bin_pos)\
+do {\
+ st_table_entry *entry;\
+ if (table->num_entries/(table->num_bins+1) > ST_DEFAULT_MAX_DENSITY) {\
+ rehash(table);\
+ bin_pos = hash_val & table->num_bins;\
+ }\
+ \
+ entry = alloc(st_table_entry);\
+ \
+ entry->hash = hash_val;\
+ entry->key = key;\
+ entry->record = value;\
+ entry->next = table->bins[bin_pos];\
+ table->bins[bin_pos] = entry;\
+ table->num_entries++;\
+} while (0)
+
+#else /* HASH_JENKINS */
+
#define ADD_DIRECT(table, key, value, hash_val, bin_pos)\
{\
st_table_entry *entry;\
@@ -279,6 +333,8 @@
table->num_entries++;\
}
+#endif /* HASH_JENKINS */
+
int
st_insert(table, key, value)
register st_table *table;
@@ -309,7 +365,11 @@
unsigned int hash_val, bin_pos;
hash_val = do_hash(key, table);
+#ifdef HASH_JENKINS
+ bin_pos = hash_val & table->num_bins;
+#else
bin_pos = hash_val % table->num_bins;
+#endif
ADD_DIRECT(table, key, value, hash_val, bin_pos);
}
@@ -321,14 +381,27 @@
int i, old_num_bins = table->num_bins, new_num_bins;
unsigned int hash_val;
+#ifdef HASH_JENKINS
+ new_num_bins = new_size(old_num_bins+2);
+ new_bins = (st_table_entry**)Calloc(new_num_bins+1, sizeof(st_table_entry*));
+#else
new_num_bins = new_size(old_num_bins+1);
new_bins = (st_table_entry**)Calloc(new_num_bins, sizeof(st_table_entry*));
+#endif
+#ifdef HASH_JENKINS
+ for(i = 0; i < old_num_bins+1; i++) {
+#else
for(i = 0; i < old_num_bins; i++) {
+#endif
ptr = table->bins[i];
while (ptr != 0) {
next = ptr->next;
+#ifdef HASH_JENKINS
+ hash_val = ptr->hash & new_num_bins;
+#else
hash_val = ptr->hash % new_num_bins;
+#endif
ptr->next = new_bins[hash_val];
new_bins[hash_val] = ptr;
ptr = next;
@@ -345,7 +418,12 @@
{
st_table *new_table;
st_table_entry *ptr, *entry;
- int i, num_bins = old_table->num_bins;
+ int i;
+#ifdef HASH_JENKINS
+ int num_bins = old_table->num_bins+1;
+#else
+ int num_bins = old_table->num_bins;
+#endif
new_table = alloc(st_table);
if (new_table == 0) {
@@ -482,7 +560,11 @@
enum st_retval retval;
int i;
+#ifdef HASH_JENKINS
+ for(i = 0; i < table->num_bins+1; i++) {
+#else
for(i = 0; i < table->num_bins; i++) {
+#endif
last = 0;
for(ptr = table->bins[i]; ptr != 0;) {
retval = (*func)(ptr->key, ptr->record, arg);
@@ -512,9 +594,9 @@
strhash(string)
register char *string;
{
- register int c;
#ifdef HASH_ELFHASH
+ register int c;
register unsigned int h = 0, g;
while ((c = *string++) != '\0') {
@@ -525,6 +607,7 @@
}
return h;
#elif HASH_PERL
+ register int c;
register int val = 0;
while ((c = *string++) != '\0') {
@@ -532,7 +615,20 @@
}
return val + (val>>5);
+#elif HASH_JENKINS
+ register unsigned int c;
+ register unsigned int val = 0;
+ register const unsigned char *str = (unsigned char *)string;
+ while ((c = *str++) != '\0') {
+ val += c;
+ val += (val << 10);
+ val ^= (val >> 6);
+ }
+ val += (val << 3);
+ val ^= (val >> 11);
+ return val + (val << 15);
#else
+ register int c;
register int val = 0;
while ((c = *string++) != '\0') {
ruby-1.8.0-jenkins-0.0.1.patch
(6.26 KB, text/x-diff)
diff -bu ruby.jenkins.orig/gc.c ruby.jenkins/gc.c
--- ruby.jenkins.orig/gc.c 2003-02-28 17:16:13.000000000 +0100
+++ ruby.jenkins/gc.c 2003-02-28 18:40:11.000000000 +0100
@@ -66,6 +66,8 @@
#endif
#endif
+#define HASH_JENKINS 1
+
static unsigned long malloc_increase = 0;
static unsigned long malloc_limit = GC_MALLOC_LIMIT;
static void run_final();
@@ -862,8 +864,13 @@
struct st_table *tbl;
{
if (!tbl) return 0;
+#ifdef HASH_JENKINS
+ return (tbl->num_bins+1) * sizeof(struct st_table_entry *) +
+ tbl->num_entries * 4 * sizeof(VALUE);
+#else
return tbl->num_bins * sizeof(struct st_table_entry *) +
tbl->num_entries * 4 * sizeof(VALUE);
+#endif
}
static void
diff -bu ruby.jenkins.orig/st.c ruby.jenkins/st.c
--- ruby.jenkins.orig/st.c 2003-02-28 17:16:14.000000000 +0100
+++ ruby.jenkins/st.c 2003-02-28 19:18:15.000000000 +0100
@@ -12,6 +12,7 @@
#include <malloc.h>
#endif
+#define HASH_JENKINS 1
typedef struct st_table_entry st_table_entry;
struct st_table_entry {
@@ -67,7 +68,11 @@
#define EQUAL(table,x,y) ((x)==(y) || (*table->type->compare)((x),(y)) == 0)
#define do_hash(key,table) (unsigned int)(*(table)->type->hash)((key))
+#ifdef HASH_JENKINS
+#define do_hash_bin(key,table) (do_hash(key, table)&(table)->num_bins)
+#else
#define do_hash_bin(key,table) (do_hash(key, table)%(table)->num_bins)
+#endif
/*
* MINSIZE is the minimum size of a dictionary.
@@ -75,6 +80,7 @@
#define MINSIZE 8
+#ifndef HASH_JENKINS
/*
Table of prime numbers 2^n+a, 2<=n<=30.
*/
@@ -109,6 +115,7 @@
1073741824 + 85,
0
};
+#endif
static int
new_size(size)
@@ -116,9 +123,9 @@
{
int i;
-#if 0
+#ifdef HASH_JENKINS
for (i=3; i<31; i++) {
- if ((1<<i) > size) return 1<<i;
+ if ((1<<i) > size) return ((1<<i) - 1);
}
return -1;
#else
@@ -168,7 +175,11 @@
tbl->type = type;
tbl->num_entries = 0;
tbl->num_bins = size;
+#ifdef HASH_JENKINS
+ tbl->bins = (st_table_entry **)Calloc(size+1, sizeof(st_table_entry*));
+#else
tbl->bins = (st_table_entry **)Calloc(size, sizeof(st_table_entry*));
+#endif
return tbl;
}
@@ -212,8 +223,11 @@
{
register st_table_entry *ptr, *next;
int i;
-
+#ifdef HASH_JENKINS
+ for(i = 0; i < table->num_bins+1; i++) {
+#else
for(i = 0; i < table->num_bins; i++) {
+#endif
ptr = table->bins[i];
while (ptr != 0) {
next = ptr->next;
@@ -234,6 +248,19 @@
#define COLLISION
#endif
+#ifdef HASH_JENKINS
+#define FIND_ENTRY(table, ptr, hash_val, bin_pos) do {\
+ bin_pos = hash_val & (table)->num_bins;\
+ ptr = (table)->bins[bin_pos];\
+ if (PTR_NOT_EQUAL(table, ptr, hash_val, key)) {\
+ COLLISION;\
+ while (PTR_NOT_EQUAL(table, ptr->next, hash_val, key)) {\
+ ptr = ptr->next;\
+ }\
+ ptr = ptr->next;\
+ }\
+} while (0)
+#else
#define FIND_ENTRY(table, ptr, hash_val, bin_pos) do {\
bin_pos = hash_val%(table)->num_bins;\
ptr = (table)->bins[bin_pos];\
@@ -245,6 +272,7 @@
ptr = ptr->next;\
}\
} while (0)
+#endif
int
st_lookup(table, key, value)
@@ -267,6 +295,28 @@
}
}
+#ifdef HASH_JENKINS
+
+#define ADD_DIRECT(table, key, value, hash_val, bin_pos)\
+do {\
+ st_table_entry *entry;\
+ if (table->num_entries/(table->num_bins+1) > ST_DEFAULT_MAX_DENSITY) {\
+ rehash(table);\
+ bin_pos = hash_val & table->num_bins;\
+ }\
+ \
+ entry = alloc(st_table_entry);\
+ \
+ entry->hash = hash_val;\
+ entry->key = key;\
+ entry->record = value;\
+ entry->next = table->bins[bin_pos];\
+ table->bins[bin_pos] = entry;\
+ table->num_entries++;\
+} while (0)
+
+#else /* HASH_JENKINS */
+
#define ADD_DIRECT(table, key, value, hash_val, bin_pos)\
do {\
st_table_entry *entry;\
@@ -284,6 +334,7 @@
table->bins[bin_pos] = entry;\
table->num_entries++;\
} while (0)
+#endif /* HASH_JENKINS */
int
st_insert(table, key, value)
@@ -316,7 +367,11 @@
unsigned int hash_val, bin_pos;
hash_val = do_hash(key, table);
+#ifdef HASH_JENKINS
+ bin_pos = hash_val & table->num_bins;
+#else
bin_pos = hash_val % table->num_bins;
+#endif
ADD_DIRECT(table, key, value, hash_val, bin_pos);
}
@@ -328,14 +383,27 @@
int i, old_num_bins = table->num_bins, new_num_bins;
unsigned int hash_val;
+#ifdef HASH_JENKINS
+ new_num_bins = new_size(old_num_bins+2);
+ new_bins = (st_table_entry**)Calloc(new_num_bins+1, sizeof(st_table_entry*));
+#else
new_num_bins = new_size(old_num_bins+1);
new_bins = (st_table_entry**)Calloc(new_num_bins, sizeof(st_table_entry*));
+#endif
+#ifdef HASH_JENKINS
+ for(i = 0; i < old_num_bins+1; i++) {
+#else
for(i = 0; i < old_num_bins; i++) {
+#endif
ptr = table->bins[i];
while (ptr != 0) {
next = ptr->next;
+#ifdef HASH_JENKINS
+ hash_val = ptr->hash & new_num_bins;
+#else
hash_val = ptr->hash % new_num_bins;
+#endif
ptr->next = new_bins[hash_val];
new_bins[hash_val] = ptr;
ptr = next;
@@ -352,7 +420,12 @@
{
st_table *new_table;
st_table_entry *ptr, *entry;
- int i, num_bins = old_table->num_bins;
+ int i;
+#ifdef HASH_JENKINS
+ int num_bins = old_table->num_bins+1;
+#else
+ int num_bins = old_table->num_bins;
+#endif
new_table = alloc(st_table);
if (new_table == 0) {
@@ -489,7 +562,11 @@
enum st_retval retval;
int i;
+#ifdef HASH_JENKINS
+ for(i = 0; i < table->num_bins+1; i++) {
+#else
for(i = 0; i < table->num_bins; i++) {
+#endif
last = 0;
for(ptr = table->bins[i]; ptr != 0;) {
retval = (*func)(ptr->key, ptr->record, arg);
@@ -520,9 +597,9 @@
strhash(string)
register const char *string;
{
- register int c;
#ifdef HASH_ELFHASH
+ register int c;
register unsigned int h = 0, g;
while ((c = *string++) != '\0') {
@@ -533,6 +610,7 @@
}
return h;
#elif HASH_PERL
+ register int c;
register int val = 0;
while ((c = *string++) != '\0') {
@@ -540,7 +618,20 @@
}
return val + (val>>5);
+#elif HASH_JENKINS
+ register unsigned int c;
+ register unsigned int val = 0;
+ register const unsigned char *str = (unsigned char *)string;
+ while ((c = *str++) != '\0') {
+ val += c;
+ val += (val << 10);
+ val ^= (val >> 6);
+ }
+ val += (val << 3);
+ val ^= (val >> 11);
+ return val + (val << 15);
#else
+ register int c;
register int val = 0;
while ((c = *string++) != '\0') {