>From 05047644ee380f9d80c0c497641011c29fee610a Mon Sep 17 00:00:00 2001 From: Peter Bex Date: Wed, 4 Jan 2012 21:17:47 +0100 Subject: [PATCH] Provide protection against algorithmic complexity attacks on hash tables: - Replace predictable hashing function with a simple shift-xor-and hash, which has better key distribution for shared suffix strings. - Perturb input with a different random number per process for low-level hash tables and symbol tables, and a different nubmer per hash table for SRFI-69 tables. - In order to make it easier to change the string hashing function in the future, put the algorithm in one place for all three of SRFI-69, low-level and symbol table hashing. Because the random number needs to be used as the seed for the hash in order to properly perturb the input, the C interface must be changed, deprecating C_string_hash[_ci]. --- NEWS | 8 ++ chicken.h | 7 +- eval.scm | 6 +- manual/Unit srfi-69 | 44 +++++-- runtime.c | 63 +++++----- srfi-69.scm | 290 ++++++++++++++++++++++++-------------------- tests/hash-table-tests.scm | 31 +++++- types.db | 26 ++-- 8 files changed, 281 insertions(+), 194 deletions(-) diff --git a/NEWS b/NEWS index 76ca4f7..5bb4093 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,11 @@ 4.7.4 +- Security fixes + - improved hash table collision resistance and added randomization + to prevent malicious external causes of collisions. All SRFI-69 + procedures accept extra arguments to control randomization for + testing/debugging. + - Build system - the test-suite is now working on for the mingw32 platform (with a few tests disabled due to missing functionality) @@ -50,6 +56,8 @@ on direction and open/closed state - "mutate-procedure" has been renamed to "mutate-procedure!" - the old name is still available but deprecated ("lolevel" unit) + - deprecated C_hash_string and C_hash_string_ci functions in the C API + in favor of the more secure versions C_i_string_hash, C_i_string_ci_hash. - Compiler - fixed a bug in the compiler that could cause some variable bindings diff --git a/chicken.h b/chicken.h index 8169f6d..fecdcee 100644 --- a/chicken.h +++ b/chicken.h @@ -682,6 +682,7 @@ typedef struct C_symbol_table_struct { char *name; unsigned int size; + unsigned int rand; C_word *table; struct C_symbol_table_struct *next; } C_SYMBOL_TABLE; @@ -1603,8 +1604,10 @@ C_fctexport int C_fcall C_in_heapp(C_word x) C_regparm; C_fctexport int C_fcall C_in_fromspacep(C_word x) C_regparm; C_fctexport void C_fcall C_trace(C_char *name) C_regparm; C_fctexport C_word C_fcall C_emit_trace_info2(char *raw, C_word x, C_word y, C_word t) C_regparm; -C_fctexport C_word C_fcall C_hash_string(C_word str) C_regparm; -C_fctexport C_word C_fcall C_hash_string_ci(C_word str) C_regparm; +C_fctexport C_word C_fcall C_u_i_string_hash(C_word str, C_word rnd) C_regparm; +C_fctexport C_word C_fcall C_u_i_string_ci_hash(C_word str, C_word rnd) C_regparm; +C_fctexport C_word C_fcall C_hash_string(C_word str) C_regparm; /* DEPRECATED, INSECURE */ +C_fctexport C_word C_fcall C_hash_string_ci(C_word str) C_regparm; /* DEPRECATED, INSECURE */ C_fctexport C_word C_halt(C_word msg); C_fctexport C_word C_message(C_word msg); C_fctexport C_word C_fcall C_equalp(C_word x, C_word y) C_regparm; diff --git a/eval.scm b/eval.scm index 04c2cd9..5f4bfc2 100644 --- a/eval.scm +++ b/eval.scm @@ -109,13 +109,15 @@ (define ##sys#hash-symbol (let ([cache-s #f] - [cache-h #f] ) + [cache-h #f] + ;; NOTE: All low-level hash tables share the same randomization factor + [rand (##core#inline "C_random_fixnum" most-positive-fixnum)] ) (lambda (s n) (if (eq? s cache-s) (##core#inline "C_fixnum_modulo" cache-h n) (begin (set! cache-s s) - (set! cache-h (##core#inline "C_hash_string" (##sys#slot s 1))) + (set! cache-h (##core#inline "C_u_i_string_hash" (##sys#slot s 1) rand)) (##core#inline "C_fixnum_modulo" cache-h n)))))) (define (##sys#hash-table-ref ht key) diff --git a/manual/Unit srfi-69 b/manual/Unit srfi-69 index 92db700..9db2a97 100644 --- a/manual/Unit srfi-69 +++ b/manual/Unit srfi-69 @@ -13,7 +13,7 @@ CHICKEN implements SRFI 69 with SRFI 90 extensions. For more information, see ==== make-hash-table -(make-hash-table [TEST HASH SIZE] [#:test TEST] [#:hash HASH] [#:size SIZE] [#:initial INITIAL] [#:min-load MIN-LOAD] [#:max-load MAX-LOAD] [#:weak-keys WEAK-KEYS] [#:weak-values WEAK-VALUES]) +(make-hash-table [TEST HASH SIZE] [#:test TEST] [#:hash HASH] [#:size SIZE] [#:initial INITIAL] [#:randomization RANDOMIZATION] [#:min-load MIN-LOAD] [#:max-load MAX-LOAD] [#:weak-keys WEAK-KEYS] [#:weak-values WEAK-VALUES]) Returns a new {{HASH-TABLE}} with the supplied configuration. @@ -21,6 +21,7 @@ Returns a new {{HASH-TABLE}} with the supplied configuration. ; {{HASH}} : The hash function. ; {{SIZE}} : The expected number of table elements. ; {{INITIAL}} : The default initial value. +; {{RANDOMIZATION}} : A value for perturbing hash values. Should never be a fixed value! ; {{MIN-LOAD}} : The minimum load factor. A {{flonum}} in (0.0 1.0). ; {{MAX-LOAD}} : The maximum load factor. A {{flonum}} in (0.0 1.0). ; {{WEAK-KEYS}} : Use weak references for keys. (Ignored) @@ -29,7 +30,7 @@ Returns a new {{HASH-TABLE}} with the supplied configuration. ==== alist->hash-table -(alist->hash-table A-LIST [#:test TEST] [#:hash HASH] [#:size SIZE] [#:initial INITIAL] [#:min-load MIN-LOAD] [#:max-load MAX-LOAD] [#:weak-keys WEAK-KEYS] [#:weak-values WEAK-VALUES]) +(alist->hash-table A-LIST [#:test TEST] [#:hash HASH] [#:size SIZE] [#:initial INITIAL] [#:randomization RANDOMIZATION] [#:min-load MIN-LOAD] [#:max-load MAX-LOAD] [#:weak-keys WEAK-KEYS] [#:weak-values WEAK-VALUES]) Returns a new {{HASH-TABLE}}. The {{HASH-TABLE}} is populated from the {{A-LIST}}. The keyword arguments are per {{make-hash-table}}. @@ -104,6 +105,13 @@ Does the {{HASH-TABLE}} have a default initial value? The {{HASH-TABLE}} default initial value. +==== hash-table-randomization + +(hash-table-randomization HASH-TABLE) + +The randomization number for {{HASH-TABLE}}. Make sure you never +expose this to a potential attacker. + ==== hash-table-keys @@ -285,38 +293,46 @@ entry. All hash functions return a {{fixnum}} in the range [0 {{BOUND}}). +When given the fixnum RANDOMIZATION, these functions will use this +to perturb the value; if not specified, the value will differ for +each invocation of your program. This is for security reasons; an +attacker who knows what a value hashes to can deliberately try to +cause collisions, thereby flattening your hash table, effectively +reducing it to a list. Always make sure you don't expose any +hashed value to an attacker. + ==== number-hash -(number-hash NUMBER [BOUND]) +(number-hash NUMBER [BOUND RANDOMIZATION]) For use with {{=}} as a {{hash-table-equivalence-function}}. ==== object-uid-hash -(object-uid-hash OBJECT [BOUND]) +(object-uid-hash OBJECT [BOUND RANDOMIZATION]) Currently a synonym for {{equal?-hash}}. ==== symbol-hash -(symbol-hash SYMBOL [BOUND]) +(symbol-hash SYMBOL [BOUND RANDOMIZATION]) For use with {{eq?}} as a {{hash-table-equivalence-function}}. ==== keyword-hash -(keyword-hash KEYWORD [BOUND]) +(keyword-hash KEYWORD [BOUND RANDOMIZATION]) For use with {{eq?}} as a {{hash-table-equivalence-function}}. ==== string-hash -(string-hash STRING [BOUND START END]) +(string-hash STRING [BOUND START END RANDOMIZATION]) For use with {{string=?}} as a {{hash-table-equivalence-function}}. The optional {{START}} and {{END}} arguments may be given to limit @@ -325,43 +341,43 @@ the hash calculation to a specific sub-section of {{STRING}}. ==== string-ci-hash -(string-hash-ci STRING [BOUND START END])
-(string-ci-hash STRING [BOUND START END]) +(string-hash-ci STRING [BOUND START END RANDOMIZATION])
+(string-ci-hash STRING [BOUND START END RANDOMIZATION]) For use with {{string-ci=?}} as a {{hash-table-equivalence-function}}. ==== eq?-hash -(eq?-hash OBJECT [BOUND]) +(eq?-hash OBJECT [BOUND RANDOMIZATION]) For use with {{eq?}} as a {{hash-table-equivalence-function}}. ==== eqv?-hash -(eqv?-hash OBJECT [BOUND]) +(eqv?-hash OBJECT [BOUND RANDOMIZATION]) For use with {{eqv?}} as a {{hash-table-equivalence-function}}. ==== equal?-hash -(equal?-hash OBJECT [BOUND]) +(equal?-hash OBJECT [BOUND RANDOMIZATION]) For use with {{equal?}} as a {{hash-table-equivalence-function}}. ==== hash -(hash OBJECT [BOUND]) +(hash OBJECT [BOUND RANDOMIZATION]) Synonym for {{equal?-hash}}. ==== hash-by-identity -(hash-by-identity OBJECT [BOUND]) +(hash-by-identity OBJECT [BOUND RANDOMIZATION]) Synonym for {{eq?-hash}}. diff --git a/runtime.c b/runtime.c index b7d7d95..3c5f3a9 100644 --- a/runtime.c +++ b/runtime.c @@ -471,7 +471,7 @@ static void C_fcall really_mark(C_word *x) C_regparm; static WEAK_TABLE_ENTRY *C_fcall lookup_weak_table_entry(C_word item, C_word container) C_regparm; static C_ccall void values_continuation(C_word c, C_word closure, C_word dummy, ...) C_noret; static C_word add_symbol(C_word **ptr, C_word key, C_word string, C_SYMBOL_TABLE *stable); -static int C_fcall hash_string(int len, C_char *str, unsigned int m) C_regparm; +static C_word C_fcall hash_string(int len, C_char *str, C_word m, C_word r, int ci) C_regparm; static C_word C_fcall lookup(C_word key, int len, C_char *str, C_SYMBOL_TABLE *stable) C_regparm; static double compute_symbol_table_load(double *avg_bucket_len, int *total); static C_word C_fcall convert_string_to_number(C_char *str, int radix, C_word *fix, double *flo) C_regparm; @@ -846,7 +846,7 @@ void *CHICKEN_global_lookup(char *name) { int len = C_strlen(name), - key = hash_string(len, name, symbol_table->size); + key = hash_string(len, name, symbol_table->size, symbol_table->rand, 0); C_word s; void *root = CHICKEN_new_gc_root(); @@ -886,6 +886,7 @@ C_regparm C_SYMBOL_TABLE *C_new_symbol_table(char *name, unsigned int size) stp->name = name; stp->size = size; stp->next = symbol_table_list; + stp->rand = C_unfix(C_random_fixnum(C_fix(size))); if((stp->table = (C_word *)C_malloc(size * sizeof(C_word))) == NULL) return NULL; @@ -933,7 +934,7 @@ C_regparm C_word C_find_symbol(C_word str, C_SYMBOL_TABLE *stable) char *sptr = C_c_string(str); int len = C_header_size(str), - key = hash_string(len, sptr, stable->size); + key = hash_string(len, sptr, stable->size, stable->rand, 0); C_word s; if(C_truep(s = lookup(key, len, sptr, stable))) return s; @@ -1961,7 +1962,7 @@ C_regparm C_word C_fcall C_intern_in(C_word **ptr, int len, C_char *str, C_SYMBO if(stable == NULL) stable = symbol_table; - key = hash_string(len, str, stable->size); + key = hash_string(len, str, stable->size, stable->rand, 0); if(C_truep(s = lookup(key, len, str, stable))) return s; @@ -1979,7 +1980,7 @@ C_regparm C_word C_fcall C_h_intern_in(C_word *slot, int len, C_char *str, C_SYM if(stable == NULL) stable = symbol_table; - key = hash_string(len, str, stable->size); + key = hash_string(len, str, stable->size, stable->rand, 0); if(C_truep(s = lookup(key, len, str, stable))) { if(C_in_stackp(s)) C_mutate(slot, s); @@ -1995,7 +1996,7 @@ C_regparm C_word C_fcall C_h_intern_in(C_word *slot, int len, C_char *str, C_SYM C_regparm C_word C_fcall intern0(C_char *str) { int len = C_strlen(str); - int key = hash_string(len, str, symbol_table->size); + int key = hash_string(len, str, symbol_table->size, symbol_table->rand, 0); C_word s; if(C_truep(s = lookup(key, len, str, symbol_table))) return s; @@ -2009,7 +2010,7 @@ C_regparm C_word C_fcall C_lookup_symbol(C_word sym) C_word str = C_block_item(sym, 1); int len = C_header_size(str); - key = hash_string(len, C_c_string(str), symbol_table->size); + key = hash_string(len, C_c_string(str), symbol_table->size, symbol_table->rand, 0); return lookup(key, len, C_c_string(str), symbol_table); } @@ -2030,18 +2031,16 @@ C_regparm C_word C_fcall C_intern3(C_word **ptr, C_char *str, C_word value) } -C_regparm int C_fcall hash_string(int len, C_char *str, unsigned int m) +C_regparm C_word C_fcall hash_string(int len, C_char *str, C_word m, C_word r, int ci) { - unsigned int key = 0; + C_uword key = r; -# if 0 - /* Zbigniew's suggested change for extended significance & ^2 table sizes. */ - while(len--) key += (key << 5) + *(str++); -# else - while(len--) key = (key << 4) + *(str++); -# endif + if (ci) + while(len--) key ^= (key << 6) + (key >> 2) + C_tolower((int)(*str++)); + else + while(len--) key ^= (key << 6) + (key >> 2) + *(str++); - return (int)(key % m); + return (C_word)(key % (C_uword)m); } @@ -3743,29 +3742,31 @@ C_word C_fetch_trace(C_word starti, C_word buffer) return C_fix(p); } - -C_regparm C_word C_fcall C_hash_string(C_word str) +C_regparm C_word C_fcall C_u_i_string_hash(C_word str, C_word rnd) { - unsigned C_word key = 0; int len = C_header_size(str); - C_byte *ptr = C_data_pointer(str); - while(len--) key = (key << 4) + (*ptr++); - - return C_fix(key & C_MOST_POSITIVE_FIXNUM); + C_char *ptr = C_data_pointer(str); + return C_fix(hash_string(len, ptr, C_MOST_POSITIVE_FIXNUM, C_unfix(rnd), 0)); } - -C_regparm C_word C_fcall C_hash_string_ci(C_word str) +C_regparm C_word C_fcall C_u_i_string_ci_hash(C_word str, C_word rnd) { - unsigned C_word key = 0; int len = C_header_size(str); - C_byte *ptr = C_data_pointer(str); - - while(len--) key = (key << 4) + C_tolower((int)(*ptr++)); + C_char *ptr = C_data_pointer(str); + return C_fix(hash_string(len, ptr, C_MOST_POSITIVE_FIXNUM, C_unfix(rnd), 1)); +} - return C_fix(key & C_MOST_POSITIVE_FIXNUM); +/* DEPRECATED, INSECURE */ +C_regparm C_word C_fcall C_hash_string(C_word str) +{ + return C_u_i_string_hash(str, C_fix(0)); } +/* DEPRECATED, INSECURE */ +C_regparm C_word C_fcall C_hash_string_ci(C_word str) +{ + return C_u_i_string_ci_hash(str, C_fix(0)); +} C_regparm void C_fcall C_toplevel_entry(C_char *name) { @@ -7092,7 +7093,7 @@ void C_ccall C_string_to_symbol(C_word c, C_word closure, C_word k, C_word strin len = C_header_size(string); name = (C_char *)C_data_pointer(string); - key = hash_string(len, name, symbol_table->size); + key = hash_string(len, name, symbol_table->size, symbol_table->rand, 0); if(!C_truep(s = lookup(key, len, name, symbol_table))) s = add_symbol(&a, key, string, symbol_table); diff --git a/srfi-69.scm b/srfi-69.scm index febfe7e..00b7f92 100644 --- a/srfi-69.scm +++ b/srfi-69.scm @@ -69,11 +69,11 @@ (define-inline (%byte-block? obj) (##core#inline "C_byteblockp" obj) ) -(define-inline (%string-hash str) - (##core#inline "C_hash_string" str) ) +(define-inline (%string-hash str rnd) + (##core#inline "C_u_i_string_hash" str rnd) ) -(define-inline (%string-ci-hash str) - (##core#inline "C_hash_string_ci" str) ) +(define-inline (%string-ci-hash str rnd) + (##core#inline "C_u_i_string_ci_hash" str rnd) ) (define-inline (%subbyte bytvec i) (##core#inline "C_subbyte" bytvec i) ) @@ -105,6 +105,8 @@ (define-constant unknown-immediate-hash-value 262) (define-constant hash-default-bound 536870912) +(define hash-default-randomization + (##core#inline "C_random_fixnum" hash-default-bound)) ;; Force Hash to Bounded Fixnum: @@ -136,22 +138,23 @@ `(,_fx+ (,_%subbyte ,flo ,idx) (,_fxshl ,(loop (fx- idx 1)) 1)) ) ) ) ) ) ) -(define (##sys#number-hash-hook obj) - (*equal?-hash obj) ) +(define (##sys#number-hash-hook obj rnd) + (*equal?-hash obj rnd) ) -(define-inline (%non-fixnum-number-hash obj) - (cond [(flonum? obj) ($flonum-hash obj)] - [else (%fix (##sys#number-hash-hook obj))] ) ) +(define-inline (%non-fixnum-number-hash obj rnd) + (cond [(flonum? obj) ($flonum-hash obj rnd)] + [else (%fix (##sys#number-hash-hook obj rnd))] ) ) -(define-inline (%number-hash obj) - (cond [(fixnum? obj) obj] - [else (%non-fixnum-number-hash obj)] ) ) +(define-inline (%number-hash obj rnd) + (cond [(fixnum? obj) (fxxor obj rnd)] + [else (%non-fixnum-number-hash obj rnd)] ) ) -(define (number-hash obj #!optional (bound hash-default-bound)) +(define (number-hash obj #!optional (bound hash-default-bound) + (randomization hash-default-randomization)) (unless (number? obj) (##sys#signal-hook #:type 'number-hash "invalid number" obj) ) (##sys#check-exact bound 'number-hash) - (%hash/limit (%number-hash obj) bound) ) + (%hash/limit (%number-hash obj randomization) bound) ) ;; Object UID Hash: @@ -159,12 +162,13 @@ (define-inline (%object-uid-hash obj) (%uid-hash (##sys#object->uid obj)) ) -(define-inline (%object-uid-hash obj) - (*equal?-hash obj) ) +(define-inline (%object-uid-hash obj rnd) + (*equal?-hash obj rnd) ) -(define (object-uid-hash obj #!optional (bound hash-default-bound)) +(define (object-uid-hash obj #!optional (bound hash-default-bound) + (randomization hash-default-randomization)) (##sys#check-exact bound 'object-uid-hash) - (%hash/limit (%object-uid-hash obj) bound) ) + (%hash/limit (%object-uid-hash obj randomization) bound) ) ;; Symbol Hash: @@ -172,13 +176,14 @@ (define-inline (%symbol-hash obj) (##sys#slot obj INDEX-OF-UNIQUE-HASH-VALUE-COMPUTED-DURING-SYMBOL-CREATION) ) -(define-inline (%symbol-hash obj) - (%string-hash (##sys#slot obj 1)) ) +(define-inline (%symbol-hash obj rnd) + (%string-hash (##sys#slot obj 1) rnd) ) -(define (symbol-hash obj #!optional (bound hash-default-bound)) +(define (symbol-hash obj #!optional (bound hash-default-bound) + (randomization hash-default-randomization)) (##sys#check-symbol obj 'symbol-hash) (##sys#check-exact bound 'symbol-hash) - (%hash/limit (%symbol-hash obj) bound) ) + (%hash/limit (%symbol-hash obj randomization) bound) ) ;; Keyword Hash: @@ -192,13 +197,14 @@ (define-inline (%keyword-hash obj) (##sys#slot obj INDEX-OF-UNIQUE-HASH-VALUE-COMPUTED-DURING-KEYWORD-CREATION) ) -(define-inline (%keyword-hash obj) - (%string-hash (##sys#slot obj 1)) ) +(define-inline (%keyword-hash obj rnd) + (%string-hash (##sys#slot obj 1) rnd) ) -(define (keyword-hash obj #!optional (bound hash-default-bound)) +(define (keyword-hash obj #!optional (bound hash-default-bound) + (randomization hash-default-randomization)) (##sys#check-keyword obj 'keyword-hash) (##sys#check-exact bound 'keyword-hash) - (%hash/limit (%keyword-hash obj) bound) ) + (%hash/limit (%keyword-hash obj randomization) bound) ) ;; Eq Hash: @@ -208,22 +214,23 @@ #; ;NOT YET (no keyword vs. symbol issue) (keyword? obj) ) ) -(define (*eq?-hash obj) - (cond [(fixnum? obj) obj] - [(char? obj) (char->integer obj)] - [(eq? obj #t) true-hash-value] - [(eq? obj #f) false-hash-value] - [(null? obj) null-hash-value] - [(eof-object? obj) eof-hash-value] - [(symbol? obj) (%symbol-hash obj)] +(define (*eq?-hash obj rnd) + (cond [(fixnum? obj) (fxxor obj rnd)] + [(char? obj) (fxxor (char->integer obj) rnd)] + [(eq? obj #t) (fxxor true-hash-value rnd)] + [(eq? obj #f) (fxxor false-hash-value rnd)] + [(null? obj) (fxxor null-hash-value rnd)] + [(eof-object? obj) (fxxor eof-hash-value rnd)] + [(symbol? obj) (%symbol-hash obj rnd)] #; ;NOT YET (no keyword vs. symbol issue) - [(keyword? obj) (%keyword-hash obj)] - [(%immediate? obj) unknown-immediate-hash-value] - [else (%object-uid-hash obj) ] ) ) + [(keyword? obj) (%keyword-hash obj rnd)] + [(%immediate? obj) (fxxor unknown-immediate-hash-value rnd)] + [else (%object-uid-hash obj rnd) ] ) ) -(define (eq?-hash obj #!optional (bound hash-default-bound)) +(define (eq?-hash obj #!optional (bound hash-default-bound) + (randomization hash-default-randomization)) (##sys#check-exact bound 'eq?-hash) - (%hash/limit (*eq?-hash obj) bound) ) + (%hash/limit (*eq?-hash obj randomization) bound) ) (define hash-by-identity eq?-hash) @@ -233,23 +240,24 @@ (or (%eq?-hash-object? obj) (number? obj) ) ) -(define (*eqv?-hash obj) - (cond [(fixnum? obj) obj] - [(char? obj) (char->integer obj)] - [(eq? obj #t) true-hash-value] - [(eq? obj #f) false-hash-value] - [(null? obj) null-hash-value] - [(eof-object? obj) eof-hash-value] - [(symbol? obj) (%symbol-hash obj)] +(define (*eqv?-hash obj rnd) + (cond [(fixnum? obj) (fxxor obj rnd)] + [(char? obj) (fxxor (char->integer obj) rnd)] + [(eq? obj #t) (fxxor true-hash-value rnd)] + [(eq? obj #f) (fxxor false-hash-value rnd)] + [(null? obj) (fxxor null-hash-value rnd)] + [(eof-object? obj) (fxxor eof-hash-value rnd)] + [(symbol? obj) (%symbol-hash obj rnd)] #; ;NOT YET (no keyword vs. symbol issue) - [(keyword? obj) (%keyword-hash obj)] - [(number? obj) (%non-fixnum-number-hash obj)] - [(%immediate? obj) unknown-immediate-hash-value] - [else (%object-uid-hash obj) ] ) ) + [(keyword? obj) (%keyword-hash obj rnd)] + [(number? obj) (%non-fixnum-number-hash obj rnd)] + [(%immediate? obj) (fxxor unknown-immediate-hash-value rnd)] + [else (%object-uid-hash obj rnd) ] ) ) -(define (eqv?-hash obj #!optional (bound hash-default-bound)) +(define (eqv?-hash obj #!optional (bound hash-default-bound) + (randomization hash-default-randomization)) (##sys#check-exact bound 'eqv?-hash) - (%hash/limit (*eqv?-hash obj) bound) ) + (%hash/limit (*eqv?-hash obj randomization) bound) ) ;; Equal Hash: @@ -259,105 +267,106 @@ ;; NOTE - These refer to identifiers available only within the body of '*equal?-hash'. -(define-inline (%%list-hash obj) +(define-inline (%%list-hash obj rnd) (fx+ (length obj) - (recursive-atomic-hash (##sys#slot obj 0) depth)) ) + (recursive-atomic-hash (##sys#slot obj 0) depth rnd)) ) -(define-inline (%%pair-hash obj) - (fx+ (fxshl (recursive-atomic-hash (##sys#slot obj 0) depth) 16) - (recursive-atomic-hash (##sys#slot obj 1) depth)) ) +(define-inline (%%pair-hash obj rnd) + (fx+ (fxshl (recursive-atomic-hash (##sys#slot obj 0) depth rnd) 16) + (recursive-atomic-hash (##sys#slot obj 1) depth rnd)) ) -(define-inline (%%port-hash obj) - (fx+ (fxshl (##sys#peek-fixnum obj 0) 4) ; Little extra "identity" +(define-inline (%%port-hash obj rnd) + (fx+ (fxxor (fxshl (##sys#peek-fixnum obj 0) 4) rnd) ; Little extra "identity" (if (input-port? obj) input-port-hash-value output-port-hash-value)) ) -(define-inline (%%special-vector-hash obj) - (vector-hash obj (##sys#peek-fixnum obj 0) depth 1) ) +(define-inline (%%special-vector-hash obj rnd) + (vector-hash obj (##sys#peek-fixnum obj 0) depth 1 rnd) ) -(define-inline (%%regular-vector-hash obj) - (vector-hash obj 0 depth 0) ) +(define-inline (%%regular-vector-hash obj rnd) + (vector-hash obj 0 depth 0 rnd) ) -(define (*equal?-hash obj) +(define (*equal?-hash obj rnd) ; Recurse into some portion of the vector's slots - (define (vector-hash obj seed depth start) + (define (vector-hash obj seed depth start rnd) (let ([len (##sys#size obj)]) - (let loop ([hsh (fx+ len seed)] + (let loop ([hsh (fx+ len (fxxor seed rnd))] [i start] [len (fx- (fxmin recursive-hash-max-length len) start)] ) (if (fx= len 0) hsh (loop (fx+ hsh (fx+ (fxshl hsh 4) - (recursive-hash (##sys#slot obj i) (fx+ depth 1)))) + (recursive-hash (##sys#slot obj i) (fx+ depth 1) rnd))) (fx+ i 1) (fx- len 1) ) ) ) ) ) ; Don't recurse into structured objects - (define (recursive-atomic-hash obj depth) + (define (recursive-atomic-hash obj depth rnd) (if (or (%eqv?-hash-object? obj) (%byte-block? obj)) - (recursive-hash obj (fx+ depth 1)) - other-hash-value ) ) + (recursive-hash obj (fx+ depth 1) rnd) + (fxxor other-hash-value rnd) ) ) ; Recurse into structured objects - (define (recursive-hash obj depth) + (define (recursive-hash obj depth rnd) (cond [(fx>= depth recursive-hash-max-depth) - other-hash-value] - [(fixnum? obj) obj] - [(char? obj) (char->integer obj)] - [(eq? obj #t) true-hash-value] - [(eq? obj #f) false-hash-value] - [(null? obj) null-hash-value] - [(eof-object? obj) eof-hash-value] - [(symbol? obj) (%symbol-hash obj)] + (fxxor other-hash-value rnd)] + [(fixnum? obj) (fxxor obj rnd)] + [(char? obj) (fxxor (char->integer obj) rnd)] + [(eq? obj #t) (fxxor true-hash-value rnd)] + [(eq? obj #f) (fxxor false-hash-value rnd)] + [(null? obj) (fxxor null-hash-value rnd)] + [(eof-object? obj) (fxxor eof-hash-value rnd)] + [(symbol? obj) (%symbol-hash obj rnd)] #; ;NOT YET (no keyword vs. symbol issue) - [(keyword? obj) (%keyword-hash obj)] - [(number? obj) (%non-fixnum-number-hash obj)] - [(%immediate? obj) unknown-immediate-hash-value] - [(%byte-block? obj) (%string-hash obj)] - [(list? obj) (%%list-hash obj)] - [(pair? obj) (%%pair-hash obj)] - [(%port? obj) (%%port-hash obj)] - [(%special? obj) (%%special-vector-hash obj)] - [else (%%regular-vector-hash obj)] ) ) + [(keyword? obj) (%keyword-hash obj rnd)] + [(number? obj) (%non-fixnum-number-hash obj rnd)] + [(%immediate? obj) (fxxor unknown-immediate-hash-value rnd)] + [(%byte-block? obj) (%string-hash obj rnd)] + [(list? obj) (%%list-hash obj rnd)] + [(pair? obj) (%%pair-hash obj rnd)] + [(%port? obj) (%%port-hash obj rnd)] + [(%special? obj) (%%special-vector-hash obj rnd)] + [else (%%regular-vector-hash obj rnd)] ) ) ; - (recursive-hash obj 0) ) + (recursive-hash obj 0 rnd) ) -(define (equal?-hash obj #!optional (bound hash-default-bound)) +(define (equal?-hash obj #!optional (bound hash-default-bound) + (randomization hash-default-randomization)) (##sys#check-exact bound 'hash) - (%hash/limit (*equal?-hash obj) bound) ) + (%hash/limit (*equal?-hash obj randomization) bound) ) (define hash equal?-hash) ;; String Hash: -(define (string-hash str #!optional (bound hash-default-bound) . start+end) +(define (string-hash str #!optional (bound hash-default-bound) start end + (randomization hash-default-randomization)) (##sys#check-string str 'string-hash) (##sys#check-exact bound 'string-hash) - (let ((str (if (pair? start+end) - (let-optionals start+end ((start 0) - (end (##sys#size str))) - (##sys#check-range start 0 (##sys#size str) 'string-hash) - (##sys#check-range end 0 (##sys#size str) 'string-hash) - (##sys#substring str start end) ) - str) ) ) - (%hash/limit (%string-hash str) bound) ) ) - -(define (string-ci-hash str #!optional (bound hash-default-bound) . start+end) + (let ((str (if start + (let ((end (or end (##sys#size str)))) + (##sys#check-range start 0 (##sys#size str) 'string-hash) + (##sys#check-range end 0 (##sys#size str) 'string-hash) + (##sys#substring str start end)) + str)) ) + (%hash/limit (%string-hash str randomization) bound) ) ) + +(define (string-ci-hash str #!optional (bound hash-default-bound) start end + (randomization hash-default-randomization)) (##sys#check-string str 'string-ci-hash) (##sys#check-exact bound 'string-ci-hash) - (let ((str (if (pair? start+end) - (let-optionals start+end ((start 0) - (end (##sys#size str))) - (##sys#check-range start 0 (##sys#size str) 'string-hash-ci) - (##sys#check-range end 0 (##sys#size str) 'string-hash-ci) - (##sys#substring str start end) ) - str) ) ) - (%hash/limit (%string-ci-hash str) bound) ) ) + (let ((str (if start + (let ((end (or end (##sys#size str)))) + (##sys#check-range start 0 (##sys#size str) 'string-hash) + (##sys#check-range end 0 (##sys#size str) 'string-hash) + (##sys#substring str start end)) + str)) ) + (%hash/limit (%string-ci-hash str randomization) bound) ) ) (define string-hash-ci string-ci-hash) @@ -405,16 +414,17 @@ (define *make-hash-table (let ([make-vector make-vector]) (lambda (test hash len min-load max-load weak-keys weak-values initial - #!optional (vec (make-vector len '()))) + randomization #!optional (vec (make-vector len '()))) (##sys#make-structure 'hash-table - vec 0 test hash min-load max-load #f #f initial) ) ) ) + vec 0 test hash min-load max-load #f #f initial randomization) ) ) ) ;; SRFI-69 & SRFI-90'ish. ;; ;; Argument list is the pattern ;; ;; (make-hash-table #!optional test hash size -;; #!key test hash size initial min-load max-load weak-keys weak-values) +;; #!key test hash size initial randomization +;; min-load max-load weak-keys weak-values) ;; ;; where a keyword argument takes precedence over the corresponding optional ;; argument. Keyword arguments MUST come after optional & required @@ -435,6 +445,7 @@ [hash #f] [size hash-table-default-length] [initial #f] + [randomization (##core#inline "C_random_fixnum" hash-default-bound)] [min-load hash-table-default-min-load] [max-load hash-table-default-max-load] [weak-keys #f] @@ -501,6 +512,9 @@ (set! size (fxmin hash-table-max-length val))] [(#:initial) (set! initial (lambda () val))] + [(#:randomization) + (##sys#check-exact val 'make-hash-table) + (set! randomization val)] [(#:min-load) (##sys#check-inexact val 'make-hash-table) (unless (and (fp< 0.0 val) (fp< val 1.0)) @@ -533,7 +547,8 @@ (warning 'make-hash-table "user test without user hash") (set! hash equal?-hash) ) ) ) ) ; Done - (*make-hash-table test hash size min-load max-load weak-keys weak-values initial) ) ) ) ) ) + (*make-hash-table test hash size min-load max-load + weak-keys weak-values initial randomization) ) ) ) ) ) ;; Hash-Table Predicate: @@ -580,9 +595,13 @@ (and-let* ([thunk (##sys#slot ht 9)]) (thunk) ) ) +(define (hash-table-randomization ht) + (##sys#check-structure ht 'hash-table 'hash-table-initial) + (##sys#slot ht 10) ) + ;; hash-table-rehash!: -(define (hash-table-rehash! vec1 vec2 hash) +(define (hash-table-rehash! vec1 vec2 hash rnd) (let ([len1 (##sys#size vec1)] [len2 (##sys#size vec2)] ) (do ([i 0 (fx+ i 1)]) @@ -591,7 +610,7 @@ (unless (null? bucket) (let* ([pare (##sys#slot bucket 0)] [key (##sys#slot pare 0)] - [hshidx (hash key len2)] ) + [hshidx (hash key len2 rnd)] ) (##sys#setslot vec2 hshidx (cons (cons key (##sys#slot pare 1)) (##sys#slot vec2 hshidx))) (loop (##sys#slot bucket 1)) ) ) ) ) ) ) @@ -602,7 +621,7 @@ (let* ([deslen (fxmin hash-table-max-length (fx* len hash-table-new-length-factor))] [newlen (hash-table-canonical-length hash-table-prime-lengths deslen)] [vec2 (make-vector newlen '())] ) - (hash-table-rehash! vec vec2 (##sys#slot ht 4)) + (hash-table-rehash! vec vec2 (##sys#slot ht 4) (##sys#slot ht 10)) (##sys#setslot ht 1 vec2) ) ) ;; hash-table-check-resize!: @@ -633,7 +652,7 @@ (##sys#slot ht 2) (##sys#slot ht 5) (##sys#slot ht 6) (##sys#slot ht 7) (##sys#slot ht 8) - (##sys#slot ht 9) + (##sys#slot ht 9) (##sys#slot ht 10) vec2)] (##sys#setslot vec2 i (let copy-loop ([bucket (##sys#slot vec1 i)]) @@ -671,9 +690,10 @@ (hash-table-check-resize! ht newsiz) (let ([hash (##sys#slot ht 4)] [test (##sys#slot ht 3)] - [vec (##sys#slot ht 1)] ) + [vec (##sys#slot ht 1)] + [rnd (##sys#slot ht 10)]) (let* ([len (##sys#size vec)] - [hshidx (hash key len)] + [hshidx (hash key len rnd)] [bucket0 (##sys#slot vec hshidx)] ) (if (eq? core-eq? test) ; Fast path (eq? is rewritten by the compiler): @@ -710,9 +730,10 @@ (hash-table-check-resize! ht newsiz) (let ([hash (##sys#slot ht 4)] [test (##sys#slot ht 3)] - [vec (##sys#slot ht 1)] ) + [vec (##sys#slot ht 1)] + [rnd (##sys#slot ht 10)]) (let* ([len (##sys#size vec)] - [hshidx (hash key len)] + [hshidx (hash key len rnd)] [bucket0 (##sys#slot vec hshidx)] ) (if (eq? core-eq? test) ; Fast path (eq? is rewritten by the compiler): @@ -755,9 +776,10 @@ (hash-table-check-resize! ht newsiz) (let ([hash (##sys#slot ht 4)] [test (##sys#slot ht 3)] - [vec (##sys#slot ht 1)] ) + [vec (##sys#slot ht 1)] + [rnd (##sys#slot ht 10)]) (let* ([len (##sys#size vec)] - [hshidx (hash key len)] + [hshidx (hash key len rnd)] [bucket0 (##sys#slot vec hshidx)] ) (if (eq? core-eq? test) ; Fast path (eq? is rewritten by the compiler): @@ -794,9 +816,10 @@ (##sys#check-structure ht 'hash-table 'hash-table-ref) (##sys#check-closure def 'hash-table-ref) (let ([vec (##sys#slot ht 1)] - [test (##sys#slot ht 3)] ) + [test (##sys#slot ht 3)] + [rnd (##sys#slot ht 10)]) (let* ([hash (##sys#slot ht 4)] - [hshidx (hash key (##sys#size vec))] ) + [hshidx (hash key (##sys#size vec) rnd)] ) (if (eq? core-eq? test) ; Fast path (eq? is rewritten by the compiler): (let loop ([bucket (##sys#slot vec hshidx)]) @@ -822,9 +845,10 @@ (lambda (ht key def) (##sys#check-structure ht 'hash-table 'hash-table-ref/default) (let ([vec (##sys#slot ht 1)] - [test (##sys#slot ht 3)] ) + [test (##sys#slot ht 3)] + [rnd (##sys#slot ht 10)]) (let* ([hash (##sys#slot ht 4)] - [hshidx (hash key (##sys#size vec))] ) + [hshidx (hash key (##sys#size vec) rnd)] ) (if (eq? core-eq? test) ; Fast path (eq? is rewritten by the compiler): (let loop ([bucket (##sys#slot vec hshidx)]) @@ -848,9 +872,10 @@ (lambda (ht key) (##sys#check-structure ht 'hash-table 'hash-table-exists?) (let ([vec (##sys#slot ht 1)] - [test (##sys#slot ht 3)] ) + [test (##sys#slot ht 3)] + [rnd (##sys#slot ht 10)]) (let* ([hash (##sys#slot ht 4)] - [hshidx (hash key (##sys#size vec))] ) + [hshidx (hash key (##sys#size vec) rnd)] ) (if (eq? core-eq? test) ; Fast path (eq? is rewritten by the compiler): (let loop ([bucket (##sys#slot vec hshidx)]) @@ -874,7 +899,8 @@ (let* ([vec (##sys#slot ht 1)] [len (##sys#size vec)] [hash (##sys#slot ht 4)] - [hshidx (hash key len)] ) + [rnd (##sys#slot ht 10)] + [hshidx (hash key len rnd)] ) (let ([test (##sys#slot ht 3)] [newsiz (fx- (##sys#slot ht 2) 1)] [bucket0 (##sys#slot vec hshidx)] ) diff --git a/tests/hash-table-tests.scm b/tests/hash-table-tests.scm index 524c40d..666be34 100644 --- a/tests/hash-table-tests.scm +++ b/tests/hash-table-tests.scm @@ -38,7 +38,7 @@ (print "HT - All Parameters") (set! ht (make-hash-table eqv? eqv?-hash 23 #:test equal? #:hash equal?-hash - #:initial 'foo + #:initial 'foo #:randomization 30 #:size 500 #:min-load 0.45 #:max-load 0.85 #:weak-keys #t #:weak-values #t)) @@ -127,6 +127,35 @@ (assert (list? alist)) (assert (= (length alist) 3)) ) +(print "Hash collision weaknesses") +;; If these fail, it might be bad luck caused by the randomization/modulo combo +;; So don't *immediately* dismiss a hash implementation when it fails here +(assert (not (= (hash "a" 10 1) (hash "a" 10 2)))) +(assert (not (= (hash (make-string 1 #\nul) 10 10) 0))) +;; Long identical suffixes should not hash to the same value +(assert (not (= (hash (string-append (make-string 1000000 #\a) + (make-string 1000000 #\c)) 10 1) + (hash (string-append (make-string 1000000 #\b) + (make-string 1000000 #\c)) 10 1)))) +;; Same for prefixes +(assert (not (= (hash (string-append (make-string 1000000 #\a) + (make-string 1000000 #\b)) 10 1) + (hash (string-append (make-string 1000000 #\a) + (make-string 1000000 #\c)) 10 1)))) +;; And palindromes, too +(assert (not (= (hash (string-append (make-string 1000000 #\a) + (make-string 1000000 #\b) + (make-string 1000000 #\a)) 10 1) + (hash (string-append (make-string 1000000 #\a) + (make-string 1000000 #\c) + (make-string 1000000 #\a)) 10 1)))) +;; differing number of nul bytes should not be identical +(assert (not (= (hash (make-string 1 #\nul) 10 1) + (hash (make-string 2 #\nul) 10 1)))) +;; ensure very long NUL strings don't cause the random value to get pushed out +(assert (not (= (hash (make-string 1000000 #\nul) 10 1) + (hash (make-string 1000001 #\nul) 10 1)))) + ;; Stress Test (set! ht (make-hash-table)) diff --git a/types.db b/types.db index 9f97c46..3ee83db 100644 --- a/types.db +++ b/types.db @@ -2448,11 +2448,11 @@ ;; srfi-69 (alist->hash-table (#(procedure #:clean #:enforce) alist->hash-table ((list-of pair) #!rest) (struct hash-table))) -(eq?-hash (#(procedure #:clean #:enforce) eq?-hash (* #!optional fixnum) fixnum)) -(equal?-hash (#(procedure #:clean #:enforce) equal?-hash (* #!optional fixnum) fixnum)) -(eqv?-hash (#(procedure #:clean #:enforce) eqv?-hash (* #!optional fixnum) fixnum)) -(hash (#(procedure #:pure #:enforce) hash (* #!optional fixnum) fixnum)) -(hash-by-identity (#(procedure #:pure #:enforce) hash-by-identity (* #!optional fixnum) fixnum)) +(eq?-hash (#(procedure #:clean #:enforce) eq?-hash (* #!optional fixnum fixnum) fixnum)) +(equal?-hash (#(procedure #:clean #:enforce) equal?-hash (* #!optional fixnum fixnum) fixnum)) +(eqv?-hash (#(procedure #:clean #:enforce) eqv?-hash (* #!optional fixnum fixnum) fixnum)) +(hash (#(procedure #:pure #:enforce) hash (* #!optional fixnum fixnum) fixnum)) +(hash-by-identity (#(procedure #:pure #:enforce) hash-by-identity (* #!optional fixnum fixnum) fixnum)) (hash-table->alist (#(procedure #:clean #:enforce) hash-table->alist ((struct hash-table)) (list-of pair))) (hash-table-clear! (#(procedure #:clean #:enforce) hash-table-clear! ((struct hash-table)) undefined)) (hash-table-copy (#(procedure #:clean #:enforce) hash-table-copy ((struct hash-table)) (struct hash-table))) @@ -2468,6 +2468,8 @@ (hash-table-hash-function (#(procedure #:clean #:enforce) hash-table-hash-function ((struct hash-table)) (procedure (* fixnum) fixnum)) (((struct hash-table)) (##sys#slot #(1) '4))) +(hash-table-randomization (#(procedure #:clean #:enforce) hash-table-randomization ((struct hash-table)) fixnum) + (((struct hash-table)) (##sys#slot #(1) '10))) (hash-table-initial (#(procedure #:clean #:enforce) hash-table-initial ((struct hash-table)) *)) (hash-table-keys (#(procedure #:clean #:enforce) hash-table-keys ((struct hash-table)) list)) (hash-table-map (#(procedure #:clean #:enforce) hash-table-map ((struct hash-table) (procedure (* *) *)) list)) @@ -2504,15 +2506,15 @@ ;;XXX if we want to hardcode hash-default-bound here, we could rewrite the 1-arg case... ; (applies to all hash-functions) -(keyword-hash (#(procedure #:clean #:enforce) keyword-hash (* #!optional fixnum) fixnum)) +(keyword-hash (#(procedure #:clean #:enforce) keyword-hash (* #!optional fixnum fixnum) fixnum)) (make-hash-table (#(procedure #:clean #:enforce) make-hash-table (#!rest) (struct hash-table))) -(number-hash (#(procedure #:clean #:enforce) number-hash (fixnum #!optional fixnum) fixnum)) -(object-uid-hash (#(procedure #:clean #:enforce) object-uid-hash (* #!optional fixnum) fixnum)) -(symbol-hash (#(procedure #:clean #:enforce) symbol-hash (symbol #!optional fixnum) fixnum)) -(string-hash (#(procedure #:clean #:enforce) string-hash (string #!optional fixnum fixnum fixnum) number)) -(string-hash-ci (#(procedure #:clean #:enforce) string-hash-ci (string #!optional fixnum fixnum fixnum) number)) -(string-ci-hash (#(procedure #:clean #:enforce) string-ci-hash (string #!optional fixnum fixnum fixnum) number)) +(number-hash (#(procedure #:clean #:enforce) number-hash (fixnum #!optional fixnum fixnum) fixnum)) +(object-uid-hash (#(procedure #:clean #:enforce) object-uid-hash (* #!optional fixnum fixnum) fixnum)) +(symbol-hash (#(procedure #:clean #:enforce) symbol-hash (symbol #!optional fixnum fixnum) fixnum)) +(string-hash (#(procedure #:clean #:enforce) string-hash (string #!optional fixnum fixnum fixnum fixnum) number)) +(string-hash-ci (#(procedure #:clean #:enforce) string-hash-ci (string #!optional fixnum fixnum fixnum fixnum) number)) +(string-ci-hash (#(procedure #:clean #:enforce) string-ci-hash (string #!optional fixnum fixnum fixnum fixnum) number)) ;; tcp -- 1.7.3.4