>From 0107e5a2597283bf572f0f79c74c2f315b97e36c Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 0/7] *** SUBJECT HERE *** *** BLURB HERE *** A. Gordon (7): gnulib: add randread source with fixed seed gnulib: add randint source with known seed shred: new option: --random-seed=N shuf: new option: --random-seed=N sort: new option: --random-seed=N tests: sort,shuf,shred: add --random-seed=N tests doc: mention --random-seed=N option doc/coreutils.texi | 36 ++++++++++++++++++++++++++++++++- gl/lib/randint.c | 6 ++++++ gl/lib/randint.h | 1 + gl/lib/randread.c | 31 ++++++++++++++++++++++++++++ gl/lib/randread.h | 1 + src/shred.c | 27 +++++++++++++++++++++++-- src/shuf.c | 31 ++++++++++++++++++++++++---- src/sort.c | 50 +++++++++++++++++++++++++++++++++++++++++++++- tests/misc/shred-passes.sh | 14 +++++++++++++ tests/misc/shuf.sh | 14 +++++++++++++ tests/misc/sort-rand.sh | 14 +++++++++++++ 11 files changed, 217 insertions(+), 8 deletions(-) -- 1.9.1 >From 9f197c17f4670b33ff5464c70fd371dd3ba15e78 Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 1/7] gnulib: add randread source with fixed seed * gl/lib/randread.{c,h}: randread_new_seed() - new function to allocate and initialize a 'struct randread_source *' with a user-supplied seed (thus, known initial (not so) random state). get_nonce_seed() - initialize 'struct randread_source' with seed value, instead of pid/ppid/time. --- gl/lib/randread.c | 31 +++++++++++++++++++++++++++++++ gl/lib/randread.h | 1 + 2 files changed, 32 insertions(+) diff --git a/gl/lib/randread.c b/gl/lib/randread.c index 5efbd4f..b996c9e 100644 --- a/gl/lib/randread.c +++ b/gl/lib/randread.c @@ -188,6 +188,21 @@ get_nonce (void *buffer, size_t bufsize, size_t bytes_bound) #endif } +/* Put a fixed, user-supplied value into BUFFER, with size BUFSIZE. */ +static void +get_nonce_seed (void *buffer, size_t bufsize, size_t bytes_bound, int seed) +{ + char *buf = buffer; + ssize_t seeded = 0; + + /* Unlike get_nonce(), reset the buffer to a known zero state, + to ensure reproducible seeding */ + memset (buf, 0, bufsize); + + /* TODO: fill more of the state buffer? or leave it as mostly zeros ?*/ + ISAAC_SEED (int, v = seed); +} + /* Create and initialize a random data source from NAME, or use a reasonable default source if NAME is null. BYTES_BOUND is an upper @@ -230,6 +245,22 @@ randread_new (char const *name, size_t bytes_bound) } } +/* Similar to randread_new(), create and initialize a random data source, + except use a user-supplied seed value. */ +struct randread_source *randread_new_seed (int seed, size_t bytes_bound) +{ + /* TODO: not fail on bytes_bound==0 with seed value? */ + if (bytes_bound == 0) + return NULL; + + struct randread_source *s = simple_new (NULL, NULL); + s->buf.isaac.buffered = 0; + get_nonce_seed (s->buf.isaac.state.m, sizeof s->buf.isaac.state.m, + bytes_bound, seed); + isaac_seed (&s->buf.isaac.state); + return s; +} + /* Set S's handler and its argument. HANDLER (HANDLER_ARG) is called when there is a read error or end of file from the random data diff --git a/gl/lib/randread.h b/gl/lib/randread.h index c6ffc79..2022dca 100644 --- a/gl/lib/randread.h +++ b/gl/lib/randread.h @@ -25,6 +25,7 @@ struct randread_source; struct randread_source *randread_new (char const *, size_t); +struct randread_source *randread_new_seed (int seed, size_t); void randread (struct randread_source *, void *, size_t); void randread_set_handler (struct randread_source *, void (*) (void const *)); void randread_set_handler_arg (struct randread_source *, void const *); -- 1.9.1 >From a45684503b08d05e83fe33d871652b30f3b83faf Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 2/7] gnulib: add randint source with known seed * gl/lib/randint.{c,h}: randint_all_new_seed(): new function to allocate and initialize a 'struct randint_source *' with a known user-supplied seed (thus, have a not so random initial state). --- gl/lib/randint.c | 6 ++++++ gl/lib/randint.h | 1 + 2 files changed, 7 insertions(+) diff --git a/gl/lib/randint.c b/gl/lib/randint.c index 240fd24..fae729b 100644 --- a/gl/lib/randint.c +++ b/gl/lib/randint.c @@ -87,6 +87,12 @@ randint_all_new (char const *name, size_t bytes_bound) return (source ? randint_new (source) : NULL); } +struct randint_source *randint_all_new_seed (int seed, size_t bytes_bound) +{ + struct randread_source *source = randread_new_seed (seed, bytes_bound); + return (source ? randint_new (source) : NULL); +} + /* Return the random data source of *S. */ struct randread_source * diff --git a/gl/lib/randint.h b/gl/lib/randint.h index d6728c2..1a6c635 100644 --- a/gl/lib/randint.h +++ b/gl/lib/randint.h @@ -34,6 +34,7 @@ struct randint_source; struct randint_source *randint_new (struct randread_source *); struct randint_source *randint_all_new (char const *, size_t); +struct randint_source *randint_all_new_seed (int, size_t); struct randread_source *randint_get_source (struct randint_source const *) _GL_ATTRIBUTE_PURE; randint randint_genmax (struct randint_source *, randint genmax); -- 1.9.1 >From dc16319ef24bf324ab03715ceb58ff5b8fd22407 Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 3/7] shred: new option: --random-seed=N * src/shred.c: usage(): mention new option. main(): handle new option. --- src/shred.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/shred.c b/src/shred.c index 543dcb0..e1718ff 100644 --- a/src/shred.c +++ b/src/shred.c @@ -90,6 +90,7 @@ #include "error.h" #include "fcntl--.h" #include "human.h" +#include "quote.h" #include "quotearg.h" /* For quotearg_colon */ #include "randint.h" #include "randread.h" @@ -141,7 +142,8 @@ struct Options non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum { - RANDOM_SOURCE_OPTION = CHAR_MAX + 1 + RANDOM_SOURCE_OPTION = CHAR_MAX + 1, + RANDOM_SEED_OPTION }; static struct option const long_opts[] = @@ -150,6 +152,7 @@ static struct option const long_opts[] = {"force", no_argument, NULL, 'f'}, {"iterations", required_argument, NULL, 'n'}, {"size", required_argument, NULL, 's'}, + {"random-seed", required_argument, NULL, RANDOM_SEED_OPTION}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"remove", optional_argument, NULL, 'u'}, {"verbose", no_argument, NULL, 'v'}, @@ -177,6 +180,7 @@ for even very expensive hardware probing to recover the data.\n\ printf (_("\ -f, --force change permissions to allow writing if necessary\n\ -n, --iterations=N overwrite N times instead of the default (%d)\n\ + --random-seed=N use N as random seed\n\ --random-source=FILE get random bytes from FILE\n\ -s, --size=N shred this many bytes (suffixes like K, M, G accepted)\n\ "), DEFAULT_PASSES); @@ -1207,6 +1211,8 @@ main (int argc, char **argv) int c; int i; char const *random_source = NULL; + long random_seed = 0; + bool use_random_seed = false; initialize_main (&argc, &argv); set_program_name (argv[0]); @@ -1240,6 +1246,16 @@ main (int argc, char **argv) random_source = optarg; break; + case RANDOM_SEED_OPTION: + { + strtol_error e = xstrtol (optarg, NULL, 10, &random_seed, NULL); + if (e != LONGINT_OK) + error (EXIT_FAILURE, 0, _("invalid random seed: %s"), + quote (optarg)); + use_random_seed = true; + } + break; + case 'u': if (optarg == NULL) flags.remove_file = remove_wipesync; @@ -1282,8 +1298,15 @@ main (int argc, char **argv) error (0, 0, _("missing file operand")); usage (EXIT_FAILURE); } + if (random_source && use_random_seed) + { + error (0, 0, _("random-source and random-seed are mutually exclusive")); + usage (EXIT_FAILURE); + } - randint_source = randint_all_new (random_source, SIZE_MAX); + randint_source = (use_random_seed) + ?randint_all_new_seed (random_seed, SIZE_MAX) + :randint_all_new (random_source, SIZE_MAX); if (! randint_source) error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source)); atexit (clear_random_data); -- 1.9.1 >From f41946b1639b2beb641b06e1944422ccef887998 Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 4/7] shuf: new option: --random-seed=N * src/shuf.c: usage(): mention new option. main(): handle new option. --- src/shuf.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/shuf.c b/src/shuf.c index 5a25e58..141779c 100644 --- a/src/shuf.c +++ b/src/shuf.c @@ -76,6 +76,7 @@ Write a random permutation of the input lines to standard output.\n\ -i, --input-range=LO-HI treat each number LO through HI as an input line\n\ -n, --head-count=COUNT output at most COUNT lines\n\ -o, --output=FILE write result to FILE instead of standard output\n\ + --random-seed=N use N as random seed\n\ --random-source=FILE get random bytes from FILE\n\ -r, --repeat output lines can be repeated\n\ "), stdout); @@ -98,7 +99,8 @@ With no FILE, or when FILE is -, read standard input.\n\ non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum { - RANDOM_SOURCE_OPTION = CHAR_MAX + 1 + RANDOM_SOURCE_OPTION = CHAR_MAX + 1, + RANDOM_SEED_OPTION }; static struct option const long_opts[] = @@ -107,6 +109,7 @@ static struct option const long_opts[] = {"input-range", required_argument, NULL, 'i'}, {"head-count", required_argument, NULL, 'n'}, {"output", required_argument, NULL, 'o'}, + {"random-seed", required_argument, NULL, RANDOM_SEED_OPTION}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"repeat", no_argument, NULL, 'r'}, {"zero-terminated", no_argument, NULL, 'z'}, @@ -391,6 +394,8 @@ main (int argc, char **argv) size_t head_lines = SIZE_MAX; char const *outfile = NULL; char *random_source = NULL; + long random_seed = 0; + bool use_random_seed = false; char eolbyte = '\n'; char **input_lines = NULL; bool use_reservoir_sampling = false; @@ -476,6 +481,16 @@ main (int argc, char **argv) random_source = optarg; break; + case RANDOM_SEED_OPTION: + { + strtol_error e = xstrtol (optarg, NULL, 10, &random_seed, NULL); + if (e != LONGINT_OK) + error (EXIT_FAILURE, 0, _("invalid random seed: %s"), + quote (optarg)); + use_random_seed = true; + } + break; + case 'r': repeat = true; break; @@ -504,6 +519,11 @@ main (int argc, char **argv) error (0, 0, _("extra operand %s"), quote (operand[!input_range])); usage (EXIT_FAILURE); } + if (random_source && use_random_seed) + { + error (0, 0, _("random-source and random-seed are mutually exclusive")); + usage (EXIT_FAILURE); + } /* Prepare input. */ if (echo) @@ -543,10 +563,13 @@ main (int argc, char **argv) if (! repeat) head_lines = MIN (head_lines, n_lines); - randint_source = randint_all_new (random_source, - (use_reservoir_sampling || repeat + const size_t bytes_bound = (use_reservoir_sampling || repeat ? SIZE_MAX - : randperm_bound (head_lines, n_lines))); + : randperm_bound (head_lines, n_lines)); + randint_source = (use_random_seed) + ?randint_all_new_seed (random_seed, bytes_bound) + :randint_all_new (random_source, bytes_bound); + if (! randint_source) error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source)); -- 1.9.1 >From 5c19703d04e2e2314ac85b9d5ce11ffcf26a7993 Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 5/7] sort: new option: --random-seed=N * src/sort.c: usage(): mention new option. main(): handle new option. random_md5_state_init_seed(): initialize the random data structure from a user-supplied seed value. --- src/sort.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/sort.c b/src/sort.c index 23d4565..3cff244 100644 --- a/src/sort.c +++ b/src/sort.c @@ -451,6 +451,7 @@ Ordering options:\n\ fputs (_("\ -n, --numeric-sort compare according to string numerical value\n\ -R, --random-sort sort by random hash of keys\n\ + --random-seed=N use N as random seed\n\ --random-source=FILE get random bytes from FILE\n\ -r, --reverse reverse the result of comparisons\n\ "), stdout); @@ -547,6 +548,7 @@ enum FILES0_FROM_OPTION, NMERGE_OPTION, RANDOM_SOURCE_OPTION, + RANDOM_SEED_OPTION, SORT_OPTION, PARALLEL_OPTION }; @@ -571,6 +573,7 @@ static struct option const long_options[] = {"human-numeric-sort", no_argument, NULL, 'h'}, {"version-sort", no_argument, NULL, 'V'}, {"random-sort", no_argument, NULL, 'R'}, + {"random-seed", required_argument, NULL, RANDOM_SEED_OPTION}, {"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION}, {"sort", required_argument, NULL, SORT_OPTION}, {"output", required_argument, NULL, 'o'}, @@ -2063,6 +2066,28 @@ random_md5_state_init (char const *random_source) md5_process_bytes (buf, sizeof buf, &random_md5_state); } +/* Initialize the pseudo-random MD5 state with a user-supplied seed. */ + +static void +random_md5_state_init_seed (int seed) +{ + unsigned char buf[MD5_DIGEST_SIZE]; + struct randread_source *r = randread_new_seed (seed, sizeof buf); + if (! r) + { + error (0, 0, _("random seed initization error")); + exit (SORT_FAILURE); + } + randread (r, buf, sizeof buf); + if (randread_free (r) != 0) + { + error (0, 0, _("random seed cleanup error")); + exit (SORT_FAILURE); + } + md5_init_ctx (&random_md5_state); + md5_process_bytes (buf, sizeof buf, &random_md5_state); +} + /* This is like strxfrm, except it reports any error and exits. */ static size_t @@ -4180,6 +4205,8 @@ main (int argc, char **argv) bool mergeonly = false; char *random_source = NULL; bool need_random = false; + long random_seed = 0; + bool use_random_seed = false; size_t nthreads = 0; size_t nfiles = 0; bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL); @@ -4483,6 +4510,16 @@ main (int argc, char **argv) random_source = optarg; break; + case RANDOM_SEED_OPTION: + { + strtol_error e = xstrtol (optarg, NULL, 10, &random_seed, NULL); + if (e != LONGINT_OK) + error (EXIT_FAILURE, 0, _("invalid random seed: %s"), + quote (optarg)); + use_random_seed = true; + } + break; + case 's': stable = true; break; @@ -4650,6 +4687,12 @@ main (int argc, char **argv) check_ordering_compatibility (); + if (random_source && use_random_seed) + { + error (0, 0, _("random-source and random-seed are mutually exclusive")); + usage (EXIT_FAILURE); + } + if (debug) { if (checkonly || outfile) @@ -4672,7 +4715,12 @@ main (int argc, char **argv) reverse = gkey.reverse; if (need_random) - random_md5_state_init (random_source); + { + if (use_random_seed) + random_md5_state_init_seed (random_seed); + else + random_md5_state_init (random_source); + } if (temp_dir_count == 0) { -- 1.9.1 >From 33a9fa43cb92b2e24578c4965d45ecd81fc83b85 Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 6/7] tests: sort,shuf,shred: add --random-seed=N tests * tests/misc/shred-passes.sh, * tests/misc/shuf.sh, * tests/misc/sort-rand.sh: test new --random-seed=N option, ensure identical output is produced. --- tests/misc/shred-passes.sh | 14 ++++++++++++++ tests/misc/shuf.sh | 14 ++++++++++++++ tests/misc/sort-rand.sh | 14 ++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/tests/misc/shred-passes.sh b/tests/misc/shred-passes.sh index 0fa63be..12e6630 100755 --- a/tests/misc/shred-passes.sh +++ b/tests/misc/shred-passes.sh @@ -47,5 +47,19 @@ shred -v -u f 2>out || fail=1 compare exp out || fail=1 +# should fail: mutually exclusive random options +{ shred --random-source=/dev/null --random-seed=5 f || test $? -ne 1 ; } && + { warn_ "shred --random-{source,seed} did not trigger an error"; fail=1 ; } +# should fail: invalid random seed +{ shred --random-seed=hello f || test $? -ne 1 ; } && + { warn_ "shred --random-seed=hello did not trigger an error"; fail=1 ; } + +# shred with fixed random seed should produce identical output +for i in 1 2 ; do + seq 100 > j$i || framework_failure_ + shred --random-seed=4200 j$i || framework_failure_ +done +compare_ j1 j2 \ + || { fail=1; echo "shred --random-seed=N produced non-identical output" ; } Exit $fail diff --git a/tests/misc/shuf.sh b/tests/misc/shuf.sh index c7db14f..ef8c230 100755 --- a/tests/misc/shuf.sh +++ b/tests/misc/shuf.sh @@ -166,4 +166,18 @@ printf "A\nB\nC\nD\nE\n" | shuf --rep -n0 > exp || framework_failure_ test \! -s exp || { fail=1; echo "--repeat,STDIN,-n0 produced bad output">&2 ; } +# should fail: mutually exclusive random options +{ shuf --random-source=/dev/null --random-seed=5 -e A || test $? -ne 1 ; } && + { warn_ "shuf --random-{source,seed} did not trigger an error"; fail=1 ; } +# should fail: invalid random seed +{ shuf --random-seed=hello -e A || test $? -ne 1 ; } && + { warn_ "shuf --random-seed=hello did not trigger an error"; fail=1 ; } + +# shuf with fixed random seed should produce identical output +for i in 1 2 ; do + shuf -n20 --random-seed=4242 < in > exp$i || framework_failure_ +done +compare_ exp1 exp2 \ + || { fail=1; echo "shuf --random-seed=N produced non-identical output" ; } + Exit $fail diff --git a/tests/misc/sort-rand.sh b/tests/misc/sort-rand.sh index 3fc5fef..4c2ddf1 100755 --- a/tests/misc/sort-rand.sh +++ b/tests/misc/sort-rand.sh @@ -49,4 +49,18 @@ if (locale --version) > /dev/null 2>&1; then { fail=1; echo "not a permutation with LC_ALL=$locale" 1>&2; } fi +# should fail: mutually exclusive random options +{ sort --random-source=/dev/null --random-seed=5 in || test $? -ne 1 ; } && + { warn_ "sort --random-{source,seed} did not trigger an error"; fail=1 ; } +# should fail: invalid random seed +{ sort --random-seed=hello in || test $? -ne 1 ; } && + { warn_ "sort --random-seed=hello did not trigger an error"; fail=1 ; } + +# sort with fixed random seed should produce identical output +for i in 1 2 ; do + sort --random-seed=4242 -R < in > exp$i || framework_failure_ +done +compare_ exp1 exp2 \ + || { fail=1; echo "sort --random-seed=N produced non-identical output" ; } + Exit $fail -- 1.9.1 >From 0107e5a2597283bf572f0f79c74c2f315b97e36c Mon Sep 17 00:00:00 2001 From: "A. Gordon" Date: Fri, 06 Feb 2015 21:06:34 -0500 Subject: [PATCH 7/7] doc: mention --random-seed=N option * doc/coreutils.texi: add new section for --random-seed, and mention new option in sort,shuf,shred sections. --- doc/coreutils.texi | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 87fb3dc..577f20c 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -767,6 +767,7 @@ name. * Signal specifications:: Specifying signals using the --signal option. * Disambiguating names and IDs:: chgrp, chown, chroot, id: user and group syntax * Random sources:: --random-source, in some programs. +* Random seeds:: --random-seed, in some programs. * Target directory:: Specifying a target directory, in some programs. * Trailing slashes:: --strip-trailing-slashes, in some programs. * Traversing symlinks:: -H, -L, or -P, in some programs. @@ -1240,6 +1241,18 @@ operating system. To reproduce the results of an earlier invocation of a command, you can save some random data into a file and then use that file as the random source in earlier and later invocations of the command. +Alternatively, use @option{--random-seed=@var{N}}. + +@node Random seeds +@section User-supplied random seed + +@cindex random seeds + +The @command{shuf}, @command{shred}, and @command{sort} commands +sometimes need random data to do their work. The +@option{--random-seed=@var{N}} option forces a known random seed, +enable reproducible results with the random options (effectively, +producing non-random output). @node Target directory @section Target directory @@ -4551,7 +4564,7 @@ functions for different fields, you can invoke @command{sort} more than once. The choice of hash function is affected by the -@option{--random-source} option. +@option{--random-source} and @option{--random-seed} options. @end table @@ -4649,6 +4662,13 @@ On newer systems, @option{-o} cannot appear after an input file if scripts should specify @option{-o @var{output-file}} before any input files. +@item --random-seed=@var{N} +@opindex --random-seed +@cindex user-supplied random seed +Use numeric value @var{N} to initialize the random data source. +Enables reproducible output with random sort options (effectively +producing non-random output). @xref{Random seeds}. + @item --random-source=@var{file} @opindex --random-source @cindex random source for sorting @@ -5019,6 +5039,13 @@ Write output to @var{output-file} instead of standard output. @var{output-file}, so you can safely shuffle a file in place by using commands like @code{shuf -o F