>From f20a3407a8ae8488b2e7434f75738b219a2320be Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Fri, 5 Jul 2013 14:59:44 -0600 Subject: [PATCH 1/2] tests: add more tests for shuf option combinations * tests/misc/shuf.sh: Add tests for erroneous conditions like multiple '-o' and '--random-source'. --- tests/misc/shuf.sh | 29 +++++++++++++++++++++++++++++ 1 files changed, 29 insertions(+), 0 deletions(-) diff --git a/tests/misc/shuf.sh b/tests/misc/shuf.sh index 3e33b61..492fd41 100755 --- a/tests/misc/shuf.sh +++ b/tests/misc/shuf.sh @@ -65,4 +65,33 @@ if ! test -r unreadable; then shuf -n1 unreadable && fail=1 fi +# Multiple -n is accepted, should use the smallest value +shuf -n10 -i0-9 -n3 -n20 > exp || framework_failure_ +c=$(wc -l < exp) || framework_failure_ +test "$c" -eq 3 || { fail=1; echo "Multiple -n failed">&2 ; } + +# Test error conditions + +# -i and -e must not be used together +: | shuf -i -e A B && + { fail=1; echo "shuf did not detect erroneous -e and -i usage.">&2 ; } +# Test invalid value for -n +: | shuf -nA && + { fail=1; echo "shuf did not detect erroneous -n usage.">&2 ; } +# Test multiple -i +shuf -i0-9 -n10 -i8-90 && + { fail=1; echo "shuf did not detect multiple -i usage.">&2 ; } +# Test invalid range +for ARG in '1' 'A' '1-' '1-A'; do + shuf -i$ARG && + { fail=1; echo "shuf did not detect erroneous -i$ARG usage.">&2 ; } +done + +# multiple -o are forbidden +shuf -i0-9 -o A -o B && + { fail=1; echo "shuf did not detect erroneous multiple -o usage.">&2 ; } +# multiple random-sources are forbidden +shuf -i0-9 --random-source A --random-source B && + { fail=1; echo "shuf did not detect multiple --random-source usage.">&2 ; } + Exit $fail -- 1.7.7.6 >From 349eda8cb0765621979d8fd8b58c21e9c5d49073 Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Thu, 4 Jul 2013 13:26:45 -0600 Subject: [PATCH 2/2] shuf: add --repetition to support repetition in output main(): Process new option. Replace input_numbers_option_used() with a local variable. 
Re-organize argument processing. usage(): Describe the new option. (write_random_numbers): A new function to generate a permutation of the specified input range with repetition. (write_random_lines): Likewise for stdin and --echo. (write_permuted_numbers): New function refactored from write_permuted_output(). (write_permuted_lines): Likewise. * tests/misc/shuf.sh: Add tests for --repetitions option. * doc/coreutils.texi: Mention --repetitions, add examples. * TODO: Mention an optimization to avoid needing to read all of the input into memory with --repetitions. * NEWS: Mention new shuf option. --- NEWS | 3 + doc/coreutils.texi | 37 ++++++++++ src/shuf.c | 190 ++++++++++++++++++++++++++++++++++++--------------- tests/misc/shuf.sh | 62 +++++++++++++++++ 4 files changed, 236 insertions(+), 56 deletions(-) diff --git a/NEWS b/NEWS index 75ec253..5fd1acb 100644 --- a/NEWS +++ b/NEWS @@ -42,6 +42,9 @@ GNU coreutils NEWS -*- outline -*- csplit accepts a new option: --suppressed-matched, to elide the lines used to identify the split points. + shuf accepts a new option: --repetitions (-r), to allow repetitions + of input items in the permuted output. 
+ ** Changes in behavior stdbuf now requires at least one buffering mode option to be specified, diff --git a/doc/coreutils.texi b/doc/coreutils.texi index b3233f6..ca10a16 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -4945,6 +4945,16 @@ commands like @code{shuf -o F 0 && input_range) + || (!echo && !input_range && n_operands>=2)) + { + error (0, 0, _("extra operand %s"), quote (operand[1])); + usage (EXIT_FAILURE); + } + + /* Prepare input */ if (echo) { - if (input_numbers_option_used (lo_input, hi_input)) - error (EXIT_FAILURE, 0, _("cannot combine -e and -i options")); input_from_argv (operand, n_operands, eolbyte); n_lines = n_operands; line = operand; } - else if (input_numbers_option_used (lo_input, hi_input)) + else if (input_range) { - if (n_operands) - { - error (0, 0, _("extra operand %s"), quote (operand[0])); - usage (EXIT_FAILURE); - } n_lines = hi_input - lo_input + 1; line = NULL; } else { - switch (n_operands) - { - case 0: - break; - - case 1: + /* Input file specified, re-open it as STDIN */ + if (n_operands==1) if (! (STREQ (operand[0], "-") || ! head_lines || freopen (operand[0], "r", stdin))) error (EXIT_FAILURE, errno, "%s", operand[0]); - break; - - default: - error (0, 0, _("extra operand %s"), quote (operand[1])); - usage (EXIT_FAILURE); - } fadvise (stdin, FADVISE_SEQUENTIAL); - if (head_lines != SIZE_MAX && (! head_lines - || input_size () > RESERVOIR_MIN_INPUT)) + if (! repetition && head_lines != SIZE_MAX + && (! head_lines || input_size () > RESERVOIR_MIN_INPUT)) { use_reservoir_sampling = true; n_lines = SIZE_MAX; /* unknown number of input lines, for now. */ @@ -488,10 +543,17 @@ main (int argc, char **argv) } } - head_lines = MIN (head_lines, n_lines); + /* When generating random numbers with repetitions, + the default count is one, unless specified by the user. */ + if (repetition && head_lines == SIZE_MAX) + head_lines = 1 ; + + if (! 
repetition) + head_lines = MIN (head_lines, n_lines); randint_source = randint_all_new (random_source, - use_reservoir_sampling ? SIZE_MAX : + (use_reservoir_sampling || repetition)? + SIZE_MAX: randperm_bound (head_lines, n_lines)); if (! randint_source) error (EXIT_FAILURE, errno, "%s", quotearg_colon (random_source)); @@ -508,20 +570,36 @@ main (int argc, char **argv) /* Close stdin now, rather than earlier, so that randint_all_new doesn't have to worry about opening something other than stdin. */ - if (! (echo || input_numbers_option_used (lo_input, hi_input)) + if (! (echo || input_range) && (fclose (stdin) != 0)) error (EXIT_FAILURE, errno, _("read error")); - permutation = randperm_new (randint_source, head_lines, n_lines); + if (!repetition) + permutation = randperm_new (randint_source, head_lines, n_lines); if (outfile && ! freopen (outfile, "w", stdout)) error (EXIT_FAILURE, errno, "%s", quotearg_colon (outfile)); - if (use_reservoir_sampling) - i = write_permuted_output_reservoir (n_lines, reservoir, permutation); + /* Generate output according to requested method */ + if (repetition) + { + if (input_range) + i = write_random_numbers (randint_source, head_lines, + lo_input, hi_input, eolbyte); + else + i = write_random_lines (randint_source, head_lines, line, n_lines); + } else - i = write_permuted_output (head_lines, line, lo_input, - permutation, eolbyte); + { + if (use_reservoir_sampling) + i = write_permuted_output_reservoir (n_lines, reservoir, permutation); + else if (input_range) + i = write_permuted_numbers (head_lines, lo_input, + permutation, eolbyte); + else + i = write_permuted_lines (head_lines, line, permutation); + } + if (i != 0) error (EXIT_FAILURE, errno, _("write error")); diff --git a/tests/misc/shuf.sh b/tests/misc/shuf.sh index 492fd41..a25a6f8 100755 --- a/tests/misc/shuf.sh +++ b/tests/misc/shuf.sh @@ -94,4 +94,66 @@ shuf -i0-9 -o A -o B && shuf -i0-9 --random-source A --random-source B && { fail=1; echo "shuf did not detect 
multiple --random-source usage.">&2 ; } +# Test --repetition option + +# --repetition without count should return one line +shuf --rep -i0-10 > exp || framework_failure_ +c=$(wc -l < exp) || framework_failure_ +test "$c" -eq 1 || { fail=1; echo "--repetition default count is not 1">&2 ; } + +# --repetition can output more values than the input range +shuf --rep -i0-9 -n1000 > exp || framework_failure_ +c=$(wc -l < exp) || framework_failure_ +test "$c" -eq 1000 || { fail=1; echo "--repetition with --count failed">&2 ; } + +# Check output values (this is not bullet-proof, but drawing 1000 values +# between 0 and 9 should produce all values, unless there's a bug in shuf +# or a very poor random source, or extremely bad luck) +c=$(sort -nu exp | paste -s -d ' ') || framework_failure_ +test "$c" = "0 1 2 3 4 5 6 7 8 9" || + { fail=1; echo "--repetition produced bad output">&2 ; } + +# check --repetition with non-zero low value +shuf --rep -i222-233 -n2000 > exp || framework_failure_ +c=$(cat exp | sort -nu | paste -s -d ' ') || framework_failure_ +test "$c" = "222 223 224 225 226 227 228 229 230 231 232 233" || + { fail=1; echo "--repetition produced bad output with non-zero low">&2 ; } + +# --repetition,-i,count=0 should not fail and produce no output +shuf --rep -i0-9 -n0 > exp || framework_failure_ +# file size should be zero (no output from shuf) +test \! 
-s exp || + { fail=1; echo "--repetition,-i0-9,-n0 produced bad output">&2 ; } + +# --repetition with -e, without count, should return one line +shuf --rep -e A B C D > exp || framework_failure_ +c=$(cat exp | wc -l) || framework_failure_ +test "$c" -eq 1 || + { fail=1; echo "--repetition,-e default count is not 1">&2 ; } + +# --repetition with STDIN, without count, should return one line +printf "A\nB\nC\nD\nE\n" | shuf --rep > exp || framework_failure_ +c=$(wc -l < exp) || framework_failure_ +test "$c" -eq 1 || + { fail=1; echo "--repetition,STDIN default count is not 1">&2 ; } + +# --repetition with STDIN,count - can return more values than input lines +printf "A\nB\nC\nD\nE\n" | shuf --rep -n2000 > exp || framework_failure_ +c=$(wc -l < exp) || framework_failure_ +test "$c" -eq 2000 || + { fail=1; echo "--repetition,STDIN,count failed">&2 ; } + +# Check output values (this is not bullet-proof, but drawing 2000 values +# between A and E should produce all values, unless there's a bug in shuf +# or a very poor random source, or extremely bad luck) +c=$(sort -u exp | paste -s -d ' ') || framework_failure_ +test "$c" = "A B C D E" || + { fail=1; echo "--repetition,STDIN,count produced bad output">&2 ; } + +# --repetition,stdin,count=0 should not fail and produce no output +printf "A\nB\nC\nD\nE\n" | shuf --rep -n0 > exp || framework_failure_ +# file size should be zero (no output from shuf) +test \! -s exp || + { fail=1; echo "--repetition,STDIN,-n0 produced bad output">&2 ; } + Exit $fail -- 1.7.7.6