bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [patch] Adding Numerical Suffixes to Split


From: Paul Eggert
Subject: Re: [patch] Adding Numerical Suffixes to Split
Date: 07 Aug 2003 15:49:19 -0700
User-agent: Gnus/5.09 (Gnus v5.9.0) Emacs/21.3

Capt Jesse Kornblum USAF <address@hidden> writes:

> Sorry it's been so long since I've written...

No problem.  Thanks for following up on this.


> Why should the user have to remember a long line of arcane shell
> commands instead of just adding a single command line flag that
> already exists in a sister program (csplit)?

You make a good argument based on convenience.

However, -n means something different in csplit; one might argue that
the 'split' option should be spelled differently, to avoid confusion.
(Things are already confusing here, since csplit's -n is the same as
split's -a.)  Perhaps '-d' would be better?  The mnemonic would be
that -d means to use digits in suffixes rather than letters.

Another thought: the patch that you sent numbers the output files
foo01, foo02, foo03, etc.  This is inconsistent with csplit, which
starts with foo00, and it complicates the logic.  It'd be more
consistent to number the output files starting with 00.

I went back to my suggestion in
<http://mail.gnu.org/archive/html/bug-coreutils/2003-07/msg00001.html>
and decided there was too much hassle in allowing arbitrary suffix
alphabets: multibyte characters, duplicate characters in alphabets,
empty alphabets, etc.  So I think you're right that it'd be better
just to have a simple flag to enable numeric suffixes.

How about this patch instead?  It fixes a couple of minor
POSIX-compliance bugs in split, while I was in the neighborhood.

2003-08-07  Paul Eggert  <address@hidden>

        * doc/coreutils.texi (split invocation):
        Add -d or --numeric-suffixes option to 'split'.
        From a suggestion by Jesse Kornblum.
        * src/split.c (suffix_alphabet): New var.
        (longopts, usage, next_file_name, main): Support -d.
        (next_file_name, main): Allow -a0, as POSIX requires.
        (next_file_name): Don't assume ASCII-like encoding;
        'a' through 'z' are not contiguous in EBCDIC.

--- NEWS.~1.120.~       Fri Aug  1 15:35:57 2003
+++ NEWS        Thu Aug  7 15:40:35 2003
@@ -5,6 +5,8 @@ GNU coreutils NEWS                      
 
   date accepts a new option --rfc-2822, an alias for --rfc-822.
 
+  split accepts a new option -d or --numeric-suffixes.
+
 
 * Major changes in release 5.0.90:
 
@@ -47,6 +49,10 @@ GNU coreutils NEWS                      
   and with status 3 if an error occurred.  POSIX requires this.
 
   expr now reports trouble if string comparison fails due to a collation error.
+
+  split now generates suffixes properly on EBCDIC hosts.
+
+  split -a0 now works, as POSIX requires.
 
   `sort --version' and `sort --help' fail, as they should
   when their output is redirected to /dev/full.
Index: doc/coreutils.texi
===================================================================
RCS file: /cvsroot/coreutils/coreutils/doc/coreutils.texi,v
retrieving revision 1.127
diff -p -u -r1.127 coreutils.texi
--- doc/coreutils.texi  1 Aug 2003 22:37:20 -0000       1.127
+++ doc/coreutils.texi  7 Aug 2003 22:43:43 -0000
@@ -2317,8 +2317,9 @@ left over for the last section), into ea
 
 @cindex output file name prefix
 The output files' names consist of @var{prefix} (@samp{x} by default)
-followed by a group of letters (@samp{aa}, @samp{ab}, @dots{} by default),
-such that concatenating the output files in sorted order by file name produces
+followed by a group of characters (@samp{aa}, @samp{ab}, @dots{} by
+default), such that concatenating the output files in traditional
+sorted order by file name produces
 the original input file.  If the output file names are exhausted,
 @command{split} reports an error without deleting the output files
 that it did create.
@@ -2362,6 +2363,12 @@ possible without exceeding @var{bytes} b
 less than @var{bytes} bytes of the line are left, then continue
 normally.  @var{bytes} has the same format as for the @option{--bytes}
 option.
+
+@item -d
+@itemx --numeric-suffixes
+@opindex -d
+@opindex --numeric-suffixes
+Use digits in suffixes rather than lower-case letters.
 
 @itemx --verbose
 @opindex --verbose
Index: src/split.c
===================================================================
RCS file: /cvsroot/coreutils/coreutils/src/split.c,v
retrieving revision 1.90
diff -p -u -r1.90 split.c
--- src/split.c 23 Jul 2003 07:29:55 -0000      1.90
+++ src/split.c 7 Aug 2003 22:43:43 -0000
@@ -60,6 +60,9 @@ static char *outfile_mid;
 /* Length of OUTFILE's suffix.  */
 static size_t suffix_length = DEFAULT_SUFFIX_LENGTH;
 
+/* Alphabet of characters to use in suffix.  */
+static char const *suffix_alphabet = "abcdefghijklmnopqrstuvwxyz";
+
 /* Name of input file.  May be "-".  */
 static char *infile;
 
@@ -79,6 +82,7 @@ static struct option const longopts[] =
   {"lines", required_argument, NULL, 'l'},
   {"line-bytes", required_argument, NULL, 'C'},
   {"suffix-length", required_argument, NULL, 'a'},
+  {"numeric-suffixes", no_argument, NULL, 'd'},
   {"verbose", no_argument, &verbose, 1},
   {GETOPT_HELP_OPTION_DECL},
   {GETOPT_VERSION_OPTION_DECL},
@@ -109,6 +113,7 @@ Mandatory arguments to long options are 
   -a, --suffix-length=N   use suffixes of length N (default %d)\n\
   -b, --bytes=SIZE        put SIZE bytes per output file\n\
   -C, --line-bytes=SIZE   put at most SIZE bytes of lines per output file\n\
+  -d, --numeric-suffixes  use numeric suffixes instead of alphabetic\n\
   -l, --lines=NUMBER      put NUMBER lines per output file\n\
 "), DEFAULT_SUFFIX_LENGTH);
       fputs (_("\
@@ -132,6 +137,9 @@ SIZE may have a multiplier suffix: b for
 static void
 next_file_name (void)
 {
+  /* Index in suffix_alphabet of each character in the suffix.  */
+  static size_t *sufindex;
+
   if (! outfile)
     {
       /* Allocate and initialize the first file name.  */
@@ -143,8 +151,9 @@ next_file_name (void)
       outfile = xmalloc (outfile_length + 1);
       outfile_mid = outfile + outbase_length;
       memcpy (outfile, outbase, outbase_length);
-      memset (outfile_mid, 'a', suffix_length);
+      memset (outfile_mid, suffix_alphabet[0], suffix_length);
       outfile[outfile_length] = 0;
+      sufindex = xcalloc (suffix_length, sizeof (size_t));
 
 #if ! _POSIX_NO_TRUNC && HAVE_PATHCONF && defined _PC_NAME_MAX
       /* POSIX requires that if the output file name is too long for
@@ -164,10 +173,16 @@ next_file_name (void)
     {
       /* Increment the suffix in place, if possible.  */
 
-      char *p;
-      for (p = outfile_mid + suffix_length; outfile_mid < p; *--p = 'a')
-       if (p[-1]++ != 'z')
-         return;
+      size_t i = suffix_length;
+      while (i-- != 0)
+       {
+         sufindex[i]++;
+         outfile_mid[i] = suffix_alphabet[sufindex[i]];
+         if (outfile_mid[i])
+           return;
+         sufindex[i] = 0;
+         outfile_mid[i] = suffix_alphabet[sufindex[i]];
+       }
       error (EXIT_FAILURE, 0, _("Output file suffixes exhausted"));
     }
 }
@@ -380,7 +395,7 @@ main (int argc, char **argv)
       /* This is the argv-index of the option we will read next.  */
       int this_optind = optind ? optind : 1;
 
-      c = getopt_long (argc, argv, "0123456789C:a:b:l:", longopts, NULL);
+      c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL);
       if (c == -1)
        break;
 
@@ -393,7 +408,7 @@ main (int argc, char **argv)
          {
            unsigned long tmp;
            if (xstrtoul (optarg, NULL, 10, &tmp, "") != LONGINT_OK
-               || tmp == 0 || SIZE_MAX < tmp)
+               || SIZE_MAX / sizeof (size_t) < tmp)
              {
                error (0, 0, _("%s: invalid suffix length"), optarg);
                usage (EXIT_FAILURE);
@@ -467,6 +482,10 @@ main (int argc, char **argv)
                     umaxtostr (n_units, buffer), c);
            }
          n_units = n_units * 10 + c - '0';
+         break;
+
+       case 'd':
+         suffix_alphabet = "0123456789";
          break;
 
        case_GETOPT_HELP_CHAR;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]