coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Splitting search results from a "find -print0"


From: Pádraig Brady
Subject: Re: Splitting search results from a "find -print0"
Date: Fri, 09 Jan 2015 02:16:10 +0000
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.3.0

On 08/01/15 06:53, Assaf Gordon wrote:
> Hello,
> 
> On Jan 7, 2015, at 22:07, Pádraig Brady <address@hidden> wrote:
> 
>> On 08/01/15 02:30, Pádraig Brady wrote:
>>> On 08/01/15 00:24, Assaf Gordon wrote:
>>>>
>>>> attached is a quick attempt at splitting with custom line-separator 
>>>> '-t=SEP' and NUL '-z' .
>>>>
>>>> It's not complete, but could be improved if this is the right direction.
>>
> <...>
>> Let's just go with -t CHAR for now.
> 
> attached an updated patch, with more tests, and documentation.

I made a few adjustments, as seen inline below.
The main change was the removal of the -z option, as that's supported with -t '\0'.
Also I renamed --line-separator to just --separator as previously discussed.
I'll push the attached rolled up patch tomorrow morning.

thanks again,
Pádraig.

diff --git a/NEWS b/NEWS
index 8e5be5a..e0a2893 100644
--- a/NEWS
+++ b/NEWS
@@ -45,7 +45,8 @@ GNU coreutils NEWS                                    -*- outline -*-
   dd accepts a new status=progress level to print data transfer statistics
   on stderr approximately every second.

-  split accepts new options: --zero-terminated (-z), --line-separator=X (-tX).
+  split accepts a new --separator option to select a record separator character
+  other than the default newline character.

 ** Changes in behavior

diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index abf46d8..86007a9 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3506,12 +3506,15 @@ span a chunk.  The output file sequence numbers, always run consecutively
 even when this option is specified.

 @item -t @var{separator}
-@itemx --line-separator=@var{separator}
+@itemx --separator=@var{separator}
 @opindex -t
-@opindex --line-separator
+@opindex --separator
 @cindex line separator character
-Use character @var{separator} as the line separator instead of the default
+@cindex record separator character
+Use character @var{separator} as the record separator instead of the default
 newline character (ASCII LF).
+To specify ASCII NUL as the separator, use the two-character string @samp{\0},
+e.g., @samp{split -t '\0'}.

 @item -u
 @itemx --unbuffered
@@ -3524,18 +3527,6 @@ which is a much slower mode of operation.
 @opindex --verbose
 Write a diagnostic just before each output file is opened.

-@item -z
-@itemx --zero-terminated
-@opindex -z
-@opindex --zero-terminated
-@cindex process zero-terminated lines
-Use zero byte (ASCII NUL character) as the line separator instead of the
-default newline character (ASCII LF).
-This option can be useful in conjunction with @samp{perl -0} or
-@samp{find -print0} and @samp{xargs -0} which do the same in order to
-reliably handle arbitrary file names (even those containing blanks
-or other special characters).
-
 @end table

 @exitstatus
diff --git a/src/split.c b/src/split.c
index fce4dae..6d78c83 100644
--- a/src/split.c
+++ b/src/split.c
@@ -16,8 +16,9 @@


 /* By address@hidden, with rms.

-   To do:
-   * support -p REGEX as in BSD's split */
+   TODO:
+   * support -p REGEX as in BSD's split.
+   * support --suppress-matched as in csplit.  */
 #include <config.h>

 #include <assert.h>
@@ -106,9 +107,8 @@ static bool elide_empty_files;
    input to output, which is much slower, so disabled by default.  */
 static bool unbuffered;

-/* The character marking end of line. Default to \n. */
-enum { DEFAULT_EOL = '\n' };
-static char eolchar = DEFAULT_EOL;
+/* The character marking end of line.  Defaults to \n below.  */
+static int eolchar = -1;

 /* The split mode to use.  */
 enum Split_type
@@ -141,8 +141,7 @@ static struct option const longopts[] =
   {"numeric-suffixes", optional_argument, NULL, 'd'},
   {"filter", required_argument, NULL, FILTER_OPTION},
   {"verbose", no_argument, NULL, VERBOSE_OPTION},
-  {"line-separator", required_argument, NULL, 't'},
-  {"zero-terminated", no_argument, NULL, 'z'},
+  {"separator", required_argument, NULL, 't'},
   {"-io-blksize", required_argument, NULL,
    IO_BLKSIZE_OPTION}, /* do not document */
   {GETOPT_HELP_OPTION_DECL},
@@ -220,16 +219,15 @@ is -, read standard input.\n\
   -a, --suffix-length=N   generate suffixes of length N (default %d)\n\
       --additional-suffix=SUFFIX  append an additional SUFFIX to file names\n\
   -b, --bytes=SIZE        put SIZE bytes per output file\n\
-  -C, --line-bytes=SIZE   put at most SIZE bytes of lines per output file\n\
+  -C, --line-bytes=SIZE   put at most SIZE bytes of records per output file\n\
   -d, --numeric-suffixes[=FROM]  use numeric suffixes instead of alphabetic;\n\
                                    FROM changes the start value (default 0)\n\
   -e, --elide-empty-files  do not generate empty output files with '-n'\n\
       --filter=COMMAND    write to shell COMMAND; file name is $FILE\n\
-  -l, --lines=NUMBER      put NUMBER lines per output file\n\
+  -l, --lines=NUMBER      put NUMBER lines/records per output file\n\
   -n, --number=CHUNKS     generate CHUNKS output files; see explanation below\n\
-  -t, --line-separator=SEP  use SEP instead of newline as line separator\n\
+  -t, --separator=SEP     use SEP instead of newline as record separator\n\
   -u, --unbuffered        immediately copy input to output with '-n r/...'\n\
-  -z, --zero-terminated   line delimiter is NUL, not newline\n\
 "), DEFAULT_SUFFIX_LENGTH);
       fputs (_("\
       --verbose           print a diagnostic just before each\n\
@@ -242,8 +240,8 @@ is -, read standard input.\n\
 CHUNKS may be:\n\
   N       split into N files based on size of input\n\
   K/N     output Kth of N to stdout\n\
-  l/N     split into N files without splitting lines\n\
-  l/K/N   output Kth of N to stdout without splitting lines\n\
+  l/N     split into N files without splitting lines/records\n\
+  l/K/N   output Kth of N to stdout without splitting lines/records\n\
   r/N     like 'l' but use round robin distribution\n\
   r/K/N   likewise but only output Kth of N to stdout\n\
 "), stdout);
@@ -1313,7 +1311,7 @@ main (int argc, char **argv)
           {
             char neweol = optarg[0];
             if (! neweol)
-              error (EXIT_FAILURE, 0, _("empty line-delimiter"));
+              error (EXIT_FAILURE, 0, _("empty record separator"));
             if (optarg[1])
               {
                 if (STREQ (optarg, "\\0"))
@@ -1324,22 +1322,18 @@ main (int argc, char **argv)
                        "multi-character tab" instead of "multibyte tab", so
                        that the diagnostic's wording does not need to be
                        changed once multibyte characters are supported.  */
-                    error (EXIT_FAILURE, 0, _("multi-character delimiter %s"),
+                    error (EXIT_FAILURE, 0, _("multi-character separator %s"),
                            quote (optarg));
                   }
               }
-            if (eolchar != DEFAULT_EOL && neweol != eolchar)
-              error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
+            /* Make it explicit we don't support multiple separators.  */
+            if (0 <= eolchar && neweol != eolchar)
+              error (EXIT_FAILURE, 0, _("incompatible record separator"));
+
             eolchar = neweol;
           }
           break;

-        case 'z':
-          if (eolchar != DEFAULT_EOL && eolchar != '\0')
-            error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
-          eolchar = '\0';
-          break;
-
         case '0':
         case '1':
         case '2':
@@ -1435,6 +1429,9 @@ main (int argc, char **argv)
       usage (EXIT_FAILURE);
     }

+  if (eolchar < 0)
+    eolchar = '\n';
+
   set_suffix_length (n_units, split_type);

   /* Get out the filename arguments.  */
diff --git a/tests/split/lines-sep.sh b/tests/split/lines-sep.sh
index e0727dc..74857fe 100755
--- a/tests/split/lines-sep.sh
+++ b/tests/split/lines-sep.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
-# test split with custom line separators
+# test split with custom record separators

-# Copyright (C) 2002-2015 Free Software Foundation, Inc.
+# Copyright (C) 2015 Free Software Foundation, Inc.

 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -56,57 +56,45 @@ run_split()
   return 0
 }

+NL='
+'

 # Test newline, without '-t' option (the default)
 run_split 1 nl         || { warn_ "test 1 failed" ; fail=1 ; }

-# Test newline specified as custom line separator
-run_split 2 nl -t$'\n' || { warn_ "test 2 failed" ; fail=1 ; }
-
-# Test null line-separator with '-z'
-run_split 3 z -z       || { warn_ "test 3 failed" ; fail=1 ; }
+# Test newline specified as custom record separator
+run_split 2 nl -t"$NL" || { warn_ "test 2 failed" ; fail=1 ; }

 # Test null line-separator with '-t'
-run_split 4 z -t\\0    || { warn_ "test 4 failed" ; fail=1 ; }
+run_split 4 z -t'\0'   || { warn_ "test 4 failed" ; fail=1 ; }

 # Test non-default line-separator with '-t'
 run_split 5 cln -t:    || { warn_ "test 5 failed" ; fail=1 ; }


-
 #
 # Test usage edge cases
 #

 # Should fail: '-t' requires an argument
-split -t </dev/null >/dev/null 2>/dev/null \
-  && { warn_ "-t without argument did not trigger an error" ; fail=1 ; }
+{ split -t </dev/null >/dev/null 2>/dev/null || test $? -ne 1; } &&
+  { warn_ "-t without argument did not trigger an error" ; fail=1 ; }

 # should fail: multi-character separator
-split -txx </dev/null >/dev/null 2>/dev/null \
-  && { warn_ "-txx did not trigger an error" ; fail=1 ; }
+{ split -txx </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+  { warn_ "-txx did not trigger an error" ; fail=1 ; }

 # should fail: different separators used
-split -ta -tb </dev/null >/dev/null 2>/dev/null \
-  && { warn_ "-ta -tb did not trigger an error" ; fail=1 ; }
+{ split -ta -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+  { warn_ "-ta -tb did not trigger an error" ; fail=1 ; }

-# should fail: different separators used
-split -ta -z </dev/null >/dev/null 2>/dev/null \
-  && { warn_ "-ta -z did not trigger an error" ; fail=1 ; }
-
-# should fail: different separators used
-split -z -ta </dev/null >/dev/null 2>/dev/null \
-  && { warn_ "-z -ta did not trigger an error" ; fail=1 ; }
+# should fail: different separators used, including default
+{ split -t"$NL" -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+  { warn_ "-t\$NL -tb did not trigger an error" ; fail=1 ; }

 # should not fail: same separator used multiple times
-split -t: -t: </dev/null >/dev/null 2>/dev/null \
-  || { warn_ "-t: -t: triggered an error" ; fail=1 ; }
-
-# should not fail: NUL separator used multiple times
-split -z -t\\0 </dev/null >/dev/null 2>/dev/null \
-  || { warn_ "-z -t\\0 triggered an error" ; fail=1 ; }
-split -t\\0 -z </dev/null >/dev/null 2>/dev/null \
-  || { warn_ "-t\\0 -z triggered an error" ; fail=1 ; }
+split -t: -t: </dev/null >/dev/null 2>&1 ||
+  { warn_ "-t: -t: triggered an error" ; fail=1 ; }

Attachment: split-t.patch
Description: Text Data


reply via email to

[Prev in Thread] Current Thread [Next in Thread]