bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] split: --chunks option


From: Chen Guo
Subject: [PATCH] split: --chunks option
Date: Wed, 25 Nov 2009 20:33:36 -0800 (PST)

Hi all,
    This is mostly a step towards multithreaded sort the Unix way but, as
Padraig mentioned, it has other uses too. Parsing and I/O are not my strong
suits, so I have a few questions:

    Are there more appropriate functions than open and pread to use here? I 
usually see wrapper functions called in place of actual functions like fopen, 
fread, etc, and it feels rather inappropriate for me to use open and pread here.

    And are there any suggestions for parsing the --chunk option in a better 
way? I feel having two separate options specifying both required values is 
redundant, so I decided to separate the values by a comma, as Jim had in an 
example he linked me. The way I wrote it, it feels like a hacked workaround, 
but I'm not sure how else to get around that comma.

    Also, any opinions on how the lines should be output? As of now I just
write them to stdout, since that's how I expect sort would use it. And of
course, is there anything else I missed or could've done better? Thanks a lot, guys.


From 875147fd73abfb4d798f0beb0c84a5ed54bab28c Mon Sep 17 00:00:00 2001
From: Chen Guo <address@hidden>
Date: Thu, 26 Nov 2009 05:12:32 +0100
Subject: [PATCH] Split: add --chunk=N,TOT option to extract Nth of TOT chunks.

---
 src/split.c |  145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 139 insertions(+), 6 deletions(-)

diff --git a/src/split.c b/src/split.c
index d1a0e0d..e8446ad 100644
--- a/src/split.c
+++ b/src/split.c
@@ -82,6 +82,7 @@ enum
 static struct option const longopts[] =
 {
   {"bytes", required_argument, NULL, 'b'},
+  {"chunk", required_argument, NULL, 'c'},
   {"lines", required_argument, NULL, 'l'},
   {"line-bytes", required_argument, NULL, 'C'},
   {"suffix-length", required_argument, NULL, 'a'},
@@ -116,6 +117,7 @@ Mandatory arguments to long options are mandatory for short options too.\n\
       fprintf (stdout, _("\
   -a, --suffix-length=N   use suffixes of length N (default %d)\n\
   -b, --bytes=SIZE        put SIZE bytes per output file\n\
+  -c, --chunk=N,CHUNKS    generate Nth of CHUNKS pieces\n\
   -C, --line-bytes=SIZE   put at most SIZE bytes of lines per output file\n\
   -d, --numeric-suffixes  use numeric suffixes instead of alphabetic\n\
   -l, --lines=NUMBER      put NUMBER lines per output file\n\
@@ -362,6 +364,99 @@ line_bytes_split (size_t n_bytes)
   free (buf);
 }
 
+/* If file is seekable, write to stdout the lines of chunk N of TOTAL: from
+   the first line lying entirely in chunk N through the last line whose first
+   byte is in chunk N.  FILDES is read with pread, BUFSIZE bytes at a time.
+   FIXME: Support non-seekable files as such: extract lines whose line number L
+   is such that L % total + 1 = n.
+  */
+
+static void
+chunk_split (uintmax_t n, uintmax_t total, char* buf, int fildes, off_t file_size,
+             size_t bufsize)
+{
+  ssize_t n_read;
+  bool new_flag = true;
+  bool end_of_chunk = false;
+  bool skip = true;
+  char *bp = buf, *bp_out = buf, *eob;
+  off_t start;
+  off_t end;
+
+  /* For n != 1, start reading 1 byte before nth chunk of file. This is to
+     detect if the first byte of chunk is the first byte of a line. */
+  if (n == 1)
+    {
+      start = 0;
+      skip = false;
+    }
+  else
+    start = (n - 1) * (file_size / total) - 1;
+  end = (n == total)? file_size - 1: n * file_size / total - 1;
+
+  /* bp: point from which to read into buffer
+     bp_out: point from which to write out
+     eob: point where write out ends */
+  while (1)
+    {
+      n_read = pread (fildes, bp, bufsize, start);
+      if (n_read < 0)
+        error (EXIT_FAILURE, errno, "%s", infile);
+      if (n_read == 0)
+        end_of_chunk = true;
+      start += n_read;
+      eob = bp + n_read;
+
+      /* Unless n == 1, skip past the first newline character
+         encountered. */
+      if (skip)
+        {
+          bp_out = memchr (bp, '\n', n_read);
+          if (!bp_out)
+            {
+              if (start < end)
+                {
+                  off_t offset = eob - buf;
+                  bufsize *= 2;
+                  buf = x2nrealloc (buf, &bufsize, sizeof *buf);
+                  bp = buf + offset;
+                }
+              else
+                break;
+            }
+          else
+            {
+              bp_out++;
+              skip = false;  /* Partial first line consumed; skip only once.  */
+            }
+        }
+
+      if (start >= end)
+        {
+          char *base;
+          if (eob - start + end > bp_out)
+            base = eob - start + end;
+          else
+            base = bp_out;
+          char *line = memchr (base, '\n', eob - base);
+          if (line)
+            {
+              eob = line + 1;
+              *eob = '\0';
+              end_of_chunk = true;  /* Newline ending the chunk's last line.  */
+            }
+        }
+      //      cwrite (new_flag, bp_out, eob - bp_out);
+      if (write (STDOUT_FILENO, bp_out, eob - bp_out) != eob - bp_out)
+        error (EXIT_FAILURE, errno, "output error");
+      if (end_of_chunk)
+        break;
+      new_flag = false;
+      bp = buf;
+      bp_out = buf;
+    }
+}
+
 #define FAIL_ONLY_ONE_WAY()                    \
   do                                \
     {                                \
@@ -376,15 +471,19 @@ main (int argc, char **argv)
   struct stat stat_buf;
   enum
     {
-      type_undef, type_bytes, type_byteslines, type_lines, type_digits
+      type_undef, type_bytes, type_byteslines, type_lines, type_digits,
+      type_chunk
     } split_type = type_undef;
   size_t in_blk_size;        /* optimal block size of input file device */
   char *buf;            /* file i/o buffer */
   size_t page_size = getpagesize ();
+  uintmax_t m_units;
   uintmax_t n_units;
   static char const multipliers[] = "bEGKkMmPTYZ0";
   int c;
   int digits_optind = 0;
+  int fd = 0;
+  off_t file_size;
 
   initialize_main (&argc, &argv);
   set_program_name (argv[0]);
@@ -404,7 +503,7 @@ main (int argc, char **argv)
       /* This is the argv-index of the option we will read next.  */
       int this_optind = optind ? optind : 1;
 
-      c = getopt_long (argc, argv, "0123456789C:a:b:dl:", longopts, NULL);
+      c = getopt_long (argc, argv, "0123456789C:a:b:c:dl:", longopts, NULL);
       if (c == -1)
         break;
 
@@ -435,6 +534,31 @@ main (int argc, char **argv)
             }
           break;
 
+        case 'c':
+          if (split_type != type_undef)
+            FAIL_ONLY_ONE_WAY ();
+          split_type = type_chunk;
+          char *comma = strchr (optarg, ',');
+          if (comma == NULL)
+            {
+              error (0, 0, _("%s: invalid chunk information"), optarg);
+              usage (EXIT_FAILURE);
+            }
+          *comma = '\0';
+          if (xstrtoumax (optarg, NULL, 10, &m_units, ",") != LONGINT_OK
+              || m_units == 0)
+            {
+              error (0, 0, _("%s: invalid chunk number"), optarg);
+              usage (EXIT_FAILURE);
+            }
+          if (xstrtoumax (++comma, NULL, 10, &n_units, "") != LONGINT_OK
+              || n_units == 0 || n_units < m_units)
+            {
+              error (0, 0, _("%s: invalid number of total chunks"), comma);
+              usage (EXIT_FAILURE);
+            }
+          break;
+
         case 'l':
           if (split_type != type_undef)
             FAIL_ONLY_ONE_WAY ();
@@ -533,11 +657,15 @@ main (int argc, char **argv)
     }
 
   /* Open the input file.  */
+  if (split_type == type_chunk)
+    fd = open (infile, O_RDONLY);
   if (! STREQ (infile, "-")
+      && ((split_type != type_chunk
       && fd_reopen (STDIN_FILENO, infile, O_RDONLY, 0) < 0)
+      || (split_type == type_chunk && fd < 0)))
     error (EXIT_FAILURE, errno, _("cannot open %s for reading"),
            quote (infile));
-
+ 
   /* Binary I/O is safer when bytecounts are used.  */
   if (O_BINARY && ! isatty (STDIN_FILENO))
     xfreopen (NULL, "rb", stdin);
@@ -546,10 +674,11 @@ main (int argc, char **argv)
   output_desc = -1;
 
   /* Get the optimal block size of input device and make a buffer.  */
-
-  if (fstat (STDIN_FILENO, &stat_buf) != 0)
-    error (EXIT_FAILURE, errno, "%s", infile);
+  if ((split_type != type_chunk && fstat (STDIN_FILENO, &stat_buf) != 0)
+      || (split_type == type_chunk && fstat (fd, &stat_buf) != 0))
+      error (EXIT_FAILURE, errno, "%s", infile);
   in_blk_size = io_blksize (stat_buf);
+  file_size = stat_buf.st_size;
 
   buf = ptr_align (xmalloc (in_blk_size + 1 + page_size - 1), page_size);
 
@@ -568,6 +697,10 @@ main (int argc, char **argv)
       line_bytes_split (n_units);
       break;
 
+    case type_chunk:
+      chunk_split (m_units, n_units, buf, fd, file_size, in_blk_size);
+      break;
+
     default:
       abort ();
     }
-- 
1.6.3.3





reply via email to

[Prev in Thread] Current Thread [Next in Thread]