bug-coreutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#6524: du now uses less than half as much memory, sometimes


From: Jim Meyering
Subject: bug#6524: du now uses less than half as much memory, sometimes
Date: Sun, 04 Jul 2010 12:00:33 +0200

Jim Meyering wrote:
> This is just a preliminary heads-up.  At least one detail must be
> addressed before I push: I haven't written justification for the
> change to hash.c (I confess that I don't remember -- parts of this
> change date back to Dec 2008 ;-)

I've pushed the four change-sets I wrote.
Here's Paul's improvement, including the touch-up changes I mentioned.

Paul, one question I have is whether to include an initial_inode_set_size
parameter in your di_set_alloc function.  That would make it more
efficient in the event that an application happens to know in advance the
number of inodes it will process (say if it's processing a single device
and all of its inodes).  However, there's no rush to address that --
you're welcome to push this as-is.

>From 08ac2f42353a35451696f9a390da1e5e6c43adaa Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Sun, 4 Jul 2010 09:06:59 +0200
Subject: [PATCH] du: improve device-inode tracking code

The preceding implementation would not save space when most inode
numbers have one of the top few bits set.  Remove that limitation
by using a separate inode-managing set for each distinct device.

This patch continues to treat inode 0 specially, to work around a
problem with hash.c.  Recently hash.c was changed to no longer reject
NULL keys, but its internals still depend on not allowing NULL keys so
this patch is careful to never create a key by casting 0 to (void *).
The assumption is that 0 is the only size_t value i such that ((void
*) i == NULL); this assumption is true for all practical architectures
that I know about, though it is not guaranteed by any standard.

The preceding implementation also ran afoul of the i686-specific gcc
bug, c/44806 (fixed in gcc-4.5.1), so replacing its bitfield- and
union-using code with this simpler design is doubly welcome.

* gl/lib/ino-map.c, gl/lib/ino-map.h: New files.
* gl/modules/ino-map, gl/modules/ino-map-tests: New files.
* gl/modules/di-set, gl/modules/di-set-tests: New files.
* gl/tests/test-di-set.c, gl/tests/test-ino-map.c: New files.
* gl/lib/dev-map.c, gl/lib/dev-map.h: Remove files.
* gl/modules/dev-map, gl/modules/dev-map-tests: Remove files.
* gl/tests/test-dev-map.c: Remove files.
* src/du.c (INITIAL_DI_SET_SIZE): Remove definition; no longer
needed, since di_set_alloc takes no size parameter.
(di_set): Use a global pointer rather than a struct.
Update all uses.
(main): Hoist a di_set_free call.
---
 gl/lib/dev-map.c                            |  110 ---------
 gl/lib/dev-map.h                            |   23 --
 gl/lib/di-set.c                             |  350 ++++++++++++---------------
 gl/lib/di-set.h                             |   28 +--
 gl/lib/ino-map.c                            |  162 ++++++++++++
 gl/lib/ino-map.h                            |   14 +
 gl/modules/dev-map                          |   23 --
 gl/modules/dev-map-tests                    |   10 -
 gl/modules/di-set                           |    3 +-
 gl/modules/ino-map                          |   24 ++
 gl/modules/ino-map-tests                    |   10 +
 gl/tests/test-di-set.c                      |   31 +--
 gl/tests/{test-dev-map.c => test-ino-map.c} |   34 ++--
 src/du.c                                    |   21 +-
 14 files changed, 399 insertions(+), 444 deletions(-)
 delete mode 100644 gl/lib/dev-map.c
 delete mode 100644 gl/lib/dev-map.h
 create mode 100644 gl/lib/ino-map.c
 create mode 100644 gl/lib/ino-map.h
 delete mode 100644 gl/modules/dev-map
 delete mode 100644 gl/modules/dev-map-tests
 create mode 100644 gl/modules/ino-map
 create mode 100644 gl/modules/ino-map-tests
 rename gl/tests/{test-dev-map.c => test-ino-map.c} (72%)

diff --git a/gl/lib/dev-map.c b/gl/lib/dev-map.c
deleted file mode 100644
index 363f5af..0000000
--- a/gl/lib/dev-map.c
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Map a dev_t device number to a small integer.
-
-   Copyright 2009-2010 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
-
-/* written by Jim Meyering */
-
-#include <config.h>
-#include "dev-map.h"
-
-#include <stdio.h>
-#include <assert.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-struct dev_map_ent
-{
-  dev_t dev;
-  uint32_t mapped_dev;
-};
-
-enum { INITIAL_DEV_MAP_TABLE_SIZE = 31 };
-
-static size_t
-dev_hash (void const *x, size_t table_size)
-{
-  struct dev_map_ent const *p = x;
-  return (uintmax_t) p->dev % table_size;
-}
-
-/* Compare two dev_map_ent structs on "dev".
-   Return true if they are the same.  */
-static bool
-dev_compare (void const *x, void const *y)
-{
-  struct dev_map_ent const *a = x;
-  struct dev_map_ent const *b = y;
-  return a->dev == b->dev ? true : false;
-}
-
-/* Initialize state.  */
-int
-dev_map_init (struct dev_map *dev_map)
-{
-  assert (dev_map != NULL);
-  dev_map->n_device = 0;
-  dev_map->dev_map = hash_initialize (INITIAL_DEV_MAP_TABLE_SIZE, NULL,
-                                            dev_hash, dev_compare, free);
-  return dev_map->dev_map ? 0 : -1;
-}
-
-void
-dev_map_free (struct dev_map *dev_map)
-{
-  hash_free (dev_map->dev_map);
-}
-
-/* Insert device number, DEV, into the map, DEV_MAP if it's not there already,
-   and return the nonnegative, mapped and usually smaller, number.
-   If DEV is already in DEV_MAP, return the existing mapped value.
-   Upon memory allocation failure, return -1.  */
-int
-dev_map_insert (struct dev_map *dev_map, dev_t dev)
-{
-  /* Attempt to insert <DEV,n_device> in the map.
-     Possible outcomes:
-     - DEV was already there, with a different necessarily smaller value
-     - DEV was not there, (thus was just inserted)
-     - insert failed: out of memory
-     Return -1 on out of memory.
-  */
-
-  struct dev_map_ent *ent_from_table;
-  struct dev_map_ent *ent = malloc (sizeof *ent);
-  if (!ent)
-    return -1;
-
-  ent->dev = dev;
-  ent->mapped_dev = dev_map->n_device;
-
-  ent_from_table = hash_insert (dev_map->dev_map, ent);
-  if (ent_from_table == NULL)
-    {
-      /* Insertion failed due to lack of memory.  */
-      return -1;
-    }
-
-  if (ent_from_table == ent)
-    {
-      /* Insertion succeeded: new device.  */
-      return dev_map->n_device++;
-    }
-
-  /* That DEV value is already in the table, so ENT was not inserted.
-     Free it and return the already-associated value.  */
-  free (ent);
-  return ent_from_table->mapped_dev;
-}
diff --git a/gl/lib/dev-map.h b/gl/lib/dev-map.h
deleted file mode 100644
index f093d90..0000000
--- a/gl/lib/dev-map.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <stddef.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include "hash.h"
-
-struct dev_map
-{
-  /* KEY,VAL pair, where KEY is the raw st_dev value
-     and VAL is the small number that maps to.  */
-  struct hash_table *dev_map;
-  size_t n_device;
-};
-
-#undef _ATTRIBUTE_NONNULL_
-#if __GNUC__ == 3 && __GNUC_MINOR__ >= 3 || 3 < __GNUC__
-# define _ATTRIBUTE_NONNULL_(m) __attribute__ ((__nonnull__ (m)))
-#else
-# define _ATTRIBUTE_NONNULL_(m)
-#endif
-
-int dev_map_init (struct dev_map *) _ATTRIBUTE_NONNULL_ (1);
-void dev_map_free (struct dev_map *) _ATTRIBUTE_NONNULL_ (1);
-int dev_map_insert (struct dev_map *, dev_t) _ATTRIBUTE_NONNULL_ (1);
diff --git a/gl/lib/di-set.c b/gl/lib/di-set.c
index 3c4717b..b6dce45 100644
--- a/gl/lib/di-set.c
+++ b/gl/lib/di-set.c
@@ -15,262 +15,218 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

-/* written by Jim Meyering */
+/* written by Paul Eggert and Jim Meyering */

 #include <config.h>
 #include "di-set.h"

-#include <stdio.h>
-#include <assert.h>
+#include "hash.h"
+#include "ino-map.h"
+
 #include <stdint.h>
 #include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "verify.h"
-
-/* Set operations for device-inode pairs stored in a space-efficient manner.
-   A naive mapping uses 16 bytes to save a single st_dev, st_ino pair.
-   However, in many applications, the vast majority of actual device,inode
-   number pairs can be efficiently compressed to fit in 8 or even 4 bytes,
-   by using a separate table to map a relatively small number of devices
-   to small integers.  */
-
-#define N_DEV_BITS_4 5
-#define N_INO_BITS_4 (32 - N_DEV_BITS_4 - 2 - 1)
-
-#define N_DEV_BITS_8 8
-#define N_INO_BITS_8 (64 - N_DEV_BITS_8 - 2 - 1)
-
-/* Note how the last bit is always set.
-   This is required, in order to be able to distinguish
-   an encoded di_ent value from a malloc-returned pointer,
-   which must be 4-byte-aligned or better.  */
-struct dev_ino_4
-{
-  uint32_t mode:2; /* must be first */
-  uint32_t short_ino:N_INO_BITS_4;
-  uint32_t mapped_dev:N_DEV_BITS_4;
-  uint32_t always_set:1;
-};
-verify (N_DEV_BITS_4 <= 8 * sizeof (int));
-verify (sizeof (struct dev_ino_4) == 4);
-
-struct dev_ino_8
-{
-  uint32_t mode:2; /* must be first */
-  uint64_t short_ino:N_INO_BITS_8;
-  uint32_t mapped_dev:N_DEV_BITS_8;
-  uint32_t always_set:1;
-};
-verify (sizeof (struct dev_ino_8) == 8);

-struct dev_ino_full
-{
-  uint32_t mode:2; /* must be first */
-  dev_t dev;
-  ino_t ino;
-};
-
-enum di_mode
-{
-  DI_MODE_4 = 1,
-  DI_MODE_8 = 2,
-  DI_MODE_FULL = 3
-};
-
-/*
-     di_mode     raw_inode      mapped dev    always_set
-          \____________|_______________\_____/
-  4-byte | 2|          25            |  5  |1|          mapped_dev
-         `----------------------------------------------------|-----.
-  8-byte | 2|        53                                  |    8   |1|
-         `----------------------------------------------------------'
-*/
+/* The hash package hashes "void *", but this package wants to hash
+   integers.  Use integers that are as large as possible, but no
+   larger than void *, so that they can be cast to void * and back
+   without losing information.  */
+typedef size_t hashint;
+#define HASHINT_MAX ((hashint) -1)
+
+/* Integers represent inode numbers.  Integers in the range
+   1..(LARGE_INO_MIN-1) represent inode numbers directly.  (The hash
+   package does not work with null pointers, so inode 0 cannot be used
+   as a key.)  To find the representations of other inode numbers, map
+   them through INO_MAP.  */
+#define LARGE_INO_MIN (HASHINT_MAX / 2)
+
+/* Set operations for device-inode pairs stored in a space-efficient
+   manner.  Use a two-level hash table.  The top level hashes by
+   device number, as there are typically a small number of devices.
+   The lower level hashes by mapped inode numbers.  In the typical
+   case where the inode number is positive and small, the inode number
+   maps to itself, masquerading as a void * value; otherwise, its
+   value is the result of hashing the inode value through INO_MAP.  */
+
+/* A pair that maps a device number to a set of inode numbers.  */
 struct di_ent
 {
-  union
-  {
-    struct dev_ino_4 di4;
-    struct dev_ino_8 di8;
-    struct dev_ino_full full;
-    uint32_t u32;
-    uint64_t u64;
-    void *ptr;
-  } u;
-};
-
-struct dev_map_ent
-{
   dev_t dev;
-  uint32_t mapped_dev;
+  struct hash_table *ino_set;
 };

-static inline bool
-is_encoded_ptr (struct di_ent const *v)
+/* A two-level hash table that manages and indexes these pairs.  */
+struct di_set
 {
-  return (size_t) v % 4;
-}
+  /* Map device numbers to sets of inode number representatives.  */
+  struct hash_table *dev_map;

-static struct di_ent
-decode_ptr (struct di_ent const *v)
-{
-  if (!is_encoded_ptr (v))
-    return *v;
+  /* If nonnull, map large inode numbers to their small
+     representatives.  If null, there are no large inode numbers in
+     this set.  */
+  struct ino_map *ino_map;

-  struct di_ent di;
-  di.u.ptr = (void *) v;
-  return di;
-}
+  /* Cache of the most recently allocated and otherwise-unused storage
+     for probing this table.  */
+  struct di_ent *probe;
+};

+/* Hash a device-inode-set entry.  */
 static size_t
 di_ent_hash (void const *x, size_t table_size)
 {
-  struct di_ent e = decode_ptr (x);
-  return (e.u.di4.mode == DI_MODE_4
-          ? e.u.u32
-          : (e.u.di4.mode == DI_MODE_8
-             ? e.u.u64
-             : e.u.full.ino)) % table_size;
+  struct di_ent const *p = x;
+
+  /* Use ordinary % if it's known to return a nonnegative value.
+     Otherwise, fall back on uintmax_t %, which could be much slower.  */
+  if ((dev_t) -1 % (size_t) 2 == 1)
+    return p->dev % table_size;
+  else
+    {
+      uintmax_t dev = p->dev;
+      return dev % table_size;
+    }
 }

-/* Compare two di_ent structs.
-   Return true if they are the same.  */
+/* Return true if two device-inode-set entries are the same.  */
 static bool
 di_ent_compare (void const *x, void const *y)
 {
-  struct di_ent a = decode_ptr (x);
-  struct di_ent b = decode_ptr (y);
-  if (a.u.di4.mode != b.u.di4.mode)
-    return false;
-
-  if (a.u.di4.mode == DI_MODE_4)
-    return (a.u.di4.short_ino == b.u.di4.short_ino
-            && a.u.di4.mapped_dev == b.u.di4.mapped_dev);
-
-  if (a.u.di8.mode == DI_MODE_8)
-    return (a.u.di8.short_ino == b.u.di8.short_ino
-            && a.u.di8.mapped_dev == b.u.di8.mapped_dev);
-
-  return (a.u.full.ino == b.u.full.ino
-          && a.u.full.dev == b.u.full.dev);
+  struct di_ent const *a = x;
+  struct di_ent const *b = y;
+  return a->dev == b->dev;
 }

+/* Free a device-inode-set entry.  */
 static void
 di_ent_free (void *v)
 {
-  if ( ! is_encoded_ptr (v))
-    free (v);
+  struct di_ent *a = v;
+  hash_free (a->ino_set);
+  free (a);
 }

-int
-di_set_init (struct di_set_state *dis, size_t initial_size)
+/* Create a set of device-inode pairs.  Return NULL on allocation failure.  */
+struct di_set *
+di_set_alloc (void)
 {
-  if (dev_map_init (&dis->dev_map) < 0)
-    return -1;
+  struct di_set *dis = malloc (sizeof *dis);
+  if (dis)
+    {
+      enum { INITIAL_DEV_MAP_SIZE = 11 };
+      dis->dev_map = hash_initialize (INITIAL_DEV_MAP_SIZE, NULL,
+                                      di_ent_hash, di_ent_compare,
+                                      di_ent_free);
+      if (! dis->dev_map)
+        {
+          free (dis);
+          return NULL;
+        }
+      dis->ino_map = NULL;
+      dis->probe = NULL;
+    }

-  dis->di_set = hash_initialize (initial_size, NULL,
-                                 di_ent_hash, di_ent_compare, di_ent_free);
-  return dis->di_set ? 0 : -1;
+  return dis;
 }

+/* Free a set of device-inode pairs.  */
 void
-di_set_free (struct di_set_state *dis)
+di_set_free (struct di_set *dis)
 {
-  dev_map_free (&dis->dev_map);
-  hash_free (dis->di_set);
+  hash_free (dis->dev_map);
+  free (dis->ino_map);
+  free (dis->probe);
+  free (dis);
 }

-/* Given a device-inode set, DIS, create an entry for the DEV,INO
-   pair, and store it in *V.  If possible, encode DEV,INO into the pointer
-   itself, but if not, allocate space for a full "struct di_ent" and set *V
-   to that pointer.  Upon memory allocation failure, return -1.
-   Otherwise return 0.  */
-int
-di_ent_create (struct di_set_state *dis,
-               dev_t dev, ino_t ino,
-               struct di_ent **v)
+/* Hash an encoded inode number I.  */
+static size_t
+di_ino_hash (void const *i, size_t table_size)
 {
-  static int prev_m = -1;
-  static dev_t prev_dev = -1;
-  struct di_ent di_ent;
-  int mapped_dev;
+  return (hashint) i % table_size;
+}

-  if (dev == prev_dev)
-    mapped_dev = prev_m;
-  else
+/* Using the DIS table, map a device to a hash table that represents
+   a set of inode numbers.  Return NULL on error.  */
+static struct hash_table *
+map_device (struct di_set *dis, dev_t dev)
+{
+  /* Find space for the probe, reusing the cache if available.  */
+  struct di_ent *ent;
+  struct di_ent *probe = dis->probe;
+  if (probe)
     {
-      mapped_dev = dev_map_insert (&dis->dev_map, dev);
-      if (mapped_dev < 0)
-        return -1;
-      prev_dev = dev;
-      prev_m = mapped_dev;
+      /* If repeating a recent query, return the cached result.   */
+      if (probe->dev == dev)
+        return probe->ino_set;
     }
-
-  if (mapped_dev < (1 << N_DEV_BITS_4)
-      && ino < (1 << N_INO_BITS_4))
+  else
     {
-#if lint
-      /* When this struct is smaller than a pointer, initialize
-         the pointer so tools like valgrind don't complain about
-         the uninitialized bytes.  */
-      if (sizeof di_ent.u.di4 < sizeof di_ent.u.ptr)
-        di_ent.u.ptr = NULL;
-#endif
-      di_ent.u.di4.mode = DI_MODE_4;
-      di_ent.u.di4.short_ino = ino;
-      di_ent.u.di4.mapped_dev = mapped_dev;
-      di_ent.u.di4.always_set = 1;
-      *v = di_ent.u.ptr;
+      dis->probe = probe = malloc (sizeof *probe);
+      if (! probe)
+        return NULL;
     }
-  else if (mapped_dev < (1 << N_DEV_BITS_8)
-           && ino < ((uint64_t) 1 << N_INO_BITS_8))
+
+  /* Probe for the device.  */
+  probe->dev = dev;
+  ent = hash_insert (dis->dev_map, probe);
+  if (! ent)
+    return NULL;
+
+  if (ent == probe)
     {
-      di_ent.u.di8.mode = DI_MODE_8;
-      di_ent.u.di8.short_ino = ino;
-      di_ent.u.di8.mapped_dev = mapped_dev;
-      di_ent.u.di8.always_set = 1;
-      *v = di_ent.u.ptr;
+      enum { INITIAL_INO_SET_SIZE = 1021 };
+
+      /* Prepare to allocate a new probe next time; this one is in use.  */
+      dis->probe = NULL;
+
+      /* DEV is new; allocate an inode set for it.  */
+      probe->ino_set = hash_initialize (INITIAL_INO_SET_SIZE, NULL,
+                                        di_ino_hash, NULL, NULL);
     }
   else
+    probe->ino_set = ent->ino_set;
+
+  return probe->ino_set;
+}
+
+/* Using the DIS table, map an inode number to a mapped value.
+   Return INO_MAP_INSERT_FAILURE on error.  */
+static hashint
+map_inode_number (struct di_set *dis, ino_t ino)
+{
+  if (0 < ino && ino < LARGE_INO_MIN)
+    return ino;
+
+  if (! dis->ino_map)
     {
-      /* Handle the case in which INO is too large or in which (far less
-         likely) we encounter hard-linked files on 2^N_DEV_BITS_8
-         different devices.  */
-      struct di_ent *p = malloc (sizeof *p);
-      if (!p)
-        return -1;
-      assert ((size_t) p % 4 == 0);
-      p->u.full.mode = DI_MODE_FULL;
-      p->u.full.ino = ino;
-      p->u.full.dev = dev;
-      *v = p;
+      dis->ino_map = ino_map_alloc (LARGE_INO_MIN);
+      if (! dis->ino_map)
+        return INO_MAP_INSERT_FAILURE;
     }

-  return 0;
+  return ino_map_insert (dis->ino_map, ino);
 }

-/* Attempt to insert the DEV,INO pair into the set, DIS.
-   If it matches a pair already in DIS, don't modify DIS and return 0.
+/* Attempt to insert the DEV,INO pair into the set DIS.
+   If it matches a pair already in DIS, keep that pair and return 0.
    Otherwise, if insertion is successful, return 1.
    Upon any failure return -1.  */
 int
-di_set_insert (struct di_set_state *dis, dev_t dev, ino_t ino)
+di_set_insert (struct di_set *dis, dev_t dev, ino_t ino)
 {
-  struct di_ent *v;
-  if (di_ent_create (dis, dev, ino, &v) < 0)
-    return -1;
+  hashint i;

-  int err = hash_insert0 (dis->di_set, v, NULL);
-  if (err == -1) /* Insertion failed due to lack of memory.  */
+  /* Map the device number to a set of inodes.  */
+  struct hash_table *ino_set = map_device (dis, dev);
+  if (! ino_set)
     return -1;

-  if (err == 1) /* Insertion succeeded.  */
-    return 1;
-
-  /* That pair is already in the table, so ENT was not inserted.  Free it.  */
-  if (! is_encoded_ptr (v))
-    free (v);
+  /* Map the inode number to a small representative I.  */
+  i = map_inode_number (dis, ino);
+  if (i == INO_MAP_INSERT_FAILURE)
+    return -1;

-  return 0;
+  /* Put I into the inode set.  */
+  return hash_insert0 (ino_set, (void *) i, NULL);
 }
diff --git a/gl/lib/di-set.h b/gl/lib/di-set.h
index f90d0dd..d30a31e 100644
--- a/gl/lib/di-set.h
+++ b/gl/lib/di-set.h
@@ -1,28 +1,12 @@
-#include "dev-map.h"
-
-struct di_set_state
-{
-  /* A map to help compact device numbers.  */
-  struct dev_map dev_map;
-
-  /* A set of compact dev,inode pairs.  */
-  struct hash_table *di_set;
-};
+#include <sys/types.h>

 #undef _ATTRIBUTE_NONNULL_
 #if __GNUC__ == 3 && __GNUC_MINOR__ >= 3 || 3 < __GNUC__
-# define _ATTRIBUTE_NONNULL_(m,...) __attribute__ ((__nonnull__ (m)))
+# define _ATTRIBUTE_NONNULL_(m) __attribute__ ((__nonnull__ (m)))
 #else
-# define _ATTRIBUTE_NONNULL_(m,...)
+# define _ATTRIBUTE_NONNULL_(m)
 #endif

-int di_set_init (struct di_set_state *, size_t) _ATTRIBUTE_NONNULL_ (1);
-void di_set_free (struct di_set_state *) _ATTRIBUTE_NONNULL_ (1);
-int di_set_insert (struct di_set_state *, dev_t, ino_t)
-  _ATTRIBUTE_NONNULL_ (1);
-
-struct di_ent;
-int di_ent_create (struct di_set_state *di_set_state,
-                   dev_t dev, ino_t ino,
-                   struct di_ent **di_ent)
-  _ATTRIBUTE_NONNULL_ (1,4);
+struct di_set *di_set_alloc (void);
+int di_set_insert (struct di_set *, dev_t, ino_t) _ATTRIBUTE_NONNULL_ (1);
+void di_set_free (struct di_set *) _ATTRIBUTE_NONNULL_ (1);
diff --git a/gl/lib/ino-map.c b/gl/lib/ino-map.c
new file mode 100644
index 0000000..1123d52
--- /dev/null
+++ b/gl/lib/ino-map.c
@@ -0,0 +1,162 @@
+/* Map an ino_t inode number to a small integer.
+
+   Copyright 2009, 2010 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* written by Paul Eggert and Jim Meyering */
+
+#include <config.h>
+#include "ino-map.h"
+
+#include "hash.h"
+#include "verify.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* A pair that maps an inode number to a mapped inode number; the
+   latter is a small unique ID for the former.  */
+struct ino_map_ent
+{
+  ino_t ino;
+  size_t mapped_ino;
+};
+
+/* A table that manages and indexes these pairs.  */
+struct ino_map
+{
+  /* A table of KEY,VAL pairs, where KEY is the raw ino_t value and
+     VAL is the small number that it maps to.  */
+  struct hash_table *map;
+
+  /* The next mapped inode number to hand out.  */
+  size_t next_mapped_ino;
+
+  /* Cache of the most recently allocated and otherwise-unused storage
+     for probing the table.  */
+  struct ino_map_ent *probe;
+};
+
+/* Hash an inode map entry.  */
+static size_t
+ino_hash (void const *x, size_t table_size)
+{
+  struct ino_map_ent const *p = x;
+
+  /* Use ordinary % if it's known to return a nonnegative value.
+     Otherwise, fall back on uintmax_t %, which could be much slower.  */
+  if ((ino_t) -1 % (size_t) 2 == 1)
+    return p->ino % table_size;
+  else
+    {
+      uintmax_t ino = p->ino;
+      return ino % table_size;
+    }
+}
+
+/* Return true if two inode map entries are the same.  */
+static bool
+ino_compare (void const *x, void const *y)
+{
+  struct ino_map_ent const *a = x;
+  struct ino_map_ent const *b = y;
+  return a->ino == b->ino;
+}
+
+/* Allocate an inode map that will hand out integers starting with
+   NEXT_MAPPED_INO.  Return NULL if memory is exhausted.  */
+struct ino_map *
+ino_map_alloc (size_t next_mapped_ino)
+{
+  struct ino_map *im = malloc (sizeof *im);
+
+  if (im)
+    {
+      enum { INITIAL_INO_MAP_TABLE_SIZE = 1021 };
+      im->map = hash_initialize (INITIAL_INO_MAP_TABLE_SIZE, NULL,
+                                 ino_hash, ino_compare, free);
+      if (! im->map)
+        {
+          free (im);
+          return NULL;
+        }
+      im->next_mapped_ino = next_mapped_ino;
+      im->probe = NULL;
+    }
+
+  return im;
+}
+
+/* Free an inode map.  */
+void
+ino_map_free (struct ino_map *map)
+{
+  hash_free (map->map);
+  free (map->probe);
+  free (map);
+}
+
+
+/* Insert into MAP the inode number INO if it's not there already,
+   and return its nonnegative mapped inode number.
+   If INO is already in MAP, return the existing mapped inode number.
+   Return INO_MAP_INSERT_FAILURE on memory or counter exhaustion.  */
+size_t
+ino_map_insert (struct ino_map *im, ino_t ino)
+{
+  struct ino_map_ent *ent;
+
+  /* Find space for the probe, reusing the cache if available.  */
+  struct ino_map_ent *probe = im->probe;
+  if (probe)
+    {
+      /* If repeating a recent query, return the cached result.   */
+      if (probe->ino == ino)
+        return probe->mapped_ino;
+    }
+  else
+    {
+      im->probe = probe = malloc (sizeof *probe);
+      if (! probe)
+        return INO_MAP_INSERT_FAILURE;
+    }
+
+  probe->ino = ino;
+  ent = hash_insert (im->map, probe);
+  if (! ent)
+    return INO_MAP_INSERT_FAILURE;
+
+  if (ent != probe)
+    {
+      /* There was already an existing entry.  Use it.  */
+      probe->mapped_ino = ent->mapped_ino;
+    }
+  else
+    {
+      /* If adding 1 to map->next_mapped_ino would cause it to
+         overflow to zero, then it must equal INO_MAP_INSERT_FAILURE,
+         which is the value that should be returned in that case.  Verify
+         that this works.  */
+      verify (INO_MAP_INSERT_FAILURE + 1 == 0);
+
+      /* Prepare to allocate a new probe next time; this one is in use.  */
+      im->probe = NULL;
+
+      /* INO is new; allocate a mapped inode number for it.  */
+      probe->mapped_ino = im->next_mapped_ino++;
+    }
+
+  return probe->mapped_ino;
+}
diff --git a/gl/lib/ino-map.h b/gl/lib/ino-map.h
new file mode 100644
index 0000000..661b78a
--- /dev/null
+++ b/gl/lib/ino-map.h
@@ -0,0 +1,14 @@
+#include <sys/types.h>
+
+#undef _ATTRIBUTE_NONNULL_
+#if __GNUC__ == 3 && __GNUC_MINOR__ >= 3 || 3 < __GNUC__
+# define _ATTRIBUTE_NONNULL_(m) __attribute__ ((__nonnull__ (m)))
+#else
+# define _ATTRIBUTE_NONNULL_(m)
+#endif
+
+#define INO_MAP_INSERT_FAILURE ((size_t) -1)
+
+struct ino_map *ino_map_alloc (size_t);
+void ino_map_free (struct ino_map *) _ATTRIBUTE_NONNULL_ (1);
+size_t ino_map_insert (struct ino_map *, ino_t) _ATTRIBUTE_NONNULL_ (1);
diff --git a/gl/modules/dev-map b/gl/modules/dev-map
deleted file mode 100644
index 91f437b..0000000
--- a/gl/modules/dev-map
+++ /dev/null
@@ -1,23 +0,0 @@
-Description:
-maintain a mapping of dev_t numbers to small integers
-
-Files:
-lib/dev-map.c
-lib/dev-map.h
-
-Depends-on:
-hash
-
-configure.ac:
-
-Makefile.am:
-lib_SOURCES += dev-map.c dev-map.h
-
-Include:
-"dev-map.h"
-
-License
-GPL
-
-Maintainer:
-Jim Meyering
diff --git a/gl/modules/dev-map-tests b/gl/modules/dev-map-tests
deleted file mode 100644
index 4bec2e6..0000000
--- a/gl/modules/dev-map-tests
+++ /dev/null
@@ -1,10 +0,0 @@
-Files:
-tests/test-dev-map.c
-
-Depends-on:
-
-configure.ac:
-
-Makefile.am:
-TESTS += test-dev-map
-check_PROGRAMS += test-dev-map
diff --git a/gl/modules/di-set b/gl/modules/di-set
index fe52778..562db14 100644
--- a/gl/modules/di-set
+++ b/gl/modules/di-set
@@ -6,9 +6,8 @@ lib/di-set.c
 lib/di-set.h

 Depends-on:
-dev-map
+ino-map
 hash
-verify

 configure.ac:

diff --git a/gl/modules/ino-map b/gl/modules/ino-map
new file mode 100644
index 0000000..06ee51d
--- /dev/null
+++ b/gl/modules/ino-map
@@ -0,0 +1,24 @@
+Description:
+maintain a mapping of ino_t numbers to small integers
+
+Files:
+lib/ino-map.c
+lib/ino-map.h
+
+Depends-on:
+hash
+verify
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += ino-map.c ino-map.h
+
+Include:
+"ino-map.h"
+
+License
+GPL
+
+Maintainer:
+Jim Meyering
diff --git a/gl/modules/ino-map-tests b/gl/modules/ino-map-tests
new file mode 100644
index 0000000..fa10b4f
--- /dev/null
+++ b/gl/modules/ino-map-tests
@@ -0,0 +1,10 @@
+Files:
+tests/test-ino-map.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-ino-map
+check_PROGRAMS += test-ino-map
diff --git a/gl/tests/test-di-set.c b/gl/tests/test-di-set.c
index 4d5823e..e5fb6cb 100644
--- a/gl/tests/test-di-set.c
+++ b/gl/tests/test-di-set.c
@@ -1,4 +1,4 @@
-/* Test the dev-map module.
+/* Test the di-set module.
    Copyright (C) 2010 Free Software Foundation, Inc.

    This program is free software: you can redistribute it and/or modify
@@ -36,41 +36,21 @@

 #include "di-set.h"

-/* FIXME: ugly duplication of code from di-set.c */
-#define N_DEV_BITS_4 5
-#define N_INO_BITS_4 (32 - N_DEV_BITS_4 - 2 - 1)
-
-#define N_DEV_BITS_8 8
-#define N_INO_BITS_8 (64 - N_DEV_BITS_8 - 2 - 1)
-
 int
 main (void)
 {
   /* set_program_name (argv[0]); placate overzealous "syntax-check" test.  */
-  size_t initial_size = 61;
-  /* "real" code might prefer to avoid the allocation here, simply
-     declaring "struct di_set_state dis;", do a global substitution,
-     s/\<dis\>/\&dis/, and remove the final free.  */
-  struct di_set_state *dis = malloc (sizeof *dis);
+  struct di_set *dis = di_set_alloc ();
   ASSERT (dis);
-  ASSERT (di_set_init (dis, initial_size) == 0);
-
-  struct di_ent *di_ent;
-  ASSERT (di_ent_create (dis, 1, 1, &di_ent) == 0);
-  ASSERT (di_ent_create (dis, 1 << N_DEV_BITS_4, 1, &di_ent) == 0);
-  ASSERT (di_ent_create (dis, 1, 1 << N_INO_BITS_4, &di_ent) == 0);
-  ASSERT (di_ent_create (dis, 1,
-                         (uint64_t) 1 << N_INO_BITS_8, &di_ent) == 0);
-  free (di_ent);

   ASSERT (di_set_insert (dis, 2, 5) == 1); /* first insertion succeeds */
   ASSERT (di_set_insert (dis, 2, 5) == 0); /* duplicate fails */
   ASSERT (di_set_insert (dis, 3, 5) == 1); /* diff dev, duplicate inode is ok 
*/
   ASSERT (di_set_insert (dis, 2, 8) == 1); /* same dev, different inode is ok 
*/

-  /* very large inode number */
-  ASSERT (di_set_insert (dis, 5, (uint64_t) 1 << 63) == 1);
-  ASSERT (di_set_insert (dis, 5, (uint64_t) 1 << 63) == 0); /* dup */
+  /* very large (or negative) inode number */
+  ASSERT (di_set_insert (dis, 5, (ino_t) -1) == 1);
+  ASSERT (di_set_insert (dis, 5, (ino_t) -1) == 0); /* dup */

   unsigned int i;
   for (i = 0; i < 3000; i++)
@@ -79,7 +59,6 @@ main (void)
     ASSERT (di_set_insert (dis, 9, i) == 0); /* duplicate fails */

   di_set_free (dis);
-  free (dis);

   return 0;
 }
diff --git a/gl/tests/test-dev-map.c b/gl/tests/test-ino-map.c
similarity index 72%
rename from gl/tests/test-dev-map.c
rename to gl/tests/test-ino-map.c
index 98ba630..2b44602 100644
--- a/gl/tests/test-dev-map.c
+++ b/gl/tests/test-ino-map.c
@@ -1,4 +1,4 @@
-/* Test the dev-map module.
+/* Test the ino-map module.
    Copyright (C) 2010 Free Software Foundation, Inc.

    This program is free software: you can redistribute it and/or modify
@@ -34,30 +34,30 @@
     }                                                                        \
   while (0)

-#include "dev-map.h"
-
-/* Risky: this is also defined in di-set.c; they should match.  */
-#define N_DEV_BITS_4 5
+#include "ino-map.h"

 int
 main ()
 {
   /* set_program_name (argv[0]); placate overzealous "syntax-check" test.  */
-  struct dev_map dev_map;
-  ASSERT (dev_map_init (&dev_map) == 0);
-
-  ASSERT (dev_map_insert (&dev_map, 42) == 0);
-  ASSERT (dev_map_insert (&dev_map, 42) == 0);
-  ASSERT (dev_map_insert (&dev_map, 398) == 1);
-  ASSERT (dev_map_insert (&dev_map, 398) == 1);
-
-  int32_t i;
-  for (i = 0; i < (1 << N_DEV_BITS_4); i++)
+  enum { INO_MAP_INIT = 123 };
+  struct ino_map *ino_map = ino_map_alloc (INO_MAP_INIT);
+  ASSERT (ino_map != NULL);
+
+  ASSERT (ino_map_insert (ino_map, 42) == INO_MAP_INIT);
+  ASSERT (ino_map_insert (ino_map, 42) == INO_MAP_INIT);
+  ASSERT (ino_map_insert (ino_map, 398) == INO_MAP_INIT + 1);
+  ASSERT (ino_map_insert (ino_map, 398) == INO_MAP_INIT + 1);
+  ASSERT (ino_map_insert (ino_map, 0) == INO_MAP_INIT + 2);
+  ASSERT (ino_map_insert (ino_map, 0) == INO_MAP_INIT + 2);
+
+  int i;
+  for (i = 0; i < 100; i++)
     {
-      ASSERT (dev_map_insert (&dev_map, 10000+i) == 2+i);
+      ASSERT (ino_map_insert (ino_map, 10000 + i) == INO_MAP_INIT + 3 + i);
     }

-  dev_map_free (&dev_map);
+  ino_map_free (ino_map);

   return 0;
 }
diff --git a/src/du.c b/src/du.c
index 7c02c86..739be73 100644
--- a/src/du.c
+++ b/src/du.c
@@ -60,11 +60,8 @@ extern bool fts_debug;
 # define FTS_CROSS_CHECK(Fts)
 #endif

-/* Initial size of the hash table.  */
-enum { INITIAL_DI_SET_SIZE = 1021 };
-
 /* A set of dev/ino pairs.  */
-static struct di_set_state di_set;
+static struct di_set *di_set;

 /* Define a class for collecting directory information. */

@@ -337,14 +334,10 @@ Mandatory arguments to long options are mandatory for 
short options too.\n\
 static bool
 hash_ins (ino_t ino, dev_t dev)
 {
-  int inserted = di_set_insert (&di_set, dev, ino);
+  int inserted = di_set_insert (di_set, dev, ino);
   if (inserted < 0)
-    {
-      /* Insertion failed due to lack of memory.  */
-      xalloc_die ();
-    }
-
-  return inserted ? true : false;
+    xalloc_die ();
+  return inserted;
 }

 /* FIXME: this code is nearly identical to code in date.c  */
@@ -905,7 +898,8 @@ main (int argc, char **argv)
     xalloc_die ();

   /* Initialize the set of dev,inode pairs.  */
-  if (di_set_init (&di_set, INITIAL_DI_SET_SIZE))
+  di_set = di_set_alloc ();
+  if (!di_set)
     xalloc_die ();

   bit_flags |= symlink_deref_bits;
@@ -977,6 +971,7 @@ main (int argc, char **argv)
     }

   argv_iter_free (ai);
+  di_set_free (di_set);

   if (files_from && (ferror (stdin) || fclose (stdin) != 0))
     error (EXIT_FAILURE, 0, _("error reading %s"), quote (files_from));
@@ -984,7 +979,5 @@ main (int argc, char **argv)
   if (print_grand_total)
     print_size (&tot_dui, _("total"));

-  di_set_free (&di_set);
-
   exit (ok ? EXIT_SUCCESS : EXIT_FAILURE);
 }
--
1.7.2.rc1.192.g262ff





reply via email to

[Prev in Thread] Current Thread [Next in Thread]