pspp-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[bug37999 09/11] RANK: Create all variables together, in order.


From: Ben Pfaff
Subject: [bug37999 09/11] RANK: Create all variables together, in order.
Date: Thu, 31 Jan 2013 22:03:31 -0800

An upcoming commit will rewrite the RANK implementation so that the
new variables are not created until after a pass through the data.
(This makes sense because their values cannot actually be determined
until that pass is complete, so there is no point in allocating space
for them in cases.)  To do that, it is necessary to figure out the
variable names (and that they will be valid variable names) in
advance.  This commit switches to that approach in advance.

This approach has another small advantage: the order of the variables
added by RANK to the dictionary does not depend on whether the
variables are named by the user or by generating a name.  (This
is why the rank.at test case changes.)
---
 src/language/stats/rank.c    |  231 ++++++++++++++++++++++++------------------
 tests/language/stats/rank.at |   38 +++----
 2 files changed, 152 insertions(+), 117 deletions(-)

diff --git a/src/language/stats/rank.c b/src/language/stats/rank.c
index 35516c2..a4f8185 100644
--- a/src/language/stats/rank.c
+++ b/src/language/stats/rank.c
@@ -35,9 +35,11 @@
 #include "language/stats/sort-criteria.h"
 #include "math/sort.h"
 #include "libpspp/assertion.h"
+#include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/misc.h"
 #include "libpspp/pool.h"
+#include "libpspp/string-set.h"
 #include "libpspp/taint.h"
 
 #include "output/tab.h"
@@ -143,69 +145,68 @@ struct rank_spec
 {
   enum rank_func rfunc;
   struct variable **destvars;
+  const char **dest_names;
 };
 
-
-/* Create and return a new variable in which to store the ranks of SRC_VAR
-   accoring to the rank function F.
-   VNAME is the name of the variable to be created.
-   If VNAME is NULL, then a name will be automatically chosen.
-*/
-static struct variable *
-create_rank_variable (struct dictionary *dict, enum rank_func f,
-                     const struct variable *src_var,
-                     const char *vname)
+/* If NEW_NAME exists in DICT or NEW_NAMES, returns NULL without changing
+   anything.  Otherwise, inserts NEW_NAME in NEW_NAMES and returns the copy of
+   NEW_NAME now in NEW_NAMES. */
+static const char *
+try_new_name (const char *new_name,
+              const struct dictionary *dict, struct string_set *new_names)
 {
-  int i;
-  struct variable *var = NULL;
-  char name[SHORT_NAME_LEN + 1];
-
-  if ( vname )
-    var = dict_create_var(dict, vname, 0);
+  return (!dict_lookup_var (dict, new_name)
+          && string_set_insert (new_names, new_name)
+          ? string_set_find_node (new_names, new_name)->string
+          : NULL);
+}
 
-  if ( NULL == var )
-    {
-      snprintf (name, SHORT_NAME_LEN + 1, "%c%s",
-                function_name[f][0], var_get_name (src_var));
+/* Returns a variable name for storing ranks of a variable named SRC_NAME
+   accoring to the rank function F.  The name chosen will not be one already in
+   DICT or NEW_NAMES.
 
-      var = dict_create_var(dict, name, 0);
-    }
-  i = 1;
-  while( NULL == var )
-    {
-      char func_abb[4];
-      snprintf(func_abb, 4, "%s", function_name[f]);
-      snprintf(name, SHORT_NAME_LEN + 1, "%s%03d", func_abb,
-              i);
-
-      var = dict_create_var(dict, name, 0);
-      if (i++ >= 999)
-       break;
-    }
+   If successful, adds the new name to NEW_NAMES and returns the name added.
+   If no name can be generated, returns NULL. */
+static const char *
+rank_choose_dest_name (struct dictionary *dict, struct string_set *new_names,
+                       enum rank_func f, const char *src_name)
+{
+  char *src_name_7;
+  char name[128];
+  const char *s;
+  int i;
 
-  i = 1;
-  while ( NULL == var )
+  /* Try the first character of the ranking function followed by the first 7
+     bytes of the srcinal variable name. */
+  src_name_7 = utf8_encoding_trunc (src_name, dict_get_encoding (dict), 7);
+  snprintf (name, sizeof name, "%c%s", function_name[f][0], src_name_7);
+  free (src_name_7);
+  s = try_new_name (name, dict, new_names);
+  if (s != NULL)
+    return s;
+
+  /* Try "fun###". */
+  for (i = 1; i <= 999; i++)
     {
-      char func_abb[3];
-      snprintf(func_abb, 3, "%s", function_name[f]);
-
-      snprintf(name, SHORT_NAME_LEN + 1,
-              "RNK%s%02d", func_abb, i);
-
-      var = dict_create_var(dict, name, 0);
-      if ( i++ >= 99 )
-       break;
+      sprintf (name, "%.3s%03d", function_name[f], i);
+      s = try_new_name (name, dict, new_names);
+      if (s != NULL)
+        return s;
     }
 
-  if ( NULL == var )
+  /* Try "RNKfn##". */
+  for (i = 1; i <= 99; i++)
     {
-      msg(ME, _("Cannot create new rank variable.  All candidates in use."));
-      return NULL;
+      sprintf (name, "RNK%.2s%02d", function_name[f], i);
+      s = try_new_name (name, dict, new_names);
+      if (s != NULL)
+        return s;
     }
 
-  var_set_both_formats (var, &dest_format[f]);
-
-  return var;
+  msg (ME, _("Cannot generate variable name for ranking %s with %s.  "
+             "All candidates in use."),
+       src_name, function_name[f]);
+  return NULL;
 }
 
 struct rank
@@ -248,7 +249,8 @@ destroy_rank (struct rank *rank)
 }
 
 static bool
-parse_into (struct lexer *lexer, struct rank *cmd)
+parse_into (struct lexer *lexer, struct rank *cmd,
+            struct string_set *new_names)
 {
   int var_count = 0;
   struct rank_spec *rs = NULL;
@@ -306,30 +308,32 @@ parse_into (struct lexer *lexer, struct rank *cmd)
     }
 
   cmd->n_rs++;
-  rs->destvars = NULL;
-  rs->destvars = pool_calloc (cmd->pool, cmd->n_vars, sizeof (*rs->destvars));
+  rs->dest_names = pool_calloc (cmd->pool, cmd->n_vars,
+                                sizeof *rs->dest_names);
 
   if (lex_match_id (lexer, "INTO"))
     {
       while( lex_token (lexer) == T_ID )
        {
          const char *name = lex_tokcstr (lexer);
-         if ( dict_lookup_var (cmd->dict, name) != NULL )
-           {
-             msg (SE, _("Variable %s already exists."), name);
-             return false;
-           }
-         
+
          if ( var_count >= subcase_get_n_fields (&cmd->sc) )
-           {
-             msg (SE, _("Too many variables in INTO clause."));
-             return false;
-           }
-         rs->destvars[var_count] = 
-           create_rank_variable (cmd->dict, rs->rfunc, cmd->vars[var_count], 
name);
-         ++var_count;
-         lex_get (lexer);
-       }
+            msg (SE, _("Too many variables in INTO clause."));
+         else if ( dict_lookup_var (cmd->dict, name) != NULL )
+            msg (SE, _("Variable %s already exists."), name);
+          else if (string_set_contains (new_names, name))
+            msg (SE, _("Duplicate variable name %s."), name);
+          else
+            {
+              string_set_insert (new_names, name);
+              rs->dest_names[var_count++] = pool_strdup (cmd->pool, name);
+              lex_get (lexer);
+              continue;
+            }
+
+          /* Error path. */
+          return false;
+        }
     }
 
   return true;
@@ -617,12 +621,14 @@ fraction_name (const struct rank *cmd)
     }
 }
 
-/* Create a label on DEST_VAR, describing its derivation from SRC_VAR and F */
-static void
-create_var_label (struct rank *cmd, struct variable *dest_var,
-                 const struct variable *src_var, enum rank_func f)
+/* Returns a label for a variable derived from SRC_VAR with function F. */
+static const char *
+create_var_label (struct rank *cmd, const struct variable *src_var,
+                  enum rank_func f)
 {
   struct string label;
+  const char *pool_label;
+
   ds_init_empty (&label);
 
   if ( cmd->n_group_vars > 0 )
@@ -646,16 +652,20 @@ create_var_label (struct rank *cmd, struct variable 
*dest_var,
     ds_put_format (&label, _("%s of %s"),
                    function_name[f], var_get_name (src_var));
 
-  var_set_label (dest_var, ds_cstr (&label), false);
+  pool_label = pool_strdup (cmd->pool, ds_cstr (&label));
   
   ds_destroy (&label);
+
+  return pool_label;
 }
 
 int
 cmd_rank (struct lexer *lexer, struct dataset *ds)
 {
+  struct string_set new_names;
   struct rank rank;
   struct variable *order;
+  struct rank_spec *rs;
   bool result = true;
   int i;
 
@@ -672,6 +682,8 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
   rank.print = true;
   rank.pool = pool_create ();
 
+  string_set_init (&new_names);
+
   if (lex_match_id (lexer, "VARIABLES"))
     lex_force_match (lexer, T_EQUALS);
 
@@ -778,7 +790,7 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
              goto error;
            }
        }
-      else if (! parse_into (lexer, &rank))
+      else if (! parse_into (lexer, &rank, &new_names))
        goto error;
     }
 
@@ -786,33 +798,54 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
   /* If no rank specs are given, then apply a default */
   if ( rank.n_rs == 0)
     {
-      rank.rs = pool_calloc (rank.pool, 1, sizeof (*rank.rs));
+      struct rank_spec *rs;
+
+      rs = pool_calloc (rank.pool, 1, sizeof *rs);
+      rs->rfunc = RANK;
+      rs->dest_names = pool_calloc (rank.pool, 1, sizeof *rs->dest_names);
+
+      rank.rs = rs;
       rank.n_rs = 1;
-      rank.rs[0].rfunc = RANK;
-      rank.rs[0].destvars = pool_calloc (rank.pool, rank.n_vars, sizeof 
(*rank.rs[0].destvars));
     }
 
-  /* Create variables for all rank destinations which haven't
-     already been created with INTO.
-     Add labels to all the destination variables.
-  */
-  for (i = 0 ; i <  rank.n_rs ; ++i )
+  /* Choose variable names for all rank destinations which haven't already been
+     created with INTO. */
+  for (rs = rank.rs; rs < &rank.rs[rank.n_rs]; rs++)
     {
       int v;
-      struct rank_spec *rs = &rank.rs[i];
 
       for ( v = 0 ; v < rank.n_vars ;  v ++ )
-       {
-         if ( rs->destvars[v] == NULL )
-           {
-             rs->destvars[v] =
-               create_rank_variable (rank.dict, rs->rfunc, rank.vars[v], NULL);
-           }
+        {
+          const char **dst_name = &rs->dest_names[v];
+          if ( *dst_name == NULL )
+            {
+              *dst_name = rank_choose_dest_name (rank.dict, &new_names,
+                                                 rs->rfunc,
+                                                 var_get_name (rank.vars[v]));
+              if (*dst_name == NULL)
+                goto error;
+            }
+        }
+    }
 
-         create_var_label (&rank, rs->destvars[v],
-                           rank.vars[v],
-                           rs->rfunc);
-       }
+  /* Create variables. */
+  for (rs = rank.rs; rs < &rank.rs[rank.n_rs]; rs++)
+    rs->destvars = pool_nmalloc (rank.pool, rank.n_vars, sizeof *rs->destvars);
+  for ( i = 0 ; i < rank.n_vars ;  i ++ )
+    {
+      struct variable *var;
+
+      for (rs = rank.rs; rs < &rank.rs[rank.n_rs]; rs++)
+        {
+          var = dict_create_var_assert (dataset_dict (ds),
+                                        rs->dest_names[i], 0);
+          var_set_both_formats (var, &dest_format[rs->rfunc]);
+          var_set_label (var, create_var_label (&rank, rank.vars[i],
+                                                rs->rfunc),
+                         false);
+
+          rs->destvars[i] = var;
+        }
     }
 
   if ( rank.print )
@@ -845,7 +878,7 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
                    tab_output_text_format (0,
                                             _("%s into %s(%s of %s using %s BY 
%s)"),
                                             var_get_name (rank.vars[v]),
-                                            var_get_name 
(rank.rs[i].destvars[v]),
+                                            rank.rs[i].dest_names[v],
                                             function_name[rank.rs[i].rfunc],
                                             var_get_name (rank.vars[v]),
                                             fraction_name (&rank),
@@ -855,7 +888,7 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
                    tab_output_text_format (0,
                                             _("%s into %s(%s of %s BY %s)"),
                                             var_get_name (rank.vars[v]),
-                                            var_get_name 
(rank.rs[i].destvars[v]),
+                                            rank.rs[i].dest_names[v],
                                             function_name[rank.rs[i].rfunc],
                                             var_get_name (rank.vars[v]),
                                             ds_cstr (&varlist));
@@ -868,7 +901,7 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
                    tab_output_text_format (0,
                                             _("%s into %s(%s of %s using %s)"),
                                             var_get_name (rank.vars[v]),
-                                            var_get_name 
(rank.rs[i].destvars[v]),
+                                            rank.rs[i].dest_names[v],
                                             function_name[rank.rs[i].rfunc],
                                             var_get_name (rank.vars[v]),
                                             fraction_name (&rank));
@@ -877,7 +910,7 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
                    tab_output_text_format (0,
                                             _("%s into %s(%s of %s)"),
                                             var_get_name (rank.vars[v]),
-                                            var_get_name 
(rank.rs[i].destvars[v]),
+                                            rank.rs[i].dest_names[v],
                                             function_name[rank.rs[i].rfunc],
                                             var_get_name (rank.vars[v]));
                }
@@ -913,11 +946,13 @@ cmd_rank (struct lexer *lexer, struct dataset *ds)
   }
 
   destroy_rank (&rank);
+  string_set_destroy (&new_names);
   return CMD_SUCCESS;
 
  error:
 
   destroy_rank (&rank);
+  string_set_destroy (&new_names);
   return CMD_FAILURE;
 }
 
diff --git a/tests/language/stats/rank.at b/tests/language/stats/rank.at
index 04d5acc..61c4180 100644
--- a/tests/language/stats/rank.at
+++ b/tests/language/stats/rank.at
@@ -92,23 +92,23 @@ b,Format: F8.2,,2
 ,Measure: Scale,,
 ,Display Alignment: Right,,
 ,Display Width: 8,,
-count,N of a,,3
-,Format: F6.0,,
+Ra,RANK of a,,3
+,Format: F9.3,,
 ,Measure: Scale,,
 ,Display Alignment: Right,,
 ,Display Width: 8,,
-Ra,RANK of a,,4
-,Format: F9.3,,
+RFR001,RFRACTION of a,,4
+,Format: F6.4,,
 ,Measure: Scale,,
 ,Display Alignment: Right,,
 ,Display Width: 8,,
-Rb,RANK of b,,5
-,Format: F9.3,,
+count,N of a,,5
+,Format: F6.0,,
 ,Measure: Scale,,
 ,Display Alignment: Right,,
 ,Display Width: 8,,
-RFR001,RFRACTION of a,,6
-,Format: F6.4,,
+Rb,RANK of b,,6
+,Format: F9.3,,
 ,Measure: Scale,,
 ,Display Alignment: Right,,
 ,Display Width: 8,,
@@ -124,17 +124,17 @@ Nb,N of b,,8
 ,Display Width: 8,,
 
 Table: Data List
-a,b,count,Ra,Rb,RFR001,RFR002,Nb
-.00,24.00,10,10.000,8.000,1.0000,.8889,9
-1.00,32.00,10,9.000,4.000,.9000,.4444,9
-2.00,31.00,10,8.000,5.000,.8000,.5556,9
-2.00,32.00,10,8.000,4.000,.8000,.4444,9
-4.00,30.00,10,6.000,6.000,.6000,.6667,9
-5.00,29.00,10,5.000,7.000,.5000,.7778,9
-6.00,1.00,10,4.000,9.000,.4000,1.0000,9
-7.00,43.00,10,3.000,2.000,.3000,.2222,9
-8.00,.  ,10,2.000,.   ,.2000,.    ,.
-9.00,45.00,10,1.000,1.000,.1000,.1111,9
+a,b,Ra,RFR001,count,Rb,RFR002,Nb
+.00,24.00,10.000,1.0000,10,8.000,.8889,9
+1.00,32.00,9.000,.9000,10,4.000,.4444,9
+2.00,31.00,8.000,.8000,10,5.000,.5556,9
+2.00,32.00,8.000,.8000,10,4.000,.4444,9
+4.00,30.00,6.000,.6000,10,6.000,.6667,9
+5.00,29.00,5.000,.5000,10,7.000,.7778,9
+6.00,1.00,4.000,.4000,10,9.000,1.0000,9
+7.00,43.00,3.000,.3000,10,2.000,.2222,9
+8.00,.  ,2.000,.2000,10,.   ,.    ,.
+9.00,45.00,1.000,.1000,10,1.000,.1111,9
 ])
 AT_CLEANUP
 
-- 
1.7.10.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]