pspp-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 03/18] sys-file-reader: Refactor to clean up character encoding support.


From: Ben Pfaff
Subject: [PATCH 03/18] sys-file-reader: Refactor to clean up character encoding support.
Date: Sat, 19 Mar 2011 17:09:49 -0700

The system file format is unusual in that it does not record the encoding
used by character strings at the beginning or at any fixed place in the
file.  Instead, it can be recorded practically anywhere in the file.  It
never precedes all of the actual character strings in the file, which makes
it impossible to interpret those strings completely and correctly until it
is encountered.

Until now, the system file reader has dealt with this situation by
stuffing uninterpreted character strings into data structures until the
encoding is known, then at that point fetching out the character strings,
reencoding them, and stuffing them back into the data structures.  This
does work, but it has the disadvantage that all of the PSPP data
structures have to tolerate character strings with unknown encoding.  In
some cases this seems like an ugly situation.  For example, arbitrary
variable names have to be supported, even though the syntax for variable
names is circumscribed by the language, because the syntax rules for
variable names cannot be completely and correctly applied to a string that
is in an unknown encoding.

This commit fixes that problem by adopting a new way to read system files.
Each record in the system file dictionary is essentially slurped into
memory as a chunk.  Once all of the records have been read, the character
encoding is extracted from them, and then the rest of the dictionary is
interpreted based on that encoding.  The actual implementation is a little
more intricate because the format of system file records is somewhat
non-uniform.
---
 src/data/dictionary.c                      |    6 +-
 src/data/sys-file-reader.c                 | 2070 +++++++++++++++-------------
 src/data/sys-file-reader.h                 |    2 +-
 tests/data/sys-file-reader.at              |  237 ++--
 tests/language/dictionary/sys-file-info.at |    4 +-
 5 files changed, 1238 insertions(+), 1081 deletions(-)

diff --git a/src/data/dictionary.c b/src/data/dictionary.c
index 2ad94ca..467f347 100644
--- a/src/data/dictionary.c
+++ b/src/data/dictionary.c
@@ -1228,12 +1228,14 @@ dict_get_label (const struct dictionary *d)
 }
 
 /* Sets D's file label to LABEL, truncating it to a maximum of 60
-   characters. */
+   characters.
+
+   Removes D's label if LABEL is null or the empty string. */
 void
 dict_set_label (struct dictionary *d, const char *label)
 {
   free (d->label);
-  d->label = label != NULL ? xstrndup (label, 60) : NULL;
+  d->label = label != NULL && label[0] != '\0' ? xstrndup (label, 60) : NULL;
 }
 
 /* Returns the documents for D, or a null pointer if D has no
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c
index 8f39f47..ceb4e04 100644
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2009, 2010, 2011 Free Software 
Foundation, Inc.
+   Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -51,6 +51,7 @@
 
 #include "gl/c-ctype.h"
 #include "gl/inttostr.h"
+#include "gl/localcharset.h"
 #include "gl/minmax.h"
 #include "gl/unlocked-io.h"
 #include "gl/xalloc.h"
@@ -60,6 +61,73 @@
 #define _(msgid) gettext (msgid)
 #define N_(msgid) (msgid)
 
+enum
+  {
+    /* subtypes 0-2 unknown */
+    EXT_INTEGER       = 3,      /* Machine integer info. */
+    EXT_FLOAT         = 4,      /* Machine floating-point info. */
+    EXT_VAR_SETS      = 5,      /* Variable sets. */
+    EXT_DATE          = 6,      /* DATE. */
+    EXT_MRSETS        = 7,      /* Multiple response sets. */
+    EXT_DATA_ENTRY    = 8,      /* SPSS Data Entry. */
+    /* subtypes 9-10 unknown */
+    EXT_DISPLAY       = 11,     /* Variable display parameters. */
+    /* subtype 12 unknown */
+    EXT_LONG_NAMES    = 13,     /* Long variable names. */
+    EXT_LONG_STRINGS  = 14,     /* Long strings. */
+    /* subtype 15 unknown */
+    EXT_NCASES        = 16,     /* Extended number of cases. */
+    EXT_FILE_ATTRS    = 17,     /* Data file attributes. */
+    EXT_VAR_ATTRS     = 18,     /* Variable attributes. */
+    EXT_MRSETS2       = 19,     /* Multiple response sets (extended). */
+    EXT_ENCODING      = 20,     /* Character encoding. */
+    EXT_LONG_LABELS   = 21      /* Value labels for long strings. */
+  };
+
+struct sfm_var_record
+  {
+    off_t pos;
+    int width;
+    char name[8];
+    int print_format;
+    int write_format;
+    int missing_value_code;
+    uint8_t missing[24];
+    char *label;
+    struct variable *var;
+  };
+
+struct sfm_value_label
+  {
+    uint8_t value[8];
+    char *label;
+  };
+
+struct sfm_value_label_record
+  {
+    off_t pos;
+    struct sfm_value_label *labels;
+    size_t n_labels;
+
+    int *vars;
+    size_t n_vars;
+  };
+
+struct sfm_document_record
+  {
+    off_t pos;
+    char *documents;
+    size_t n_lines;
+  };
+
+struct sfm_extension_record
+  {
+    off_t pos;                  /* Starting offset in file. */
+    size_t size;                /* Size of data elements. */
+    size_t count;               /* Number of data elements. */
+    void *data;                 /* Contents. */
+  };
+
 /* System file reader. */
 struct sfm_reader
   {
@@ -71,17 +139,17 @@ struct sfm_reader
     struct file_handle *fh;     /* File handle. */
     struct fh_lock *lock;       /* Mutual exclusion for file handle. */
     FILE *file;                 /* File stream. */
+    off_t pos;                  /* Position in file. */
     bool error;                 /* I/O or corruption error? */
     struct caseproto *proto;    /* Format of output cases. */
 
     /* File format. */
     enum integer_format integer_format; /* On-disk integer format. */
     enum float_format float_format; /* On-disk floating point format. */
-    int oct_cnt;               /* Number of 8-byte units per case. */
     struct sfm_var *sfm_vars;   /* Variables. */
     size_t sfm_var_cnt;         /* Number of variables. */
     casenumber case_cnt;        /* Number of cases */
-    bool has_long_var_names;    /* File has a long variable name map */
+    const char *encoding;       /* String encoding. */
 
     /* Decompression. */
     bool compressed;           /* File is compressed? */
@@ -95,21 +163,17 @@ static const struct casereader_class 
sys_file_casereader_class;
 
 static bool close_reader (struct sfm_reader *);
 
-static struct variable **make_var_by_value_idx (struct sfm_reader *,
-                                                struct dictionary *);
-static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
-                                                 struct variable **,
-                                                 int value_idx);
-static struct variable *lookup_var_by_short_name (struct dictionary *,
-                                                  const char *short_name);
+static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
+                                             const struct sfm_var_record *,
+                                             size_t n, int idx);
 
-static void sys_msg (struct sfm_reader *r, int class,
+static void sys_msg (struct sfm_reader *r, off_t, int class,
                      const char *format, va_list args)
-     PRINTF_FORMAT (3, 0);
-static void sys_warn (struct sfm_reader *, const char *, ...)
-     PRINTF_FORMAT (2, 3);
-static void sys_error (struct sfm_reader *, const char *, ...)
-     PRINTF_FORMAT (2, 3)
+     PRINTF_FORMAT (4, 0);
+static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
+     PRINTF_FORMAT (3, 4);
+static void sys_error (struct sfm_reader *, off_t, const char *, ...)
+     PRINTF_FORMAT (3, 4)
      NO_RETURN;
 
 static void read_bytes (struct sfm_reader *, void *, size_t);
@@ -119,8 +183,27 @@ static double read_float (struct sfm_reader *);
 static void read_string (struct sfm_reader *, char *, size_t);
 static void skip_bytes (struct sfm_reader *, size_t);
 
-static struct text_record *open_text_record (struct sfm_reader *, size_t size);
-static void close_text_record (struct sfm_reader *r,
+static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
+static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
+
+static void read_variable_record (struct sfm_reader *,
+                                  struct sfm_var_record *);
+static void read_value_label_record (struct sfm_reader *,
+                                     struct sfm_value_label_record *,
+                                     size_t n_vars);
+static struct sfm_document_record *read_document_record (struct sfm_reader *);
+static struct sfm_extension_record *read_extension_record (
+  struct sfm_reader *, int subtype);
+static void skip_extension_record (struct sfm_reader *, int subtype);
+
+static const char *choose_encoding (
+  struct sfm_reader *,
+  const struct sfm_extension_record *ext_integer,
+  const struct sfm_extension_record *ext_encoding);
+
+static struct text_record *open_text_record (
+  struct sfm_reader *, const struct sfm_extension_record *);
+static void close_text_record (struct sfm_reader *,
                                struct text_record *);
 static bool read_variable_to_value_pair (struct sfm_reader *,
                                          struct dictionary *,
@@ -154,133 +237,83 @@ enum which_format
     WRITE_FORMAT
   };
 
-static void read_header (struct sfm_reader *, struct dictionary *,
-                         int *weight_idx, int *claimed_oct_cnt,
-                         struct sfm_read_info *);
-static void read_variable_record (struct sfm_reader *, struct dictionary *,
-                                  int *format_warning_cnt);
-static void parse_format_spec (struct sfm_reader *, unsigned int,
-                               enum which_format, struct variable *,
-                               int *format_warning_cnt);
-static void setup_weight (struct sfm_reader *, int weight_idx,
-                          struct variable **var_by_value_idx,
+static void read_header (struct sfm_reader *, int *weight_idx,
+                         int *claimed_oct_cnt, struct sfm_read_info *,
+                         char **file_labelp);
+static void parse_file_label (struct sfm_reader *, const char *file_label,
+                              struct dictionary *);
+static void parse_variable_records (struct sfm_reader *, struct dictionary *,
+                                    struct sfm_var_record *, size_t n);
+static void parse_format_spec (struct sfm_reader *, off_t pos,
+                               unsigned int format, enum which_format,
+                               struct variable *, int *format_warning_cnt);
+static void parse_document (struct dictionary *, struct sfm_document_record *);
+static void parse_display_parameters (struct sfm_reader *,
+                                      const struct sfm_extension_record *,
+                                      struct dictionary *);
+static void parse_machine_integer_info (struct sfm_reader *,
+                                        const struct sfm_extension_record *,
+                                        struct sfm_read_info *);
+static void parse_machine_float_info (struct sfm_reader *,
+                                      const struct sfm_extension_record *);
+static void parse_mrsets (struct sfm_reader *,
+                          const struct sfm_extension_record *,
                           struct dictionary *);
-static void read_documents (struct sfm_reader *, struct dictionary *);
-static void read_value_labels (struct sfm_reader *, struct dictionary *,
-                               struct variable **var_by_value_idx);
-
-static void read_extension_record (struct sfm_reader *, struct dictionary *,
-                                   struct sfm_read_info *);
-static void read_machine_integer_info (struct sfm_reader *,
-                                       size_t size, size_t count,
-                                       struct sfm_read_info *,
-                                      struct dictionary *
-                                      );
-static void read_machine_float_info (struct sfm_reader *,
-                                     size_t size, size_t count);
-static void read_mrsets (struct sfm_reader *, size_t size, size_t count,
-                         struct dictionary *);
-static void read_display_parameters (struct sfm_reader *,
-                                     size_t size, size_t count,
+static void parse_long_var_name_map (struct sfm_reader *,
+                                     const struct sfm_extension_record *,
                                      struct dictionary *);
-static void read_long_var_name_map (struct sfm_reader *,
-                                    size_t size, size_t count,
-                                    struct dictionary *);
-static void read_long_string_map (struct sfm_reader *,
-                                  size_t size, size_t count,
-                                  struct dictionary *);
-static void read_data_file_attributes (struct sfm_reader *,
-                                       size_t size, size_t count,
+static void parse_long_string_map (struct sfm_reader *,
+                                   const struct sfm_extension_record *,
+                                   struct dictionary *);
+static void parse_value_labels (struct sfm_reader *, struct dictionary *,
+                                const struct sfm_var_record *,
+                                size_t n_var_recs,
+                                const struct sfm_value_label_record *);
+static void parse_data_file_attributes (struct sfm_reader *,
+                                        const struct sfm_extension_record *,
+                                        struct dictionary *);
+static void parse_variable_attributes (struct sfm_reader *,
+                                       const struct sfm_extension_record *,
                                        struct dictionary *);
-static void read_variable_attributes (struct sfm_reader *,
-                                      size_t size, size_t count,
-                                      struct dictionary *);
-static void read_long_string_value_labels (struct sfm_reader *,
-                                          size_t size, size_t count,
-                                          struct dictionary *);
-
-/* Convert all the strings in DICT from the dict encoding to UTF8 */
-static void
-recode_strings (struct dictionary *dict)
-{
-  int i;
-
-  const char *enc = dict_get_encoding (dict);
-
-  if ( NULL == enc)
-    enc = get_default_encoding ();
-
-  for (i = 0 ; i < dict_get_var_cnt (dict); ++i)
-    {
-      /* Convert the long variable name */
-      struct variable *var = dict_get_var (dict, i);
-      const char *native_name = var_get_name (var);
-      char *utf8_name = recode_string (UTF8, enc, native_name, -1);
-      if ( 0 != strcmp (utf8_name, native_name))
-       {
-         if ( NULL == dict_lookup_var (dict, utf8_name))
-           dict_rename_var (dict, var, utf8_name);
-         else
-           msg (MW,
-            _("Recoded variable name duplicates an existing `%s' within system 
file."), utf8_name);
-    }
-
-      free (utf8_name);
-
-      /* Convert the variable label */
-      if (var_has_label (var))
-       {
-         char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1);
-         var_set_label (var, utf8_label);
-         free (utf8_label);
-       }
-
-      if (var_has_value_labels (var))
-       {
-         const struct val_lab *vl = NULL;
-         const struct val_labs *vlabs = var_get_value_labels (var);
-
-         for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next 
(vlabs, vl))
-           {
-             const union value *val = val_lab_get_value (vl);
-             const char *label = val_lab_get_label (vl);
-             char *new_label = NULL;
-
-             new_label = recode_string (UTF8, enc, label, -1);
-
-             var_replace_value_label (var, val, new_label);
-             free (new_label);
-           }
-       }
-    }
-}
+static void parse_long_string_value_labels (struct sfm_reader *,
+                                            const struct sfm_extension_record 
*,
+                                            struct dictionary *);
 
 /* Opens the system file designated by file handle FH for
    reading.  Reads the system file's dictionary into *DICT.
    If INFO is non-null, then it receives additional info about the
    system file. */
 struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
+sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
                  struct sfm_read_info *volatile info)
 {
   struct sfm_reader *volatile r = NULL;
-  struct variable **var_by_value_idx;
   struct sfm_read_info local_info;
-  int format_warning_cnt = 0;
+
+  struct sfm_var_record *vars;
+  size_t n_vars, allocated_vars;
+
+  struct sfm_value_label_record *labels;
+  size_t n_labels, allocated_labels;
+
+  struct sfm_document_record *document;
+
+  struct sfm_extension_record *extensions[32];
+
   int weight_idx;
   int claimed_oct_cnt;
-  int rec_type;
+  char *file_label;
 
-  *dict = dict_create ();
+  struct dictionary *dict = NULL;
+  size_t i;
 
   /* Create and initialize reader. */
   r = pool_create_container (struct sfm_reader, pool);
   r->fh = fh_ref (fh);
   r->lock = NULL;
   r->file = NULL;
+  r->pos = 0;
   r->error = false;
-  r->oct_cnt = 0;
-  r->has_long_var_names = false;
   r->opcode_idx = sizeof r->opcodes;
   r->corruption_warning = false;
 
@@ -306,97 +339,178 @@ sfm_open_reader (struct file_handle *fh, struct 
dictionary **dict,
   if (setjmp (r->bail_out))
     goto error;
 
-
   /* Read header. */
-  read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info);
+  read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
 
-  /* Read all the variable definition records. */
-  rec_type = read_int (r);
-  while (rec_type == 2)
-    {
-      read_variable_record (r, *dict, &format_warning_cnt);
-      rec_type = read_int (r);
-    }
+  vars = NULL;
+  n_vars = allocated_vars = 0;
+
+  labels = NULL;
+  n_labels = allocated_labels = 0;
 
-  /* Figure out the case format. */
-  var_by_value_idx = make_var_by_value_idx (r, *dict);
-  setup_weight (r, weight_idx, var_by_value_idx, *dict);
+  document = NULL;
 
-  /* Read all the rest of the dictionary records. */
-  while (rec_type != 999)
+  memset (extensions, 0, sizeof extensions);
+
+  for (;;)
     {
-      switch (rec_type)
+      int subtype;
+      int type;
+
+      type = read_int (r);
+      if (type == 999)
         {
+          read_int (r);         /* Skip filler. */
+          break;
+        }
+
+      switch (type)
+        {
+        case 2:
+          if (n_vars >= allocated_vars)
+            vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
+                                   sizeof *vars);
+          read_variable_record (r, &vars[n_vars++]);
+          break;
+
         case 3:
-          read_value_labels (r, *dict, var_by_value_idx);
+          if (n_labels >= allocated_labels)
+            labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
+                                     sizeof *labels);
+          read_value_label_record (r, &labels[n_labels++], n_vars);
           break;
 
         case 4:
-          sys_error (r, _("Misplaced type 4 record."));
+          /* A Type 4 record is always immediately after a type 3 record,
+             so the code for type 3 records reads the type 4 record too. */
+          sys_error (r, r->pos, _("Misplaced type 4 record."));
 
         case 6:
-          read_documents (r, *dict);
+          if (document != NULL)
+            sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
+          document = read_document_record (r);
           break;
 
         case 7:
-          read_extension_record (r, *dict, info);
+          subtype = read_int (r);
+          if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
+            {
+              sys_warn (r, r->pos,
+                        _("Unrecognized record type 7, subtype %d.  Please "
+                          "send a copy of this file, and the syntax which "
+                          "created it to %s."),
+                        subtype, PACKAGE_BUGREPORT);
+              skip_extension_record (r, subtype);
+            }
+          else if (extensions[subtype] != NULL)
+            {
+              sys_warn (r, r->pos,
+                        _("Record type 7, subtype %d found here has the same "
+                          "type as the record found near offset 0x%llx.  "
+                          "Please send a copy of this file, and the syntax "
+                          "which created it to %s."),
+                        subtype, (long long int) extensions[subtype]->pos,
+                        PACKAGE_BUGREPORT);
+              skip_extension_record (r, subtype);
+            }
+          else
+            extensions[subtype] = read_extension_record (r, subtype);
           break;
 
         default:
-          sys_error (r, _("Unrecognized record type %d."), rec_type);
+          sys_error (r, r->pos, _("Unrecognized record type %d."), type);
+          goto error;
         }
-      rec_type = read_int (r);
     }
 
+  /* Now actually parse what we read.
 
-  if ( ! r->has_long_var_names )
-    {
-      int i;
-      for (i = 0; i < dict_get_var_cnt (*dict); i++)
-       {
-         struct variable *var = dict_get_var (*dict, i);
-         char short_name[SHORT_NAME_LEN + 1];
-         char long_name[SHORT_NAME_LEN + 1];
+     First, figure out the correct character encoding, because this determines
+     how the rest of the header data is to be interpreted. */
+  dict = dict_create ();
+  r->encoding = choose_encoding (r, extensions[EXT_INTEGER],
+                                 extensions[EXT_ENCODING]);
+  dict_set_encoding (dict, r->encoding);
 
-         strcpy (short_name, var_get_name (var));
+  /* These records don't use variables at all. */
+  if (document != NULL)
+    parse_document (dict, document);
 
-         strcpy (long_name, short_name);
-         str_lowercase (long_name);
+  if (extensions[EXT_INTEGER] != NULL)
+    parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
 
-         /* Set long name.  Renaming a variable may clear the short
-            name, but we want to retain it, so re-set it
-            explicitly. */
-         dict_rename_var (*dict, var, long_name);
-         var_set_short_name (var, 0, short_name);
-       }
+  if (extensions[EXT_FLOAT] != NULL)
+    parse_machine_float_info (r, extensions[EXT_FLOAT]);
+
+  if (extensions[EXT_FILE_ATTRS] != NULL)
+    parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
+
+  parse_file_label (r, file_label, dict);
+
+  /* Parse the variable records, the basis of almost everything else. */
+  parse_variable_records (r, dict, vars, n_vars);
+
+  /* Parse value labels and the weight variable immediately after the variable
+     records.  These records use indexes into var_recs[], so we must parse them
+     before those indexes become invalidated by very long string variables. */
+  for (i = 0; i < n_labels; i++)
+    parse_value_labels (r, dict, vars, n_vars, &labels[i]);
+  if (weight_idx != 0)
+    {
+      struct variable *weight_var;
 
-      r->has_long_var_names = true;
+      weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
+      if (var_is_numeric (weight_var))
+        dict_set_weight (dict, weight_var);
+      else
+        sys_error (r, -1, _("Weighting variable must be numeric "
+                            "(not string variable `%s')."),
+                   var_get_name (weight_var));
     }
 
-  recode_strings (*dict);
+  if (extensions[EXT_DISPLAY] != NULL)
+    parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
+
+  /* The following records use short names, so they need to be parsed before
+     parse_long_var_name_map() changes short names to long names. */
+  if (extensions[EXT_MRSETS] != NULL)
+    parse_mrsets (r, extensions[EXT_MRSETS], dict);
+
+  if (extensions[EXT_MRSETS2] != NULL)
+    parse_mrsets (r, extensions[EXT_MRSETS2], dict);
 
-  /* Read record 999 data, which is just filler. */
-  read_int (r);
+  if (extensions[EXT_LONG_STRINGS] != NULL)
+    parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
+
+  /* Now rename variables to their long names. */
+  parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
+
+  /* The following records use long names, so they need to follow renaming. */
+  if (extensions[EXT_VAR_ATTRS] != NULL)
+    parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
+
+  if (extensions[EXT_LONG_LABELS] != NULL)
+    parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
 
   /* Warn if the actual amount of data per case differs from the
      amount that the header claims.  SPSS version 13 gets this
      wrong when very long strings are involved, so don't warn in
      that case. */
-  if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt
+  if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
       && info->version_major != 13)
-    sys_warn (r, _("File header claims %d variable positions but "
-                   "%d were read from file."),
-              claimed_oct_cnt, r->oct_cnt);
+    sys_warn (r, -1, _("File header claims %d variable positions but "
+                       "%d were read from file."),
+              claimed_oct_cnt, n_vars);
 
   /* Create an index of dictionary variable widths for
      sfm_read_case to use.  We cannot use the `struct variable's
      from the dictionary we created, because the caller owns the
      dictionary and may destroy or modify its variables. */
-  sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt);
+  sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
   pool_register (r->pool, free, r->sfm_vars);
-  r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool);
+  r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
 
-  pool_free (r->pool, var_by_value_idx);
+  *dictp = dict;
   return casereader_create_sequential
     (NULL, r->proto,
      r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
@@ -404,8 +518,8 @@ sfm_open_reader (struct file_handle *fh, struct dictionary 
**dict,
 
 error:
   close_reader (r);
-  dict_destroy (*dict);
-  *dict = NULL;
+  dict_destroy (dict);
+  *dictp = NULL;
   return NULL;
 }
 
@@ -462,18 +576,16 @@ sfm_detect (FILE *file)
   return !strcmp ("$FL2", rec_type);
 }
 
-/* Reads the global header of the system file.
-   Sets DICT's file label to the system file's label.
-   Sets *WEIGHT_IDX to 0 if the system file is unweighted,
-   or to the value index of the weight variable otherwise.
-   Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units)
-   per case that the file claims to have (although it is not
-   always correct).
-   Initializes INFO with header information. */
+/* Reads the global header of the system file.  Sets *WEIGHT_IDX to 0 if the
+   system file is unweighted, or to the value index of the weight variable
+   otherwise.  Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
+   case that the file claims to have (although it is not always correct).
+   Initializes INFO with header information.  Stores the file label as a string
+   in dictionary encoding into *FILE_LABELP. */
 static void
-read_header (struct sfm_reader *r, struct dictionary *dict,
-             int *weight_idx, int *claimed_oct_cnt,
-             struct sfm_read_info *info)
+read_header (struct sfm_reader *r, int *weight_idx,
+             int *claimed_oct_cnt, struct sfm_read_info *info,
+             char **file_labelp)
 {
   char rec_type[5];
   char eye_catcher[61];
@@ -482,14 +594,13 @@ read_header (struct sfm_reader *r, struct dictionary 
*dict,
   char creation_date[10];
   char creation_time[9];
   char file_label[65];
-  struct substring file_label_ss;
   struct substring product;
 
   read_string (r, rec_type, sizeof rec_type);
   read_string (r, eye_catcher, sizeof eye_catcher);
 
   if (strcmp ("$FL2", rec_type) != 0)
-    sys_error (r, _("This is not an SPSS system file."));
+    sys_error (r, 0, _("This is not an SPSS system file."));
 
   /* Identify integer format. */
   read_bytes (r, raw_layout_code, sizeof raw_layout_code);
@@ -499,7 +610,7 @@ read_header (struct sfm_reader *r, struct dictionary *dict,
                              &r->integer_format))
       || (r->integer_format != INTEGER_MSB_FIRST
           && r->integer_format != INTEGER_LSB_FIRST))
-    sys_error (r, _("This is not an SPSS system file."));
+    sys_error (r, 64, _("This is not an SPSS system file."));
 
   *claimed_oct_cnt = read_int (r);
   if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
@@ -520,9 +631,10 @@ read_header (struct sfm_reader *r, struct dictionary *dict,
       uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 
       if (memcmp (raw_bias, zero_bias, 8))
-        sys_warn (r, _("Compression bias is not the usual "
-                       "value of 100, or system file uses unrecognized "
-                       "floating-point format."));
+        sys_warn (r, r->pos - 8,
+                  _("Compression bias is not the usual "
+                    "value of 100, or system file uses unrecognized "
+                    "floating-point format."));
       else
         {
           /* Some software is known to write all-zeros to this
@@ -544,14 +656,6 @@ read_header (struct sfm_reader *r, struct dictionary *dict,
   read_string (r, file_label, sizeof file_label);
   skip_bytes (r, 3);
 
-  file_label_ss = ss_cstr (file_label);
-  ss_trim (&file_label_ss, ss_cstr (" "));
-  if (!ss_is_empty (file_label_ss))
-    {
-      ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
-      dict_set_label (dict, ss_data (file_label_ss));
-    }
-
   strcpy (info->creation_date, creation_date);
   strcpy (info->creation_time, creation_time);
   info->integer_format = r->integer_format;
@@ -564,65 +668,37 @@ read_header (struct sfm_reader *r, struct dictionary *dict,
   ss_trim (&product, ss_cstr (" "));
   str_copy_buf_trunc (info->product, sizeof info->product,
                       ss_data (product), ss_length (product));
+
+  *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
 }
 
-/* Reads a variable (type 2) record from R and adds the
-   corresponding variable to DICT.
-   Also skips past additional variable records for long string
-   variables. */
+/* Reads a variable (type 2) record from R into RECORD. */
 static void
-read_variable_record (struct sfm_reader *r, struct dictionary *dict,
-                      int *format_warning_cnt)
+read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
 {
-  int width;
   int has_variable_label;
-  int missing_value_code;
-  int print_format;
-  int write_format;
-  char name[9];
 
-  struct variable *var;
-  int nv;
+  memset (record, 0, sizeof *record);
 
-  width = read_int (r);
+  record->pos = r->pos;
+  record->width = read_int (r);
   has_variable_label = read_int (r);
-  missing_value_code = read_int (r);
-  print_format = read_int (r);
-  write_format = read_int (r);
-  read_string (r, name, sizeof name);
-  name[strcspn (name, " ")] = '\0';
-
-  /* Check variable name. */
-  if (name[0] == '$' || name[0] == '#')
-    sys_error (r, _("Variable name begins with invalid character `%c'."),
-               name[0]);
-  if (!var_is_plausible_name (name, false))
-    sys_error (r, _("Invalid variable name `%s'."), name);
-
-  /* Create variable. */
-  if (width < 0 || width > 255)
-    sys_error (r, _("Bad width %d for variable %s."), width, name);
-  var = dict_create_var (dict, name, width);
-  if (var == NULL)
-    sys_error (r, _("Duplicate variable name `%s'."), name);
-
-  /* Set the short name the same as the long name. */
-  var_set_short_name (var, 0, var_get_name (var));
-
-  /* Get variable label, if any. */
-  if (has_variable_label != 0 && has_variable_label != 1)
-    sys_error (r, _("Variable label indicator field is not 0 or 1."));
+  record->missing_value_code = read_int (r);
+  record->print_format = read_int (r);
+  record->write_format = read_int (r);
+  read_bytes (r, record->name, sizeof record->name);
+
   if (has_variable_label == 1)
     {
+      enum { MAX_LABEL_LEN = 255 };
       size_t len, read_len;
-      char label[255 + 1];
 
       len = read_int (r);
 
-      /* Read up to 255 bytes of label. */
-      read_len = MIN (sizeof label - 1, len);
-      read_string (r, label, read_len + 1);
-      var_set_label (var, label);
+      /* Read up to MAX_LABEL_LEN bytes of label. */
+      read_len = MIN (MAX_LABEL_LEN, len);
+      record->label = xmalloc (read_len + 1);
+      read_string (r, record->label, read_len + 1);
 
       /* Skip unread label bytes. */
       skip_bytes (r, len - read_len);
@@ -630,103 +706,355 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict,
       /* Skip label padding up to multiple of 4 bytes. */
       skip_bytes (r, ROUND_UP (len, 4) - len);
     }
+  else if (has_variable_label != 0)
+    sys_error (r, record->pos,
+               _("Variable label indicator field is not 0 or 1."));
 
   /* Set missing values. */
-  if (missing_value_code != 0)
+  if (record->missing_value_code != 0)
     {
-      struct missing_values mv;
-      int i;
-
-      mv_init_pool (r->pool, &mv, var_get_width (var));
-      if (var_is_numeric (var))
+      int code = record->missing_value_code;
+      if (record->width == 0)
         {
-          if (missing_value_code < -3 || missing_value_code > 3
-              || missing_value_code == -1)
-            sys_error (r, _("Numeric missing value indicator field is not "
-                            "-3, -2, 0, 1, 2, or 3."));
-          if (missing_value_code < 0)
-            {
-              double low = read_float (r);
-              double high = read_float (r);
-              mv_add_range (&mv, low, high);
-              missing_value_code = -missing_value_code - 2;
-            }
-          for (i = 0; i < missing_value_code; i++)
-            mv_add_num (&mv, read_float (r));
+          if (code < -3 || code > 3 || code == -1)
+            sys_error (r, record->pos,
+                       _("Numeric missing value indicator field is not "
+                         "-3, -2, 0, 1, 2, or 3."));
         }
       else
         {
-          int mv_width = MAX (width, 8);
-          union value value;
+          if (code < 1 || code > 3)
+            sys_error (r, record->pos,
+                       _("String missing value indicator field is not "
+                         "0, 1, 2, or 3."));
+        }
 
-          if (missing_value_code < 1 || missing_value_code > 3)
-            sys_error (r, _("String missing value indicator field is not "
-                            "0, 1, 2, or 3."));
+      read_bytes (r, record->missing, 8 * abs (code));
+    }
+}
 
-          value_init (&value, mv_width);
-          value_set_missing (&value, mv_width);
-          for (i = 0; i < missing_value_code; i++)
-            {
-              uint8_t *s = value_str_rw (&value, mv_width);
-              read_bytes (r, s, 8);
-              mv_add_str (&mv, s);
-            }
-          value_destroy (&value, mv_width);
-        }
-      var_set_missing_values (var, &mv);
+/* Reads value labels from R into RECORD. */
+static void
+read_value_label_record (struct sfm_reader *r,
+                         struct sfm_value_label_record *record,
+                         size_t n_vars)
+{
+  size_t i;
+
+  /* Read type 3 record. */
+  record->pos = r->pos;
+  record->n_labels = read_int (r);
+  if (record->n_labels > SIZE_MAX / sizeof *record->labels)
+    sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
+               record->n_labels);
+  record->labels = pool_nmalloc (r->pool, record->n_labels,
+                                 sizeof *record->labels);
+  for (i = 0; i < record->n_labels; i++)
+    {
+      struct sfm_value_label *label = &record->labels[i];
+      unsigned char label_len;
+      size_t padded_len;
+
+      read_bytes (r, label->value, sizeof label->value);
+
+      /* Read label length. */
+      read_bytes (r, &label_len, sizeof label_len);
+      padded_len = ROUND_UP (label_len + 1, 8);
+
+      /* Read label, padding. */
+      label->label = pool_malloc (r->pool, padded_len + 1);
+      read_bytes (r, label->label, padded_len - 1);
+      label->label[label_len] = '\0';
     }
 
-  /* Set formats. */
-  parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
-  parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
+  /* Read record type of type 4 record. */
+  if (read_int (r) != 4)
+    sys_error (r, r->pos - 4,
+               _("Variable index record (type 4) does not immediately "
+                 "follow value label record (type 3) as it should."));
+
+  /* Read number of variables associated with value label from type 4
+     record. */
+  record->n_vars = read_int (r);
+  if (record->n_vars < 1 || record->n_vars > n_vars)
+    sys_error (r, r->pos - 4,
+               _("Number of variables associated with a value label (%d) "
+                 "is not between 1 and the number of variables (%zu)."),
+               record->n_vars, n_vars);
+  record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
+  for (i = 0; i < record->n_vars; i++)
+    record->vars[i] = read_int (r);
+}
+
+/* Reads a document record from R and returns it. */
+static struct sfm_document_record *
+read_document_record (struct sfm_reader *r)
+{
+  struct sfm_document_record *record;
+  int n_lines;
+
+  record = pool_malloc (r->pool, sizeof *record);
+  record->pos = r->pos;
+
+  n_lines = read_int (r);
+  if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
+    sys_error (r, record->pos,
+               _("Number of document lines (%d) "
+                 "must be greater than 0 and less than %d."),
+               n_lines, INT_MAX / DOC_LINE_LENGTH);
+
+  record->n_lines = n_lines;
+  record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
+  read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines);
+
+  return record;
+}
+
+static void
+read_extension_record_header (struct sfm_reader *r, int subtype,
+                              struct sfm_extension_record *record)
+{
+  record->pos = r->pos;
+  record->size = read_int (r);
+  record->count = read_int (r);
+
+  /* Check that SIZE * COUNT + 1 doesn't overflow.  Adding 1
+     allows an extra byte for a null terminator, used by some
+     extension processing routines. */
+  if (record->size != 0
+      && size_overflow_p (xsum (1, xtimes (record->count, record->size))))
+    sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
+}
 
-  /* Account for values.
-     Skip long string continuation records, if any. */
-  nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
-  r->oct_cnt += nv;
-  if (width > 8)
+/* Reads an extension record from R; returns it, or NULL if it was skipped. */
+static struct sfm_extension_record *
+read_extension_record (struct sfm_reader *r, int subtype)
+{
+  struct extension_record_type
     {
-      int i;
+      int subtype;
+      int size;
+      int count;
+    };
+
+  static const struct extension_record_type types[] =
+    {
+      /* Implemented record types. */
+      { EXT_INTEGER,      4, 8 },
+      { EXT_FLOAT,        8, 3 },
+      { EXT_MRSETS,       1, 0 },
+      { EXT_DISPLAY,      4, 0 },
+      { EXT_LONG_NAMES,   1, 0 },
+      { EXT_LONG_STRINGS, 1, 0 },
+      { EXT_NCASES,       8, 2 },
+      { EXT_FILE_ATTRS,   1, 0 },
+      { EXT_VAR_ATTRS,    1, 0 },
+      { EXT_MRSETS2,      1, 0 },
+      { EXT_ENCODING,     1, 0 },
+      { EXT_LONG_LABELS,  1, 0 },
+
+      /* Ignored record types. */
+      { EXT_VAR_SETS,     0, 0 },
+      { EXT_DATE,         0, 0 },
+      { EXT_DATA_ENTRY,   0, 0 },
+    };
+
+  const struct extension_record_type *type;
+  struct sfm_extension_record *record;
+  size_t n_bytes;
+
+  record = pool_malloc (r->pool, sizeof *record);
+  read_extension_record_header (r, subtype, record);
+  n_bytes = record->count * record->size;
+
+  for (type = types; type < &types[sizeof types / sizeof *types]; type++)
+    if (subtype == type->subtype)
+      {
+        if (type->size > 0 && record->size != type->size)
+          sys_warn (r, record->pos,
+                    _("Record type 7, subtype %d has bad size %zu "
+                      "(expected %d)."), subtype, record->size, type->size);
+        else if (type->count > 0 && record->count != type->count)
+          sys_warn (r, record->pos,
+                    _("Record type 7, subtype %d has bad count %zu "
+                      "(expected %d)."), subtype, record->count, type->count);
+        else if (type->count == 0 && type->size == 0)
+          {
+            /* Ignore this record. */
+          }
+        else
+          {
+            char *data = pool_malloc (r->pool, n_bytes + 1);
+            data[n_bytes] = '\0';
+
+            record->data = data;
+            read_bytes (r, record->data, n_bytes);
+            return record;
+          }
+
+        goto skip;
+      }
+
+  sys_warn (r, record->pos,
+            _("Unrecognized record type 7, subtype %d.  Please send a "
+              "copy of this file, and the syntax which created it to %s."),
+            subtype, PACKAGE_BUGREPORT);
 
-      for (i = 1; i < nv; i++)
+skip:
+  skip_bytes (r, n_bytes);
+  return NULL;
+}
+
+static void
+skip_extension_record (struct sfm_reader *r, int subtype)
+{
+  struct sfm_extension_record record;
+
+  read_extension_record_header (r, subtype, &record);
+  skip_bytes (r, record.count * record.size);
+}
+
+static void
+parse_file_label (struct sfm_reader *r, const char *file_label,
+                  struct dictionary *dict)
+{
+  char *utf8_file_label;
+  size_t file_label_len;
+
+  utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
+                                        file_label, -1, r->pool);
+  file_label_len = strlen (utf8_file_label);
+  while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ')
+    file_label_len--;
+  utf8_file_label[file_label_len] = '\0';
+  dict_set_label (dict, utf8_file_label);
+}
+
+/* Parses the N_VAR_RECS variable (type 2) records in VAR_RECS and
+   adds the corresponding variables to DICT.
+   Also checks for and steps past the continuation records that
+   follow long string variables. */
+static void
+parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
+                        struct sfm_var_record *var_recs, size_t n_var_recs)
+{
+  const char *dict_encoding = dict_get_encoding (dict);
+  struct sfm_var_record *rec;
+  int n_warnings = 0;
+
+  for (rec = var_recs; rec < &var_recs[n_var_recs]; )
+    {
+      struct variable *var;
+      size_t n_values;
+      char *name;
+      size_t i;
+
+      name = recode_string_pool ("UTF-8", dict_encoding,
+                                 rec->name, 8, r->pool);
+      name[strcspn (name, " ")] = '\0';
+
+      if (!var_is_valid_name (name, false) || name[0] == '$' || name[0] == '#')
+        sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
+
+      if (rec->width < 0 || rec->width > 255)
+        sys_error (r, rec->pos,
+                   _("Bad width %d for variable %s."), rec->width, name);
+
+      var = rec->var = dict_create_var (dict, name, rec->width);
+      if (var == NULL)
+        sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
+
+      /* Set the short name the same as the long name. */
+      var_set_short_name (var, 0, name);
+
+      /* Get variable label, if any. */
+      if (rec->label)
         {
-          /* Check for record type 2 and width -1. */
-          if (read_int (r) != 2 || read_int (r) != -1)
-            sys_error (r, _("Missing string continuation record."));
-
-          /* Skip and ignore remaining continuation data. */
-          has_variable_label = read_int (r);
-          missing_value_code = read_int (r);
-          print_format = read_int (r);
-          write_format = read_int (r);
-          read_string (r, name, sizeof name);
-
-          /* Variable label fields on continuation records have
-             been spotted in system files created by "SPSS Power
-             Macintosh Release 6.1". */
-          if (has_variable_label)
-            skip_bytes (r, ROUND_UP (read_int (r), 4));
+          char *utf8_label;
+
+          utf8_label = recode_string_pool ("UTF-8", dict_encoding,
+                                           rec->label, -1, r->pool);
+          var_set_label (var, utf8_label);
         }
+
+      /* Set missing values. */
+      if (rec->missing_value_code != 0)
+        {
+          int width = var_get_width (var);
+          struct missing_values mv;
+
+          mv_init_pool (r->pool, &mv, width);
+          if (var_is_numeric (var))
+            {
+              bool has_range = rec->missing_value_code < 0;
+              int n_discrete = (has_range
+                                ? rec->missing_value_code == -3
+                                : rec->missing_value_code);
+              int ofs = 0;
+
+              if (has_range)
+                {
+                  double low = parse_float (r, rec->missing, 0);
+                  double high = parse_float (r, rec->missing, 8);
+                  mv_add_range (&mv, low, high);
+                  ofs += 16;
+                }
+
+              for (i = 0; i < n_discrete; i++)
+                {
+                  mv_add_num (&mv, parse_float (r, rec->missing, ofs));
+                  ofs += 8;
+                }
+            }
+          else
+            {
+              union value value;
+
+              value_init_pool (r->pool, &value, width);
+              value_set_missing (&value, width);
+              for (i = 0; i < rec->missing_value_code; i++)
+                {
+                  uint8_t *s = value_str_rw (&value, width);
+                  memcpy (s, rec->missing + 8 * i, MIN (width, 8));
+                  mv_add_str (&mv, s);
+                }
+            }
+          var_set_missing_values (var, &mv);
+        }
+
+      /* Set formats. */
+      parse_format_spec (r, rec->pos + 12, rec->print_format,
+                         PRINT_FORMAT, var, &n_warnings);
+      parse_format_spec (r, rec->pos + 16, rec->write_format,
+                         WRITE_FORMAT, var, &n_warnings);
+
+      /* Account for values.
+         Skip long string continuation records, if any. */
+      n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
+      for (i = 1; i < n_values; i++)
+        if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
+          sys_error (r, rec->pos, _("Missing string continuation record."));
+      rec += n_values;
     }
 }
 
 /* Translates the format spec from sysfile format to internal
    format. */
 static void
-parse_format_spec (struct sfm_reader *r, unsigned int s,
+parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
                    enum which_format which, struct variable *v,
-                   int *format_warning_cnt)
+                   int *n_warnings)
 {
-  const int max_format_warnings = 8;
+  const int max_warnings = 8;
+  uint8_t raw_type = format >> 16;
+  uint8_t w = format >> 8;
+  uint8_t d = format;
   struct fmt_spec f;
-  uint8_t raw_type = s >> 16;
-  uint8_t w = s >> 8;
-  uint8_t d = s;
 
   bool ok;
 
   if (!fmt_from_io (raw_type, &f.type))
-    sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type);
+    sys_error (r, pos, _("Unknown variable format %"PRIu8"."), raw_type);
   f.w = w;
   f.d = d;
 
@@ -741,192 +1069,58 @@ parse_format_spec (struct sfm_reader *r, unsigned int s,
       else
         var_set_write_format (v, &f);
     }
-  else if (++*format_warning_cnt <= max_format_warnings)
+  else if (++*n_warnings <= max_warnings)
     {
       char fmt_string[FMT_STRING_LEN_MAX + 1];
-      sys_warn (r, _("%s variable %s has invalid %s format %s."),
+      sys_warn (r, pos, _("%s variable %s has invalid %s format %s."),
                 var_is_numeric (v) ? _("Numeric") : _("String"),
                 var_get_name (v),
                 which == PRINT_FORMAT ? _("print") : _("write"),
                 fmt_to_string (&f, fmt_string));
 
-      if (*format_warning_cnt == max_format_warnings)
-        sys_warn (r, _("Suppressing further invalid format warnings."));
+      if (*n_warnings == max_warnings)
+        sys_warn (r, -1, _("Suppressing further invalid format warnings."));
     }
 }
 
-/* Sets the weighting variable in DICT to the variable
-   corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
-   nonzero. */
 static void
-setup_weight (struct sfm_reader *r, int weight_idx,
-              struct variable **var_by_value_idx, struct dictionary *dict)
+parse_document (struct dictionary *dict, struct sfm_document_record *record)
 {
-  if (weight_idx != 0)
-    {
-      struct variable *weight_var
-        = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
-      if (var_is_numeric (weight_var))
-        dict_set_weight (dict, weight_var);
-      else
-        sys_error (r, _("Weighting variable must be numeric "
-                        "(not string variable `%s')."),
-                   var_get_name (weight_var));
-    }
-}
+  const char *p;
 
-/* Reads a document record, type 6, from system file R, and sets up
-   the documents and n_documents fields in the associated
-   dictionary. */
-static void
-read_documents (struct sfm_reader *r, struct dictionary *dict)
-{
-  int line_cnt;
-  char *documents;
-
-  if (dict_get_documents (dict) != NULL)
-    sys_error (r, _("Multiple type 6 (document) records."));
-
-  line_cnt = read_int (r);
-  if (line_cnt <= 0)
-    sys_error (r, _("Number of document lines (%d) "
-                    "must be greater than 0."), line_cnt);
-
-  documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
-  read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
-  if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
-    dict_set_documents (dict, documents);
-  else
-    sys_error (r, _("Document line contains null byte."));
-  pool_free (r->pool, documents);
-}
-
-/* Read a type 7 extension record. */
-static void
-read_extension_record (struct sfm_reader *r, struct dictionary *dict,
-                       struct sfm_read_info *info)
-{
-  int subtype = read_int (r);
-  size_t size = read_int (r);
-  size_t count = read_int (r);
-  size_t bytes = size * count;
-
-  /* Check that SIZE * COUNT + 1 doesn't overflow.  Adding 1
-     allows an extra byte for a null terminator, used by some
-     extension processing routines. */
-  if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
-    sys_error (r, "Record type 7 subtype %d too large.", subtype);
-
-  switch (subtype)
+  for (p = record->documents;
+       p < record->documents + DOC_LINE_LENGTH * record->n_lines;
+       p += DOC_LINE_LENGTH)
     {
-    case 3:
-      read_machine_integer_info (r, size, count, info, dict);
-      return;
-
-    case 4:
-      read_machine_float_info (r, size, count);
-      return;
-
-    case 5:
-      /* Variable sets information.  We don't use these yet.
-         They only apply to GUIs; see VARSETS on the APPLY
-         DICTIONARY command in SPSS documentation. */
-      break;
-
-    case 6:
-      /* DATE variable information.  We don't use it yet, but we
-         should. */
-      break;
-
-    case 7:
-    case 19:
-      read_mrsets (r, size, count, dict);
-      return;
-
-    case 8:
-      /* Used by the SPSS Data Entry software. */
-      break;
-
-    case 11:
-      read_display_parameters (r, size, count, dict);
-      return;
-
-    case 13:
-      read_long_var_name_map (r, size, count, dict);
-      return;
-
-    case 14:
-      read_long_string_map (r, size, count, dict);
-      return;
-
-    case 16:
-      /* Extended number of cases.  Not important. */
-      break;
+      struct substring line;
 
-    case 17:
-      read_data_file_attributes (r, size, count, dict);
-      return;
-
-    case 18:
-      read_variable_attributes (r, size, count, dict);
-      return;
+      line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
+                                    ss_buffer (p, DOC_LINE_LENGTH), NULL);
+      ss_rtrim (&line, ss_cstr (" "));
+      line.string[line.length] = '\0';
 
-    case 20:
-      /* New in SPSS 16.  Contains a single string that describes
-         the character encoding, e.g. "windows-1252". */
-      {
-       char *encoding = pool_calloc (r->pool, size, count + 1);
-       read_string (r, encoding, count + 1);
-       dict_set_encoding (dict, encoding);
-       return;
-      }
-
-    case 21:
-      /* New in SPSS 16.  Encodes value labels for long string
-         variables. */
-      read_long_string_value_labels (r, size, count, dict);
-      return;
+      dict_add_document_line (dict, line.string);
 
-    default:
-      sys_warn (r, _("Unrecognized record type 7, subtype %d.  Please send "
-                     "a copy of this file, and the syntax which created it "
-                     "to %s."),
-               subtype, PACKAGE_BUGREPORT);
-      break;
+      ss_dealloc (&line);
     }
-
-  skip_bytes (r, bytes);
 }
 
-/* Read record type 7, subtype 3. */
+/* Parses record type 7, subtype 3. */
 static void
-read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
-                           struct sfm_read_info *info,
-                          struct dictionary *dict)
+parse_machine_integer_info (struct sfm_reader *r,
+                            const struct sfm_extension_record *record,
+                            struct sfm_read_info *info)
 {
-  int version_major = read_int (r);
-  int version_minor = read_int (r);
-  int version_revision = read_int (r);
-  int machine_code UNUSED = read_int (r);
-  int float_representation = read_int (r);
-  int compression_code UNUSED = read_int (r);
-  int integer_representation = read_int (r);
-  int character_code = read_int (r);
-
-  int expected_float_format;
-  int expected_integer_format;
-
-  if (size != 4 || count != 8)
-    sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, "
-                    "subtype 3."),
-                size, count);
+  int float_representation, expected_float_format;
+  int integer_representation, expected_integer_format;
 
   /* Save version info. */
-  info->version_major = version_major;
-  info->version_minor = version_minor;
-  info->version_revision = version_revision;
+  info->version_major = parse_int (r, record->data, 0);
+  info->version_minor = parse_int (r, record->data, 4);
+  info->version_revision = parse_int (r, record->data, 8);
 
   /* Check floating point format. */
+  float_representation = parse_int (r, record->data, 16);
   if (r->float_format == FLOAT_IEEE_DOUBLE_BE
       || r->float_format == FLOAT_IEEE_DOUBLE_LE)
     expected_float_format = 1;
@@ -937,11 +1131,12 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
   else
     NOT_REACHED ();
   if (float_representation != expected_float_format)
-    sys_error (r, _("Floating-point representation indicated by "
-                    "system file (%d) differs from expected (%d)."),
-              float_representation, expected_float_format);
+    sys_error (r, record->pos, _("Floating-point representation indicated by "
+                 "system file (%d) differs from expected (%d)."),
+               float_representation, expected_float_format);
 
   /* Check integer format. */
+  integer_representation = parse_int (r, record->data, 24);
   if (r->integer_format == INTEGER_MSB_FIRST)
     expected_integer_format = 1;
   else if (r->integer_format == INTEGER_LSB_FIRST)
@@ -949,88 +1144,94 @@ read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count,
   else
     NOT_REACHED ();
   if (integer_representation != expected_integer_format)
-    sys_warn (r, _("Integer format indicated by system file (%d) "
-                   "differs from expected (%d)."),
+    sys_warn (r, record->pos,
+              _("Integer format indicated by system file (%d) "
+                "differs from expected (%d)."),
               integer_representation, expected_integer_format);
 
-  /*
-    Record 7 (20) provides a much more reliable way of
-    setting the encoding.
-    The character_code is used as a fallback only.
-  */
-  if ( NULL == dict_get_encoding (dict))
+}
+
+static const char *
+choose_encoding (struct sfm_reader *r,
+                 const struct sfm_extension_record *ext_integer,
+                 const struct sfm_extension_record *ext_encoding)
+{
+  /* The EXT_ENCODING record is a more reliable way to determine dictionary
+     encoding. */
+  if (ext_encoding)
+    return ext_encoding->data;
+
+  /* But EXT_INTEGER is better than nothing as a fallback. */
+  if (ext_integer)
     {
-      switch (character_code)
-       {
-       case 1:
-         dict_set_encoding (dict, "EBCDIC-US");
-         break;
-       case 2:
-       case 3:
-         /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
-            respectively.   However, there are known to be many files
-            in the wild with character code 2, yet have data which are
-            clearly not ascii.
-            Therefore we ignore these values.
-         */
-         return;
-       case 4:
-         dict_set_encoding (dict, "MS_KANJI");
-         break;
-       case 65000:
-         dict_set_encoding (dict, "UTF-7");
-         break;
-       case 65001:
-         dict_set_encoding (dict, "UTF-8");
-         break;
-       default:
-         {
-           char enc[100];
-           snprintf (enc, 100, "CP%d", character_code);
-           dict_set_encoding (dict, enc);
-         }
-         break;
-       };
+      int codepage = parse_int (r, ext_integer->data, 7 * 4);
+
+      switch (codepage)
+        {
+        case 1:
+          return "EBCDIC-US";
+
+        case 2:
+        case 3:
+          /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+             respectively.  However, there are known to be many files in the wild
+             with character code 2, yet have data which are clearly not ASCII.
+             Therefore we ignore these values. */
+          break;
+
+        case 4:
+          return "MS_KANJI";
+
+        case 65000:
+          return "UTF-7";
+
+        case 65001:
+          return "UTF-8";
+
+        default:
+          return pool_asprintf (r->pool, "CP%d", codepage);
+        }
     }
+
+  return locale_charset ();
 }
 
-/* Read record type 7, subtype 4. */
+/* Parses record type 7, subtype 4. */
 static void
-read_machine_float_info (struct sfm_reader *r, size_t size, size_t count)
+parse_machine_float_info (struct sfm_reader *r,
+                          const struct sfm_extension_record *record)
 {
-  double sysmis = read_float (r);
-  double highest = read_float (r);
-  double lowest = read_float (r);
-
-  if (size != 8 || count != 3)
-    sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."),
-               size, count);
+  double sysmis = parse_float (r, record->data, 0);
+  double highest = parse_float (r, record->data, 8);
+  double lowest = parse_float (r, record->data, 16);
 
   if (sysmis != SYSMIS)
-    sys_warn (r, _("File specifies unexpected value %g as %s."),
+    sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
               sysmis, "SYSMIS");
 
   if (highest != HIGHEST)
-    sys_warn (r, _("File specifies unexpected value %g as %s."),
+    sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
               highest, "HIGHEST");
 
   if (lowest != LOWEST)
-    sys_warn (r, _("File specifies unexpected value %g as %s."),
+    sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
               lowest, "LOWEST");
 }
 
-/* Read record type 7, subtype 7 or 19. */
+/* Parses record type 7, subtype 7 or 19. */
 static void
-read_mrsets (struct sfm_reader *r, size_t size, size_t count,
-             struct dictionary *dict)
+parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
+              struct dictionary *dict)
 {
   struct text_record *text;
   struct mrset *mrset;
 
-  text = open_text_record (r, size * count);
+  text = open_text_record (r, record);
   for (;;)
     {
-      const char *name, *label, *counted;
+      const char *counted = NULL;
+      const char *name;
+      const char *label;
       struct stringi_set var_names;
       size_t allocated_vars;
       char delimiter;
@@ -1045,8 +1246,9 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
 
       if (mrset->name[0] != '$')
         {
-          sys_warn (r, _("`%s' does not begin with `$' at offset %zu "
-                         "in MRSETS record."), mrset->name, text_pos (text));
+          sys_warn (r, record->pos,
+                    _("`%s' does not begin with `$' at UTF-8 offset %zu "
+                      "in MRSETS record."), mrset->name, text_pos (text));
           break;
         }
 
@@ -1055,8 +1257,9 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
           mrset->type = MRSET_MC;
           if (!text_match (text, ' '))
             {
-              sys_warn (r, _("Missing space following `%c' at offset %zu "
-                             "in MRSETS record."), 'C', text_pos (text));
+              sys_warn (r, record->pos,
+                        _("Missing space following `%c' at UTF-8 offset %zu "
+                          "in MRSETS record."), 'C', text_pos (text));
               break;
             }
         }
@@ -1073,8 +1276,9 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
           mrset->cat_source = MRSET_COUNTEDVALUES;
           if (!text_match (text, ' '))
             {
-              sys_warn (r, _("Missing space following `%c' at offset %zu "
-                             "in MRSETS record."), 'E',  text_pos (text));
+              sys_warn (r, record->pos,
+                        _("Missing space following `%c' at UTF-8 offset %zu "
+                          "in MRSETS record."), 'E',  text_pos (text));
               break;
             }
 
@@ -1082,14 +1286,16 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
           if (!strcmp (number, "11"))
             mrset->label_from_var_label = true;
           else if (strcmp (number, "1"))
-            sys_warn (r, _("Unexpected label source value `%s' "
-                           "following `E' at offset %zu in MRSETS record."),
+            sys_warn (r, record->pos,
+                      _("Unexpected label source value `%s' following `E' "
+                        "at UTF-8 offset %zu in MRSETS record."),
                       number, text_pos (text));
         }
       else
         {
-          sys_warn (r, _("Missing `C', `D', or `E' at offset %zu "
-                         "in MRSETS record."),
+          sys_warn (r, record->pos,
+                    _("Missing `C', `D', or `E' at UTF-8 offset %zu "
+                      "in MRSETS record."),
                     text_pos (text));
           break;
         }
@@ -1117,19 +1323,21 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
           var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
           if (var_name == NULL)
             {
-              sys_warn (r, _("Missing new-line parsing variable names "
-                             "at offset %zu in MRSETS record."),
+              sys_warn (r, record->pos,
+                        _("Missing new-line parsing variable names "
+                          "at UTF-8 offset %zu in MRSETS record."),
                         text_pos (text));
               break;
             }
 
-          var = lookup_var_by_short_name (dict, var_name);
+          var = dict_lookup_var (dict, var_name);
           if (var == NULL)
             continue;
           if (!stringi_set_insert (&var_names, var_name))
             {
-              sys_warn (r, _("Duplicate variable name %s "
-                             "at offset %zu in MRSETS record."),
+              sys_warn (r, record->pos,
+                        _("Duplicate variable name %s "
+                          "at UTF-8 offset %zu in MRSETS record."),
                         var_name, text_pos (text));
               continue;
             }
@@ -1141,8 +1349,9 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
           if (mrset->n_vars
               && var_get_type (var) != var_get_type (mrset->vars[0]))
             {
-              sys_warn (r, _("MRSET %s contains both string and "
-                             "numeric variables."), name);
+              sys_warn (r, record->pos,
+                        _("MRSET %s contains both string and "
+                          "numeric variables."), name);
               continue;
             }
           width = MIN (width, var_get_width (var));
@@ -1156,7 +1365,8 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
 
       if (mrset->n_vars < 2)
         {
-          sys_warn (r, _("MRSET %s has only %zu variables."), mrset->name,
+          sys_warn (r, record->pos,
+                    _("MRSET %s has only %zu variables."), mrset->name,
                     mrset->n_vars);
           mrset_destroy (mrset);
           continue;
@@ -1184,40 +1394,48 @@ read_mrsets (struct sfm_reader *r, size_t size, size_t count,
 /* Read record type 7, subtype 11, which specifies how variables
    should be displayed in GUI environments. */
 static void
-read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
+parse_display_parameters (struct sfm_reader *r,
+                         const struct sfm_extension_record *record,
                          struct dictionary *dict)
 {
-  size_t n_vars;
   bool includes_width;
   bool warned = false;
+  size_t n_vars;
+  size_t ofs;
   size_t i;
 
-  if (size != 4)
-    {
-      sys_warn (r, _("Bad size %zu on extension 11."), size);
-      skip_bytes (r, size * count);
-      return;
-    }
-
   n_vars = dict_get_var_cnt (dict);
-  if (count == 3 * n_vars)
+  if (record->count == 3 * n_vars)
     includes_width = true;
-  else if (count == 2 * n_vars)
+  else if (record->count == 2 * n_vars)
     includes_width = false;
   else
     {
-      sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."),
-                count, n_vars);
-      skip_bytes (r, size * count);
+      sys_warn (r, record->pos,
+                _("Extension 11 has bad count %zu (for %zu variables)."),
+                record->count, n_vars);
       return;
     }
 
+  ofs = 0;
   for (i = 0; i < n_vars; ++i)
     {
       struct variable *v = dict_get_var (dict, i);
-      int measure = read_int (r);
-      int width = includes_width ? read_int (r) : 0;
-      int align = read_int (r);
+      int measure, width, align;
+
+      measure = parse_int (r, record->data, ofs);
+      ofs += 4;
+
+      if (includes_width)
+        {
+          width = parse_int (r, record->data, ofs);
+          ofs += 4;
+        }
+      else
+        width = 0;
+
+      align = parse_int (r, record->data, ofs);
+      ofs += 4;
 
       /* SPSS 14 sometimes seems to set string variables' measure
          to zero. */
@@ -1227,9 +1445,9 @@ read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
       if (measure < 1 || measure > 3 || align < 0 || align > 2)
         {
           if (!warned)
-            sys_warn (r, _("Invalid variable display parameters "
-                           "for variable %zu (%s).  "
-                           "Default parameters substituted."),
+            sys_warn (r, record->pos,
+                      _("Invalid variable display parameters for variable "
+                        "%zu (%s).  Default parameters substituted."),
                       i, var_get_name (v));
           warned = true;
           continue;
@@ -1250,29 +1468,82 @@ read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
     }
 }
 
-/* Reads record type 7, subtype 13, which gives the long name
-   that corresponds to each short name.  Modifies variable names
-   in DICT accordingly.  */
 static void
-read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
-                        struct dictionary *dict)
+rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
+                                 const char *new_name)
+{
+  size_t n_short_names;
+  char **short_names;
+  size_t i;
+
+  /* Renaming a variable may clear its short names, but we
+     want to retain them, so we save them and re-set them
+     afterward. */
+  n_short_names = var_get_short_name_cnt (var);
+  short_names = xnmalloc (n_short_names, sizeof *short_names);
+  for (i = 0; i < n_short_names; i++)
+    {
+      const char *s = var_get_short_name (var, i);
+      short_names[i] = s != NULL ? xstrdup (s) : NULL;
+    }
+
+  /* Set long name. */
+  dict_rename_var (dict, var, new_name);
+
+  /* Restore short names. */
+  for (i = 0; i < n_short_names; i++)
+    {
+      var_set_short_name (var, i, short_names[i]);
+      free (short_names[i]);
+    }
+  free (short_names);
+}
+
+/* Parses record type 7, subtype 13, which gives the long name that corresponds
+   to each short name.  Modifies variable names in DICT accordingly.  */
+static void
+parse_long_var_name_map (struct sfm_reader *r,
+                         const struct sfm_extension_record *record,
+                         struct dictionary *dict)
 {
   struct text_record *text;
   struct variable *var;
   char *long_name;
 
-  text = open_text_record (r, size * count);
-  while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
+  if (record == NULL)
     {
-      char **short_names;
-      size_t short_name_cnt;
+      /* Convert variable names to lowercase. */
       size_t i;
 
+      for (i = 0; i < dict_get_var_cnt (dict); i++)
+       {
+         struct variable *var = dict_get_var (dict, i);
+          char *new_name;
+
+          new_name = xstrdup (var_get_name (var));
+         str_lowercase (new_name);
+
+          rename_var_and_save_short_names (dict, var, new_name);
+
+          free (new_name);
+       }
+
+      return;
+    }
+
+  /* Rename each of the variables, one by one.  (In a correctly constructed
+     system file, this cannot create any intermediate duplicate variable names,
+     because all of the new variable names are longer than any of the old
+     variable names and thus there cannot be any overlaps.) */
+  text = open_text_record (r, record);
+  while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
+    {
       /* Validate long name. */
       if (!var_is_valid_name (long_name, false))
         {
-          sys_warn (r, _("Long variable mapping from %s to invalid "
-                         "variable name `%s'."),
+          sys_warn (r, record->pos,
+                    _("Long variable mapping from %s to invalid "
+                      "variable name `%s'."),
                     var_get_name (var), long_name);
           continue;
         }
@@ -1281,47 +1552,28 @@ read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
       if (strcasecmp (var_get_short_name (var, 0), long_name)
           && dict_lookup_var (dict, long_name) != NULL)
         {
-          sys_warn (r, _("Duplicate long variable name `%s'."), long_name);
+          sys_warn (r, record->pos,
+                    _("Duplicate long variable name `%s'."), long_name);
           continue;
         }
 
-      /* Renaming a variable may clear its short names, but we
-         want to retain them, so we save them and re-set them
-         afterward. */
-      short_name_cnt = var_get_short_name_cnt (var);
-      short_names = xnmalloc (short_name_cnt, sizeof *short_names);
-      for (i = 0; i < short_name_cnt; i++)
-        {
-          const char *s = var_get_short_name (var, i);
-          short_names[i] = s != NULL ? xstrdup (s) : NULL;
-        }
-
-      /* Set long name. */
-      dict_rename_var (dict, var, long_name);
-
-      /* Restore short names. */
-      for (i = 0; i < short_name_cnt; i++)
-        {
-          var_set_short_name (var, i, short_names[i]);
-          free (short_names[i]);
-        }
-      free (short_names);
+      rename_var_and_save_short_names (dict, var, long_name);
     }
   close_text_record (r, text);
-  r->has_long_var_names = true;
 }
 
 /* Reads record type 7, subtype 14, which gives the real length
    of each very long string.  Rearranges DICT accordingly. */
 static void
-read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
-                      struct dictionary *dict)
+parse_long_string_map (struct sfm_reader *r,
+                       const struct sfm_extension_record *record,
+                       struct dictionary *dict)
 {
   struct text_record *text;
   struct variable *var;
   char *length_s;
 
-  text = open_text_record (r, size * count);
+  text = open_text_record (r, record);
   while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
     {
       size_t idx = var_get_dict_index (var);
@@ -1333,8 +1585,9 @@ read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
       length = strtol (length_s, NULL, 10);
       if (length < 1 || length > MAX_STRING)
         {
-          sys_warn (r, _("%s listed as string of invalid length %s "
-                         "in very long string record."),
+          sys_warn (r, record->pos,
+                    _("%s listed as string of invalid length %s "
+                      "in very long string record."),
                     var_get_name (var), length_s);
           continue;
         }
@@ -1343,13 +1596,15 @@ read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
       segment_cnt = sfm_width_to_segments (length);
       if (segment_cnt == 1)
         {
-          sys_warn (r, _("%s listed in very long string record with width %s, "
-                         "which requires only one segment."),
+          sys_warn (r, record->pos,
+                    _("%s listed in very long string record with width %s, "
+                      "which requires only one segment."),
                     var_get_name (var), length_s);
           continue;
         }
       if (idx + segment_cnt > dict_get_var_cnt (dict))
-        sys_error (r, _("Very long string %s overflows dictionary."),
+        sys_error (r, record->pos,
+                   _("Very long string %s overflows dictionary."),
                    var_get_name (var));
 
       /* Get the short names from the segments and check their
@@ -1363,8 +1618,9 @@ read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
           if (i > 0)
             var_set_short_name (var, i, var_get_short_name (seg, 0));
           if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
-            sys_error (r, _("Very long string with width %ld has segment %d "
-                            "of width %d (expected %d)."),
+            sys_error (r, record->pos,
+                       _("Very long string with width %ld has segment %d "
+                         "of width %d (expected %d)."),
                        length, i, width, alloc_width);
         }
       dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
@@ -1374,148 +1630,117 @@ read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
   dict_compact_values (dict);
 }
 
-/* Reads value labels from sysfile H and inserts them into the
-   associated dictionary. */
 static void
-read_value_labels (struct sfm_reader *r,
-                   struct dictionary *dict, struct variable **var_by_value_idx)
+parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
+                    const struct sfm_var_record *var_recs, size_t n_var_recs,
+                    const struct sfm_value_label_record *record)
 {
-  struct pool *subpool;
+  struct variable **vars;
+  char **utf8_labels;
+  size_t i;
 
-  struct label
+  utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
+  for (i = 0; i < record->n_labels; i++)
+    utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
+                                         record->labels[i].label, -1,
+                                         r->pool);
+
+  vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
+  for (i = 0; i < record->n_vars; i++)
+    vars[i] = lookup_var_by_index (r, record->pos,
+                                   var_recs, n_var_recs, record->vars[i]);
+
+  for (i = 1; i < record->n_vars; i++)
+    if (var_get_type (vars[i]) != var_get_type (vars[0]))
+      sys_error (r, record->pos,
+                 _("Variables associated with value label are not all of "
+                   "identical type.  Variable %s is %s, but variable "
+                   "%s is %s."),
+                 var_get_name (vars[0]),
+                 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
+                 var_get_name (vars[i]),
+                 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
+
+  for (i = 0; i < record->n_vars; i++)
     {
-      uint8_t raw_value[8];        /* Value as uninterpreted bytes. */
-      union value value;        /* Value. */
-      char *label;              /* Null-terminated label string. */
-    };
-
-  struct label *labels = NULL;
-  int label_cnt;               /* Number of labels. */
-
-  struct variable **var = NULL;        /* Associated variables. */
-  int var_cnt;                 /* Number of associated variables. */
-  int max_width;                /* Maximum width of string variables. */
-
-  int i;
-
-  subpool = pool_create_subpool (r->pool);
-
-  /* Read the type 3 record and record its contents.  We can't do
-     much with the data yet because we don't know whether it is
-     of numeric or string type. */
-
-  /* Read number of labels. */
-  label_cnt = read_int (r);
+      struct variable *var = vars[i];
+      int width;
+      size_t j;
 
-  if (size_overflow_p (xtimes (label_cnt, sizeof *labels)))
-    sys_error (r, _("Invalid number of labels %d."), label_cnt);
+      width = var_get_width (var);
+      if (width > 8)
+        sys_error (r, record->pos,
+                   _("Value labels may not be added to long string "
+                     "variables (e.g. %s) using records types 3 and 4."),
+                   var_get_name (var));
 
-  /* Read each value/label tuple into labels[]. */
-  labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
-  for (i = 0; i < label_cnt; i++)
-    {
-      struct label *label = labels + i;
-      unsigned char label_len;
-      size_t padded_len;
+      for (j = 0; j < record->n_labels; j++)
+        {
+          struct sfm_value_label *label = &record->labels[j];
+          union value value;
 
-      /* Read value. */
-      read_bytes (r, label->raw_value, sizeof label->raw_value);
+          value_init (&value, width);
+          if (width == 0)
+            value.f = parse_float (r, label->value, 0);
+          else
+            memcpy (value_str_rw (&value, width), label->value, width);
 
-      /* Read label length. */
-      read_bytes (r, &label_len, sizeof label_len);
-      padded_len = ROUND_UP (label_len + 1, 8);
+          if (!var_add_value_label (var, &value, utf8_labels[j]))
+            {
+              if (var_is_numeric (var))
+                sys_warn (r, record->pos,
+                          _("Duplicate value label for %g on %s."),
+                          value.f, var_get_name (var));
+              else
+                sys_warn (r, record->pos,
+                          _("Duplicate value label for `%.*s' on %s."),
+                          width, value_str (&value, width),
+                          var_get_name (var));
+            }
 
-      /* Read label, padding. */
-      label->label = pool_alloc (subpool, padded_len + 1);
-      read_bytes (r, label->label, padded_len - 1);
-      label->label[label_len] = 0;
+          value_destroy (&value, width);
+        }
     }
 
-  /* Now, read the type 4 record that has the list of variables
-     to which the value labels are to be applied. */
-
-  /* Read record type of type 4 record. */
-  if (read_int (r) != 4)
-    sys_error (r, _("Variable index record (type 4) does not immediately "
-                    "follow value label record (type 3) as it should."));
+  pool_free (r->pool, vars);
+  for (i = 0; i < record->n_labels; i++)
+    pool_free (r->pool, utf8_labels[i]);
+  pool_free (r->pool, utf8_labels);
+}
 
-  /* Read number of variables associated with value label from type 4
-     record. */
-  var_cnt = read_int (r);
-  if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
-    sys_error (r, _("Number of variables associated with a value label (%d) "
-                    "is not between 1 and the number of variables (%zu)."),
-               var_cnt, dict_get_var_cnt (dict));
-
-  /* Read the list of variables. */
-  var = pool_nalloc (subpool, var_cnt, sizeof *var);
-  max_width = 0;
-  for (i = 0; i < var_cnt; i++)
-    {
-      var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r));
-      if (var_get_width (var[i]) > 8)
-        sys_error (r, _("Value labels may not be added to long string "
-                        "variables (e.g. %s) using records types 3 and 4."),
-                   var_get_name (var[i]));
-      max_width = MAX (max_width, var_get_width (var[i]));
-    }
+static struct variable *
+lookup_var_by_index (struct sfm_reader *r, off_t offset,
+                     const struct sfm_var_record *var_recs, size_t n_var_recs,
+                     int idx)
+{
+  const struct sfm_var_record *rec;
 
-  /* Type check the variables. */
-  for (i = 1; i < var_cnt; i++)
-    if (var_get_type (var[i]) != var_get_type (var[0]))
-      sys_error (r, _("Variables associated with value label are not all of "
-                      "identical type.  Variable %s is %s, but variable "
-                      "%s is %s."),
-                 var_get_name (var[0]),
-                 var_is_numeric (var[0]) ? _("numeric") : _("string"),
-                 var_get_name (var[i]),
-                 var_is_numeric (var[i]) ? _("numeric") : _("string"));
-
-  /* Fill in labels[].value, now that we know the desired type. */
-  for (i = 0; i < label_cnt; i++)
+  if (idx < 1 || idx > n_var_recs)
     {
-      struct label *label = labels + i;
-
-      value_init_pool (subpool, &label->value, max_width);
-      if (var_is_alpha (var[0]))
-        u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width,
-                       label->raw_value, sizeof label->raw_value, ' ');
-      else
-        label->value.f = float_get_double (r->float_format, label->raw_value);
+      sys_error (r, offset,
+                 _("Variable index %d not in valid range 1...%zu."),
+                 idx, n_var_recs);
+      return NULL;
     }
 
-  /* Assign the `value_label's to each variable. */
-  for (i = 0; i < var_cnt; i++)
+  rec = &var_recs[idx - 1];
+  if (rec->var == NULL)
     {
-      struct variable *v = var[i];
-      int j;
-
-      /* Add each label to the variable. */
-      for (j = 0; j < label_cnt; j++)
-       {
-          struct label *label = &labels[j];
-          if (!var_add_value_label (v, &label->value, label->label))
-            {
-              if (var_is_numeric (var[0]))
-                sys_warn (r, _("Duplicate value label for %g on %s."),
-                          label->value.f, var_get_name (v));
-              else
-                sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
-                          max_width, value_str (&label->value, max_width),
-                          var_get_name (v));
-            }
-       }
+      sys_error (r, offset,
+                 _("Variable index %d refers to long string continuation."),
+                 idx);
+      return NULL;
     }
 
-  pool_destroy (subpool);
+  return rec->var;
 }
 
-/* Reads a set of custom attributes from TEXT into ATTRS.
+/* Parses a set of custom attributes from TEXT into ATTRS.
    ATTRS may be a null pointer, in which case the attributes are
    read but discarded. */
 static void
-read_attributes (struct sfm_reader *r, struct text_record *text,
-                 struct attrset *attrs)
+parse_attributes (struct sfm_reader *r, struct text_record *text,
+                  struct attrset *attrs)
 {
   do
     {
@@ -1572,140 +1797,153 @@ read_attributes (struct sfm_reader *r, struct text_record *text,
 /* Reads record type 7, subtype 17, which lists custom
    attributes on the data file.  */
 static void
-read_data_file_attributes (struct sfm_reader *r,
-                           size_t size, size_t count,
-                           struct dictionary *dict)
+parse_data_file_attributes (struct sfm_reader *r,
+                            const struct sfm_extension_record *record,
+                            struct dictionary *dict)
 {
-  struct text_record *text = open_text_record (r, size * count);
-  read_attributes (r, text, dict_get_attributes (dict));
+  struct text_record *text = open_text_record (r, record);
+  parse_attributes (r, text, dict_get_attributes (dict));
   close_text_record (r, text);
 }
 
+/* Parses record type 7, subtype 18, which lists custom
+   attributes on individual variables.  */
 static void
-skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels)
+parse_variable_attributes (struct sfm_reader *r,
+                           const struct sfm_extension_record *record,
+                           struct dictionary *dict)
 {
-  size_t i;
+  struct text_record *text;
+  struct variable *var;
 
-  for (i = 0; i < n_labels; i++)
-    {
-      size_t value_length, label_length;
+  text = open_text_record (r, record);
+  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
+    parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
+  close_text_record (r, text);
+}
 
-      value_length = read_int (r);
-      skip_bytes (r, value_length);
-      label_length = read_int (r);
-      skip_bytes (r, label_length);
-    }
+static void
+check_overflow (struct sfm_reader *r,
+                const struct sfm_extension_record *record,
+                size_t ofs, size_t length)
+{
+  size_t end = record->size * record->count;
+  if (length >= end || ofs + length > end)
+    sys_error (r, record->pos + end,
+               _("Long string value label record ends unexpectedly."));
 }
 
 static void
-read_long_string_value_labels (struct sfm_reader *r,
-                              size_t size, size_t count,
-                              struct dictionary *d)
+parse_long_string_value_labels (struct sfm_reader *r,
+                                const struct sfm_extension_record *record,
+                                struct dictionary *dict)
 {
-  const off_t start = ftello (r->file);
-  while (ftello (r->file) - start < size * count)
+  const char *dict_encoding = dict_get_encoding (dict);
+  size_t end = record->size * record->count;
+  size_t ofs = 0;
+
+  while (ofs < end)
     {
-      char var_name[VAR_NAME_LEN + 1];
+      char *var_name;
       size_t n_labels, i;
-      struct variable *v;
+      struct variable *var;
       union value value;
       int var_name_len;
       int width;
 
-      /* Read header. */
-      var_name_len = read_int (r);
-      if (var_name_len > VAR_NAME_LEN)
-        sys_error (r, _("Variable name length in long string value label "
-                        "record (%d) exceeds %d-byte limit."),
-                   var_name_len, VAR_NAME_LEN);
-      read_string (r, var_name, var_name_len + 1);
-      width = read_int (r);
-      n_labels = read_int (r);
-
-      v = dict_lookup_var (d, var_name);
-      if (v == NULL)
+      /* Parse variable name length. */
+      check_overflow (r, record, ofs, 4);
+      var_name_len = parse_int (r, record->data, ofs);
+      ofs += 4;
+
+      /* Parse variable name, width, and number of labels. */
+      check_overflow (r, record, ofs, var_name_len + 8);
+      var_name = recode_string_pool ("UTF-8", dict_encoding,
+                                     (const char *) record->data + ofs,
+                                     var_name_len, r->pool);
+      width = parse_int (r, record->data, ofs + var_name_len);
+      n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
+      ofs += var_name_len + 8;
+
+      /* Look up 'var' and validate. */
+      var = dict_lookup_var (dict, var_name);
+      if (var == NULL)
+        sys_warn (r, record->pos + ofs,
+                  _("Ignoring long string value record for "
+                    "unknown variable %s."), var_name);
+      else if (var_is_numeric (var))
         {
-          sys_warn (r, _("Ignoring long string value record for "
-                         "unknown variable %s."), var_name);
-          skip_long_string_value_labels (r, n_labels);
-          continue;
+          sys_warn (r, record->pos + ofs,
+                    _("Ignoring long string value record for "
+                      "numeric variable %s."), var_name);
+          var = NULL;
         }
-      if (var_is_numeric (v))
+      else if (width != var_get_width (var))
         {
-          sys_warn (r, _("Ignoring long string value record for "
-                         "numeric variable %s."), var_name);
-          skip_long_string_value_labels (r, n_labels);
-          continue;
-        }
-      if (width != var_get_width (v))
-        {
-          sys_warn (r, _("Ignoring long string value record for variable %s "
-                         "because the record's width (%d) does not match the "
-                         "variable's width (%d)."),
-                    var_name, width, var_get_width (v));
-          skip_long_string_value_labels (r, n_labels);
-          continue;
+          sys_warn (r, record->pos + ofs,
+                    _("Ignoring long string value record for variable %s "
+                      "because the record's width (%d) does not match the "
+                      "variable's width (%d)."),
+                    var_name, width, var_get_width (var));
+          var = NULL;
         }
 
-      /* Read values. */
+      /* Parse values. */
       value_init_pool (r->pool, &value, width);
       for (i = 0; i < n_labels; i++)
        {
           size_t value_length, label_length;
-          char label[256];
-          bool skip = false;
+          bool skip = var == NULL;
 
-          /* Read value. */
-          value_length = read_int (r);
-          if (value_length == width)
-            read_bytes (r, value_str_rw (&value, width), width);
-          else
+          /* Parse value length. */
+          check_overflow (r, record, ofs, 4);
+          value_length = parse_int (r, record->data, ofs);
+          ofs += 4;
+
+          /* Parse value. */
+          check_overflow (r, record, ofs, value_length);
+          if (!skip)
             {
-              sys_warn (r, _("Ignoring long string value %zu for variable %s, "
-                             "with width %d, that has bad value width %zu."),
-                        i, var_get_name (v), width, value_length);
-              skip_bytes (r, value_length);
-              skip = true;
+              if (value_length == width)
+                memcpy (value_str_rw (&value, width),
+                        (const uint8_t *) record->data + ofs, width);
+              else
+                {
+                  sys_warn (r, record->pos + ofs,
+                            _("Ignoring long string value %zu for variable "
+                              "%s, with width %d, that has bad value "
+                              "width %zu."),
+                            i, var_get_name (var), width, value_length);
+                  skip = true;
+                }
             }
+          ofs += value_length;
+
+          /* Parse label length. */
+          check_overflow (r, record, ofs, 4);
+          label_length = parse_int (r, record->data, ofs);
+          ofs += 4;
 
-          /* Read label. */
-          label_length = read_int (r);
-          read_string (r, label, MIN (sizeof label, label_length + 1));
-          if (label_length >= sizeof label)
+          /* Parse label. */
+          check_overflow (r, record, ofs, label_length);
+          if (!skip)
             {
-              /* Skip and silently ignore label text after the
-                 first 255 bytes.  The maximum documented length
-                 of a label is 120 bytes so this is more than
-                 generous. */
-              skip_bytes (r, (label_length + 1) - sizeof label);
+              char *label;
+
+              label = recode_string_pool ("UTF-8", dict_encoding,
+                                          (const char *) record->data + ofs,
+                                          label_length, r->pool);
+              if (!var_add_value_label (var, &value, label))
+                sys_warn (r, record->pos + ofs,
+                          _("Duplicate value label for `%.*s' on %s."),
+                          width, value_str (&value, width),
+                          var_get_name (var));
+              pool_free (r->pool, label);
             }
-
-          if (!skip && !var_add_value_label (v, &value, label))
-            sys_warn (r, _("Duplicate value label for `%.*s' on %s."),
-                      width, value_str (&value, width), var_get_name (v));
+          ofs += label_length;
         }
     }
 }
-
-
-/* Reads record type 7, subtype 18, which lists custom
-   attributes on individual variables.  */
-static void
-read_variable_attributes (struct sfm_reader *r,
-                          size_t size, size_t count,
-                          struct dictionary *dict)
-{
-  struct text_record *text = open_text_record (r, size * count);
-  for (;;) 
-    {
-      struct variable *var;
-      if (!text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
-        break;
-      read_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL);
-    }
-  close_text_record (r, text);
-}
-
 
 /* Case reader. */
 
@@ -1776,7 +2014,7 @@ eof:
 static void
 partial_record (struct sfm_reader *r)
 {
-  sys_error (r, _("File ends in partial case."));
+  sys_error (r, r->pos, _("File ends in partial case."));
 }
 
 /* Issues an error that an unspecified error occurred SFM, and
@@ -1885,8 +2123,9 @@ read_compressed_number (struct sfm_reader *r, double *d)
       if (!r->corruption_warning)
         {
           r->corruption_warning = true;
-          sys_warn (r, _("Possible compressed data corruption: "
-                         "compressed spaces appear in numeric field."));
+          sys_warn (r, r->pos,
+                    _("Possible compressed data corruption: "
+                      "compressed spaces appear in numeric field."));
         }
       break;
 
@@ -1938,8 +2177,9 @@ read_compressed_string (struct sfm_reader *r, uint8_t 
*dst)
         else if (!r->corruption_warning)
           {
             r->corruption_warning = true;
-            sys_warn (r, _("Possible compressed data corruption: "
-                           "string contains compressed integer (opcode %d)."),
+            sys_warn (r, r->pos,
+                      _("Possible compressed data corruption: "
+                        "string contains compressed integer (opcode %d)."),
                       opcode);
           }
       }
@@ -1988,86 +2228,6 @@ skip_whole_strings (struct sfm_reader *r, size_t length)
   return read_whole_strings (r, buffer, length);
 }
 
-/* Creates and returns a table that can be used for translating a value
-   index into a case to a "struct variable *" for DICT.  Multiple
-   system file fields reference variables this way.
-
-   This table must be created before processing the very long
-   string extension record, because that record causes some
-   values to be deleted from the case and the dictionary to be
-   compacted. */
-static struct variable **
-make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
-{
-  struct variable **var_by_value_idx;
-  int value_idx = 0;
-  int i;
-
-  var_by_value_idx = pool_nmalloc (r->pool,
-                                   r->oct_cnt, sizeof *var_by_value_idx);
-  for (i = 0; i < dict_get_var_cnt (dict); i++)
-    {
-      struct variable *v = dict_get_var (dict, i);
-      int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
-      int j;
-
-      var_by_value_idx[value_idx++] = v;
-      for (j = 1; j < nv; j++)
-        var_by_value_idx[value_idx++] = NULL;
-    }
-  assert (value_idx == r->oct_cnt);
-
-  return var_by_value_idx;
-}
-
-/* Returns the "struct variable" corresponding to the given
-   1-basd VALUE_IDX in VAR_BY_VALUE_IDX.  Verifies that the index
-   is valid. */
-static struct variable *
-lookup_var_by_value_idx (struct sfm_reader *r,
-                         struct variable **var_by_value_idx, int value_idx)
-{
-  struct variable *var;
-
-  if (value_idx < 1 || value_idx > r->oct_cnt)
-    sys_error (r, _("Variable index %d not in valid range 1...%d."),
-               value_idx, r->oct_cnt);
-
-  var = var_by_value_idx[value_idx - 1];
-  if (var == NULL)
-    sys_error (r, _("Variable index %d refers to long string "
-                    "continuation."),
-               value_idx);
-
-  return var;
-}
-
-/* Returns the variable in D with the given SHORT_NAME,
-   or a null pointer if there is none. */
-static struct variable *
-lookup_var_by_short_name (struct dictionary *d, const char *short_name)
-{
-  struct variable *var;
-  size_t var_cnt;
-  size_t i;
-
-  /* First try looking up by full name.  This often succeeds. */
-  var = dict_lookup_var (d, short_name);
-  if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name))
-    return var;
-
-  /* Iterate through the whole dictionary as a fallback. */
-  var_cnt = dict_get_var_cnt (d);
-  for (i = 0; i < var_cnt; i++)
-    {
-      var = dict_get_var (d, i);
-      if (!strcasecmp (var_get_short_name (var, 0), short_name))
-        return var;
-    }
-
-  return NULL;
-}
-
 /* Helpers for reading records that contain structured text
    strings. */
 
@@ -2078,22 +2238,26 @@ lookup_var_by_short_name (struct dictionary *d, const 
char *short_name)
 /* State. */
 struct text_record
   {
-    struct substring buffer;    /* Record contents. */
+    struct substring buffer;    /* Record contents, in UTF-8. */
+    off_t start;                /* Starting offset in file. */
     size_t pos;                 /* Current position in buffer. */
     int n_warnings;             /* Number of warnings issued or suppressed. */
   };
 
-/* Reads SIZE bytes into a text record for R,
-   and returns the new text record. */
 static struct text_record *
-open_text_record (struct sfm_reader *r, size_t size)
+open_text_record (struct sfm_reader *r,
+                  const struct sfm_extension_record *record)
 {
-  struct text_record *text = pool_alloc (r->pool, sizeof *text);
-  char *buffer = pool_malloc (r->pool, size + 1);
-  read_bytes (r, buffer, size);
-  text->buffer = ss_buffer (buffer, size);
+  struct text_record *text;
+  struct substring raw;
+
+  text = pool_alloc (r->pool, sizeof *text);
+  raw = ss_buffer (record->data, record->size * record->count);
+  text->start = record->pos;
+  text->buffer = recode_substring_pool ("UTF-8", r->encoding, raw, r->pool);
   text->pos = 0;
   text->n_warnings = 0;
+
   return text;
 }
 
@@ -2103,7 +2267,7 @@ static void
 close_text_record (struct sfm_reader *r, struct text_record *text)
 {
   if (text->n_warnings > MAX_TEXT_WARNINGS)
-    sys_warn (r, _("Suppressed %d additional related warnings."),
+    sys_warn (r, -1, _("Suppressed %d additional related warnings."),
               text->n_warnings - MAX_TEXT_WARNINGS);
   pool_free (r->pool, ss_data (text->buffer));
 }
@@ -2163,7 +2327,7 @@ text_read_short_name (struct sfm_reader *r, struct 
dictionary *dict,
   if (short_name == NULL)
     return false;
 
-  *var = lookup_var_by_short_name (dict, short_name);
+  *var = dict_lookup_var (dict, short_name);
   if (*var == NULL)
     text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
                short_name);
@@ -2181,7 +2345,7 @@ text_warn (struct sfm_reader *r, struct text_record *text,
       va_list args;
 
       va_start (args, format);
-      sys_msg (r, MW, format, args);
+      sys_msg (r, text->start + text->pos, MW, format, args);
       va_end (args);
     }
 }
@@ -2226,22 +2390,25 @@ text_parse_counted_string (struct sfm_reader *r, struct 
text_record *text)
     }
   if (start == text->pos)
     {
-      sys_warn (r, _("Expecting digit at offset %zu in MRSETS record."),
-                 text->pos);
+      sys_warn (r, text->start,
+                _("Expecting digit at UTF-8 offset %zu in MRSETS record."),
+                text->pos);
       return NULL;
     }
 
   if (!text_match (text, ' '))
     {
-      sys_warn (r, _("Expecting space at offset %zu in MRSETS record."),
+      sys_warn (r, text->start,
+                _("Expecting space at UTF-8 offset %zu in MRSETS record."),
                 text->pos);
       return NULL;
     }
 
   if (text->pos + n > text->buffer.length)
     {
-      sys_warn (r, _("%zu-byte string starting at offset %zu "
-                     "exceeds record length %zu."),
+      sys_warn (r, text->start,
+                _("%zu-byte string starting at UTF-8 offset %zu "
+                  "exceeds record length %zu."),
                 n, text->pos, text->buffer.length);
       return NULL;
     }
@@ -2249,8 +2416,9 @@ text_parse_counted_string (struct sfm_reader *r, struct 
text_record *text)
   s = &text->buffer.string[text->pos];
   if (s[n] != ' ')
     {
-      sys_warn (r,
-                _("Expecting space at offset %zu following %zu-byte string."),
+      sys_warn (r, text->start,
+                _("Expecting space at UTF-8 offset %zu following %zu-byte "
+                  "string."),
                 text->pos + n, n);
       return NULL;
     }
@@ -2271,7 +2439,8 @@ text_match (struct text_record *text, char c)
     return false;
 }
 
-/* Returns the current byte offset inside the TEXT's string. */
+/* Returns the current byte offset (as converted to UTF-8) inside the TEXT's
+   string. */
 static size_t
 text_pos (const struct text_record *text)
 {
@@ -2282,14 +2451,18 @@ text_pos (const struct text_record *text)
 
 /* Displays a corruption message. */
 static void
-sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
+sys_msg (struct sfm_reader *r, off_t offset,
+         int class, const char *format, va_list args)
 {
   struct msg m;
   struct string text;
 
   ds_init_empty (&text);
-  ds_put_format (&text, "`%s' near offset 0x%llx: ",
-                 fh_get_file_name (r->fh), (long long int) ftello (r->file));
+  if (offset >= 0)
+    ds_put_format (&text, _("`%s' near offset 0x%llx: "),
+                   fh_get_file_name (r->fh), (long long int) offset);
+  else
+    ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
   ds_put_vformat (&text, format, args);
 
   m.category = msg_class_to_category (class);
@@ -2303,14 +2476,14 @@ sys_msg (struct sfm_reader *r, int class, const char 
*format, va_list args)
   msg_emit (&m);
 }
 
-/* Displays a warning for the current file position. */
+/* Displays a warning for offset OFFSET in the file. */
 static void
-sys_warn (struct sfm_reader *r, const char *format, ...)
+sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
 {
   va_list args;
 
   va_start (args, format);
-  sys_msg (r, MW, format, args);
+  sys_msg (r, offset, MW, format, args);
   va_end (args);
 }
 
@@ -2318,12 +2491,12 @@ sys_warn (struct sfm_reader *r, const char *format, ...)
    marks it as in an error state,
    and aborts reading it using longjmp. */
 static void
-sys_error (struct sfm_reader *r, const char *format, ...)
+sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
 {
   va_list args;
 
   va_start (args, format);
-  sys_msg (r, ME, format, args);
+  sys_msg (r, offset, ME, format, args);
   va_end (args);
 
   r->error = true;
@@ -2341,12 +2514,13 @@ read_bytes_internal (struct sfm_reader *r, bool 
eof_is_ok,
                    void *buf, size_t byte_cnt)
 {
   size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
+  r->pos += bytes_read;
   if (bytes_read == byte_cnt)
     return true;
   else if (ferror (r->file))
-    sys_error (r, _("System error: %s."), strerror (errno));
+    sys_error (r, r->pos, _("System error: %s."), strerror (errno));
   else if (!eof_is_ok || bytes_read != 0)
-    sys_error (r, _("Unexpected end of file."));
+    sys_error (r, r->pos, _("Unexpected end of file."));
   else
     return false;
 }
@@ -2389,6 +2563,18 @@ read_float (struct sfm_reader *r)
   return float_get_double (r->float_format, number);
 }
 
+static int
+parse_int (struct sfm_reader *r, const void *data, size_t ofs)
+{
+  return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
+}
+
+static double
+parse_float (struct sfm_reader *r, const void *data, size_t ofs)
+{
+  return float_get_double (r->float_format, (const uint8_t *) data + ofs);
+}
+
 /* Reads exactly SIZE - 1 bytes into BUFFER
    and stores a null byte into BUFFER[SIZE - 1]. */
 static void
diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h
index 7495651..24c3f84 100644
--- a/src/data/sys-file-reader.h
+++ b/src/data/sys-file-reader.h
@@ -35,7 +35,7 @@ struct sfm_read_info
     enum float_format float_format;
     bool compressed;           /* 0=no, 1=yes. */
     casenumber case_cnt;        /* -1 if unknown. */
-    char product[61];          /* Product name plus a null. */
+    char product[61];          /* Product name, as ASCII string. */
 
     /* Writer's version number in X.Y.Z format.
        The version number is not always present; if not, then
diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at
index cb9abd3..3a8c689 100644
--- a/tests/data/sys-file-reader.at
+++ b/tests/data/sys-file-reader.at
@@ -1221,10 +1221,10 @@ DISPLAY DICTIONARY.
 LIST.
 ])
   AT_CHECK([pspp -o pspp.csv sys-file.sps], [0], 
-    [warning: `sys-file.sav' near offset 0x5c: Compression bias is not the 
usual value of 100, or system file uses unrecognized floating-point format.
+    [warning: `sys-file.sav' near offset 0x54: Compression bias is not the 
usual value of 100, or system file uses unrecognized floating-point format.
 ])
   AT_CHECK([grep -v Measure pspp.csv | grep -v Display], [0], [dnl
-"warning: `sys-file.sav' near offset 0x5c: Compression bias is not the usual 
value of 100, or system file uses unrecognized floating-point format."
+"warning: `sys-file.sav' near offset 0x54: Compression bias is not the usual 
value of 100, or system file uses unrecognized floating-point format."
 
 Variable,Description,,Position
 num1,Format: F8.0,,1
@@ -1327,7 +1327,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], 
-   [warning: `sys-file.sav' near offset 0xd8: File header claims 2 variable 
positions but 1 were read from file.
+   [warning: `sys-file.sav': File header claims 2 variable positions but 1 
were read from file.
 ])
 done
 AT_CLEANUP
@@ -1355,7 +1355,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xd0: Variable name begins with invalid 
character `$'.
+   [error: `sys-file.sav' near offset 0xb4: Invalid variable name `$UM1'.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1385,7 +1385,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xd0: Invalid variable name `TO'.
+   [error: `sys-file.sav' near offset 0xb4: Invalid variable name `TO'.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1415,7 +1415,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xd0: Bad width 256 for variable VAR1.
+   [error: `sys-file.sav' near offset 0xb4: Bad width 256 for variable VAR1.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1446,7 +1446,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xf0: Duplicate variable name `VAR1'.
+   [error: `sys-file.sav' near offset 0xd4: Duplicate variable name `VAR1'.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1476,7 +1476,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xd0: Variable label indicator field is 
not 0 or 1.
+   [error: `sys-file.sav' near offset 0xb4: Variable label indicator field is 
not 0 or 1.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1506,7 +1506,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   ["error: `sys-file.sav' near offset 0xd0: Numeric missing value indicator 
field is not -3, -2, 0, 1, 2, or 3."
+   ["error: `sys-file.sav' near offset 0xb4: Numeric missing value indicator 
field is not -3, -2, 0, 1, 2, or 3."
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1536,7 +1536,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   ["error: `sys-file.sav' near offset 0xd0: String missing value indicator 
field is not 0, 1, 2, or 3."
+   ["error: `sys-file.sav' near offset 0xb4: String missing value indicator 
field is not 0, 1, 2, or 3."
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1567,7 +1567,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xd8: Missing string continuation record.
+   [error: `sys-file.sav' near offset 0xb4: Missing string continuation record.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1597,7 +1597,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xd0: Unknown variable format 255.
+   [error: `sys-file.sav' near offset 0xc0: Unknown variable format 255.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1633,17 +1633,17 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], 
-   [warning: `sys-file.sav' near offset 0xd0: Numeric variable VAR1 has 
invalid print format A8.
+   [warning: `sys-file.sav' near offset 0xc0: Numeric variable VAR1 has 
invalid print format A8.
 
-warning: `sys-file.sav' near offset 0xd0: Numeric variable VAR1 has invalid 
write format AHEX16.
+warning: `sys-file.sav' near offset 0xc4: Numeric variable VAR1 has invalid 
write format AHEX16.
 
-warning: `sys-file.sav' near offset 0xf0: String variable STR1 has invalid 
print format F8.0.
+warning: `sys-file.sav' near offset 0xe0: String variable STR1 has invalid 
print format F8.0.
 
-warning: `sys-file.sav' near offset 0xf0: String variable STR1 has invalid 
write format E10.1.
+warning: `sys-file.sav' near offset 0xe4: String variable STR1 has invalid 
write format E10.1.
 
-warning: `sys-file.sav' near offset 0x110: String variable STR2 has invalid 
print format A8.
+warning: `sys-file.sav' near offset 0x100: String variable STR2 has invalid 
print format A8.
 
-warning: `sys-file.sav' near offset 0x110: String variable STR2 has invalid 
write format AHEX4.
+warning: `sys-file.sav' near offset 0x104: String variable STR2 has invalid 
write format AHEX4.
 ])
 done
 AT_CLEANUP
@@ -1674,7 +1674,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xf4: Weighting variable must be numeric 
(not string variable `STR1').
+   [error: `sys-file.sav': Weighting variable must be numeric (not string 
variable `STR1').
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1707,7 +1707,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0xf4: Variable index 3 not in valid 
range 1...2.
+   [error: `sys-file.sav' near offset 0x4c: Variable index 3 not in valid 
range 1...2.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1741,7 +1741,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], 
-   [error: `sys-file.sav' near offset 0x114: Variable index 3 refers to long 
string continuation.
+   [error: `sys-file.sav' near offset 0x4c: Variable index 3 refers to long 
string continuation.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1778,7 +1778,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0x12c: Multiple type 6 (document) records.
+error: `sys-file.sav' near offset 0x12c: Duplicate type 6 (document) record.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1816,44 +1816,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0xd8: Number of document lines (0) must be 
greater than 0.
-
-sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
-])
-done
-AT_CLEANUP
-
-AT_SETUP([document contains null byte])
-AT_KEYWORDS([sack synthetic system file negative])
-AT_DATA([sys-file.sack], [dnl
-dnl File header.
-"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
-2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3;
-
-dnl Numeric variable, no label or missing values.
-2; 0; 0; 0; 0x050800 *2; s8 "NUM1";
-
-dnl Document record.
-6; 1; >>i8 0<<; s79 "One line of documents";
-
-dnl Dictionary termination record.
-999; 0;
-
-dnl Data.
-1.0;
-])
-for variant in \
-       "be 24b5f451ae2559785c1a38358c511e39" \
-       "le b7e9802506307c28343293144bd6d4f4"
-do
-  set $variant
-  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
-])
-  AT_DATA([sys-file.sps], [dnl
-GET FILE='sys-file.sav'.
-])
-  AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0x128: Document line contains null byte.
+error: `sys-file.sav' near offset 0xd4: Number of document lines (0) must be 
greater than 0 and less than 26843545.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1884,7 +1847,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0xe0: Record type 7 subtype 3 too large.
+error: `sys-file.sav' near offset 0xd8: Record type 7 subtype 3 too large.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -1918,7 +1881,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-"warning: `sys-file.sav' near offset 0xe0: Unrecognized record type 7, subtype 
30.  Please send a copy of this file, and the syntax which created it to 
address@hidden"
+"warning: `sys-file.sav' near offset 0xd8: Unrecognized record type 7, subtype 
30.  Please send a copy of this file, and the syntax which created it to 
address@hidden"
 ])
 done
 AT_CLEANUP
@@ -1928,27 +1891,27 @@ AT_KEYWORDS([sack synthetic system file negative])
 AT_DATA([sys-file.sack], [dnl
 dnl File header.
 "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
-2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3;
+2; 1; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3;
 
 dnl Numeric variable, no label or missing values.
 2; 0; 0; 0; 0x050800 *2; s8 "NUM1";
 
 dnl Machine integer info record.
-7; 3; 4; >>9<<; 1; 2; 3; -1; 1; 1; ENDIAN; 1252;
+7; 3; 4; >>9<<; 1; 2; 3; -1; 1; 1; ENDIAN; 1252; >>1234<<;
+
+dnl End of dictionary.
+999; 0;
 ])
 for variant in \
        "be 21ec84826886b0a266d1360f8279d769" \
        "le 15dcba7b2b89b7d8a21ebcc872f515af"
 do
   set $variant
-  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
-])
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], 
[ignore])
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
-  AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-"error: `sys-file.sav' near offset 0x100: Bad size (4) or count (9) field on 
record type 7, subtype 3."
-
-sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
+  AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
+"warning: `sys-file.sav' near offset 0xd8: Record type 7, subtype 3 has bad 
count 9 (expected 8)."
 ])
 done
 AT_CLEANUP
@@ -1965,18 +1928,20 @@ dnl Numeric variable, no label or missing values.
 
 dnl Machine integer info record.
 7; 3; 4; 8; 1; 2; 3; -1; >>2<<; 1; ENDIAN; 1252;
+
+dnl End of dictionary.
+999; 0;
 ])
 for variant in \
        "be d510ed28278649eee997fb6881a4c04f" \
        "le fbf1eca561a4e243b7ae844ed1677035"
 do
   set $variant
-  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
-])
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], 
[ignore])
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0x100: Floating-point representation 
indicated by system file (2) differs from expected (1).
+error: `sys-file.sav' near offset 0xd8: Floating-point representation 
indicated by system file (2) differs from expected (1).
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2011,7 +1976,7 @@ GET FILE='sys-file.sav'.
 DISPLAY DICTIONARY.
 ])
   AT_CHECK_UNQUOTED([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: \`sys-file.sav' near offset 0x100: Integer format indicated by system 
file (3) differs from expected ($[3]).
+warning: \`sys-file.sav' near offset 0xd8: Integer format indicated by system 
file (3) differs from expected ($[3]).
 
 Variable,Description,,Position
 num1,Format: F8.0,,1
@@ -2028,27 +1993,27 @@ AT_KEYWORDS([sack synthetic system file negative])
 AT_DATA([sys-file.sack], [dnl
 dnl File header.
 "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
-2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3;
+2; 1; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3;
 
 dnl Numeric variable, no label or missing values.
 2; 0; 0; 0; 0x050800 *2; s8 "NUM1";
 
 dnl Machine floating-point info record.
 7; 4; 8; >>4<<; SYSMIS; HIGHEST; LOWEST; 0.0;
+
+dnl End of dictionary.
+999; 0;
 ])
 for variant in \
        "be 29c9a173638fbb8bb1efe1176c4d670f" \
        "le 5cb49eb1084e5b9cd573a54705ff86a7"
 do
   set $variant
-  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
-])
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], 
[ignore])
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
-  AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0xf8: Bad size (8) or count (4) on extension 
4.
-
-sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
+  AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
+"warning: `sys-file.sav' near offset 0xd8: Record type 7, subtype 4 has bad 
count 4 (expected 3)."
 ])
 done
 AT_CLEANUP
@@ -2079,11 +2044,11 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xf8: File specifies unexpected value 0 as 
SYSMIS.
+warning: `sys-file.sav' near offset 0xd8: File specifies unexpected value 0 as 
SYSMIS.
 
-warning: `sys-file.sav' near offset 0xf8: File specifies unexpected value 1 as 
HIGHEST.
+warning: `sys-file.sav' near offset 0xd8: File specifies unexpected value 1 as 
HIGHEST.
 
-warning: `sys-file.sav' near offset 0xf8: File specifies unexpected value 2 as 
LOWEST.
+warning: `sys-file.sav' near offset 0xd8: File specifies unexpected value 2 as 
LOWEST.
 ])
 done
 AT_CLEANUP
@@ -2114,9 +2079,9 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe3: `a' does not begin with `$' at 
offset 2 in MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: `a' does not begin with `$' at UTF-8 
offset 2 in MRSETS record.
 
-warning: `sys-file.sav' near offset 0xf8: `xyz' does not begin with `$' at 
offset 4 in MRSETS record.
+warning: `sys-file.sav' near offset 0xeb: `xyz' does not begin with `$' at 
UTF-8 offset 4 in MRSETS record.
 ])
 done
 AT_CLEANUP
@@ -2146,7 +2111,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe5: Missing space following `C' at 
offset 4 in MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Missing space following `C' at UTF-8 
offset 4 in MRSETS record.
 ])
 done
 AT_CLEANUP
@@ -2176,7 +2141,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe5: Missing space following `E' at 
offset 4 in MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Missing space following `E' at UTF-8 
offset 4 in MRSETS record.
 ])
 done
 AT_CLEANUP
@@ -2206,9 +2171,9 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe6: Unexpected label source value `2' 
following `E' at offset 7 in MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Unexpected label source value `2' 
following `E' at UTF-8 offset 7 in MRSETS record.
 
-warning: `sys-file.sav' near offset 0xe6: Expecting digit at offset 7 in 
MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Expecting digit at UTF-8 offset 7 in 
MRSETS record.
 ])
 done
 AT_CLEANUP
@@ -2238,7 +2203,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-"warning: `sys-file.sav' near offset 0xe3: Missing `C', `D', or `E' at offset 
3 in MRSETS record."
+"warning: `sys-file.sav' near offset 0xd8: Missing `C', `D', or `E' at UTF-8 
offset 3 in MRSETS record."
 ])
 done
 AT_CLEANUP
@@ -2268,7 +2233,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe5: Expecting digit at offset 4 in 
MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Expecting digit at UTF-8 offset 4 in 
MRSETS record.
 ])
 done
 AT_CLEANUP
@@ -2298,7 +2263,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe6: Expecting space at offset 5 in 
MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Expecting space at UTF-8 offset 5 in 
MRSETS record.
 ])
 done
 AT_CLEANUP
@@ -2328,7 +2293,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe9: 4-byte string starting at offset 6 
exceeds record length 9.
+warning: `sys-file.sav' near offset 0xd8: 4-byte string starting at UTF-8 
offset 6 exceeds record length 9.
 ])
 done
 AT_CLEANUP
@@ -2358,7 +2323,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xea: Expecting space at offset 9 
following 3-byte string.
+warning: `sys-file.sav' near offset 0xd8: Expecting space at UTF-8 offset 9 
following 3-byte string.
 ])
 done
 AT_CLEANUP
@@ -2388,9 +2353,9 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xec: Missing new-line parsing variable 
names at offset 14 in MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Missing new-line parsing variable 
names at UTF-8 offset 14 in MRSETS record.
 
-warning: `sys-file.sav' near offset 0xec: MRSET $a has only 1 variables.
+warning: `sys-file.sav' near offset 0xd8: MRSET $a has only 1 variables.
 ])
 done
 AT_CLEANUP
@@ -2420,9 +2385,9 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xf2: Duplicate variable name NUM1 at 
offset 18 in MRSETS record.
+warning: `sys-file.sav' near offset 0xd8: Duplicate variable name NUM1 at 
UTF-8 offset 18 in MRSETS record.
 
-warning: `sys-file.sav' near offset 0xf2: MRSET $a has only 1 variables.
+warning: `sys-file.sav' near offset 0xd8: MRSET $a has only 1 variables.
 ])
 done
 AT_CLEANUP
@@ -2453,9 +2418,9 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0x112: MRSET $a contains both string and 
numeric variables.
+warning: `sys-file.sav' near offset 0xf8: MRSET $a contains both string and 
numeric variables.
 
-warning: `sys-file.sav' near offset 0x112: MRSET $a has only 1 variables.
+warning: `sys-file.sav' near offset 0xf8: MRSET $a has only 1 variables.
 ])
 done
 AT_CLEANUP
@@ -2485,7 +2450,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xed: MRSET $a has only 1 variables.
+warning: `sys-file.sav' near offset 0xd8: MRSET $a has only 1 variables.
 ])
 done
 AT_CLEANUP
@@ -2515,7 +2480,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xed: MRSET $a has only 1 variables.
+warning: `sys-file.sav' near offset 0xd8: MRSET $a has only 1 variables.
 ])
 done
 AT_CLEANUP
@@ -2546,7 +2511,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe0: Bad size 8 on extension 11.
+"warning: `sys-file.sav' near offset 0xd8: Record type 7, subtype 11 has bad 
size 8 (expected 4)."
 ])
 done
 AT_CLEANUP
@@ -2577,7 +2542,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe0: Extension 11 has bad count 4 (for 1 
variables).
+warning: `sys-file.sav' near offset 0xd8: Extension 11 has bad count 4 (for 1 
variables).
 ])
 done
 AT_CLEANUP
@@ -2608,7 +2573,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe8: Invalid variable display parameters 
for variable 0 (NUM1).  Default parameters substituted.
+warning: `sys-file.sav' near offset 0xd8: Invalid variable display parameters 
for variable 0 (NUM1).  Default parameters substituted.
 ])
 done
 AT_CLEANUP
@@ -2639,7 +2604,7 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe8: Invalid variable display parameters 
for variable 0 (NUM1).  Default parameters substituted.
+warning: `sys-file.sav' near offset 0xd8: Invalid variable display parameters 
for variable 0 (NUM1).  Default parameters substituted.
 ])
 done
 AT_CLEANUP
@@ -2671,7 +2636,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe5: Dictionary record refers to unknown 
variable xyzzy.
+warning: `sys-file.sav' near offset 0xde: Dictionary record refers to unknown 
variable xyzzy.
 ])
 done
 AT_CLEANUP
@@ -2710,9 +2675,9 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0x186: Long variable mapping from LONGVARI 
to invalid variable name `_Invalid'.
+warning: `sys-file.sav' near offset 0x138: Long variable mapping from LONGVARI 
to invalid variable name `_Invalid'.
 
-warning: `sys-file.sav' near offset 0x186: Duplicate long variable name 
`LONGVARIABLENAME'.
+warning: `sys-file.sav' near offset 0x138: Duplicate long variable name 
`LONGVARIABLENAME'.
 ])
 done
 AT_CLEANUP
@@ -2748,11 +2713,11 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-warning: `sys-file.sav' near offset 0x104: NUM1 listed as string of invalid 
length 00000 in very long string record.
+warning: `sys-file.sav' near offset 0xd8: NUM1 listed as string of invalid 
length 00000 in very long string record.
 
-"warning: `sys-file.sav' near offset 0x104: NUM1 listed in very long string 
record with width 00255, which requires only one segment."
+"warning: `sys-file.sav' near offset 0xd8: NUM1 listed in very long string 
record with width 00255, which requires only one segment."
 
-error: `sys-file.sav' near offset 0x104: Very long string NUM1 overflows 
dictionary.
+error: `sys-file.sav' near offset 0xd8: Very long string NUM1 overflows 
dictionary.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2791,7 +2756,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0x50c: Very long string with width 256 has 
segment 1 of width 9 (expected 4).
+error: `sys-file.sav' near offset 0x4f8: Very long string with width 256 has 
segment 1 of width 9 (expected 4).
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2823,7 +2788,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0xd8: Invalid number of labels 2147483647.
+error: `sys-file.sav' near offset 0xd4: Invalid number of labels 2147483647.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2857,7 +2822,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0xec: Variable index record (type 4) does 
not immediately follow value label record (type 3) as it should.
+error: `sys-file.sav' near offset 0xe8: Variable index record (type 4) does 
not immediately follow value label record (type 3) as it should.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2888,7 +2853,7 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0xf0: Number of variables associated with a 
value label (0) is not between 1 and the number of variables (1).
+error: `sys-file.sav' near offset 0xec: Number of variables associated with a 
value label (0) is not between 1 and the number of variables (1).
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2908,19 +2873,21 @@ dnl Long string variable.
 
 dnl Value label that names long string variable.
 3; 1; s8 "xyzzy"; i8 3; s7 "one"; 4; 1; >>1<<;
+
+dnl End of dictionary.
+999; 0;
 ])
 for variant in \
        "be 14053a4f09de4c7c4c55281534dd66f4" \
        "le 8a61cc994c659fd66307d2f0fd64ce20"
 do
   set $variant
-  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
-])
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [ignore])
   AT_DATA([sys-file.sps], [dnl
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-error: `sys-file.sav' near offset 0x114: Value labels may not be added to long 
string variables (e.g. STR1) using records types 3 and 4.
+error: `sys-file.sav' near offset 0xf4: Value labels may not be added to long 
string variables (e.g. STR1) using records types 3 and 4.
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2940,19 +2907,21 @@ dnl Variables.
 
 dnl Value label that names numeric and string variables.
 3; 1; s8 "xyzzy"; i8 3; s7 "one"; 4; 2; >>1; 2<<;
+
+dnl End of dictionary.
+999; 0;
 ])
 for variant in \
        "be 7577c456726a88f52bbef63a8b47bf1a" \
        "le 3ba5c6af9ad0ae5cc88f9f63e726e414"
 do
   set $variant
-  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
-])
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [ignore])
   AT_DATA([sys-file.sps], [dnl
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl
-"error: `sys-file.sav' near offset 0x118: Variables associated with value 
label are not all of identical type.  Variable STR1 is string, but variable 
NUM1 is numeric."
+"error: `sys-file.sav' near offset 0xf4: Variables associated with value label 
are not all of identical type.  Variable STR1 is string, but variable NUM1 is 
numeric."
 
 sys-file.sps:1: error: Stopping syntax file processing here to avoid a cascade 
of dependent command failures.
 ])
@@ -2988,9 +2957,9 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0x118: Duplicate value label for `xyzzy ' 
on STR1.
+warning: `sys-file.sav' near offset 0xf4: Duplicate value label for `xyzzy ' 
on STR1.
 
-warning: `sys-file.sav' near offset 0x140: Duplicate value label for 1 on NUM1.
+warning: `sys-file.sav' near offset 0x11c: Duplicate value label for 1 on NUM1.
 ])
 done
 AT_CLEANUP
@@ -3030,9 +2999,9 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xe6: Error parsing attribute value 
Attr1[[1]].
+warning: `sys-file.sav' near offset 0xdf: Error parsing attribute value 
Attr1[[1]].
 
-warning: `sys-file.sav' near offset 0x109: Error parsing attribute value 
fred[[2]].
+warning: `sys-file.sav' near offset 0x102: Error parsing attribute value 
fred[[2]].
 ])
 done
 AT_CLEANUP
@@ -3073,9 +3042,9 @@ do
 GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0xed: Attribute value Attr1[[1]] is not 
quoted: value.
+warning: `sys-file.sav' near offset 0xe4: Attribute value Attr1[[1]] is not 
quoted: value.
 
-warning: `sys-file.sav' near offset 0x10f: Attribute value fred[[1]] is not 
quoted: 23.
+warning: `sys-file.sav' near offset 0x106: Attribute value fred[[1]] is not 
quoted: 23.
 ])
 done
 AT_CLEANUP
@@ -3128,15 +3097,15 @@ do
   AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
 ])
   AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl
-warning: `sys-file.sav' near offset 0x130: Ignoring long string value record 
for unknown variable STR9.
+warning: `sys-file.sav' near offset 0x128: Ignoring long string value record 
for unknown variable STR9.
 
-warning: `sys-file.sav' near offset 0x16c: Ignoring long string value record 
for numeric variable NUM1.
+warning: `sys-file.sav' near offset 0x164: Ignoring long string value record 
for numeric variable NUM1.
 
-warning: `sys-file.sav' near offset 0x19b: Ignoring long string value record 
for variable STR14 because the record's width (9) does not match the variable's 
width (14).
+warning: `sys-file.sav' near offset 0x193: Ignoring long string value record 
for variable STR14 because the record's width (9) does not match the variable's 
width (14).
 
-"warning: `sys-file.sav' near offset 0x1dc: Ignoring long string value 0 for 
variable STR14, with width 14, that has bad value width 9."
+"warning: `sys-file.sav' near offset 0x1d4: Ignoring long string value 0 for 
variable str14, with width 14, that has bad value width 9."
 
-warning: `sys-file.sav' near offset 0x289: Duplicate value label for 
`abcdefghijklmn' on STR14.
+warning: `sys-file.sav' near offset 0x259: Duplicate value label for 
`abcdefghijklmn' on str14.
 ])
 done
 AT_CLEANUP
diff --git a/tests/language/dictionary/sys-file-info.at b/tests/language/dictionary/sys-file-info.at
index 14a53fa..923c3a2 100644
--- a/tests/language/dictionary/sys-file-info.at
+++ b/tests/language/dictionary/sys-file-info.at
@@ -17,7 +17,8 @@ AT_CHECK(
   [sed -e '/^Created:,/d' \
        -e '/^Endian:,/d' \
        -e '/^Integer Format:,/d' \
-       -e '/^Real Format:,/d' pspp.csv],
+       -e '/^Real Format:,/d' \
+       -e '/^Charset:,/d' pspp.csv],
   [0], [dnl
 Table: Reading free-form data from INLINE.
 Variable,Format
@@ -31,7 +32,6 @@ Cases:,3
 Type:,System File
 Weight:,Not weighted.
 Mode:,Compression on.
-Charset:,Unknown
 
 Variable,Description,,Position
 x,Format: F8.2,,1
-- 
1.7.2.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]