Re: strread.m

octave-maintainers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: strread.m

From:	John W. Eaton
Subject:	Re: strread.m
Date:	Thu, 4 Aug 2011 11:27:42 -0400

On  3-Aug-2011, Philip Nienhuis wrote:

| > I will probably try to write textscan in C++.  It's up to you whether
| > you want to continue fixing problems in strread, but given the
| 
| Do you have a time schedule in mind?
| That would help me make a better decision of what to do.

I started working on it yesterday.  So far I've only implemented the
part that decodes the format.  I'll try for at least some of the
conversions today.  Then I may need help in figuring out how to
properly return the variables that are read from the file.  Then we
will also need to handle the parameter/value options.

The diffs below are what I have now.  You can do things like

  fid = fopen ("any-existing-file");
  xtextscan (fid, "any format here for testing")

and xtextscan will display the components of the format.

jwe

# HG changeset patch
# User John W. Eaton <address@hidden>
# Date 1312471485 14400
# Node ID 5860b88c35c5cdb5d81d9e78a9f3ff4033326004
# Parent  61906c0d1e9bce0b98d3e05a571549598eaaf99e
rewrite textscan in C++

* file-io.cc (Fxtextscan): New function.
* oct-stream.h, oct-stream.cc (textscan_format_elt,
textscan_format_list): New classes.
(octave_base_stream::do_textscan, octave_base_stream::textscan,
octave_stream::textscan): New functions.

diff --git a/src/file-io.cc b/src/file-io.cc
--- a/src/file-io.cc
+++ b/src/file-io.cc
@@ -1292,6 +1292,37 @@
   return Ffscanf (tmp_args, nargout);
 }
 
+// Temporary entry point for the C++ rewrite of textscan.  Looks up the
+// stream named by the first argument, passes the format (second
+// argument) to octave_stream::textscan and, when no error occurred,
+// returns the scanned value followed by the current stream position.
+DEFUN (xtextscan, args, ,
+  "-*- texinfo -*-\n\
+@deftypefn  {Built-in Function} {[@var{c}, @var{pos}] =} textscan (@var{fid}, @var{template}, @var{ntimes}, @var{param}, @var{val}, @dots{})\n\
+@end deftypefn")
+{
+  octave_value_list retval;
+
+  // FID and TEMPLATE are both indexed below, so refuse short argument
+  // lists up front instead of reading past the end of ARGS.
+  if (args.length () < 2)
+    {
+      print_usage ();
+      return retval;
+    }
+
+  octave_stream os = octave_stream_list::lookup (args(0), "textscan");
+
+  if (! error_state)
+    {
+      if (args(1).is_string ())
+        {
+          // NTIMES is not taken from ARGS yet; -1 is simply passed
+          // through to the stream.
+          octave_idx_type ntimes = -1;
+
+          octave_value tmp = os.textscan (args(1), ntimes);
+
+          if (! error_state)
+            {
+              // FIXME -- warn if stream is not opened in binary mode?
+              retval(1) = os.tell ();
+              retval(0) = tmp;
+            }
+        }
+      else
+        print_usage ();
+    }
+
+  return retval;
+}
+
 static octave_value
 do_fread (octave_stream& os, const octave_value& size_arg,
           const octave_value& prec_arg, const octave_value& skip_arg,
diff --git a/src/oct-stream.cc b/src/oct-stream.cc
--- a/src/oct-stream.cc
+++ b/src/oct-stream.cc
@@ -34,7 +34,8 @@
 #include <sstream>
 #include <string>
 
-#include <Array.h>
+#include "Array.h"
+#include "Array.cc"
 
 #include "byte-swap.h"
 #include "lo-ieee.h"
@@ -573,6 +574,368 @@
 
 // Ugh again.
 
+// Build the element list for the textscan format string S.
+//
+// S is split, left to right, into three kinds of elements: `%'
+// conversions (handled by process_conversion), runs of whitespace
+// (each run becomes a single " " element), and runs of literal text
+// (everything up to the next whitespace character or `%').  Parsing
+// stops early, with NCONV negative, if a conversion is rejected.
+textscan_format_list::textscan_format_list (const std::string& s)
+  : nconv (0), curr_idx (0), list (dim_vector (16, 1)), buf (0)
+{
+  // Number of elements stored in LIST so far.  LIST starts out with
+  // room for 16 entries and is trimmed to this count at the end.
+  octave_idx_type num_elts = 0;
+
+  size_t n = s.length ();
+
+  size_t i = 0;
+
+  // Attributes of the element currently being assembled.
+  int width = 0;
+  int prec = 0;
+  int bitwidth = 0;
+  bool discard = false;
+  char type = '\0';
+
+  // TRUE while BUF may still hold text that has not been turned into
+  // a list element.
+  bool have_more = true;
+
+  while (i < n)
+    {
+      have_more = true;
+
+      // add_elt_to_list deletes BUF and resets it to 0 after each
+      // element, so a fresh buffer is needed for the next one.
+      if (! buf)
+        buf = new std::ostringstream ();
+
+      if (s[i] == '%')
+        {
+          // Process percent-escape conversion type.
+
+          process_conversion (s, i, n, width, prec, bitwidth,
+                              discard, type, num_elts);
+
+          // A non-null BUF here means the conversion was not flushed
+          // into the list.
+          have_more = (buf != 0);
+        }
+      else if (isspace (s[i]))
+        {
+          type = textscan_format_elt::whitespace_conversion;
+
+          width = 0;
+          prec = 0;
+          bitwidth = 0;
+          discard = false;
+          // Any amount of whitespace is stored as one blank.
+          *buf << " ";
+
+          while (++i < n && isspace (s[i]))
+            /* skip whitespace */;
+
+          add_elt_to_list (width, prec, bitwidth, discard, type, num_elts);
+
+          have_more = false;
+        }
+      else
+        {
+          type = textscan_format_elt::literal_conversion;
+
+          width = 0;
+          prec = 0;
+          bitwidth = 0;
+          discard = false;
+
+          // Literal text runs up to the next whitespace or `%'.
+          while (i < n && ! isspace (s[i]) && s[i] != '%')
+            *buf << s[i++];
+
+          add_elt_to_list (width, prec, bitwidth, discard, type, num_elts);
+
+          have_more = false;
+        }
+
+      // A negative conversion count marks the format as invalid; stop
+      // parsing and do not flush whatever is left in BUF.
+      if (nconv < 0)
+        {
+          have_more = false;
+          break;
+        }
+    }
+
+  // Flush text that is still pending in BUF.
+  if (have_more)
+    add_elt_to_list (width, prec, bitwidth, discard, type, num_elts);
+
+  // Drop the unused tail of the over-allocated list.
+  list.resize (dim_vector (num_elts, 1));
+
+  delete buf;
+}
+
+// Release every element pointer stored in the list.
+textscan_format_list::~textscan_format_list (void)
+{
+  const octave_idx_type count = list.length ();
+
+  octave_idx_type idx = 0;
+
+  while (idx < count)
+    {
+      delete list(idx);
+      idx++;
+    }
+}
+
+// Turn the text accumulated in BUF into a new list element carrying
+// the given attributes, then dispose of BUF.  Nothing happens when
+// there is no buffer; an empty buffer is discarded without adding an
+// element.  NUM_ELTS is incremented for each element stored.
+void
+textscan_format_list::add_elt_to_list (int width, int prec, int bitwidth,
+                                       bool discard, char type,
+                                       octave_idx_type& num_elts,
+                                       const std::string& char_class)
+{
+  if (! buf)
+    return;
+
+  const std::string pending = buf->str ();
+
+  if (! pending.empty ())
+    {
+      textscan_format_elt *new_elt
+        = new textscan_format_elt (pending.c_str (), width, prec, bitwidth,
+                                   discard, type, char_class);
+
+      // Double the storage whenever it is full.
+      if (list.length () == num_elts)
+        list.resize (dim_vector (2 * num_elts, 1));
+
+      list(num_elts) = new_elt;
+      num_elts++;
+    }
+
+  delete buf;
+  buf = 0;
+}
+
+// Parse a single `%' conversion of the format string S.
+//
+// On entry I indexes the `%' character and BUF is non-null.  The
+// optional `*' (discard) flag, the field width and the `.precision'
+// are consumed here; the conversion character itself is handed to
+// finish_conversion, which appends the finished element to the list.
+// WIDTH, PREC, BITWIDTH, DISCARD and TYPE are reset and then filled
+// in for this conversion, and I is advanced past everything consumed.
+//
+// Integer (`d', `u') and floating point (`f') conversions may carry a
+// size suffix -- 8, 16, 32 or 64 for integers, 32 or 64 for floating
+// point -- which selects BITWIDTH (default 32 for integers, 64 for
+// floating point).  The suffix is consumed but is not copied into the
+// element text, because finish_conversion closes the buffer right
+// after the conversion character.
+//
+// If the conversion is malformed, or S ends before a conversion
+// character is found, NCONV is set to -1.
+void
+textscan_format_list::process_conversion (const std::string& s, size_t& i,
+                                          size_t n, int& width, int& prec,
+                                          int& bitwidth, bool& discard,
+                                          char& type, octave_idx_type& num_elts)
+{
+  width = 0;
+  prec = 0;
+  bitwidth = 0;
+  discard = false;
+  type = '\0';
+
+  // Copy the leading `%'.
+  *buf << s[i++];
+
+  bool have_width = false;
+
+  // Number of size-suffix characters (the "8" in "%d8", the "32" in
+  // "%f32") that follow the conversion character.  Declared outside
+  // the switch so that no case label jumps over its initialization.
+  size_t suffix_len = 0;
+
+  while (i < n)
+    {
+      switch (s[i])
+        {
+        case '*':
+          // Only one discard flag is allowed.
+          if (discard)
+            nconv = -1;
+          else
+            {
+              discard = true;
+              *buf << s[i++];
+            }
+          break;
+
+        case '0': case '1': case '2': case '3': case '4':
+        case '5': case '6': case '7': case '8': case '9':
+          // Only one width specification is allowed.
+          if (have_width)
+            nconv = -1;
+          else
+            {
+              char c = s[i++];
+              width = width * 10 + c - '0';
+              have_width = true;
+              *buf << c;
+              while (i < n && isdigit (s[i]))
+                {
+                  c = s[i++];
+                  width = width * 10 + c - '0';
+                  *buf << c;
+                }
+
+              // Optional `.precision' directly after the width.
+              if (i < n && s[i] == '.')
+                {
+                  *buf << s[i++];
+                  while (i < n && isdigit (s[i]))
+                    {
+                      c = s[i++];
+                      prec = prec * 10 + c - '0';
+                      *buf << c;
+                    }
+                }
+            }
+          break;
+
+        case 'd': case 'u':
+          // S[I] is still the conversion character, so the size
+          // suffix, if any, starts at S[I+1].
+          bitwidth = 32;
+
+          if (i+1 < n && s[i+1] == '8')
+            {
+              bitwidth = 8;
+              suffix_len = 1;
+            }
+          else if (i+2 < n && s[i+1] == '1' && s[i+2] == '6')
+            {
+              bitwidth = 16;
+              suffix_len = 2;
+            }
+          else if (i+2 < n && s[i+1] == '3' && s[i+2] == '2')
+            suffix_len = 2;
+          else if (i+2 < n && s[i+1] == '6' && s[i+2] == '4')
+            {
+              bitwidth = 64;
+              suffix_len = 2;
+            }
+          goto fini;
+
+        case 'f':
+          bitwidth = 64;
+
+          if (i+2 < n && s[i+1] == '3' && s[i+2] == '2')
+            {
+              bitwidth = 32;
+              suffix_len = 2;
+            }
+          else if (i+2 < n && s[i+1] == '6' && s[i+2] == '4')
+            suffix_len = 2;
+          goto fini;
+
+        case 'n':
+          bitwidth = 64;
+          goto fini;
+
+        case 's': case 'q': case '%': case '[':
+          goto fini;
+
+        fini:
+          {
+            if (finish_conversion (s, i, n, width, prec, bitwidth,
+                                   discard, type, num_elts) == 0)
+              {
+                // finish_conversion consumed only the conversion
+                // character; step over the size suffix so that it is
+                // not parsed again as literal text.
+                i += suffix_len;
+                return;
+              }
+          }
+          break;
+
+        default:
+          nconv = -1;
+          break;
+        }
+
+      if (nconv < 0)
+        break;
+    }
+
+  // Reached only when no conversion was completed: either an error
+  // was flagged above or S ended in the middle of a conversion.
+  nconv = -1;
+}
+
+// Complete the conversion whose conversion character is S[I].
+//
+// Copies the conversion character (and, for `[', the whole bracketed
+// character class up to the closing `]') into BUF, sets TYPE, counts
+// the conversion in NCONV unless it is a literal `%', and appends the
+// finished element to the list.  For a `[^...]' class TYPE is set to
+// `^'.  WIDTH, PREC, BITWIDTH and DISCARD are passed on to the new
+// element unchanged.
+//
+// Returns 0 on success.  If a `[' class is not terminated, NCONV is
+// set to -1, nothing is added to the list, and -1 is returned.
+int
+textscan_format_list::finish_conversion (const std::string& s, size_t& i,
+                                         size_t n, int& width, int& prec,
+                                         int& bitwidth, bool discard,
+                                         char& type, octave_idx_type& num_elts)
+{
+  int retval = 0;
+
+  std::string char_class;
+
+  // First and last index of the characters between the brackets of a
+  // `[' conversion (after any leading `^').
+  size_t beg_idx = std::string::npos;
+  size_t end_idx = std::string::npos;
+
+  if (s[i] == '%')
+    {
+      type = '%';
+      *buf << s[i++];
+    }
+  else
+    {
+      type = s[i];
+
+      if (s[i] == '[')
+        {
+          *buf << s[i++];
+
+          if (i < n)
+            {
+              beg_idx = i;
+
+              if (s[i] == '^')
+                {
+                  type = '^';
+                  *buf << s[i++];
+
+                  if (i < n)
+                    {
+                      beg_idx = i;
+
+                      // A `]' right after `[^' belongs to the class.
+                      if (s[i] == ']')
+                        *buf << s[i++];
+                    }
+                }
+              // A `]' right after `[' belongs to the class.
+              else if (s[i] == ']')
+                *buf << s[i++];
+            }
+
+          while (i < n && s[i] != ']')
+            *buf << s[i++];
+
+          if (i < n && s[i] == ']')
+            {
+              end_idx = i-1;
+              *buf << s[i++];
+            }
+
+          if (s[i-1] != ']')
+            retval = nconv = -1;
+        }
+      else
+        *buf << s[i++];
+
+      // Count the conversion only if it is valid.  Incrementing
+      // unconditionally would turn the -1 error marker set above back
+      // into 0 and let a broken element into the list.
+      if (retval == 0)
+        nconv++;
+    }
+
+  if (nconv >= 0)
+    {
+      if (beg_idx != std::string::npos && end_idx != std::string::npos)
+        char_class = expand_char_class (s.substr (beg_idx,
+                                                  end_idx - beg_idx + 1));
+
+      add_elt_to_list (width, prec, bitwidth, discard, type,
+                       num_elts, char_class);
+    }
+
+  return retval;
+}
+
+// Debugging aid: write the fields of every element in the list to
+// std::cerr, one block of lines per element.
+void
+textscan_format_list::printme (void) const
+{
+  const octave_idx_type count = list.length ();
+
+  for (octave_idx_type k = 0; k < count; k++)
+    {
+      const textscan_format_elt *e = list(k);
+
+      std::cerr << "width:      " << e->width << "\n";
+      std::cerr << "digits      " << e->digits << "\n";
+      std::cerr << "bitwidth:   " << e->bitwidth << "\n";
+      std::cerr << "discard:    " << e->discard << "\n";
+      std::cerr << "type:       ";
+
+      // The two special element kinds are shown by name, real
+      // conversions by their conversion character.
+      switch (e->type)
+        {
+        case textscan_format_elt::literal_conversion:
+          std::cerr << "literal text\n";
+          break;
+
+        case textscan_format_elt::whitespace_conversion:
+          std::cerr << "whitespace\n";
+          break;
+
+        default:
+          std::cerr << e->type << "\n";
+          break;
+        }
+
+      std::cerr << "char_class: `"
+                << undo_string_escapes (e->char_class) << "'\n";
+      std::cerr << "text:       `"
+                << undo_string_escapes (e->text) << "'\n\n";
+    }
+}
+
+// And again.
+
 printf_format_list::printf_format_list (const std::string& s)
   : nconv (0), curr_idx (0), list (dim_vector (16, 1)), buf (0)
 {
@@ -2305,6 +2668,45 @@
   return retval;
 }
 
+// Placeholder for the actual scanning step.  For now it only reports
+// on std::cerr that it was reached and returns an empty matrix;
+// FMT_LIST and NTIMES are not used yet.
+octave_value
+octave_base_stream::do_textscan (textscan_format_list& fmt_list,
+                                 octave_idx_type ntimes)
+{
+  std::cerr << "textscan: this is when the conversion would happen"
+            << std::endl;
+
+  return octave_value (Matrix ());
+}
+
+// Parse the format FMT and, if it is valid, run do_textscan on it.
+// The parsed format is always dumped with printme first.  Returns an
+// empty matrix when the stream has no input side, when the format is
+// invalid, or when an error is already pending.
+octave_value
+octave_base_stream::textscan (const std::string& fmt, octave_idx_type ntimes)
+{
+  octave_value retval = Matrix ();
+
+  if (! input_stream ())
+    {
+      invalid_operation ("textscan", "reading");
+      return retval;
+    }
+
+  textscan_format_list fmt_list (fmt);
+
+  fmt_list.printme ();
+
+  if (fmt_list.num_conversions () == -1)
+    ::error ("textscan: invalid format specified");
+  else if (! error_state)
+    retval = do_textscan (fmt_list, ntimes);
+
+  return retval;
+}
+
 // Functions that are defined for all output streams (output streams
 // are those that define os).
 
@@ -3860,6 +4262,41 @@
   return retval;
 }
 
+// Forward to the underlying stream representation when the stream is
+// usable; otherwise return a default-constructed octave_value.
+octave_value
+octave_stream::textscan (const std::string& fmt, octave_idx_type ntimes)
+{
+  return stream_ok () ? rep->textscan (fmt, ntimes) : octave_value ();
+}
+
+// Accept the format as an octave_value.  It must be a string; escape
+// sequences in a single-quoted string are processed before the format
+// is handed to the std::string overload.  A non-string format raises
+// an error on the stream and yields an empty matrix.
+octave_value
+octave_stream::textscan (const octave_value& fmt, octave_idx_type ntimes)
+{
+  if (! fmt.is_string ())
+    {
+      // Note that this is not ::error () !
+
+      error ("textscan: format must be a string");
+
+      return octave_value (Matrix ());
+    }
+
+  std::string sfmt = fmt.string_value ();
+
+  if (fmt.is_sq_string ())
+    sfmt = do_string_escapes (sfmt);
+
+  return textscan (sfmt, ntimes);
+}
+
 int
 octave_stream::printf (const std::string& fmt, const octave_value_list& args,
                        const std::string& who)
diff --git a/src/oct-stream.h b/src/oct-stream.h
--- a/src/oct-stream.h
+++ b/src/oct-stream.h
@@ -184,6 +184,155 @@
 };
 
 class
+OCTINTERP_API
+textscan_format_elt
+{
+public:
+
+  enum special_conversion
+    {
+      whitespace_conversion = 1,
+      literal_conversion = 2
+    };
+
+  textscan_format_elt (const char *txt = 0, int w = 0, int d = 0,
+                       int bw = 0, bool dis = false, char typ = '\0',
+                       const std::string& ch_class = std::string ())
+    : text (strsave (txt)), width (w), digits (d), bitwidth (bw),
+      discard (dis), type (typ), char_class (ch_class) { }
+
+  textscan_format_elt (const textscan_format_elt& e)
+    : text (strsave (e.text)), width (e.width), digits (e.digits),
+      bitwidth (e.bitwidth), discard (e.discard), type (e.type),
+      char_class (e.char_class) { }
+
+  textscan_format_elt& operator = (const textscan_format_elt& e)
+    {
+      if (this != &e)
+        {
+          text = strsave (e.text);
+          width = e.width;
+          digits = e.digits;
+          bitwidth = e.bitwidth;
+          discard = e.discard;
+          type = e.type;
+          char_class = e.char_class;
+        }
+
+      return *this;
+    }
+
+  ~textscan_format_elt (void) { delete [] text; }
+
+  // The C-style format string.
+  const char *text;
+
+  // The maximum field width.
+  int width;
+
+  // The maximum number of digits to read after the decimal in a
+  // floating point conversion.
+  int digits;
+
+  // The size of the result.  For integers, bitwidth may be 8, 16, 34,
+  // or 64.  For floating point values, bitwidth may be 32 or 64.
+  int bitwidth;
+
+  // TRUE if we are not storing the result of this conversion.
+  bool discard;
+
+  // Type of conversion -- `d', `u', `f', `n', `s', `q', `c', `%', or `['.
+  char type;
+
+  // The class of characters in a `[' format.
+  std::string char_class;
+};
+
+// The parsed form of a textscan format string: an ordered list of
+// textscan_format_elt objects plus a cursor (CURR_IDX) for walking
+// through them.  The list holds pointers to its elements and deletes
+// them in its destructor; copying is disabled.
+class
+OCTINTERP_API
+textscan_format_list
+{
+public:
+
+  // Parse FMT into a list of elements.
+  textscan_format_list (const std::string& fmt = std::string ());
+
+  ~textscan_format_list (void);
+
+  // Number of conversions found, or -1 if the format is invalid.
+  octave_idx_type num_conversions (void) { return nconv; }
+
+  // The length can be different than the number of conversions,
+  // because runs of whitespace and runs of literal text in the format
+  // are stored as elements of their own.
+
+  octave_idx_type length (void) { return list.length (); }
+
+  // Move the cursor back to the first element and return it.
+  const textscan_format_elt *first (void)
+    {
+      curr_idx = 0;
+      return current ();
+    }
+
+  // Element under the cursor, or 0 if the list is empty.
+  const textscan_format_elt *current (void) const
+    { return list.length () > 0 ? list.elem (curr_idx) : 0; }
+
+  // Advance the cursor and return the element it now points to.  Past
+  // the last element the cursor wraps around to the first one when
+  // CYCLE is true; otherwise 0 is returned.
+  const textscan_format_elt *next (bool cycle = true)
+    {
+      curr_idx++;
+
+      if (curr_idx >= list.length ())
+        {
+          if (cycle)
+            curr_idx = 0;
+          else
+            return 0;
+        }
+      return current ();
+    }
+
+  // Dump all elements to std::cerr (debugging aid).
+  void printme (void) const;
+
+  // TRUE unless an invalid conversion was found while parsing.
+  bool ok (void) const { return (nconv >= 0); }
+
+  operator bool () const { return ok (); }
+
+private:
+
+  // Number of conversions specified by this format string, or -1 if
+  // invalid conversions have been found.
+  octave_idx_type nconv;
+
+  // Index to current element;
+  octave_idx_type curr_idx;
+
+  // FIXME -- maybe LIST should be a std::list object?
+  // List of format elements.
+  Array<textscan_format_elt*> list;
+
+  // Temporary buffer.
+  std::ostringstream *buf;
+
+  // Append the text collected in BUF to the list as a new element
+  // with the given attributes, then delete BUF.
+  void add_elt_to_list (int width, int digits, int bitwidth, bool discard,
+                        char type, octave_idx_type& num_elts,
+                        const std::string& char_class = std::string ());
+
+  // Parse the `%' conversion of S that starts at index I.
+  void process_conversion (const std::string& s, size_t& i, size_t n,
+                           int& width, int& digits, int& bitwidth,
+                           bool& discard, char& type,
+                           octave_idx_type& num_elts);
+
+  // Handle the conversion character at S[I] and add the finished
+  // element to the list.  Returns 0 on success.
+  int finish_conversion (const std::string& s, size_t& i, size_t n,
+                         int& width, int& digits, int& bitwidth,
+                         bool discard, char& type,
+                         octave_idx_type& num_elts);
+  // No copying!
+
+  textscan_format_list (const textscan_format_list&);
+
+  textscan_format_list& operator = (const textscan_format_list&);
+};
+
+class
 printf_format_elt
 {
 public:
@@ -468,6 +617,11 @@
   octave_value_list oscanf (const std::string& fmt,
                             const std::string& who /* = "scanf" */);
 
+  octave_value do_textscan (textscan_format_list& fmt_list,
+                            octave_idx_type ntimes);
+
+  octave_value textscan (const std::string& fmt, octave_idx_type ntimes);
+
   // Functions that are defined for all output streams (output streams
   // are those that define os).
 
@@ -558,6 +712,10 @@
   octave_value_list oscanf (const octave_value& fmt,
                             const std::string& who /* = "scanf" */);
 
+  octave_value textscan (const std::string& fmt, octave_idx_type ntimes);
+
+  octave_value textscan (const octave_value& fmt, octave_idx_type ntimes);
+
   int printf (const std::string& fmt, const octave_value_list& args,
               const std::string& who /* = "printf" */);

[Prev in Thread]

Current Thread

[Next in Thread]

Re: Release goals for 3.6, (continued)
- Re: Release goals for 3.6, Konstantinos Poulios, 2011/08/03

Prev by Date: Re: sqrtm test failure
Next by Date: cellfun benchmarks
Previous by thread: Re: strread.m
Next by thread: xtextscan [WAS: Re: strread.m]
Index(es):
- Date
- Thread