bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v4] unistr: New modules for backward iteration in string.


From: Ben Pfaff
Subject: [PATCH v4] unistr: New modules for backward iteration in string.
Date: Thu, 18 Sep 2014 15:53:15 -0700

New module 'unistr/u8-mb-prev-uc'.
* lib/unistr.in.h (u8_mb_prev_uc): New declaration.
(u8_mb_prev_uc_aux): New declaration.
* lib/unistr/u8-mb-prev-uc.c: New file.
* lib/unistr/u8-mb-prev-uc-aux.c: New file.
* tests/unistr/test-u8-mb-prev-uc.c: New file.
* modules/unistr/u8-mb-prev-uc: New file.
* modules/unistr/u8-mb-prev-uc-tests: New file.

New module 'unistr/u16-mb-prev-uc'.
* lib/unistr.in.h (u16_mb_prev_uc): New declaration.
(u16_mb_prev_uc_aux): New declaration.
* lib/unistr/u16-mb-prev-uc.c: New file.
* lib/unistr/u16-mb-prev-uc-aux.c: New file.
* tests/unistr/test-u16-mb-prev-uc.c: New file.
* modules/unistr/u16-mb-prev-uc: New file.
* modules/unistr/u16-mb-prev-uc-tests: New file.

New module 'unistr/u32-mb-prev-uc'.
* lib/unistr.in.h (u32_mb_prev_uc): New declaration.
* lib/unistr/u32-mb-prev-uc.c: New file.
* tests/unistr/test-u32-mb-prev-uc.c: New file.
* modules/unistr/u32-mb-prev-uc: New file.
* modules/unistr/u32-mb-prev-uc-tests: New file.
---
v1->v2: Revised based on Bruno Haible's feedback.
v2->v3: Rebase only.
v3->v4: Changed the code to always be "safe".  It looks to me like the
  "unsafe" version that I had written originally reflected a misunderstanding 
of how the gnulib option for that was supposed to work.

 ChangeLog                           |  27 ++++
 lib/unistr.in.h                     |  71 ++++++++++
 lib/unistr/u16-mb-prev-uc-aux.c     |  52 +++++++
 lib/unistr/u16-mb-prev-uc.c         |  62 +++++++++
 lib/unistr/u32-mb-prev-uc.c         |  43 ++++++
 lib/unistr/u8-mb-prev-uc-aux.c      | 131 +++++++++++++++++
 lib/unistr/u8-mb-prev-uc.c          | 142 +++++++++++++++++++
 modules/unistr/u16-mb-prev-uc       |  28 ++++
 modules/unistr/u16-mb-prev-uc-tests |  12 ++
 modules/unistr/u32-mb-prev-uc       |  27 ++++
 modules/unistr/u32-mb-prev-uc-tests |  12 ++
 modules/unistr/u8-mb-prev-uc        |  28 ++++
 modules/unistr/u8-mb-prev-uc-tests  |  14 ++
 tests/unistr/test-u16-mb-prev-uc.c  |  89 ++++++++++++
 tests/unistr/test-u32-mb-prev-uc.c  |  89 ++++++++++++
 tests/unistr/test-u8-mb-prev-uc.c   | 270 ++++++++++++++++++++++++++++++++++++
 16 files changed, 1097 insertions(+)
 create mode 100644 lib/unistr/u16-mb-prev-uc-aux.c
 create mode 100644 lib/unistr/u16-mb-prev-uc.c
 create mode 100644 lib/unistr/u32-mb-prev-uc.c
 create mode 100644 lib/unistr/u8-mb-prev-uc-aux.c
 create mode 100644 lib/unistr/u8-mb-prev-uc.c
 create mode 100644 modules/unistr/u16-mb-prev-uc
 create mode 100644 modules/unistr/u16-mb-prev-uc-tests
 create mode 100644 modules/unistr/u32-mb-prev-uc
 create mode 100644 modules/unistr/u32-mb-prev-uc-tests
 create mode 100644 modules/unistr/u8-mb-prev-uc
 create mode 100644 modules/unistr/u8-mb-prev-uc-tests
 create mode 100644 tests/unistr/test-u16-mb-prev-uc.c
 create mode 100644 tests/unistr/test-u32-mb-prev-uc.c
 create mode 100644 tests/unistr/test-u8-mb-prev-uc.c

diff --git a/ChangeLog b/ChangeLog
index 2da7d9b..8c7ba46 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,30 @@
+2011-01-01  Ben Pfaff  <address@hidden>
+
+       New module 'unistr/u8-mb-prev-uc'.
+       * lib/unistr.in.h (u8_mb_prev_uc): New declaration.
+       (u8_mb_prev_uc_aux): New declaration.
+       * lib/unistr/u8-mb-prev-uc.c: New file.
+       * lib/unistr/u8-mb-prev-uc-aux.c: New file.
+       * tests/unistr/test-u8-mb-prev-uc.c: New file.
+       * modules/unistr/u8-mb-prev-uc: New file.
+       * modules/unistr/u8-mb-prev-uc-tests: New file.
+
+       New module 'unistr/u16-mb-prev-uc'.
+       * lib/unistr.in.h (u16_mb_prev_uc): New declaration.
+       (u16_mb_prev_uc_aux): New declaration.
+       * lib/unistr/u16-mb-prev-uc.c: New file.
+       * lib/unistr/u16-mb-prev-uc-aux.c: New file.
+       * tests/unistr/test-u16-mb-prev-uc.c: New file.
+       * modules/unistr/u16-mb-prev-uc: New file.
+       * modules/unistr/u16-mb-prev-uc-tests: New file.
+
+       New module 'unistr/u32-mb-prev-uc'.
+       * lib/unistr.in.h (u32_mb_prev_uc): New declaration.
+       * lib/unistr/u32-mb-prev-uc.c: New file.
+       * tests/unistr/test-u32-mb-prev-uc.c: New file.
+       * modules/unistr/u32-mb-prev-uc: New file.
+       * modules/unistr/u32-mb-prev-uc-tests: New file.
+
 2014-09-05  Mathieu Anquetin  <address@hidden>
 
        Trivial change.
diff --git a/lib/unistr.in.h b/lib/unistr.in.h
index 73d2c23..41078cc 100644
--- a/lib/unistr.in.h
+++ b/lib/unistr.in.h
@@ -300,6 +300,77 @@ extern int
        u32_mbtoucr (ucs4_t *puc, const uint32_t *s, size_t n);
 #endif
 
+/* Return the length (number of units) of the last character in S, putting
+   its 'ucs4_t' representation in *PUC.  Upon failure, *PUC is set to 0xfffd,
+   and an appropriate number of units is returned.
+   The number of available units, N, must be > 0.  */
+
+#if GNULIB_UNISTR_U8_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n);
+# else
+extern int
+       u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n);
+static inline int
+u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c = s[n - 1];
+
+  if (c < 0x80)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u8_mb_prev_uc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U16_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n);
+# else
+extern int
+       u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n);
+static inline int
+u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = s[n - 1];
+
+  if (c < 0xd800 || c >= 0xe000)
+    {
+      *puc = c;
+      return 1;
+    }
+  else
+    return u16_mb_prev_uc_aux (puc, s, n);
+}
+# endif
+#endif
+
+#if GNULIB_UNISTR_U32_MB_PREV_UC || HAVE_LIBUNISTRING
+# if !HAVE_INLINE
+extern int
+       u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n);
+# else
+static inline int
+u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n _GL_UNUSED_PARAMETER)
+{
+  uint32_t c = s[n - 1];
+
+  if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+    *puc = c;
+  else
+    /* invalid multibyte character */
+    *puc = 0xfffd;
+  return 1;
+}
+# endif
+#endif
+
 /* Put the multibyte character represented by UC in S, returning its
    length.  Return -1 upon failure, -2 if the number of available units, N,
    is too small.  The latter case cannot occur if N >= 6/2/1, respectively.  */
diff --git a/lib/unistr/u16-mb-prev-uc-aux.c b/lib/unistr/u16-mb-prev-uc-aux.c
new file mode 100644
index 0000000..eeab787
--- /dev/null
+++ b/lib/unistr/u16-mb-prev-uc-aux.c
@@ -0,0 +1,52 @@
+/* Look at last character in UTF-16 string.
+   Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2011,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "unistr.h"
+
+#if defined IN_LIBUNISTRING || HAVE_INLINE
+
+int
+u16_mb_prev_uc_aux (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = s[n - 1];
+
+  if (c >= 0xdc00)
+    {
+      if (n >= 2)
+        {
+          if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00)
+            {
+              *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00);
+              return 2;
+            }
+          /* invalid multibyte character */
+        }
+      else
+        {
+          /* incomplete multibyte character */
+        }
+    }
+  /* invalid multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u16-mb-prev-uc.c b/lib/unistr/u16-mb-prev-uc.c
new file mode 100644
index 0000000..3511666
--- /dev/null
+++ b/lib/unistr/u16-mb-prev-uc.c
@@ -0,0 +1,62 @@
+/* Look at last character in UTF-16 string.
+   Copyright (C) 1999-2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2011,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u16_mb_prev_uc as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification.  */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u16_mb_prev_uc (ucs4_t *puc, const uint16_t *s, size_t n)
+{
+  uint16_t c = s[n - 1];
+
+  if (c < 0xd800 || c >= 0xe000)
+    {
+      *puc = c;
+      return 1;
+    }
+  if (c >= 0xdc00)
+    {
+      if (n >= 2)
+        {
+          if (s[n - 2] >= 0xd800 && s[n - 2] < 0xdc00)
+            {
+              *puc = 0x10000 + ((s[n - 2] - 0xd800) << 10) + (c - 0xdc00);
+              return 2;
+            }
+          /* invalid multibyte character */
+        }
+      else
+        {
+          /* incomplete multibyte character */
+        }
+    }
+  /* invalid multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u32-mb-prev-uc.c b/lib/unistr/u32-mb-prev-uc.c
new file mode 100644
index 0000000..398827b
--- /dev/null
+++ b/lib/unistr/u32-mb-prev-uc.c
@@ -0,0 +1,43 @@
+/* Look at last character in UTF-32 string.
+   Copyright (C) 2002, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+   Written by Bruno Haible <address@hidden>, 2002.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u32_mb_prev_uc as 'extern', not 'static inline'. */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification.  */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u32_mb_prev_uc (ucs4_t *puc, const uint32_t *s, size_t n)
+{
+  uint32_t c = s[n - 1];
+
+  if (c < 0xd800 || (c >= 0xe000 && c < 0x110000))
+    *puc = c;
+  else
+    /* invalid multibyte character */
+    *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u8-mb-prev-uc-aux.c b/lib/unistr/u8-mb-prev-uc-aux.c
new file mode 100644
index 0000000..1af912d
--- /dev/null
+++ b/lib/unistr/u8-mb-prev-uc-aux.c
@@ -0,0 +1,131 @@
+/* Look at last character in UTF-8 string.
+   Copyright (C) 2001-2002, 2006-2007, 2009-2011, 2014 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2010,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "unistr.h"
+
+#if defined IN_LIBUNISTRING || HAVE_INLINE
+
+int
+u8_mb_prev_uc_aux (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c_1 = s[n - 1];
+
+  /* The #if 1'd blocks below are code that could be deleted if one decided to
+     build an unsafe variant of this function. */
+
+#if 1
+  if (c_1 <= 0xbf)
+#endif
+    {
+      if (n >= 2)
+        {
+          uint8_t c_2 = s[n - 2];
+
+          if ((c_2 ^ 0x80) >= 0x40)
+            {
+#if 1
+              if (c_2 >= 0xc2 && c_2 < 0xe0)
+#endif
+                {
+                  *puc = ((unsigned int) (c_2 & 0x1f) << 6)
+                         | (unsigned int) (c_1 ^ 0x80);
+                  return 2;
+                }
+#if 1
+              if (c_2 >= 0xe0 && c_2 < 0xf8)
+                {
+                  /* incomplete multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
+                }
+#endif
+            }
+          else if (n >= 3)
+            {
+              uint8_t c_3 = s[n - 3];
+
+              if ((c_3 ^ 0x80) >= 0x40)
+                {
+#if 1
+                  if ((c_3 == 0xe0 && c_2 >= 0xa0)
+                      || (c_3 >= 0xe1 && c_3 < 0xed)
+                      || (c_3 == 0xed && c_2 < 0xa0)
+                      || (c_3 >= 0xee && c_3 < 0xf0))
+#endif
+                    {
+                      *puc = ((unsigned int) (c_3 & 0x0f) << 12)
+                             | (unsigned int) ((c_2 ^ 0x80) << 6)
+                             | (unsigned int) (c_1 ^ 0x80);
+                      return 3;
+                    }
+#if 1
+                  if (c_3 >= 0xe0 && c_3 < 0xf8)
+                    {
+                      /* 0xe0: overlong sequence.
+                         0xe1...0xec: not reached.
+                         0xed: UTF-16 surrogate.
+                         0xee...0xef: not reached.
+                         0xf0...0xf7: incomplete multibyte character. */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+#endif
+                }
+              else if (n >= 4)
+                {
+                  uint8_t c_4 = s[n - 4];
+
+                  if ((c_4 ^ 0x80) >= 0x40)
+                    {
+#if 1
+                      if ((c_4 == 0xf0 && c_3 >= 0x90)
+                          || (c_4 >= 0xf1 && c_4 < 0xf4)
+                          || (c_4 == 0xf4 && c_3 < 0x90))
+#endif
+                        {
+                          *puc = (unsigned int) ((c_4 & 0x07) << 18)
+                                 | (unsigned int) ((c_3 ^ 0x80) << 12)
+                                 | (unsigned int) ((c_2 ^ 0x80) << 6)
+                                 | (unsigned int) (c_1 ^ 0x80);
+                          return 4;
+                        }
+#if 1
+                      if (c_4 >= 0xf0 && c_4 < 0xf8)
+                        {
+                          /* 0xf0: overlong sequence.
+                             0xf1...0xf3: not reached.
+                             0xf4...0xf7: invalid code point above U+10FFFF */
+                          *puc = 0xfffd;
+                          return 4;
+                        }
+#endif
+                    }
+                }
+            }
+        }
+    }
+
+  /* invalid or incomplete multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/lib/unistr/u8-mb-prev-uc.c b/lib/unistr/u8-mb-prev-uc.c
new file mode 100644
index 0000000..86cbd73
--- /dev/null
+++ b/lib/unistr/u8-mb-prev-uc.c
@@ -0,0 +1,142 @@
+/* Look at last character in UTF-8 string.
+   Copyright (C) 2001-2002, 2006-2007, 2009-2011, 2014 Free Software Foundation, Inc.
+   Written by Ben Pfaff <address@hidden>, 2010,
+   based on code by Bruno Haible <address@hidden>, 2001.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#if defined IN_LIBUNISTRING
+/* Tell unistr.h to declare u8_mb_prev_uc as 'extern', not 'static inline'.  */
+# include "unistring-notinline.h"
+#endif
+
+/* Specification.  */
+#include "unistr.h"
+
+#if !HAVE_INLINE
+
+int
+u8_mb_prev_uc (ucs4_t *puc, const uint8_t *s, size_t n)
+{
+  uint8_t c_1 = s[n - 1];
+
+  if (c_1 < 0x80)
+    {
+      *puc = c_1;
+      return 1;
+    }
+
+  /* The #if 1'd blocks below are code that could be deleted if one decided to
+     build an unsafe variant of this function. */
+
+#if 1
+  if (c_1 <= 0xbf)
+#endif
+    {
+      if (n >= 2)
+        {
+          uint8_t c_2 = s[n - 2];
+
+          if ((c_2 ^ 0x80) >= 0x40)
+            {
+#if 1
+              if (c_2 >= 0xc2 && c_2 < 0xe0)
+#endif
+                {
+                  *puc = ((unsigned int) (c_2 & 0x1f) << 6)
+                         | (unsigned int) (c_1 ^ 0x80);
+                  return 2;
+                }
+#if 1
+              if (c_2 >= 0xe0 && c_2 < 0xf8)
+                {
+                  /* incomplete multibyte character */
+                  *puc = 0xfffd;
+                  return 2;
+                }
+#endif
+            }
+          else if (n >= 3)
+            {
+              uint8_t c_3 = s[n - 3];
+
+              if ((c_3 ^ 0x80) >= 0x40)
+                {
+#if 1
+                  if ((c_3 == 0xe0 && c_2 >= 0xa0)
+                      || (c_3 >= 0xe1 && c_3 < 0xed)
+                      || (c_3 == 0xed && c_2 < 0xa0)
+                      || (c_3 >= 0xee && c_3 < 0xf0))
+#endif
+                    {
+                      *puc = ((unsigned int) (c_3 & 0x0f) << 12)
+                             | (unsigned int) ((c_2 ^ 0x80) << 6)
+                             | (unsigned int) (c_1 ^ 0x80);
+                      return 3;
+                    }
+#if 1
+                  if (c_3 >= 0xe0 && c_3 < 0xf8)
+                    {
+                      /* 0xe0: overlong sequence.
+                         0xe1...0xec: not reached.
+                         0xed: UTF-16 surrogate.
+                         0xee...0xef: not reached.
+                         0xf0...0xf7: incomplete multibyte character. */
+                      *puc = 0xfffd;
+                      return 3;
+                    }
+#endif
+                }
+              else if (n >= 4)
+                {
+                  uint8_t c_4 = s[n - 4];
+
+                  if ((c_4 ^ 0x80) >= 0x40)
+                    {
+#if 1
+                      if ((c_4 == 0xf0 && c_3 >= 0x90)
+                          || (c_4 >= 0xf1 && c_4 < 0xf4)
+                          || (c_4 == 0xf4 && c_3 < 0x90))
+#endif
+                        {
+                          *puc = (unsigned int) ((c_4 & 0x07) << 18)
+                                 | (unsigned int) ((c_3 ^ 0x80) << 12)
+                                 | (unsigned int) ((c_2 ^ 0x80) << 6)
+                                 | (unsigned int) (c_1 ^ 0x80);
+                          return 4;
+                        }
+#if 1
+                      if (c_4 >= 0xf0 && c_4 < 0xf8)
+                        {
+                          /* 0xf0: overlong sequence.
+                             0xf1...0xf3: not reached.
+                             0xf4...0xf7: invalid code point above U+10FFFF */
+                          *puc = 0xfffd;
+                          return 4;
+                        }
+#endif
+                    }
+                }
+            }
+        }
+    }
+
+  /* invalid or incomplete multibyte character */
+  *puc = 0xfffd;
+  return 1;
+}
+
+#endif
diff --git a/modules/unistr/u16-mb-prev-uc b/modules/unistr/u16-mb-prev-uc
new file mode 100644
index 0000000..508fc72
--- /dev/null
+++ b/modules/unistr/u16-mb-prev-uc
@@ -0,0 +1,28 @@
+Description:
+Look at last character in UTF-16 string.
+
+Files:
+lib/unistr/u16-mb-prev-uc.c
+lib/unistr/u16-mb-prev-uc-aux.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u16-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u16-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U16_MB_PREV_UC
+lib_SOURCES += unistr/u16-mb-prev-uc.c unistr/u16-mb-prev-uc-aux.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u16-mb-prev-uc-tests b/modules/unistr/u16-mb-prev-uc-tests
new file mode 100644
index 0000000..a9f504f
--- /dev/null
+++ b/modules/unistr/u16-mb-prev-uc-tests
@@ -0,0 +1,12 @@
+Files:
+tests/unistr/test-u16-mb-prev-uc.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u16-mb-prev-uc
+check_PROGRAMS += test-u16-mb-prev-uc
+test_u16_mb_prev_uc_SOURCES = unistr/test-u16-mb-prev-uc.c
+test_u16_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u32-mb-prev-uc b/modules/unistr/u32-mb-prev-uc
new file mode 100644
index 0000000..ad7974a
--- /dev/null
+++ b/modules/unistr/u32-mb-prev-uc
@@ -0,0 +1,27 @@
+Description:
+Look at last character in UTF-32 string.
+
+Files:
+lib/unistr/u32-mb-prev-uc.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u32-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u32-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U32_MB_PREV_UC
+lib_SOURCES += unistr/u32-mb-prev-uc.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u32-mb-prev-uc-tests b/modules/unistr/u32-mb-prev-uc-tests
new file mode 100644
index 0000000..e1e45c8
--- /dev/null
+++ b/modules/unistr/u32-mb-prev-uc-tests
@@ -0,0 +1,12 @@
+Files:
+tests/unistr/test-u32-mb-prev-uc.c
+
+Depends-on:
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u32-mb-prev-uc
+check_PROGRAMS += test-u32-mb-prev-uc
+test_u32_mb_prev_uc_SOURCES = unistr/test-u32-mb-prev-uc.c
+test_u32_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/modules/unistr/u8-mb-prev-uc b/modules/unistr/u8-mb-prev-uc
new file mode 100644
index 0000000..2a12805
--- /dev/null
+++ b/modules/unistr/u8-mb-prev-uc
@@ -0,0 +1,28 @@
+Description:
+Look at last character in UTF-8 string.
+
+Files:
+lib/unistr/u8-mb-prev-uc.c
+lib/unistr/u8-mb-prev-uc-aux.c
+
+Depends-on:
+unistr/base
+
+configure.ac:
+gl_MODULE_INDICATOR([unistr/u8-mb-prev-uc])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-mb-prev-uc])
+
+Makefile.am:
+if LIBUNISTRING_COMPILE_UNISTR_U8_MB_PREV_UC
+lib_SOURCES += unistr/u8-mb-prev-uc.c unistr/u8-mb-prev-uc-aux.c
+endif
+
+Include:
+"unistr.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Ben Pfaff
+
diff --git a/modules/unistr/u8-mb-prev-uc-tests b/modules/unistr/u8-mb-prev-uc-tests
new file mode 100644
index 0000000..66a593a
--- /dev/null
+++ b/modules/unistr/u8-mb-prev-uc-tests
@@ -0,0 +1,14 @@
+Files:
+tests/unistr/test-u8-mb-prev-uc.c
+tests/macros.h
+
+Depends-on:
+unistr/u8-mbtouc
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-u8-mb-prev-uc
+check_PROGRAMS += test-u8-mb-prev-uc
+test_u8_mb_prev_uc_SOURCES = unistr/test-u8-mb-prev-uc.c
+test_u8_mb_prev_uc_LDADD = $(LDADD) $(LIBUNISTRING)
diff --git a/tests/unistr/test-u16-mb-prev-uc.c b/tests/unistr/test-u16-mb-prev-uc.c
new file mode 100644
index 0000000..7f85e98
--- /dev/null
+++ b/tests/unistr/test-u16-mb-prev-uc.c
@@ -0,0 +1,89 @@
+/* Test of u16_mb_prev_uc() function.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, 2011.  */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static void
+test_u16_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...)
+{
+  uint16_t s[16];
+  va_list args;
+  size_t n;
+
+  ucs4_t uc;
+  int len;
+
+  va_start (args, expect_uc);
+  n = 0;
+  for (;;)
+    {
+      int unit = va_arg (args, int);
+      if (unit == -1)
+        break;
+      else if (n >= sizeof s / sizeof *s)
+        abort ();
+
+      s[n++] = unit;
+    }
+  va_end (args);
+
+  len = u16_mb_prev_uc (&uc, s, n);
+  if (len != expect_len || uc != expect_uc)
+    {
+      size_t i;
+
+      fprintf (stderr, "u16_mb_prev_uc returned length %d and U+%04x, "
+               "expected length %d and U+%04x:",
+               len, (unsigned int) uc,
+               expect_len, (unsigned int) expect_uc);
+      for (i = 0; i < n; i++)
+        fprintf (stderr, " %04x", s[i]);
+      putc ('\n', stderr);
+      fflush (stderr);
+      abort ();
+    }
+}
+
+int
+main (void)
+{
+  /* Valid single-unit sequences. */
+  test_u16_mb_prev_uc (1, 'a',     'a', -1);
+  test_u16_mb_prev_uc (1, 0x3042,  0x3042, -1);
+  test_u16_mb_prev_uc (1, 'b',     'a', 'b', -1);
+  test_u16_mb_prev_uc (1, 'x',     0x3042, 'x', -1);
+
+  /* Valid surrogate pairs. */
+  test_u16_mb_prev_uc (2, 0x1f610, 0xd83d, 0xde10, -1);
+  test_u16_mb_prev_uc (2, 0x1f610, 'x', 0xd83d, 0xde10, -1);
+
+  /* Invalid surrogate pairs. */
+  test_u16_mb_prev_uc (1, 0xfffd,  0xd800, -1);
+  test_u16_mb_prev_uc (1, 0xfffd,  'a', 0xd800, -1);
+  test_u16_mb_prev_uc (1, 0xfffd,  0xdeff, -1);
+  test_u16_mb_prev_uc (1, 0xfffd,  'b', 0xdeff, -1);
+
+  return 0;
+}
diff --git a/tests/unistr/test-u32-mb-prev-uc.c b/tests/unistr/test-u32-mb-prev-uc.c
new file mode 100644
index 0000000..6666877
--- /dev/null
+++ b/tests/unistr/test-u32-mb-prev-uc.c
@@ -0,0 +1,89 @@
+/* Test of u32_mb_prev_uc() function.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, 2011.  */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static void
+test_u32_mb_prev_uc (int expect_len, ucs4_t expect_uc, ...)
+{
+  uint32_t s[16];
+  va_list args;
+  size_t n;
+
+  ucs4_t uc;
+  int len;
+
+  va_start (args, expect_uc);
+  n = 0;
+  for (;;)
+    {
+      int unit = va_arg (args, int);
+      if (unit == -1)
+        break;
+      else if (n >= sizeof s / sizeof *s)
+        abort ();
+
+      s[n++] = unit;
+    }
+  va_end (args);
+
+  len = u32_mb_prev_uc (&uc, s, n);
+  if (len != expect_len || uc != expect_uc)
+    {
+      size_t i;
+
+      fprintf (stderr, "u32_mb_prev_uc returned length %d and U+%04x, "
+               "expected length %d and U+%04x:",
+               len, (unsigned int) uc,
+               expect_len, (unsigned int) expect_uc);
+      for (i = 0; i < n; i++)
+        fprintf (stderr, " %04x", s[i]);
+      putc ('\n', stderr);
+      fflush (stderr);
+      abort ();
+    }
+}
+
+int
+main (void)
+{
+  /* Valid. */
+  test_u32_mb_prev_uc (1, 'a',     'a', -1);
+  test_u32_mb_prev_uc (1, 0x3042,  0x3042, -1);
+  test_u32_mb_prev_uc (1, 'b',     'a', 'b', -1);
+  test_u32_mb_prev_uc (1, 'x',     0x3042, 'x', -1);
+
+  /* Surrogate pairs are invalid in UTF-32. */
+  test_u32_mb_prev_uc (1, 0xfffd,  0xd83d, 0xde10, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  'x', 0xd83d, 0xde10, -1);
+
+  /* Malformed surrogate pairs are doubly invalid in UTF-32. */
+  test_u32_mb_prev_uc (1, 0xfffd,  0xd800, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  'a', 0xd800, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  0xdeff, -1);
+  test_u32_mb_prev_uc (1, 0xfffd,  'b', 0xdeff, -1);
+
+  return 0;
+}
diff --git a/tests/unistr/test-u8-mb-prev-uc.c b/tests/unistr/test-u8-mb-prev-uc.c
new file mode 100644
index 0000000..59d9a3c
--- /dev/null
+++ b/tests/unistr/test-u8-mb-prev-uc.c
@@ -0,0 +1,270 @@
+/* Test of u8_mb_prev_uc() function.
+   Copyright (C) 2010, 2011, 2014 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, 2010.  */
+
+#include <config.h>
+
+#include "unistr.h"
+
+#include <assert.h>
+
+#include "macros.h"
+
+/* One character as decoded by u8_mbtouc() or u8_mb_prev_uc(), together
+   with the UTF-8 units it was decoded from.  Elements point into the
+   string under test rather than holding a copy of it. */
+struct uc
+  {
+    const uint8_t *s;   /* First unit of the character. */
+    int n;              /* Number of units, as returned by the decoder. */
+    ucs4_t uc;          /* Code point stored by the decoder. */
+  };
+
+/* Writes TITLE and a colon to stderr, followed by each of the N elements of
+   UC formatted as " <xx xx ...> U+XXXX" (its units in hexadecimal, then its
+   code point), and ends the line. */
+static void
+print_ucs (const char *title, const struct uc *uc, size_t n)
+{
+  fprintf (stderr, "%s:", title);
+  for (; n > 0; n--, uc++)
+    {
+      size_t ofs;
+      fprintf (stderr, " <");
+      for (ofs = 0; ofs < uc->n; ofs++)
+        {
+          if (ofs != 0)
+            putc (' ', stderr);
+          fprintf (stderr, "%02x", (unsigned int) uc->s[ofs]);
+        }
+      fprintf (stderr, "> U+%04X", (unsigned int) uc->uc);
+    }
+  putc ('\n', stderr);
+}
+
+/* Reverses UC[0] through UC[N - 1] in place by swapping elements pairwise,
+   working from the two ends toward the middle. */
+static void
+reverse_ucs (struct uc *uc, size_t n)
+{
+  size_t lo, hi;
+
+  for (lo = 0, hi = n; lo + 1 < hi; lo++)
+    {
+      struct uc saved = uc[lo];
+      uc[lo] = uc[--hi];
+      uc[hi] = saved;
+    }
+}
+
+/* Returns true if A and B hold the same number of elements and each pair
+   has the same unit count, the same start pointer, and the same code point. */
+static bool
+equal_ucs (const struct uc *a, size_t n_a, const struct uc *b, size_t n_b)
+{
+  bool same = n_a == n_b;
+  for (; same && n_a > 0; n_a--, a++, b++)
+    same = a->n == b->n && a->s == b->s && a->uc == b->uc;
+  return same;
+}
+
+/* Decodes the N units in S front to back with u8_mbtouc() and back to front
+   with u8_mb_prev_uc(), and aborts with a diagnostic on stderr unless both
+   passes yield the same characters at the same positions. */
+static void
+check_bidirectionally (const uint8_t *s, int n)
+{
+  struct uc ucf[16];
+  struct uc ucr[16];
+  int n_ucf = 0;
+  int n_ucr = 0;
+  int used;
+
+  /* Each character uses at least one unit, so N bounds both counts.  Use
+     ASSERT like the checks below: assert() is compiled out under NDEBUG. */
+  ASSERT (n <= SIZEOF (ucf));
+  ASSERT (n <= SIZEOF (ucr));
+
+  /* Forward pass: decode at the start of the units not yet used. */
+  for (used = 0; used < n; n_ucf++)
+    {
+      struct uc *uc = &ucf[n_ucf];
+      uc->s = s + used;
+      uc->n = u8_mbtouc (&uc->uc, uc->s, n - used);
+      ASSERT (uc->n >= 1);
+      ASSERT (uc->n <= n - used);
+      used += uc->n;
+    }
+
+  /* Backward pass: decode at the end of the units not yet used, then put
+     the results into forward order so that they can be compared. */
+  for (used = 0; used < n; n_ucr++)
+    {
+      struct uc *uc = &ucr[n_ucr];
+      uc->n = u8_mb_prev_uc (&uc->uc, s, n - used);
+      ASSERT (uc->n >= 1);
+      ASSERT (uc->n <= n - used);
+      used += uc->n;
+      uc->s = s + (n - used);
+    }
+  reverse_ucs (ucr, n_ucr);
+
+  if (!equal_ucs (ucf, n_ucf, ucr, n_ucr))
+    {
+      fprintf (stderr, "%s:%d: forward and reverse differ\n",
+               __FILE__, __LINE__);
+      print_ucs ("forward", ucf, n_ucf);
+      print_ucs ("reverse", ucr, n_ucr);
+      fflush (stderr);
+      abort ();
+    }
+}
+
+/* Stores every combination of the units listed below into the N units at S
+   and passes each resulting string, which begins at START, to
+   check_bidirectionally().  When N is 0 nothing is stored and the string
+   from START up to S is checked as it stands. */
+static void
+do_exhaustive_test (const uint8_t *start, uint8_t *s, int n)
+{
+  static const uint8_t units[] = {
+    /* The smallest value in each class (any other member would do). */
+    0x00, 0x80, 0x90, 0xa0, 0xc0, 0xc2, 0xe0, 0xe1, 0xed, 0xee, 0xf0, 0xf1,
+    0xf4, 0xf5,
+    /* The units of U+FFFD, a special value for these routines. */
+    0xef, 0xbf, 0xbd
+  };
+  size_t i;
+
+  if (n <= 0)
+    check_bidirectionally (start, s - start);
+  else
+    for (i = 0; i < SIZEOF (units); i++)
+      {
+        s[0] = units[i];
+        do_exhaustive_test (start, s + 1, n - 1);
+      }
+}
+
+/* Compares u8_mbtouc() with u8_mb_prev_uc() on every string of at most
+   MAX_LENGTH units built from the representative units that
+   do_exhaustive_test() lists.  Units in the same class are all treated the
+   same way, so one member per class is enough to cover every well-formed and
+   ill-formed sequence in a reasonable amount of time. */
+static void
+exhaustive_test (int max_length)
+{
+  uint8_t s[16];
+  int len = 0;
+
+  assert (max_length <= SIZEOF (s));
+  while (len <= max_length)
+    do_exhaustive_test (s, s, len++);
+}
+
+/* Fills the N units at S with every concatenation of the sample characters
+   below that is exactly N units long and, for each one, passes the string
+   that begins at START and ends with those units to
+   check_bidirectionally().  Samples are tried in table order; a sample that
+   is longer than N units is skipped. */
+static void
+do_well_formed_test (const uint8_t *start, uint8_t *s, int n)
+{
+  /* Characters to build strings from, given as a length in units followed
+     by the units themselves.  Trailing comments give the code point that
+     each sequence encodes. */
+  static const struct
+    {
+      int len;
+      uint8_t units[4];
+    }
+  samples[] =
+    {
+      /* Single-byte characters. */
+      { 1, { 0x00 } },                          /* U+0000 */
+      { 1, { 0x41 } },                          /* U+0041 */
+
+      /* 2-byte characters. */
+      { 2, { 0xc2, 0xb0 } },                    /* U+00B0 */
+
+      /* 3-byte characters. */
+      { 3, { 0xe0, 0xa0, 0xa5 } },              /* U+0825 */
+      { 3, { 0xe5, 0xbf, 0x81 } },              /* U+5FC1 */
+      { 3, { 0xed, 0x9f, 0x99 } },              /* U+D7D9 */
+
+      /* 4-byte characters. */
+      { 4, { 0xf0, 0x90, 0xbb, 0x80 } },        /* U+10EC0 */
+      { 4, { 0xf2, 0x80, 0xbf, 0x80 } },        /* U+80FC0 */
+      { 4, { 0xf4, 0x8f, 0x80, 0xbf } }         /* U+10F03F */
+    };
+  size_t i;
+
+  /* N counts the units still to be filled, so it cannot be negative. */
+  ASSERT (n >= 0);
+
+  /* No units are left to fill, so the string is complete: check it. */
+  if (n == 0)
+    {
+      check_bidirectionally (start, s - start);
+      return;
+    }
+
+  /* Otherwise try each sample in turn as the next character. */
+  for (i = 0; i < SIZEOF (samples); i++)
+    {
+      int len = samples[i].len;
+      int ofs;
+
+      /* This character does not fit in the units that remain. */
+      if (len > n)
+        continue;
+
+      /* Store the character at S.  The recursive call only writes at and
+         beyond S + LEN, so these units stay in place while it fills the
+         rest of the string in every possible way. */
+      for (ofs = 0; ofs < len; ofs++)
+        s[ofs] = samples[i].units[ofs];
+      do_well_formed_test (start, s + len, n - len);
+    }
+}
+
+/* Runs do_well_formed_test() for every string length from 0 through
+   MAX_LENGTH units, which must not exceed the size of the local buffer. */
+static void
+well_formed_test (int max_length)
+{
+  uint8_t s[16];
+  int len = 0;
+
+  assert (max_length <= SIZEOF (s));
+  while (len <= max_length)
+    do_well_formed_test (s, s, len++);
+}
+
+int
+main (void)
+{
+  /* Maximum string lengths, in units.  exhaustive_test() gets exponentially
+     slower as its limit grows: 4 runs in a fraction of a second, 5 in a few
+     seconds, 6 in half a minute.  well_formed_test() also grows
+     exponentially, but much more slowly. */
+  enum { EXHAUSTIVE_MAX_LENGTH = 5, WELL_FORMED_MAX_LENGTH = 10 };
+
+  exhaustive_test (EXHAUSTIVE_MAX_LENGTH);
+  well_formed_test (WELL_FORMED_MAX_LENGTH);
+  return 0;
+}
-- 
1.9.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]