bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

new modules for Unicode character case mappings


From: Bruno Haible
Subject: new modules for Unicode character case mappings
Date: Sun, 8 Feb 2009 21:37:10 +0100
User-agent: KMail/1.9.9

Hi,

Mike Gran wrote:
> As far as I can tell, the ISO C towupper will probably work correctly on
> UCS-4 characters created by Gnulib functions like u32_conv_from_enc.  But,
> it seems that it isn't guaranteed to do so, since wint_t is not required to
> be UCS-4.  (I don't have a counterexample: I'm just going from what I've
> read.)

This is correct: wchar_t is not required to be Unicode, and is not Unicode,
for example on Solaris and FreeBSD. So, you cannot use towupper() on Unicode
characters in portable code.

Even on systems where wchar_t is UCS-4 (such as glibc systems), the towupper()
function supports only those characters that map to multibyte characters in
the current locale, and not all Unicode characters. (Example: It will not
handle the Polish ł character when operating in a German ISO-8859-15 locale.)

> If that is true, then it would be neat to have u32_toupper and u32_tolower 
> functions.  

You are right. I'm adding the functions uc_toupper, uc_tolower, and uc_totitle.
These are the right functions to use for Unicode characters.

Note, however, that it is better to use case mapping functions that operate on
an entire string; this is the only way to handle German or Lithuanian
specialities correctly. These functions are already declared in gnulib's
"unicase.h", but not yet implemented as of today.



2009-02-08  Bruno Haible  <address@hidden>

        New module 'unicase/totitle'.
        * modules/unicase/totitle: New file.
        * lib/unicase/totitle.c: New file.

        New module 'unicase/tolower'.
        * modules/unicase/tolower: New file.
        * lib/unicase/tolower.c: New file.

        New module 'unicase/toupper'.
        * modules/unicase/toupper: New file.
        * lib/unicase/toupper.c: New file.
        * lib/unicase/simple-mapping.h: New file.

        * lib/gen-uni-tables.c (output_simple_mapping_test): New function.
        (mapping_table): New structure.
        (output_simple_mapping): New function.
        (main): Invoke output_simple_mapping_test and output_simple_mapping.
        * modules/gen-uni-tables (Description): Update.
        * lib/unicase/toupper.h: New file, automatically generated by
        gen-uni-tables.
        * lib/unicase/tolower.h: New file, automatically generated by
        gen-uni-tables.
        * lib/unicase/totitle.h: New file, automatically generated by
        gen-uni-tables.
        * tests/unicase/test-uc_toupper.c: New file, automatically generated by
        gen-uni-tables.
        * tests/unicase/test-uc_tolower.c: New file, automatically generated by
        gen-uni-tables.
        * tests/unicase/test-uc_totitle.c: New file, automatically generated by
        gen-uni-tables.

        New module 'unicase/base'.
        * modules/unicase/base: New file.
        * lib/unicase.h: New file.

================================ lib/unicase.h ================================
/* Unicode character case mappings.
   Copyright (C) 2002, 2009 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#ifndef _UNICASE_H
#define _UNICASE_H

#include "unitypes.h"

/* Get size_t.  */
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* ========================================================================= */

/* Character case mappings.
   These mappings are locale and context independent.
   WARNING! These functions are not sufficient for languages such as German.
   Better use the functions below that treat an entire string at once and are
   language aware.  */

/* Return the uppercase mapping of the Unicode character UC.
   Only UC itself is considered: the mapping is independent of locale and
   context.
   NOTE(review): presumably returns UC unchanged when it has no uppercase
   mapping -- not stated here; confirm against the implementation.  */
extern ucs4_t
       uc_toupper (ucs4_t uc);

/* Return the lowercase mapping of the Unicode character UC.
   Only UC itself is considered: the mapping is independent of locale and
   context.
   NOTE(review): presumably returns UC unchanged when it has no lowercase
   mapping -- not stated here; confirm against the implementation.  */
extern ucs4_t
       uc_tolower (ucs4_t uc);

/* Return the titlecase mapping of the Unicode character UC.
   Only UC itself is considered: the mapping is independent of locale and
   context.
   NOTE(review): presumably returns UC unchanged when it has no titlecase
   mapping -- not stated here; confirm against the implementation.  */
extern ucs4_t
       uc_totitle (ucs4_t uc);

/* ========================================================================= */

/* String case mappings.  */

/* These functions are locale dependent.  The iso639_language argument
   identifies the language (e.g. "tr" for Turkish).  NULL means to use
   locale independent case mappings.  */

/* Return the ISO 639 language code of the current locale.
   Return "" if it is unknown, or in the "C" locale.
   Takes no arguments; the language is derived from the current locale only.
   NOTE(review): the result looks intended to be passed as the
   iso639_language argument of the string case mapping functions -- confirm.
   NOTE(review): ownership of the returned string is not stated here;
   presumably the caller must not modify or free it -- confirm.  */
extern const char *
       uc_locale_language (void);

/* Return the uppercase mapping of a string.
   One variant is declared per unit type: uint8_t (u8_), uint16_t (u16_)
   and uint32_t (u32_).
   ISO639_LANGUAGE identifies the language whose case mapping rules apply
   (e.g. "tr" for Turkish); NULL selects locale independent case mappings.
   NOTE(review): S and N are presumably the input string and its length in
   units, and RESULTBUF/LENGTHP presumably a caller-supplied result buffer
   and an in/out length, with a null return signalling failure -- none of
   this is stated in this header; confirm against the implementation.  */
extern uint8_t *
       u8_toupper (const uint8_t *s, size_t n, const char *iso639_language, 
uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_toupper (const uint16_t *s, size_t n, const char *iso639_language, 
uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_toupper (const uint32_t *s, size_t n, const char *iso639_language, 
uint32_t *resultbuf, size_t *lengthp);

/* Return the lowercase mapping of a string.
   One variant is declared per unit type: uint8_t (u8_), uint16_t (u16_)
   and uint32_t (u32_).
   ISO639_LANGUAGE identifies the language whose case mapping rules apply
   (e.g. "tr" for Turkish); NULL selects locale independent case mappings.
   NOTE(review): S and N are presumably the input string and its length in
   units, and RESULTBUF/LENGTHP presumably a caller-supplied result buffer
   and an in/out length, with a null return signalling failure -- none of
   this is stated in this header; confirm against the implementation.  */
extern uint8_t *
       u8_tolower (const uint8_t *s, size_t n, const char *iso639_language, 
uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_tolower (const uint16_t *s, size_t n, const char *iso639_language, 
uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_tolower (const uint32_t *s, size_t n, const char *iso639_language, 
uint32_t *resultbuf, size_t *lengthp);

/* Return the titlecase mapping of a string.
   One variant is declared per unit type: uint8_t (u8_), uint16_t (u16_)
   and uint32_t (u32_).
   ISO639_LANGUAGE identifies the language whose case mapping rules apply
   (e.g. "tr" for Turkish); NULL selects locale independent case mappings.
   NOTE(review): S and N are presumably the input string and its length in
   units, and RESULTBUF/LENGTHP presumably a caller-supplied result buffer
   and an in/out length, with a null return signalling failure -- none of
   this is stated in this header; confirm against the implementation.  */
extern uint8_t *
       u8_totitle (const uint8_t *s, size_t n, const char *iso639_language, 
uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_totitle (const uint16_t *s, size_t n, const char *iso639_language, 
uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_totitle (const uint32_t *s, size_t n, const char *iso639_language, 
uint32_t *resultbuf, size_t *lengthp);

/* Return the case folded string.
   One variant is declared per unit type: uint8_t (u8_), uint16_t (u16_)
   and uint32_t (u32_).  These functions take no iso639_language argument.
   NOTE(review): S and N are presumably the input string and its length in
   units, and RESULTBUF/LENGTHP presumably a caller-supplied result buffer
   and an in/out length, with a null return signalling failure -- none of
   this is stated in this header; confirm against the implementation.  */
extern uint8_t *
       u8_casefold (const uint8_t *s, size_t n, uint8_t *resultbuf, size_t 
*lengthp);
extern uint16_t *
       u16_casefold (const uint16_t *s, size_t n, uint16_t *resultbuf, size_t 
*lengthp);
extern uint32_t *
       u32_casefold (const uint32_t *s, size_t n, uint32_t *resultbuf, size_t 
*lengthp);

/* Compare S1 and S2, ignoring case.
   Return -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2.
   One variant is declared per unit type: uint8_t (u8_), uint16_t (u16_)
   and uint32_t (u32_).  Neither input string is modified (both are const).
   No failure convention is documented for these functions.
   NOTE(review): N1 and N2 are presumably the lengths of S1 and S2 in
   units -- not stated here; confirm against the implementation.  */
extern int
       u8_casecmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2);
extern int
       u16_casecmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t 
n2);
extern int
       u32_casecmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t 
n2);

/* Compare S1 and S2 using the collation rules of the current locale,
   ignoring case.
   Return -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2.
   Upon failure, set errno and return any value.
   Because any value may be returned upon failure, the return value alone
   cannot signal an error.
   NOTE(review): callers presumably need to set errno to 0 before the call
   and inspect it afterwards to detect failure -- confirm.
   One variant is declared per unit type: uint8_t (u8_), uint16_t (u16_)
   and uint32_t (u32_).
   NOTE(review): N1 and N2 are presumably the lengths of S1 and S2 in
   units -- not stated here; confirm against the implementation.  */
extern int
       u8_casecoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2);
extern int
       u16_casecoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t 
n2);
extern int
       u32_casecoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t 
n2);

/* ========================================================================= */

#ifdef __cplusplus
}
#endif

#endif /* _UNICASE_H */




reply via email to

[Prev in Thread] Current Thread [Next in Thread]