Unicode Manipulation

Unicode Manipulation — functions operating on Unicode characters and UTF-8 strings

Synopsis


#include <glib.h>

typedef             gunichar;
typedef             gunichar2;

gboolean            g_unichar_validate                  (gunichar ch);
gboolean            g_unichar_isalnum                   (gunichar c);
gboolean            g_unichar_isalpha                   (gunichar c);
gboolean            g_unichar_iscntrl                   (gunichar c);
gboolean            g_unichar_isdefined                 (gunichar c);
gboolean            g_unichar_isdigit                   (gunichar c);
gboolean            g_unichar_isgraph                   (gunichar c);
gboolean            g_unichar_islower                   (gunichar c);
gboolean            g_unichar_ismark                    (gunichar c);
gboolean            g_unichar_isprint                   (gunichar c);
gboolean            g_unichar_ispunct                   (gunichar c);
gboolean            g_unichar_isspace                   (gunichar c);
gboolean            g_unichar_istitle                   (gunichar c);
gboolean            g_unichar_isupper                   (gunichar c);
gboolean            g_unichar_isxdigit                  (gunichar c);
gboolean            g_unichar_iswide                    (gunichar c);
gboolean            g_unichar_iswide_cjk                (gunichar c);
gboolean            g_unichar_iszerowidth               (gunichar c);
gunichar            g_unichar_toupper                   (gunichar c);
gunichar            g_unichar_tolower                   (gunichar c);
gunichar            g_unichar_totitle                   (gunichar c);
gint                g_unichar_digit_value               (gunichar c);
gint                g_unichar_xdigit_value              (gunichar c);
enum                GUnicodeType;
GUnicodeType        g_unichar_type                      (gunichar c);
enum                GUnicodeBreakType;
GUnicodeBreakType   g_unichar_break_type                (gunichar c);
gint                g_unichar_combining_class           (gunichar uc);
void                g_unicode_canonical_ordering        (gunichar *string,
                                                         gsize len);
gunichar*           g_unicode_canonical_decomposition   (gunichar ch,
                                                         gsize *result_len);
gboolean            g_unicode_canonical_decomposition_to_buffer
                                                        (gunichar ch,
                                                         gunichar *out,
                                                         gsize out_len,
                                                         gsize *result_len);
gboolean            g_unichar_get_mirror_char           (gunichar ch,
                                                         gunichar *mirrored_ch);
enum                GUnicodeScript;
GUnicodeScript      g_unichar_get_script                (gunichar ch);

#define             g_utf8_next_char                    (p)
gunichar            g_utf8_get_char                     (const gchar *p);
gunichar            g_utf8_get_char_validated           (const gchar *p,
                                                         gssize max_len);
gchar*              g_utf8_offset_to_pointer            (const gchar *str,
                                                         glong offset);
glong               g_utf8_pointer_to_offset            (const gchar *str,
                                                         const gchar *pos);
gchar*              g_utf8_prev_char                    (const gchar *p);
gchar*              g_utf8_find_next_char               (const gchar *p,
                                                         const gchar *end);
gchar*              g_utf8_find_prev_char               (const gchar *str,
                                                         const gchar *p);
glong               g_utf8_strlen                       (const gchar *p,
                                                         gssize max);
gchar*              g_utf8_strncpy                      (gchar *dest,
                                                         const gchar *src,
                                                         gsize n);
gchar*              g_utf8_strchr                       (const gchar *p,
                                                         gssize len,
                                                         gunichar c);
gchar*              g_utf8_strrchr                      (const gchar *p,
                                                         gssize len,
                                                         gunichar c);
gchar*              g_utf8_strreverse                   (const gchar *str,
                                                         gssize len);
gboolean            g_utf8_validate                     (const gchar *str,
                                                         gssize max_len,
                                                         const gchar **end);

gchar*              g_utf8_strup                        (const gchar *str,
                                                         gssize len);
gchar*              g_utf8_strdown                      (const gchar *str,
                                                         gssize len);
gchar*              g_utf8_casefold                     (const gchar *str,
                                                         gssize len);
gchar*              g_utf8_normalize                    (const gchar *str,
                                                         gssize len,
                                                         GNormalizeMode mode);
enum                GNormalizeMode;
gint                g_utf8_collate                      (const gchar *str1,
                                                         const gchar *str2);
gchar*              g_utf8_collate_key                  (const gchar *str,
                                                         gssize len);
gchar*              g_utf8_collate_key_for_filename     (const gchar *str,
                                                         gssize len);

gunichar2*          g_utf8_to_utf16                     (const gchar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);
gunichar*           g_utf8_to_ucs4                      (const gchar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);
gunichar*           g_utf8_to_ucs4_fast                 (const gchar *str,
                                                         glong len,
                                                         glong *items_written);
gunichar*           g_utf16_to_ucs4                     (const gunichar2 *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);
gchar*              g_utf16_to_utf8                     (const gunichar2 *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);
gunichar2*          g_ucs4_to_utf16                     (const gunichar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);
gchar*              g_ucs4_to_utf8                      (const gunichar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);
gint                g_unichar_to_utf8                   (gunichar c,
                                                         gchar *outbuf);

Description

This section describes a number of functions for dealing with Unicode characters and strings. There are analogues of the traditional ctype.h character classification and case conversion functions, UTF-8 analogues of some string utility functions, functions to perform normalization, case conversion and collation on UTF-8 strings and finally functions to convert between the UTF-8, UTF-16 and UCS-4 encodings of Unicode.

The implementations of the Unicode functions in GLib are based on the Unicode Character Data tables, which are available from www.unicode.org. GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1.

Details

gunichar

typedef guint32 gunichar;

A type which can hold any UCS-4 character code.


gunichar2

typedef guint16 gunichar2;

A type which can hold any UTF-16 code point[3].


g_unichar_validate ()

gboolean            g_unichar_validate                  (gunichar ch);

Checks whether ch is a valid Unicode character. Some possible integer values of ch will not be valid. 0 is considered a valid character, though it's normally a string terminator.

ch : a Unicode character
Returns : TRUE if ch is a valid Unicode character

g_unichar_isalnum ()

gboolean            g_unichar_isalnum                   (gunichar c);

Determines whether a character is alphanumeric. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is an alphanumeric character

g_unichar_isalpha ()

gboolean            g_unichar_isalpha                   (gunichar c);

Determines whether a character is alphabetic (i.e. a letter). Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is an alphabetic character

g_unichar_iscntrl ()

gboolean            g_unichar_iscntrl                   (gunichar c);

Determines whether a character is a control character. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is a control character

g_unichar_isdefined ()

gboolean            g_unichar_isdefined                 (gunichar c);

Determines if a given character is assigned in the Unicode standard.

c : a Unicode character
Returns : TRUE if the character has an assigned value

g_unichar_isdigit ()

gboolean            g_unichar_isdigit                   (gunichar c);

Determines whether a character is numeric (i.e. a digit). This covers ASCII 0-9 and also digits in other languages/scripts. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is a digit

g_unichar_isgraph ()

gboolean            g_unichar_isgraph                   (gunichar c);

Determines whether a character is printable and not a space (returns FALSE for control characters, format characters, and spaces). g_unichar_isprint() is similar, but returns TRUE for spaces. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is printable unless it's a space

g_unichar_islower ()

gboolean            g_unichar_islower                   (gunichar c);

Determines whether a character is a lowercase letter. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is a lowercase letter

g_unichar_ismark ()

gboolean            g_unichar_ismark                    (gunichar c);

Determines whether a character is a mark (non-spacing mark, combining mark, or enclosing mark in Unicode speak). Given some UTF-8 text, obtain a character value with g_utf8_get_char().

Note: in most cases where isalpha characters are allowed, ismark characters should be allowed too, as they are essential for writing most European languages as well as many non-Latin scripts.

c : a Unicode character
Returns : TRUE if c is a mark character

Since 2.14


g_unichar_isprint ()

gboolean            g_unichar_isprint                   (gunichar c);

Determines whether a character is printable. Unlike g_unichar_isgraph(), returns TRUE for spaces. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is printable

g_unichar_ispunct ()

gboolean            g_unichar_ispunct                   (gunichar c);

Determines whether a character is punctuation or a symbol. Given some UTF-8 text, obtain a character value with g_utf8_get_char().

c : a Unicode character
Returns : TRUE if c is a punctuation or symbol character

g_unichar_isspace ()

gboolean            g_unichar_isspace                   (gunichar c);

Determines whether a character is a space, tab, or line separator (newline, carriage return, etc.). Given some UTF-8 text, obtain a character value with g_utf8_get_char().

(Note: don't use this to do word breaking; you have to use Pango or equivalent to get word breaking right, the algorithm is fairly complex.)

c : a Unicode character
Returns : TRUE if c is a space character

g_unichar_istitle ()

gboolean            g_unichar_istitle                   (gunichar c);

Determines if a character is titlecase. Some characters in Unicode which are composites, such as the DZ digraph, have three case variants instead of just two. The titlecase form is used at the beginning of a word where only the first letter is capitalized. The titlecase form of the DZ digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.

c : a Unicode character
Returns : TRUE if the character is titlecase

g_unichar_isupper ()

gboolean            g_unichar_isupper                   (gunichar c);

Determines if a character is uppercase.

c : a Unicode character
Returns : TRUE if c is an uppercase character

g_unichar_isxdigit ()

gboolean            g_unichar_isxdigit                  (gunichar c);

Determines if a character is a hexadecimal digit.

c : a Unicode character.
Returns : TRUE if the character is a hexadecimal digit

g_unichar_iswide ()

gboolean            g_unichar_iswide                    (gunichar c);

Determines if a character is typically rendered in a double-width cell.

c : a Unicode character
Returns : TRUE if the character is wide

g_unichar_iswide_cjk ()

gboolean            g_unichar_iswide_cjk                (gunichar c);

Determines if a character is typically rendered in a double-width cell under legacy East Asian locales. If a character is wide according to g_unichar_iswide(), then it is also reported wide with this function, but the converse is not necessarily true. See the Unicode Standard Annex 11 for details.

c : a Unicode character
Returns : TRUE if the character is wide in legacy East Asian locales

Since 2.12


g_unichar_iszerowidth ()

gboolean            g_unichar_iszerowidth               (gunichar c);

Determines if a given character typically takes zero width when rendered. The return value is TRUE for all non-spacing and enclosing marks (e.g., combining accents), format characters, zero-width space, but not U+00AD SOFT HYPHEN.

A typical use of this function is with one of g_unichar_iswide() or g_unichar_iswide_cjk() to determine the number of cells a string occupies when displayed on a grid display (terminals). However, note that not all terminals support zero-width rendering of zero-width marks.

c : a Unicode character
Returns : TRUE if the character has zero width

Since 2.14


g_unichar_toupper ()

gunichar            g_unichar_toupper                   (gunichar c);

Converts a character to uppercase.

c : a Unicode character
Returns : the result of converting c to uppercase. If c is not a lowercase or titlecase character, or has no uppercase equivalent, c is returned unchanged.

g_unichar_tolower ()

gunichar            g_unichar_tolower                   (gunichar c);

Converts a character to lower case.

c : a Unicode character.
Returns : the result of converting c to lower case. If c is not an uppercase or titlecase character, or has no lowercase equivalent, c is returned unchanged.

g_unichar_totitle ()

gunichar            g_unichar_totitle                   (gunichar c);

Converts a character to titlecase.

c : a Unicode character
Returns : the result of converting c to titlecase. If c is not an uppercase or lowercase character, c is returned unchanged.

g_unichar_digit_value ()

gint                g_unichar_digit_value               (gunichar c);

Determines the numeric value of a character as a decimal digit.

c : a Unicode character
Returns : If c is a decimal digit (according to g_unichar_isdigit()), its numeric value. Otherwise, -1.

g_unichar_xdigit_value ()

gint                g_unichar_xdigit_value              (gunichar c);

Determines the numeric value of a character as a hexadecimal digit.

c : a Unicode character
Returns : If c is a hex digit (according to g_unichar_isxdigit()), its numeric value. Otherwise, -1.

enum GUnicodeType

typedef enum
{
  G_UNICODE_CONTROL,
  G_UNICODE_FORMAT,
  G_UNICODE_UNASSIGNED,
  G_UNICODE_PRIVATE_USE,
  G_UNICODE_SURROGATE,
  G_UNICODE_LOWERCASE_LETTER,
  G_UNICODE_MODIFIER_LETTER,
  G_UNICODE_OTHER_LETTER,
  G_UNICODE_TITLECASE_LETTER,
  G_UNICODE_UPPERCASE_LETTER,
  G_UNICODE_COMBINING_MARK,
  G_UNICODE_ENCLOSING_MARK,
  G_UNICODE_NON_SPACING_MARK,
  G_UNICODE_DECIMAL_NUMBER,
  G_UNICODE_LETTER_NUMBER,
  G_UNICODE_OTHER_NUMBER,
  G_UNICODE_CONNECT_PUNCTUATION,
  G_UNICODE_DASH_PUNCTUATION,
  G_UNICODE_CLOSE_PUNCTUATION,
  G_UNICODE_FINAL_PUNCTUATION,
  G_UNICODE_INITIAL_PUNCTUATION,
  G_UNICODE_OTHER_PUNCTUATION,
  G_UNICODE_OPEN_PUNCTUATION,
  G_UNICODE_CURRENCY_SYMBOL,
  G_UNICODE_MODIFIER_SYMBOL,
  G_UNICODE_MATH_SYMBOL,
  G_UNICODE_OTHER_SYMBOL,
  G_UNICODE_LINE_SEPARATOR,
  G_UNICODE_PARAGRAPH_SEPARATOR,
  G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;

These are the possible character classifications from the Unicode specification. See http://www.unicode.org/Public/UNIDATA/UnicodeData.html.

G_UNICODE_CONTROL General category "Other, Control" (Cc)
G_UNICODE_FORMAT General category "Other, Format" (Cf)
G_UNICODE_UNASSIGNED General category "Other, Not Assigned" (Cn)
G_UNICODE_PRIVATE_USE General category "Other, Private Use" (Co)
G_UNICODE_SURROGATE General category "Other, Surrogate" (Cs)
G_UNICODE_LOWERCASE_LETTER General category "Letter, Lowercase" (Ll)
G_UNICODE_MODIFIER_LETTER General category "Letter, Modifier" (Lm)
G_UNICODE_OTHER_LETTER General category "Letter, Other" (Lo)
G_UNICODE_TITLECASE_LETTER General category "Letter, Titlecase" (Lt)
G_UNICODE_UPPERCASE_LETTER General category "Letter, Uppercase" (Lu)
G_UNICODE_COMBINING_MARK General category "Mark, Spacing Combining" (Mc)
G_UNICODE_ENCLOSING_MARK General category "Mark, Enclosing" (Me)
G_UNICODE_NON_SPACING_MARK General category "Mark, Nonspacing" (Mn)
G_UNICODE_DECIMAL_NUMBER General category "Number, Decimal Digit" (Nd)
G_UNICODE_LETTER_NUMBER General category "Number, Letter" (Nl)
G_UNICODE_OTHER_NUMBER General category "Number, Other" (No)
G_UNICODE_CONNECT_PUNCTUATION General category "Punctuation, Connector" (Pc)
G_UNICODE_DASH_PUNCTUATION General category "Punctuation, Dash" (Pd)
G_UNICODE_CLOSE_PUNCTUATION General category "Punctuation, Close" (Pe)
G_UNICODE_FINAL_PUNCTUATION General category "Punctuation, Final quote" (Pf)
G_UNICODE_INITIAL_PUNCTUATION General category "Punctuation, Initial quote" (Pi)
G_UNICODE_OTHER_PUNCTUATION General category "Punctuation, Other" (Po)
G_UNICODE_OPEN_PUNCTUATION General category "Punctuation, Open" (Ps)
G_UNICODE_CURRENCY_SYMBOL General category "Symbol, Currency" (Sc)
G_UNICODE_MODIFIER_SYMBOL General category "Symbol, Modifier" (Sk)
G_UNICODE_MATH_SYMBOL General category "Symbol, Math" (Sm)
G_UNICODE_OTHER_SYMBOL General category "Symbol, Other" (So)
G_UNICODE_LINE_SEPARATOR General category "Separator, Line" (Zl)
G_UNICODE_PARAGRAPH_SEPARATOR General category "Separator, Paragraph" (Zp)
G_UNICODE_SPACE_SEPARATOR General category "Separator, Space" (Zs)

g_unichar_type ()

GUnicodeType        g_unichar_type                      (gunichar c);

Classifies a Unicode character by type.

c : a Unicode character
Returns : the type of the character.

enum GUnicodeBreakType

typedef enum
{
  G_UNICODE_BREAK_MANDATORY,
  G_UNICODE_BREAK_CARRIAGE_RETURN,
  G_UNICODE_BREAK_LINE_FEED,
  G_UNICODE_BREAK_COMBINING_MARK,
  G_UNICODE_BREAK_SURROGATE,
  G_UNICODE_BREAK_ZERO_WIDTH_SPACE,
  G_UNICODE_BREAK_INSEPARABLE,
  G_UNICODE_BREAK_NON_BREAKING_GLUE,
  G_UNICODE_BREAK_CONTINGENT,
  G_UNICODE_BREAK_SPACE,
  G_UNICODE_BREAK_AFTER,
  G_UNICODE_BREAK_BEFORE,
  G_UNICODE_BREAK_BEFORE_AND_AFTER,
  G_UNICODE_BREAK_HYPHEN,
  G_UNICODE_BREAK_NON_STARTER,
  G_UNICODE_BREAK_OPEN_PUNCTUATION,
  G_UNICODE_BREAK_CLOSE_PUNCTUATION,
  G_UNICODE_BREAK_QUOTATION,
  G_UNICODE_BREAK_EXCLAMATION,
  G_UNICODE_BREAK_IDEOGRAPHIC,
  G_UNICODE_BREAK_NUMERIC,
  G_UNICODE_BREAK_INFIX_SEPARATOR,
  G_UNICODE_BREAK_SYMBOL,
  G_UNICODE_BREAK_ALPHABETIC,
  G_UNICODE_BREAK_PREFIX,
  G_UNICODE_BREAK_POSTFIX,
  G_UNICODE_BREAK_COMPLEX_CONTEXT,
  G_UNICODE_BREAK_AMBIGUOUS,
  G_UNICODE_BREAK_UNKNOWN,
  G_UNICODE_BREAK_NEXT_LINE,
  G_UNICODE_BREAK_WORD_JOINER,
  G_UNICODE_BREAK_HANGUL_L_JAMO,
  G_UNICODE_BREAK_HANGUL_V_JAMO,
  G_UNICODE_BREAK_HANGUL_T_JAMO,
  G_UNICODE_BREAK_HANGUL_LV_SYLLABLE,
  G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} GUnicodeBreakType;

These are the possible line break classifications. The five Hangul types were added in Unicode 4.1, so they were introduced in GLib 2.10. Note that new types may be added in the future. Applications should be ready to handle unknown values; they may be regarded as G_UNICODE_BREAK_UNKNOWN. See http://www.unicode.org/unicode/reports/tr14/.

G_UNICODE_BREAK_MANDATORY Mandatory Break (BK)
G_UNICODE_BREAK_CARRIAGE_RETURN Carriage Return (CR)
G_UNICODE_BREAK_LINE_FEED Line Feed (LF)
G_UNICODE_BREAK_COMBINING_MARK Attached Characters and Combining Marks (CM)
G_UNICODE_BREAK_SURROGATE Surrogates (SG)
G_UNICODE_BREAK_ZERO_WIDTH_SPACE Zero Width Space (ZW)
G_UNICODE_BREAK_INSEPARABLE Inseparable (IN)
G_UNICODE_BREAK_NON_BREAKING_GLUE Non-breaking ("Glue") (GL)
G_UNICODE_BREAK_CONTINGENT Contingent Break Opportunity (CB)
G_UNICODE_BREAK_SPACE Space (SP)
G_UNICODE_BREAK_AFTER Break Opportunity After (BA)
G_UNICODE_BREAK_BEFORE Break Opportunity Before (BB)
G_UNICODE_BREAK_BEFORE_AND_AFTER Break Opportunity Before and After (B2)
G_UNICODE_BREAK_HYPHEN Hyphen (HY)
G_UNICODE_BREAK_NON_STARTER Nonstarter (NS)
G_UNICODE_BREAK_OPEN_PUNCTUATION Opening Punctuation (OP)
G_UNICODE_BREAK_CLOSE_PUNCTUATION Closing Punctuation (CL)
G_UNICODE_BREAK_QUOTATION Ambiguous Quotation (QU)
G_UNICODE_BREAK_EXCLAMATION Exclamation/Interrogation (EX)
G_UNICODE_BREAK_IDEOGRAPHIC Ideographic (ID)
G_UNICODE_BREAK_NUMERIC Numeric (NU)
G_UNICODE_BREAK_INFIX_SEPARATOR Infix Separator (Numeric) (IS)
G_UNICODE_BREAK_SYMBOL Symbols Allowing Break After (SY)
G_UNICODE_BREAK_ALPHABETIC Ordinary Alphabetic and Symbol Characters (AL)
G_UNICODE_BREAK_PREFIX Prefix (Numeric) (PR)
G_UNICODE_BREAK_POSTFIX Postfix (Numeric) (PO)
G_UNICODE_BREAK_COMPLEX_CONTEXT Complex Context Dependent (South East Asian) (SA)
G_UNICODE_BREAK_AMBIGUOUS Ambiguous (Alphabetic or Ideographic) (AI)
G_UNICODE_BREAK_UNKNOWN Unknown (XX)
G_UNICODE_BREAK_NEXT_LINE Next Line (NL)
G_UNICODE_BREAK_WORD_JOINER Word Joiner (WJ)
G_UNICODE_BREAK_HANGUL_L_JAMO Hangul L Jamo (JL)
G_UNICODE_BREAK_HANGUL_V_JAMO Hangul V Jamo (JV)
G_UNICODE_BREAK_HANGUL_T_JAMO Hangul T Jamo (JT)
G_UNICODE_BREAK_HANGUL_LV_SYLLABLE Hangul LV Syllable (H2)
G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE Hangul LVT Syllable (H3)

g_unichar_break_type ()

GUnicodeBreakType   g_unichar_break_type                (gunichar c);

Determines the break type of c. c should be a Unicode character (to derive a character from UTF-8 encoded text, use g_utf8_get_char()). The break type is used to find word and line breaks ("text boundaries"). Pango implements the Unicode boundary resolution algorithms, and normally you would use a function such as pango_break() instead of caring about break types yourself.

c : a Unicode character
Returns : the break type of c

g_unichar_combining_class ()

gint                g_unichar_combining_class           (gunichar uc);

Determines the canonical combining class of a Unicode character.

uc : a Unicode character
Returns : the combining class of the character

Since 2.14


g_unicode_canonical_ordering ()

void                g_unicode_canonical_ordering        (gunichar *string,
                                                         gsize len);

Computes the canonical ordering of a string in-place. This rearranges decomposed characters in the string according to their combining classes. See the Unicode manual for more information.

string : a UCS-4 encoded string.
len : the maximum length of string to use.

g_unicode_canonical_decomposition ()

gunichar*           g_unicode_canonical_decomposition   (gunichar ch,
                                                         gsize *result_len);

Computes the canonical decomposition of a Unicode character.

ch : a Unicode character.
result_len : location to store the length of the return value.
Returns : a newly allocated string of Unicode characters. result_len is set to the resulting length of the string.

g_unicode_canonical_decomposition_to_buffer ()

gboolean            g_unicode_canonical_decomposition_to_buffer
                                                        (gunichar ch,
                                                         gunichar *out,
                                                         gsize out_len,
                                                         gsize *result_len);

Computes the canonical decomposition of a Unicode character, storing the result in out if it is large enough. If out is too small, FALSE is returned and the function should be called again with a buffer of size result_len.

ch : a Unicode character.
out : a buffer to store the decomposed string of Unicode characters
out_len : the size of the buffer
result_len : location to store the length of the return value.
Returns : TRUE if out was large enough

Since maemo


g_unichar_get_mirror_char ()

gboolean            g_unichar_get_mirror_char           (gunichar ch,
                                                         gunichar *mirrored_ch);

In Unicode, some characters are mirrored. This means that their images are mirrored horizontally in text that is laid out from right to left. For instance, "(" would become its mirror image, ")", in right-to-left text.

If ch has the Unicode mirrored property and there is another Unicode character that typically has a glyph that is the mirror image of ch's glyph, and mirrored_ch is set, that character is stored at the address pointed to by mirrored_ch. Otherwise the original character is stored instead.

ch : a Unicode character
mirrored_ch : location to store the mirrored character
Returns : TRUE if ch has a mirrored character, FALSE otherwise

Since 2.4


enum GUnicodeScript

typedef enum 
{                         /* ISO 15924 code */
  G_UNICODE_SCRIPT_INVALID_CODE = -1,
  G_UNICODE_SCRIPT_COMMON       = 0,   /* Zyyy */
  G_UNICODE_SCRIPT_INHERITED,          /* Qaai */
  G_UNICODE_SCRIPT_ARABIC,             /* Arab */
  G_UNICODE_SCRIPT_ARMENIAN,           /* Armn */
  G_UNICODE_SCRIPT_BENGALI,            /* Beng */
  G_UNICODE_SCRIPT_BOPOMOFO,           /* Bopo */
  G_UNICODE_SCRIPT_CHEROKEE,           /* Cher */
  G_UNICODE_SCRIPT_COPTIC,             /* Qaac */
  G_UNICODE_SCRIPT_CYRILLIC,           /* Cyrl (Cyrs) */
  G_UNICODE_SCRIPT_DESERET,            /* Dsrt */
  G_UNICODE_SCRIPT_DEVANAGARI,         /* Deva */
  G_UNICODE_SCRIPT_ETHIOPIC,           /* Ethi */
  G_UNICODE_SCRIPT_GEORGIAN,           /* Geor (Geon, Geoa) */
  G_UNICODE_SCRIPT_GOTHIC,             /* Goth */
  G_UNICODE_SCRIPT_GREEK,              /* Grek */
  G_UNICODE_SCRIPT_GUJARATI,           /* Gujr */
  G_UNICODE_SCRIPT_GURMUKHI,           /* Guru */
  G_UNICODE_SCRIPT_HAN,                /* Hani */
  G_UNICODE_SCRIPT_HANGUL,             /* Hang */
  G_UNICODE_SCRIPT_HEBREW,             /* Hebr */
  G_UNICODE_SCRIPT_HIRAGANA,           /* Hira */
  G_UNICODE_SCRIPT_KANNADA,            /* Knda */
  G_UNICODE_SCRIPT_KATAKANA,           /* Kana */
  G_UNICODE_SCRIPT_KHMER,              /* Khmr */
  G_UNICODE_SCRIPT_LAO,                /* Laoo */
  G_UNICODE_SCRIPT_LATIN,              /* Latn (Latf, Latg) */
  G_UNICODE_SCRIPT_MALAYALAM,          /* Mlym */
  G_UNICODE_SCRIPT_MONGOLIAN,          /* Mong */
  G_UNICODE_SCRIPT_MYANMAR,            /* Mymr */
  G_UNICODE_SCRIPT_OGHAM,              /* Ogam */
  G_UNICODE_SCRIPT_OLD_ITALIC,         /* Ital */
  G_UNICODE_SCRIPT_ORIYA,              /* Orya */
  G_UNICODE_SCRIPT_RUNIC,              /* Runr */
  G_UNICODE_SCRIPT_SINHALA,            /* Sinh */
  G_UNICODE_SCRIPT_SYRIAC,             /* Syrc (Syrj, Syrn, Syre) */
  G_UNICODE_SCRIPT_TAMIL,              /* Taml */
  G_UNICODE_SCRIPT_TELUGU,             /* Telu */
  G_UNICODE_SCRIPT_THAANA,             /* Thaa */
  G_UNICODE_SCRIPT_THAI,               /* Thai */
  G_UNICODE_SCRIPT_TIBETAN,            /* Tibt */
  G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL, /* Cans */
  G_UNICODE_SCRIPT_YI,                 /* Yiii */
  G_UNICODE_SCRIPT_TAGALOG,            /* Tglg */
  G_UNICODE_SCRIPT_HANUNOO,            /* Hano */
  G_UNICODE_SCRIPT_BUHID,              /* Buhd */
  G_UNICODE_SCRIPT_TAGBANWA,           /* Tagb */

  /* Unicode-4.0 additions */
  G_UNICODE_SCRIPT_BRAILLE,            /* Brai */
  G_UNICODE_SCRIPT_CYPRIOT,            /* Cprt */
  G_UNICODE_SCRIPT_LIMBU,              /* Limb */
  G_UNICODE_SCRIPT_OSMANYA,            /* Osma */
  G_UNICODE_SCRIPT_SHAVIAN,            /* Shaw */
  G_UNICODE_SCRIPT_LINEAR_B,           /* Linb */
  G_UNICODE_SCRIPT_TAI_LE,             /* Tale */
  G_UNICODE_SCRIPT_UGARITIC,           /* Ugar */
      
  /* Unicode-4.1 additions */
  G_UNICODE_SCRIPT_NEW_TAI_LUE,        /* Talu */
  G_UNICODE_SCRIPT_BUGINESE,           /* Bugi */
  G_UNICODE_SCRIPT_GLAGOLITIC,         /* Glag */
  G_UNICODE_SCRIPT_TIFINAGH,           /* Tfng */
  G_UNICODE_SCRIPT_SYLOTI_NAGRI,       /* Sylo */
  G_UNICODE_SCRIPT_OLD_PERSIAN,        /* Xpeo */
  G_UNICODE_SCRIPT_KHAROSHTHI,         /* Khar */

  /* Unicode-5.0 additions */
  G_UNICODE_SCRIPT_UNKNOWN,            /* Zzzz */
  G_UNICODE_SCRIPT_BALINESE,           /* Bali */
  G_UNICODE_SCRIPT_CUNEIFORM,          /* Xsux */
  G_UNICODE_SCRIPT_PHOENICIAN,         /* Phnx */
  G_UNICODE_SCRIPT_PHAGS_PA,           /* Phag */
  G_UNICODE_SCRIPT_NKO,                /* Nkoo */

  /* Unicode-5.1 additions */
  G_UNICODE_SCRIPT_KAYAH_LI,           /* Kali */
  G_UNICODE_SCRIPT_LEPCHA,             /* Lepc */
  G_UNICODE_SCRIPT_REJANG,             /* Rjng */
  G_UNICODE_SCRIPT_SUNDANESE,          /* Sund */
  G_UNICODE_SCRIPT_SAURASHTRA,         /* Saur */
  G_UNICODE_SCRIPT_CHAM,               /* Cham */
  G_UNICODE_SCRIPT_OL_CHIKI,           /* Olck */
  G_UNICODE_SCRIPT_VAI,                /* Vaii */
  G_UNICODE_SCRIPT_CARIAN,             /* Cari */
  G_UNICODE_SCRIPT_LYCIAN,             /* Lyci */
  G_UNICODE_SCRIPT_LYDIAN              /* Lydi */
} GUnicodeScript;

The GUnicodeScript enumeration identifies different writing systems. The values correspond to the names as defined in the Unicode standard. The enumeration has been added in GLib 2.14, and is interchangeable with PangoScript. Note that new types may be added in the future. Applications should be ready to handle unknown values. See Unicode Standard Annex #24: Script names.

G_UNICODE_SCRIPT_INVALID_CODE a value never returned from g_unichar_get_script()
G_UNICODE_SCRIPT_COMMON a character used by multiple different scripts
G_UNICODE_SCRIPT_INHERITED a mark glyph that takes its script from the base glyph to which it is attached
G_UNICODE_SCRIPT_ARABIC Arabic
G_UNICODE_SCRIPT_ARMENIAN Armenian
G_UNICODE_SCRIPT_BENGALI Bengali
G_UNICODE_SCRIPT_BOPOMOFO Bopomofo
G_UNICODE_SCRIPT_CHEROKEE Cherokee
G_UNICODE_SCRIPT_COPTIC Coptic
G_UNICODE_SCRIPT_CYRILLIC Cyrillic
G_UNICODE_SCRIPT_DESERET Deseret
G_UNICODE_SCRIPT_DEVANAGARI Devanagari
G_UNICODE_SCRIPT_ETHIOPIC Ethiopic
G_UNICODE_SCRIPT_GEORGIAN Georgian
G_UNICODE_SCRIPT_GOTHIC Gothic
G_UNICODE_SCRIPT_GREEK Greek
G_UNICODE_SCRIPT_GUJARATI Gujarati
G_UNICODE_SCRIPT_GURMUKHI Gurmukhi
G_UNICODE_SCRIPT_HAN Han
G_UNICODE_SCRIPT_HANGUL Hangul
G_UNICODE_SCRIPT_HEBREW Hebrew
G_UNICODE_SCRIPT_HIRAGANA Hiragana
G_UNICODE_SCRIPT_KANNADA Kannada
G_UNICODE_SCRIPT_KATAKANA Katakana
G_UNICODE_SCRIPT_KHMER Khmer
G_UNICODE_SCRIPT_LAO Lao
G_UNICODE_SCRIPT_LATIN Latin
G_UNICODE_SCRIPT_MALAYALAM Malayalam
G_UNICODE_SCRIPT_MONGOLIAN Mongolian
G_UNICODE_SCRIPT_MYANMAR Myanmar
G_UNICODE_SCRIPT_OGHAM Ogham
G_UNICODE_SCRIPT_OLD_ITALIC Old Italic
G_UNICODE_SCRIPT_ORIYA Oriya
G_UNICODE_SCRIPT_RUNIC Runic
G_UNICODE_SCRIPT_SINHALA Sinhala
G_UNICODE_SCRIPT_SYRIAC Syriac
G_UNICODE_SCRIPT_TAMIL Tamil
G_UNICODE_SCRIPT_TELUGU Telugu
G_UNICODE_SCRIPT_THAANA Thaana
G_UNICODE_SCRIPT_THAI Thai
G_UNICODE_SCRIPT_TIBETAN Tibetan
G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL Canadian Aboriginal
G_UNICODE_SCRIPT_YI Yi
G_UNICODE_SCRIPT_TAGALOG Tagalog
G_UNICODE_SCRIPT_HANUNOO Hanunoo
G_UNICODE_SCRIPT_BUHID Buhid
G_UNICODE_SCRIPT_TAGBANWA Tagbanwa
G_UNICODE_SCRIPT_BRAILLE Braille
G_UNICODE_SCRIPT_CYPRIOT Cypriot
G_UNICODE_SCRIPT_LIMBU Limbu
G_UNICODE_SCRIPT_OSMANYA Osmanya
G_UNICODE_SCRIPT_SHAVIAN Shavian
G_UNICODE_SCRIPT_LINEAR_B Linear B
G_UNICODE_SCRIPT_TAI_LE Tai Le
G_UNICODE_SCRIPT_UGARITIC Ugaritic
G_UNICODE_SCRIPT_NEW_TAI_LUE New Tai Lue
G_UNICODE_SCRIPT_BUGINESE Buginese
G_UNICODE_SCRIPT_GLAGOLITIC Glagolitic
G_UNICODE_SCRIPT_TIFINAGH Tifinagh
G_UNICODE_SCRIPT_SYLOTI_NAGRI Syloti Nagri
G_UNICODE_SCRIPT_OLD_PERSIAN Old Persian
G_UNICODE_SCRIPT_KHAROSHTHI Kharoshthi
G_UNICODE_SCRIPT_UNKNOWN an unassigned code point
G_UNICODE_SCRIPT_BALINESE Balinese
G_UNICODE_SCRIPT_CUNEIFORM Cuneiform
G_UNICODE_SCRIPT_PHOENICIAN Phoenician
G_UNICODE_SCRIPT_PHAGS_PA Phags-pa
G_UNICODE_SCRIPT_NKO N'Ko
G_UNICODE_SCRIPT_KAYAH_LI Kayah Li. Since 2.16.3
G_UNICODE_SCRIPT_LEPCHA Lepcha. Since 2.16.3
G_UNICODE_SCRIPT_REJANG Rejang. Since 2.16.3
G_UNICODE_SCRIPT_SUNDANESE Sundanese. Since 2.16.3
G_UNICODE_SCRIPT_SAURASHTRA Saurashtra. Since 2.16.3
G_UNICODE_SCRIPT_CHAM Cham. Since 2.16.3
G_UNICODE_SCRIPT_OL_CHIKI Ol Chiki. Since 2.16.3
G_UNICODE_SCRIPT_VAI Vai. Since 2.16.3
G_UNICODE_SCRIPT_CARIAN Carian. Since 2.16.3
G_UNICODE_SCRIPT_LYCIAN Lycian. Since 2.16.3
G_UNICODE_SCRIPT_LYDIAN Lydian. Since 2.16.3

g_unichar_get_script ()

GUnicodeScript      g_unichar_get_script                (gunichar ch);

Looks up the GUnicodeScript for a particular character (as defined by Unicode Standard Annex #24). No check is made for ch being a valid Unicode character; if you pass in an invalid character, the result is undefined.

This function is equivalent to pango_script_for_unichar() and the two are interchangeable.

ch : a Unicode character
Returns : the GUnicodeScript for the character.

Since 2.14


g_utf8_next_char()

#define             g_utf8_next_char(p)

Skips to the next character in a UTF-8 string. The string must be valid; this macro is as fast as possible, and has no error-checking. You would use this macro to iterate over a string character by character. The macro returns the start of the next UTF-8 character. Before using this macro, use g_utf8_validate() to validate strings that may contain invalid UTF-8.

p : Pointer to the start of a valid UTF-8 character.

g_utf8_get_char ()

gunichar            g_utf8_get_char                     (const gchar *p);

Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If p does not point to a valid UTF-8 encoded character, results are undefined. If you are not sure that the bytes are complete valid Unicode characters, you should use g_utf8_get_char_validated() instead.

p : a pointer to Unicode character encoded as UTF-8
Returns : the resulting character

g_utf8_get_char_validated ()

gunichar            g_utf8_get_char_validated           (const gchar *p,
                                                         gssize max_len);

Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This function checks for incomplete characters, for invalid characters such as characters that are out of the range of Unicode, and for overlong encodings of valid characters.

p : a pointer to Unicode character encoded as UTF-8
max_len : the maximum number of bytes to read, or -1, for no maximum or if p is nul-terminated
Returns : the resulting character. If p points to a partial sequence at the end of a string that could begin a valid character (or if max_len is zero), returns (gunichar)-2; otherwise, if p does not point to a valid UTF-8 encoded Unicode character, returns (gunichar)-1.

g_utf8_offset_to_pointer ()

gchar*              g_utf8_offset_to_pointer            (const gchar *str,
                                                         glong offset);

Converts from an integer character offset to a pointer to a position within the string.

Since 2.10, this function allows passing a negative offset to step backwards. It is usually worth stepping backwards from the end instead of forwards if offset is in the last fourth of the string, since moving forward is about 3 times faster than moving backward.

str : a UTF-8 encoded string
offset : a character offset within str
Returns : the resulting pointer

g_utf8_pointer_to_offset ()

glong               g_utf8_pointer_to_offset            (const gchar *str,
                                                         const gchar *pos);

Converts from a pointer to a position within a string to an integer character offset.

Since 2.10, this function allows pos to be before str, and returns a negative offset in this case.

str : a UTF-8 encoded string
pos : a pointer to a position within str
Returns : the resulting character offset

g_utf8_prev_char ()

gchar*              g_utf8_prev_char                    (const gchar *p);

Finds the previous UTF-8 character in the string before p.

p does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte. If p might be the first character of the string, you must use g_utf8_find_prev_char() instead.

p : a pointer to a position within a UTF-8 encoded string
Returns : a pointer to the found character.

g_utf8_find_next_char ()

gchar*              g_utf8_find_next_char               (const gchar *p,
                                                         const gchar *end);

Finds the start of the next UTF-8 character in the string after p.

p does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte.

p : a pointer to a position within a UTF-8 encoded string
end : a pointer to the byte following the end of the string, or NULL to indicate that the string is nul-terminated.
Returns : a pointer to the found character or NULL

g_utf8_find_prev_char ()

gchar*              g_utf8_find_prev_char               (const gchar *str,
                                                         const gchar *p);

Given a position p within a UTF-8 encoded string str, find the start of the previous UTF-8 character starting before p. Returns NULL if no UTF-8 characters are present in str before p.

p does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte.

str : pointer to the beginning of a UTF-8 encoded string
p : pointer to some position within str
Returns : a pointer to the found character or NULL.

g_utf8_strlen ()

glong               g_utf8_strlen                       (const gchar *p,
                                                         gssize max);

Returns the length of the string in characters.

p : pointer to the start of a UTF-8 encoded string.
max : the maximum number of bytes to examine. If max is less than 0, then the string is assumed to be nul-terminated. If max is 0, p will not be examined and may be NULL.
Returns : the length of the string in characters

g_utf8_strncpy ()

gchar*              g_utf8_strncpy                      (gchar *dest,
                                                         const gchar *src,
                                                         gsize n);

Like the standard C strncpy() function, but copies a given number of characters instead of a given number of bytes. The src string must be valid UTF-8 encoded text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility functions with it.)

dest : buffer to fill with characters from src
src : UTF-8 encoded string
n : character count
Returns : dest

g_utf8_strchr ()

gchar*              g_utf8_strchr                       (const gchar *p,
                                                         gssize len,
                                                         gunichar c);

Finds the leftmost occurrence of the given Unicode character in a UTF-8 encoded string, while limiting the search to len bytes. If len is -1, allow unbounded search.

p : a nul-terminated UTF-8 encoded string
len : the maximum length of p
c : a Unicode character
Returns : NULL if the string does not contain the character, otherwise, a pointer to the start of the leftmost occurrence of the character in the string.

g_utf8_strrchr ()

gchar*              g_utf8_strrchr                      (const gchar *p,
                                                         gssize len,
                                                         gunichar c);

Find the rightmost occurrence of the given Unicode character in a UTF-8 encoded string, while limiting the search to len bytes. If len is -1, allow unbounded search.

p : a nul-terminated UTF-8 encoded string
len : the maximum length of p
c : a Unicode character
Returns : NULL if the string does not contain the character, otherwise, a pointer to the start of the rightmost occurrence of the character in the string.

g_utf8_strreverse ()

gchar*              g_utf8_strreverse                   (const gchar *str,
                                                         gssize len);

Reverses a UTF-8 string. str must be valid UTF-8 encoded text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility functions with it.)

This function is intended for programmatic uses of reversed strings. It pays no attention to decomposed characters, combining marks, byte order marks, directional indicators (LRM, LRO, etc) and similar characters which might need special handling when reversing a string for display purposes.

Note that unlike g_strreverse(), this function returns newly-allocated memory, which should be freed with g_free() when no longer needed.

str : a UTF-8 encoded string
len : the maximum length of str to use, in bytes. If len < 0, then the string is nul-terminated.
Returns : a newly-allocated string which is the reverse of str.

Since 2.2


g_utf8_validate ()

gboolean            g_utf8_validate                     (const gchar *str,
                                                         gssize max_len,
                                                         const gchar **end);

Validates UTF-8 encoded text. str is the text to validate; if str is nul-terminated, then max_len can be -1, otherwise max_len should be the number of bytes to validate. If end is non-NULL, then the end of the valid range will be stored there (i.e. the start of the first invalid character if some bytes were invalid, or the end of the text being validated otherwise).

Note that g_utf8_validate() returns FALSE if max_len is positive and NUL is met before max_len bytes have been read.

Returns TRUE if all of str was valid. Many GLib and GTK+ routines require valid UTF-8 as input; so data read from a file or the network should be checked with g_utf8_validate() before doing anything else with it.

str : a pointer to character data
max_len : max bytes to validate, or -1 to go until NUL
end : return location for end of valid data
Returns : TRUE if the text was valid UTF-8

g_utf8_strup ()

gchar*              g_utf8_strup                        (const gchar *str,
                                                         gssize len);

Converts all Unicode characters in the string that have a case to uppercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string increasing. (For instance, the German ess-zet will be changed to SS.)

str : a UTF-8 encoded string
len : length of str, in bytes, or -1 if str is nul-terminated.
Returns : a newly allocated string, with all characters converted to uppercase.

g_utf8_strdown ()

gchar*              g_utf8_strdown                      (const gchar *str,
                                                         gssize len);

Converts all Unicode characters in the string that have a case to lowercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string changing.

str : a UTF-8 encoded string
len : length of str, in bytes, or -1 if str is nul-terminated.
Returns : a newly allocated string, with all characters converted to lowercase.

g_utf8_casefold ()

gchar*              g_utf8_casefold                     (const gchar *str,
                                                         gssize len);

Converts a string into a form that is independent of case. The result will not correspond to any particular case, but can be compared for equality or ordered with the results of calling g_utf8_casefold() on other strings.

Note that calling g_utf8_casefold() followed by g_utf8_collate() is only an approximation to the correct linguistic case insensitive ordering, though it is a fairly good one. Getting this exactly right would require a more sophisticated collation function that takes case sensitivity into account. GLib does not currently provide such a function.

str : a UTF-8 encoded string
len : length of str, in bytes, or -1 if str is nul-terminated.
Returns : a newly allocated string, that is a case independent form of str.

g_utf8_normalize ()

gchar*              g_utf8_normalize                    (const gchar *str,
                                                         gssize len,
                                                         GNormalizeMode mode);

Converts a string into canonical form, standardizing such issues as whether a character with an accent is represented as a base character and combining accent or as a single precomposed character. The string has to be valid UTF-8, otherwise NULL is returned. You should generally call g_utf8_normalize() before comparing two Unicode strings.

The normalization mode G_NORMALIZE_DEFAULT only standardizes differences that do not affect the text content, such as the above-mentioned accent representation. G_NORMALIZE_ALL also standardizes the "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the standard forms (in this case DIGIT THREE). Formatting information may be lost but for most text operations such characters should be considered the same.

G_NORMALIZE_DEFAULT_COMPOSE and G_NORMALIZE_ALL_COMPOSE are like G_NORMALIZE_DEFAULT and G_NORMALIZE_ALL, but return a result with composed forms rather than a maximally decomposed form. This is often useful if you intend to convert the string to a legacy encoding or pass it to a system with less capable Unicode handling.

str : a UTF-8 encoded string.
len : length of str, in bytes, or -1 if str is nul-terminated.
mode : the type of normalization to perform.
Returns : a newly allocated string, that is the normalized form of str, or NULL if str is not valid UTF-8.

enum GNormalizeMode

typedef enum {
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;

Defines how a Unicode string is transformed into a canonical form, standardizing such issues as whether a character with an accent is represented as a base character and combining accent or as a single precomposed character. Unicode strings should generally be normalized before comparing them.

G_NORMALIZE_DEFAULT standardize differences that do not affect the text content, such as the above-mentioned accent representation.
G_NORMALIZE_NFD another name for G_NORMALIZE_DEFAULT.
G_NORMALIZE_DEFAULT_COMPOSE like G_NORMALIZE_DEFAULT, but with composed forms rather than a maximally decomposed form.
G_NORMALIZE_NFC another name for G_NORMALIZE_DEFAULT_COMPOSE.
G_NORMALIZE_ALL beyond G_NORMALIZE_DEFAULT also standardize the "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the standard forms (in this case DIGIT THREE). Formatting information may be lost but for most text operations such characters should be considered the same.
G_NORMALIZE_NFKD another name for G_NORMALIZE_ALL.
G_NORMALIZE_ALL_COMPOSE like G_NORMALIZE_ALL, but with composed forms rather than a maximally decomposed form.
G_NORMALIZE_NFKC another name for G_NORMALIZE_ALL_COMPOSE.

g_utf8_collate ()

gint                g_utf8_collate                      (const gchar *str1,
                                                         const gchar *str2);

Compares two strings for ordering using the linguistically correct rules for the current locale. When sorting a large number of strings, it will be significantly faster to obtain collation keys with g_utf8_collate_key() and compare the keys with strcmp() when sorting instead of sorting the original strings.

str1 : a UTF-8 encoded string
str2 : a UTF-8 encoded string
Returns : < 0 if str1 compares before str2, 0 if they compare equal, > 0 if str1 compares after str2.

g_utf8_collate_key ()

gchar*              g_utf8_collate_key                  (const gchar *str,
                                                         gssize len);

Converts a string into a collation key that can be compared with other collation keys produced by the same function using strcmp().

The results of comparing the collation keys of two strings with strcmp() will always be the same as comparing the two original strings with g_utf8_collate().

Note that this function depends on the current locale.

str : a UTF-8 encoded string.
len : length of str, in bytes, or -1 if str is nul-terminated.
Returns : a newly allocated string. This string should be freed with g_free() when you are done with it.

g_utf8_collate_key_for_filename ()

gchar*              g_utf8_collate_key_for_filename     (const gchar *str,
                                                         gssize len);

Converts a string into a collation key that can be compared with other collation keys produced by the same function using strcmp().

In order to sort filenames correctly, this function treats the dot '.' as a special case. Most dictionary orderings seem to consider it insignificant, thus producing the ordering "event.c" "eventgenerator.c" "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we would like to treat numbers intelligently so that "file1" "file10" "file5" is sorted as "file1" "file5" "file10".

Note that this function depends on the current locale.

str : a UTF-8 encoded string.
len : length of str, in bytes, or -1 if str is nul-terminated.
Returns : a newly allocated string. This string should be freed with g_free() when you are done with it.

Since 2.8


g_utf8_to_utf16 ()

gunichar2*          g_utf8_to_utf16                     (const gchar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);

Convert a string from UTF-8 to UTF-16. A 0 character will be added to the result after the converted text.

str : a UTF-8 encoded string
len : the maximum length (number of bytes) of str to use. If len < 0, then the string is nul-terminated.
items_read : location to store number of bytes read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here.
items_written : location to store number of gunichar2 written, or NULL. The value stored here does not include the trailing 0.
error : location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.
Returns : a pointer to a newly allocated UTF-16 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_utf8_to_ucs4 ()

gunichar*           g_utf8_to_ucs4                      (const gchar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);

Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4. A trailing 0 will be added to the string after the converted text.

str : a UTF-8 encoded string
len : the maximum length of str to use, in bytes. If len < 0, then the string is nul-terminated.
items_read : location to store number of bytes read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here.
items_written : location to store number of characters written, or NULL. The value stored here does not include the trailing 0 character.
error : location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.
Returns : a pointer to a newly allocated UCS-4 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_utf8_to_ucs4_fast ()

gunichar*           g_utf8_to_ucs4_fast                 (const gchar *str,
                                                         glong len,
                                                         glong *items_written);

Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4, assuming valid UTF-8 input. This function is roughly twice as fast as g_utf8_to_ucs4() but does no error checking on the input.

str : a UTF-8 encoded string
len : the maximum length of str to use, in bytes. If len < 0, then the string is nul-terminated.
items_written : location to store the number of characters in the result, or NULL.
Returns : a pointer to a newly allocated UCS-4 string. This value must be freed with g_free().

g_utf16_to_ucs4 ()

gunichar*           g_utf16_to_ucs4                     (const gunichar2 *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);

Convert a string from UTF-16 to UCS-4. The result will be nul-terminated.

str : a UTF-16 encoded string
len : the maximum length (number of gunichar2) of str to use. If len < 0, then the string is nul-terminated.
items_read : location to store number of words read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here.
items_written : location to store number of characters written, or NULL. The value stored here does not include the trailing 0 character.
error : location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.
Returns : a pointer to a newly allocated UCS-4 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_utf16_to_utf8 ()

gchar*              g_utf16_to_utf8                     (const gunichar2 *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);

Convert a string from UTF-16 to UTF-8. The result will be terminated with a 0 byte.

Note that the input is expected to be already in native endianness; an initial byte-order-mark character is not handled specially. g_convert() can be used to convert a byte buffer of UTF-16 data of ambiguous endianness.

str : a UTF-16 encoded string
len : the maximum length (number of gunichar2) of str to use. If len < 0, then the string is nul-terminated.
items_read : location to store number of words read, or NULL. If NULL, then G_CONVERT_ERROR_PARTIAL_INPUT will be returned in case str contains a trailing partial character. If an error occurs then the index of the invalid input is stored here.
items_written : location to store number of bytes written, or NULL. The value stored here does not include the trailing 0 byte.
error : location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.
Returns : a pointer to a newly allocated UTF-8 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_ucs4_to_utf16 ()

gunichar2*          g_ucs4_to_utf16                     (const gunichar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);

Convert a string from UCS-4 to UTF-16. A 0 character will be added to the result after the converted text.

str : a UCS-4 encoded string
len : the maximum length (number of characters) of str to use. If len < 0, then the string is nul-terminated.
items_read : location to store number of characters read, or NULL. If an error occurs then the index of the invalid input is stored here.
items_written : location to store number of gunichar2 written, or NULL. The value stored here does not include the trailing 0.
error : location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.
Returns : a pointer to a newly allocated UTF-16 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set.

g_ucs4_to_utf8 ()

gchar*              g_ucs4_to_utf8                      (const gunichar *str,
                                                         glong len,
                                                         glong *items_read,
                                                         glong *items_written,
                                                         GError **error);

Convert a string from a 32-bit fixed width representation as UCS-4 to UTF-8. The result will be terminated with a 0 byte.

str : a UCS-4 encoded string
len : the maximum length (number of characters) of str to use. If len < 0, then the string is nul-terminated.
items_read : location to store number of characters read, or NULL.
items_written : location to store number of bytes written, or NULL. The value stored here does not include the trailing 0 byte.
error : location to store the error occurring, or NULL to ignore errors. Any of the errors in GConvertError other than G_CONVERT_ERROR_NO_CONVERSION may occur.
Returns : a pointer to a newly allocated UTF-8 string. This value must be freed with g_free(). If an error occurs, NULL will be returned and error set. In that case, items_read will be set to the position of the first invalid input character.

g_unichar_to_utf8 ()

gint                g_unichar_to_utf8                   (gunichar c,
                                                         gchar *outbuf);

Converts a single character to UTF-8.

c : a Unicode character code
outbuf : output buffer, must have at least 6 bytes of space. If NULL, the length will be computed and returned and nothing will be written to outbuf.
Returns : number of bytes written

See Also

g_locale_to_utf8(), g_locale_from_utf8()

Convenience functions for converting between UTF-8 and the locale encoding.



[3] surrogate pairs