'\" t
.\" Title: unicode_convert
.\" Author: Sam Varshavchik
.\" Generator: DocBook XSL Stylesheets vsnapshot
.\" Date: 11/25/2020
.\" Manual: Courier Unicode Library
.\" Source: Courier Unicode Library
.\" Language: English
.\"
.TH "UNICODE_CONVERT" "3" "11/25/2020" "Courier Unicode Library" "Courier Unicode Library"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.\" http://bugs.debian.org/507673
.\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.ie \n(.g .ds Aq \(aq
.el .ds Aq '
.\" -----------------------------------------------------------------
.\" * set default formatting
.\" -----------------------------------------------------------------
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
.ad l
.\" -----------------------------------------------------------------
.\" * MAIN CONTENT STARTS HERE *
.\" -----------------------------------------------------------------
.SH "NAME"
unicode_u_ucs4_native, unicode_u_ucs2_native, unicode_convert_init, unicode_convert, unicode_convert_deinit, unicode_convert_tocbuf_init, unicode_convert_tou_init, unicode_convert_fromu_init, unicode_convert_uc, unicode_convert_tocbuf_toutf8_init, unicode_convert_tocbuf_fromutf8_init, unicode_convert_toutf8, unicode_convert_fromutf8, unicode_convert_tobuf, unicode_convert_tou_tobuf, unicode_convert_fromu_tobuf \- unicode character set conversion
.SH "SYNOPSIS"
.sp
.ft B
.nf
#include <courier\-unicode\&.h>
extern const char unicode_u_ucs4_native[];
extern const char unicode_u_ucs2_native[];
.fi
.ft
.HP \w'unicode_convert_handle_t\ unicode_convert_init('u
.BI "unicode_convert_handle_t unicode_convert_init(const\ char\ *" "src_chset" ", const\ char\ *" "dst_chset" ", int\ (*" "cb_func" ")(const\ char\ *,\ size_t,\ void\ *), void\ *" "cb_arg" ");"
.HP \w'int\ unicode_convert('u
.BI "int unicode_convert(unicode_convert_handle_t\ " "handle" ", const\ char\ *" "text" ", size_t\ " "cnt" ");"
.HP \w'int\ unicode_convert_deinit('u
.BI "int unicode_convert_deinit(unicode_convert_handle_t\ " "handle" ", int\ *" "errptr" ");"
.HP 8
.BI "unicode_convert_handle_t unicode_convert_tocbuf_init( const\ char\ *" "src_chset" ", const\ char\ *" "dst_chset" ", char\ **" "cbufptr_ret" ", size_t\ *" "cbufsize_ret" ", int\ " "nullterminate" ");"
.HP 8
.BI "unicode_convert_handle_t unicode_convert_tocbuf_toutf8_init( const\ char\ *" "src_chset" ", char\ **" "cbufptr_ret" ", size_t\ *" "cbufsize_ret" ", int\ " "nullterminate" ");"
.HP 8
.BI "unicode_convert_handle_t unicode_convert_tocbuf_fromutf8_init( const\ char\ *" "dst_chset" ", char\ **" "cbufptr_ret" ", size_t\ *" "cbufsize_ret" ", int\ " "nullterminate" ");"
.HP 8
.BI "unicode_convert_handle_t unicode_convert_tou_init( const\ char\ *" "src_chset" ", char32_t\ **" "ucptr_ret" ", size_t\ *" "ucsize_ret" ", int\ " "nullterminate" ");"
.HP 8
.BI "unicode_convert_handle_t unicode_convert_fromu_init( const\ char\ *" "dst_chset" ", char\ **" "cbufptr_ret" ", size_t\ *" "cbufsize_ret" ", int\ " "nullterminate" ");"
.HP \w'int\ unicode_convert_uc('u
.BI "int unicode_convert_uc(unicode_convert_handle_t\ " "handle" ", const\ char32_t\ *" "text" ", size_t\ " "cnt" ");"
.HP \w'char\ *unicode_convert_toutf8('u
.BI "char *unicode_convert_toutf8(const\ char\ *" "text" ", const\ char\ *" "charset" ", int\ *" "error" ");"
.HP \w'char\ *unicode_convert_fromutf8('u
.BI "char *unicode_convert_fromutf8(const\ char\ *" "text" ", const\ char\ *" "charset" ", int\ *" "error" ");"
.HP \w'char\ *unicode_convert_tobuf('u
.BI "char *unicode_convert_tobuf(const\ char\ *" "text" ", const\ char\ *" "charset" ", const\ char\ *" "dstcharset" ", int\ *" "error" ");"
.HP \w'int\ unicode_convert_tou_tobuf('u
.BI "int unicode_convert_tou_tobuf(const\ char\ *" "text" ", size_t\ " "text_l" ", const\ char\ *" "charset" ", char32_t\ **" "uc" ", size_t\ *" "ucsize" ", int\ *" "error" ");"
.HP \w'int\ unicode_convert_fromu_tobuf('u
.BI "int unicode_convert_fromu_tobuf(const\ char32_t\ *" "utext" ", size_t\ " "utext_l" ", const\ char\ *" "charset" ", char\ **" "c" ", size_t\ *" "csize" ", int\ *" "error" ");"
.SH "DESCRIPTION"
.PP
\fIunicode_u_ucs4_native\fR[] contains the string
\(lqUCS\-4BE\(rq
or
\(lqUCS\-4LE\(rq, matching the native
char32_t
endianness\&.
.PP
\fIunicode_u_ucs2_native\fR[] contains the string
\(lqUCS\-2BE\(rq
or
\(lqUCS\-2LE\(rq, matching the native
char32_t
endianness\&.
.PP
\fBunicode_convert_init\fR(),
\fBunicode_convert\fR(), and
\fBunicode_convert_deinit\fR() are an adaptation of the
\m[blue]\fB\fBiconv\fR(3)\fR\m[]\&\s-2\u[1]\d\s+2
API that uses the same calling convention as the other algorithms in this unicode library, with some value\-added features\&. These functions use
\fBiconv\fR(3)
to effect the actual character set conversion\&.
.PP
\fBunicode_convert_init\fR() returns a non\-NULL handle for the requested conversion, or NULL if the requested conversion is not available\&.
\fBunicode_convert_init\fR() takes a pointer to the output function that receives converted character text\&. The output function receives a pointer to the converted character text, and the number of characters in the converted text\&. The output function gets repeatedly called, until it receives the entire converted text\&.
.PP
The character text to convert gets passed, repeatedly, to
\fBunicode_convert\fR()\&. Each call to
\fBunicode_convert\fR() results in the output function getting invoked, zero or more times, with each successive part of the converted text\&. Finally,
\fBunicode_convert_deinit\fR() stops the conversion and deallocates the conversion handle\&.
.PP
It\*(Aqs possible that a call to
\fBunicode_convert_deinit\fR() results in some additional calls to the output function, passing the remaining, final parts, of the converted text, before
\fBunicode_convert_deinit\fR() deallocates the handle, and returns\&.
.PP
The output function should return 0 normally\&. A non\-0 return indicates an error condition\&.
\fBunicode_convert_deinit\fR() returns non\-zero if any previous invocation of the output function returned non\-zero (this includes any invocations of the output function resulting from this call, or prior
\fBunicode_convert\fR() calls), or 0 if all invocations of the output function returned 0\&.
.PP
If the
\fIerrptr\fR
is not
NULL, *\fIerrptr\fR
gets set to non\-zero if there were any conversion errors \-\- if there was any text that could not be converted to the destination character set\&.
.PP
\fBunicode_convert\fR() also returns non\-zero if it calls the output function and it returns non\-zero, however the conversion handle remains allocated, so
\fBunicode_convert_deinit\fR() must still be called, to clean that up\&.
.SS "Collecting converted text into a buffer"
.PP
Call
\fBunicode_convert_tocbuf_init\fR() instead of
\fBunicode_convert_init\fR(), then call
\fBunicode_convert\fR() and
\fBunicode_convert_deinit\fR() normally\&. The parameters to
\fBunicode_convert_tocbuf_init\fR() specify the source and the destination character sets\&.
\fBunicode_convert_tocbuf_toutf8_init\fR() is just an alias that specifies
UTF\-8
as the destination character set\&.
\fBunicode_convert_tocbuf_fromutf8_init\fR() is just an alias that specifies
UTF\-8
as the source character set\&.
.PP
These functions supply an output function that collects the converted text into a malloc()ed buffer\&. If
\fBunicode_convert_deinit\fR() returns 0, *\fIcbufptr_ret\fR
gets initialized to a malloc()ed buffer, and the number of converted characters, the size of the malloc()ed buffer, get placed into *\fIcbufsize_ret\fR\&.
.if n \{\
.sp
.\}
.RS 4
.it 1 an-trap
.nr an-no-space-flag 1
.nr an-break-flag 1
.br
.ps +1
\fBNote\fR
.ps -1
.br
.PP
If the converted string is an empty string, *\fIcbufsize_ret\fR
gets set to 0, but *\fIcbufptr_ret\fR
still gets initialized (to a dummy malloced buffer)\&.
.sp .5v
.RE
.PP
A non\-zero
\fInullterminate\fR
places a trailing \e0 character after the converted string (this is included in *\fIcbufsize_ret\fR)\&.
.SS "Converting between character sets and unicode"
.PP
\fBunicode_convert_tou_init\fR() converts character text into a
char32_t
buffer\&. It works just like
\fBunicode_convert_tocbuf_init\fR(), except that only the source character set gets specified and the output buffer is a
char32_t
buffer\&.
\fInullterminate\fR
terminates the converted unicode characters with a
U+0000\&.
.PP
\fBunicode_convert_fromu_init\fR() converts
char32_ts to the output character set, and also works like
\fBunicode_convert_tocbuf_init\fR()\&. Additionally, in this case,
\fBunicode_convert_uc\fR() works just like
\fBunicode_convert\fR() except that the input sequence is a
char32_t
sequence, and the count parameter is the number of unicode characters\&.
.SS "One\-shot conversions"
.PP
\fBunicode_convert_toutf8\fR() converts the specified text in the specified character set into a UTF\-8 string, returning a malloced buffer\&. If
\fIerror\fR
is not
NULL, even if
\fBunicode_convert_toutf8\fR() returns a non\-NULL
value, *\fIerror\fR
gets set to a non\-zero value if a character conversion error has occurred, and some characters could not be converted\&.
.PP
\fBunicode_convert_fromutf8\fR() does a similar conversion from UTF\-8
\fItext\fR
to the specified character set\&.
.PP
\fBunicode_convert_tobuf\fR() does a similar conversion between two different character sets\&.
.PP
\fBunicode_convert_tou_tobuf\fR() calls
\fBunicode_convert_tou_init\fR(), feeds the character string through
\fBunicode_convert\fR(), then calls
\fBunicode_convert_deinit\fR()\&. If this function returns 0, *\fIuc\fR
and *\fIucsize\fR
are set to a malloced buffer+size holding the unicode char array\&.
.PP
\fBunicode_convert_fromu_tobuf\fR() calls
\fBunicode_convert_fromu_init\fR(), feeds the unicode array through
\fBunicode_convert_uc\fR(), then calls \fBunicode_convert_deinit\fR()\&. If this function returns 0, *\fIc\fR
and *\fIcsize\fR
are set to a malloced buffer+size holding the char array\&.
.SH "SEE ALSO"
.PP
\fBcourier-unicode\fR(7),
\fBunicode_convert_tocase\fR(3),
\fBunicode_default_chset\fR(3)\&.
.SH "AUTHOR"
.PP
\fBSam Varshavchik\fR
.RS 4
Author
.RE
.SH "NOTES"
.IP " 1." 4
\fBiconv\fR(3)
.RS 4
\%http://manpages.courier-mta.org/htmlman3/iconv.3.html
.RE