Reimplement wchar conversion API.
This commit is contained in:
parent
9e6148f6ff
commit
f41964fcab
|
@ -238,6 +238,8 @@ time/strftime.o \
|
|||
time/timegm.o \
|
||||
wchar/mbrlen.o \
|
||||
wchar/mbrtowc.o \
|
||||
wchar/mbsinit.o \
|
||||
wchar/mbsnrtowcs.o \
|
||||
wchar/mbsrtowcs.o \
|
||||
wchar/wcrtomb.o \
|
||||
wchar/wcscat.o \
|
||||
|
@ -251,6 +253,7 @@ wchar/wcslen.o \
|
|||
wchar/wcsncat.o \
|
||||
wchar/wcsncmp.o \
|
||||
wchar/wcsncpy.o \
|
||||
wchar/wcsnrtombs.o \
|
||||
wchar/wcspbrk.o \
|
||||
wchar/wcsrchr.o \
|
||||
wchar/wcsrtombs.o \
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -39,8 +39,7 @@ __BEGIN_DECLS
|
|||
/* TODO: This random interface is stupid. What should a good value be? */
|
||||
#define RAND_MAX 32767
|
||||
|
||||
/* TODO: This is just a value. It's not a compile time constant! */
|
||||
#define MB_CUR_MAX 16
|
||||
#define MB_CUR_MAX 6
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
|
|
@ -105,12 +105,15 @@ typedef __wint_t wint_t;
|
|||
/* Conversion state information. */
|
||||
typedef struct
|
||||
{
|
||||
int __count;
|
||||
union
|
||||
{
|
||||
wint_t __wch;
|
||||
char __wchb[4];
|
||||
} __value; /* Value so far. */
|
||||
#if defined(__is_sortix_libc)
|
||||
unsigned short count;
|
||||
unsigned short length;
|
||||
wint_t wch;
|
||||
#else
|
||||
unsigned short __count;
|
||||
unsigned short __length;
|
||||
wint_t __wch;
|
||||
#endif
|
||||
} mbstate_t;
|
||||
#define __mbstate_t_defined 1
|
||||
#endif
|
||||
|
@ -126,12 +129,11 @@ struct tm;
|
|||
/* TODO: wint_t getwchar(void); */
|
||||
size_t mbrlen(const char* __restrict, size_t, mbstate_t* __restrict);
|
||||
size_t mbrtowc(wchar_t* __restrict, const char* __restrict, size_t, mbstate_t* __restrict);
|
||||
/* TODO: int mbsinit(const mbstate_t*); */
|
||||
int mbsinit(const mbstate_t*);
|
||||
size_t mbsrtowcs(wchar_t* __restrict, const char** __restrict, size_t, mbstate_t* __restrict);
|
||||
/* TODO: wint_t putwc(wchar_t, FILE*); */
|
||||
/* TODO: wint_t putwchar(wchar_t); */
|
||||
/* TODO: wint_t ungetwc(wint_t, FILE*); */
|
||||
|
||||
size_t wcrtomb(char* __restrict, wchar_t, mbstate_t* __restrict);
|
||||
wchar_t* wcscat(wchar_t* __restrict, const wchar_t* __restrict);
|
||||
wchar_t* wcschr(const wchar_t*, wchar_t);
|
||||
|
@ -193,7 +195,7 @@ int wcwidth(wchar_t);
|
|||
|
||||
/* Functions from POSIX 2008. */
|
||||
#if __USE_SORTIX || 200809L <= __USE_POSIX
|
||||
/* TODO: size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict); */
|
||||
size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict);
|
||||
/* TODO: FILE* open_wmemstream(wchar_t**, size_t*); */
|
||||
/* TODO: wchar_t* wcpcpy(wchar_t* __restrict, const wchar_t* __restrict); */
|
||||
/* TODO: wchar_t* wcpncpy(wchar_t* __restrict, const wchar_t* __restrict, size_t); */
|
||||
|
@ -204,7 +206,7 @@ int wcwidth(wchar_t);
|
|||
/* TODO: int wcsncasecmp(const wchar_t*, const wchar_t *, size_t); */
|
||||
/* TODO: int wcsncasecmp_l(const wchar_t*, const wchar_t *, size_t, locale_t); */
|
||||
/* TODO: size_t wcsnlen(const wchar_t*, size_t); */
|
||||
/* TODO: size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict); */
|
||||
size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict);
|
||||
/* TODO: size_t wcsxfrm_l(wchar_t* __restrict, const wchar_t* __restrict, size_t, locale_t); */
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -26,18 +26,18 @@
|
|||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
// TODO: This function is unpure and should be removed.
|
||||
extern "C" int mblen(const char* s, size_t n)
|
||||
{
|
||||
wchar_t wc;
|
||||
static mbstate_t ps;
|
||||
size_t result = mbrtowc(&wc, s, n, &ps);
|
||||
if ( !s )
|
||||
{
|
||||
memset(&ps, 0, sizeof(ps));
|
||||
return 0; // TODO: Give the correct return value depending on ps.
|
||||
}
|
||||
size_t ret = mbrlen(s, n, &ps);
|
||||
if ( ret == (size_t) -2 )
|
||||
if ( result == (size_t) -1 )
|
||||
return memset(&ps, 0, sizeof(ps)), -1;
|
||||
// TODO: Should ps be cleared to zero in this case?
|
||||
if ( result == (size_t) -2 )
|
||||
return -1;
|
||||
if ( ret == (size_t) -1 )
|
||||
return -1;
|
||||
return (int) ret;
|
||||
return (int) result;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -23,16 +23,14 @@
|
|||
*******************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C" size_t mbstowcs(wchar_t* dst, const char* src, size_t n)
|
||||
// TODO: This function is unpure and should be removed.
|
||||
extern "C"
|
||||
size_t mbstowcs(wchar_t* restrict dst, const char* restrict src, size_t n)
|
||||
{
|
||||
// Reset the secret conversion state variable in mbsrtowcs that is used when
|
||||
// ps is NULL by successfully converting the empty string. As always, this
|
||||
// is not multithread secure. For some reason, the standards don't mandate
|
||||
// that the conversion state is reset when mbsrtowcs is called with ps=NULL,
|
||||
// which arguably is a feature - but this function is supposed to do it.
|
||||
const char* empty_string = "";
|
||||
mbsrtowcs(NULL, &empty_string, 0, NULL);
|
||||
return mbsrtowcs(dst, &src, n, NULL);
|
||||
mbstate_t ps;
|
||||
memset(&ps, 0, sizeof(ps));
|
||||
return mbsrtowcs(dst, (const char**) &src, n, &ps);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -24,10 +24,20 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
// TODO: This function is unpure and should be removed.
|
||||
extern "C" int mbtowc(wchar_t* pwd, const char* s, size_t n)
|
||||
extern "C" int mbtowc(wchar_t* pwc, const char* s, size_t n)
|
||||
{
|
||||
return mbrtowc(pwd, s, n, NULL);
|
||||
static mbstate_t ps;
|
||||
size_t result = mbrtowc(pwc, s, n, &ps);
|
||||
if ( !s )
|
||||
memset(&ps, 0, sizeof(ps));
|
||||
if ( result == (size_t) -1 )
|
||||
return memset(&ps, 0, sizeof(ps)), -1;
|
||||
// TODO: Should ps be cleared to zero in this case?
|
||||
if ( result == (size_t) -2 )
|
||||
return -1;
|
||||
return (int) result;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -23,16 +23,13 @@
|
|||
*******************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
// TODO: This function is unpure and should be removed.
|
||||
extern "C" size_t wcstombs(char* dst, const wchar_t* src, size_t n)
|
||||
{
|
||||
// Reset the secret conversion state variable in wcsrtombs that is used when
|
||||
// ps is NULL by successfully converting the empty string. As always, this
|
||||
// is not multithread secure. For some reason, the standards don't mandate
|
||||
// that the conversion state is reset when wcsrtombs is called with ps=NULL,
|
||||
// which arguably is a feature - but this function is supposed to do it.
|
||||
const wchar_t* empty_string = L"";
|
||||
wcsrtombs(NULL, &empty_string, 0, NULL);
|
||||
return wcsrtombs(dst, &src, n, NULL);
|
||||
mbstate_t ps;
|
||||
memset(&ps, 0, sizeof(ps));
|
||||
return wcsrtombs(dst, &src, n, &ps);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2012.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -23,10 +23,19 @@
|
|||
*******************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
// TODO: This function is unpure and should be removed.
|
||||
extern "C" int wctomb(char* s, wchar_t wc)
|
||||
{
|
||||
return wcrtomb(s, wc, NULL);
|
||||
static mbstate_t ps;
|
||||
size_t result = wcrtomb(s, wc, &ps);
|
||||
if ( !s )
|
||||
memset(&ps, 0, sizeof(ps));
|
||||
if ( result == (size_t) -1 )
|
||||
return -1;
|
||||
if ( result == (size_t) -2 )
|
||||
return -1;
|
||||
return (int) result;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -22,64 +22,11 @@
|
|||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
static size_t utf8_header_length(unsigned char uc)
|
||||
{
|
||||
if ( (uc & 0b11000000) == 0b10000000 )
|
||||
return 0;
|
||||
if ( (uc & 0b10000000) == 0b00000000 )
|
||||
return 1;
|
||||
if ( (uc & 0b11100000) == 0b11000000 )
|
||||
return 2;
|
||||
if ( (uc & 0b11110000) == 0b11100000 )
|
||||
return 3;
|
||||
if ( (uc & 0b11111000) == 0b11110000 )
|
||||
return 4;
|
||||
if ( (uc & 0b11111100) == 0b11111000 )
|
||||
return 5;
|
||||
if ( (uc & 0b11111110) == 0b11111100 )
|
||||
return 6;
|
||||
return (size_t) -1;
|
||||
}
|
||||
|
||||
// TODO: Use the shift state.
|
||||
extern "C"
|
||||
size_t mbrlen(const char* restrict s, size_t n, mbstate_t* restrict ps)
|
||||
{
|
||||
size_t expected_length;
|
||||
|
||||
for ( size_t i = 0; i < n; i++ )
|
||||
{
|
||||
unsigned char uc = (unsigned char) s[i];
|
||||
|
||||
if ( i == 0 )
|
||||
{
|
||||
if ( !uc )
|
||||
{
|
||||
memset(ps, 0, sizeof(*ps));
|
||||
return 0;
|
||||
}
|
||||
|
||||
if ( (expected_length = utf8_header_length(uc)) == (size_t) -1 )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
|
||||
// Check if we encounted an unexpected character claiming to be in
|
||||
// the middle of a UTF-8 multibyte sequence (10xxxxxx).
|
||||
if ( expected_length == 0 )
|
||||
// TODO: Should we play catch up with the partial sequence?
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
}
|
||||
|
||||
// All non-header bytes should be of the form 10xxxxxx.
|
||||
if ( 0 < i && expected_length < n && (uc & 0b11000000) != 0b10000000 )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
|
||||
if ( i + 1 == expected_length )
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
return (size_t) -2;
|
||||
static mbstate_t static_ps;
|
||||
return mbrtowc(NULL, s, n, ps ? ps : &static_ps);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2012.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -24,82 +24,123 @@
|
|||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C"
|
||||
size_t mbrtowc(wchar_t* restrict pwc, const char* restrict s, size_t n,
|
||||
mbstate_t* restrict /*ps*/)
|
||||
static
|
||||
size_t utf8_mbrtowc(wchar_t* restrict pwc,
|
||||
const char* restrict s,
|
||||
size_t n,
|
||||
mbstate_t* restrict ps)
|
||||
{
|
||||
if ( !s )
|
||||
size_t i;
|
||||
for ( i = 0; !(i && ps->count == 0); i++ )
|
||||
{
|
||||
// TODO: Restore ps to initial state if currently valid.
|
||||
return 0;
|
||||
}
|
||||
uint8_t* buf = (uint8_t*) s;
|
||||
wchar_t ret = 0;
|
||||
size_t numbytes = 0;
|
||||
size_t sequence_len = 1;
|
||||
while ( numbytes < sequence_len )
|
||||
{
|
||||
if ( numbytes == n )
|
||||
{
|
||||
// TODO: Support restore through the mbstate_t!
|
||||
// Handle the case where we were not able to fully decode a character,
|
||||
// but it is still possible to finish decoding given more bytes.
|
||||
if ( n <= i )
|
||||
return (size_t) -2;
|
||||
|
||||
char c = s[i];
|
||||
unsigned char uc = (unsigned char) c;
|
||||
|
||||
// The initial state is that we expect a leading byte that informs us of
|
||||
// the length of this character sequence. The number of consecutive high
|
||||
// order bits tells us how many bytes make up this character (one
|
||||
// leading byte followed by zero or more continuation bytes).
|
||||
if ( ps->count == 0 )
|
||||
{
|
||||
if ( (uc & 0b10000000) == 0b00000000 ) /* 0xxxxxxx */
|
||||
{
|
||||
ps->length = (ps->count = 0) + 1;
|
||||
ps->wch = (wchar_t) uc & 0b1111111;
|
||||
}
|
||||
else if ( (uc & 0b11100000) == 0b11000000 ) /* 110xxxxx */
|
||||
{
|
||||
ps->length = (ps->count = 1) + 1;
|
||||
ps->wch = (wchar_t) uc & 0b11111;
|
||||
}
|
||||
else if ( (uc & 0b11110000) == 0b11100000 ) /* 1110xxxx */
|
||||
{
|
||||
ps->length = (ps->count = 2) + 1;
|
||||
ps->wch = (wchar_t) uc & 0b1111;
|
||||
}
|
||||
else if ( (uc & 0b11111000) == 0b11110000 ) /* 11110xxx */
|
||||
{
|
||||
ps->length = (ps->count = 3) + 1;
|
||||
ps->wch = (wchar_t) uc & 0b111;
|
||||
}
|
||||
#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
|
||||
else if ( (uc & 0b11111100) == 0b11111000 ) /* 111110xx */
|
||||
{
|
||||
ps->length = (ps->count = 4) + 1) + 1;
|
||||
ps->wch = (wchar_t) uc & 0b11;
|
||||
}
|
||||
else if ( (uc & 0b11111110) == 0b11111100 ) /* 1111110x */
|
||||
{
|
||||
ps->length = (ps->count = 5) + 1) + 1;
|
||||
ps->wch = (wchar_t) uc & 0b1;
|
||||
}
|
||||
#endif
|
||||
else
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
}
|
||||
uint8_t b = buf[numbytes++];
|
||||
|
||||
bool is_continuation = b >> (8-2) == 0b10;
|
||||
if ( 1 == numbytes && is_continuation )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
if ( 2 <= numbytes && !is_continuation )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
|
||||
wchar_t new_bits;
|
||||
size_t new_bits_num;
|
||||
if ( b >> (8-1) == 0b0 )
|
||||
new_bits = b & 0b01111111,
|
||||
new_bits_num = 7,
|
||||
sequence_len = 1;
|
||||
else if ( b >> (8-2) == 0b10 )
|
||||
new_bits = b & 0b00111111,
|
||||
new_bits_num = 6,
|
||||
sequence_len = 2;
|
||||
else if ( b >> (8-3) == 0b110 )
|
||||
new_bits = b & 0b00011111,
|
||||
new_bits_num = 5,
|
||||
sequence_len = 3;
|
||||
else if ( b >> (8-4) == 0b1110 )
|
||||
new_bits = b & 0b00001111,
|
||||
new_bits_num = 4,
|
||||
sequence_len = 4;
|
||||
else if ( b >> (8-5) == 0b11110 )
|
||||
new_bits = b & 0b00000111,
|
||||
new_bits_num = 3,
|
||||
sequence_len = 5;
|
||||
else if ( b >> (8-6) == 0b111110 )
|
||||
new_bits = b & 0b00000011,
|
||||
new_bits_num = 2,
|
||||
sequence_len = 6;
|
||||
else if ( b >> (8-7) == 0b1111110 )
|
||||
new_bits = b & 0b00000001,
|
||||
new_bits_num = 1,
|
||||
sequence_len = 7;
|
||||
// The secondary state is that following a leading byte, we are
|
||||
// expecting a non-zero number of continuation byte bytes.
|
||||
else
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
ret = ret >> new_bits_num | new_bits;
|
||||
{
|
||||
// Verify this is a continuation byte.
|
||||
if ( (uc & 0b11000000) != 0b10000000 )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
ps->wch = ps->wch << 6 | (uc & 0b00111111);
|
||||
ps->count--;
|
||||
}
|
||||
}
|
||||
if ( !ret )
|
||||
{
|
||||
// TODO: Reset ps to initial state.
|
||||
return 0;
|
||||
}
|
||||
if ( (numbytes == 2 && ret <= 0x007F) ||
|
||||
(numbytes == 3 && ret <= 0x07FF) ||
|
||||
(numbytes == 4 && ret <= 0xFFFF) ||
|
||||
(numbytes == 5 && ret <= 0x1FFFFF) ||
|
||||
(numbytes == 6 && ret <= 0x3FFFFFF) )
|
||||
|
||||
// Reject the character if it was produced with an overly long sequence.
|
||||
if ( ps->length == 1 && 1 << 7 <= ps->wch )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
if ( ps->length == 2 && 1 << (5 + 1 * 6) <= ps->wch )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
if ( ps->length == 3 && 1 << (4 + 2 * 6) <= ps->wch )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
if ( ps->length == 4 && 1 << (3 + 3 * 6) <= ps->wch )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
|
||||
if ( ps->length == 5 && 1 << (2 + 4 * 6) <= ps->wch )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
if ( ps->length == 6 && 1 << (1 + 5 * 6) <= ps->wch )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
#endif
|
||||
|
||||
// RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF.
|
||||
if ( 0x10FFFF <= ps->wch )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
|
||||
wchar_t result = ps->wch;
|
||||
|
||||
if ( pwc )
|
||||
*pwc = ret;
|
||||
return numbytes;
|
||||
*pwc = result;
|
||||
|
||||
ps->length = 0;
|
||||
ps->wch = 0;
|
||||
|
||||
return result != L'\0' ? i : 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
size_t mbrtowc(wchar_t* restrict pwc,
|
||||
const char* restrict s,
|
||||
size_t n,
|
||||
mbstate_t* restrict ps)
|
||||
{
|
||||
static mbstate_t static_ps;
|
||||
if ( !ps )
|
||||
ps = &static_ps;
|
||||
if ( !s )
|
||||
s = "", n = 1;
|
||||
|
||||
// TODO: Verify whether the current locale is UTF-8.
|
||||
return utf8_mbrtowc(pwc, s, n, ps);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
The Sortix C Library is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
The Sortix C Library is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
wchar/mbsinit.cpp
|
||||
Determine conversion object status.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C" int mbsinit(const mbstate_t* ps)
|
||||
{
|
||||
return !ps || !ps->count;
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
The Sortix C Library is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
The Sortix C Library is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
wchar/mbsnrtowcs.cpp
|
||||
Convert a multibyte string to a wide-character string.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <assert.h>
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C"
|
||||
size_t mbsnrtowcs(wchar_t* restrict dst,
|
||||
const char** restrict src_ptr,
|
||||
size_t src_len,
|
||||
size_t dst_len,
|
||||
mbstate_t* restrict ps)
|
||||
{
|
||||
static mbstate_t static_ps;
|
||||
if ( !ps )
|
||||
ps = &static_ps;
|
||||
|
||||
assert(src_ptr && *src_ptr);
|
||||
const char* src = *src_ptr;
|
||||
|
||||
// Continue to decode wide characters until we have filled the destination
|
||||
// buffer or if we have exhausted the limit on input multibyte characters.
|
||||
size_t dst_offset = 0;
|
||||
size_t src_offset = 0;
|
||||
while ( (!dst || dst_offset < dst_len) && src_offset < src_len )
|
||||
{
|
||||
mbstate_t ps_copy = *ps;
|
||||
wchar_t wc;
|
||||
size_t amount = mbrtowc(&wc, src + src_offset, src_len - src_offset, ps);
|
||||
|
||||
// Stop in the event a decoding error occured.
|
||||
if ( amount == (size_t) -1 )
|
||||
return *src_ptr = src + src_offset, (size_t) -1;
|
||||
|
||||
// Stop decoding early in the event we encountered a partial character.
|
||||
if ( amount == (size_t) -2 )
|
||||
{
|
||||
*ps = ps_copy;
|
||||
break;
|
||||
}
|
||||
|
||||
// Store the decoded wide character in the destination buffer.
|
||||
if ( dst )
|
||||
dst[dst_offset] = wc;
|
||||
|
||||
// Stop decoding after decoding a null character and return a NULL
|
||||
// source pointer to the caller, not including the null character in the
|
||||
// number of characters stored in the destination buffer.
|
||||
if ( wc == L'\0' )
|
||||
{
|
||||
src = NULL;
|
||||
src_offset = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
dst_offset++;
|
||||
src_offset += amount;
|
||||
}
|
||||
|
||||
return *src_ptr = src + src_offset, dst_offset;
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -24,49 +24,16 @@
|
|||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C" size_t mbsrtowcs(wchar_t* dst, const char** src_ptr, size_t dst_len,
|
||||
mbstate_t* ps)
|
||||
extern "C"
|
||||
size_t mbsrtowcs(wchar_t* restrict dst,
|
||||
const char** restrict src_ptr,
|
||||
size_t dst_len,
|
||||
mbstate_t* restrict ps)
|
||||
{
|
||||
assert(src_ptr && *src_ptr);
|
||||
// Avoid changing *src_ptr if dst is NULL.
|
||||
const char* local_src_ptr = *src_ptr;
|
||||
if ( !dst )
|
||||
src_ptr = &local_src_ptr;
|
||||
// For some reason, the standards don't mandate that the secret ps variable
|
||||
// is reset when ps is NULL, unlike mbstowcs that always resets this
|
||||
// variable. We'll avoid resetting the variable here in case any programs
|
||||
// actually take advantage of this fact.
|
||||
static mbstate_t static_ps;
|
||||
if ( !ps )
|
||||
ps = &static_ps;
|
||||
size_t ret = 0;
|
||||
size_t src_len = strlen(*src_ptr);
|
||||
while ( !dst || dst_len )
|
||||
{
|
||||
mbstate_t saved_ps = *ps;
|
||||
size_t consumed = mbrtowc(dst, *src_ptr, src_len, ps);
|
||||
if ( consumed == (size_t) 0 )
|
||||
{
|
||||
*src_ptr = NULL;
|
||||
break;
|
||||
}
|
||||
if ( consumed == (size_t) -1 )
|
||||
return (size_t) -1;
|
||||
if ( consumed == (size_t) -2 )
|
||||
{
|
||||
*ps = saved_ps;
|
||||
break;
|
||||
}
|
||||
*src_ptr += consumed;
|
||||
src_len -= consumed;
|
||||
if ( dst )
|
||||
dst++,
|
||||
dst_len--;
|
||||
ret++;
|
||||
}
|
||||
return ret;
|
||||
return mbsnrtowcs(dst, src_ptr, SIZE_MAX, dst_len, ps);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2012.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -23,58 +23,87 @@
|
|||
*******************************************************************************/
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C"
|
||||
size_t wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict /*ps*/)
|
||||
static
|
||||
size_t utf8_wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict /*ps*/)
|
||||
{
|
||||
if ( !wc )
|
||||
// The definition of UTF-8 prohibits encoding character numbers between
|
||||
// U+D800 and U+DFFF, which are reserved for use with the UTF-16 encoding
|
||||
// form (as surrogate pairs) and do not directly represent characters.
|
||||
if ( 0xD800 <= wc && wc <= 0xDFFF )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
|
||||
// RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF.
|
||||
if ( 0x10FFFF <= wc )
|
||||
return errno = EILSEQ, (size_t) -1;
|
||||
|
||||
size_t index = 0;
|
||||
|
||||
if ( wc < (1 << (7)) ) /* 0xxxxxxx */
|
||||
{
|
||||
if ( s )
|
||||
*s = '\0';
|
||||
return 1;
|
||||
s[index++] = 0b00000000 | (wc >> 0 & 0b01111111);
|
||||
return index;
|
||||
}
|
||||
|
||||
uint32_t unicode = wc;
|
||||
uint8_t* buf = (uint8_t*) s;
|
||||
unsigned bytes = 1;
|
||||
unsigned bits = 7;
|
||||
if ( (1U<<7U) <= unicode ) { bytes = 2; bits = 11; }
|
||||
if ( (1U<<11U) <= unicode ) { bytes = 3; bits = 16; }
|
||||
if ( (1U<<16U) <= unicode ) { bytes = 4; bits = 21; }
|
||||
if ( (1U<<21U) <= unicode ) { bytes = 5; bits = 26; }
|
||||
if ( (1U<<26U) <= unicode ) { bytes = 6; bits = 31; }
|
||||
if ( (1U<<31U) <= unicode ) { errno = EILSEQ; return (size_t) -1; }
|
||||
|
||||
if ( !s )
|
||||
return bytes;
|
||||
|
||||
uint8_t prefix;
|
||||
unsigned prefixavai;
|
||||
switch ( bytes )
|
||||
if ( wc < (1 << (5 + 1 * 6)) ) /* 110xxxxx 10xxxxxx^1 */
|
||||
{
|
||||
case 1: prefixavai = 7; prefix = 0b0U << prefixavai; break;
|
||||
case 2: prefixavai = 5; prefix = 0b110U << prefixavai; break;
|
||||
case 3: prefixavai = 4; prefix = 0b1110U << prefixavai; break;
|
||||
case 4: prefixavai = 3; prefix = 0b11110U << prefixavai; break;
|
||||
case 5: prefixavai = 2; prefix = 0b111110U << prefixavai; break;
|
||||
case 6: prefixavai = 1; prefix = 0b1111110U << prefixavai; break;
|
||||
default: __builtin_unreachable();
|
||||
s[index++] = 0b11000000 | (wc >> 6 & 0b00011111);
|
||||
s[index++] = 0b10000000 | (wc >> 0 & 0b00111111);
|
||||
return index;
|
||||
}
|
||||
|
||||
// Put the first bits in the unused area of the prefix.
|
||||
prefix |= unicode >> (bits - prefixavai);
|
||||
*buf++ = prefix;
|
||||
unsigned bitsleft = bits - prefixavai;
|
||||
|
||||
while ( bitsleft )
|
||||
if ( wc < (1 << (4 + 2 * 6)) ) /* 1110xxxx 10xxxxxx^2 */
|
||||
{
|
||||
bitsleft -= 6;
|
||||
uint8_t elembits = (unicode>>bitsleft) & ((1U<<6U)-1U);
|
||||
uint8_t elem = (0b10U<<6U) | elembits;
|
||||
*buf++ = elem;
|
||||
s[index++] = 0b11100000 | (wc >> 2*6 & 0b00001111);
|
||||
s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
|
||||
return index;
|
||||
}
|
||||
|
||||
return bytes;
|
||||
if ( wc < (1 << (3 + 3 * 6)) ) /* 11110xxx 10xxxxxx^3 */
|
||||
{
|
||||
s[index++] = 0b11110000 | (wc >> 3*6 & 0b00000111);
|
||||
s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
|
||||
return index;
|
||||
}
|
||||
|
||||
#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
|
||||
if ( wc < (1 << (2 + 4 * 6)) ) /* 111110xx 10xxxxxx^4 */
|
||||
{
|
||||
s[index++] = 0b11111000 | (wc >> 4*6 & 0b00000011);
|
||||
s[index++] = 0b10000000 | (wc >> 3*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
|
||||
return index;
|
||||
}
|
||||
|
||||
if ( wc < (1 << (1 + 5 * 6)) ) /* 111110xx 10xxxxxx^5 */
|
||||
{
|
||||
s[index++] = 0b11111100 | (wc >> 5*6 & 0b00000001);
|
||||
s[index++] = 0b10000000 | (wc >> 4*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 3*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 2*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 1*6 & 0b00111111);
|
||||
s[index++] = 0b10000000 | (wc >> 0*6 & 0b00111111);
|
||||
return index;
|
||||
}
|
||||
#endif
|
||||
|
||||
return errno = EILSEQ; return (size_t) -1;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
size_t wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict ps)
|
||||
{
|
||||
char internal_buffer[MB_CUR_MAX];
|
||||
if ( !s )
|
||||
wc = L'\0', s = internal_buffer;
|
||||
|
||||
// TODO: Verify whether the current locale is UTF-8.
|
||||
return utf8_wcrtomb(s, wc, ps);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
The Sortix C Library is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or (at your
|
||||
option) any later version.
|
||||
|
||||
The Sortix C Library is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||||
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
wchar/wcsnrtombs.cpp
|
||||
Convert a wide-character string to multibyte string.
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C"
|
||||
size_t wcsnrtombs(char* restrict dst,
|
||||
const wchar_t** restrict src_ptr,
|
||||
size_t src_len,
|
||||
size_t dst_len,
|
||||
mbstate_t* restrict ps)
|
||||
{
|
||||
static mbstate_t static_ps;
|
||||
if ( !ps )
|
||||
ps = &static_ps;
|
||||
|
||||
assert(src_ptr && *src_ptr);
|
||||
const wchar_t* src = *src_ptr;
|
||||
|
||||
// Continue to encode multibyte characters until we have filled the
|
||||
// destination buffer or if we have exhausted the limit on input wide chars.
|
||||
size_t dst_offset = 0;
|
||||
size_t src_offset = 0;
|
||||
while ( (!dst || dst_offset < dst_len) && src_offset < src_len )
|
||||
{
|
||||
mbstate_t ps_copy = *ps;
|
||||
wchar_t wc = src[src_offset];
|
||||
char mb[MB_CUR_MAX];
|
||||
size_t amount = wcrtomb(mb, wc, ps);
|
||||
|
||||
// Stop in the event a decoding error occured.
|
||||
if ( amount == (size_t) -1 )
|
||||
return *src_ptr = src + src_offset, (size_t) -1;
|
||||
|
||||
// Stop decoding early in the event we encountered a partial character,
|
||||
// or that we ran out of space in the destination buffer.
|
||||
if ( amount == (size_t) -2 || (dst && dst_offset - dst_len < amount ) )
|
||||
{
|
||||
*ps = ps_copy;
|
||||
break;
|
||||
}
|
||||
|
||||
// Store the decoded multibyte character in the destination buffer.
|
||||
if ( dst )
|
||||
memcpy(dst + dst_offset, mb, amount);
|
||||
|
||||
// Stop decoding after decoding a null character and return a NULL
|
||||
// source pointer to the caller, not including the null character in the
|
||||
// number of characters stored in the destination buffer.
|
||||
if ( wc == L'\0' )
|
||||
{
|
||||
src = NULL;
|
||||
src_offset = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
dst_offset += amount;
|
||||
src_offset++;
|
||||
}
|
||||
|
||||
return *src_ptr = src + src_offset, dst_offset;
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
/*******************************************************************************
|
||||
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013.
|
||||
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
|
||||
|
||||
This file is part of the Sortix C Library.
|
||||
|
||||
|
@ -22,55 +22,14 @@
|
|||
|
||||
*******************************************************************************/
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <wchar.h>
|
||||
|
||||
extern "C" size_t wcsrtombs(char* dst, const wchar_t** src_ptr, size_t dst_len,
|
||||
mbstate_t* ps)
|
||||
extern "C"
|
||||
size_t wcsrtombs(char* restrict dst,
|
||||
const wchar_t** restrict src_ptr,
|
||||
size_t dst_len,
|
||||
mbstate_t* ps)
|
||||
{
|
||||
assert(src_ptr && *src_ptr);
|
||||
// Avoid changing *src_ptr if dst is NULL.
|
||||
const wchar_t* local_src_ptr = *src_ptr;
|
||||
if ( !dst )
|
||||
src_ptr = &local_src_ptr;
|
||||
// For some reason, the standards don't mandate that the secret ps variable
|
||||
// is reset when ps is NULL, unlike mbstowcs that always resets this
|
||||
// variable. We'll avoid resetting the variable here in case any programs
|
||||
// actually take advantage of this fact.
|
||||
static mbstate_t static_ps;
|
||||
if ( !ps )
|
||||
ps = &static_ps;
|
||||
size_t ret = 0;
|
||||
size_t src_len = wcslen(*src_ptr);
|
||||
char buf[MB_CUR_MAX];
|
||||
while ( !dst || dst_len )
|
||||
{
|
||||
mbstate_t saved_ps = *ps;
|
||||
size_t produced = wcrtomb(buf, **src_ptr, ps);
|
||||
if ( produced == (size_t) -1 )
|
||||
return (size_t) -1;
|
||||
if ( dst && dst_len < produced )
|
||||
{
|
||||
*ps = saved_ps;
|
||||
break;
|
||||
}
|
||||
memcpy(dst, buf, produced);
|
||||
if ( **src_ptr == L'\0' )
|
||||
{
|
||||
ret += produced - 1; // Don't count the '\0' byte.
|
||||
*src_ptr = NULL;
|
||||
break;
|
||||
}
|
||||
ret += produced;
|
||||
(*src_ptr)++;
|
||||
src_len--;
|
||||
if ( dst )
|
||||
dst += produced,
|
||||
dst_len -= produced;
|
||||
ret++;
|
||||
}
|
||||
return ret;
|
||||
return wcsnrtombs(dst, src_ptr, SIZE_MAX, dst_len, ps);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue