/*
    $Revision: 10 $
    $Date: 23/02/06 18:05 $
    Copyright © 2004-2006, FSL Technologies Limited.
    Contact "http://fost3.fsltech.com".
*/


#include "stdafx.h"
// NOTE(review): the header name of the following #include is missing from this copy of
// the file (the <...> part appears to have been lost) - restore it before building.
#include


namespace FSLib {


    namespace utf {


        /*
            Wide character stream buffer on top of a Win32 file handle.

            The implementation in this file is a write-only skeleton: open() always
            creates the file for writing, underflow() throws NotImplemented and
            overflow() hands its argument back without writing anything.
        */
        class F3UTIL_DECLSPEC filebuf : public std::basic_streambuf< wchar_t >, public boost::noncopyable {
        public:
            typedef wchar_t char_type;
            typedef std::basic_streambuf< wchar_t >::int_type int_type;
            typedef std::char_traits< wchar_t > traits_type;

            filebuf();
            ~filebuf();

            void open( const wstring &filename, std::ios_base::openmode mode );
            void close();

        protected:
            int_type underflow();
            int_type overflow( int_type ch = std::char_traits< wchar_t >::eof() );
            //int sync();
            //std::streamsize showmanyc();

        private:
            HANDLE m_hfile; // zero when no file is open
            std::ios_base::openmode m_mode; // zero when no file is open
        };


    }


}


using namespace FSLib;
using namespace FSLib::utf;


namespace {
    Revision c_revision( L"$Archive: /FOST.3/F3Util/unicode.cpp $", __DATE__, L"$Revision: 10 $", L"$Date: 23/02/06 18:05 $" );

    const Setting c_writeBuffer( L"$Archive: /FOST.3/F3Util/unicode.cpp $", L"UTF-8", L"Write Buffer", L"65536", true );
    const Setting c_readBuffer( L"$Archive: /FOST.3/F3Util/unicode.cpp $", L"UTF-8", L"Read Buffer", L"65536", true );

    const wchar_t c_unicodeEncodingMsg[]=L"Unicode encoding and decoding";
    const wchar_t c_unexpectedZeroMsg[]=L"Unexpected zero when processing Unicode stream.";
}


/*
    FSLib::Exceptions::UnicodeEncoding
*/


FSLib::Exceptions::UnicodeEncoding::UnicodeEncoding( const wstring &e )
: FSLib::Exceptions::Exception( e ) {
}


const wchar_t * const FSLib::Exceptions::UnicodeEncoding::message() const {
    return c_unicodeEncodingMsg;
}


/*
    FSLib::Exceptions::UnexpectedZero
*/


FSLib::Exceptions::UnexpectedZero::UnexpectedZero()
: FSLib::Exceptions::Exception() {
}


FSLib::Exceptions::UnexpectedZero::UnexpectedZero( const wstring &e )
: FSLib::Exceptions::Exception( e ) {
}


const wchar_t * const FSLib::Exceptions::UnexpectedZero::message() const {
    return c_unexpectedZeroMsg;
}


/*
    Helper functions used to implement other parts of the Unicode handling
*/


namespace {


    /*
        Returns the number of UTF-8 bytes (1 to 4) in the sequence introduced by the
        lead byte ch. Throws UnicodeEncoding for a continuation byte, for the retired
        5 and 6 byte lead bytes and for any other unrecognised value.
    */
    std::size_t chars( const utf8 ch ) {
        // The tests are ordered, so each branch only needs its upper bound
        if ( ch < 0x80 )
            return 1;
        else if ( ch < 0xC0 )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 continuation character (" + toString( ch ) + L") cannot appear without control character" );
        else if ( ch < 0xE0 )
            return 2;
        else if ( ch < 0xF0 )
            return 3;
        else if ( ch < 0xF8 )
            return 4;
        else if ( ch < 0xFC )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 encoding may no longer be 5 bytes long (" + toString( ch ) + L")" );
        else if ( ch < 0xFE )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 encoding may no longer be 6 bytes long (" + toString( ch ) + L")" );
        else
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 control character (" + toString( ch ) + L") is not recognised (could be a UTF-16 BOM)" );
    }


    /*
        As chars(), but returns zero instead of throwing when ch cannot start a
        1 to 4 byte UTF-8 sequence.
    */
    std::size_t chars_noexception( const utf8 ch ) {
        if ( ch < 0x80 )
            return 1;
        else if ( ch >= 0xC0 && ch < 0xE0 )
            return 2;
        else if ( ch >= 0xE0 && ch < 0xF0 )
            return 3;
        else if ( ch >= 0xF0 && ch < 0xF8 )
            return 4;
        else
            return 0;
    }


}


/*
    Misc encoding & decoding functions
*/


/*
    Returns ch unchanged when it is an acceptable UTF-32 value. Throws
    UnicodeEncoding for UTF-16 surrogate values, for 0xFFFE/0xFFFF and for values
    above 0x10FFFF. The offending value is appended to the exception's info stream.
*/
utf32 FSLib::utf::assertValid( const utf32 ch ) {
    try {
        if ( ch >= 0xD800 && ch <= 0xDBFF )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-32 character is in the UTF-16 leading surrogate pair range." );
        if ( ch >= 0xDC00 && ch <= 0xDFFF )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-32 character is in the UTF-16 trailing surrogate pair range." );
        if ( ch == 0xFFFE || ch == 0xFFFF )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-32 character is disallowed (0xFFFE/0xFFFF)" );
        if ( ch > 0x10FFFF )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-32 character is beyond the allowable range." );
        return ch;
    } catch ( FSLib::Exceptions::UnicodeEncoding &e ) {
        e.info() << L"Character value is: " << ch << std::endl;
        throw;
    }
}


/*
    Counts the UTF-8 sequences in the zero terminated string seq. Throws
    UnicodeEncoding for a bad lead byte or a bad continuation byte.
*/
std::size_t FSLib::utf::length( const utf8 *seq ) {
    std::size_t count = 0;
    for ( ; *seq != 0; ++count ) {
        const std::size_t width = ::chars( *seq );
        // A premature terminator is below 0x80, so it is rejected here before we step over it
        for ( std::size_t chk = 1; chk < width; chk++ ) {
            if ( seq[ chk ] < 0x80 || seq[ chk ] > 0xBF )
                throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 continuation character is not correct (" + toString( chk ) + L" of " + toString( width ) + L") is " + toString( seq[ chk ] ) );
        }
        seq += width;
    }
    return count;
}


/*
    Counts the characters in the zero terminated UTF-16 string seq, where a
    surrogate pair counts as one. Throws UnicodeEncoding for an unpaired surrogate.
*/
std::size_t FSLib::utf::length( const utf16 *seq ) {
    std::size_t count = 0;
    for ( ; *seq != 0; ++count ) {
        if ( *seq >= 0xD800 && *seq <= 0xDBFF ) {
            ++seq;
            if ( *seq < 0xDC00 || *seq > 0xDFFF )
                throw FSLib::Exceptions::UnicodeEncoding( L"Trailing character in a UTF-16 surrogate pair is missing" );
            ++seq;
        } else if ( *seq >= 0xDC00 && *seq <= 0xDFFF )
            throw FSLib::Exceptions::UnicodeEncoding( L"Trailing character in a UTF-16 surrogate pair has been found first (" + toString( *seq ) + L")" );
        else
            ++seq;
    }
    return count;
}


/*
    As length( const utf16 * ), but also stops when seq reaches end.
*/
std::size_t FSLib::utf::length( const utf16 *seq, const utf16 *end ) {
    std::size_t count = 0;
    // The end of the range must be tested before *seq is read, otherwise we read past the range
    for ( ; seq != end && *seq != 0; ++count ) {
        if ( *seq >= 0xD800 && *seq <= 0xDBFF ) {
            ++seq;
            if ( seq == end )
                // NOTE(review): count has not yet been incremented for this character, so this
                // returns one fewer than the number of complete characters seen (and wraps when
                // count is zero). Kept as found - confirm what callers expect.
                return count - 1;
            else if ( *seq < 0xDC00 || *seq > 0xDFFF )
                throw FSLib::Exceptions::UnicodeEncoding( L"Trailing character in a UTF-16 surrogate pair is missing" );
            ++seq;
        } else if ( *seq >= 0xDC00 && *seq <= 0xDFFF )
            throw FSLib::Exceptions::UnicodeEncoding( L"Trailing character in a UTF-16 surrogate pair has been found first (" + toString( *seq ) + L")" );
        else
            ++seq;
    }
    return count;
}


/*
    Counts the elements in the zero terminated UTF-32 string seq. No validation is done.
*/
std::size_t FSLib::utf::length( const utf32 *seq ) {
    std::size_t c( 0 );
    while ( *seq != 0 ) {
        ++c;
        ++seq;
    }
    return c;
}


/*
    Returns the number of UTF-8 bytes (1 to 4) needed for ch. Throws UnicodeEncoding
    (through assertValid) when ch is not acceptable.
*/
std::size_t FSLib::utf::utf8length( const utf32 ch ) {
    assertValid( ch );
    if ( ch < 0x00080 )
        return 1;
    else if ( ch < 0x00800 )
        return 2;
    else if ( ch < 0x10000 )
        return 3;
    else
        return 4;
}


/*
    Returns the number of UTF-16 elements needed for ch: 1 below 0x10000, otherwise 2.
    The value of ch is not validated here.
*/
std::size_t FSLib::utf::utf16length( const utf32 ch ) {
    if ( ch < 0x10000 )
        return 1;
    else
        return 2;
}


/*
    Returns the number of UTF-16 elements before the zero terminator of seq. Surrogate
    pairs count as two and no validation is done.
*/
std::size_t FSLib::utf::utf16length( const utf16 *seq ) {
    for ( std::size_t s( 0 ); true; ++s )
        if ( seq[ s ] == 0 )
            return s;
}


/*
    Decodes the single UTF-8 sequence starting at seq. Throws UnicodeEncoding for a bad
    lead byte, a bad continuation byte, a value rejected by assertValid or a sequence
    longer than the shortest encoding of the decoded value.
*/
utf32 FSLib::utf::decode( const utf8 *seq ) {
    utf32 ch;
    const std::size_t width = ::chars( *seq );
    for ( std::size_t chk = 1; chk < width; chk++ ) {
        if ( seq[ chk ] < 0x80 || seq[ chk ] > 0xBF )
            throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 continuation character is not correct (" + toString( chk ) + L" of " + toString( width ) + L") is " + toString( seq[ chk ] ) );
    }
    // ::chars only ever returns 1 to 4, so ch is always assigned below
    switch ( width ) {
    case 1:
        ch = utf32( seq[ 0 ] & 0x7F );
        break;
    case 2:
        ch = utf32( seq[ 0 ] & 0x1F ) << 6;
        ch |= utf32( seq[ 1 ] & 0x3F );
        break;
    case 3:
        ch = utf32( seq[ 0 ] & 0x0F ) << 12;
        ch |= utf32( seq[ 1 ] & 0x3F ) << 6;
        ch |= utf32( seq[ 2 ] & 0x3F );
        break;
    case 4:
        ch = utf32( seq[ 0 ] & 0x07 ) << 18;
        ch |= utf32( seq[ 1 ] & 0x3F ) << 12;
        ch |= utf32( seq[ 2 ] & 0x3F ) << 6;
        ch |= utf32( seq[ 3 ] & 0x3F );
        break;
    }
    // Rejects over-long encodings; utf8length also validates the value itself
    if ( utf8length( ch ) != width )
        throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 sequence of " + toString( width ) + L" chars generated a UTF32 character (" + toString( ch ) + L") with a different length (" + toString( utf8length( ch ) ) + L")" );
    return ch;
}


/*
    Decodes the character starting at seq, combining a surrogate pair when seq points
    at a leading surrogate. Throws UnicodeEncoding for a missing or bad trailing
    surrogate and for values rejected by assertValid. Diagnostics about the neighbouring
    elements are appended to any FSLib exception that passes through.
*/
utf32 FSLib::utf::decode( const utf16 *seq ) {
    try {
        utf32 ch = *seq;
        if ( ch >= 0xD800 && ch <= 0xDBFF ) {
            if ( seq[ 1 ] == 0 )
                throw FSLib::Exceptions::UnicodeEncoding( L"Trailing surrogate missing from UTF-16 sequence (it is ZERO)" );
            if ( seq[ 1 ] < 0xDC00 || seq[ 1 ] > 0xDFFF )
                throw FSLib::Exceptions::UnicodeEncoding( L"Trailing character in a UTF-16 surrogate pair is missing (outside correct range)" );
            return assertValid( ( ch << 10 ) + seq[ 1 ] + 0x10000 - ( 0xD800 << 10 ) - 0xDC00 );
        }
        return assertValid( ch );
    } catch ( FSLib::Exceptions::Exception &e ) {
        e.info() << L"Decoding UTF-16 number: " << toString( static_cast< unsigned int >( seq[ 0 ] ) ) << std::endl;
        // NOTE(review): seq[ -1 ] is read even when seq is the first element of its buffer,
        // which is outside the caller's data - confirm that every caller can tolerate this.
        e.info() << L"Preceeding UTF-16 number: " << toString( static_cast< unsigned int >( seq[ -1 ] ) ) << std::endl;
        e.info() << L"Following UTF-16 number: " << toString( static_cast< unsigned int >( seq[ 1 ] ) ) << std::endl;
        throw;
    }
}


/*
    Returns *seq after validating it with assertValid.
*/
utf32 FSLib::utf::decode( const utf32 *seq ) {
    return assertValid( *seq );
}


/*
    Writes ch as UTF-8 starting at begin and returns the number of bytes written, or
    zero when the buffer is judged too small. Throws UnicodeEncoding (through
    assertValid) for an unacceptable ch.
*/
std::size_t FSLib::utf::encode( const utf32 ch, utf8 *begin, const utf8 *end ) {
    const std::size_t sz = utf::utf8length( assertValid( ch ) );
    // NOTE(review): the strict '<' means a buffer with exactly sz free elements is refused.
    // This may deliberately keep room for a terminator - confirm before changing.
    if ( !( begin + sz < end ) )
        return 0;
    switch ( sz ) {
    case 1:
        begin[ 0 ] = static_cast< utf8 >( ch & 0x7F );
        break;
    case 2:
        begin[ 0 ] = 0xC0 | ( static_cast< utf8 >( ch >> 6 ) & 0x1F );
        begin[ 1 ] = 0x80 | ( static_cast< utf8 >( ch ) & 0x3F );
        break;
    case 3:
        begin[ 0 ] = 0xE0 | ( static_cast< utf8 >( ch >> 12 ) & 0x0F );
        begin[ 1 ] = 0x80 | ( static_cast< utf8 >( ch >> 6 ) & 0x3F );
        begin[ 2 ] = 0x80 | ( static_cast< utf8 >(ch) & 0x3F );
        break;
    case 4:
        begin[ 0 ] = 0xF0 | ( static_cast< utf8 >( ch >> 18 ) & 0x07 );
        begin[ 1 ] = 0x80 | ( static_cast< utf8 >( ch >> 12 ) & 0x3F );
        begin[ 2 ] = 0x80 | ( static_cast< utf8 >( ch >> 6 ) & 0x3F );
        begin[ 3 ] = 0x80 | ( static_cast< utf8 >( ch ) & 0x3F );
        break;
    default:
        throw FSLib::Exceptions::OutOfRange< std::size_t >( L"Number of UTF-8 bytes for a single character outside of permitted range", 1, utf::utf32_utf8_max_length, sz );
    }
    return sz;
}


/*
    Writes ch as UTF-16 starting at begin (a surrogate pair for values of 0x10000 and
    above) and returns the number of elements written, or zero when the buffer is
    judged too small. Throws UnicodeEncoding (through assertValid) for an unacceptable ch.
*/
std::size_t FSLib::utf::encode( const utf32 ch, utf16 *begin, const utf16 *end ) {
    const std::size_t sz = utf::utf16length( assertValid( ch ) );
    // NOTE(review): same strict '<' capacity test as the UTF-8 overload - see the note there.
    if ( !( begin + sz < end ) )
        return 0;
    if ( sz == 1 )
        begin[ 0 ] = utf16( ch );
    else {
        begin[ 0 ] = 0xD800 - ( 0x10000 >> 10 ) + static_cast< utf16 >( ch >> 10 );
        begin[ 1 ] = 0xDC00 + static_cast< utf16 >( ch & 0x3FF );
    }
    return sz;
}


/*
    FSLib::uofstream
*/


FSLib::uofstream::uofstream()
: std::basic_ofstream< wchar_t >(), m_locale( std::locale::classic(), new codecvt_utf8( 0 ) ) {
}


FSLib::uofstream::uofstream( const char *filename, std::ios_base::openmode mode, int prot )
: std::basic_ofstream< wchar_t >(), m_locale( std::locale::classic(), new codecvt_utf8( 0 ) ) {
    open( filename, mode, prot );
}


/*
    Opens the file, imbues the stream with the UTF-8 locale and, when the stream is
    good, writes a BOM as the first character.
*/
void FSLib::uofstream::open( const char *filename, std::ios_base::openmode mode, int prot ) {
    std::basic_ofstream< wchar_t >::open( filename, mode, prot );
    imbue( m_locale );
    if ( good() )
        put( wchar_t( utf::c_bom ) ); // BOM
}


void FSLib::uofstream::open( const char *filename, std::ios_base::open_mode mode ) {
    open( filename, std::ios_base::openmode( mode ) );
}


/*
    FSLib::uifstream
*/


FSLib::uifstream::uifstream()
: std::basic_ifstream< utf16 >(), m_utf8( std::locale::classic(), new codecvt_utf8( 0 ) ) {
}


FSLib::uifstream::uifstream( const char *filename, std::ios_base::openmode mode, int prot )
: std::basic_ifstream< utf16 >(), m_utf8( std::locale::classic(), new codecvt_utf8( 0 ) ) {
    open( filename, mode, prot );
}


/*
    Sniffs the first bytes of the file through a separate byte stream, then opens the
    file with the UTF-8 locale imbued and discards a leading BOM.

    Returns without opening the file when the first byte cannot be read. Throws
    NotImplemented when the content looks like UTF-16 and UnicodeEncoding when the
    first byte cannot start a UTF-8 file.
*/
void FSLib::uifstream::open( const char *filename, std::ios_base::openmode mode, int prot ) {
    std::locale &loc = m_utf8; // default to UTF8
    bool bom = false;

    std::basic_ifstream< utf8 > chk( filename, mode, prot );
    const std::ifstream::int_type u8( chk.get() );
    if ( !chk.good() )
        return;
    if ( u8 > 0xFF )
        throw FSLib::Exceptions::UnicodeEncoding( L"File error reading first byte" );
    // The UTF-16 BOM is FF FE when little endian and FE FF when big endian. A leading
    // zero byte is what big endian text in the ASCII range starts with.
    if ( u8 == 0xFF )
        throw FSLib::Exceptions::NotImplemented( L"UTF16 LE not supported" );
    if ( u8 == 0xFE || u8 == 0x00 )
        throw FSLib::Exceptions::NotImplemented( L"UTF16 BE not supported" );

    // Read the second byte exactly once so that the BOM test below still sees it
    const std::ifstream::int_type second( chk.get() );
    if ( second == 0x00 )
        throw FSLib::Exceptions::NotImplemented( L"UTF16 LE not supported" );
    if ( u8 >= 0x80 && u8 <= 0xBF )
        throw FSLib::Exceptions::UnicodeEncoding( L"File may not start with a UTF8 continuation character" );
    if ( u8 == 0xEF && second == 0xBB && chk.get() == 0xBF )
        bom = true;

    std::basic_ifstream< utf16 >::open( filename, mode, prot );
    imbue( loc );
    if ( bom )
        get(); // throw away leading BOM
}


void FSLib::uifstream::open( const char *filename, std::ios_base::open_mode mode ) {
    open( filename, std::ios_base::openmode( mode ) );
}


/*
    Reads the whole of the named file and returns its content as a wstring.

    Each element read from the stream is treated as one UTF-8 byte and the sequences
    are assembled here. A BOM at the start of the text is dropped. Throws
    UnicodeEncoding for misplaced or missing continuation bytes, for a BOM after the
    start of the text and for a read error.
*/
FSLib::wstring FSLib::uifstream::load( const char *filename ) {
    uifstream file( filename );
    wstring text;
    utf32 u32 = 0; // the character being assembled
    size_t len = 0; // continuation bytes still expected for u32
    while ( !file.eof() && file.good() ) {
        utf16 u16 = file.get();
        if ( u16 < 0x80 && len > 0 )
            throw FSLib::Exceptions::UnicodeEncoding( L"Not enough continuation characters found" );
        else if ( u16 < 0x80 )
            text += u16;
        else if ( u16 >= 0x80 && u16 < 0xC0 ) {
            if ( len-- == 0 )
                throw FSLib::Exceptions::UnicodeEncoding( L"Continuation character found in wrong place" );
            u32 = ( u32 << 6 ) | ( u16 & 0x3F );
            if ( len == 0 && u32 == utf::c_bom && !text.empty() )
                // The exception used to be constructed here and then discarded
                throw FSLib::Exceptions::UnicodeEncoding( L"BOM may not appear anywhere other than at the beginning of the file" );
            else if ( len == 0 && u32 != utf::c_bom )
                text += u32;
        } else if ( u16 >= 0xC0 && u16 < 0xE0 ) {
            len = 1;
            u32 = u16 & 0x1F;
        } else if ( u16 >= 0xE0 && u16 < 0xF0 ) {
            len = 2;
            u32 = u16 & 0x0F;
        } else if ( u16 >= 0xF0 && u16 < 0xF8 ) {
            len = 3;
            u32 = u16 & 0x07;
        } else if ( u16 == 0xFFFF && len > 0 )
            throw FSLib::Exceptions::UnicodeEncoding( L"Not enough continuation characters found before end of file" );
    }
    if ( !file.eof() && file.bad() )
        throw FSLib::Exceptions::UnicodeEncoding( L"Error reading from file - not all content read" );
    return text;
}


/*
    FSLib::utf::filebuf
*/


FSLib::utf::filebuf::filebuf()
: m_hfile( 0 ), m_mode( 0 ) {
}


FSLib::utf::filebuf::~filebuf() {
    if ( m_hfile || m_mode )
        close();
}


/*
    Creates (or truncates) the named file for writing and remembers mode. Throws
    NotNull when a file is already open.

    CreateFile reports failure with INVALID_HANDLE_VALUE rather than zero. When that
    happens the object is left closed so that close() is never handed an invalid handle.
    NOTE(review): the failure is still not reported to the caller - decide which
    exception should be thrown here.
*/
void FSLib::utf::filebuf::open( const wstring &filename, std::ios_base::openmode mode ) {
    if ( m_hfile || m_mode )
        throw FSLib::Exceptions::NotNull( L"File has already been opened." );
    const HANDLE handle = ::CreateFile( filename.c_str(), FILE_WRITE_DATA, FILE_SHARE_READ, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL );
    if ( handle == INVALID_HANDLE_VALUE )
        return;
    m_hfile = handle;
    m_mode = mode;
}


void FSLib::utf::filebuf::close() {
    if ( m_hfile )
        ::CloseHandle( m_hfile );
    m_hfile = 0;
    m_mode = 0;
}


FSLib::utf::filebuf::int_type FSLib::utf::filebuf::underflow() {
    throw FSLib::Exceptions::NotImplemented( L"FSLib::utf8::filebuf::underflow not implemented for write buffer" );
}


FSLib::utf::filebuf::int_type FSLib::utf::filebuf::overflow( FSLib::utf::filebuf::int_type ch ) {
    // Nothing is written to the file yet; the character is just handed back
    return ch;
}


/*
    FSLib::utf::codecvt_utf8
*/


FSLib::utf::codecvt_utf8::codecvt_utf8( std::size_t ref )
: std::codecvt< utf16, utf8, utf16 >( ref ) {
}


bool FSLib::utf::codecvt_utf8::do_always_noconv() const {
    return false; // shows that conversion do take place
}


int FSLib::utf::codecvt_utf8::do_max_length() const {
    return 4;
}


int FSLib::utf::codecvt_utf8::do_encoding() const {
    return 0; // the encoding involves sequences of varying lengths
}


/*
    This implementation follows the Microsoft (Dinkumware) documentation. This does not conform to the C++ standard.
    The standard states that the length is the number of extern_type and this implementation counts the number of intern_type.

    Returns the number of complete, well formed UTF-8 sequences at the start of
    [first1, last1), counting at most len2 of them. Counting stops, without throwing,
    at the first byte that cannot start a sequence, at the first bad continuation byte
    and at a sequence that is cut short by last1. Each sequence counts as one, however
    many bytes it uses.
*/
int FSLib::utf::codecvt_utf8::do_length( const state_type &, const extern_type *first1, const extern_type *last1, std::size_t len2 ) const {
    int count = 0;
    while ( first1 != last1 && std::size_t( count ) != len2 ) {
        const std::size_t width = ::chars_noexception( *first1 );
        if ( width == 0 )
            return count; // Not a proper UTF-8 char so we stop counting already
        if ( std::size_t( last1 - first1 ) < width )
            return count; // Final character was incomplete so don't count it
        // The continuation bytes are the ones after the lead byte
        for ( std::size_t chk = 1; chk < width; ++chk ) {
            if ( first1[ chk ] < 0x80 || first1[ chk ] > 0xBF )
                return count; // throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 continuation character is not correct" );
        }
        first1 += width;
        ++count;
    }
    return count;
}


/*
    Converts the single UTF-8 sequence at first1 into UTF-16 at first2.

    Returns ok with mid1 and mid2 moved past what was consumed and produced, partial
    when more input or more output space is wanted, and error for malformed input. On
    partial and error mid1 and mid2 are left at first1 and first2. Only one character
    is converted per call.
*/
FSLib::utf::codecvt_utf8::result FSLib::utf::codecvt_utf8::do_in( state_type &, const extern_type *first1, const extern_type *last1, const extern_type *&mid1, intern_type *first2, intern_type *last2, intern_type *&mid2 ) const {
    // Nothing has been converted yet; callers read these whatever the result is
    mid1 = first1;
    mid2 = first2;
    if ( first1 == last1 )
        return partial; // no lead byte to look at

    utf32 ch;
    const std::size_t width = ::chars_noexception( *first1 );
    if ( width == 0 )
        return error;
    // NOTE(review): '>=' asks for one byte more than the sequence needs - confirm that
    // this look-ahead is intended before tightening it.
    else if ( first1 + width >= last1 )
        return partial;
    for ( std::size_t chk = 1; chk < width; ++chk ) {
        if ( first1[ chk ] < 0x80 || first1[ chk ] > 0xBF )
            return error; //throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 continuation character is not correct (" + toString( chk ) + L" of " + toString( chars ) + L") is " + toString( seq[ chk ] ) );
    }
    switch ( width ) {
    case 1:
        ch = utf32( first1[ 0 ] & 0x7F );
        break;
    case 2:
        ch = utf32( first1[ 0 ] & 0x1F ) << 6;
        ch |= utf32( first1[ 1 ] & 0x3F );
        break;
    case 3:
        ch = utf32( first1[ 0 ] & 0x0F ) << 12;
        ch |= utf32( first1[ 1 ] & 0x3F ) << 6;
        ch |= utf32( first1[ 2 ] & 0x3F );
        break;
    case 4:
        ch = utf32( first1[ 0 ] & 0x07 ) << 18;
        ch |= utf32( first1[ 1 ] & 0x3F ) << 12;
        ch |= utf32( first1[ 2 ] & 0x3F ) << 6;
        ch |= utf32( first1[ 3 ] & 0x3F );
        break;
    }
    // utf8length and encode validate ch and throw UnicodeEncoding when it is not acceptable
    if ( utf8length( ch ) != width )
        return error; //throw FSLib::Exceptions::UnicodeEncoding( L"UTF-8 sequence of " + toString( chars ) + L" chars generated a UTF32 character (" + toString( ch ) + L") with a different length (" + toString( utf8length( ch ) ) + L")" );
    const size_t utf16len = encode( ch, first2, last2 );
    if ( utf16len == 0 )
        return partial;
    mid1 = first1 + width;
    mid2 = first2 + utf16len;
    return ok;
}


/*
    Converts the single UTF-16 element at first1 into 1 to 3 UTF-8 bytes at first2.

    Returns ok with mid1 and mid2 moved past what was consumed and produced, partial
    when more output space is wanted and error when the element fits none of the
    ranges. On partial and error mid1 and mid2 are left at first1 and first2. Surrogates
    are not combined; each element is encoded on its own.
*/
FSLib::utf::codecvt_utf8::result FSLib::utf::codecvt_utf8::do_out( state_type &, const intern_type *first1, const intern_type *last1, const intern_type *&mid1, extern_type *first2, extern_type *last2, extern_type *&mid2 ) const {
    // Nothing has been converted yet; callers read these whatever the result is
    mid1 = first1;
    mid2 = first2;
    if ( first1 == last1 )
        return ok; // nothing to convert

    // NOTE(review): the '>=' tests below want one more free byte than is written - confirm
    // that this is intended before tightening them.
    if ( *first1 >= 0x0000 && *first1 <= 0x007F ) {
        if ( first2 + 1 >= last2 )
            return partial;
        first2[ 0 ] = static_cast< extern_type >( *first1 & 0x007F );
        mid2 = first2 + 1;
    } else if ( *first1 >= 0x0080 && *first1 <= 0x07FF ) {
        if ( first2 + 2 >= last2 )
            return partial;
        first2[ 0 ] = 0xC0 | ( static_cast< extern_type >( *first1 >> 6 ) & 0x1F );
        first2[ 1 ] = 0x80 | ( static_cast< extern_type >( *first1 ) & 0x3F );
        mid2 = first2 + 2;
    } else if ( *first1 >= 0x0800 && *first1 <= 0xFFFF ) {
        if ( first2 + 3 >= last2 )
            return partial;
        first2[ 0 ] = 0xE0 | ( static_cast< extern_type >( *first1 >> 12 ) & 0x0F );
        first2[ 1 ] = 0x80 | ( static_cast< extern_type >( *first1 >> 6 ) & 0x3F );
        first2[ 2 ] = 0x80 | ( static_cast< extern_type >( *first1 ) & 0x3F );
        mid2 = first2 + 3;
    } else {
        return error;
    }
    mid1 = first1 + 1;
    return ok;
}


/*
    $History: unicode.cpp $
 *
 * ***************** Version 10 *****************
 * User: Kirit Date: 23/02/06 Time: 18:05
 * Updated in $/FOST.3/F3Util
 * Improved Unicode failure diagnostics and debugged UTF-16 encoding of
 * extended Unicode characters.
 *
 * ***************** Version 9 *****************
 * User: Kirit Date: 14/03/05 Time: 10:27
 * Updated in $/FOST.3/F3Util
 * Debugged UTF-8 input functions.
 *
 * ***************** Version 8 *****************
 * User: Kirit Date: 5/03/05 Time: 17:40
 * Updated in $/FOST.3/F3Util
 * Basic file reading implemented for fetching a FOST.3 script.
 *
 * ***************** Version 7 *****************
 * User: Kirit Date: 13/02/05 Time: 15:46
 * Updated in $/FOST.3/F3Util
 * Added rfind to wstring and finding of utf16 sequence lengths.
 *
 * ***************** Version 6 *****************
 * User: Kirit Date: 14/01/05 Time: 13:46
 * Updated in $/FOST.3/F3Util
 * UTF-8 sequences with Unicode characters from HTTP POST now works
 * correctly.
 *
 * ***************** Version 5 *****************
 * User: Kirit Date: 27/12/04 Time: 16:58
 * Updated in $/FOST.3/F3Util
 * Updated for move to new installation - fix of old problems with old
 * files.
 *
 * ***************** Version 4 *****************
 * User: Kirit Date: 5/11/04 Time: 12:51
 * Updated in $/FOST.3/F3Util
 * Added extra check for other invalid characters.
 *
 * ***************** Version 3 *****************
 * User: Kirit Date: 3/11/04 Time: 21:51
 * Updated in $/FOST.3/F3Util
 * First working version of FSLib::wstring using UTF-16.
 *
 * ***************** Version 2 *****************
 * User: Kirit Date: 4/10/04 Time: 14:43
 * Updated in $/FOST.3/f3util
 * Removed some compile check code and added comments about the
 * implementation.
 *
 * ***************** Version 1 *****************
 * User: Kirit Date: 3/10/04 Time: 22:22
 * Created in $/FOST.3/F3Util
 * Log files now seem to be correct UTF-8 output. There are still
 * potential issues with the internal wchar_t representation.
*/