Blame - include/ndnboost/detail/utf8_codecvt_facet.ipp - ndnSIM/ndn-cxx

blob: 00e893d0f1b752c2e59bcb58b189478aa45959f3 [file] [log] [blame]

Jeff Thompson	86b6d64	2013-10-17 15:01:56 -0700	[diff] [blame^]	1	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
				2	// utf8_codecvt_facet.ipp
				3
				4	// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
				5	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
				6	// Use, modification and distribution is subject to the Boost Software
				7	// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
				8	// http://www.boost.org/LICENSE_1_0.txt)
				9
				10	// Please see the comments in <ndnboost/detail/utf8_codecvt_facet.hpp> to
				11	// learn how this file should be used.
				12
				13	#include <ndnboost/detail/utf8_codecvt_facet.hpp>
				14
				15	#include <cstdlib> // for multi-byte converson routines
				16	#include <cassert>
				17
				18	#include <ndnboost/limits.hpp>
				19	#include <ndnboost/config.hpp>
				20
				21	// If we don't have wstring, then Unicode support
				22	// is not available anyway, so we don't need to even
				23	// compile this file. This also fixes the problem
				24	// with mingw, which can compile this file, but will
				25	// generate link error when building DLL.
				26	#ifndef NDNBOOST_NO_STD_WSTRING
				27
				28	NDNBOOST_UTF8_BEGIN_NAMESPACE
				29
				30	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
				31	// implementation for wchar_t
				32
				33	// Translate incoming UTF-8 into UCS-4
				34	std::codecvt_base::result utf8_codecvt_facet::do_in(
				35	std::mbstate_t& /state/,
				36	const char * from,
				37	const char * from_end,
				38	const char * & from_next,
				39	wchar_t * to,
				40	wchar_t * to_end,
				41	wchar_t * & to_next
				42	) const {
				43	// Basic algorithm: The first octet determines how many
				44	// octets total make up the UCS-4 character. The remaining
				45	// "continuing octets" all begin with "10". To convert, subtract
				46	// the amount that specifies the number of octets from the first
				47	// octet. Subtract 0x80 (1000 0000) from each continuing octet,
				48	// then mash the whole lot together. Note that each continuing
				49	// octet only uses 6 bits as unique values, so only shift by
				50	// multiples of 6 to combine.
				51	while (from != from_end && to != to_end) {
				52
				53	// Error checking on the first octet
				54	if (invalid_leading_octet(*from)){
				55	from_next = from;
				56	to_next = to;
				57	return std::codecvt_base::error;
				58	}
				59
				60	// The first octet is adjusted by a value dependent upon
				61	// the number of "continuing octets" encoding the character
				62	const int cont_octet_count = get_cont_octet_count(*from);
				63	const wchar_t octet1_modifier_table[] = {
				64	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
				65	};
				66
				67	// The unsigned char conversion is necessary in case char is
				68	// signed (I learned this the hard way)
				69	wchar_t ucs_result =
				70	(unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
				71
				72	// Invariants :
				73	// 1) At the start of the loop, 'i' continuing characters have been
				74	// processed
				75	// 2) *from points to the next continuing character to be processed.
				76	int i = 0;
				77	while(i != cont_octet_count && from != from_end) {
				78
				79	// Error checking on continuing characters
				80	if (invalid_continuing_octet(*from)) {
				81	from_next = from;
				82	to_next = to;
				83	return std::codecvt_base::error;
				84	}
				85
				86	ucs_result *= (1 << 6);
				87
				88	// each continuing character has an extra (10xxxxxx)b attached to
				89	// it that must be removed.
				90	ucs_result += (unsigned char)(*from++) - 0x80;
				91	++i;
				92	}
				93
				94	// If the buffer ends with an incomplete unicode character...
				95	if (from == from_end && i != cont_octet_count) {
				96	// rewind "from" to before the current character translation
				97	from_next = from - (i+1);
				98	to_next = to;
				99	return std::codecvt_base::partial;
				100	}
				101	*to++ = ucs_result;
				102	}
				103	from_next = from;
				104	to_next = to;
				105
				106	// Were we done converting or did we run out of destination space?
				107	if(from == from_end) return std::codecvt_base::ok;
				108	else return std::codecvt_base::partial;
				109	}
				110
				111	std::codecvt_base::result utf8_codecvt_facet::do_out(
				112	std::mbstate_t& /state/,
				113	const wchar_t * from,
				114	const wchar_t * from_end,
				115	const wchar_t * & from_next,
				116	char * to,
				117	char * to_end,
				118	char * & to_next
				119	) const
				120	{
				121	// RG - consider merging this table with the other one
				122	const wchar_t octet1_modifier_table[] = {
				123	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
				124	};
				125
				126	wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
				127	while (from != from_end && to != to_end) {
				128
				129	// Check for invalid UCS-4 character
				130	if (*from > max_wchar) {
				131	from_next = from;
				132	to_next = to;
				133	return std::codecvt_base::error;
				134	}
				135
				136	int cont_octet_count = get_cont_octet_out_count(*from);
				137
				138	// RG - comment this formula better
				139	int shift_exponent = (cont_octet_count) * 6;
				140
				141	// Process the first character
				142	*to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
				143	(unsigned char)(*from / (1 << shift_exponent)));
				144
				145	// Process the continuation characters
				146	// Invariants: At the start of the loop:
				147	// 1) 'i' continuing octets have been generated
				148	// 2) '*to' points to the next location to place an octet
				149	// 3) shift_exponent is 6 more than needed for the next octet
				150	int i = 0;
				151	while (i != cont_octet_count && to != to_end) {
				152	shift_exponent -= 6;
				153	to++ = static_cast<char>(0x80 + ((from / (1 << shift_exponent)) % (1 << 6)));
				154	++i;
				155	}
				156	// If we filled up the out buffer before encoding the character
				157	if(to == to_end && i != cont_octet_count) {
				158	from_next = from;
				159	to_next = to - (i+1);
				160	return std::codecvt_base::partial;
				161	}
				162	++from;
				163	}
				164	from_next = from;
				165	to_next = to;
				166	// Were we done or did we run out of destination space
				167	if(from == from_end) return std::codecvt_base::ok;
				168	else return std::codecvt_base::partial;
				169	}
				170
				171	// How many char objects can I process to get <= max_limit
				172	// wchar_t objects?
				173	int utf8_codecvt_facet::do_length(
				174	NDNBOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
				175	const char * from,
				176	const char * from_end,
				177	std::size_t max_limit
				178	#if NDNBOOST_WORKAROUND(__IBMCPP__, NDNBOOST_TESTED_AT(600))
				179	) const throw()
				180	#else
				181	) const
				182	#endif
				183	{
				184	// RG - this code is confusing! I need a better way to express it.
				185	// and test cases.
				186
				187	// Invariants:
				188	// 1) last_octet_count has the size of the last measured character
				189	// 2) char_count holds the number of characters shown to fit
				190	// within the bounds so far (no greater than max_limit)
				191	// 3) from_next points to the octet 'last_octet_count' before the
				192	// last measured character.
				193	int last_octet_count=0;
				194	std::size_t char_count = 0;
				195	const char* from_next = from;
				196	// Use "<" because the buffer may represent incomplete characters
				197	while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
				198	from_next += last_octet_count;
				199	last_octet_count = (get_octet_count(*from_next));
				200	++char_count;
				201	}
				202	return static_cast<int>(from_next-from_end);
				203	}
				204
				205	unsigned int utf8_codecvt_facet::get_octet_count(
				206	unsigned char lead_octet
				207	){
				208	// if the 0-bit (MSB) is 0, then 1 character
				209	if (lead_octet <= 0x7f) return 1;
				210
				211	// Otherwise the count number of consecutive 1 bits starting at MSB
				212	// assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
				213
				214	if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
				215	else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
				216	else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
				217	else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
				218	else return 6;
				219	}
				220	NDNBOOST_UTF8_END_NAMESPACE
				221
				namespace {

				// Number of continuing octets (total octets - 1) needed to encode 'word'.
				// Generic version, used when sizeof(wchar_t) is not 4: at most two
				// continuing octets are ever reported.
				template<std::size_t s>
				int get_cont_octet_out_count_impl(wchar_t word){
				    return (word < 0x80) ? 0
				         : (word < 0x800) ? 1
				         : 2;
				}

				// Version for a 4-byte wchar_t: counts how many encoding thresholds
				// 'word' has reached, giving up to five continuing octets.
				template<>
				int get_cont_octet_out_count_impl<4>(wchar_t word){
				    int cont_octets = 0;
				    if (word >= 0x80) {
				        ++cont_octets;
				    }
				    if (word >= 0x800) {
				        ++cont_octets;
				    }

				    // Note that the following code will generate warnings on some platforms
				    // where wchar_t is defined as UCS2. The warnings are superfluous as the
				    // specialization is never instantiated with such compilers, but this
				    // can cause problems if warnings are being treated as errors, so we guard
				    // against that. Including <ndnboost/detail/utf8_codecvt_facet.hpp> as we do
				    // should be enough to get WCHAR_MAX defined.
				#if !defined(WCHAR_MAX)
				#   error WCHAR_MAX not defined!
				#endif
				    // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
				#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
				    // nothing above 0x800 is distinguished here
				#elif WCHAR_MAX > 0x10000
				    if (word >= 0x10000) {
				        ++cont_octets;
				    }
				    if (word >= 0x200000) {
				        ++cont_octets;
				    }
				    if (word >= 0x4000000) {
				        ++cont_octets;
				    }
				#endif
				    return cont_octets;
				}

				} // namespace anonymous
				274
				275	NDNBOOST_UTF8_BEGIN_NAMESPACE
				276	// How many "continuing octets" will be needed for this word
				277	// == total octets - 1.
				278	int utf8_codecvt_facet::get_cont_octet_out_count(
				279	wchar_t word
				280	) const {
				281	return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
				282	}
				283	NDNBOOST_UTF8_END_NAMESPACE
				284
				285	#endif