include/ndnboost/detail/utf8_codecvt_facet.hpp - ndn-cxx - Gitiles

 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
 // Distributed under the Boost Software License, Version 1.0. (See accompany-
 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

 #ifndef NDNBOOST_UTF8_CODECVT_FACET_HPP
 #define NDNBOOST_UTF8_CODECVT_FACET_HPP

 // MS compatible compilers support #pragma once
 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
 # pragma once
 #endif

 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
 // utf8_codecvt_facet.hpp

 // This header defines class utf8_codecvt_facet, derived fro
 // std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
 // files into wchar_t strings in the application.
 //
 // The header is NOT STANDALONE, and is not to be included by the USER.
 // There are at least two libraries which want to use this functionality, and
 // we want to avoid code duplication. It would be possible to create utf8
 // library, but:
 // - this requires review process first
 // - in the case, when linking the a library which uses utf8
 //   (say 'program_options'), user should also link to the utf8 library.
 //   This seems inconvenient, and asking a user to link to an unrevieved
 //   library is strange.
 // Until the above points are fixed, a library which wants to use utf8 must:
 // - include this header from one of it's headers or sources
 // - include the corresponding .cpp file from one of the sources
 // - before including either file, the library must define
 //   - NDNBOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
 //   - NDNBOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
 //   - declaration.
 //   - NDNBOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
 //     symbols.
 //
 // For example, program_options library might contain:
 //    #define NDNBOOST_UTF8_BEGIN_NAMESPACE <backslash character>
 //             namespace ndnboost { namespace program_options {
 //    #define NDNBOOST_UTF8_END_NAMESPACE }}
 //    #define NDNBOOST_UTF8_DECL NDNBOOST_PROGRAM_OPTIONS_DECL
 //    #include "../../detail/utf8/utf8_codecvt.cpp"
 //
 // Essentially, each library will have its own copy of utf8 code, in
 // different namespaces.

 // Note:(Robert Ramey).  I have made the following alterations in the original
 // code.
 // a) Rendered utf8_codecvt<wchar_t, char>  with using templates
 // b) Move longer functions outside class definition to prevent inlining
 // and make code smaller
 // c) added on a derived class to permit translation to/from current
 // locale to utf8

 //  See http://www.boost.org for updates, documentation, and revision history.

 // archives stored as text - note these ar templated on the basic
 // stream templates to accommodate wide (and other?) kind of characters
 //
 // note the fact that on libraries without wide characters, ostream is
 // is not a specialization of basic_ostream which in fact is not defined
 // in such cases.   So we can't use basic_ostream<OStream::char_type> but rather
 // use two template parameters
 //
 // utf8_codecvt_facet
 //   This is an implementation of a std::codecvt facet for translating
 //   from UTF-8 externally to UCS-4.  Note that this is not tied to
 //   any specific types in order to allow customization on platforms
 //   where wchar_t is not big enough.
 //
 // NOTES:  The current implementation jumps through some unpleasant hoops in
 // order to deal with signed character types.  As a std::codecvt_base::result,
 // it is necessary  for the ExternType to be convertible to unsigned  char.
 // I chose not to tie the extern_type explicitly to char. But if any combination
 // of types other than <wchar_t,char_t> is used, then std::codecvt must be
 // specialized on those types for this to work.

 #include <locale>
 #include <cwchar>   // for mbstate_t
 #include <cstddef>  // for std::size_t

 #include <ndnboost/config.hpp>
 #include <ndnboost/detail/workaround.hpp>

 #if defined(NDNBOOST_NO_STDC_NAMESPACE)
 namespace std {
     using ::mbstate_t;
     using ::size_t;
 }
 #endif

 #if !defined(__MSL_CPP__) && !defined(__LIBCOMO__)
     #define NDNBOOST_CODECVT_DO_LENGTH_CONST const
 #else
     #define NDNBOOST_CODECVT_DO_LENGTH_CONST
 #endif

 // maximum lenght of a multibyte string
 #define MB_LENGTH_MAX 8

 NDNBOOST_UTF8_BEGIN_NAMESPACE

 struct NDNBOOST_UTF8_DECL utf8_codecvt_facet :
     public std::codecvt<wchar_t, char, std::mbstate_t>
 {
 public:
     explicit utf8_codecvt_facet(std::size_t no_locale_manage=0)
         : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
     {}
 protected:
     virtual std::codecvt_base::result do_in(
         std::mbstate_t& state,
         const char * from,
         const char * from_end,
         const char * & from_next,
         wchar_t * to,
         wchar_t * to_end,
         wchar_t*& to_next
     ) const;

     virtual std::codecvt_base::result do_out(
         std::mbstate_t & state, const wchar_t * from,
         const wchar_t * from_end, const wchar_t*  & from_next,
         char * to, char * to_end, char * & to_next
     ) const;

     bool invalid_continuing_octet(unsigned char octet_1) const {
         return (octet_1 < 0x80|| 0xbf< octet_1);
     }

     bool invalid_leading_octet(unsigned char octet_1)   const {
         return (0x7f < octet_1 && octet_1 < 0xc0) ||
             (octet_1 > 0xfd);
     }

     // continuing octets = octets except for the leading octet
     static unsigned int get_cont_octet_count(unsigned   char lead_octet) {
         return get_octet_count(lead_octet) - 1;
     }

     static unsigned int get_octet_count(unsigned char   lead_octet);

     // How many "continuing octets" will be needed for this word
     // ==   total octets - 1.
     int get_cont_octet_out_count(wchar_t word) const ;

     virtual bool do_always_noconv() const throw() { return false; }

     // UTF-8 isn't really stateful since we rewind on partial conversions
     virtual std::codecvt_base::result do_unshift(
         std::mbstate_t&,
         char * from,
         char * /*to*/,
         char * & next
     ) const
     {
         next = from;
         return ok;
     }

     virtual int do_encoding() const throw() {
         const int variable_byte_external_encoding=0;
         return variable_byte_external_encoding;
     }

     // How many char objects can I process to get <= max_limit
     // wchar_t objects?
     virtual int do_length(
         NDNBOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
         const char * from,
         const char * from_end,
         std::size_t max_limit
 #if NDNBOOST_WORKAROUND(__IBMCPP__, NDNBOOST_TESTED_AT(600))
         ) const throw();
 #else
         ) const;
 #endif

     // Largest possible value do_length(state,from,from_end,1) could return.
     virtual int do_max_length() const throw () {
         return 6; // largest UTF-8 encoding of a UCS-4 character
     }
 };

 NDNBOOST_UTF8_END_NAMESPACE

 #endif // NDNBOOST_UTF8_CODECVT_FACET_HPP
	// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
	// Distributed under the Boost Software License, Version 1.0. (See accompany-
	// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

	#ifndef NDNBOOST_UTF8_CODECVT_FACET_HPP
	#define NDNBOOST_UTF8_CODECVT_FACET_HPP

	// MS compatible compilers support #pragma once
	#if defined(_MSC_VER) && (_MSC_VER >= 1020)
	# pragma once
	#endif

	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	// utf8_codecvt_facet.hpp

	// This header defines class utf8_codecvt_facet, derived fro
	// std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
	// files into wchar_t strings in the application.
	//
	// The header is NOT STANDALONE, and is not to be included by the USER.
	// There are at least two libraries which want to use this functionality, and
	// we want to avoid code duplication. It would be possible to create utf8
	// library, but:
	// - this requires review process first
	// - in the case, when linking the a library which uses utf8
	// (say 'program_options'), user should also link to the utf8 library.
	// This seems inconvenient, and asking a user to link to an unrevieved
	// library is strange.
	// Until the above points are fixed, a library which wants to use utf8 must:
	// - include this header from one of it's headers or sources
	// - include the corresponding .cpp file from one of the sources
	// - before including either file, the library must define
	// - NDNBOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
	// - NDNBOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
	// - declaration.
	// - NDNBOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
	// symbols.
	//
	// For example, program_options library might contain:
	// #define NDNBOOST_UTF8_BEGIN_NAMESPACE <backslash character>
	// namespace ndnboost { namespace program_options {
	// #define NDNBOOST_UTF8_END_NAMESPACE }}
	// #define NDNBOOST_UTF8_DECL NDNBOOST_PROGRAM_OPTIONS_DECL
	// #include "../../detail/utf8/utf8_codecvt.cpp"
	//
	// Essentially, each library will have its own copy of utf8 code, in
	// different namespaces.

	// Note:(Robert Ramey). I have made the following alterations in the original
	// code.
	// a) Rendered utf8_codecvt<wchar_t, char> with using templates
	// b) Move longer functions outside class definition to prevent inlining
	// and make code smaller
	// c) added on a derived class to permit translation to/from current
	// locale to utf8

	// See http://www.boost.org for updates, documentation, and revision history.

	// archives stored as text - note these ar templated on the basic
	// stream templates to accommodate wide (and other?) kind of characters
	//
	// note the fact that on libraries without wide characters, ostream is
	// is not a specialization of basic_ostream which in fact is not defined
	// in such cases. So we can't use basic_ostream<OStream::char_type> but rather
	// use two template parameters
	//
	// utf8_codecvt_facet
	// This is an implementation of a std::codecvt facet for translating
	// from UTF-8 externally to UCS-4. Note that this is not tied to
	// any specific types in order to allow customization on platforms
	// where wchar_t is not big enough.
	//
	// NOTES: The current implementation jumps through some unpleasant hoops in
	// order to deal with signed character types. As a std::codecvt_base::result,
	// it is necessary for the ExternType to be convertible to unsigned char.
	// I chose not to tie the extern_type explicitly to char. But if any combination
	// of types other than <wchar_t,char_t> is used, then std::codecvt must be
	// specialized on those types for this to work.

	#include <locale>
	#include <cwchar> // for mbstate_t
	#include <cstddef> // for std::size_t

	#include <ndnboost/config.hpp>
	#include <ndnboost/detail/workaround.hpp>

	#if defined(NDNBOOST_NO_STDC_NAMESPACE)
	namespace std {
	using ::mbstate_t;
	using ::size_t;
	}
	#endif

	#if !defined(__MSL_CPP__) && !defined(__LIBCOMO__)
	#define NDNBOOST_CODECVT_DO_LENGTH_CONST const
	#else
	#define NDNBOOST_CODECVT_DO_LENGTH_CONST
	#endif

	// maximum lenght of a multibyte string
	#define MB_LENGTH_MAX 8

	NDNBOOST_UTF8_BEGIN_NAMESPACE

	struct NDNBOOST_UTF8_DECL utf8_codecvt_facet :
	public std::codecvt<wchar_t, char, std::mbstate_t>
	{
	public:
	explicit utf8_codecvt_facet(std::size_t no_locale_manage=0)
	: std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
	{}
	protected:
	virtual std::codecvt_base::result do_in(
	std::mbstate_t& state,
	const char * from,
	const char * from_end,
	const char * & from_next,
	wchar_t * to,
	wchar_t * to_end,
	wchar_t*& to_next
	) const;

	virtual std::codecvt_base::result do_out(
	std::mbstate_t & state, const wchar_t * from,
	const wchar_t * from_end, const wchar_t* & from_next,
	char * to, char * to_end, char * & to_next
	) const;

	bool invalid_continuing_octet(unsigned char octet_1) const {
	return (octet_1 < 0x80\|\| 0xbf< octet_1);
	}

	bool invalid_leading_octet(unsigned char octet_1) const {
	return (0x7f < octet_1 && octet_1 < 0xc0) \|\|
	(octet_1 > 0xfd);
	}

	// continuing octets = octets except for the leading octet
	static unsigned int get_cont_octet_count(unsigned char lead_octet) {
	return get_octet_count(lead_octet) - 1;
	}

	static unsigned int get_octet_count(unsigned char lead_octet);

	// How many "continuing octets" will be needed for this word
	// == total octets - 1.
	int get_cont_octet_out_count(wchar_t word) const ;

	virtual bool do_always_noconv() const throw() { return false; }

	// UTF-8 isn't really stateful since we rewind on partial conversions
	virtual std::codecvt_base::result do_unshift(
	std::mbstate_t&,
	char * from,
	char * /to/,
	char * & next
	) const
	{
	next = from;
	return ok;
	}

	virtual int do_encoding() const throw() {
	const int variable_byte_external_encoding=0;
	return variable_byte_external_encoding;
	}

	// How many char objects can I process to get <= max_limit
	// wchar_t objects?
	virtual int do_length(
	NDNBOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
	const char * from,
	const char * from_end,
	std::size_t max_limit
	#if NDNBOOST_WORKAROUND(__IBMCPP__, NDNBOOST_TESTED_AT(600))
	) const throw();
	#else
	) const;
	#endif

	// Largest possible value do_length(state,from,from_end,1) could return.
	virtual int do_max_length() const throw () {
	return 6; // largest UTF-8 encoding of a UCS-4 character
	}
	};

	NDNBOOST_UTF8_END_NAMESPACE

	#endif // NDNBOOST_UTF8_CODECVT_FACET_HPP