Jeff Thompson | 86b6d64 | 2013-10-17 15:01:56 -0700 | [diff] [blame] | 1 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 |
| 2 | // utf8_codecvt_facet.ipp |
| 3 | |
| 4 | // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) |
| 5 | // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). |
| 6 | // Use, modification and distribution is subject to the Boost Software |
| 7 | // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at |
| 8 | // http://www.boost.org/LICENSE_1_0.txt) |
| 9 | |
| 10 | // Please see the comments in <ndnboost/detail/utf8_codecvt_facet.hpp> to |
| 11 | // learn how this file should be used. |
| 12 | |
| 13 | #include <ndnboost/detail/utf8_codecvt_facet.hpp> |
| 14 | |
| 15 | #include <cstdlib> // for multi-byte conversion routines |
| 16 | #include <cassert> |
| 17 | |
| 18 | #include <ndnboost/limits.hpp> |
| 19 | #include <ndnboost/config.hpp> |
| 20 | |
| 21 | // If we don't have wstring, then Unicode support |
| 22 | // is not available anyway, so we don't need to even |
| 23 | // compile this file. This also fixes the problem |
| 24 | // with mingw, which can compile this file, but will |
| 25 | // generate link error when building DLL. |
| 26 | #ifndef NDNBOOST_NO_STD_WSTRING |
| 27 | |
| 28 | NDNBOOST_UTF8_BEGIN_NAMESPACE |
| 29 | |
| 30 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 |
| 31 | // implementation for wchar_t |
| 32 | |
| 33 | // Translate incoming UTF-8 into UCS-4 |
| 34 | std::codecvt_base::result utf8_codecvt_facet::do_in( |
| 35 | std::mbstate_t& /*state*/, |
| 36 | const char * from, |
| 37 | const char * from_end, |
| 38 | const char * & from_next, |
| 39 | wchar_t * to, |
| 40 | wchar_t * to_end, |
| 41 | wchar_t * & to_next |
| 42 | ) const { |
| 43 | // Basic algorithm: The first octet determines how many |
| 44 | // octets total make up the UCS-4 character. The remaining |
| 45 | // "continuing octets" all begin with "10". To convert, subtract |
| 46 | // the amount that specifies the number of octets from the first |
| 47 | // octet. Subtract 0x80 (1000 0000) from each continuing octet, |
| 48 | // then mash the whole lot together. Note that each continuing |
| 49 | // octet only uses 6 bits as unique values, so only shift by |
| 50 | // multiples of 6 to combine. |
| 51 | while (from != from_end && to != to_end) { |
| 52 | |
| 53 | // Error checking on the first octet |
| 54 | if (invalid_leading_octet(*from)){ |
| 55 | from_next = from; |
| 56 | to_next = to; |
| 57 | return std::codecvt_base::error; |
| 58 | } |
| 59 | |
| 60 | // The first octet is adjusted by a value dependent upon |
| 61 | // the number of "continuing octets" encoding the character |
| 62 | const int cont_octet_count = get_cont_octet_count(*from); |
| 63 | const wchar_t octet1_modifier_table[] = { |
| 64 | 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc |
| 65 | }; |
| 66 | |
| 67 | // The unsigned char conversion is necessary in case char is |
| 68 | // signed (I learned this the hard way) |
| 69 | wchar_t ucs_result = |
| 70 | (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; |
| 71 | |
| 72 | // Invariants : |
| 73 | // 1) At the start of the loop, 'i' continuing characters have been |
| 74 | // processed |
| 75 | // 2) *from points to the next continuing character to be processed. |
| 76 | int i = 0; |
| 77 | while(i != cont_octet_count && from != from_end) { |
| 78 | |
| 79 | // Error checking on continuing characters |
| 80 | if (invalid_continuing_octet(*from)) { |
| 81 | from_next = from; |
| 82 | to_next = to; |
| 83 | return std::codecvt_base::error; |
| 84 | } |
| 85 | |
| 86 | ucs_result *= (1 << 6); |
| 87 | |
| 88 | // each continuing character has an extra (10xxxxxx)b attached to |
| 89 | // it that must be removed. |
| 90 | ucs_result += (unsigned char)(*from++) - 0x80; |
| 91 | ++i; |
| 92 | } |
| 93 | |
| 94 | // If the buffer ends with an incomplete unicode character... |
| 95 | if (from == from_end && i != cont_octet_count) { |
| 96 | // rewind "from" to before the current character translation |
| 97 | from_next = from - (i+1); |
| 98 | to_next = to; |
| 99 | return std::codecvt_base::partial; |
| 100 | } |
| 101 | *to++ = ucs_result; |
| 102 | } |
| 103 | from_next = from; |
| 104 | to_next = to; |
| 105 | |
| 106 | // Were we done converting or did we run out of destination space? |
| 107 | if(from == from_end) return std::codecvt_base::ok; |
| 108 | else return std::codecvt_base::partial; |
| 109 | } |
| 110 | |
| 111 | std::codecvt_base::result utf8_codecvt_facet::do_out( |
| 112 | std::mbstate_t& /*state*/, |
| 113 | const wchar_t * from, |
| 114 | const wchar_t * from_end, |
| 115 | const wchar_t * & from_next, |
| 116 | char * to, |
| 117 | char * to_end, |
| 118 | char * & to_next |
| 119 | ) const |
| 120 | { |
| 121 | // RG - consider merging this table with the other one |
| 122 | const wchar_t octet1_modifier_table[] = { |
| 123 | 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc |
| 124 | }; |
| 125 | |
| 126 | wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)(); |
| 127 | while (from != from_end && to != to_end) { |
| 128 | |
| 129 | // Check for invalid UCS-4 character |
| 130 | if (*from > max_wchar) { |
| 131 | from_next = from; |
| 132 | to_next = to; |
| 133 | return std::codecvt_base::error; |
| 134 | } |
| 135 | |
| 136 | int cont_octet_count = get_cont_octet_out_count(*from); |
| 137 | |
| 138 | // RG - comment this formula better |
| 139 | int shift_exponent = (cont_octet_count) * 6; |
| 140 | |
| 141 | // Process the first character |
| 142 | *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + |
| 143 | (unsigned char)(*from / (1 << shift_exponent))); |
| 144 | |
| 145 | // Process the continuation characters |
| 146 | // Invariants: At the start of the loop: |
| 147 | // 1) 'i' continuing octets have been generated |
| 148 | // 2) '*to' points to the next location to place an octet |
| 149 | // 3) shift_exponent is 6 more than needed for the next octet |
| 150 | int i = 0; |
| 151 | while (i != cont_octet_count && to != to_end) { |
| 152 | shift_exponent -= 6; |
| 153 | *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); |
| 154 | ++i; |
| 155 | } |
| 156 | // If we filled up the out buffer before encoding the character |
| 157 | if(to == to_end && i != cont_octet_count) { |
| 158 | from_next = from; |
| 159 | to_next = to - (i+1); |
| 160 | return std::codecvt_base::partial; |
| 161 | } |
| 162 | ++from; |
| 163 | } |
| 164 | from_next = from; |
| 165 | to_next = to; |
| 166 | // Were we done or did we run out of destination space |
| 167 | if(from == from_end) return std::codecvt_base::ok; |
| 168 | else return std::codecvt_base::partial; |
| 169 | } |
| 170 | |
| 171 | // How many char objects can I process to get <= max_limit |
| 172 | // wchar_t objects? |
| 173 | int utf8_codecvt_facet::do_length( |
| 174 | NDNBOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &, |
| 175 | const char * from, |
| 176 | const char * from_end, |
| 177 | std::size_t max_limit |
| 178 | #if NDNBOOST_WORKAROUND(__IBMCPP__, NDNBOOST_TESTED_AT(600)) |
| 179 | ) const throw() |
| 180 | #else |
| 181 | ) const |
| 182 | #endif |
| 183 | { |
| 184 | // RG - this code is confusing! I need a better way to express it. |
| 185 | // and test cases. |
| 186 | |
| 187 | // Invariants: |
| 188 | // 1) last_octet_count has the size of the last measured character |
| 189 | // 2) char_count holds the number of characters shown to fit |
| 190 | // within the bounds so far (no greater than max_limit) |
| 191 | // 3) from_next points to the octet 'last_octet_count' before the |
| 192 | // last measured character. |
| 193 | int last_octet_count=0; |
| 194 | std::size_t char_count = 0; |
| 195 | const char* from_next = from; |
| 196 | // Use "<" because the buffer may represent incomplete characters |
| 197 | while (from_next+last_octet_count <= from_end && char_count <= max_limit) { |
| 198 | from_next += last_octet_count; |
| 199 | last_octet_count = (get_octet_count(*from_next)); |
| 200 | ++char_count; |
| 201 | } |
| 202 | return static_cast<int>(from_next-from_end); |
| 203 | } |
| 204 | |
| 205 | unsigned int utf8_codecvt_facet::get_octet_count( |
| 206 | unsigned char lead_octet |
| 207 | ){ |
| 208 | // if the 0-bit (MSB) is 0, then 1 character |
| 209 | if (lead_octet <= 0x7f) return 1; |
| 210 | |
| 211 | // Otherwise the count number of consecutive 1 bits starting at MSB |
| 212 | // assert(0xc0 <= lead_octet && lead_octet <= 0xfd); |
| 213 | |
| 214 | if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; |
| 215 | else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; |
| 216 | else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; |
| 217 | else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; |
| 218 | else return 6; |
| 219 | } |
| 220 | NDNBOOST_UTF8_END_NAMESPACE |
| 221 | |
namespace {

// Number of continuation octets needed to encode 'word' in UTF-8, selected
// by sizeof(wchar_t) through the template argument.
//
// Primary template: a word below 0x80 needs none, a word below 0x800 needs
// one, and everything else needs two.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    if (word >= 0x800) {
        return 2;
    }
    return (word >= 0x80) ? 1 : 0;
}

// Specialization for sizeof(wchar_t) == 4, where longer sequences may be
// needed when WCHAR_MAX allows values of 0x10000 and above.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word){
    if (word < 0x80) {
        return 0;
    }
    if (word < 0x800) {
        return 1;
    }

    // The wide-range comparisons below produce warnings on platforms where
    // wchar_t is UCS2. Those warnings are harmless, because this
    // specialization is never instantiated there, but they break builds that
    // treat warnings as errors, so the comparisons are compiled only when
    // WCHAR_MAX is large enough. Including
    // <ndnboost/detail/utf8_codecvt_facet.hpp> should be enough to get
    // WCHAR_MAX defined.
#if !defined(WCHAR_MAX)
#   error WCHAR_MAX not defined!
#endif
    // VC++ 7.1 and earlier have an invalid WCHAR_MAX, so it is not consulted.
#if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
    return 2;
#elif WCHAR_MAX > 0x10000

    return (word < 0x10000)   ? 2
         : (word < 0x200000)  ? 3
         : (word < 0x4000000) ? 4
         : 5;

#else
    return 2;
#endif
}

} // namespace anonymous
| 274 | |
| 275 | NDNBOOST_UTF8_BEGIN_NAMESPACE |
| 276 | // How many "continuing octets" will be needed for this word |
| 277 | // == total octets - 1. |
| 278 | int utf8_codecvt_facet::get_cont_octet_out_count( |
| 279 | wchar_t word |
| 280 | ) const { |
| 281 | return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); |
| 282 | } |
| 283 | NDNBOOST_UTF8_END_NAMESPACE |
| 284 | |
| 285 | #endif |