C++ 实现unicode到utf-8的转码
思路:
获取字符串中的Unicode部分,然后将该部分转换为utf-8格式的字符,最后将字符串里面的所有Unicode替换为utf-8即可。
废话不多说,直接上代码:
头文件:
/*
* charsetEncode.h
*
* Created on: Jul 25, 2016
* Author: root
*/
#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#include <iostream>
#include <algorithm>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
using namespace std;
// Converts textual six-character Unicode escapes (a two-character prefix
// followed by four hexadecimal digits) embedded in a string into UTF-8 bytes.
class CcharsetEncode
{
public:
// Rewrites `source` in place, replacing each recognised escape sequence with
// its UTF-8 encoding. The implementation always returns 0.
int unicode_to_utf8(string &source);//unicode to utf-8
// Replaces the first occurrence of strSrc inside strContent with strDest.
// NOTE(review): the previous comment described this as a case-insensitive
// replacement, but the implementation searches with string::find, which is
// case-sensitive.
void ReplaceStr(string &strContent, const char *strSrc, const char *strDest);
private:
// Encodes the single code point `unic` as UTF-8 into pOutput and returns the
// number of bytes written (1 to 6), or 0 when unic is above 0x7FFFFFFF.
// The implementation asserts pOutput != NULL and outSize >= 6.
int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);
// Reports whether src is one escape sequence of exactly 6 characters:
// returns 1 for yes and 0 for no.
int isUnicode(const string &src); // 6 characters in total; 1 = yes, 0 = no
// Parses the hexadecimal digits found in str; characters that are not
// hexadecimal digits are skipped.
unsigned int xstrtoshortint(const char *str); //"0x1a3f"->1a3f
};
#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */
源文件:
/*
* charsetEncode.cpp
*
* Created on: Jul 25, 2016
* Author: root
*/
#include "charsetEncode.h"
/**
 * Replaces every textual `\uXXXX` escape in `source` with the UTF-8 encoding
 * of the code point it names. All other characters are copied unchanged.
 *
 * Differences from the previous in-place implementation:
 *  - an escape that ends exactly at the end of the string is now converted
 *    (the old loop bound `index < size - 6` never looked at it);
 *  - the scan position advances by what was actually consumed, so escapes
 *    following a 1- or 2-byte result (e.g. `\u0041\u0042`) are not skipped;
 *  - the escape at the current position is the one replaced, not the first
 *    textual match anywhere in the string;
 *  - a high surrogate escape immediately followed by a low surrogate escape
 *    is combined into one supplementary code point (4-byte UTF-8) instead of
 *    two 3-byte sequences, which would not be valid UTF-8;
 *  - the exact number of encoded bytes is appended, so `\u0000` becomes a
 *    single NUL byte rather than disappearing;
 *  - the result is built in one pass instead of repeatedly editing `source`.
 *
 * @param source  text to convert; overwritten with the converted text.
 * @return always 0.
 */
int CcharsetEncode::unicode_to_utf8(string &source)
{
    const string::size_type kEscapeLen = 6; // strlen("\uXXXX")

    string converted;
    converted.reserve(source.size());

    string::size_type pos = 0;
    while (pos < source.size())
    {
        // substr clamps at the end of the string; isUnicode rejects anything
        // that is not exactly kEscapeLen characters long.
        const string candidate = source.substr(pos, kEscapeLen);
        if (isUnicode(candidate) != 1)
        {
            converted += source[pos];
            ++pos;
            continue;
        }

        unsigned long codePoint = xstrtoshortint(candidate.c_str() + 2);
        string::size_type consumed = kEscapeLen;

        // UTF-16 surrogate pair: merge "\uD8xx\uDCxx" into one code point.
        // A lone surrogate falls through and is encoded as before.
        if (codePoint >= 0xD800UL && codePoint <= 0xDBFFUL)
        {
            const string following = source.substr(pos + kEscapeLen, kEscapeLen);
            if (isUnicode(following) == 1)
            {
                const unsigned long low = xstrtoshortint(following.c_str() + 2);
                if (low >= 0xDC00UL && low <= 0xDFFFUL)
                {
                    codePoint = 0x10000UL + ((codePoint - 0xD800UL) << 10) + (low - 0xDC00UL);
                    consumed += kEscapeLen;
                }
            }
        }

        unsigned char encoded[8];
        memset(encoded, 0, sizeof(encoded));
        const int byteCount =
            enc_unicode_to_utf8_one(codePoint, encoded, static_cast<int>(sizeof(encoded)));
        if (byteCount > 0)
        {
            converted.append(reinterpret_cast<const char *>(encoded),
                             static_cast<string::size_type>(byteCount));
        }
        pos += consumed;
    }

    source.swap(converted);
    return 0;
}
/**
 * Encodes one code point as a UTF-8 style byte sequence.
 *
 * Layouts produced (x = payload bit):
 *   U-00000000 - U-0000007F: 0xxxxxxx
 *   U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
 *   U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 *   U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * @param unic     code point to encode.
 * @param pOutput  destination buffer; asserted to be non-NULL.
 * @param outSize  capacity of pOutput; asserted to be at least 6.
 * @return number of bytes written (1..6), or 0 when unic exceeds 0x7FFFFFFF
 *         (nothing is written in that case).
 */
int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize)
{
    assert(pOutput != NULL);
    assert(outSize >= 6);

    // Single-byte form needs no marker bits.
    if (unic <= 0x7FUL)
    {
        pOutput[0] = static_cast<unsigned char>(unic & 0x7F);
        return 1;
    }

    // Pick the sequence length and the marker bits of the lead byte.
    int length = 0;
    unsigned long leadMarker = 0;
    if (unic <= 0x7FFUL)
    {
        length = 2;
        leadMarker = 0xC0;
    }
    else if (unic <= 0xFFFFUL)
    {
        length = 3;
        leadMarker = 0xE0;
    }
    else if (unic <= 0x1FFFFFUL)
    {
        length = 4;
        leadMarker = 0xF0;
    }
    else if (unic <= 0x3FFFFFFUL)
    {
        length = 5;
        leadMarker = 0xF8;
    }
    else if (unic <= 0x7FFFFFFFUL)
    {
        length = 6;
        leadMarker = 0xFC;
    }
    else
    {
        return 0;
    }

    // Fill continuation bytes from the last one backwards, six bits at a time.
    unsigned long remaining = unic;
    for (int slot = length - 1; slot > 0; --slot)
    {
        pOutput[slot] = static_cast<unsigned char>((remaining & 0x3F) | 0x80);
        remaining >>= 6;
    }

    // The range checks above guarantee the leftover bits fit the lead byte's
    // payload, so no extra mask is needed.
    pOutput[0] = static_cast<unsigned char>(remaining | leadMarker);
    return length;
}
int CcharsetEncode::isUnicode(const string &src)
{
if(src.size() != 6)
return 0;
if(src.find("\u", 0) == 0)
{
for(int i = 2; i <= 5; i++)
{
if(!((src[i] >= "a" && src[i] <= "f")
|| (src[i] >= "A" && src[i] <= "F")
|| (src[i] >= "0" && src[i] <= "9")))
{
return 0;
}
}
return 1;
}
else
{
return 0;
}
}
/**
 * Parses the hexadecimal digits contained in a C string.
 *
 * Characters that are not hexadecimal digits are skipped rather than treated
 * as errors, so "0x1a3f" yields 0x1a3f (the 'x' is ignored). Digits are
 * compared against character literals; the previous text compared and
 * subtracted string literals, which does not compile.
 *
 * @param str  NUL-terminated text; NULL is treated as an empty string.
 * @return the accumulated value (wraps modulo 2^N for over-long input, as
 *         unsigned arithmetic does).
 */
unsigned int CcharsetEncode::xstrtoshortint(const char *str)
{
    unsigned int value = 0;
    if (str == NULL)
        return value;

    for (const char *p = str; *p != '\0'; ++p)
    {
        const char c = *p;
        if (c >= '0' && c <= '9')
        {
            value = value * 16 + static_cast<unsigned int>(c - '0'); // base 16
        }
        else if (c >= 'a' && c <= 'f')
        {
            value = value * 16 + static_cast<unsigned int>(c - 'a') + 10;
        }
        else if (c >= 'A' && c <= 'F')
        {
            value = value * 16 + static_cast<unsigned int>(c - 'A') + 10;
        }
        // anything else is deliberately ignored
    }
    return value;
}
/**
 * Replaces the first occurrence of `strSrc` in `strContent` with `strDest`.
 *
 * The search is case-sensitive and only the first match is replaced; when
 * there is no match the content is left untouched. The search now runs
 * directly on `strContent` instead of on a full copy of it.
 *
 * @param strContent  text to modify in place.
 * @param strSrc      NUL-terminated text to look for; NULL makes the call a no-op.
 * @param strDest     NUL-terminated replacement; NULL makes the call a no-op.
 */
void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest)
{
    if (strSrc == NULL || strDest == NULL)
        return;

    const string::size_type srcLen = strlen(strSrc);
    const string::size_type pos = strContent.find(strSrc, 0, srcLen);
    if (pos != string::npos)
    {
        strContent.replace(pos, srcLen, strDest);
    }
}
主函数测试:
// Demo: converts a string made of \uXXXX escapes and prints the UTF-8 result.
int main()
{
    CcharsetEncode encode;
    // The backslashes are doubled so that the string holds the literal text
    // \u300a\u58eb... (U+300A U+58EB U+5175 U+7A81 U+51FB U+300B) for the
    // converter to decode. Without them the input contains no escapes and
    // nothing is converted.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";
    encode.unicode_to_utf8(src);
    cout<<" unicode: "<<src<<endl;
    return 0;
}
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
