C++ 实现unicode到utf-8的转码

创建时间：2016-09-13 投稿人：浏览次数：323

思路：

获取字符串中的Unicode转义部分（形如 \uXXXX），然后将该部分转换为utf-8格式的字符，最后将字符串里面的所有Unicode转义替换为对应的utf-8字符即可。

废话不多说，直接上代码：

头文件：

/*
 * charsetEncode.h
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */


#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_


#include <iostream>
#include <algorithm>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>


using namespace std;


/**
 * Converts textual "\uXXXX" escapes embedded in a string into the UTF-8
 * bytes of the code points they name.
 */
class CcharsetEncode
{
public:
   // Rewrites `source` in place, replacing the "\uXXXX" escapes it finds with
   // their UTF-8 encoding. The implementation always returns 0.
   int unicode_to_utf8(string &source);


   // Replaces the first occurrence of strSrc inside strContent with strDest.
   // NOTE(review): the previous comment described this as a case-insensitive
   // replacement, but the implementation searches with std::string::find,
   // which is case-sensitive, and it replaces one occurrence only.
   void ReplaceStr(string &strContent, const char *strSrc, const char *strDest);


private:
    // Encodes the single code point `unic` as UTF-8 into pOutput and returns
    // the number of bytes written (1..6), or 0 when unic is above 0x7FFFFFFF.
    // The implementation asserts pOutput != NULL and outSize >= 6.
    int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);
    // Reports whether src is one 6-character escape: the "\u" prefix followed
    // by four hex digits. Returns 1 for yes and 0 for no (the previous comment
    // said 2 for "no", which the implementation never returns).
    int isUnicode(const string &src);
    // Parses the hexadecimal digits in str into an integer, ignoring any
    // non-hex characters, e.g. "0x1a3f" -> 0x1a3f.
    unsigned int xstrtoshortint(const char *str);


};

#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */

源文件：

/*
 * charsetEncode.cpp
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */


#include "charsetEncode.h"


/**
 * Replaces every "\uXXXX" escape (backslash, 'u', four hex digits) found in
 * `source` with the UTF-8 encoding of the code point it names. The string is
 * modified in place; text that is not an escape is left untouched.
 *
 * A high-surrogate escape (D800-DBFF) immediately followed by a low-surrogate
 * escape (DC00-DFFF) is combined into one supplementary code point and emitted
 * as a single 4-byte sequence. A surrogate that is not part of such a pair is
 * encoded on its own as a 3-byte sequence.
 *
 * "\u0000" is replaced by a single NUL byte stored inside the std::string.
 *
 * @param source  text to convert; rewritten in place.
 * @return always 0.
 */
int CcharsetEncode::unicode_to_utf8(string &source)
{
    const string::size_type kEscapeLen = 6; // length of "\uXXXX"
    unsigned char utf8[8];
    string::size_type index = 0;

    // "index + kEscapeLen <= size" lets an escape that ends the string be
    // examined too; the size is re-read every pass because replacements
    // shrink the string.
    while (index + kEscapeLen <= source.size())
    {
        if (isUnicode(source.substr(index, kEscapeLen)) != 1)
        {
            ++index;
            continue;
        }

        unsigned long codePoint = xstrtoshortint(source.substr(index + 2, 4).c_str());
        string::size_type consumed = kEscapeLen;

        // UTF-16 surrogate pair: merge both escapes into one code point so the
        // result is one valid 4-byte UTF-8 sequence.
        if (codePoint >= 0xD800UL && codePoint <= 0xDBFFUL
            && index + 2 * kEscapeLen <= source.size()
            && isUnicode(source.substr(index + kEscapeLen, kEscapeLen)) == 1)
        {
            const unsigned long low =
                xstrtoshortint(source.substr(index + kEscapeLen + 2, 4).c_str());
            if (low >= 0xDC00UL && low <= 0xDFFFUL)
            {
                codePoint = 0x10000UL + ((codePoint - 0xD800UL) << 10) + (low - 0xDC00UL);
                consumed = 2 * kEscapeLen;
            }
        }

        memset(utf8, 0, sizeof(utf8));
        const int num = enc_unicode_to_utf8_one(codePoint, utf8, static_cast<int>(sizeof(utf8)));
        if (num <= 0)
        {
            // Nothing was encoded; leave the text as it is and move on.
            ++index;
            continue;
        }

        // Replace exactly the escape at `index` using the real byte count,
        // rather than searching the whole string for the first match.
        source.replace(index, consumed,
                       reinterpret_cast<const char *>(utf8),
                       static_cast<string::size_type>(num));

        // Step over the bytes just written; their length depends on the code
        // point (1 to 4 bytes), so a fixed step would skip following text.
        index += static_cast<string::size_type>(num);
    }
    return 0;
}


/**
 * Encodes one code point as a UTF-8 byte sequence (original 1..6-byte scheme).
 *
 *   U-00000000 - U-0000007F:  0xxxxxxx
 *   U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
 *   U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
 *   U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *   U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * @param unic     code point to encode.
 * @param pOutput  destination buffer; must not be NULL (asserted). No
 *                 terminating NUL is written.
 * @param outSize  capacity of pOutput in bytes; must be at least 6 (asserted).
 * @return number of bytes written (1..6), or 0 if unic is above 0x7FFFFFFF.
 */
int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize)
{
    assert(pOutput != NULL);
    assert(outSize >= 6);

    // Single-byte (ASCII) form has no lead marker and no continuation bytes.
    if (unic <= 0x7FUL)
    {
        pOutput[0] = static_cast<unsigned char>(unic & 0x7F);
        return 1;
    }

    // Choose the sequence length together with the lead byte's marker bits
    // and the mask for the payload bits the lead byte carries.
    int length = 0;
    unsigned long leadMarker = 0;
    unsigned long leadMask = 0;
    if (unic <= 0x7FFUL)
    {
        length = 2; leadMarker = 0xC0; leadMask = 0x1F;
    }
    else if (unic <= 0xFFFFUL)
    {
        length = 3; leadMarker = 0xE0; leadMask = 0x0F;
    }
    else if (unic <= 0x1FFFFFUL)
    {
        length = 4; leadMarker = 0xF0; leadMask = 0x07;
    }
    else if (unic <= 0x3FFFFFFUL)
    {
        length = 5; leadMarker = 0xF8; leadMask = 0x03;
    }
    else if (unic <= 0x7FFFFFFFUL)
    {
        length = 6; leadMarker = 0xFC; leadMask = 0x01;
    }
    else
    {
        return 0;
    }

    // Fill continuation bytes from the last one backwards, six payload bits
    // at a time; what is left afterwards belongs to the lead byte.
    unsigned long remaining = unic;
    for (int pos = length - 1; pos > 0; --pos)
    {
        pOutput[pos] = static_cast<unsigned char>((remaining & 0x3F) | 0x80);
        remaining >>= 6;
    }
    pOutput[0] = static_cast<unsigned char>((remaining & leadMask) | leadMarker);
    return length;
}


/**
 * Tests whether `src` is exactly one textual Unicode escape: a backslash,
 * the letter 'u', and four hexadecimal digits (for example the six
 * characters \u5e3f).
 *
 * @param src  candidate text.
 * @return 1 if src is such an escape, 0 otherwise.
 */
int CcharsetEncode::isUnicode(const string &src)
{
    if (src.size() != 6)
        return 0;

    // The prefix is a literal backslash followed by 'u', so the backslash has
    // to be escaped in the C++ literal; a bare "\u" is an incomplete
    // universal-character-name and does not compile.
    if (src.compare(0, 2, "\\u") != 0)
        return 0;

    for (int i = 2; i <= 5; i++)
    {
        // Character literals, not string literals: a char must be compared
        // with a char.
        const char c = src[i];
        const bool isHexDigit = (c >= '0' && c <= '9')
                             || (c >= 'a' && c <= 'f')
                             || (c >= 'A' && c <= 'F');
        if (!isHexDigit)
            return 0;
    }
    return 1;
}


/**
 * Parses the hexadecimal digits of a NUL-terminated string into an integer,
 * e.g. "1a3f" -> 0x1a3f. Upper- and lower-case digits are accepted; any other
 * character is skipped without contributing to the value.
 *
 * @param str  NUL-terminated text; a NULL pointer yields 0.
 * @return the accumulated value (wraps modulo the range of unsigned int if
 *         the text holds more digits than fit).
 */
unsigned int CcharsetEncode::xstrtoshortint(const char *str)
{
    unsigned int ivalue = 0;
    if (str == NULL)
        return ivalue;

    for (; *str != '\0'; ++str)
    {
        // Character literals are required here: the digit value is the
        // distance between two chars, not between a char and a pointer.
        const char c = *str;
        if (c >= '0' && c <= '9')
        {
            ivalue = ivalue * 16 + (c - '0'); // base 16; change for other bases
        }
        else if (c >= 'a' && c <= 'f')
        {
            ivalue = ivalue * 16 + (c - 'a') + 10;
        }
        else if (c >= 'A' && c <= 'F')
        {
            ivalue = ivalue * 16 + (c - 'A') + 10;
        }
    }
    return ivalue;
}


/**
 * Replaces the first occurrence of strSrc in strContent with strDest.
 * The search is an exact (case-sensitive) match; when strSrc does not occur,
 * strContent is left unchanged.
 *
 * @param strContent  text to modify in place.
 * @param strSrc      NUL-terminated text to look for.
 * @param strDest     NUL-terminated replacement text.
 */
void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest)
{
    const string needle(strSrc);

    const string::size_type hit = strContent.find(needle);
    if (hit == string::npos)
        return;

    strContent.replace(hit, needle.size(), strDest);
}

主函数测试：

// Demo: converts a string of \uXXXX escapes to UTF-8 and prints the result.
int main()
{
    CcharsetEncode encode;
    // Each escape must reach the converter as the six characters \uXXXX, so
    // the backslash is doubled in the C++ literal. Without the backslashes
    // the string contains no escapes and nothing is converted.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";
    encode.unicode_to_utf8(src);
    cout<<" unicode: "<<src<<endl;
    return 0;
}

声明：该文观点仅代表作者本人，牛骨文系教育信息发布平台，牛骨文仅提供信息存储空间服务。

上一篇： c++ ANSI、UNICODE、UTF8互转
下一篇： [C/C++]_[utf8和unicode的相互转换]

热门文章: CTF writeup 2_南邮网络攻防训...; SSM框架——详细整合教程（...; Linux Shell脚本编程－－curl命...; HttpClient使用详解; Java面试题全集（上）; JAVA设计模式之单例模式; java.lang.OutOfMemoryError: PermGen ...; TCP协议中的三次握手和四次...; form表单的两种提交方式，su...; String,StringBuffer与StringBuilder...

最新文章: Java之品优购课程讲义_day20（7）; 剑指 Offer - 8：跳台阶; Netty权威指南_札记02_NIO编程; mysql时间属性之时间戳和datetime之...; 虚拟现实或许可以拯救古埃及的“...; spring cloud服务注册中心eureka---集群...; Java SE 第六章; HTTP请求+数据库; HIDL学习笔记之HIDL C++（第二天）; ubuntu系统下指定tomcat运行时为JDK1.8...