牛骨文教育服务平台(让学习变得简单)
博文笔记

C++ 实现unicode到utf-8的转码

创建时间:2016-09-13 投稿人: 浏览次数:319

思路:

获取字符串中的Unicode转义部分(形如 \uXXXX),将该部分转换为utf-8格式的字符,最后将字符串中的所有Unicode转义替换为对应的utf-8字符即可。


废话不多说,直接上代码:

头文件:

/*
 * charsetEncode.h
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */


#ifndef COMMONSERVER_INCLUDE_CHARSETENCODE_H_
#define COMMONSERVER_INCLUDE_CHARSETENCODE_H_


#include <iostream>
#include <algorithm>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>


using namespace std;


// Helper class that rewrites backslash-u escape sequences (backslash, 'u',
// four hex digits) found inside a string into their UTF-8 byte sequences.
class CcharsetEncode
{
public:
   // Converts the escape sequences inside source in place.
   // The definition in this file always returns 0.
   int unicode_to_utf8(string &source);// escape sequences -> UTF-8


   // Replaces the first occurrence of strSrc inside strContent with strDest.
   // The match is done with std::string::find, so it is case-sensitive and
   // only one occurrence is replaced per call.
   void ReplaceStr(string &strContent, const char *strSrc, const char *strDest);


private:
    // Encodes the single code point unic into pOutput and returns the number
    // of bytes written (1 to 6), or 0 when unic is above 0x7FFFFFFF.
    // The definition asserts pOutput != NULL and outSize >= 6.
    int enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize);
    // Checks whether src is one 6-character escape (2-character prefix + 4 hex digits).
    int isUnicode(const string &src); // returns 1 for yes, 0 for no
    // Accumulates the hex digits of str into a number; other characters are skipped.
    unsigned int xstrtoshortint(const char *str); // e.g. "0x1a3f" -> 0x1a3f


};

#endif /* COMMONSERVER_INCLUDE_CHARSETENCODE_H_ */




源文件:

/*
 * charsetEncode.cpp
 *
 *  Created on: Jul 25, 2016
 *      Author: root
 */


#include "charsetEncode.h"


/*
 * Rewrites every backslash-u escape (backslash, 'u', four hex digits) found
 * in source with the UTF-8 encoding of the code point it names. The string is
 * edited in place; text that is not an escape is left untouched.
 *
 * A high surrogate escape (D800-DBFF) immediately followed by a low surrogate
 * escape (DC00-DFFF) is combined into one supplementary code point and written
 * as a single 4-byte sequence. A lone surrogate is encoded on its own.
 * An escape with value 0000 produces one NUL byte inside the string.
 *
 * Always returns 0.
 */
int CcharsetEncode::unicode_to_utf8(string &source)
{
    const string::size_type kEscapeLen = 6; // backslash + 'u' + 4 hex digits
    unsigned char pout[8];

    string::size_type index = 0;
    // "<=" so that an escape occupying the last six characters is converted too.
    while (index + kEscapeLen <= source.size())
    {
        if (isUnicode(source.substr(index, kEscapeLen)) != 1)
        {
            ++index;
            continue;
        }

        unsigned long codePoint = xstrtoshortint(source.substr(index + 2, 4).c_str());
        string::size_type consumed = kEscapeLen;

        // UTF-16 surrogate pair: merge both escapes into one code point.
        if (codePoint >= 0xD800 && codePoint <= 0xDBFF
            && index + 2 * kEscapeLen <= source.size()
            && isUnicode(source.substr(index + kEscapeLen, kEscapeLen)) == 1)
        {
            const unsigned long low =
                xstrtoshortint(source.substr(index + kEscapeLen + 2, 4).c_str());
            if (low >= 0xDC00 && low <= 0xDFFF)
            {
                codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
                consumed = 2 * kEscapeLen;
            }
        }

        memset(pout, 0, sizeof(pout));
        const int num = enc_unicode_to_utf8_one(codePoint, pout, static_cast<int>(sizeof(pout)));
        if (num <= 0)
        {
            // Nothing was encoded; leave the text as it is and move on.
            ++index;
            continue;
        }

        // Replace exactly at the position that was matched, with an explicit
        // byte count so that a NUL byte is not treated as a terminator.
        source.replace(index, consumed, reinterpret_cast<const char *>(pout),
                       static_cast<string::size_type>(num));

        // Step over the bytes just written, however many there were.
        index += static_cast<string::size_type>(num);
    }
    return 0;
}


/*
 * Encodes one code point into pOutput and returns the number of bytes written.
 *
 *   U-00000000 - U-0000007F:  0xxxxxxx                                      (1)
 *   U-00000080 - U-000007FF:  110xxxxx 10xxxxxx                             (2)
 *   U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx                    (3)
 *   U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           (4)
 *   U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx (x4)                        (5)
 *   U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx (x5)                        (6)
 *
 * Values above 0x7FFFFFFF write nothing and return 0. No terminator is added.
 * pOutput must be non-NULL and outSize must be at least 6 (both asserted).
 */
int CcharsetEncode::enc_unicode_to_utf8_one(unsigned long unic, unsigned char *pOutput, int outSize)
{
    assert(pOutput != NULL);
    assert(outSize >= 6);

    // One-byte form: the value is stored as is.
    if (unic <= 0x7FUL)
    {
        pOutput[0] = static_cast<unsigned char>(unic & 0x7F);
        return 1;
    }

    // Pick the sequence length together with the lead byte's marker bits and
    // the mask for the payload bits that remain in the lead byte.
    int byteCount = 0;
    unsigned long leadMarker = 0;
    unsigned long leadMask = 0;
    if (unic <= 0x7FFUL)
    {
        byteCount = 2; leadMarker = 0xC0; leadMask = 0x1F;
    }
    else if (unic <= 0xFFFFUL)
    {
        byteCount = 3; leadMarker = 0xE0; leadMask = 0x0F;
    }
    else if (unic <= 0x1FFFFFUL)
    {
        byteCount = 4; leadMarker = 0xF0; leadMask = 0x07;
    }
    else if (unic <= 0x3FFFFFFUL)
    {
        byteCount = 5; leadMarker = 0xF8; leadMask = 0x03;
    }
    else if (unic <= 0x7FFFFFFFUL)
    {
        byteCount = 6; leadMarker = 0xFC; leadMask = 0x01;
    }
    else
    {
        return 0;
    }

    // Continuation bytes are filled from the last one backwards, each taking
    // the low six bits of what is left.
    unsigned long remaining = unic;
    for (int pos = byteCount - 1; pos > 0; --pos)
    {
        pOutput[pos] = static_cast<unsigned char>((remaining & 0x3F) | 0x80);
        remaining >>= 6;
    }
    pOutput[0] = static_cast<unsigned char>((remaining & leadMask) | leadMarker);
    return byteCount;
}


int CcharsetEncode::isUnicode(const string &src)
{
if(src.size() != 6)
return 0;
if(src.find("\u", 0) == 0)
{
for(int i = 2; i <= 5; i++)
{
if(!((src[i] >= "a" && src[i] <= "f")
|| (src[i] >= "A" && src[i] <= "F")
|| (src[i] >= "0" && src[i] <= "9")))
{
return 0;
}
}
return 1;
}
else
{
return 0;
}
}


/*
 * Parses the hexadecimal digits of str into an unsigned value.
 *
 * Characters that are not hex digits are skipped rather than treated as an
 * error, so "0x1a3f" yields 0x1a3f (the 'x' is ignored and the leading '0'
 * contributes nothing). A NULL pointer yields 0.
 */
unsigned int CcharsetEncode::xstrtoshortint(const char *str)
{
    if (str == NULL)
        return 0;

    unsigned int ivalue = 0;
    for (const char *p = str; *p != '\0'; ++p)
    {
        const char c = *p;
        if (c >= '0' && c <= '9')
        {
            ivalue = ivalue * 16 + (c - '0'); // base 16; the multiplier is the radix
        }
        else if (c >= 'a' && c <= 'f')
        {
            ivalue = ivalue * 16 + (c - 'a') + 10;
        }
        else if (c >= 'A' && c <= 'F')
        {
            ivalue = ivalue * 16 + (c - 'A') + 10;
        }
    }
    return ivalue;
}


/*
 * Replaces the first occurrence of strSrc inside strContent with strDest.
 *
 * The search is case-sensitive and at most one occurrence is replaced per
 * call. strContent is left unchanged when strSrc does not occur in it, or
 * when either pointer is NULL.
 */
void CcharsetEncode::ReplaceStr(string &strContent, const char *strSrc, const char *strDest)
{
    if (strSrc == NULL || strDest == NULL)
        return;

    // Search the string directly; no copy of the content is needed for this.
    const string::size_type pos = strContent.find(strSrc);
    if (pos != string::npos)
    {
        strContent.replace(pos, strlen(strSrc), strDest);
    }
}




主函数测试:
// Demo: converts a string made of six escape sequences and prints the result.
int main()
{
    CcharsetEncode encode;
    // Each backslash is doubled so the string really contains the characters
    // backslash + 'u' + hex digits; without them there is nothing to convert.
    string src = "\\u300a\\u58eb\\u5175\\u7a81\\u51fb\\u300b";
    encode.unicode_to_utf8(src);
    cout<<" unicode: "<<src<<endl;
    return 0;
}






声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。