java使用unicode过滤emoji表情
最近搞个微信公众号的项目,如果微信用户名有表情符号的,那么在我们的系统里面就会保存不了。最初的解决方案是把数据库的utf8格式改为utf8mb4。这样就能兼容保存emoji表情符号,后面又发现,保存的表情符号有的能正常显示有的不可以,后来经过几次验证才发现,用IOS自带输入法输入的表情,跟安卓输入法输入的表情编码不一样,导致有些符号不能显示,而且有些IOS有的符号安卓机子还没有。如果要统一就必须要进行转码,这样工作量比较大,不能在短时间内解决,最好的方案就是过滤,把emoji表情符号替换成□。于是就去查了unicode码表,做了一个过滤的工具类,使用正则表达式过滤表情字符
public final static String unicodeReg= "["+
"u4E00-u9FBF"+//:CJK 统一表意符号 (CJK Unified Ideographs)
"u4DC0-u4DFF"+//:易经六十四卦符号 (Yijing Hexagrams Symbols)
"u0000-u007F"+//:C0控制符及基本拉丁文 (C0 Control and Basic Latin)
"u0080-u00FF"+//:C1控制符及拉丁:补充-1 (C1 Control and Latin 1 Supplement)
"u0100-u017F"+//:拉丁文扩展-A (Latin Extended-A)
"u0180-u024F"+//:拉丁文扩展-B (Latin Extended-B)
"u0250-u02AF"+//:国际音标扩展 (IPA Extensions)
"u02B0-u02FF"+//:空白修饰字母 (Spacing Modifiers)
"u0300-u036F"+//:结合用读音符号 (Combining Diacritics Marks)
"u0370-u03FF"+//:希腊文及科普特文 (Greek and Coptic)
"u0400-u04FF"+//:西里尔字母 (Cyrillic)
"u0500-u052F"+//:西里尔字母补充 (Cyrillic Supplement)
"u0530-u058F"+//:亚美尼亚语 (Armenian)
"u0590-u05FF"+//:希伯来文 (Hebrew)
"u0600-u06FF"+//:阿拉伯文 (Arabic)
"u0700-u074F"+//:叙利亚文 (Syriac)
"u0750-u077F"+//:阿拉伯文补充 (Arabic Supplement)
"u0780-u07BF"+//:马尔代夫语 (Thaana)
//"u07C0-u077F"+//:西非书面语言 (N"Ko)
"u0800-u085F"+//:阿维斯塔语及巴列维语 (Avestan and Pahlavi)
"u0860-u087F"+//:Mandaic
"u0880-u08AF"+//:撒马利亚语 (Samaritan)
"u0900-u097F"+//:天城文书 (Devanagari)
"u0980-u09FF"+//:孟加拉语 (Bengali)
"u0A00-u0A7F"+//:锡克教文 (Gurmukhi)
"u0A80-u0AFF"+//:古吉拉特文 (Gujarati)
"u0B00-u0B7F"+//:奥里亚文 (Oriya)
"u0B80-u0BFF"+//:泰米尔文 (Tamil)
"u0C00-u0C7F"+//:泰卢固文 (Telugu)
"u0C80-u0CFF"+//:卡纳达文 (Kannada)
"u0D00-u0D7F"+//:德拉维族语 (Malayalam)
"u0D80-u0DFF"+//:僧伽罗语 (Sinhala)
"u0E00-u0E7F"+//:泰文 (Thai)
"u0E80-u0EFF"+//:老挝文 (Lao)
"u0F00-u0FFF"+//:藏文 (Tibetan)
"u1000-u109F"+//:缅甸语 (Myanmar)
"u10A0-u10FF"+//:格鲁吉亚语 (Georgian)
"u1100-u11FF"+//:朝鲜文 (Hangul Jamo)
"u1200-u137F"+//:埃塞俄比亚语 (Ethiopic)
"u1380-u139F"+//:埃塞俄比亚语补充 (Ethiopic Supplement)
"u13A0-u13FF"+//:切罗基语 (Cherokee)
"u1400-u167F"+//:统一加拿大土著语音节 (Unified Canadian Aboriginal Syllabics)
"u1680-u169F"+//:欧甘字母 (Ogham)
"u16A0-u16FF"+//:如尼文 (Runic)
"u1700-u171F"+//:塔加拉语 (Tagalog)
"u1720-u173F"+//:Hanunóo
"u1740-u175F"+//:Buhid
"u1760-u177F"+//:Tagbanwa
"u1780-u17FF"+//:高棉语 (Khmer)
"u1800-u18AF"+//:蒙古文 (Mongolian)
"u18B0-u18FF"+//:Cham
"u1900-u194F"+//:Limbu
"u1950-u197F"+//:德宏泰语 (Tai Le)
"u1980-u19DF"+//:新傣仂语 (New Tai Lue)
"u19E0-u19FF"+//:高棉语记号 (Kmer Symbols)
"u1A00-u1A1F"+//:Buginese
"u1A20-u1A5F"+//:Batak
"u1A80-u1AEF"+//:Lanna
"u1B00-u1B7F"+//:巴厘语 (Balinese)
"u1B80-u1BB0"+//:巽他语 (Sundanese)
"u1BC0-u1BFF"+//:Pahawh Hmong
"u1C00-u1C4F"+//:雷布查语(Lepcha)
"u1C50-u1C7F"+//:Ol Chiki
"u1C80-u1CDF"+//:曼尼普尔语 (Meithei/Manipuri)
"u1D00-u1D7F"+//:语音学扩展 (Phone tic Extensions)
"u1D80-u1DBF"+//:语音学扩展补充 (Phonetic Extensions Supplement)
"u1DC0-u1DFF"+//结合用读音符号补充 (Combining Diacritics Marks Supplement)
"u1E00-u1EFF"+//:拉丁文扩充附加 (Latin Extended Additional)
"u1F00-u1FFF"+//:希腊语扩充 (Greek Extended)
"u2000-u206F"+//:常用标点 (General Punctuation)
"u2070-u209F"+//:上标及下标 (Superscripts and Subscripts)
"u20A0-u20CF"+//:货币符号 (Currency Symbols)
"u20D0-u20FF"+//:组合用记号 (Combining Diacritics Marks for Symbols)
"u2100-u214F"+//:字母式符号 (Letterlike Symbols)
"u2150-u218F"+//:数字形式 (Number Form)
"u2190-u21FF"+//:箭头 (Arrows)
"u2200-u22FF"+//:数学运算符 (Mathematical Operator)
"u2300-u23FF"+//:杂项工业符号 (Miscellaneous Technical)
"u2400-u243F"+//:控制图片 (Control Pictures)
"u2440-u245F"+//:光学识别符 (Optical Character Recognition)
"u2460-u24FF"+//:封闭式字母数字 (Enclosed Alphanumerics)
"u2500-u257F"+//:制表符 (Box Drawing)
"u2580-u259F"+//:方块元素 (Block Element)
"u25A0-u25FF"+//:几何图形 (Geometric Shapes)
"u2600-u26FF"+//:杂项符号 (Miscellaneous Symbols)
"u2700-u27BF"+//:印刷符号 (Dingbats)
"u27C0-u27EF"+//:杂项数学符号-A (Miscellaneous Mathematical Symbols-A)
"u27F0-u27FF"+//:追加箭头-A (Supplemental Arrows-A)
"u2800-u28FF"+//:盲文点字模型 (Braille Patterns)
"u2900-u297F"+//:追加箭头-B (Supplemental Arrows-B)
"u2980-u29FF"+//:杂项数学符号-B (Miscellaneous Mathematical Symbols-B)
"u2A00-u2AFF"+//:追加数学运算符 (Supplemental Mathematical Operator)
"u2B00-u2BFF"+//:杂项符号和箭头 (Miscellaneous Symbols and Arrows)
"u2C00-u2C5F"+//:格拉哥里字母 (Glagolitic)
"u2C60-u2C7F"+//:拉丁文扩展-C (Latin Extended-C)
"u2C80-u2CFF"+//:古埃及语 (Coptic)
"u2D00-u2D2F"+//:格鲁吉亚语补充 (Georgian Supplement)
"u2D30-u2D7F"+//:提非纳文 (Tifinagh)
"u2D80-u2DDF"+//:埃塞俄比亚语扩展 (Ethiopic Extended)
"u2E00-u2E7F"+//:追加标点 (Supplemental Punctuation)
"u2E80-u2EFF"+//:CJK 部首补充 (CJK Radicals Supplement)
"u2F00-u2FDF"+//:康熙字典部首 (Kangxi Radicals)
"u2FF0-u2FFF"+//:表意文字描述符 (Ideographic Description Characters)
"u3000-u303F"+//:CJK 符号和标点 (CJK Symbols and Punctuation)
"u3040-u309F"+//:日文平假名 (Hiragana)
"u30A0-u30FF"+//:日文片假名 (Katakana)
"u3100-u312F"+//:注音字母 (Bopomofo)
"u3130-u318F"+//:朝鲜文兼容字母 (Hangul Compatibility Jamo)
"u3190-u319F"+//:象形字注释标志 (Kanbun)
"u31A0-u31BF"+//:注音字母扩展 (Bopomofo Extended)
"u31C0-u31EF"+//:CJK 笔画 (CJK Strokes)
"u31F0-u31FF"+//:日文片假名语音扩展 (Katakana Phonetic Extensions)
"u3200-u32FF"+//:封闭式 CJK 文字和月份 (Enclosed CJK Letters and Months)
"u3300-u33FF"+//:CJK 兼容 (CJK Compatibility)
"u3400-u4DBF"+//:CJK 统一表意符号扩展 A (CJK Unified Ideographs Extension A)
"u4DC0-u4DFF"+//:易经六十四卦符号 (Yijing Hexagrams Symbols)
"u4E00-u9FBF"+//:CJK 统一表意符号 (CJK Unified Ideographs)
"uA000-uA48F"+//:彝文音节 (Yi Syllables)
"uA490-uA4CF"+//:彝文字根 (Yi Radicals)
"uA500-uA61F"+//:Vai
"uA660-uA6FF"+//:统一加拿大土著语音节补充 (Unified Canadian Aboriginal Syllabics Supplement)
"uA700-uA71F"+//:声调修饰字母 (Modifier Tone Letters)
"uA720-uA7FF"+//:拉丁文扩展-D (Latin Extended-D)
"uA800-uA82F"+//:Syloti Nagri
"uA840-uA87F"+//:八思巴字 (Phags-pa)
"uA880-uA8DF"+//:Saurashtra
"uA900-uA97F"+//:爪哇语 (Javanese)
"uA980-uA9DF"+//:Chakma
"uAA00-uAA3F"+//:Varang Kshiti
"uAA40-uAA6F"+//:Sorang Sompeng
"uAA80-uAADF"+//:Newari
"uAB00-uAB5F"+//:越南傣语 (Vi?t Thái)
"uAB80-uABA0"+//:Kayah Li
"uAC00-uD7AF"+//:朝鲜文音节 (Hangul Syllables)
//"uD800-uDBFF"+//:High-half zone of UTF-16
//"uDC00-uDFFF"+//:Low-half zone of UTF-16
"uE000-uF8FF"+//:自行使用区域 (Private Use Zone)
"uF900-uFAFF"+//:CJK 兼容象形文字 (CJK Compatibility Ideographs)
"uFB00-uFB4F"+//:字母表达形式 (Alphabetic Presentation Form)
"uFB50-uFDFF"+//:阿拉伯表达形式A (Arabic Presentation Form-A)
"uFE00-uFE0F"+//:变量选择符 (Variation Selector)
"uFE10-uFE1F"+//:竖排形式 (Vertical Forms)
"uFE20-uFE2F"+//:组合用半符号 (Combining Half Marks)
"uFE30-uFE4F"+//:CJK 兼容形式 (CJK Compatibility Forms)
"uFE50-uFE6F"+//:小型变体形式 (Small Form Variants)
"uFE70-uFEFF"+//:阿拉伯表达形式B (Arabic Presentation Form-B)
"uFF00-uFFEF"+//:半型及全型形式 (Halfwidth and Fullwidth Form)
"uFFF0-uFFFF]";//:特殊 (Specials);
/**
* 将字符串转成unicode
* @param str 待转字符串
* @return unicode字符串
*/
public static String convert(String str)
{
str = (str == null ? "" : str);
String tmp;
StringBuffer sb = new StringBuffer(1000);
char c;
int i, j;
sb.setLength(0);
for (i = 0; i < str.length(); i++)
{
c = str.charAt(i);
sb.append("\u");
j = (c >>>8); //取出高8位
tmp = Integer.toHexString(j);
if (tmp.length() == 1)
{
sb.append("0");
}
sb.append(tmp);
j = (c & 0xFF); //取出低8位
tmp = Integer.toHexString(j);
if (tmp.length() == 1)
{
sb.append("0");
}
sb.append(tmp);
}
return (new String(sb).toUpperCase());
}
/**
* 2)unicode转成字符串,与上述过程反向操作即可
* 将unicode 字符串
* @param str 待转字符串
* @return 普通字符串
*/
public static String revert(String str)
{
str = (str == null ? "" : str);
if (str.indexOf("\u") == -1)//如果不是unicode码则原样返回
return str;
StringBuffer sb = new StringBuffer(1000);
for (int i = 0; i < str.length() - 6;)
{
String strTemp = str.substring(i, i + 6);
String value = strTemp.substring(2);
int c = 0;
for (int j = 0; j < value.length(); j++)
{
char tempChar = value.charAt(j);
int t = 0;
switch (tempChar)
{
case "a":
t = 10;
break;
case "b":
t = 11;
break;
case "c":
t = 12;
break;
case "d":
t = 13;
break;
case "e":
t = 14;
break;
case "f":
t = 15;
break;
default:
t = tempChar - 48;
break;
}
c += t * ((int) Math.pow(16, (value.length() - j - 1)));
}
sb.append((char) c);
i = i + 6;
}
return sb.toString();
}
public static String emojiChange(String string){
System.out.println("__________________________________");
try{
System.out.println("all-string:"+string);
System.out.println("all-unicode:"+convert(string));
Pattern pattern = Pattern.compile(unicodeReg);
StringBuffer sbBuffer=new StringBuffer();
for(int i=0;i<string.length();i++){
char c=string.charAt(i);
String temp=String.valueOf(c);
Matcher matcher = pattern.matcher(temp);
if(matcher.find()){
sbBuffer.append(temp);
}else{
sbBuffer.append("□");
}
System.out.println("temp:"+temp+";unicode:"+convert(temp));
}
System.out.println("sb:"+sbBuffer.toString());
System.out.println("--------------------------------------");
return sbBuffer.toString();
}catch(Exception e){
e.printStackTrace();
}
return "";
}这样就先解决了表情符号无法显示的问题,至于如果要真正能显示符号。。。感觉还是有很长的路要走,而且IOS每次更新,都会新上几个表情符号,如果微信更新不及时,微信里面的显示也是方块或者替换成对应的文字。
参考
IOS emoji编码
http://www.cocoachina.com/ios/20160527/16494.html
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
- 上一篇: js实现base64加密解密
- 下一篇: C#实现Base64加密解密
