Java正确判别出文件的字符集(尤其是带BOM和不带BOM的UTF-8字符)
前几天在项目中需要读取用户上传过来的txt文件,但不确定txt文件的字符集
UTF-16、UTF-8(带BOM)、Unicode可以根据文件开头的前三个字节(即BOM)来区分
public String getTxtEncode(FileInputStream in) throws IOException{ byte[] head = new byte[3]; in.read(head); String code = "GBK"; if (head[0] == -1 && head[1] == -2 ) code = "UTF-16"; if (head[0] == -2 && head[1] == -1 ) code = "Unicode"; //带BOM if(head[0]==-17 && head[1]==-69 && head[2] ==-65) code = "UTF-8"; if("Unicode".equals(code)){ code = "UTF-16"; } return code; }
通过在Google上搜索发现,Java的UTF-8解码器不能识别并跳过文件开头的BOM,这是Java遗留的一个bug,呵呵,终于找到根源了。针对此bug,有如下的解决方案
package com.justsy.sts.utf8;

import java.io.*;

/**
 * An input stream that recognizes a Unicode byte-order mark (BOM) at the start
 * of the wrapped stream and skips it, provided {@link #getEncoding()} is called
 * before any of the {@code read(...)} methods.
 *
 * <p>Usage pattern:
 * <pre>
 * String enc = "ISO-8859-1"; // or null to use the system default
 * FileInputStream fis = new FileInputStream(file);
 * UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 * enc = uin.getEncoding(); // check and skip possible BOM bytes
 * InputStreamReader in;
 * if (enc == null) in = new InputStreamReader(uin);
 * else in = new InputStreamReader(uin, enc);
 * </pre>
 */
public class UnicodeInputStream extends InputStream {
    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    /** Length of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    /**
     * Wraps {@code in} so that a leading BOM can be detected and skipped.
     *
     * @param in         the stream to read from
     * @param defaultEnc the encoding reported when no BOM is found; may be null
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /** Returns the encoding that was supplied to the constructor. */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding indicated by the BOM, or the default encoding when
     * the stream does not start with one. On the first call (if nothing has
     * been read yet) the BOM bytes are consumed from the stream.
     *
     * @throws IllegalStateException if reading the stream head fails; the
     *         underlying {@link IOException} is attached as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Chain the real I/O failure. Passing the new exception to its
                // own initCause() would throw IllegalArgumentException instead.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Only the BOM
     * bytes are skipped; any extra bytes are pushed back onto the stream.
     *
     * @throws IOException if reading from or pushing back to the stream fails
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // One read() may return fewer bytes than requested, so fill the buffer
        // until it is full or the stream ends.
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        // Every check is bounded by n so that zero padding in the buffer is
        // never mistaken for stream content (e.g. a two-byte file FF FE must
        // not be reported as UTF-32LE). The UTF-32 checks come first because
        // the UTF-32LE BOM starts with the UTF-16LE BOM.
        int bomLength;
        if (hasPrefix(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (hasPrefix(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes.
            encoding = defaultEnc;
            bomLength = 0;
        }

        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        isInited = true;
    }

    /** Returns true when the first {@code available} bytes of {@code buf} start with {@code prefix}. */
    private static boolean hasPrefix(byte[] buf, int available, int... prefix) {
        if (available < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if ((buf[i] & 0xFF) != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /** Closes the wrapped stream. */
    @Override
    public void close() throws IOException {
        // Deliberately does not run init(): no BOM detection happens on close.
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads the next byte. If {@link #getEncoding()} has not been called
     * beforehand, no BOM detection takes place and BOM bytes are returned as
     * ordinary data.
     */
    @Override
    public int read() throws IOException {
        // Deliberately does not run init(): once reading has started, a later
        // getEncoding() call must not consume bytes from mid-stream.
        isInited = true;
        return internalIn.read();
    }
}
通过使用上述InputStream类的实现,可以正确地读取带BOM和不带BOM的文件(注意:必须在第一次读取之前调用getEncoding(),否则BOM不会被跳过)
package com.justsy.sts.utf8;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;

/**
 * Demo that prints a text file line by line, using {@link UnicodeInputStream}
 * to skip a leading BOM and to pick the charset for decoding.
 */
public class UTF8Test {

    /**
     * Reads {@code D:\Order.txt} and echoes every line to standard output.
     *
     * @param args unused
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        File f = new File("D:" + File.separator + "Order.txt");
        FileInputStream in = new FileInputStream(f);
        String dc = Charset.defaultCharset().name();
        UnicodeInputStream uin = new UnicodeInputStream(in, dc);
        try {
            // getEncoding() has to run before the first read: it is the call
            // that consumes the BOM and reports which charset the BOM implies.
            String enc = uin.getEncoding();
            BufferedReader br = new BufferedReader(new InputStreamReader(uin, enc));
            String line = br.readLine();
            while (line != null) {
                System.out.println(line);
                line = br.readLine();
            }
        } finally {
            // Closing the wrapper also closes the underlying file stream.
            uin.close();
        }
    }
}
结合Java提供的方案,我们就可以比较完整的判别出各种字符集了
public String getTxtEncode(FileInputStream in) throws IOException{ String dc = Charset.defaultCharset().name(); UnicodeInputStream uin = new UnicodeInputStream(in,dc); if("UTF-8".equals(uin.getEncoding())){ uin.close(); return "UTF-8"; } uin.close(); byte[] head = new byte[3]; in.read(head); String code = "GBK"; if (head[0] == -1 && head[1] == -2 ) code = "UTF-16"; if (head[0] == -2 && head[1] == -1 ) code = "Unicode"; //带BOM if(head[0]==-17 && head[1]==-69 && head[2] ==-65) code = "UTF-8"; if("Unicode".equals(code)){ code = "UTF-16"; } return code; }
本文的转载地址为:http://blog.csdn.net/tibib/article/details/7988735
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
- 上一篇: COBOL语言初级教程(1)--COBOL简介
- 下一篇: 网页端和手机端自动适应网页设计