原因:
Windows 记事本在以 UTF-8 编码保存 txt 文件时,会在文件最开头自动写入 BOM(Byte Order Mark,字节顺序标记),即 EF BB BF 这三个字节。
所以 Java 在读取此类文件时,第一行的开头会多出这三个与内容无关的字节(按 UTF-8 解码后是字符 U+FEFF),从而对正常的程序逻辑产生不良影响。
解决方法:
网上有如下解决方法确实可行
1.使用UltraEdit将上边的txt文件另存为UTF-8无BOM格式;
2.使用Notepad++打开上边的txt文件执行如下操作“格式-->以UTF-8无BOM格式编码”,修改后将txt文本进行保存
不足之处:
但是这样也有不足,这样对文件生产者提出了很高的要求,万一这样的文件是很多人生产的,那就势必会产生各种各样的问题,这归根到底是jdk的一个bug.
有没有什么办法能够一劳永逸呢,答案是有的,咱们程序里控制,来跟着我一起做!
终极解决方案:
(1)在工程中增加如下工具类(注意:该类并不包含在 JDK 中,需要自行添加到工程里):
/**
 * An {@link InputStream} wrapper that detects a leading Unicode byte order mark
 * (BOM), reports the encoding it denotes and removes the BOM bytes from the data
 * handed to the caller.
 *
 * <p>Recognised marks: UTF-32BE ({@code 00 00 FE FF}), UTF-32LE
 * ({@code FF FE 00 00}), UTF-8 ({@code EF BB BF}), UTF-16BE ({@code FE FF}) and
 * UTF-16LE ({@code FF FE}). When none of them is present, nothing is skipped and
 * the default encoding passed to the constructor is reported instead.
 *
 * <p>Detection happens lazily on the first call to {@link #getEncoding()} or to
 * one of the {@code read} methods.
 */
public class UnicodeInputStream extends InputStream {
    /** Number of bytes read ahead; the longest recognised BOM is four bytes. */
    private static final int BOM_SIZE = 4;

    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    /**
     * Wraps the given stream.
     *
     * @param in         the stream that may start with a BOM
     * @param defaultEnc the encoding name to report when no BOM is found; may be
     *                   {@code null}
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * Returns the fallback encoding name supplied to the constructor.
     *
     * @return the default encoding, possibly {@code null}
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Returns the encoding denoted by the BOM, or the default encoding when the
     * stream does not start with one. Triggers BOM detection if it has not run yet.
     *
     * @return the detected or default encoding name; {@code null} when no BOM was
     *         found and the default is {@code null}, or when the stream was closed
     *         before detection ran
     * @throws IllegalStateException if reading the first bytes fails; the
     *                               underlying {@link IOException} is the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Chain the real failure. The previous initCause(ise) call was
                // self-causation, which itself throws IllegalArgumentException.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are not
     * part of a BOM are pushed back onto the stream; only BOM bytes are skipped.
     *
     * @throws IOException if reading from or pushing back to the stream fails
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];
        // A single read may return fewer bytes than requested, so keep reading
        // until the look-ahead buffer is full or the stream ends.
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count <= 0) {
                break;
            }
            n += count;
        }

        // Every check is bounded by n so that zero padding in a short stream is
        // never mistaken for BOM bytes. The UTF-32LE mark must be tested before
        // UTF-16LE because it starts with the same two bytes.
        int bomLength;
        if (startsWith(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (startsWith(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (startsWith(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (startsWith(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // No BOM found: push back everything that was read.
            encoding = defaultEnc;
            bomLength = 0;
        }

        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        isInited = true;
    }

    /** Tells whether the first {@code available} bytes of {@code buffer} begin with {@code marker}. */
    private static boolean startsWith(byte[] buffer, int available, int... marker) {
        if (available < marker.length) {
            return false;
        }
        for (int i = 0; i < marker.length; i++) {
            if (buffer[i] != (byte) marker[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the underlying stream. BOM detection is not attempted afterwards.
     *
     * @throws IOException if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        // Mark as initialised so a later getEncoding() does not read a closed stream.
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads the next byte of data, skipping a leading BOM first if detection has
     * not run yet.
     *
     * @return the next byte as an int in the range 0-255, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read() throws IOException {
        init();
        return internalIn.read();
    }

    /**
     * Reads up to {@code len} bytes in one call instead of byte by byte, skipping
     * a leading BOM first if detection has not run yet.
     *
     * @param b   destination buffer
     * @param off offset in {@code b} at which to start storing bytes
     * @param len maximum number of bytes to read
     * @return the number of bytes read, or -1 at end of stream
     * @throws IOException if an I/O error occurs
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        init();
        return internalIn.read(b, off, len);
    }
}
(2)读取时使用如下代码: //因为我这边是服务器上的远程文件,如果是本地文件使用File类
URL url = new URL("http://****/***/test.txt");
// File f = new File("test.txt");
String enc = null; // or NULL to use systemdefault UnicodeInputStream uin = new UnicodeInputStream(url.openStream(),enc); //如果是本地将url.openStream -> new FileInputStream(f) enc = uin.getEncoding(); // check and skip possible BOM bytes InputStreamReader in; if (enc == null){ in = new InputStreamReader(uin); }else { in = new InputStreamReader(uin, enc); } BufferedReader reader = new BufferedReader(in); //BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("D:/tags.txt"),"utf-8")); String tmp =reader.readLine();
这样读取的结果就是正常的了,有什么问题还可以留言!
|