(资料图)
说明:txt 文件可能带有 BOM,且编码方式各不相同(如 GBK、UTF-8、UTF-16),导入数据时容易产生乱码,以下代码用于解决该乱码问题。代码参考了他人的实现,并结合自己的业务加工而成,花了大半天功夫,希望对大家有点用处。废话不多说,直接上代码:
/** * 从txt文件流读取数据 * * @param txtStream * @return * @throws IOException */ public static List readFromTxt(InputStream txtStream) throws IOException { List paragraphList = new ArrayList<>(); LabelValuePair result = getStreamCharset(txtStream); Charset cs = result.getValue(); BOMInputStream bomInputStream = new BOMInputStream(result.getLabel()); boolean hasBom = bomInputStream.hasBOM(); InputStreamReader sr = hasBom ? new InputStreamReader(bomInputStream, Charset.forName(bomInputStream.getBOMCharsetName())) : new InputStreamReader(bomInputStream, cs); BufferedReader br = new BufferedReader(sr); String line = null; Integer lineIndex = 0; while ((line = br.readLine()) != null) { if (!hasBom && lineIndex == 0) { lineIndex++; if (StringUtils.isNotEmpty(line)) { byte[] bts = line.getBytes(cs); if ((bts[0] == -1 && bts[1] == -2) || bts[0] == -2 && bts[1] == -1) { byte[] newBts = new byte[bts.length - 2]; for (int i = 2; i < bts.length; i++) { newBts[i - 2] = bts[i]; } line = new String(newBts, cs); } } } if (StringUtils.isNotEmpty(line) && StringUtils.isNotEmpty(line.trim())) { paragraphList.add(line); log.info("读取数据:{},长度:{},value:{}", line, line.trim().length(), line.getBytes(cs)); } } br.close(); sr.close(); return paragraphList; } /** * 判断获取字节流 编码格式,主要用于txt文件内容读取 * 再次读取流,使用返回结果中的流 * * @param stream * @return */ public static LabelValuePair getStreamCharset(InputStream stream) throws IOException { LabelValuePair result = readSteam(stream, true); byte[] buffer = result.getValue(); if (buffer.length < 2) return new LabelValuePair<>(result.getLabel(), CharsetKit.CHARSET_GBK); String encode = getFileCharSet(new BufferedInputStream(new ByteArrayInputStream(result.getValue())));// getBytesCharset(buffer); return new LabelValuePair<>(result.getLabel(), CharsetKit.charset(encode)); } /** * 判断txt编码格式方法 * * @param bis * @return */ public static String getFileCharSet(BufferedInputStream bis) { String charset = "GBK"; byte[] first3Bytes = new byte[3]; try { boolean checked = false; 
bis.mark(0); int read = bis.read(first3Bytes, 0, 3); if (read == -1) { return charset; //文件编码为 ANSI } else if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { charset = "UTF-16LE"; //文件编码为 Unicode checked = true; } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) { charset = "UTF-16BE"; //文件编码为 Unicode big endian checked = true; } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) { charset = "UTF-8"; //文件编码为 UTF-8 checked = true; } bis.reset(); if (!checked) { int loc = 0; while ((read = bis.read()) != -1) { loc++; if (read >= 0xF0) break; if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的,也算是GBK break; if (0xC0 <= read && read <= 0xDF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF) // (0x80 // - 0xBF),也可能在GB编码内 continue; else break; } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错,但是几率较小 read = bis.read(); if (0x80 <= read && read <= 0xBF) { read = bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } else break; } else break; } } } bis.close(); } catch (Exception e) { log.error("获取文件编码方式异常", e); } return charset; } /** * 读取流 * * @param inputStream 输入流 * @param isRepeat 是否重复读取 * @return */ public static LabelValuePair readSteam(InputStream inputStream, boolean isRepeat) throws IOException { ByteArrayOutputStream outSteam = new ByteArrayOutputStream(); byte[] buffer = new byte[1024]; int len = -1; inputStream.mark(0); while ((len = inputStream.read(buffer)) != -1) { outSteam.write(buffer); } byte[] fs = outSteam.toByteArray(); outSteam.close(); inputStream.close(); InputStream newSteam = null; if (isRepeat) { newSteam = new ByteArrayInputStream(fs); } return new LabelValuePair<>(newSteam, fs); }
标签:
Copyright © 2015-2023 港澳兽药网版权所有 备案号:京ICP备2023022245号-31 联系邮箱:435 226 40 @qq.com