首页 >综合 > > 正文

Java 读取 txt 文件解决乱码问题

发布日期:2023-07-30 18:34:30 来源:博客园 分享


(资料图)

说明:由于 txt 文件可能带有 BOM,且编码方式各不相同,导入数据时容易产生乱码,以下代码完美解决乱码问题。代码参考了他人的实现,并结合自己的业务加工而成,费了大半天功夫,希望对大家有点用处。废话不多说,直接上代码:

/**
 * Reads the non-blank lines of a text stream after sniffing its encoding.
 *
 * <p>The stream is fully buffered by {@link #getStreamCharset(InputStream)}, which also
 * closes it. The buffered copy is then decoded with the charset reported by the BOM
 * wrapper when it finds a BOM, or with the sniffed charset otherwise.
 *
 * @param txtStream the raw text stream; it is consumed and closed by this call
 * @return the lines that are non-empty after trimming, in file order (untrimmed)
 * @throws IOException if reading the stream fails
 */
public static List<String> readFromTxt(InputStream txtStream) throws IOException {
    List<String> paragraphList = new ArrayList<>();
    LabelValuePair<InputStream, Charset> result = getStreamCharset(txtStream);
    Charset cs = result.getValue();
    try (BOMInputStream bomInputStream = new BOMInputStream(result.getLabel())) {
        boolean hasBom = bomInputStream.hasBOM();
        Charset readCharset = hasBom ? Charset.forName(bomInputStream.getBOMCharsetName()) : cs;
        try (BufferedReader br = new BufferedReader(new InputStreamReader(bomInputStream, readCharset))) {
            boolean firstLine = true;
            String line;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    if (!hasBom) {
                        // NOTE(review): presumably the one-argument BOMInputStream does not
                        // report UTF-16 BOMs, so one may still be at the start of the first
                        // line - confirm against the commons-io version in use.
                        byte[] bts = line.getBytes(cs);
                        // Length guard: a one-byte first line must not be indexed at [1].
                        boolean startsWithUtf16Bom = bts.length >= 2
                                && ((bts[0] == (byte) 0xFF && bts[1] == (byte) 0xFE)
                                || (bts[0] == (byte) 0xFE && bts[1] == (byte) 0xFF));
                        if (startsWithUtf16Bom) {
                            line = new String(bts, 2, bts.length - 2, cs);
                        }
                    }
                }
                if (!line.trim().isEmpty()) {
                    paragraphList.add(line);
                    log.info("读取数据:{},长度:{},value:{}", line, line.trim().length(), line.getBytes(cs));
                }
            }
        }
    }
    return paragraphList;
}

/**
 * Determines the encoding of a byte stream, mainly for reading txt file content.
 *
 * <p>The given stream is drained and closed. To read the content again, use the
 * replacement stream carried in the returned pair's label.
 *
 * @param stream the stream to inspect; it is consumed and closed by this call
 * @return a pair of (re-readable in-memory copy of the stream, detected charset);
 *         the charset is GBK when fewer than two bytes are available
 * @throws IOException if reading the stream fails
 */
public static LabelValuePair<InputStream, Charset> getStreamCharset(InputStream stream) throws IOException {
    LabelValuePair<InputStream, byte[]> result = readSteam(stream, true);
    byte[] buffer = result.getValue();
    if (buffer.length < 2) {
        return new LabelValuePair<>(result.getLabel(), CharsetKit.CHARSET_GBK);
    }
    String encode = getFileCharSet(new BufferedInputStream(new ByteArrayInputStream(buffer)));
    return new LabelValuePair<>(result.getLabel(), CharsetKit.charset(encode));
}

/**
 * Guesses the encoding of txt content from its leading bytes.
 *
 * <p>A UTF-16LE, UTF-16BE or UTF-8 byte order mark decides the result immediately.
 * Without a BOM the bytes are scanned from the start: the first well-formed three-byte
 * UTF-8 sequence yields "UTF-8"; a lone continuation byte, a byte {@code >= 0xF0}, or a
 * malformed sequence stops the scan and keeps the default. Two-byte sequences are
 * skipped because they may also be valid GBK. Content that uses only four-byte UTF-8
 * sequences is therefore reported as GBK.
 *
 * @param bis the stream to inspect; it is closed by this call
 * @return "UTF-16LE", "UTF-16BE", "UTF-8", or "GBK" as the default, which is also
 *         returned when the stream is empty or reading fails
 */
public static String getFileCharSet(BufferedInputStream bis) {
    String charset = "GBK";
    byte[] first3Bytes = new byte[3];
    try (BufferedInputStream in = bis) {
        // The read limit must cover the three bytes consumed before reset().
        in.mark(first3Bytes.length);
        int read = in.read(first3Bytes, 0, first3Bytes.length);
        if (read == -1) {
            return charset; // empty content: treated as ANSI
        }
        if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
            return "UTF-16LE"; // "Unicode" in Windows terminology
        }
        if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
            return "UTF-16BE"; // Unicode big endian
        }
        if (first3Bytes[0] == (byte) 0xEF
                && first3Bytes[1] == (byte) 0xBB
                && first3Bytes[2] == (byte) 0xBF) {
            return "UTF-8";
        }
        // No BOM: rewind and look for a UTF-8 multi-byte pattern.
        in.reset();
        while ((read = in.read()) != -1) {
            if (read >= 0xF0) {
                break;
            }
            if (isUtf8Continuation(read)) {
                // A continuation byte with no lead byte is also counted as GBK.
                break;
            }
            if (0xC0 <= read && read <= 0xDF) {
                // Two-byte form (0xC0-0xDF)(0x80-0xBF) may also occur in GB encodings,
                // so it is not conclusive: keep scanning.
                if (isUtf8Continuation(in.read())) {
                    continue;
                }
                break;
            }
            if (0xE0 <= read && read <= 0xEF) {
                // Three-byte form; a false positive is possible but unlikely.
                if (isUtf8Continuation(in.read()) && isUtf8Continuation(in.read())) {
                    charset = "UTF-8";
                }
                break;
            }
        }
    } catch (Exception e) {
        // Best-effort detection: any failure falls back to the value determined so far.
        log.error("获取文件编码方式异常", e);
    }
    return charset;
}

/** Returns whether {@code b} lies in the UTF-8 continuation-byte range 0x80-0xBF. */
private static boolean isUtf8Continuation(int b) {
    return 0x80 <= b && b <= 0xBF;
}

/**
 * Reads a stream to the end and returns its bytes, optionally with a fresh in-memory
 * stream over the same bytes.
 *
 * @param inputStream the input stream; it is drained and closed by this call
 * @param isRepeat    whether a re-readable copy of the stream should be returned
 * @return a pair of (a new {@link ByteArrayInputStream} over the bytes, or {@code null}
 *         when {@code isRepeat} is false; the bytes that were read)
 * @throws IOException if reading the stream fails
 */
public static LabelValuePair<InputStream, byte[]> readSteam(InputStream inputStream, boolean isRepeat) throws IOException {
    ByteArrayOutputStream outSteam = new ByteArrayOutputStream();
    byte[] buffer = new byte[1024];
    try (InputStream in = inputStream) {
        int len;
        while ((len = in.read(buffer)) != -1) {
            // Copy only the bytes actually read; writing the whole buffer would append
            // stale data after a short read.
            outSteam.write(buffer, 0, len);
        }
    }
    byte[] fs = outSteam.toByteArray();
    InputStream newSteam = isRepeat ? new ByteArrayInputStream(fs) : null;
    return new LabelValuePair<>(newSteam, fs);
}

标签:

Copyright ©  2015-2023 港澳兽药网版权所有  备案号:京ICP备2023022245号-31   联系邮箱:435 226 40 @qq.com