浏览代码

代码重构

tom.xu@informa.com 2 年之前
父节点
当前提交
d43edaa525

+ 27 - 0
pom.xml

@@ -89,6 +89,33 @@
             <artifactId>hutool-all</artifactId>
             <version>5.6.5</version>
         </dependency>
+
+        <!-- cpdetector: detects the character encoding of a file. NOTE(review): system scope resolves this jar from systemPath instead of a repository; confirm it is included in the packaged artifact. -->
+        <dependency>
+            <groupId>cpdetector</groupId>
+            <artifactId>cpdetector</artifactId>
+            <version>1.0.10</version>
+            <scope>system</scope>
+            <systemPath>${basedir}/src/main/resources/libs/cpdetector_1.0.10.jar</systemPath>
+        </dependency>
+
+        <!-- chardet: plugin used by cpdetector -->
+        <dependency>
+            <groupId>chardet</groupId>
+            <artifactId>chardet</artifactId>
+            <version>1.0</version>
+            <scope>system</scope>
+            <systemPath>${basedir}/src/main/resources/libs/chardet-1.0.jar</systemPath>
+        </dependency>
+
+        <!-- antlr: plugin used by cpdetector -->
+        <dependency>
+            <groupId>antlr</groupId>
+            <artifactId>antlr</artifactId>
+            <version>2.7.4</version>
+            <scope>system</scope>
+            <systemPath>${basedir}/src/main/resources/libs/antlr-2.7.4.jar</systemPath>
+        </dependency>
     </dependencies>
 
     <build>

+ 50 - 7
src/main/java/ieven/server/webapp/domain/file/FileService.java

@@ -6,8 +6,10 @@
 package ieven.server.webapp.domain.file;
 
 import cn.hutool.core.date.DateUtil;
+import cn.hutool.core.io.FileUtil;
 import cn.hutool.core.io.IoUtil;
 import cn.hutool.core.util.IdUtil;
+import cn.hutool.core.util.RandomUtil;
 import com.mongodb.client.gridfs.model.GridFSFile;
 import com.mongodb.client.result.DeleteResult;
 import ieven.server.webapp.domain.IdInput;
@@ -16,14 +18,15 @@ import ieven.server.webapp.domain.data.Fields;
 import ieven.server.webapp.domain.model.Model;
 import ieven.server.webapp.infrastructure.wrapper.Mapped;
 import ieven.server.webapp.service.MongoExcelService;
+import ieven.server.webapp.util.EncodeDetector;
 import ieven.server.webapp.util.excel.ExcelXlsReader;
 import ieven.server.webapp.util.excel.ExcelXlsxReader;
 import ieven.server.webapp.util.excel.PublicStatic;
+import info.monitorenter.cpdetector.io.*;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVParser;
 import org.apache.commons.csv.CSVRecord;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.bson.types.ObjectId;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.context.annotation.Lazy;
@@ -40,11 +43,9 @@ import org.springframework.scheduling.annotation.Async;
 import org.springframework.scheduling.annotation.AsyncResult;
 import org.springframework.scheduling.annotation.EnableAsync;
 import org.springframework.stereotype.Service;
-import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.*;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -105,7 +106,13 @@ public class FileService {
             mongoExcelService.insertRest();
           } else if (filename.endsWith(".csv")) {
             mongoExcelService = new MongoExcelService(this.mongoTemplate, uploaded.getId());
-            CSVParser csvParser = CSVFormat.EXCEL.parse(new InputStreamReader(inputStream, "utf8"));
+            String randomFileName = RandomUtil.randomString(5);
+            String tmpdir = System.getProperty("java.io.tmpdir");
+            File file = new File(tmpdir + File.separator + randomFileName + ".csv");
+            FileUtil.writeFromStream(inputStream, file);
+            String encoding = EncodeDetector.getEncoding(file);
+            CSVParser csvParser =
+                CSVFormat.EXCEL.parse(new InputStreamReader(new FileInputStream(file), encoding));
             int curRow = 0;
             int curRowWxFile = 0;
             boolean wxFile = false;
@@ -170,7 +177,7 @@ public class FileService {
             }
             mongoExcelService.insertRest();
           }
-        } catch (OpenXML4JException | SAXException | IOException var11) {
+        } catch (Exception var11) {
           this.modifyStatus(uploaded.getId(), "ERROR");
           var11.printStackTrace();
         }
@@ -377,4 +384,40 @@ public class FileService {
     Model model = this.mongoTemplate.findOne(query, Model.class, "model");
     return new AsyncResult(model != null ? model.getModelName() : "");
   }
+
+  /**
+   * Detects the character encoding of {@code file}.
+   *
+   * <p>Delegates to {@link EncodeDetector#getEncoding(File)}, the helper the CSV upload path
+   * already uses, instead of registering another set of detectors on the proxy returned by
+   * {@code CodepageDetectorProxy.getInstance()} on every call. The detectors consulted, and their
+   * order, are therefore the ones {@code EncodeDetector} registers.
+   *
+   * @param file the file whose encoding should be detected
+   * @return the detected charset name, or {@code "UTF-8"} when detection finds nothing
+   * @throws IOException if the file cannot be read or detection fails
+   */
+  public static String getCharsetName(File file) throws IOException {
+    try {
+      return EncodeDetector.getEncoding(file);
+    } catch (IOException | RuntimeException e) {
+      throw e;
+    } catch (Exception e) {
+      // getEncoding declares a bare Exception; wrap anything else so this method keeps its
+      // IOException-only signature and the cause is preserved.
+      throw new IOException("Failed to detect the encoding of " + file, e);
+    }
+  }
+
+  /**
+   * Manual smoke test: detects the encoding of each CSV file and prints its records to stdout.
+   *
+   * @param args paths of the CSV files to read; when none are given, the two sample paths this
+   *     method originally hard-coded are used
+   * @throws IOException if a file cannot be read or its encoding cannot be detected
+   */
+  public static void main(String[] args) throws IOException {
+    String[] paths = args;
+    if (paths == null || paths.length == 0) {
+      paths =
+          new String[] {
+            "C:\\Users\\Administrator\\Desktop\\楼\\支付宝五联单数据格式\\20120303组织卖淫案_OR222713D2400411_注册信息_20210913100124.csv",
+            "C:\\Users\\Administrator\\Desktop\\楼\\微信支付账单(2).csv"
+          };
+    }
+
+    for (String path : paths) {
+      File file = new File(path);
+      String charsetName = getCharsetName(file);
+      // Each resource is declared separately so that all of them are closed even if a later
+      // constructor (e.g. an unsupported charset name) throws.
+      try (InputStream in = new FileInputStream(file);
+          Reader reader = new InputStreamReader(in, charsetName);
+          CSVParser csvParser = CSVFormat.EXCEL.parse(reader)) {
+        for (CSVRecord row : csvParser) {
+          System.out.println(row);
+        }
+      }
+    }
+  }
 }

+ 87 - 0
src/main/java/ieven/server/webapp/util/EncodeDetector.java

@@ -0,0 +1,87 @@
+package ieven.server.webapp.util;
+
+import info.monitorenter.cpdetector.io.*;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.nio.charset.Charset;
+
+/**
+ * Detects the character encoding of files, streams and byte arrays via cpdetector.
+ *
+ * <p>Every overload returns a charset name and falls back to {@code UTF-8} when detection does
+ * not produce a charset that the running JVM supports, so the result can be handed straight to
+ * an {@code InputStreamReader}.
+ *
+ * @author sikj
+ */
+public class EncodeDetector {
+  /** Charset name returned when detection yields nothing usable. */
+  private static final String DEFAULT_CHARSET_NAME = "UTF-8";
+
+  private static final CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
+  private static final ParsingDetector parsingDetector = new ParsingDetector(false);
+  private static final ByteOrderMarkDetector byteOrderMarkDetector = new ByteOrderMarkDetector();
+
+  static {
+    // Register the delegates once, during class initialization. This replaces the former
+    // unsynchronized "init" flag, which let concurrent first callers race on registration.
+    // The registration order is the same as before.
+    detector.add(JChardetFacade.getInstance());
+    detector.add(ASCIIDetector.getInstance());
+    detector.add(UnicodeDetector.getInstance());
+    detector.add(parsingDetector);
+    detector.add(byteOrderMarkDetector);
+  }
+
+  /**
+   * Detects the encoding of a buffered stream without consuming it.
+   *
+   * <p>The stream is marked before detection and reset afterwards, so the caller can read it
+   * from the same position again. The number of bytes offered to the detector is
+   * {@code buffIn.available()}, which is only the amount readable without blocking and may be
+   * less than the full content of the stream.
+   *
+   * @param buffIn the stream to inspect
+   * @return the detected charset name, or {@code "UTF-8"} when nothing usable was detected
+   * @throws Exception if reading, detection or resetting the stream fails
+   */
+  public static String getEncoding(BufferedInputStream buffIn) throws Exception {
+    int size = buffIn.available();
+    buffIn.mark(size);
+
+    Charset charset = detector.detectCodepage(buffIn, size);
+
+    buffIn.reset();
+
+    return toCharsetName(charset);
+  }
+
+  /**
+   * Detects the encoding of a file.
+   *
+   * @param file the file to inspect
+   * @return the detected charset name, or {@code "UTF-8"} when nothing usable was detected
+   * @throws Exception if the file cannot be read or detection fails
+   */
+  public static String getEncoding(File file) throws Exception {
+    return toCharsetName(detector.detectCodepage(file.toURI().toURL()));
+  }
+
+  /**
+   * Detects the encoding of a byte array.
+   *
+   * @param byteArr the bytes to inspect
+   * @return the detected charset name, or {@code "UTF-8"} when nothing usable was detected
+   * @throws Exception if detection fails
+   */
+  public static String getEncoding(byte[] byteArr) throws Exception {
+    // ByteArrayInputStream supports mark/reset itself and holds no system resource, so no
+    // buffering wrapper and no close() (previously wrapped in an empty catch) are needed.
+    ByteArrayInputStream byteArrIn = new ByteArrayInputStream(byteArr);
+    return toCharsetName(detector.detectCodepage(byteArrIn, byteArr.length));
+  }
+
+  /**
+   * Converts a detection result into a charset name that is safe to use.
+   *
+   * <p>A {@code null} result previously caused a NullPointerException in the stream and byte
+   * array overloads; it now maps to the default, matching the File overload. A result whose name
+   * the JVM does not support also maps to the default, because such a name could not be used to
+   * decode anything.
+   */
+  private static String toCharsetName(Charset detected) {
+    if (detected == null) {
+      return DEFAULT_CHARSET_NAME;
+    }
+    String name = detected.name();
+    return Charset.isSupported(name) ? name : DEFAULT_CHARSET_NAME;
+  }
+}

二进制
src/main/resources/libs/antlr-2.7.4.jar


二进制
src/main/resources/libs/chardet-1.0.jar


二进制
src/main/resources/libs/cpdetector_1.0.10.jar


二进制
src/main/resources/libs/jargs-1.0.jar