package cn.com.poc.common.utils;

import cn.com.yict.framemax.core.exception.BusinessException;
import cn.hutool.core.io.FileUtil;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
//import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.util.Assert;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Locale;

/**
 * Static helpers that extract the plain-text content of a document file.
 *
 * <p>The loader is selected from the file extension; see {@link #documentToText(File)}.
 */
public class DocumentLoad {

    /**
     * Reads a document and returns its text, choosing the loader from the file extension.
     *
     * <p>Recognised extensions (matched case-insensitively): {@code docx}, {@code doc},
     * {@code md}, {@code pdf} and {@code txt}.
     *
     * @param file the document to read; must not be {@code null}
     * @return the text extracted from the document
     * @throws IllegalArgumentException if {@code file} is {@code null}
     * @throws BusinessException        if the extension is not supported, or if an
     *                                  {@link IOException} occurs while reading the file
     */
    public static String documentToText(File file) {
        Assert.notNull(file, "file must not be null");
        String fileName = FileUtil.getName(file);
        // When the name contains no dot, lastIndexOf yields -1 and the whole name becomes the
        // "type", which then falls through to the unsupported-format branch.
        String type = fileName.substring(fileName.lastIndexOf('.') + 1);
        try {
            // Normalise only for matching so "report.PDF" is accepted; the error message below
            // still reports the extension exactly as it appears in the file name.
            switch (type.toLowerCase(Locale.ROOT)) {
                case "docx":
                case "doc":
                    // NOTE(review): "doc" is routed to the XWPF (OOXML) loader. Legacy binary
                    // .doc content is presumably not readable by it - confirm whether this is
                    // meant only for .docx content saved under a .doc name.
                    return loadWordDocx(file);
                case "md":
                    return loadMarkDown(file);
                case "pdf":
                    return loadPDF(file);
                case "txt":
                    return loadTxt(file);
                default:
                    throw new BusinessException(type + " format is not yet supported");
            }
        } catch (IOException e) {
            throw new BusinessException(e.getMessage());
        }
    }

    /**
     * Reads a Markdown file as text.
     *
     * <p>The content is decoded as UTF-8 and returned unchanged, line breaks included, so
     * that headings, lists and paragraphs stay separated.
     *
     * @param file the Markdown file to read
     * @return the full content of the file
     * @throws IOException if the file cannot be read
     */
    public static String loadMarkDown(File file) throws IOException {
        return readUtf8(file);
    }

    /**
     * Extracts the text of a Word document in the OOXML ({@code .docx}) format.
     *
     * @param file the Word document to read
     * @return the text produced by {@link XWPFWordExtractor#getText()}
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String loadWordDocx(File file) throws IOException {
        // Both the stream and the document are closed even when extraction fails.
        try (InputStream in = Files.newInputStream(file.toPath());
             XWPFDocument document = new XWPFDocument(in)) {
            return new XWPFWordExtractor(document).getText();
        }
    }

    // Disabled HWPF-based loader for legacy binary .doc files, kept for reference.
//    public static String loadWordDoc(File file) throws IOException {
//        FileInputStream fis = new FileInputStream(file);
//        HWPFDocument doc = new HWPFDocument(fis);
//        fis.close();
//        return doc.getText().toString();
//    }

    /**
     * Reads a plain-text file.
     *
     * <p>The content is decoded as UTF-8 and returned unchanged, line breaks included.
     *
     * @param file the text file to read
     * @return the full content of the file
     * @throws IOException if the file cannot be read
     */
    public static String loadTxt(File file) throws IOException {
        return readUtf8(file);
    }

    /**
     * Extracts the text of every page of a PDF, ordered by position on the page.
     *
     * @param file the PDF file to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened, parsed or read
     */
    public static String loadPDF(File file) throws IOException {
        // try-with-resources closes the document (and its underlying file) on every path,
        // including when text extraction throws.
        try (PDDocument doc = PDDocument.load(file)) {
            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setStartPage(1);
            textStripper.setEndPage(doc.getNumberOfPages());
            textStripper.setSortByPosition(true);
            return textStripper.getText(doc);
        }
    }

    /** Reads the whole file into memory and decodes it as UTF-8. */
    private static String readUtf8(File file) throws IOException {
        return new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
    }
}
