package cn.com.poc.common.utils;

import cn.com.yict.framemax.core.i18n.I18nMessageException;
import cn.hutool.core.io.FileUtil;
import io.github.furstenheim.*;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.util.Assert;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Locale;

/**
 * Static helpers that turn documents (web pages, Word, PDF, Markdown and
 * plain-text files) into strings, plus a helper that downloads a remote
 * document into a temporary file.
 *
 * <p>I/O failures are reported as {@link I18nMessageException} by the public
 * entry points {@link #htmlToMarkdown(String)}, {@link #documentToText(File)}
 * and {@link #downloadURLDocument(String)}; the individual {@code load*}
 * methods propagate {@link IOException} to their caller.
 */
public class DocumentLoad {

    final static OptionsBuilder optionsBuilder = OptionsBuilder.anOptions();
    final static Options options = optionsBuilder.withBr("-")
            .withLinkStyle(LinkStyle.REFERENCED)
            .withLinkReferenceStyle(LinkReferenceStyle.SHORTCUT)
            .build();
    final static CopyDown converter = new CopyDown(options);

    /** Size of the character buffer used when reading an HTML page (64K chars). */
    private static final int HTML_BUFFER_SIZE = 1024 * 64;

    /** Size of the byte buffer used when copying a download to disk. */
    private static final int DOWNLOAD_BUFFER_SIZE = 8 * 1024;

    /**
     * Downloads the HTML page at {@code url} and converts it to Markdown.
     *
     * <p>The page is decoded with the charset announced in the response's
     * {@code Content-Type} header, falling back to UTF-8 when none is given
     * or the announced one is not supported.
     *
     * @param url address of the HTML page
     * @return the page content converted to Markdown
     * @throws I18nMessageException if the URL is malformed or the page cannot be read
     */
    public static String htmlToMarkdown(String url) {
        try {
            URLConnection conn = new URL(url).openConnection();
            StringBuilder sb = new StringBuilder();
            // try-with-resources guarantees both streams are released even when reading fails.
            try (InputStream inputStream = conn.getInputStream();
                 Reader reader = new InputStreamReader(inputStream, resolveCharset(conn.getContentType()))) {
                char[] buffer = new char[HTML_BUFFER_SIZE];
                int len;
                while ((len = reader.read(buffer)) != -1) {
                    sb.append(buffer, 0, len);
                }
            }
            return converter.convert(sb.toString());
        } catch (IOException e) {
            throw new I18nMessageException(e.getMessage());
        }
    }

    /**
     * Extracts the text of a document, choosing the loader from the file
     * extension (matched case-insensitively).
     *
     * <p>Supported extensions: {@code docx}, {@code doc}, {@code md},
     * {@code pdf}, {@code txt}.
     *
     * @param file the document to read; must not be {@code null}
     * @return the text content of the document
     * @throws IllegalArgumentException if {@code file} is {@code null}
     * @throws I18nMessageException     if the extension is not supported or reading fails
     */
    public static String documentToText(File file) {
        Assert.notNull(file, "file must not be null");
        String fileName = FileUtil.getName(file);
        // With no '.' in the name lastIndexOf is -1, so the whole name becomes the "type"
        // and falls through to the unsupported-format branch.
        String type = fileName.substring(fileName.lastIndexOf(".") + 1);
        try {
            switch (type.toLowerCase(Locale.ROOT)) {
                case "docx":
                    return loadWordDocx(file);
                case "doc":
                    return loadWordDoc(file);
                case "md":
                    return loadMarkDown(file);
                case "pdf":
                    return loadPDF(file);
                case "txt":
                    return loadTxt(file);
                default:
                    throw new I18nMessageException(type + " format is not yet supported");
            }
        } catch (IOException e) {
            throw new I18nMessageException(e.getMessage());
        }
    }

    /**
     * Reads a Markdown file as UTF-8 text, keeping its line breaks intact.
     *
     * @param file the Markdown file
     * @return the file content
     * @throws IOException if the file cannot be read
     */
    public static String loadMarkDown(File file) throws IOException {
        return readText(file);
    }

    /**
     * Extracts the text of a {@code .docx} (OOXML) Word document.
     *
     * @param file the Word document
     * @return the extracted text
     * @throws IOException if the file cannot be read or parsed
     */
    public static String loadWordDocx(File file) throws IOException {
        // The stream stays open until extraction has finished and is closed on every path.
        try (InputStream in = Files.newInputStream(file.toPath())) {
            XWPFDocument xwpfDocument = new XWPFDocument(in);
            XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(xwpfDocument);
            return xwpfWordExtractor.getText();
        }
    }

    /**
     * Extracts the text of a legacy {@code .doc} Word document.
     *
     * @param file the Word document
     * @return the extracted text
     * @throws IOException if the file cannot be read or parsed
     */
    public static String loadWordDoc(File file) throws IOException {
        try (FileInputStream fis = new FileInputStream(file)) {
            WordExtractor wordExtractor = new WordExtractor(fis);
            return wordExtractor.getText();
        }
    }

    /**
     * Reads a plain-text file as UTF-8 text, keeping its line breaks intact.
     *
     * @param file the text file
     * @return the file content
     * @throws IOException if the file cannot be read
     */
    public static String loadTxt(File file) throws IOException {
        return readText(file);
    }

    /**
     * Extracts the text of all pages of a PDF, sorted by position on the page.
     *
     * @param file the PDF file
     * @return the extracted text
     * @throws IOException if the file cannot be read or parsed
     */
    public static String loadPDF(File file) throws IOException {
        RandomAccessBufferedFileInputStream source = new RandomAccessBufferedFileInputStream(file);
        PDFParser parser;
        try {
            parser = new PDFParser(source);
            parser.parse();
        } catch (IOException | RuntimeException e) {
            // Parsing failed before a PDDocument took over the source: release it here.
            try {
                source.close();
            } catch (IOException ignored) {
                // Keep the original parsing failure as the reported error.
            }
            throw e;
        }
        try (PDDocument doc = parser.getPDDocument()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setStartPage(1);
            textStripper.setEndPage(doc.getNumberOfPages());
            textStripper.setSortByPosition(true);
            return textStripper.getText(doc);
        }
    }

    /**
     * Downloads the resource at {@code path} into a new temporary file.
     *
     * <p>The temporary file keeps the extension of the URL path (the query
     * string is ignored) so that it can be passed to
     * {@link #documentToText(File)}. When the URL path has no extension the
     * JDK default suffix {@code .tmp} is used. If the download fails, the
     * partially written temporary file is deleted.
     *
     * @param path URL of the document to download
     * @return the temporary file holding the downloaded content
     * @throws I18nMessageException if the URL is malformed or the download fails
     */
    public static File downloadURLDocument(String path) {
        File tempFile = null;
        try {
            URL url = new URL(path);
            URLConnection conn = url.openConnection();
            // getPath() excludes the query string, unlike getFile().
            tempFile = File.createTempFile(UUIDTool.getUUID(), extensionSuffix(url.getPath()));
            try (InputStream inStream = conn.getInputStream();
                 OutputStream fs = new FileOutputStream(tempFile)) {
                byte[] buffer = new byte[DOWNLOAD_BUFFER_SIZE];
                int byteread;
                while ((byteread = inStream.read(buffer)) != -1) {
                    fs.write(buffer, 0, byteread);
                }
            }
            return tempFile;
        } catch (IOException e) {
            if (tempFile != null) {
                // Best effort: do not leave a truncated download behind.
                tempFile.delete();
            }
            throw new I18nMessageException("exception/file.load.error");
        }
    }

    /** Reads the whole file and decodes it as UTF-8, preserving line terminators. */
    private static String readText(File file) throws IOException {
        return new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
    }

    /** Returns the charset named in a {@code Content-Type} header value, or UTF-8 if absent or unsupported. */
    private static Charset resolveCharset(String contentType) {
        final String key = "charset=";
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String token = part.trim();
                if (token.regionMatches(true, 0, key, 0, key.length())) {
                    String name = token.substring(key.length()).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                        name = name.substring(1, name.length() - 1);
                    }
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported charset name: fall back to UTF-8 below.
                        break;
                    }
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /** Returns the extension (with leading dot) of the last segment of a URL path, or {@code null} if it has none. */
    private static String extensionSuffix(String urlPath) {
        if (urlPath == null) {
            return null;
        }
        String name = urlPath.substring(urlPath.lastIndexOf('/') + 1);
        int dot = name.lastIndexOf('.');
        if (dot < 0 || dot == name.length() - 1) {
            return null;
        }
        return name.substring(dot);
    }
}
