package cn.com.poc.common.segmentation;


import cn.com.poc.common.segmentation.domain.SegmentationConfig;
import cn.com.poc.common.segmentation.domain.SegmentationResult;
import cn.com.poc.common.utils.DocumentLoad;
import org.apache.commons.lang3.StringUtils;
import org.springframework.util.Assert;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
 * Character-count segmentation.
 * <p>
 * Splits text into fixed-length large chunks, then splits every large chunk again into
 * fixed-length sub-chunks ("scraps"), so each large chunk carries a list of sub-chunks.
 * The {@code repetitionRate} setting (a percentage of the window length) controls how many
 * characters two consecutive windows share.
 *
 * @author alex.yao
 * @version 1.0
 * @Date 2024/9/3
 */
public class NumberOfWordSegmentation extends SegmentationAbstract {

    /**
     * Validates {@code config}, obtains the text to segment and splits it.
     * <p>
     * When the inherited {@code documentUrl} is not blank, the document is downloaded and
     * converted to text; otherwise the inherited {@code content} value is split as-is.
     *
     * @param config segmentation settings (chunk size, scrap size, repetition rate)
     * @return one {@link SegmentationResult} per large chunk, in document order
     * @throws RuntimeException if validation, download, text extraction or splitting fails;
     *                          the underlying exception is attached as the cause
     */
    @Override
    public List<SegmentationResult> scrap(SegmentationConfig config) {
        try {
            // Validate the configuration before doing any I/O.
            paramVerification(config);

            String text = this.content;
            if (StringUtils.isNotBlank(this.documentUrl)) {
                // A document URL takes precedence over the inline content.
                File file = DocumentLoad.downloadURLDocument(this.documentUrl);
                text = DocumentLoad.documentToText(file);
            }

            return contentSplitters(config, text);
        } catch (Exception e) {
            // Keep the original message for callers, but preserve the cause for diagnosis.
            throw new RuntimeException("文档加载失败", e);
        }
    }

    /**
     * Checks that every setting required by the splitter is present and within range.
     * <p>
     * Accepted values: repetition rate in (0, 50] (upper bound inclusive), both sizes
     * strictly positive, and chunk size greater than or equal to scrap size.
     *
     * @param config settings to validate
     * @throws IllegalArgumentException if a setting is missing or out of range
     */
    private static void paramVerification(SegmentationConfig config) {
        // Message-taking overloads: the single-argument Assert.notNull is deprecated in Spring 5.
        Assert.notNull(config, "分割配置不能为空");
        Assert.notNull(config.getChunkSize(), "大切片长度不能为空");
        Assert.notNull(config.getScrapSize(), "子切片长度不能为空");
        Assert.notNull(config.getRepetitionRate(), "重复率不能为空");
        Assert.isTrue(config.getRepetitionRate() > 0 && config.getRepetitionRate() <= 50, "重复率参数错误,必须大于0和小于50");
        Assert.isTrue(config.getChunkSize() > 0 && config.getScrapSize() > 0, "切块大小参数错误");
        Assert.isTrue(config.getChunkSize() >= config.getScrapSize(), "【子切片】长度必须小于【大切片】长度");
    }


    /**
     * Splits {@code documentContents} into overlapping large chunks and each large chunk into
     * overlapping sub-chunks.
     * <p>
     * The overlap of each level is {@code (int) (size * repetitionRate / 100)} characters.
     * This method does not run the full configuration validation itself; it only rejects
     * settings that would prevent the window from advancing.
     *
     * @param config           segmentation settings; chunk size, scrap size and repetition
     *                         rate are read from it
     * @param documentContents text to split; an empty string yields an empty list
     * @return one {@link SegmentationResult} per large chunk, in document order
     * @throws NullPointerException     if {@code config}, one of the settings read from it,
     *                                  or {@code documentContents} is {@code null}
     * @throws IllegalArgumentException if the text is longer than one window and the overlap
     *                                  is not smaller than the window size
     */
    public List<SegmentationResult> contentSplitters(SegmentationConfig config, String documentContents) {
        int chunkSize = config.getChunkSize();
        int scrapSize = config.getScrapSize();
        double overlapRatio = config.getRepetitionRate() / 100D;
        int chunkOverlap = (int) (chunkSize * overlapRatio);
        int scrapOverlap = (int) (scrapSize * overlapRatio);

        List<SegmentationResult> result = new ArrayList<>();
        for (String chunk : slidingWindows(documentContents, chunkSize, chunkOverlap)) {
            SegmentationResult segment = new SegmentationResult();
            segment.setChunkContent(chunk);
            // Each large chunk is split again with the same algorithm at scrap granularity.
            segment.setScrapChunkContent(slidingWindows(chunk, scrapSize, scrapOverlap));
            result.add(segment);
        }
        return result;
    }

    /**
     * Cuts {@code text} into consecutive windows of at most {@code windowSize} characters,
     * each starting {@code windowSize - overlap} characters after the previous one. The last
     * window ends at the end of the text and may be shorter.
     *
     * @param text       text to cut
     * @param windowSize maximum length of a window
     * @param overlap    number of characters shared by two consecutive windows
     * @return the windows in order; empty when {@code text} is empty
     * @throws IllegalArgumentException if more than one window is needed but the step
     *                                  ({@code windowSize - overlap}) is not positive
     */
    private static List<String> slidingWindows(String text, int windowSize, int overlap) {
        int length = text.length();
        int step = windowSize - overlap;
        // Without forward progress the loop below would never reach the end of the text.
        if (step <= 0 && length > windowSize) {
            throw new IllegalArgumentException(
                    "切片长度必须大于重复长度, windowSize=" + windowSize + ", overlap=" + overlap);
        }

        List<String> windows = new ArrayList<>();
        int start = 0;
        int end = 0;
        while (end < length) {
            end = Math.min(start + windowSize, length);
            windows.add(text.substring(start, end));
            start += step;
        }
        return windows;
    }
}
