Commit 9be410a5 authored by alex yao's avatar alex yao

feat: 关键词提取

parent 72890a4b
...@@ -14,6 +14,7 @@ import cn.com.poc.common.model.BizFileUploadRecordModel; ...@@ -14,6 +14,7 @@ import cn.com.poc.common.model.BizFileUploadRecordModel;
import cn.com.poc.common.service.BizFileUploadRecordService; import cn.com.poc.common.service.BizFileUploadRecordService;
import cn.com.poc.common.utils.DocumentLoad; import cn.com.poc.common.utils.DocumentLoad;
import cn.com.poc.common.utils.JsonUtils; import cn.com.poc.common.utils.JsonUtils;
import cn.com.poc.common.utils.ListUtils;
import cn.com.poc.common.utils.SSEUtil; import cn.com.poc.common.utils.SSEUtil;
import cn.com.poc.knowledge.aggregate.KnowledgeService; import cn.com.poc.knowledge.aggregate.KnowledgeService;
import cn.com.poc.knowledge.constant.KnowledgeConstant; import cn.com.poc.knowledge.constant.KnowledgeConstant;
...@@ -23,12 +24,15 @@ import cn.com.poc.knowledge.service.BizKnowledgeDocumentService; ...@@ -23,12 +24,15 @@ import cn.com.poc.knowledge.service.BizKnowledgeDocumentService;
import cn.com.poc.long_document.aggregate.LongTextDialoguesService; import cn.com.poc.long_document.aggregate.LongTextDialoguesService;
import cn.com.poc.long_document.domain.LongtextDialoguesResult; import cn.com.poc.long_document.domain.LongtextDialoguesResult;
import cn.com.poc.long_document.domain.MindMap; import cn.com.poc.long_document.domain.MindMap;
import cn.com.poc.long_document.domain.keyword.KeyWord;
import cn.com.poc.long_document.domain.keyword.MentionFrequency;
import cn.com.poc.long_document.dto.LongTextExampleDto; import cn.com.poc.long_document.dto.LongTextExampleDto;
import cn.com.poc.long_document.dto.LongTextSummaryDto; import cn.com.poc.long_document.dto.LongTextSummaryDto;
import cn.com.poc.long_document.entity.BizLongTextSummaryCacheEntity; import cn.com.poc.long_document.entity.BizLongTextSummaryCacheEntity;
import cn.com.poc.long_document.model.BizLongTextExampleModel; import cn.com.poc.long_document.model.BizLongTextExampleModel;
import cn.com.poc.long_document.service.BizLongTextExampleService; import cn.com.poc.long_document.service.BizLongTextExampleService;
import cn.com.poc.long_document.service.BizLongTextSummaryCacheService; import cn.com.poc.long_document.service.BizLongTextSummaryCacheService;
import cn.com.poc.meeting.cache.KeyWordCounter;
import cn.com.poc.thirdparty.resource.demand.ai.aggregate.DemandKnowledgeService; import cn.com.poc.thirdparty.resource.demand.ai.aggregate.DemandKnowledgeService;
import cn.com.poc.thirdparty.resource.demand.ai.constants.KnowledgeSearchTypeEnum; import cn.com.poc.thirdparty.resource.demand.ai.constants.KnowledgeSearchTypeEnum;
import cn.com.poc.thirdparty.resource.demand.ai.constants.LLMRoleEnum; import cn.com.poc.thirdparty.resource.demand.ai.constants.LLMRoleEnum;
...@@ -62,6 +66,7 @@ import java.io.IOException; ...@@ -62,6 +66,7 @@ import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors; import java.util.stream.Collectors;
...@@ -240,7 +245,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -240,7 +245,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
String summary = StringUtils.EMPTY; String summary = StringUtils.EMPTY;
String corePoint = StringUtils.EMPTY; String corePoint = StringUtils.EMPTY;
String mindMap = StringUtils.EMPTY; String mindMap = StringUtils.EMPTY;
String keyWord = StringUtils.EMPTY; KeyWord keyWord = null;
boolean summaryNeedGenerate = false; boolean summaryNeedGenerate = false;
boolean corePointNeedGenerate = false; boolean corePointNeedGenerate = false;
...@@ -259,7 +264,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -259,7 +264,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
BizLongTextSummaryCacheEntity longTextSummaryCacheEntity = bizLongTextSummaryCacheEntities.get(0); BizLongTextSummaryCacheEntity longTextSummaryCacheEntity = bizLongTextSummaryCacheEntities.get(0);
summary = longTextSummaryCacheEntity.getSummary(); summary = longTextSummaryCacheEntity.getSummary();
corePoint = longTextSummaryCacheEntity.getCorePoint(); corePoint = longTextSummaryCacheEntity.getCorePoint();
keyWord = longTextSummaryCacheEntity.getKeyword(); keyWord = StringUtils.isNotBlank(longTextSummaryCacheEntity.getKeyword()) ?
JsonUtils.deSerialize(longTextSummaryCacheEntity.getKeyword(), KeyWord.class) : null;
mindMap = longTextSummaryCacheEntity.getMindmap(); mindMap = longTextSummaryCacheEntity.getMindmap();
} }
...@@ -267,7 +273,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -267,7 +273,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
summaryNeedGenerate = StringUtils.isBlank(summary); summaryNeedGenerate = StringUtils.isBlank(summary);
corePointNeedGenerate = StringUtils.isBlank(corePoint); corePointNeedGenerate = StringUtils.isBlank(corePoint);
mindMapNeedGenerate = StringUtils.isBlank(mindMap); mindMapNeedGenerate = StringUtils.isBlank(mindMap);
keyWordNeedGenerate = StringUtils.isBlank(keyWord); keyWordNeedGenerate = ObjectUtil.isEmpty(keyWord);
String document = DocumentLoad.documentToText(file); String document = DocumentLoad.documentToText(file);
...@@ -306,8 +312,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -306,8 +312,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
}); });
boolean finalKeyWordNeedGenerate = keyWordNeedGenerate; boolean finalKeyWordNeedGenerate = keyWordNeedGenerate;
String finalKeyWord = keyWord; KeyWord finalKeyWord = keyWord;
CompletableFuture<String> keyWordFuture = CompletableFuture.supplyAsync(() -> CompletableFuture<KeyWord> keyWordFuture = CompletableFuture.supplyAsync(() ->
finalKeyWordNeedGenerate ? createKeyWord(document) : finalKeyWord).exceptionally(throwable -> { finalKeyWordNeedGenerate ? createKeyWord(document) : finalKeyWord).exceptionally(throwable -> {
logger.error("生成关键词失败", throwable); logger.error("生成关键词失败", throwable);
throw new BusinessException(throwable); throw new BusinessException(throwable);
...@@ -329,7 +335,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -329,7 +335,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
updateCacheEntity.setCorePoint(corePoint); updateCacheEntity.setCorePoint(corePoint);
updateCacheEntity.setSummary(summary); updateCacheEntity.setSummary(summary);
updateCacheEntity.setMindmap(mindMap); updateCacheEntity.setMindmap(mindMap);
updateCacheEntity.setKeyword(keyWord); updateCacheEntity.setKeyword(ObjectUtil.isNotEmpty(keyWord) ?
JsonUtils.serialize(keyWord) : StringUtils.EMPTY);
bizLongTextSummaryCacheService.update(updateCacheEntity); bizLongTextSummaryCacheService.update(updateCacheEntity);
} else { } else {
BizLongTextSummaryCacheEntity saveCache = new BizLongTextSummaryCacheEntity(); BizLongTextSummaryCacheEntity saveCache = new BizLongTextSummaryCacheEntity();
...@@ -337,29 +344,28 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -337,29 +344,28 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
saveCache.setSummary(summary); saveCache.setSummary(summary);
saveCache.setFileUrl(fileUrl); saveCache.setFileUrl(fileUrl);
saveCache.setMindmap(mindMap); saveCache.setMindmap(mindMap);
saveCache.setKeyword(keyWord); saveCache.setKeyword(ObjectUtil.isNotEmpty(keyWord) ?
JsonUtils.serialize(keyWord) : StringUtils.EMPTY);
saveCache.setFileMd5(fileMD5); saveCache.setFileMd5(fileMD5);
saveCache.setIsDeleted(CommonConstant.IsDeleted.N); saveCache.setIsDeleted(CommonConstant.IsDeleted.N);
bizLongTextSummaryCacheService.save(saveCache); bizLongTextSummaryCacheService.save(saveCache);
} }
} }
if (StringUtils.isBlank(summary)){ if (StringUtils.isBlank(summary)) {
throw new BusinessException("获取/生成摘要失败"); throw new BusinessException("获取/生成摘要失败");
} }
if (StringUtils.isBlank(keyWord)){ if (ObjectUtil.isEmpty(keyWord)) {
throw new BusinessException("获取/生成关键词失败"); throw new BusinessException("获取/生成关键词失败");
} }
if (StringUtils.isBlank(corePoint)){ if (StringUtils.isBlank(corePoint)) {
throw new BusinessException("获取/生成核心观点失败"); throw new BusinessException("获取/生成核心观点失败");
} }
if (StringUtils.isBlank(mindMap)){ if (StringUtils.isBlank(mindMap)) {
throw new BusinessException("获取/生成思维导图失败"); throw new BusinessException("获取/生成思维导图失败");
} }
// 返回摘要和核心观点 // 返回摘要和核心观点
// 转换格式 json to list // 转换格式 json to list
int startKeyWord = keyWord.lastIndexOf("[");
int endKeyWord = keyWord.lastIndexOf("]");
int startMindMap = mindMap.indexOf("{"); int startMindMap = mindMap.indexOf("{");
int endMindMap = mindMap.lastIndexOf("}"); int endMindMap = mindMap.lastIndexOf("}");
...@@ -367,8 +373,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -367,8 +373,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
LongTextSummaryDto longTextSummaryDto = new LongTextSummaryDto(); LongTextSummaryDto longTextSummaryDto = new LongTextSummaryDto();
longTextSummaryDto.setSummary(summary); longTextSummaryDto.setSummary(summary);
longTextSummaryDto.setCorePoint(corePoint); longTextSummaryDto.setCorePoint(corePoint);
longTextSummaryDto.setKeyword(JsonUtils.deSerialize(keyWord.substring(startKeyWord, endKeyWord + 1), new TypeReference<List<String>>() { longTextSummaryDto.setKeyword(keyWord);
}.getType()));
longTextSummaryDto.setMindMap(JsonUtils.deSerialize(mindMap.substring(startMindMap, endMindMap + 1), MindMap.class)); longTextSummaryDto.setMindMap(JsonUtils.deSerialize(mindMap.substring(startMindMap, endMindMap + 1), MindMap.class));
return longTextSummaryDto; return longTextSummaryDto;
} }
...@@ -699,7 +704,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -699,7 +704,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
} }
} }
private String createKeyWord(String document) { private KeyWord createKeyWord(String document) {
// 获取对话提示词 // 获取对话提示词
String promptCode = "CreateKeyWordPrompt"; String promptCode = "CreateKeyWordPrompt";
BizAgentApplicationGcConfigEntity documentDialoguePrompt = bizAgentApplicationGcConfigService.getByConfigCode(promptCode); BizAgentApplicationGcConfigEntity documentDialoguePrompt = bizAgentApplicationGcConfigService.getByConfigCode(promptCode);
...@@ -750,13 +755,112 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService { ...@@ -750,13 +755,112 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
} }
} }
bufferedReader.close(); bufferedReader.close();
return summary.toString(); if (StringUtils.isBlank(summary.toString())) {
logger.error("------------ summary is blank , check llm config -------------");
throw new BusinessException("关键词提取失败");
}
String keywordStr = summary.toString();
int start = keywordStr.lastIndexOf("[");
int end = keywordStr.lastIndexOf("]");
List<String> keywords = JsonUtils.deSerialize(keywordStr.substring(start, end + 1), new TypeReference<List<String>>() {
}.getType());
if (CollectionUtils.isEmpty(keywords)) {
logger.error("------------ keywords is blank , check llm config -------------");
throw new BusinessException("关键词提取失败");
}
//匹配关键词在文章中句子[按标点符号切分]/计算关键词频率
Map<String, Integer> keywordCountMap = KeyWordCounter.keywordCount(document, keywords);
Map<String, List<String>> matchSentenceMap = KeyWordCounter.keywordMatchSentence(document, keywords);
List<MentionFrequency> mentionFrequencies = new ArrayList<>();
for (String keyword : keywordCountMap.keySet()) {
MentionFrequency mentionFrequency = new MentionFrequency();
mentionFrequency.setKeyword(keyword);
mentionFrequency.setSentences(matchSentenceMap.get(keyword).stream().distinct().collect(Collectors.toList()));
mentionFrequency.setFrequency(keywordCountMap.get(keyword));
mentionFrequencies.add(mentionFrequency);
}
// 相关概念
List<String> concepts = createConcepts(document);
KeyWord keyWord = new KeyWord();
keyWord.setKeyWord(new ArrayList<>(keywordCountMap.keySet()));
keyWord.setMentionFrequency(mentionFrequencies);
keyWord.setConcepts(concepts);
return keyWord;
} catch (Exception e) { } catch (Exception e) {
logger.error("获取关键词失败", e); logger.error("获取关键词失败", e);
throw new BusinessException("获取关键词失败"); throw new BusinessException("获取关键词失败");
} }
} }
/**
 * Asks the LLM for the concepts related to a document.
 *
 * <p>The system prompt is read from the {@code CreateConceptsPrompt} configuration and its
 * {@code {document}} placeholder is replaced with the document text. The streamed reply is
 * concatenated, and the text between the first {@code [} and the last {@code ]} is
 * deserialized as a JSON array of strings.
 *
 * <p>This step is best-effort: any failure while calling the model or parsing its reply is
 * logged and yields an empty list. Only a missing prompt configuration is reported to the caller.
 *
 * @param document plain text of the document
 * @return the concepts parsed from the reply, or {@code ListUtils.EMPTY_LIST} when the reply is
 *         blank, contains no JSON array, or the call fails
 * @throws BusinessException if the prompt configuration is missing or its system prompt is blank
 */
private List<String> createConcepts(String document) {
    // Load the prompt configuration
    String promptCode = "CreateConceptsPrompt";
    BizAgentApplicationGcConfigEntity documentDialoguePrompt = bizAgentApplicationGcConfigService.getByConfigCode(promptCode);
    if (documentDialoguePrompt == null || StringUtils.isBlank(documentDialoguePrompt.getConfigSystem())) {
        logger.error("获取对话提示词失败 , configCode:{}", promptCode);
        throw new BusinessException("获取对话提示词失败");
    }
    String prompt = documentDialoguePrompt.getConfigSystem();
    String largeModel = documentDialoguePrompt.getLargeModel();
    prompt = prompt.replace("{document}", document);

    // Build the messages: system prompt followed by the user instruction
    List<Message> messages = new ArrayList<>();
    Message systemMessage = new Message();
    systemMessage.setContent(prompt);
    systemMessage.setRole(LLMRoleEnum.SYSTEM.getRole());
    messages.add(systemMessage);
    Message questionMessage = new Message();
    questionMessage.setContent("生成相关概念");
    questionMessage.setRole(LLMRoleEnum.USER.getRole());
    messages.add(questionMessage);

    // Call the LLM in streaming mode
    LargeModelResponse response = new LargeModelResponse();
    response.setModel(largeModel);
    response.setMessages(messages.toArray(new Message[0]));
    response.setStream(true);
    // NOTE(review): the user tag names the mind-map feature although this request generates
    // concepts; it looks copy-pasted. Left unchanged here; confirm the intended value.
    response.setUser("CreateMindMap");

    // try-with-resources closes the stream even when reading or parsing fails
    try (BufferedReader bufferedReader = llmService.chatChunk(response)) {
        StringBuilder summary = new StringBuilder();
        String res;
        while ((res = bufferedReader.readLine()) != null) {
            if (StringUtils.isEmpty(res)) {
                continue;
            }
            res = res.replace("data: ", StringUtils.EMPTY);
            LargeModelDemandResult chunk = JsonUtils.deSerialize(res, LargeModelDemandResult.class);
            boolean chunkMissing = ObjectUtil.isEmpty(chunk);
            // Constant-first equals keeps a null code from throwing
            if (chunkMissing || !"0".equals(chunk.getCode())) {
                logger.error("LLM Error,code:{}", chunkMissing ? null : chunk.getCode());
                throw new BusinessException("生成相关概念失败");
            }
            if (StringUtils.isNotBlank(chunk.getMessage())) {
                summary.append(chunk.getMessage());
            }
        }

        String reply = summary.toString();
        if (StringUtils.isBlank(reply)) {
            return ListUtils.EMPTY_LIST;
        }
        // The model may wrap the JSON array in extra text, so cut out the outermost [...] span
        int arrayStart = reply.indexOf("[");
        int arrayEnd = reply.lastIndexOf("]");
        if (arrayStart < 0 || arrayEnd < arrayStart) {
            logger.error("-----------生成相关概念失败------------ no JSON array in LLM reply");
            return ListUtils.EMPTY_LIST;
        }
        return JsonUtils.deSerialize(reply.substring(arrayStart, arrayEnd + 1), new TypeReference<List<String>>() {
        }.getType());
    } catch (Exception e) {
        // Concepts are optional: log and fall back to an empty list
        logger.error("-----------生成相关概念失败------------", e);
        return ListUtils.EMPTY_LIST;
    }
}
private String createMindMap(String document) { private String createMindMap(String document) {
// 获取对话提示词 // 获取对话提示词
......
package cn.com.poc.long_document.domain.keyword;
/**
 * Placeholder type in the keyword domain package; it currently declares no fields or methods.
 * NOTE(review): nothing visible in this change references this class; confirm whether it is
 * still needed.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class Concepts {
}
package cn.com.poc.long_document.domain.keyword;
import java.util.List;
/**
 * Keyword-extraction result for a document: the keyword list, per-keyword mention statistics
 * and the related concepts.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class KeyWord {

    /**
     * Keywords for the keyword cloud (at most 10).
     */
    private List<String> keyWord;

    /**
     * Mention frequency of each keyword (at most 10).
     */
    private List<MentionFrequency> mentionFrequency;

    /**
     * Related concepts (at most 10).
     */
    private List<String> concepts;

    /**
     * @return the keywords for the keyword cloud
     */
    public List<String> getKeyWord() {
        return keyWord;
    }

    /**
     * @param keyWord the keywords for the keyword cloud
     */
    public void setKeyWord(List<String> keyWord) {
        this.keyWord = keyWord;
    }

    /**
     * @return the mention statistics, one entry per keyword
     */
    public List<MentionFrequency> getMentionFrequency() {
        return mentionFrequency;
    }

    /**
     * @param mentionFrequency the mention statistics, one entry per keyword
     */
    public void setMentionFrequency(List<MentionFrequency> mentionFrequency) {
        this.mentionFrequency = mentionFrequency;
    }

    /**
     * @return the related concepts
     */
    public List<String> getConcepts() {
        return concepts;
    }

    /**
     * @param concepts the related concepts
     */
    public void setConcepts(List<String> concepts) {
        this.concepts = concepts;
    }
}
package cn.com.poc.long_document.domain.keyword;
import java.util.List;
/**
 * Mention statistics for a single keyword: the keyword itself, the sentences associated with
 * it and its frequency.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class MentionFrequency {

    /**
     * The keyword.
     */
    private String keyword;

    /**
     * Sentences associated with the keyword.
     */
    private List<String> sentences;

    /**
     * Occurrence frequency of the keyword.
     */
    private Integer frequency;

    /**
     * @return the keyword
     */
    public String getKeyword() {
        return this.keyword;
    }

    /**
     * @param keyword the keyword
     */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /**
     * @return the sentences associated with the keyword
     */
    public List<String> getSentences() {
        return this.sentences;
    }

    /**
     * @param sentences the sentences associated with the keyword
     */
    public void setSentences(List<String> sentences) {
        this.sentences = sentences;
    }

    /**
     * @return the occurrence frequency
     */
    public Integer getFrequency() {
        return this.frequency;
    }

    /**
     * @param frequency the occurrence frequency
     */
    public void setFrequency(Integer frequency) {
        this.frequency = frequency;
    }
}
package cn.com.poc.long_document.dto; package cn.com.poc.long_document.dto;
import cn.com.poc.long_document.domain.MindMap; import cn.com.poc.long_document.domain.MindMap;
import cn.com.poc.long_document.domain.keyword.KeyWord;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.annotation.JsonInclude;
...@@ -18,15 +19,15 @@ public class LongTextSummaryDto { ...@@ -18,15 +19,15 @@ public class LongTextSummaryDto {
private String corePoint; private String corePoint;
private List<String> keyword; private KeyWord keyword;
private MindMap mindMap; private MindMap mindMap;
public List<String> getKeyword() { public KeyWord getKeyword() {
return keyword; return keyword;
} }
public void setKeyword(List<String> keyword) { public void setKeyword(KeyWord keyword) {
this.keyword = keyword; this.keyword = keyword;
} }
......
...@@ -38,6 +38,29 @@ public class KeyWordCounter { ...@@ -38,6 +38,29 @@ public class KeyWordCounter {
return sortedMap; return sortedMap;
} }
/**
 * Collects, for every keyword, the sentences of the text that contain it.
 *
 * <p>A sentence is a run of characters without any delimiter, ended either by one delimiter
 * (which is kept in the returned sentence) or by the end of the text. Delimiters are the comma,
 * colon, semicolon, full stop, exclamation mark and question mark in both ASCII and full-width
 * form, the ideographic full stop, and the line feed. Matching is case-insensitive and each
 * sentence is trimmed.
 *
 * <p>Every keyword of the input gets an entry in the result, so callers can look a keyword up
 * without a null check; the entry is an empty list when nothing matches, when {@code content}
 * is {@code null}, or when the keyword is {@code null} or empty.
 *
 * @param content  the text to search; may be {@code null}
 * @param keywords the keywords to look for; may be {@code null}
 * @return keyword to matching sentences, in keyword iteration order; empty when
 *         {@code keywords} is {@code null}
 */
public static Map<String, List<String>> keywordMatchSentence(String content, List<String> keywords) {
    Map<String, List<String>> result = new LinkedHashMap<>();
    if (keywords == null) {
        return result;
    }
    // Both ASCII and full-width punctuation are listed so that Chinese text is split as well
    final String delimiters = ",\uFF0C:\uFF1A;\uFF1B.\u3002!\uFF01?\uFF1F\\n";
    final String sentenceBody = "[^" + delimiters + "]*";
    // \z lets the last sentence match even when the text does not end with a delimiter
    final String sentenceEnd = "(?:[" + delimiters + "]|\\z)";
    for (String keyword : keywords) {
        List<String> sentences = new ArrayList<>();
        if (content != null && keyword != null && !keyword.isEmpty()) {
            // Quote the keyword so regex metacharacters in it are matched literally
            Pattern pattern = Pattern.compile(
                    sentenceBody + Pattern.quote(keyword) + sentenceBody + sentenceEnd,
                    Pattern.CASE_INSENSITIVE);
            Matcher matcher = pattern.matcher(content);
            while (matcher.find()) {
                sentences.add(matcher.group().trim());
            }
        }
        result.put(keyword, sentences);
    }
    return result;
}
/** /**
* 缓存关键词计数 * 缓存关键词计数
......
package cn.com.poc.meeting; package cn.com.poc.meeting;
import cn.com.yict.framemax.core.spring.SingleContextInitializer; import cn.com.yict.framemax.core.spring.SingleContextInitializer;
import org.junit.Test;
import org.junit.runner.RunWith; import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration; import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.test.context.web.WebAppConfiguration; import org.springframework.test.context.web.WebAppConfiguration;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@RunWith(SpringJUnit4ClassRunner.class) @RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(initializers = SingleContextInitializer.class) @ContextConfiguration(initializers = SingleContextInitializer.class)
@WebAppConfiguration @WebAppConfiguration
public class MeetingTest { public class MeetingTest {
/**
 * Manual check of the sentence-matching regex: prints, in double quotes, every sentence of a
 * sample text that contains the keyword.
 */
@Test
public void test() {
    final String text = "This is an example sentence. Another example? Yes, this is an example.";
    final String keyword = "an example";

    // The keyword is quoted so it is matched literally; a sentence is a delimiter-free run
    // around the keyword, closed by one punctuation mark or a line feed.
    final String sentenceRegex =
            "[^,,::;;.。!?!?\\n]*" + Pattern.quote(keyword) + "[^,,::;;.。!?!?\\n]*[,,::;;.。!?!?\\n]";
    final Matcher matcher = Pattern.compile(sentenceRegex, Pattern.CASE_INSENSITIVE).matcher(text);

    // Print each matching sentence
    for (boolean found = matcher.find(); found; found = matcher.find()) {
        String sentence = matcher.group().trim();
        System.out.println("\"" + sentence + "\"");
    }
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment