Commit 9be410a5 authored by alex yao

feat: 关键词提取

parent 72890a4b
......@@ -14,6 +14,7 @@ import cn.com.poc.common.model.BizFileUploadRecordModel;
import cn.com.poc.common.service.BizFileUploadRecordService;
import cn.com.poc.common.utils.DocumentLoad;
import cn.com.poc.common.utils.JsonUtils;
import cn.com.poc.common.utils.ListUtils;
import cn.com.poc.common.utils.SSEUtil;
import cn.com.poc.knowledge.aggregate.KnowledgeService;
import cn.com.poc.knowledge.constant.KnowledgeConstant;
......@@ -23,12 +24,15 @@ import cn.com.poc.knowledge.service.BizKnowledgeDocumentService;
import cn.com.poc.long_document.aggregate.LongTextDialoguesService;
import cn.com.poc.long_document.domain.LongtextDialoguesResult;
import cn.com.poc.long_document.domain.MindMap;
import cn.com.poc.long_document.domain.keyword.KeyWord;
import cn.com.poc.long_document.domain.keyword.MentionFrequency;
import cn.com.poc.long_document.dto.LongTextExampleDto;
import cn.com.poc.long_document.dto.LongTextSummaryDto;
import cn.com.poc.long_document.entity.BizLongTextSummaryCacheEntity;
import cn.com.poc.long_document.model.BizLongTextExampleModel;
import cn.com.poc.long_document.service.BizLongTextExampleService;
import cn.com.poc.long_document.service.BizLongTextSummaryCacheService;
import cn.com.poc.meeting.cache.KeyWordCounter;
import cn.com.poc.thirdparty.resource.demand.ai.aggregate.DemandKnowledgeService;
import cn.com.poc.thirdparty.resource.demand.ai.constants.KnowledgeSearchTypeEnum;
import cn.com.poc.thirdparty.resource.demand.ai.constants.LLMRoleEnum;
......@@ -62,6 +66,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
......@@ -240,7 +245,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
String summary = StringUtils.EMPTY;
String corePoint = StringUtils.EMPTY;
String mindMap = StringUtils.EMPTY;
String keyWord = StringUtils.EMPTY;
KeyWord keyWord = null;
boolean summaryNeedGenerate = false;
boolean corePointNeedGenerate = false;
......@@ -259,7 +264,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
BizLongTextSummaryCacheEntity longTextSummaryCacheEntity = bizLongTextSummaryCacheEntities.get(0);
summary = longTextSummaryCacheEntity.getSummary();
corePoint = longTextSummaryCacheEntity.getCorePoint();
keyWord = longTextSummaryCacheEntity.getKeyword();
keyWord = StringUtils.isNotBlank(longTextSummaryCacheEntity.getKeyword()) ?
JsonUtils.deSerialize(longTextSummaryCacheEntity.getKeyword(), KeyWord.class) : null;
mindMap = longTextSummaryCacheEntity.getMindmap();
}
......@@ -267,7 +273,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
summaryNeedGenerate = StringUtils.isBlank(summary);
corePointNeedGenerate = StringUtils.isBlank(corePoint);
mindMapNeedGenerate = StringUtils.isBlank(mindMap);
keyWordNeedGenerate = StringUtils.isBlank(keyWord);
keyWordNeedGenerate = ObjectUtil.isEmpty(keyWord);
String document = DocumentLoad.documentToText(file);
......@@ -306,8 +312,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
});
boolean finalKeyWordNeedGenerate = keyWordNeedGenerate;
String finalKeyWord = keyWord;
CompletableFuture<String> keyWordFuture = CompletableFuture.supplyAsync(() ->
KeyWord finalKeyWord = keyWord;
CompletableFuture<KeyWord> keyWordFuture = CompletableFuture.supplyAsync(() ->
finalKeyWordNeedGenerate ? createKeyWord(document) : finalKeyWord).exceptionally(throwable -> {
logger.error("生成关键词失败", throwable);
throw new BusinessException(throwable);
......@@ -329,7 +335,8 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
updateCacheEntity.setCorePoint(corePoint);
updateCacheEntity.setSummary(summary);
updateCacheEntity.setMindmap(mindMap);
updateCacheEntity.setKeyword(keyWord);
updateCacheEntity.setKeyword(ObjectUtil.isNotEmpty(keyWord) ?
JsonUtils.serialize(keyWord) : StringUtils.EMPTY);
bizLongTextSummaryCacheService.update(updateCacheEntity);
} else {
BizLongTextSummaryCacheEntity saveCache = new BizLongTextSummaryCacheEntity();
......@@ -337,29 +344,28 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
saveCache.setSummary(summary);
saveCache.setFileUrl(fileUrl);
saveCache.setMindmap(mindMap);
saveCache.setKeyword(keyWord);
saveCache.setKeyword(ObjectUtil.isNotEmpty(keyWord) ?
JsonUtils.serialize(keyWord) : StringUtils.EMPTY);
saveCache.setFileMd5(fileMD5);
saveCache.setIsDeleted(CommonConstant.IsDeleted.N);
bizLongTextSummaryCacheService.save(saveCache);
}
}
if (StringUtils.isBlank(summary)){
if (StringUtils.isBlank(summary)) {
throw new BusinessException("获取/生成摘要失败");
}
if (StringUtils.isBlank(keyWord)){
if (ObjectUtil.isEmpty(keyWord)) {
throw new BusinessException("获取/生成关键词失败");
}
if (StringUtils.isBlank(corePoint)){
if (StringUtils.isBlank(corePoint)) {
throw new BusinessException("获取/生成核心观点失败");
}
if (StringUtils.isBlank(mindMap)){
if (StringUtils.isBlank(mindMap)) {
throw new BusinessException("获取/生成思维导图失败");
}
// 返回摘要和核心观点
// 转换格式 json to list
int startKeyWord = keyWord.lastIndexOf("[");
int endKeyWord = keyWord.lastIndexOf("]");
int startMindMap = mindMap.indexOf("{");
int endMindMap = mindMap.lastIndexOf("}");
......@@ -367,8 +373,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
LongTextSummaryDto longTextSummaryDto = new LongTextSummaryDto();
longTextSummaryDto.setSummary(summary);
longTextSummaryDto.setCorePoint(corePoint);
longTextSummaryDto.setKeyword(JsonUtils.deSerialize(keyWord.substring(startKeyWord, endKeyWord + 1), new TypeReference<List<String>>() {
}.getType()));
longTextSummaryDto.setKeyword(keyWord);
longTextSummaryDto.setMindMap(JsonUtils.deSerialize(mindMap.substring(startMindMap, endMindMap + 1), MindMap.class));
return longTextSummaryDto;
}
......@@ -699,7 +704,7 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
}
}
private String createKeyWord(String document) {
private KeyWord createKeyWord(String document) {
// 获取对话提示词
String promptCode = "CreateKeyWordPrompt";
BizAgentApplicationGcConfigEntity documentDialoguePrompt = bizAgentApplicationGcConfigService.getByConfigCode(promptCode);
......@@ -750,13 +755,112 @@ public class LongTextDialoguesServiceImpl implements LongTextDialoguesService {
}
}
bufferedReader.close();
return summary.toString();
if (StringUtils.isBlank(summary.toString())) {
logger.error("------------ summary is blank , check llm config -------------");
throw new BusinessException("关键词提取失败");
}
String keywordStr = summary.toString();
int start = keywordStr.lastIndexOf("[");
int end = keywordStr.lastIndexOf("]");
List<String> keywords = JsonUtils.deSerialize(keywordStr.substring(start, end + 1), new TypeReference<List<String>>() {
}.getType());
if (CollectionUtils.isEmpty(keywords)) {
logger.error("------------ keywords is blank , check llm config -------------");
throw new BusinessException("关键词提取失败");
}
//匹配关键词在文章中句子[按标点符号切分]/计算关键词频率
Map<String, Integer> keywordCountMap = KeyWordCounter.keywordCount(document, keywords);
Map<String, List<String>> matchSentenceMap = KeyWordCounter.keywordMatchSentence(document, keywords);
List<MentionFrequency> mentionFrequencies = new ArrayList<>();
for (String keyword : keywordCountMap.keySet()) {
MentionFrequency mentionFrequency = new MentionFrequency();
mentionFrequency.setKeyword(keyword);
mentionFrequency.setSentences(matchSentenceMap.get(keyword).stream().distinct().collect(Collectors.toList()));
mentionFrequency.setFrequency(keywordCountMap.get(keyword));
mentionFrequencies.add(mentionFrequency);
}
// 相关概念
List<String> concepts = createConcepts(document);
KeyWord keyWord = new KeyWord();
keyWord.setKeyWord(new ArrayList<>(keywordCountMap.keySet()));
keyWord.setMentionFrequency(mentionFrequencies);
keyWord.setConcepts(concepts);
return keyWord;
} catch (Exception e) {
logger.error("获取关键词失败", e);
throw new BusinessException("获取关键词失败");
}
}
/**
 * Asks the LLM for the concepts related to a document and returns them as a list.
 *
 * <p>The system prompt is loaded from the config entry {@code CreateConceptsPrompt}; its
 * {@code {document}} placeholder is replaced with the document text. The streamed answer is
 * concatenated and the span from the first {@code [} to the last {@code ]} is parsed as a JSON
 * array of strings.
 *
 * <p>Once the prompt has been loaded this method is best-effort: any failure while calling the
 * LLM or parsing its answer is logged and yields an empty list instead of an exception.
 *
 * @param document full text of the document to analyse
 * @return the concepts parsed from the LLM answer, or an empty list when the answer is blank,
 *         contains no JSON array, or any error occurs while calling/parsing
 * @throws BusinessException if the prompt configuration is missing or has a blank system prompt
 */
private List<String> createConcepts(String document) {
    // Load the prompt configuration
    String promptCode = "CreateConceptsPrompt";
    BizAgentApplicationGcConfigEntity promptConfig = bizAgentApplicationGcConfigService.getByConfigCode(promptCode);
    if (promptConfig == null || StringUtils.isBlank(promptConfig.getConfigSystem())) {
        logger.error("获取对话提示词失败 , configCode:{}", promptCode);
        throw new BusinessException("获取对话提示词失败");
    }
    String largeModel = promptConfig.getLargeModel();
    String prompt = promptConfig.getConfigSystem().replace("{document}", document);

    // Build the messages: system prompt followed by the user instruction
    List<Message> messages = new ArrayList<>();
    Message systemMessage = new Message();
    systemMessage.setContent(prompt);
    systemMessage.setRole(LLMRoleEnum.SYSTEM.getRole());
    messages.add(systemMessage);

    Message questionMessage = new Message();
    questionMessage.setContent("生成相关概念");
    questionMessage.setRole(LLMRoleEnum.USER.getRole());
    messages.add(questionMessage);

    // Build the LLM request (streaming)
    LargeModelResponse request = new LargeModelResponse();
    request.setModel(largeModel);
    request.setMessages(messages.toArray(new Message[0]));
    request.setStream(true);
    // NOTE(review): the user tag is "CreateMindMap" although this method generates concepts;
    // it looks copied from the mind-map generator. Left unchanged - confirm before renaming.
    request.setUser("CreateMindMap");

    StringBuilder answer = new StringBuilder();
    // try-with-resources guarantees the stream is closed even when a chunk fails to parse
    try (BufferedReader reader = llmService.chatChunk(request)) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (StringUtils.isEmpty(line)) {
                continue;
            }
            line = line.replace("data: ", StringUtils.EMPTY);
            LargeModelDemandResult chunk = JsonUtils.deSerialize(line, LargeModelDemandResult.class);
            // Constant-first equals keeps a null code from raising a NullPointerException
            if (ObjectUtil.isEmpty(chunk) || !"0".equals(chunk.getCode())) {
                logger.error("LLM Error,code:{}", ObjectUtil.isEmpty(chunk) ? null : chunk.getCode());
                throw new BusinessException("生成相关概念失败");
            }
            if (StringUtils.isNotBlank(chunk.getMessage())) {
                answer.append(chunk.getMessage());
            }
        }

        String text = answer.toString();
        if (StringUtils.isBlank(text)) {
            return ListUtils.EMPTY_LIST;
        }
        // The model may wrap the array in extra prose; keep only the outermost [...] span
        int arrayStart = text.indexOf("[");
        int arrayEnd = text.lastIndexOf("]");
        if (arrayStart < 0 || arrayEnd < arrayStart) {
            logger.error("-----------生成相关概念失败------------ , llm output has no json array");
            return ListUtils.EMPTY_LIST;
        }
        List<String> concepts = JsonUtils.deSerialize(text.substring(arrayStart, arrayEnd + 1), new TypeReference<List<String>>() {
        }.getType());
        if (concepts == null) {
            return ListUtils.EMPTY_LIST;
        }
        return concepts;
    } catch (Exception e) {
        logger.error("-----------生成相关概念失败------------", e);
        return ListUtils.EMPTY_LIST;
    }
}
private String createMindMap(String document) {
// 获取对话提示词
......
package cn.com.poc.long_document.domain.keyword;
/**
 * Placeholder type in the keyword domain package; it currently declares no fields or methods.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class Concepts {
}
package cn.com.poc.long_document.domain.keyword;
import java.util.List;
/**
 * Keyword extraction result for a document: the keyword list, the per-keyword mention
 * statistics and the related concepts.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class KeyWord {

    /**
     * Keywords for the keyword cloud (at most 10).
     */
    private List<String> keyWord;

    /**
     * Mention frequency of each keyword (at most 10).
     */
    private List<MentionFrequency> mentionFrequency;

    /**
     * Related concepts (at most 10).
     */
    private List<String> concepts;

    /**
     * @return the keywords for the keyword cloud
     */
    public List<String> getKeyWord() {
        return keyWord;
    }

    /**
     * @param keyWord the keywords for the keyword cloud
     */
    public void setKeyWord(List<String> keyWord) {
        this.keyWord = keyWord;
    }

    /**
     * @return the mention statistics, one entry per keyword
     */
    public List<MentionFrequency> getMentionFrequency() {
        return mentionFrequency;
    }

    /**
     * @param mentionFrequency the mention statistics, one entry per keyword
     */
    public void setMentionFrequency(List<MentionFrequency> mentionFrequency) {
        this.mentionFrequency = mentionFrequency;
    }

    /**
     * @return the related concepts
     */
    public List<String> getConcepts() {
        return concepts;
    }

    /**
     * @param concepts the related concepts
     */
    public void setConcepts(List<String> concepts) {
        this.concepts = concepts;
    }
}
package cn.com.poc.long_document.domain.keyword;
import java.util.List;
/**
 * Mention statistics of a single keyword: the keyword itself, the sentences it appears in
 * and how often it occurs.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class MentionFrequency {

    // ---- keyword ----

    /**
     * The keyword.
     */
    private String keyword;

    /**
     * @return the keyword
     */
    public String getKeyword() {
        return this.keyword;
    }

    /**
     * @param keyword the keyword
     */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    // ---- sentences ----

    /**
     * Sentences associated with the keyword.
     */
    private List<String> sentences;

    /**
     * @return the sentences associated with the keyword
     */
    public List<String> getSentences() {
        return this.sentences;
    }

    /**
     * @param sentences the sentences associated with the keyword
     */
    public void setSentences(List<String> sentences) {
        this.sentences = sentences;
    }

    // ---- frequency ----

    /**
     * Number of occurrences of the keyword.
     */
    private Integer frequency;

    /**
     * @return the number of occurrences of the keyword
     */
    public Integer getFrequency() {
        return this.frequency;
    }

    /**
     * @param frequency the number of occurrences of the keyword
     */
    public void setFrequency(Integer frequency) {
        this.frequency = frequency;
    }
}
package cn.com.poc.long_document.dto;
import cn.com.poc.long_document.domain.MindMap;
import cn.com.poc.long_document.domain.keyword.KeyWord;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude;
......@@ -18,15 +19,15 @@ public class LongTextSummaryDto {
private String corePoint;
private List<String> keyword;
private KeyWord keyword;
private MindMap mindMap;
public List<String> getKeyword() {
public KeyWord getKeyword() {
return keyword;
}
public void setKeyword(List<String> keyword) {
public void setKeyword(KeyWord keyword) {
this.keyword = keyword;
}
......
......@@ -38,6 +38,29 @@ public class KeyWordCounter {
return sortedMap;
}
/**
 * Extracts, for every keyword, the sentences (clauses) of the text that contain it.
 *
 * <p>A sentence is a run of characters without any delimiter, followed by one delimiter or by
 * the end of the text. Delimiters are the ASCII marks {@code , : ; . ! ?}, the line feed, the
 * ideographic full stop (U+3002) and the full-width forms of {@code , : ; ! ?}. Keywords are
 * matched literally and case-insensitively. Each match is trimmed and keeps its closing
 * delimiter when there is one.
 *
 * @param content  text to search; {@code null} is treated as empty text
 * @param keywords keywords to look for; {@code null} yields an empty map
 * @return an insertion-ordered map from each keyword to its matching sentences in order of
 *         appearance; a {@code null} or empty keyword maps to an empty list
 */
public static Map<String, List<String>> keywordMatchSentence(String content, List<String> keywords) {
    Map<String, List<String>> result = new LinkedHashMap<>();
    if (keywords == null) {
        return result;
    }
    // Regex-level \\uXXXX escapes keep the delimiter set independent of source-file encoding:
    // U+3002 ideographic full stop, U+FF0C/U+FF1A/U+FF1B/U+FF01/U+FF1F full-width , : ; ! ?
    final String delimiterSet = ",:;.!?\\n\\u3002\\uFF0C\\uFF1A\\uFF1B\\uFF01\\uFF1F";
    final String nonDelimiterRun = "[^" + delimiterSet + "]*";
    // Accept end-of-input as a terminator so a trailing, unpunctuated sentence is not lost
    final String sentenceEnd = "(?:[" + delimiterSet + "]|\\z)";
    final String text = content == null ? "" : content;

    for (String keyword : keywords) {
        List<String> sentences = new ArrayList<>();
        // An empty keyword would match every clause; treat it (and null) as "no matches"
        if (keyword != null && !keyword.isEmpty()) {
            // Quote the keyword so regex metacharacters in it are matched literally
            String sentenceRegex = nonDelimiterRun + Pattern.quote(keyword) + nonDelimiterRun + sentenceEnd;
            Matcher matcher = Pattern.compile(sentenceRegex, Pattern.CASE_INSENSITIVE).matcher(text);
            while (matcher.find()) {
                sentences.add(matcher.group().trim());
            }
        }
        result.put(keyword, sentences);
    }
    return result;
}
/**
* 缓存关键词计数
......
package cn.com.poc.meeting;
import cn.com.yict.framemax.core.spring.SingleContextInitializer;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.test.context.web.WebAppConfiguration;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(initializers = SingleContextInitializer.class)
@WebAppConfiguration
public class MeetingTest {

    /**
     * Manual smoke check of the sentence-matching regex: prints every clause of a sample text
     * that contains the keyword, each wrapped in double quotes. It makes no assertions.
     */
    @Test
    public void test() {
        final String sample = "This is an example sentence. Another example? Yes, this is an example.";
        final String needle = "an example";

        // Quote the keyword so regex metacharacters in it are matched literally.
        // The expression matches a delimiter-free run containing the keyword, ended by one
        // punctuation mark or line feed; mixed Chinese/English keywords are supported.
        final String sentenceRegex = "[^,,::;;.。!?!?\\n]*" + Pattern.quote(needle) + "[^,,::;;.。!?!?\\n]*[,,::;;.。!?!?\\n]";
        final Matcher sentences = Pattern.compile(sentenceRegex, Pattern.CASE_INSENSITIVE).matcher(sample);

        // Find and print each matching sentence.
        while (sentences.find()) {
            String clause = sentences.group().trim();
            System.out.println("\"" + clause + "\"");
        }
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment