Commit 9be410a5 authored by alex yao

feat: 关键词提取

parent 72890a4b
package cn.com.poc.long_document.domain.keyword;
/**
 * Empty class in the keyword domain package; it currently declares no fields or methods.
 * NOTE(review): presumably a placeholder for a "related concepts" model - confirm the intended use.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class Concepts {
}
package cn.com.poc.long_document.domain.keyword;
import java.util.List;
/**
 * Bean holding the keyword information of a document: the keyword cloud,
 * per-keyword mention frequencies and related concepts.
 *
 * <p>Each list is documented as holding at most 10 entries; this class does
 * not enforce that limit itself.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class KeyWord {

    /**
     * Keywords for the keyword cloud (at most 10).
     */
    private List<String> keyWord;

    /**
     * Mention frequency of the keywords (at most 10).
     */
    private List<MentionFrequency> mentionFrequency;

    /**
     * Related concepts (at most 10).
     */
    private List<String> concepts;

    /**
     * @return the keyword-cloud keywords, or {@code null} if never set
     */
    public List<String> getKeyWord() {
        return keyWord;
    }

    /**
     * @param keyWord the keyword-cloud keywords to store
     */
    public void setKeyWord(List<String> keyWord) {
        this.keyWord = keyWord;
    }

    /**
     * @return the keyword mention frequencies, or {@code null} if never set
     */
    public List<MentionFrequency> getMentionFrequency() {
        return mentionFrequency;
    }

    /**
     * @param mentionFrequency the keyword mention frequencies to store
     */
    public void setMentionFrequency(List<MentionFrequency> mentionFrequency) {
        this.mentionFrequency = mentionFrequency;
    }

    /**
     * @return the related concepts, or {@code null} if never set
     */
    public List<String> getConcepts() {
        return concepts;
    }

    /**
     * @param concepts the related concepts to store
     */
    public void setConcepts(List<String> concepts) {
        this.concepts = concepts;
    }
}
package cn.com.poc.long_document.domain.keyword;
import java.util.List;
/**
 * Bean describing how often a single keyword is mentioned and in which
 * sentences it appears.
 *
 * @author alex.yao
 * @date 2025/9/4
 */
public class MentionFrequency {

    /** The keyword. */
    private String keyword;

    /** Sentences associated with the keyword. */
    private List<String> sentences;

    /** Number of occurrences of the keyword. */
    private Integer frequency;

    /**
     * @return the keyword, or {@code null} if never set
     */
    public String getKeyword() {
        return this.keyword;
    }

    /**
     * @param keyword the keyword to store
     */
    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    /**
     * @return the associated sentences, or {@code null} if never set
     */
    public List<String> getSentences() {
        return this.sentences;
    }

    /**
     * @param sentences the associated sentences to store
     */
    public void setSentences(List<String> sentences) {
        this.sentences = sentences;
    }

    /**
     * @return the occurrence count, or {@code null} if never set
     */
    public Integer getFrequency() {
        return this.frequency;
    }

    /**
     * @param frequency the occurrence count to store
     */
    public void setFrequency(Integer frequency) {
        this.frequency = frequency;
    }
}
package cn.com.poc.long_document.dto;
import cn.com.poc.long_document.domain.MindMap;
import cn.com.poc.long_document.domain.keyword.KeyWord;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude;
......@@ -18,15 +19,15 @@ public class LongTextSummaryDto {
private String corePoint;
private List<String> keyword;
private KeyWord keyword;
private MindMap mindMap;
public List<String> getKeyword() {
public KeyWord getKeyword() {
return keyword;
}
public void setKeyword(List<String> keyword) {
public void setKeyword(KeyWord keyword) {
this.keyword = keyword;
}
......
......@@ -38,6 +38,29 @@ public class KeyWordCounter {
return sortedMap;
}
/**
 * Finds, for every keyword, the sentences (clauses) of the text that contain it.
 *
 * <p>A sentence is a run of characters containing no delimiter, followed by the
 * delimiter that closes it when there is one. Delimiters are the ASCII and
 * full-width forms of comma, colon, semicolon, full stop, exclamation mark and
 * question mark, plus the line feed. The last clause of the text is matched even
 * when no delimiter follows it. Keywords are matched literally (regex
 * metacharacters are quoted) and case-insensitively for ASCII letters.
 *
 * @param content  text to search; {@code null} is treated as empty text
 * @param keywords keywords to look for; {@code null} yields an empty map and
 *                 {@code null} elements are skipped
 * @return map in keyword iteration order from each keyword to the trimmed
 *         sentences containing it (an empty list when nothing matches)
 */
public static Map<String, List<String>> keywordMatchSentence(String content, List<String> keywords) {
    Map<String, List<String>> result = new LinkedHashMap<>();
    if (keywords == null || keywords.isEmpty()) {
        return result;
    }
    String text = content == null ? "" : content;

    // Loop-invariant regex fragments. Full-width punctuation is written with
    // Unicode escapes so the pattern survives source-encoding mishaps.
    String delimiters = ",\uFF0C:\uFF1A;\uFF1B.\u3002!\uFF01?\uFF1F\\n";
    String clause = "[^" + delimiters + "]*";
    // The closing delimiter is optional so a clause ending the text still matches.
    String terminator = "[" + delimiters + "]?";

    for (String keyword : keywords) {
        if (keyword == null) {
            continue;
        }
        // Quote the keyword so regex metacharacters in it are matched literally.
        Pattern pattern = Pattern.compile(
                clause + Pattern.quote(keyword) + clause + terminator,
                Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(text);

        List<String> sentences = new ArrayList<>();
        while (matcher.find()) {
            String match = matcher.group();
            // An empty keyword can produce a zero-length match at the end of the text.
            if (!match.isEmpty()) {
                sentences.add(match.trim());
            }
        }
        result.put(keyword, sentences);
    }
    return result;
}
/**
* 缓存关键词计数
......
package cn.com.poc.meeting;
import cn.com.yict.framemax.core.spring.SingleContextInitializer;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.test.context.web.WebAppConfiguration;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(initializers = SingleContextInitializer.class)
@WebAppConfiguration
public class MeetingTest {
@Test
public void test() {
String keyword = "an example";
String text = "This is an example sentence. Another example? Yes, this is an example.";
// 将关键词转换为正则表达式模式(注意转义特殊字符)
String regexPattern = Pattern.quote(keyword);
// 构建匹配句子的正则表达式
// 匹配以任意标点符号或换行结尾的句子,包含关键词
// 支持中英文混合关键词匹配
String sentenceRegex = "[^,,::;;.。!?!?\\n]*" + regexPattern + "[^,,::;;.。!?!?\\n]*[,,::;;.。!?!?\\n]";
Pattern pattern = Pattern.compile(sentenceRegex, Pattern.CASE_INSENSITIVE);
Matcher matcher = pattern.matcher(text);
// 查找并输出匹配的句子
while (matcher.find()) {
System.out.println("\"" + matcher.group().trim() + "\"");
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment