Commit 6fba33c2 authored by alex yao

feat:合同信息提取插件

parent d1d8257c
......@@ -4,6 +4,7 @@ import cn.com.poc.common.utils.SpringUtils;
import cn.com.poc.thirdparty.resource.demand.ai.function.calculator.CalculatorFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.document_reader.DocumentReaderFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.document_understanding.DocumentUnderstandIngFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.extraction.ContractExtractionFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.html_reader.HtmlReaderFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.image_ocr.ImageOCRFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.long_document_reader.LongDocumentReaderFunction;
......@@ -46,6 +47,8 @@ public enum LargeModelFunctionEnum {
long_document_reader(LongDocumentReaderFunction.class),
contract_extraction(ContractExtractionFunction.class),
;
private Class<? extends AbstractLargeModelFunction> function;
......
package cn.com.poc.thirdparty.resource.demand.ai.function.extraction;
import cn.com.poc.agent_application.entity.Variable;
import cn.com.poc.common.utils.JsonUtils;
import cn.com.poc.thirdparty.resource.demand.ai.function.AbstractFunctionResult;
import cn.com.poc.thirdparty.resource.demand.ai.function.AbstractLargeModelFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.entity.FunctionLLMConfig;
import cn.com.poc.thirdparty.resource.demand.ai.function.entity.Parameters;
import cn.com.poc.thirdparty.resource.demand.ai.function.entity.Properties;
import cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.KeyInfo;
import cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.TextInClient;
import cn.hutool.core.collection.ListUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
/**
 * Large-model function plugin ("contract_extraction") that extracts key information from a
 * contract file.
 * <p>
 * The model is expected to supply a JSON array. Every element describes one field to extract
 * (name, similar names, field type, keywords); the contract's download URL is taken from the
 * first element. The collected definitions are passed to
 * {@link TextInClient#extraction(String, List)} and its raw response is returned as both the
 * function result and the prompt content.
 *
 * @author alex.yao
 * @date 2025/5/12
 */
@Component
public class ContractExtractionFunction extends AbstractLargeModelFunction {

    /** Human-readable description of the function, also published in the LLM config. */
    private static final String DESC = "合同关键信息抽取";

    /** Function schema advertised to the large model. */
    private final FunctionLLMConfig functionLLMConfig = new FunctionLLMConfig.FunctionLLMConfigBuilder()
            .name("contract_extraction")
            .parameters(new Parameters("array")
                    .addProperties("fileUrl", new Properties("string", "文件链接, 合同文件的在线地址"))
                    .addProperties("key_info", new Properties("string", "关键信息名称, 长度限制20个字符"))
                    .addProperties("paraphrase_names", new Properties("array", "相似名字段,字符串数组, 可根据相似名精准抽取关键信息, 最多填写3个,每个释义名称长度限制20个字符"))
                    .addProperties("field_type", new Properties("string", "字段类型字段, 可选项有,时间:time, 金额:amount, 地址:address, 公司:company, 姓名:name, 描述(长文本):long_text_description, 其他:other, 印章:stamp, 分别对应产品段配置的字段类型"))
                    .addProperties("keywords", new Properties("array", "关键字字段, 字符串数组, 可根据关键字信息,快速定位抽取信所在段落范围, 最多填写10个,且字符总长度不超过50"))
            )
            .description(DESC)
            .build();

    /**
     * Parses the model-supplied arguments and runs the contract extraction.
     *
     * @param content    JSON array produced by the model; each element is one field definition
     * @param identifier caller-supplied identifier (not used by this function)
     * @return a result whose function result and prompt content are the extraction response;
     *         an empty result when {@code content} holds no elements
     */
    @Override
    public AbstractFunctionResult<String> doFunction(String content, String identifier) {
        AbstractFunctionResult<String> result = new AbstractFunctionResult<>();
        JSONArray jsonArray = JSONArray.parseArray(content);
        // parseArray yields null for null/empty text, so guard before touching the array.
        if (jsonArray == null || jsonArray.isEmpty()) {
            return result;
        }

        String fileUrl = readFileUrl(jsonArray.getJSONObject(0));

        List<KeyInfo> keyInfos = new ArrayList<>(jsonArray.size());
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject jsonObject = jsonArray.getJSONObject(i);
            if (jsonObject == null) {
                // A null array element carries no field definition.
                continue;
            }
            KeyInfo keyInfo = new KeyInfo();
            // Missing keys come back as null, which matches KeyInfo's unset defaults.
            keyInfo.setField_type(jsonObject.getString("field_type"));
            keyInfo.setKey_info(jsonObject.getString("key_info"));
            keyInfo.setParaphrase_names(readStringArray(jsonObject, "paraphrase_names"));
            keyInfo.setKeywords(readStringArray(jsonObject, "keywords"));
            keyInfos.add(keyInfo);
        }

        TextInClient textInClient = new TextInClient();
        String extraction = textInClient.extraction(fileUrl, keyInfos);
        result.setFunctionResult(extraction);
        result.setPromptContent(extraction);
        return result;
    }

    /**
     * Reads the contract URL from the first argument object. The published schema names the
     * property {@code fileUrl}; the legacy key {@code file_url} is still accepted as a fallback.
     */
    private static String readFileUrl(JSONObject first) {
        if (first == null) {
            return null;
        }
        String fileUrl = first.getString("fileUrl");
        return fileUrl != null ? fileUrl : first.getString("file_url");
    }

    /**
     * Converts the JSON array stored under {@code key} into a string array, stringifying each
     * element so non-string values do not cause an {@link ArrayStoreException}.
     *
     * @return the values, or {@code null} when the key is absent or holds null
     */
    private static String[] readStringArray(JSONObject source, String key) {
        JSONArray values = source.getJSONArray(key);
        if (values == null) {
            return null;
        }
        String[] converted = new String[values.size()];
        for (int i = 0; i < converted.length; i++) {
            converted[i] = values.getString(i);
        }
        return converted;
    }

    /** @return the description of this function */
    @Override
    public String getDesc() {
        return DESC;
    }

    /** @return a single-element list holding the serialized function schema */
    @Override
    public List<String> getLLMConfig() {
        return ListUtil.toList(JsonUtils.serialize(functionLLMConfig));
    }

    /**
     * @param variableStructure ignored; this function's schema does not depend on variables
     * @return the same value as {@link #getLLMConfig()}
     */
    @Override
    public List<String> getLLMConfig(List<Variable> variableStructure) {
        return this.getLLMConfig();
    }
}
package cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity;
/**
 * Option block of a contract-extraction request.
 * <p>
 * All options are carried as strings and exposed as public fields; the snake_case field names
 * are kept as-is. NOTE(review): the fields are presumably serialized by name into the request
 * JSON — confirm against the extraction API before renaming any of them.
 *
 * @author alex.yao
 * @date 2025/5/12
 */
public class Config {

    /** Engine selector. */
    public String engine;

    /** PDF-parser switch, carried as a string. */
    public String use_pdf_parser;

    /** Semantic-match switch, carried as a string. */
    public String use_semantic_match;

    /** Watermark-removal switch, carried as a string. */
    public String remove_watermark;

    /**
     * Creates a fully populated option block.
     *
     * @param engine             value for {@link #engine}
     * @param use_pdf_parser     value for {@link #use_pdf_parser}
     * @param use_semantic_match value for {@link #use_semantic_match}
     * @param remove_watermark   value for {@link #remove_watermark}
     */
    public Config(String engine,
                  String use_pdf_parser,
                  String use_semantic_match,
                  String remove_watermark) {
        this.remove_watermark = remove_watermark;
        this.use_semantic_match = use_semantic_match;
        this.use_pdf_parser = use_pdf_parser;
        this.engine = engine;
    }
}
package cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity;
/**
 * Definition of one piece of key information to extract from a contract.
 * <p>
 * State is held in public snake_case fields that are also reachable through bean accessors.
 * Arrays are stored and returned by reference (no defensive copies).
 *
 * @author alex.yao
 * @date 2025/5/12
 */
public class KeyInfo {

    /** Name of the key information. */
    public String key_info;

    /** Alternative names for the key information. */
    public String[] paraphrase_names;

    /** Type of the field. */
    public String field_type;

    /** Table flag; {@code false} unless set. */
    public boolean is_in_table;

    /** Keywords associated with the key information. */
    public String[] keywords;

    /** Creates an empty definition: every reference is {@code null}, the flag is {@code false}. */
    public KeyInfo() {
        this(null, null, null, false, null);
    }

    /**
     * Creates a fully populated definition.
     *
     * @param key_info         name of the key information
     * @param paraphrase_names alternative names
     * @param field_type       type of the field
     * @param is_in_table      table flag
     * @param keywords         associated keywords
     */
    public KeyInfo(String key_info,
                   String[] paraphrase_names,
                   String field_type,
                   boolean is_in_table,
                   String[] keywords) {
        this.keywords = keywords;
        this.is_in_table = is_in_table;
        this.field_type = field_type;
        this.paraphrase_names = paraphrase_names;
        this.key_info = key_info;
    }

    /** @return name of the key information */
    public String getKey_info() {
        return this.key_info;
    }

    /** @param key_info name of the key information */
    public void setKey_info(String key_info) {
        this.key_info = key_info;
    }

    /** @return alternative names (same array instance that was stored) */
    public String[] getParaphrase_names() {
        return this.paraphrase_names;
    }

    /** @param paraphrase_names alternative names */
    public void setParaphrase_names(String[] paraphrase_names) {
        this.paraphrase_names = paraphrase_names;
    }

    /** @return type of the field */
    public String getField_type() {
        return this.field_type;
    }

    /** @param field_type type of the field */
    public void setField_type(String field_type) {
        this.field_type = field_type;
    }

    /** @return the table flag */
    public boolean isIs_in_table() {
        return this.is_in_table;
    }

    /** @param is_in_table the table flag */
    public void setIs_in_table(boolean is_in_table) {
        this.is_in_table = is_in_table;
    }

    /** @return associated keywords (same array instance that was stored) */
    public String[] getKeywords() {
        return this.keywords;
    }

    /** @param keywords associated keywords */
    public void setKeywords(String[] keywords) {
        this.keywords = keywords;
    }
}
package cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity;
/**
 * Payload of a contract-extraction request: the file, its name, the extraction options and the
 * list of fields to extract. All members are public fields with snake_case names preserved.
 *
 * @author alex.yao
 * @date 2025/5/12
 */
public class RequestData {

    /** Creator of the request. */
    public String creator;

    /** Extraction options. */
    public Config config;

    /** File content as a string. NOTE(review): presumably Base64-encoded — confirm with the API. */
    public String filedata;

    /** Name of the file. */
    public String filename;

    /** Definitions of the fields to extract. */
    public KeyInfo[] key_info_list;

    /**
     * Creates a fully populated request payload.
     *
     * @param creator       creator of the request
     * @param config        extraction options
     * @param filedata      file content as a string
     * @param filename      name of the file
     * @param key_info_list definitions of the fields to extract
     */
    public RequestData(String creator,
                       Config config,
                       String filedata,
                       String filename,
                       KeyInfo[] key_info_list) {
        this.key_info_list = key_info_list;
        this.filename = filename;
        this.filedata = filedata;
        this.config = config;
        this.creator = creator;
    }
}
......@@ -7,7 +7,7 @@ import cn.com.poc.thirdparty.resource.demand.ai.function.AbstractLargeModelFunct
import cn.com.poc.thirdparty.resource.demand.ai.function.entity.FunctionLLMConfig;
import cn.com.poc.thirdparty.resource.demand.ai.function.entity.Parameters;
import cn.com.poc.thirdparty.resource.demand.ai.function.entity.Properties;
import cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.OCRClient;
import cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.TextInClient;
import cn.hutool.core.collection.ListUtil;
import com.alibaba.fastjson.JSONObject;
import com.fasterxml.jackson.databind.JsonNode;
......@@ -63,7 +63,7 @@ public class PdfToMDFunction extends AbstractLargeModelFunction {
options.put("paratext_mode", "annotation");
options.put("parse_mode", "auto");
options.put("table_flavor", "md");
OCRClient client = new OCRClient();
TextInClient client = new TextInClient();
try {
String response = client.recognize(fileContent, options);
ObjectMapper mapper = new ObjectMapper();
......@@ -72,6 +72,10 @@ public class PdfToMDFunction extends AbstractLargeModelFunction {
String markdown = jsonNode.get("result").get("markdown").asText();
result.setPromptContent(markdown);
result.setFunctionResult(markdown);
} else {
logger.warn("text in 文档信息提取异常:{}", response);
result.setFunctionResult(response);
result.setPromptContent("FAIL");
}
return result;
} catch (Exception e) {
......
......@@ -5,26 +5,36 @@ package cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api;
* @date 2025/5/7
*/
import cn.com.poc.common.utils.DocumentLoad;
import cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.Config;
import cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.KeyInfo;
import cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.RequestData;
import cn.com.yict.framemax.core.exception.BusinessException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class OCRClient {
public class TextInClient {
private Logger logger = LoggerFactory.getLogger(OCRClient.class);
private Logger logger = LoggerFactory.getLogger(TextInClient.class);
private final String appId = "dafd04a574230c00ccba61132160de0c";
private final String secretCode = "3bc03c7e6f9402963e6e71d16d786a9c";
private final String baseUrl = "https://api.textin.com/ai/service/v1/pdf_to_markdown";
public OCRClient() {
public TextInClient() {
}
public String recognize(byte[] fileContent, HashMap<String, Object> options) throws IOException {
......@@ -37,16 +47,7 @@ public class OCRClient {
.append("=")
.append(URLEncoder.encode(entry.getValue().toString(), "UTF-8"));
}
String fullUrl = baseUrl + (queryParams.length() > 0 ? "?" + queryParams : "");
URL url = new URL(fullUrl);
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.setRequestMethod("POST");
connection.setRequestProperty("x-ti-app-id", appId);
connection.setRequestProperty("x-ti-secret-code", secretCode);
connection.setRequestProperty("Content-Type", "text/plain;charset=utf-8");
connection.setDoOutput(true);
HttpURLConnection connection = getRecoGinzeHttpURLConnection(queryParams);
try (OutputStream os = connection.getOutputStream()) {
os.write(fileContent);
os.flush();
......@@ -69,4 +70,73 @@ public class OCRClient {
}
}
/**
 * Downloads the contract at {@code fileUrl}, sends it Base64-encoded together with the requested
 * field definitions to the contract-extraction endpoint, and returns the raw response body.
 *
 * @param fileUrl     online address of the contract file
 * @param keyInfoList definitions of the fields to extract; {@code null} is treated as empty
 * @return the response body with line breaks removed
 * @throws BusinessException wrapping any {@link IOException}, including an HTTP error status
 */
public String extraction(String fileUrl, List<KeyInfo> keyInfoList) {
    HttpURLConnection connection = null;
    try {
        // Load the file and Base64-encode it for the JSON payload.
        // NOTE(review): the downloaded File is not deleted here — confirm whether
        // DocumentLoad.downloadURLDocument creates a temp file that needs cleanup.
        File file = DocumentLoad.downloadURLDocument(fileUrl);
        byte[] fileData = Files.readAllBytes(file.toPath());
        String base64FileData = Base64.getEncoder().encodeToString(fileData);
        String fileName = file.getName();

        // Build and serialize the request payload.
        Config config = new Config("table", "true", "true", "false");
        KeyInfo[] keyInfos = keyInfoList == null ? new KeyInfo[0] : keyInfoList.toArray(new KeyInfo[0]);
        RequestData requestData = new RequestData("", config, base64FileData, fileName, keyInfos);
        ObjectMapper objectMapper = new ObjectMapper();
        String requestDataJson = objectMapper.writeValueAsString(requestData);

        URL url = new URL("https://doc-compare.intsig.com/api/contracts/v3/extraction/external/create");
        connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("POST");
        connection.setRequestProperty("x-ti-app-id", appId);
        connection.setRequestProperty("x-ti-secret-code", secretCode);
        connection.setRequestProperty("Content-Type", "application/json");
        connection.setDoOutput(true);

        try (OutputStream os = connection.getOutputStream()) {
            byte[] input = requestDataJson.getBytes(StandardCharsets.UTF_8);
            os.write(input, 0, input.length);
        }

        int status = connection.getResponseCode();
        logger.info("Response Code: {}", status);

        if (status >= HttpURLConnection.HTTP_BAD_REQUEST) {
            // getInputStream() would throw without the body; surface the server's error text.
            String errorBody = readExtractionBody(connection.getErrorStream());
            throw new IOException("TextIn contract extraction failed, HTTP " + status + ": " + errorBody);
        }
        return readExtractionBody(connection.getInputStream());
    } catch (IOException e) {
        throw new BusinessException(e);
    } finally {
        if (connection != null) {
            connection.disconnect();
        }
    }
}

/**
 * Reads a response stream fully as UTF-8 text, concatenating lines without separators.
 *
 * @return the text, or an empty string when {@code stream} is {@code null}
 */
private static String readExtractionBody(InputStream stream) throws IOException {
    if (stream == null) {
        return "";
    }
    // Decode explicitly as UTF-8 instead of relying on the platform default charset.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
        StringBuilder response = new StringBuilder();
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            response.append(inputLine);
        }
        return response.toString();
    }
}
/**
 * Opens a POST connection to the pdf_to_markdown endpoint, carrying the credential headers and
 * a plain-text content type, with output enabled.
 *
 * @param queryParams already-encoded query string without the leading {@code ?}; may be empty
 * @return the configured connection
 * @throws IOException if the URL is malformed or the connection cannot be opened
 */
private HttpURLConnection getRecoGinzeHttpURLConnection(StringBuilder queryParams) throws IOException {
    String target = "https://api.textin.com/ai/service/v1/pdf_to_markdown";
    // Only append the separator when there is something to append.
    if (queryParams.length() > 0) {
        target = target + "?" + queryParams;
    }

    HttpURLConnection conn = (HttpURLConnection) new URL(target).openConnection();
    conn.setRequestMethod("POST");
    conn.setRequestProperty("x-ti-app-id", appId);
    conn.setRequestProperty("x-ti-secret-code", secretCode);
    conn.setRequestProperty("Content-Type", "text/plain;charset=utf-8");
    conn.setDoOutput(true);
    return conn;
}
}
\ No newline at end of file
package cn.com.poc.thirdparty.resource.demand.ai.function;
import cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.OCRClient;
import cn.com.poc.common.utils.JsonUtils;
import cn.com.poc.thirdparty.resource.demand.ai.function.extraction.ContractExtractionFunction;
import cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.TextInClient;
import cn.com.yict.framemax.core.spring.SingleContextInitializer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
......@@ -10,6 +12,7 @@ import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.test.context.web.WebAppConfiguration;
import javax.annotation.Resource;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
......@@ -40,7 +43,7 @@ public class PdfToMdFunctionTest {
options.put("paratext_mode", "annotation");
options.put("parse_mode", "auto");
options.put("table_flavor", "md");
OCRClient client = new OCRClient();
TextInClient client = new TextInClient();
try {
String response = client.recognize(fileContent, options);
ObjectMapper mapper = new ObjectMapper();
......@@ -48,9 +51,20 @@ public class PdfToMdFunctionTest {
if (jsonNode.has("result") && jsonNode.get("result").has("markdown")) {
String markdown = jsonNode.get("result").get("markdown").asText();
System.out.println(markdown);
}else{
System.out.println(response);
}
} catch (Exception e) {
System.out.println("1111111");
}
}
@Resource
private ContractExtractionFunction contractExtractionFunction;
/** Prints the LLM function config exposed by the injected contract-extraction function. */
@Test
public void test_cefunction() {
    String renderedConfig = String.valueOf(contractExtractionFunction.getLLMConfig());
    System.out.println(renderedConfig);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment