Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in
Toggle navigation
P
poc-api
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
poc
poc-api
Commits
6fba33c2
Commit
6fba33c2
authored
May 12, 2025
by
alex yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:合同信息提取插件
parent
d1d8257c
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
361 additions
and
4 deletions
+361
-4
LargeModelFunctionEnum.java
...y/resource/demand/ai/function/LargeModelFunctionEnum.java
+3
-0
ContractExtractionFunction.java
...nd/ai/function/extraction/ContractExtractionFunction.java
+89
-0
Config.java
...resource/demand/ai/function/extraction/entity/Config.java
+20
-0
KeyInfo.java
...esource/demand/ai/function/extraction/entity/KeyInfo.java
+64
-0
RequestData.java
...rce/demand/ai/function/extraction/entity/RequestData.java
+21
-0
PdfToMDFunction.java
...ce/demand/ai/function/text_in_pdf2md/PdfToMDFunction.java
+6
-2
TextInClient.java
...e/demand/ai/function/text_in_pdf2md/api/TextInClient.java
+142
-0
PdfToMdFunctionTest.java
...arty/resource/demand/ai/function/PdfToMdFunctionTest.java
+16
-2
No files found.
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/LargeModelFunctionEnum.java
View file @
6fba33c2
...
...
@@ -4,6 +4,7 @@ import cn.com.poc.common.utils.SpringUtils;
import
cn.com.poc.thirdparty.resource.demand.ai.function.calculator.CalculatorFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.document_reader.DocumentReaderFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.document_understanding.DocumentUnderstandIngFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.extraction.ContractExtractionFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.html_reader.HtmlReaderFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.image_ocr.ImageOCRFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.long_document_reader.LongDocumentReaderFunction
;
...
...
@@ -46,6 +47,8 @@ public enum LargeModelFunctionEnum {
long_document_reader
(
LongDocumentReaderFunction
.
class
),
contract_extraction
(
ContractExtractionFunction
.
class
),
;
private
Class
<?
extends
AbstractLargeModelFunction
>
function
;
...
...
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/extraction/ContractExtractionFunction.java
0 → 100644
View file @
6fba33c2
package
cn
.
com
.
poc
.
thirdparty
.
resource
.
demand
.
ai
.
function
.
extraction
;
import
cn.com.poc.agent_application.entity.Variable
;
import
cn.com.poc.common.utils.JsonUtils
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.AbstractFunctionResult
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.AbstractLargeModelFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.entity.FunctionLLMConfig
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.entity.Parameters
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.entity.Properties
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.KeyInfo
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.TextInClient
;
import
cn.hutool.core.collection.ListUtil
;
import
com.alibaba.fastjson.JSONArray
;
import
com.alibaba.fastjson.JSONObject
;
import
org.springframework.stereotype.Component
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
 * Large-model function that extracts key information from a contract file.
 *
 * <p>The function is advertised to the model under the name {@code contract_extraction}.
 * The model is expected to call it with a JSON array of objects; the file URL is taken
 * from the first element and every element contributes one {@link KeyInfo} entry.
 * The actual extraction is delegated to {@link TextInClient#extraction}.
 *
 * @author alex.yao
 * @date 2025/5/12
 */
@Component
public class ContractExtractionFunction extends AbstractLargeModelFunction {

    /** Description of the function; also used as the description in the function definition. */
    private static final String DESC = "合同关键信息抽取";

    /** Function definition (name, parameter schema, description) exposed through {@link #getLLMConfig()}. */
    private final FunctionLLMConfig functionLLMConfig = new FunctionLLMConfig.FunctionLLMConfigBuilder()
            .name("contract_extraction")
            .parameters(new Parameters("array")
                    .addProperties("fileUrl", new Properties("string", "文件链接, 合同文件的在线地址"))
                    .addProperties("key_info", new Properties("string", "关键信息名称, 长度限制20个字符"))
                    .addProperties("paraphrase_names", new Properties("array", "相似名字段,字符串数组, 可根据相似名精准抽取关键信息, 最多填写3个,每个释义名称长度限制20个字符"))
                    .addProperties("field_type", new Properties("string", "字段类型字段, 可选项有,时间:time, 金额:amount, 地址:address, 公司:company, 姓名:name, 描述(长文本):long_text_description, 其他:other, 印章:stamp, 分别对应产品段配置的字段类型"))
                    .addProperties("keywords", new Properties("array", "关键字字段, 字符串数组, 可根据关键字信息,快速定位抽取信所在段落范围, 最多填写10个,且字符总长度不超过50"))
            )
            .description(DESC)
            .build();

    /**
     * Runs the contract extraction for the arguments produced by the model.
     *
     * @param content    JSON array text; each element may carry {@code key_info},
     *                   {@code paraphrase_names}, {@code field_type} and {@code keywords},
     *                   and the first element carries the file URL
     * @param identifier not used by this function
     * @return a result whose function result and prompt content are both the raw extraction
     *         response; an empty result when {@code content} holds no elements
     */
    @Override
    public AbstractFunctionResult<String> doFunction(String content, String identifier) {
        AbstractFunctionResult<String> result = new AbstractFunctionResult<>();
        JSONArray jsonArray = JSONArray.parseArray(content);
        // Guard against a null parse result (e.g. null/blank content) as well as an empty array.
        if (jsonArray == null || jsonArray.isEmpty()) {
            return result;
        }

        String fileUrl = resolveFileUrl(jsonArray.getJSONObject(0));

        List<KeyInfo> keyInfos = new ArrayList<>(jsonArray.size());
        for (int i = 0; i < jsonArray.size(); i++) {
            JSONObject jsonObject = jsonArray.getJSONObject(i);
            if (jsonObject == null) {
                // A null array element carries no key information; skip it instead of failing.
                continue;
            }
            keyInfos.add(toKeyInfo(jsonObject));
        }

        TextInClient textInClient = new TextInClient();
        String extraction = textInClient.extraction(fileUrl, keyInfos);
        result.setFunctionResult(extraction);
        result.setPromptContent(extraction);
        return result;
    }

    /**
     * Reads the file URL from the first argument object.
     *
     * <p>The parameter schema declares the key as {@code fileUrl}, while earlier code read
     * {@code file_url}; both spellings are accepted so either caller keeps working.
     */
    private static String resolveFileUrl(JSONObject first) {
        if (first == null) {
            return null;
        }
        String fileUrl = first.getString("fileUrl");
        return fileUrl != null ? fileUrl : first.getString("file_url");
    }

    /** Maps one argument object onto a {@link KeyInfo}, leaving absent keys unset. */
    private static KeyInfo toKeyInfo(JSONObject jsonObject) {
        KeyInfo keyInfo = new KeyInfo();
        if (jsonObject.containsKey("field_type")) {
            // Read the same key that was checked ("field_type", not "file_type").
            keyInfo.setField_type(jsonObject.getString("field_type"));
        }
        if (jsonObject.containsKey("key_info")) {
            keyInfo.setKey_info(jsonObject.getString("key_info"));
        }
        JSONArray paraphraseNames = jsonObject.getJSONArray("paraphrase_names");
        if (paraphraseNames != null) {
            keyInfo.setParaphrase_names(toStringArray(paraphraseNames));
        }
        JSONArray keywords = jsonObject.getJSONArray("keywords");
        if (keywords != null) {
            keyInfo.setKeywords(toStringArray(keywords));
        }
        return keyInfo;
    }

    /**
     * Converts a JSON array to a {@code String[]} element by element, so that non-string
     * elements (e.g. numbers) are stringified instead of raising {@link ArrayStoreException}.
     */
    private static String[] toStringArray(JSONArray values) {
        String[] converted = new String[values.size()];
        for (int i = 0; i < converted.length; i++) {
            converted[i] = values.getString(i);
        }
        return converted;
    }

    /** @return the description of this function */
    @Override
    public String getDesc() {
        return DESC;
    }

    /** @return a single-element list holding the serialized function definition */
    @Override
    public List<String> getLLMConfig() {
        return ListUtil.toList(JsonUtils.serialize(functionLLMConfig));
    }

    /**
     * Variable-aware variant; the variables are ignored and the result is the same as
     * {@link #getLLMConfig()}.
     */
    @Override
    public List<String> getLLMConfig(List<Variable> variableStructure) {
        return this.getLLMConfig();
    }
}
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/extraction/entity/Config.java
0 → 100644
View file @
6fba33c2
package
cn
.
com
.
poc
.
thirdparty
.
resource
.
demand
.
ai
.
function
.
extraction
.
entity
;
/**
 * Option block sent along with an extraction request.
 *
 * <p>All options are carried as strings (including the flag-like ones) and are exposed as
 * public fields under their snake_case names.
 *
 * @author alex.yao
 * @date 2025/5/12
 */
public class Config {

    /** Engine selector. */
    public String engine;

    /** Flag-like option, carried as a string. */
    public String use_pdf_parser;

    /** Flag-like option, carried as a string. */
    public String use_semantic_match;

    /** Flag-like option, carried as a string. */
    public String remove_watermark;

    /**
     * Creates a fully populated option block.
     *
     * @param engine             value for {@link #engine}
     * @param use_pdf_parser     value for {@link #use_pdf_parser}
     * @param use_semantic_match value for {@link #use_semantic_match}
     * @param remove_watermark   value for {@link #remove_watermark}
     */
    public Config(String engine,
                  String use_pdf_parser,
                  String use_semantic_match,
                  String remove_watermark) {
        this.remove_watermark = remove_watermark;
        this.use_semantic_match = use_semantic_match;
        this.use_pdf_parser = use_pdf_parser;
        this.engine = engine;
    }
}
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/extraction/entity/KeyInfo.java
0 → 100644
View file @
6fba33c2
package
cn
.
com
.
poc
.
thirdparty
.
resource
.
demand
.
ai
.
function
.
extraction
.
entity
;
/**
 * One key-information item to extract from a contract.
 *
 * <p>State is held in public snake_case fields and is additionally reachable through
 * bean-style accessors.
 *
 * @author alex.yao
 * @date 2025/5/12
 */
public class KeyInfo {

    /** Name of the key information. */
    public String key_info;

    /** Alternative names for the key information. */
    public String[] paraphrase_names;

    /** Type of the field. */
    public String field_type;

    /** Table flag; {@code false} unless set explicitly. */
    public boolean is_in_table;

    /** Keywords associated with the key information. */
    public String[] keywords;

    /** Creates an item with every field at its default value. */
    public KeyInfo() {
    }

    /**
     * Creates a fully populated item. Array arguments are stored as given (not copied).
     *
     * @param key_info         value for {@link #key_info}
     * @param paraphrase_names value for {@link #paraphrase_names}
     * @param field_type       value for {@link #field_type}
     * @param is_in_table      value for {@link #is_in_table}
     * @param keywords         value for {@link #keywords}
     */
    public KeyInfo(String key_info,
                   String[] paraphrase_names,
                   String field_type,
                   boolean is_in_table,
                   String[] keywords) {
        this.keywords = keywords;
        this.is_in_table = is_in_table;
        this.field_type = field_type;
        this.paraphrase_names = paraphrase_names;
        this.key_info = key_info;
    }

    // ---- key_info ----

    /** @return the current {@link #key_info} */
    public String getKey_info() {
        return this.key_info;
    }

    /** @param key_info new value for {@link #key_info} */
    public void setKey_info(String key_info) {
        this.key_info = key_info;
    }

    // ---- paraphrase_names ----

    /** @return the stored {@link #paraphrase_names} array itself (not a copy) */
    public String[] getParaphrase_names() {
        return this.paraphrase_names;
    }

    /** @param paraphrase_names new value for {@link #paraphrase_names}; stored as given */
    public void setParaphrase_names(String[] paraphrase_names) {
        this.paraphrase_names = paraphrase_names;
    }

    // ---- field_type ----

    /** @return the current {@link #field_type} */
    public String getField_type() {
        return this.field_type;
    }

    /** @param field_type new value for {@link #field_type} */
    public void setField_type(String field_type) {
        this.field_type = field_type;
    }

    // ---- is_in_table ----

    /** @return the current {@link #is_in_table} flag */
    public boolean isIs_in_table() {
        return this.is_in_table;
    }

    /** @param is_in_table new value for {@link #is_in_table} */
    public void setIs_in_table(boolean is_in_table) {
        this.is_in_table = is_in_table;
    }

    // ---- keywords ----

    /** @return the stored {@link #keywords} array itself (not a copy) */
    public String[] getKeywords() {
        return this.keywords;
    }

    /** @param keywords new value for {@link #keywords}; stored as given */
    public void setKeywords(String[] keywords) {
        this.keywords = keywords;
    }
}
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/extraction/entity/RequestData.java
0 → 100644
View file @
6fba33c2
package
cn
.
com
.
poc
.
thirdparty
.
resource
.
demand
.
ai
.
function
.
extraction
.
entity
;
/**
 * Top-level payload of an extraction request: creator, options, file content, file name
 * and the list of key-information items, all exposed as public snake_case/lowercase fields.
 *
 * @author alex.yao
 * @date 2025/5/12
 */
public class RequestData {

    /** Creator of the request. */
    public String creator;

    /** Extraction options. */
    public Config config;

    /** File content as a string. */
    public String filedata;

    /** Name of the file. */
    public String filename;

    /** Key-information items to extract. */
    public KeyInfo[] key_info_list;

    /**
     * Creates a fully populated payload. The array argument is stored as given (not copied).
     *
     * @param creator       value for {@link #creator}
     * @param config        value for {@link #config}
     * @param filedata      value for {@link #filedata}
     * @param filename      value for {@link #filename}
     * @param key_info_list value for {@link #key_info_list}
     */
    public RequestData(String creator,
                       Config config,
                       String filedata,
                       String filename,
                       KeyInfo[] key_info_list) {
        this.key_info_list = key_info_list;
        this.filename = filename;
        this.filedata = filedata;
        this.config = config;
        this.creator = creator;
    }
}
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/text_in_pdf2md/PdfToMDFunction.java
View file @
6fba33c2
...
...
@@ -7,7 +7,7 @@ import cn.com.poc.thirdparty.resource.demand.ai.function.AbstractLargeModelFunct
import
cn.com.poc.thirdparty.resource.demand.ai.function.entity.FunctionLLMConfig
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.entity.Parameters
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.entity.Properties
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.
OCR
Client
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.
TextIn
Client
;
import
cn.hutool.core.collection.ListUtil
;
import
com.alibaba.fastjson.JSONObject
;
import
com.fasterxml.jackson.databind.JsonNode
;
...
...
@@ -63,7 +63,7 @@ public class PdfToMDFunction extends AbstractLargeModelFunction {
options
.
put
(
"paratext_mode"
,
"annotation"
);
options
.
put
(
"parse_mode"
,
"auto"
);
options
.
put
(
"table_flavor"
,
"md"
);
OCRClient
client
=
new
OCR
Client
();
TextInClient
client
=
new
TextIn
Client
();
try
{
String
response
=
client
.
recognize
(
fileContent
,
options
);
ObjectMapper
mapper
=
new
ObjectMapper
();
...
...
@@ -72,6 +72,10 @@ public class PdfToMDFunction extends AbstractLargeModelFunction {
String
markdown
=
jsonNode
.
get
(
"result"
).
get
(
"markdown"
).
asText
();
result
.
setPromptContent
(
markdown
);
result
.
setFunctionResult
(
markdown
);
}
else
{
logger
.
warn
(
"text in 文档信息提取异常:{}"
,
response
);
result
.
setFunctionResult
(
response
);
result
.
setPromptContent
(
"FAIL"
);
}
return
result
;
}
catch
(
Exception
e
)
{
...
...
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/text_in_pdf2md/api/
OCR
Client.java
→
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/text_in_pdf2md/api/
TextIn
Client.java
View file @
6fba33c2
...
...
@@ -5,26 +5,36 @@ package cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api;
* @date 2025/5/7
*/
import
cn.com.poc.common.utils.DocumentLoad
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.Config
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.KeyInfo
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.extraction.entity.RequestData
;
import
cn.com.yict.framemax.core.exception.BusinessException
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
import
org.jetbrains.annotations.NotNull
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.io.BufferedReader
;
import
java.io.IOException
;
import
java.io.InputStreamReader
;
import
java.io.OutputStream
;
import
java.io.*
;
import
java.net.HttpURLConnection
;
import
java.net.URL
;
import
java.net.URLEncoder
;
import
java.nio.charset.StandardCharsets
;
import
java.nio.file.Files
;
import
java.nio.file.Path
;
import
java.nio.file.Paths
;
import
java.util.Base64
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
public
class
OCR
Client
{
public
class
TextIn
Client
{
private
Logger
logger
=
LoggerFactory
.
getLogger
(
OCR
Client
.
class
);
private
Logger
logger
=
LoggerFactory
.
getLogger
(
TextIn
Client
.
class
);
private
final
String
appId
=
"dafd04a574230c00ccba61132160de0c"
;
private
final
String
secretCode
=
"3bc03c7e6f9402963e6e71d16d786a9c"
;
private
final
String
baseUrl
=
"https://api.textin.com/ai/service/v1/pdf_to_markdown"
;
public
OCR
Client
()
{
public
TextIn
Client
()
{
}
public
String
recognize
(
byte
[]
fileContent
,
HashMap
<
String
,
Object
>
options
)
throws
IOException
{
...
...
@@ -37,16 +47,7 @@ public class OCRClient {
.
append
(
"="
)
.
append
(
URLEncoder
.
encode
(
entry
.
getValue
().
toString
(),
"UTF-8"
));
}
String
fullUrl
=
baseUrl
+
(
queryParams
.
length
()
>
0
?
"?"
+
queryParams
:
""
);
URL
url
=
new
URL
(
fullUrl
);
HttpURLConnection
connection
=
(
HttpURLConnection
)
url
.
openConnection
();
connection
.
setRequestMethod
(
"POST"
);
connection
.
setRequestProperty
(
"x-ti-app-id"
,
appId
);
connection
.
setRequestProperty
(
"x-ti-secret-code"
,
secretCode
);
connection
.
setRequestProperty
(
"Content-Type"
,
"text/plain;charset=utf-8"
);
connection
.
setDoOutput
(
true
);
HttpURLConnection
connection
=
getRecoGinzeHttpURLConnection
(
queryParams
);
try
(
OutputStream
os
=
connection
.
getOutputStream
())
{
os
.
write
(
fileContent
);
os
.
flush
();
...
...
@@ -69,4 +70,73 @@ public class OCRClient {
}
}
/**
 * Sends a contract file together with the requested key-information items to the
 * contract-extraction endpoint and returns the raw response body.
 *
 * <p>The file is fetched with {@code DocumentLoad.downloadURLDocument}, Base64-encoded and
 * posted as JSON, authenticated with this client's {@code appId} and {@code secretCode}.
 *
 * @param fileUrl     location of the contract file to download
 * @param keyInfoList key-information items to extract; must not be {@code null}
 * @return the response body as text, with line breaks removed
 * @throws BusinessException wrapping any {@link IOException} raised while reading the file
 *                           or talking to the endpoint
 */
public String extraction(String fileUrl, List<KeyInfo> keyInfoList) {
    HttpURLConnection connection = null;
    try {
        // Load the file and convert it to Base64.
        File file = DocumentLoad.downloadURLDocument(fileUrl);
        byte[] fileData = Files.readAllBytes(file.toPath());
        String base64FileData = Base64.getEncoder().encodeToString(fileData);

        // File name sent along with the content.
        String fileName = file.getName();

        // Build the request payload.
        Config config = new Config("table", "true", "true", "false");
        RequestData requestData = new RequestData("", config, base64FileData, fileName, keyInfoList.toArray(new KeyInfo[0]));

        // Serialize the payload to JSON.
        ObjectMapper objectMapper = new ObjectMapper();
        String requestDataJson = objectMapper.writeValueAsString(requestData);

        URL url = new URL("https://doc-compare.intsig.com/api/contracts/v3/extraction/external/create");

        // Configure the HTTP request.
        connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("POST");
        connection.setRequestProperty("x-ti-app-id", appId);
        connection.setRequestProperty("x-ti-secret-code", secretCode);
        connection.setRequestProperty("Content-Type", "application/json");
        connection.setDoOutput(true);

        // Send the request body.
        try (OutputStream os = connection.getOutputStream()) {
            byte[] input = requestDataJson.getBytes(StandardCharsets.UTF_8);
            os.write(input, 0, input.length);
        }

        int status = connection.getResponseCode();
        logger.info("Response Code: {}", status);

        // Decode the response explicitly as UTF-8 (matching the request encoding) rather than
        // with the platform default charset, which can garble non-ASCII text.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String inputLine;
            StringBuilder response = new StringBuilder();
            while ((inputLine = in.readLine()) != null) {
                response.append(inputLine);
            }
            return response.toString();
        }
    } catch (IOException e) {
        throw new BusinessException(e);
    } finally {
        // Release the connection on both the success and the failure path.
        if (connection != null) {
            connection.disconnect();
        }
    }
}
/**
 * Opens and configures the POST connection used for the pdf_to_markdown request.
 *
 * <p>The query string, when non-empty, is appended to the endpoint after a {@code ?}. The
 * connection carries this client's {@code appId} and {@code secretCode} as headers, declares
 * a {@code text/plain;charset=utf-8} body and has output enabled.
 *
 * @param queryParams already-encoded query string without the leading {@code ?}; may be empty
 * @return the configured connection
 * @throws IOException if the URL is malformed or the connection cannot be opened or configured
 */
private HttpURLConnection getRecoGinzeHttpURLConnection(StringBuilder queryParams) throws IOException {
    String baseUrl = "https://api.textin.com/ai/service/v1/pdf_to_markdown";

    // Assemble the target address, adding the query part only when there is one.
    StringBuilder target = new StringBuilder(baseUrl);
    if (queryParams.length() > 0) {
        target.append("?").append(queryParams.toString());
    }

    URL endpoint = new URL(target.toString());
    HttpURLConnection conn = (HttpURLConnection) endpoint.openConnection();
    conn.setRequestMethod("POST");
    conn.setRequestProperty("x-ti-app-id", appId);
    conn.setRequestProperty("x-ti-secret-code", secretCode);
    conn.setRequestProperty("Content-Type", "text/plain;charset=utf-8");
    conn.setDoOutput(true);
    return conn;
}
}
\ No newline at end of file
src/test/java/cn/com/poc/thirdparty/resource/demand/ai/function/PdfToMdFunctionTest.java
View file @
6fba33c2
package
cn
.
com
.
poc
.
thirdparty
.
resource
.
demand
.
ai
.
function
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.OCRClient
;
import
cn.com.poc.common.utils.JsonUtils
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.extraction.ContractExtractionFunction
;
import
cn.com.poc.thirdparty.resource.demand.ai.function.text_in_pdf2md.api.TextInClient
;
import
cn.com.yict.framemax.core.spring.SingleContextInitializer
;
import
com.fasterxml.jackson.databind.JsonNode
;
import
com.fasterxml.jackson.databind.ObjectMapper
;
...
...
@@ -10,6 +12,7 @@ import org.springframework.test.context.ContextConfiguration;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
org.springframework.test.context.web.WebAppConfiguration
;
import
javax.annotation.Resource
;
import
java.nio.charset.StandardCharsets
;
import
java.util.HashMap
;
...
...
@@ -40,7 +43,7 @@ public class PdfToMdFunctionTest {
options
.
put
(
"paratext_mode"
,
"annotation"
);
options
.
put
(
"parse_mode"
,
"auto"
);
options
.
put
(
"table_flavor"
,
"md"
);
OCRClient
client
=
new
OCR
Client
();
TextInClient
client
=
new
TextIn
Client
();
try
{
String
response
=
client
.
recognize
(
fileContent
,
options
);
ObjectMapper
mapper
=
new
ObjectMapper
();
...
...
@@ -48,9 +51,20 @@ public class PdfToMdFunctionTest {
if
(
jsonNode
.
has
(
"result"
)
&&
jsonNode
.
get
(
"result"
).
has
(
"markdown"
))
{
String
markdown
=
jsonNode
.
get
(
"result"
).
get
(
"markdown"
).
asText
();
System
.
out
.
println
(
markdown
);
}
else
{
System
.
out
.
println
(
response
);
}
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"1111111"
);
}
}
@Resource
private
ContractExtractionFunction
contractExtractionFunction
;
/**
 * Prints the LLM config of the injected {@code contractExtractionFunction} so it can be
 * inspected in the test output; performs no assertions.
 */
@Test
public void test_cefunction() {
    Object llmConfig = contractExtractionFunction.getLLMConfig();
    System.out.println(llmConfig);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment