Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in
Toggle navigation
P
poc-api
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
poc
poc-api
Commits
85997c0a
Commit
85997c0a
authored
Mar 05, 2025
by
alex yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactor: 优化Agent插件-网页解析 压缩内容
parent
05de9276
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
86 additions
and
1 deletion
+86
-1
pom.xml
pom.xml
+5
-0
DocumentLoad.java
src/main/java/cn/com/poc/common/utils/DocumentLoad.java
+12
-0
HtmlReaderFunction.java
...ce/demand/ai/function/html_reader/HtmlReaderFunction.java
+1
-1
HtmlReaderFunctionTest.java
...y/resource/demand/ai/function/HtmlReaderFunctionTest.java
+68
-0
No files found.
pom.xml
View file @
85997c0a
...
...
@@ -340,6 +340,11 @@
<version>
v1-rev20240821-2.0.0
</version>
</dependency>
<dependency>
<groupId>
org.jsoup
</groupId>
<artifactId>
jsoup
</artifactId>
<version>
1.19.1
</version>
</dependency>
</dependencies>
...
...
src/main/java/cn/com/poc/common/utils/DocumentLoad.java
View file @
85997c0a
...
...
@@ -23,6 +23,8 @@ import org.apache.poi.xslf.usermodel.XSLFTextShape;
import
org.apache.poi.xssf.usermodel.XSSFWorkbook
;
import
org.apache.poi.xwpf.extractor.XWPFWordExtractor
;
import
org.apache.poi.xwpf.usermodel.XWPFDocument
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Element
;
import
org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun
;
import
org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody
;
import
org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph
;
...
...
@@ -120,6 +122,16 @@ public class DocumentLoad {
return
sb
.
toString
();
}
/**
 * Fetches the web page at {@code url} and returns the text of its
 * {@code <body>} element as produced by jsoup's {@code Element.text()}.
 *
 * <p>This is a best-effort loader: a {@code null} or blank {@code url}, or an
 * {@link IOException} while fetching, yields an empty string instead of an
 * exception.
 *
 * @param url address of the page to load
 * @return the body text of the page, or an empty string if the url is blank
 *         or the page could not be fetched
 */
public static String loadHtml(String url) {
    // Nothing to fetch: return the same empty result as the failure path.
    if (url == null || url.trim().isEmpty()) {
        return "";
    }
    try {
        // Use the caller-supplied url (previously a hard-coded address was fetched
        // regardless of the argument). The timeout value is in milliseconds.
        Element body = Jsoup.connect(url)
                .timeout(10 * 1000)
                .get()
                .body();
        return body.text();
    } catch (IOException ignored) {
        // Deliberate best-effort: an unreachable or unreadable page yields an empty string.
        return "";
    }
}
/**
* Html To Markdown
*/
...
...
src/main/java/cn/com/poc/thirdparty/resource/demand/ai/function/html_reader/HtmlReaderFunction.java
View file @
85997c0a
...
...
@@ -35,7 +35,7 @@ public class HtmlReaderFunction extends AbstractLargeModelFunction {
JSONObject
jsonObject
=
JSONObject
.
parseObject
(
content
);
String
url
=
jsonObject
.
getString
(
"url"
);
if
(
StringUtils
.
isNotBlank
(
url
))
{
return
DocumentLoad
.
htmlToMarkdown
(
url
);
return
DocumentLoad
.
loadHtml
(
url
);
}
return
StringUtils
.
EMPTY
;
}
...
...
src/test/java/cn/com/poc/thirdparty/resource/demand/ai/function/HtmlReaderFunctionTest.java
0 → 100644
View file @
85997c0a
package
cn
.
com
.
poc
.
thirdparty
.
resource
.
demand
.
ai
.
function
;
import
cn.com.yict.framemax.core.spring.SingleContextInitializer
;
import
io.github.furstenheim.*
;
import
org.jsoup.Jsoup
;
import
org.jsoup.nodes.Document
;
import
org.jsoup.nodes.Element
;
import
org.junit.runner.RunWith
;
import
org.junit.Test
;
import
org.springframework.test.context.ContextConfiguration
;
import
org.springframework.test.context.junit4.SpringJUnit4ClassRunner
;
import
org.springframework.test.context.web.WebAppConfiguration
;
import
java.io.IOException
;
import
java.io.InputStream
;
import
java.io.InputStreamReader
;
import
java.net.URL
;
import
java.net.URLConnection
;
/**
 * Manual, network-dependent checks for HTML page reading: one fetches a page
 * with jsoup and prints its body text, the other downloads the raw HTML and
 * prints its Markdown conversion.
 *
 * @author alex.yao
 * @date 2025/3/5
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(initializers = SingleContextInitializer.class)
@WebAppConfiguration
public class HtmlReaderFunctionTest {

    /** Sample article fetched by both tests. */
    private static final String SAMPLE_URL = "https://juejin.cn/post/7115639885457063966";

    /** Fetches the sample page with jsoup and prints the text of its body. */
    @Test
    public void test_jsoup() throws IOException {
        Element body = Jsoup.connect(SAMPLE_URL).get().body();
        System.out.println(body.text());
    }

    static final OptionsBuilder optionsBuilder = OptionsBuilder.anOptions();

    static final Options options = optionsBuilder.withBr("-")
            .withLinkStyle(LinkStyle.REFERENCED)
            .withLinkReferenceStyle(LinkReferenceStyle.SHORTCUT)
            .build();

    static final CopyDown converter = new CopyDown(options);

    /**
     * Downloads the sample page, removes its {@code <head>} section and prints
     * the Markdown produced by the converter.
     */
    @Test
    public void test_html2md() throws IOException {
        // Open a connection to the resource.
        URLConnection conn = new URL(SAMPLE_URL).openConnection();
        // Connect timeout, in milliseconds.
        conn.setConnectTimeout(5000);
        // Read timeout, in milliseconds.
        conn.setReadTimeout(15000);

        // 64K-char buffer used to drain the response.
        char[] buffer = new char[1024 * 64];
        StringBuilder sb = new StringBuilder();

        // try-with-resources closes both streams even when reading fails.
        // NOTE(review): decodes as UTF-8 instead of the platform default charset;
        // assumes the page is served as UTF-8 - confirm.
        try (InputStream inputStream = conn.getInputStream();
             InputStreamReader isr =
                     new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8)) {
            int len;
            while ((len = isr.read(buffer)) != -1) {
                sb.append(buffer, 0, len);
            }
        }

        // (?s) lets '.' match line terminators so a multi-line <head> is removed too.
        String htmlStr = sb.toString().replaceAll("(?s)<head>.*?</head>", "");
        System.out.println(converter.convert(htmlStr));
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment