Commit 85997c0a authored by alex yao

refactor: 优化Agent插件-网页解析 压缩内容

parent 05de9276
...@@ -340,6 +340,11 @@ ...@@ -340,6 +340,11 @@
<version>v1-rev20240821-2.0.0</version> <version>v1-rev20240821-2.0.0</version>
</dependency> </dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.19.1</version>
</dependency>
</dependencies> </dependencies>
......
...@@ -23,6 +23,8 @@ import org.apache.poi.xslf.usermodel.XSLFTextShape; ...@@ -23,6 +23,8 @@ import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun; import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph; import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
...@@ -120,6 +122,16 @@ public class DocumentLoad { ...@@ -120,6 +122,16 @@ public class DocumentLoad {
return sb.toString(); return sb.toString();
} }
/**
 * Fetches the page at {@code url} with jsoup and returns the text content of its
 * {@code <body>} (markup stripped).
 *
 * <p>This is a best-effort loader: any failure to fetch or parse the page yields an
 * empty string rather than an exception.
 *
 * @param url absolute address of the page to load
 * @return the body text of the page, or an empty string when {@code url} is null/blank,
 *         is not a valid URL, or the page cannot be fetched
 */
public static String loadHtml(String url) {
    if (url == null || url.trim().isEmpty()) {
        return "";
    }
    // NOTE(review): the caller passes a URL taken from request JSON; if that input is
    // untrusted, consider restricting schemes/hosts before connecting — TODO confirm.
    try {
        // Connect to the requested page (previously a fixed article URL was always fetched,
        // so the url argument had no effect).
        Element body = Jsoup.connect(url.trim())
                .timeout(10 * 1000)
                .get()
                .body();
        return body.text();
    } catch (IOException | IllegalArgumentException e) {
        // IOException: network/HTTP failure. IllegalArgumentException: jsoup rejects a
        // malformed URL. Both map to the "nothing could be read" result.
        return "";
    }
}
/** /**
* Html To Markdown * Html To Markdown
*/ */
......
...@@ -35,7 +35,7 @@ public class HtmlReaderFunction extends AbstractLargeModelFunction { ...@@ -35,7 +35,7 @@ public class HtmlReaderFunction extends AbstractLargeModelFunction {
JSONObject jsonObject = JSONObject.parseObject(content); JSONObject jsonObject = JSONObject.parseObject(content);
String url = jsonObject.getString("url"); String url = jsonObject.getString("url");
if (StringUtils.isNotBlank(url)) { if (StringUtils.isNotBlank(url)) {
return DocumentLoad.htmlToMarkdown(url); return DocumentLoad.loadHtml(url);
} }
return StringUtils.EMPTY; return StringUtils.EMPTY;
} }
......
package cn.com.poc.thirdparty.resource.demand.ai.function;
import cn.com.yict.framemax.core.spring.SingleContextInitializer;
import io.github.furstenheim.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.runner.RunWith;
import org.junit.Test;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.test.context.web.WebAppConfiguration;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
/**
 * Manual smoke tests for two ways of reading a web page: jsoup body-text extraction and
 * raw HTML converted to Markdown with CopyDown. Both tests fetch a live URL and only
 * print the result; they make no assertions.
 *
 * @author alex.yao
 * @date 2025/3/5
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(initializers = SingleContextInitializer.class)
@WebAppConfiguration
public class HtmlReaderFunctionTest {

    /** Article fetched by both tests. */
    private static final String SAMPLE_URL = "https://juejin.cn/post/7115639885457063966";

    /** Builder for the CopyDown conversion options shared by the Markdown test. */
    final static OptionsBuilder optionsBuilder = OptionsBuilder.anOptions();

    /** Conversion options: "-" for line breaks, referenced links with shortcut references. */
    final static Options options = optionsBuilder.withBr("-")
            .withLinkStyle(LinkStyle.REFERENCED)
            .withLinkReferenceStyle(LinkReferenceStyle.SHORTCUT)
            .build();

    /** HTML-to-Markdown converter configured with {@link #options}. */
    final static CopyDown converter = new CopyDown(options);

    /**
     * Fetches the sample page with jsoup and prints the text of its body.
     *
     * @throws IOException if the page cannot be fetched
     */
    @Test
    public void test_jsoup() throws IOException {
        Element body = Jsoup.connect(SAMPLE_URL)
                .get().body();
        System.out.println(body.text());
    }

    /**
     * Downloads the sample page over a plain {@link URLConnection}, removes the
     * {@code <head>} section and prints the remainder converted to Markdown.
     *
     * @throws IOException if the page cannot be opened or read
     */
    @Test
    public void test_html2md() throws IOException {
        // Open the connection to the resource.
        URLConnection conn = new URL(SAMPLE_URL).openConnection();
        // Connect timeout, in milliseconds.
        conn.setConnectTimeout(5000);
        // Read timeout, in milliseconds.
        conn.setReadTimeout(15000);

        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader and the stream even when reading fails.
        // Decode explicitly as UTF-8 instead of the platform default charset
        // (assumes the page is served as UTF-8 — TODO confirm against the response header).
        try (InputStream inputStream = conn.getInputStream();
             InputStreamReader isr = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
            // 64K-char buffer for reading the response body.
            char[] buffer = new char[1024 * 64];
            int len;
            while ((len = isr.read(buffer)) != -1) {
                sb.append(buffer, 0, len);
            }
        }

        // (?s) lets '.' span line breaks so a multi-line <head> is actually removed;
        // (?i) and [^>]* also cover upper-case tags and <head> tags with attributes.
        String htmlStr = sb.toString().replaceAll("(?is)<head\\b[^>]*>.*?</head>", "");
        System.out.println(converter.convert(htmlStr));
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment