Commit 7674903c authored by alex yao

fix: PDF乱码替换

parent 768f92ec
...@@ -244,7 +244,7 @@ public class DocumentLoad { ...@@ -244,7 +244,7 @@ public class DocumentLoad {
textStripper.setSortByPosition(true); textStripper.setSortByPosition(true);
stringBuilder.append(textStripper.getText(doc)); stringBuilder.append(textStripper.getText(doc));
doc.close(); doc.close();
return stringBuilder.toString(); return stringBuilder.toString().replaceAll("�", StringUtils.EMPTY);
} }
......
...@@ -61,4 +61,11 @@ public class FileUtilsTest { ...@@ -61,4 +61,11 @@ public class FileUtilsTest {
} }
} }
/**
 * Manual smoke test: runs {@code DocumentLoad.documentToText} on a sample PDF and
 * prints the extracted text for visual inspection.
 *
 * <p>The sample lives at a developer-local absolute path, so the extraction is only
 * attempted when that file is actually present; on other machines the test returns
 * early instead of exercising the loader with a non-existent file.
 */
@Test
public void test_pdf() {
    File file = new File("C:\\Users\\52747\\Documents\\dataset\\迎向AI新纪元:2025企业转型的关键时刻-从2024产业案例看今年生成式AI.pdf");
    // Machine-specific fixture: without it there is nothing meaningful to check.
    if (!file.isFile()) {
        System.out.println("test_pdf skipped, sample PDF not found: " + file.getAbsolutePath());
        return;
    }
    String pdfResult = DocumentLoad.documentToText(file);
    // Output is printed rather than asserted; review it manually.
    System.out.println(pdfResult);
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment