Commit 2d4f8b5f authored by alex yao

feat:添加PPT读取方法

parent ca6cdc13
......@@ -7,11 +7,26 @@ import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hslf.usermodel.HSLFSlide;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.OfficeXmlFileException;
import org.apache.poi.sl.extractor.SlideShowExtractor;
import org.apache.poi.sl.usermodel.Slide;
import org.apache.poi.sl.usermodel.SlideShow;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFSlide;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
import org.springframework.util.Assert;
import java.io.*;
......@@ -19,6 +34,7 @@ import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.util.Iterator;
import java.util.List;
public class DocumentLoad {
......@@ -66,6 +82,52 @@ public class DocumentLoad {
}
}
/**
 * Extracts the text of a PowerPoint presentation, slide by slide.
 *
 * <p>The file is first opened as a binary {@code .ppt} ({@link HSLFSlideShow}). If POI
 * reports through {@link OfficeXmlFileException} that the content is actually OOXML, the
 * file is reopened as a {@code .pptx} ({@link XMLSlideShow}). Both formats go through the
 * same extraction helper, so the output layout is identical: for every slide a
 * {@code Page:<n>} header line followed by the slide text and a line feed.
 *
 * <p>All streams and slide shows are opened with try-with-resources so they are released
 * on every path, including the format-fallback path where the first open attempt fails.
 *
 * @param file the presentation file to read
 * @return the concatenated text of all slides; empty when the presentation has no slides
 * @throws I18nMessageException with key {@code exception/file.load.error} when an
 *         {@link IOException} occurs while reading or closing the file
 */
public static String loadPPT(File file) {
    StringBuilder sb = new StringBuilder();
    try {
        try (InputStream is = FileUtil.getInputStream(file);
             HSLFSlideShow slideShow = new HSLFSlideShow(is)) {
            appendSlideTexts(slideShow, sb);
        } catch (OfficeXmlFileException e) {
            // Not a binary .ppt: the first stream has already been closed by the
            // try-with-resources above, so reopen the file and parse it as .pptx.
            try (InputStream is = FileUtil.getInputStream(file);
                 XMLSlideShow slideShow = new XMLSlideShow(is)) {
                appendSlideTexts(slideShow, sb);
            }
        }
    } catch (IOException e) {
        // NOTE(review): the cause is dropped here, as in the original code. If
        // I18nMessageException offers a constructor accepting a cause, pass `e` along.
        throw new I18nMessageException("exception/file.load.error");
    }
    return sb.toString();
}

/**
 * Appends a {@code Page:<n>} header and the extracted text of every slide to {@code sb}.
 *
 * <p>The extractor is intentionally not closed here: the caller owns the slide show and
 * closes it via try-with-resources.
 *
 * @param slideShow an already opened slide show (binary or OOXML)
 * @param sb        the buffer receiving the extracted text
 */
// SlideShowExtractor is used as a raw type so that a single helper can serve both
// HSLFSlideShow and XMLSlideShow without naming their shape/paragraph type parameters.
@SuppressWarnings({"rawtypes", "unchecked"})
private static void appendSlideTexts(SlideShow<?, ?> slideShow, StringBuilder sb) {
    SlideShowExtractor extractor = new SlideShowExtractor(slideShow);
    for (Slide<?, ?> slide : slideShow.getSlides()) {
        sb.append("Page:").append(slide.getSlideNumber()).append(StringUtils.LF)
                .append(extractor.getText(slide)).append(StringUtils.LF);
    }
}
/**
* Html To Markdown
*/
......@@ -121,6 +183,9 @@ public class DocumentLoad {
return loadPDF(file);
case "txt":
return loadTxt(file);
case "ppt":
case "pptx":
return loadPPT(file);
case "xlsx":
case "xls":
case "csv":
......
......@@ -53,4 +53,10 @@ public class ImageOCRFunctionTest {
File file = new File("C:\\Users\\52747\\Desktop\\List of Question Intents and Standard Answers (IDP&DL) (Dec2024).xlsx");
System.out.println(DocumentLoad.excelToMarkdown(file));
}
/**
 * Manual smoke test: loads a local PPTX file and prints the extracted text to stdout.
 * The path points at a developer machine, so this only runs where that file exists.
 */
@Test
public void loadPPT() {
    File pptFile = new File("C:\\Users\\52747\\Documents\\dataset\\中国风.pptx");
    String extractedText = DocumentLoad.loadPPT(pptFile);
    System.out.println(extractedText);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment