Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Contribute to GitLab
Sign in
Toggle navigation
P
poc-api
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
poc
poc-api
Commits
2d4f8b5f
Commit
2d4f8b5f
authored
Jan 20, 2025
by
alex yao
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat:添加PPT读取方法
parent
ca6cdc13
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
71 additions
and
0 deletions
+71
-0
DocumentLoad.java
src/main/java/cn/com/poc/common/utils/DocumentLoad.java
+65
-0
ImageOCRFunctionTest.java
...rty/resource/demand/ai/function/ImageOCRFunctionTest.java
+6
-0
No files found.
src/main/java/cn/com/poc/common/utils/DocumentLoad.java
View file @
2d4f8b5f
...
@@ -7,11 +7,26 @@ import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
...
@@ -7,11 +7,26 @@ import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import
org.apache.pdfbox.pdfparser.PDFParser
;
import
org.apache.pdfbox.pdfparser.PDFParser
;
import
org.apache.pdfbox.pdmodel.PDDocument
;
import
org.apache.pdfbox.pdmodel.PDDocument
;
import
org.apache.pdfbox.text.PDFTextStripper
;
import
org.apache.pdfbox.text.PDFTextStripper
;
import
org.apache.poi.hslf.usermodel.HSLFSlide
;
import
org.apache.poi.hslf.usermodel.HSLFSlideShow
;
import
org.apache.poi.hwpf.extractor.WordExtractor
;
import
org.apache.poi.hwpf.extractor.WordExtractor
;
import
org.apache.poi.poifs.filesystem.OfficeXmlFileException
;
import
org.apache.poi.sl.extractor.SlideShowExtractor
;
import
org.apache.poi.sl.usermodel.Slide
;
import
org.apache.poi.sl.usermodel.SlideShow
;
import
org.apache.poi.ss.usermodel.*
;
import
org.apache.poi.ss.usermodel.*
;
import
org.apache.poi.xslf.usermodel.XMLSlideShow
;
import
org.apache.poi.xslf.usermodel.XSLFSlide
;
import
org.apache.poi.xslf.usermodel.XSLFTextShape
;
import
org.apache.poi.xssf.usermodel.XSSFWorkbook
;
import
org.apache.poi.xssf.usermodel.XSSFWorkbook
;
import
org.apache.poi.xwpf.extractor.XWPFWordExtractor
;
import
org.apache.poi.xwpf.extractor.XWPFWordExtractor
;
import
org.apache.poi.xwpf.usermodel.XWPFDocument
;
import
org.apache.poi.xwpf.usermodel.XWPFDocument
;
import
org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun
;
import
org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody
;
import
org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph
;
import
org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape
;
import
org.openxmlformats.schemas.presentationml.x2006.main.CTShape
;
import
org.openxmlformats.schemas.presentationml.x2006.main.CTSlide
;
import
org.springframework.util.Assert
;
import
org.springframework.util.Assert
;
import
java.io.*
;
import
java.io.*
;
...
@@ -19,6 +34,7 @@ import java.net.URL;
...
@@ -19,6 +34,7 @@ import java.net.URL;
import
java.net.URLConnection
;
import
java.net.URLConnection
;
import
java.nio.file.Files
;
import
java.nio.file.Files
;
import
java.util.Iterator
;
import
java.util.Iterator
;
import
java.util.List
;
public
class
DocumentLoad
{
public
class
DocumentLoad
{
...
@@ -66,6 +82,52 @@ public class DocumentLoad {
...
@@ -66,6 +82,52 @@ public class DocumentLoad {
}
}
}
}
/**
 * Extracts the text of a PowerPoint file, slide by slide.
 *
 * <p>The file is first opened as a binary presentation through {@link HSLFSlideShow}. If POI
 * rejects it with {@link OfficeXmlFileException}, the file is re-opened as an OOXML
 * presentation through {@link XMLSlideShow}. In both cases every slide contributes one
 * {@code "Page:<slide number>"} header line followed by the slide's text.
 *
 * @param file the presentation file to read
 * @return the accumulated text of all slides
 * @throws I18nMessageException with key {@code exception/file.load.error} when an
 *                              {@link IOException} occurs while reading or closing the file
 */
public static String loadPPT(File file) {
    StringBuilder sb = new StringBuilder();
    // try-with-resources guarantees the stream, the slide show and the extractor are released
    // on every path, including the OfficeXmlFileException raised while opening the slide show.
    try (InputStream is = FileUtil.getInputStream(file);
         HSLFSlideShow hslfSlideShow = new HSLFSlideShow(is);
         SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hslfSlideShow)) {
        List<HSLFSlide> slides = hslfSlideShow.getSlides();
        for (HSLFSlide slide : slides) {
            sb.append("Page:").append(slide.getSlideNumber()).append(StringUtils.LF)
                    .append(slideShowExtractor.getText(slide)).append(StringUtils.LF);
        }
    } catch (IOException e) {
        // NOTE(review): the cause is dropped here; chain it if I18nMessageException
        // offers a constructor that accepts a Throwable.
        throw new I18nMessageException("exception/file.load.error");
    } catch (OfficeXmlFileException e) {
        // The content is OOXML rather than the binary format: fall back to the XSLF reader.
        appendPptxText(file, sb);
    }
    return sb.toString();
}

/**
 * Appends the text of an OOXML presentation to {@code sb}: one {@code "Page:<n>"} header per
 * slide, then one line per text run found in the shapes listed directly in the slide's shape tree.
 */
private static void appendPptxText(File file, StringBuilder sb) {
    try (InputStream is = FileUtil.getInputStream(file);
         XMLSlideShow xmlSlideShow = new XMLSlideShow(is)) {
        List<XSLFSlide> slides = xmlSlideShow.getSlides();
        for (XSLFSlide slide : slides) {
            // Header once per slide (not once per text run), matching the binary-format branch.
            sb.append("Page:").append(slide.getSlideNumber()).append(StringUtils.LF);

            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape spTree = rawSlide.getCSld().getSpTree();
            List<CTShape> spList = spTree.getSpList();
            for (CTShape shape : spList) {
                CTTextBody txBody = shape.getTxBody();
                if (null == txBody) {
                    // Shape carries no text body; nothing to extract.
                    continue;
                }
                List<CTTextParagraph> pList = txBody.getPList();
                for (CTTextParagraph textParagraph : pList) {
                    List<CTRegularTextRun> textRuns = textParagraph.getRList();
                    for (CTRegularTextRun textRun : textRuns) {
                        sb.append(textRun.getT()).append(StringUtils.LF);
                    }
                }
            }
        }
    } catch (IOException e1) {
        // NOTE(review): the cause is dropped here as well; chain it if the exception type allows.
        throw new I18nMessageException("exception/file.load.error");
    }
}
/**
/**
* Html To Markdown
* Html To Markdown
*/
*/
...
@@ -121,6 +183,9 @@ public class DocumentLoad {
...
@@ -121,6 +183,9 @@ public class DocumentLoad {
return
loadPDF
(
file
);
return
loadPDF
(
file
);
case
"txt"
:
case
"txt"
:
return
loadTxt
(
file
);
return
loadTxt
(
file
);
case
"ppt"
:
case
"pptx"
:
return
loadPPT
(
file
);
case
"xlsx"
:
case
"xlsx"
:
case
"xls"
:
case
"xls"
:
case
"csv"
:
case
"csv"
:
...
...
src/test/java/cn/com/poc/thirdparty/resource/demand/ai/function/ImageOCRFunctionTest.java
View file @
2d4f8b5f
...
@@ -53,4 +53,10 @@ public class ImageOCRFunctionTest {
...
@@ -53,4 +53,10 @@ public class ImageOCRFunctionTest {
File
file
=
new
File
(
"C:\\Users\\52747\\Desktop\\List of Question Intents and Standard Answers (IDP&DL) (Dec2024).xlsx"
);
File
file
=
new
File
(
"C:\\Users\\52747\\Desktop\\List of Question Intents and Standard Answers (IDP&DL) (Dec2024).xlsx"
);
System
.
out
.
println
(
DocumentLoad
.
excelToMarkdown
(
file
));
System
.
out
.
println
(
DocumentLoad
.
excelToMarkdown
(
file
));
}
}
/**
 * Manual smoke check: runs {@code DocumentLoad.loadPPT} on a local presentation file and
 * prints the extracted text to standard output. No assertions are made.
 */
@Test
public void loadPPT() {
    File presentation = new File("C:\\Users\\52747\\Documents\\dataset\\中国风.pptx");
    String extractedText = DocumentLoad.loadPPT(presentation);
    System.out.println(extractedText);
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment