Java读取doc、docx、xls、xlsx、ppt、pptx、pdf文件内容

时间:2024-03-03 09:21:15

读取文件信息所需依赖

<!-- Reads Excel XLS -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Reads PPT, DOC, Visio -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Reads Excel XLSX, PPTX, DOCX -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Reads PDF content -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.12</version>
</dependency>

读取doc文件内容

/**
 * Reads the text of a binary Word (.doc) file from the local file system.
 *
 * <p>Both the file stream and the extractor are opened in a
 * try-with-resources block so they are released even when extraction fails.
 *
 * @param name path of the .doc file to read
 * @return the extracted text, or {@code null} when the file cannot be
 *         opened or parsed (the failure is printed to standard error)
 */
public static String readWord(String name)
{
    String text = null;
    try (FileInputStream in = new FileInputStream(name);
         WordExtractor extractor = new WordExtractor(in))
    {
        text = extractor.getText();
    }
    catch (Exception e)
    {
        // Best effort: a missing or unparsable file is reported and mapped to null.
        e.printStackTrace();
    }
    return text;
}

读取doc文件内容(MultipartFile 版本;WordExtractor 只支持 .doc,读取 .docx 需改用 poi-ooxml 中的 XWPFWordExtractor)

/**
 * Reads the text of an uploaded binary Word (.doc) file.
 *
 * <p>The upload stream and the extractor are closed automatically. If the
 * stream cannot be opened or read, the failure is logged and an empty string
 * is returned instead of dereferencing an extractor that was never created.
 *
 * @param file uploaded .doc file; {@code null} or empty yields {@code ""}
 * @return the extracted text, or {@code ""} when nothing could be read
 */
public static String readDoc(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    try (InputStream inputStream = file.getInputStream();
         WordExtractor wordExtractor = new WordExtractor(inputStream)) {
        return wordExtractor.getText();
    } catch (IOException e) {
        // Pass the exception itself so the stack trace reaches the log.
        log.warn("Failed to read .doc content", e);
        return "";
    }
}

读取xls文件内容

/**
 * Reads the cell values of the first sheet of an uploaded .xls workbook.
 *
 * <p>Cells are visited row by row; every cell contributes its value followed
 * by a tab, and no separator is written between rows. String cells yield
 * their text, numeric cells their {@code double} value, and formula cells the
 * cached result of the formula (text or number). Every other cell type
 * contributes only the tab.
 *
 * @param file uploaded .xls file; {@code null} or empty yields {@code ""}
 * @return the tab-separated cell values; if reading fails part-way the
 *         content collected so far is returned and the failure is logged
 */
public static String readXls(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // java.io.InputStream is fully qualified so this snippet needs no extra import.
    try (java.io.InputStream in = file.getInputStream();
         HSSFWorkbook excel = new HSSFWorkbook(in)) {
        // Only the first sheet is read.
        HSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            HSSFRow row = (HSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                HSSFCell cell = (HSSFCell) cellIterator.next();
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    // A formula may evaluate to text; asking for a numeric value
                    // in that case throws, so dispatch on the cached result type.
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue());
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue());
                }
                content.append('\t');
            }
        }
    } catch (Exception e) {
        log.warn("Failed to read .xls content", e);
    }
    return content.toString();
}

读取xlsx文件内容

/**
 * Reads the cell values of the first sheet of an uploaded .xlsx workbook.
 *
 * <p>Cells are visited row by row; every cell contributes its value followed
 * by a tab, and no separator is written between rows. String cells yield
 * their text, numeric cells their {@code double} value, and formula cells the
 * cached result of the formula (text or number). Every other cell type
 * contributes only the tab.
 *
 * @param file uploaded .xlsx file; {@code null} or empty yields {@code ""}
 * @return the tab-separated cell values; if reading fails part-way the
 *         content collected so far is returned and the failure is logged
 */
public static String readXlsx(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    // java.io.InputStream is fully qualified so this snippet needs no extra import.
    try (java.io.InputStream in = file.getInputStream();
         XSSFWorkbook excel = new XSSFWorkbook(in)) {
        // Only the first sheet is read.
        XSSFSheet sheet0 = excel.getSheetAt(0);
        for (Iterator<?> rowIterator = sheet0.iterator(); rowIterator.hasNext(); ) {
            XSSFRow row = (XSSFRow) rowIterator.next();
            for (Iterator<?> cellIterator = row.cellIterator(); cellIterator.hasNext(); ) {
                XSSFCell cell = (XSSFCell) cellIterator.next();
                CellType type = cell.getCellType();
                if (type == CellType.FORMULA) {
                    // A formula may evaluate to text; asking for a numeric value
                    // in that case throws, so dispatch on the cached result type.
                    type = cell.getCachedFormulaResultType();
                }
                if (type == CellType.STRING) {
                    content.append(cell.getStringCellValue());
                } else if (type == CellType.NUMERIC) {
                    content.append(cell.getNumericCellValue());
                }
                content.append('\t');
            }
        }
    } catch (Exception e) {
        log.warn("Failed to read .xlsx content", e);
    }
    return content.toString();
}

读取pdf文件内容

/**
 * Reads the text content of an uploaded PDF file.
 *
 * <p>The document is loaded with {@code PDDocument.load(InputStream)} inside a
 * try-with-resources block so that both the upload stream and the document
 * are closed when extraction finishes or fails. Text is extracted for the
 * whole document with position sorting enabled.
 *
 * @param file uploaded PDF file; {@code null} or empty yields {@code ""}
 * @return the extracted text, or {@code ""} when the PDF cannot be read
 *         (the failure is logged)
 */
public static String readPdf(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    try (InputStream is = file.getInputStream();
         PDDocument document = PDDocument.load(is)) {
        PDFTextStripper stripper = new PDFTextStripper();
        // Emit text in reading order instead of content-stream order.
        stripper.setSortByPosition(true);
        // No explicit start/end page: the stripper's default range already
        // spans the first page through the last one.
        content.append(stripper.getText(document));
    } catch (Exception e) {
        log.warn("Failed to read PDF content", e);
    }
    return content.toString();
}

PDF文件加载有两种方式,无明显差异,方式二代码较简洁:

// Option 1: open the file yourself and parse it through PDFParser.
InputStream input = null;
input = new FileInputStream( pdfFile );
// Load the PDF document.
PDFParser parser = new PDFParser(new RandomAccessBuffer(input));
parser.parse();
document = parser.getPDDocument();

 // Option 2: let PDDocument load the file directly.
document=PDDocument.load(pdfFile);   

读取ppt文件内容

/**
 * Reads the text of every slide of an uploaded binary PowerPoint (.ppt) file.
 *
 * <p>The upload stream, the slide show and the extractor are managed by
 * try-with-resources so they are released even when extraction fails half
 * way. Slide texts are concatenated in slide order.
 *
 * @param file uploaded .ppt file; {@code null} or empty yields {@code ""}
 * @return the concatenated slide text; on an I/O failure the text collected
 *         so far is returned and the failure is logged
 */
public static String readPPT(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    StringBuilder content = new StringBuilder();
    try (InputStream is = file.getInputStream();
         HSLFSlideShow hslfSlideShow = new HSLFSlideShow(is);
         SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hslfSlideShow)) {
        List<HSLFSlide> slides = hslfSlideShow.getSlides();
        for (HSLFSlide slide : slides) {
            content.append(slideShowExtractor.getText(slide));
        }
    } catch (IOException e) {
        // Only I/O failures are handled here; other exceptions still reach the caller.
        log.warn("Failed to read .ppt content", e);
    }
    return content.toString();
}

读取pptx文件内容

/**
 * Reads the text runs of an uploaded PowerPoint (.pptx) file.
 *
 * <p>For every slide, the shapes returned by the slide's shape tree
 * ({@code getSpList()}) are visited; shapes without a text body are skipped.
 * The text of every regular text run is appended in document order with no
 * separator between runs, paragraphs or slides.
 *
 * <p>The upload stream and the slide show are closed by try-with-resources,
 * including when parsing fails.
 *
 * @param file uploaded .pptx file; {@code null} or empty yields {@code ""}
 * @return the concatenated run text; if reading fails the text collected so
 *         far is returned and the failure is printed to standard error
 */
public static String readPPTX(MultipartFile file) {
    if (file == null || file.isEmpty()) {
        return "";
    }
    // Local, single-threaded accumulation: StringBuilder is sufficient.
    StringBuilder content = new StringBuilder();
    try (InputStream is = file.getInputStream();
         XMLSlideShow xmlSlideShow = new XMLSlideShow(is)) {
        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
            CTGroupShape spTree = slide.getXmlObject().getCSld().getSpTree();
            for (CTShape shape : spTree.getSpList()) {
                CTTextBody txBody = shape.getTxBody();
                if (txBody == null) {
                    // Shape carries no text.
                    continue;
                }
                for (CTTextParagraph textParagraph : txBody.getPList()) {
                    for (CTRegularTextRun textRun : textParagraph.getRList()) {
                        content.append(textRun.getT());
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return content.toString();
}