Java如何利用POI解析doc和docx中的数据

时间:2021-10-11 22:10:18

前言

这个功能来自工作中遇到的一个需求:需要把上传的Word文档中的内容解析出来,包括段落文本和表格中的数据,并且需要同时支持doc和docx两种格式。

Apache POI是Apache软件基金会的开源项目,它提供API,使Java程序能够读写Microsoft Office格式的文件。.NET的开发人员则可以利用NPOI(POI for .NET)来读写Microsoft Office文档。

方法如下:

1、在Maven中添加依赖

?
1
2
3
4
5
6
7
8
9
10
11
<!-- poi-scratchpad: supplies the HWPF classes (org.apache.poi.hwpf.*) that the .doc parsing code relies on -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.17</version>
</dependency>
<!-- poi-ooxml: supplies the XWPF classes (org.apache.poi.xwpf.*) that the .docx parsing code relies on; both POI artifacts are pinned to the same version (3.17) -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>

2、解析doc中的数据

获取文件:把MultipartFile对象中的数据保存为本地File

?
1
2
// The original filename is supplied by the client and may contain directory parts
// (e.g. "../../x.doc" or "C:\\tmp\\x.doc"). Keep only the last path segment so the
// upload can never be written outside the target directory.
String originalName = multipartFile.getOriginalFilename();
if (originalName == null) {
    throw new IllegalArgumentException("Uploaded file has no original filename");
}
int lastSeparator = Math.max(originalName.lastIndexOf('/'), originalName.lastIndexOf('\\'));
String safeName = originalName.substring(lastSeparator + 1);
if (safeName.isEmpty() || ".".equals(safeName) || "..".equals(safeName)) {
    throw new IllegalArgumentException("Invalid upload filename: " + originalName);
}
// Copy the uploaded bytes to a local file in the user's home directory.
File file = new File(FileUtils.getUserDirectoryPath(), safeName);
FileUtils.copyInputStreamToFile(multipartFile.getInputStream(), file);
?
1
2
3
4
5
6
7
8
9
// Choose the parser from the (lower-cased) file extension.
String fileName = file.getName().toLowerCase();
// try-with-resources guarantees the stream is closed, including when the extension is
// unsupported or a parser throws.
try (FileInputStream in = new FileInputStream(file)) {
    if (fileName.endsWith(".doc")) {
        // .doc: the binary format used by Office 2003
        handlerDoc(in);
    } else if (fileName.endsWith(".docx")) {
        // .docx: the OOXML format
        handlerDocx(in);
    }
}

解析doc格式中的段落和第一个表格数据

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/**
 * doc 格式解析
 *
 * @param in
 * @throws IOException
 */
private void handlerDoc(FileInputStream in) throws IOException {
    POIFSFileSystem pfs = new POIFSFileSystem(in);
    HWPFDocument hwpf = new HWPFDocument(pfs);
 
    //得到文档的读取范围
    Range range = hwpf.getRange();
    for (int i = 0; i < range.numParagraphs(); i++) {
        //段落
        Paragraph p = range.getParagraph(i);
        //段落文本
        String paragraphText = p.text().replace("\r", "");
        log.info("paragraphText = {}", paragraphText );
        if (paragraphText.contains(VALUE_YLYC)) {
            analyze = false;
        }
    }
 
    TableIterator it = new TableIterator(range);
    // 迭代文档中的表格
    // 如果有多个表格只读取需要的一个 set是设置需要读取的第几个表格,total是文件中表格的总数
    int set = 1, total = 1;
    int num = set;
    for (int i = 0; i < set - 1; i++) {
        it.hasNext();
        it.next();
    }
    while (it.hasNext()) {
        Map<String, List<String>> tabelText = DocUtils.getTabelDocText((Table) it.next());
        log.info("tabelText = {}", tabelText);
    }
    // 过滤多余的表格
    while (num < total) {
        it.hasNext();
        it.next();
        num += 1;
    }
}

3、解析docx中数据

解析docx格式中的段落和表格数据(按文档顺序依次遍历所有段落与表格)

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/**
 * docx 格式解析
 *
 * @param in
 * @throws IOException
 */
private void handlerDocx(FileInputStream in) throws IOException {
    XWPFDocument xwpf = new XWPFDocument(in);
    // 获取word中的所有段落与表格
    List<IBodyElement> elements = xwpf.getBodyElements();
    // 解析表格后续不解析
    for (IBodyElement element : elements) {
        // 段落
        if (element instanceof XWPFParagraph) {
            String paragraphText = DocUtils.getParagraphText((XWPFParagraph) element);
            log.info("paragraphText = {}", paragraphText);
        } else if (element instanceof XWPFTable) {
            // 表格
            Map<String, List<String>> tabelText = DocUtils.getTabelText((XWPFTable) element);
            log.info("tabelText = {}", tabelText);
        } else {
            log.info("其他内容");
        }
    }
}

工具类

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package com.hundsun.fais.innerreport.utils;
 
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.xwpf.usermodel.*;
 
import java.util.*;
 
/**
 * @author lvbaolin
 * @date 2021/4/2 10:39
 */
public class DocUtils {
 
 
    /**
     * docx 格式获取表格内容
     *
     * @param table
     */
    public static Map<String, List<String>> getTabelText(XWPFTable table) {
        Map<String, List<String>> result = new LinkedHashMap<>();
        List<XWPFTableRow> rows = table.getRows();
        for (XWPFTableRow row : rows) {
            String key = null;
            List<String> list = new ArrayList<>(16);
            int i = 0;
            List<XWPFTableCell> cells = row.getTableCells();
            for (XWPFTableCell cell : cells) {
                // 简单获取内容(简单方式是不能获取字体对齐方式的)
                StringBuffer sb = new StringBuffer();
                // 一个单元格可以理解为一个word文档,单元格里也可以加段落与表格
                List<XWPFParagraph> paragraphs = cell.getParagraphs();
                for (XWPFParagraph paragraph : paragraphs) {
                    sb.append(DocUtils.getParagraphText(paragraph));
                }
                if (i == 0) {
                    key = sb.toString();
                } else {
                    String value = sb.toString();
                    list.add(value == null || Objects.deepEquals(value, "") ? null : value.replace(",", ""));
                }
                i++;
            }
            result.put(key, list);
        }
        return result;
    }
 
    /**
     * docx 获取段落字符串
     * 获取段落内容
     *
     * @param paragraph
     */
    public static String getParagraphText(XWPFParagraph paragraph) {
        StringBuffer runText = new StringBuffer();
        // 获取段落中所有内容
        List<XWPFRun> runs = paragraph.getRuns();
        if (runs.size() == 0) {
            return runText.toString();
        }
        for (XWPFRun run : runs) {
            runText.append(run.text());
        }
        return runText.toString();
    }
 
    /**
     * doc 格式的字段解析表格
     * @param tb
     * @return
     */
    public static Map<String, List<String>> getTabelDocText(Table tb) {
        Map<String, List<String>> result = new HashMap<>(16);
        //迭代行,默认从0开始,可以依据需要设置i的值,改变起始行数,也可设置读取到那行,只需修改循环的判断条件即可
        for (int i = 0; i < tb.numRows(); i++) {
            List<String> list = new ArrayList<>(16);
            int x = 0;
            TableRow tr = tb.getRow(i);
            String key = null;
            //迭代列,默认从0开始
            for (int j = 0; j < tr.numCells(); j++) {
                //取得单元格
                TableCell td = tr.getCell(j);
                StringBuffer sb = new StringBuffer();
 
                //取得单元格的内容
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph paragraph = td.getParagraph(k);
                    String s = paragraph.text();
                    //去除后面的特殊符号
                    if (null != s && !"".equals(s)) {
                        s = s.substring(0, s.length() - 1);
                    }
                    sb.append(s);
                }
                if (x == 0) {
                    key = sb.toString();
                } else {
                    String value = sb.toString();
                    list.add(value == null || Objects.deepEquals(value, "") ? null : value.replace(",", ""));
                }
                x++;
            }
            result.put(key, list);
        }
        return result;
    }
}

总结

到此,这篇关于Java如何利用POI解析doc和docx中数据的文章就介绍到这了。更多相关Java POI解析doc数据的内容,请搜索服务器之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持服务器之家!

原文链接:https://blog.csdn.net/lvbaolin123/article/details/115488366