diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/JsoupXpath-2.5.1.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/JsoupXpath-2.5.1.jar new file mode 100644 index 00000000..f06376e4 Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/JsoupXpath-2.5.1.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/antlr4-runtime-4.7.2.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/antlr4-runtime-4.7.2.jar new file mode 100644 index 00000000..7a27e1b2 Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/antlr4-runtime-4.7.2.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/jsoup-1.15.1.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/jsoup-1.15.1.jar new file mode 100644 index 00000000..d9fb443e Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/jsoup-1.15.1.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/juh-4.1.2.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/juh-4.1.2.jar new file mode 100644 index 00000000..b9a1e41d Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/juh-4.1.2.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/jurt-4.1.2.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/jurt-4.1.2.jar new file mode 100644 index 00000000..d9681bdc Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/jurt-4.1.2.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/ridl-4.1.2.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/ridl-4.1.2.jar new file mode 100644 index 00000000..b74e2d66 Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/ridl-4.1.2.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/slf4j-api-1.7.25.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/slf4j-api-1.7.25.jar new file mode 100644 index 00000000..0143c099 Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/slf4j-api-1.7.25.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/lib/unoil-4.1.2.jar b/com.actionsoft.apps.coe.pal.datamigration/lib/unoil-4.1.2.jar new file mode 100644 index 00000000..23cf85c7 Binary files /dev/null and b/com.actionsoft.apps.coe.pal.datamigration/lib/unoil-4.1.2.jar differ diff --git a/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/CommonConStant.java b/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/CommonConStant.java new file mode 100644 index 00000000..a86e097e --- /dev/null +++ b/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/CommonConStant.java @@ -0,0 +1,15 @@ +package com.actionsoft.apps.coe.pal.datamigration.util.htmltodocx; + +/** + * @author baizp + * @Description: + * @date 2022/6/24 16:02 + */ +/** + * 公共常量 + */ +public class CommonConStant { + // 固定元素节点 + public static final String COMMONATTR = "data-class"; +} + diff --git a/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/ElementEnum.java b/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/ElementEnum.java new file mode 100644 index 00000000..656fa6f7 --- /dev/null +++ b/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/ElementEnum.java @@ -0,0 +1,55 @@ +package com.actionsoft.apps.coe.pal.datamigration.util.htmltodocx; + +/** + * @author baizp + * @Description: + * @date 2022/6/24 15:59 + */ + +/** + * html 元素枚举映射类 + */ +public enum ElementEnum { + H1("h1","h1","一级标题"), + H2("h2","h2","二级标题"), + H3("h3","h3","三级标题"), + H7("h7","h7","小标题"), + P("p", "paragraph", "段落"), + STRONG("strong","","加粗"), + I("i","","斜体"), + U("u", "", "字体下划线"), + IMG("img", "imgurl", "base64图片"), + TABLE("table","table","表格"), + BR("br","br","换行"); + + private String code; + private String value; + private String desc; + + public String getCode() { + return code; + } + + public String getValue() { + return value; + } + + public String getDesc() { + return desc; + } + + ElementEnum(String code, String value, String desc) { + this.code = code; + this.value = value; + this.desc = desc; + } + + public static String getValueByCode(String code) { + for (ElementEnum e : ElementEnum.values()) { + if (e.getCode().equalsIgnoreCase(code)) { + return e.getValue(); + } + } + return null; + } +} diff --git a/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/HtmlToWord.java b/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/HtmlToWord.java new file mode 100644 index 00000000..338fc7bd --- /dev/null +++ b/com.actionsoft.apps.coe.pal.datamigration/src/com/actionsoft/apps/coe/pal/datamigration/util/htmltodocx/HtmlToWord.java @@ -0,0 +1,610 @@ +package com.actionsoft.apps.coe.pal.datamigration.util.htmltodocx; + +/** + * @author baizp + * @Description: + * @date 2022/6/24 16:01 + */ + +import org.docx4j.dml.wordprocessingDrawing.Inline; +import org.docx4j.jaxb.Context; +import org.docx4j.model.structure.SectionWrapper; +import org.docx4j.openpackaging.exceptions.Docx4JException; +import org.docx4j.openpackaging.exceptions.InvalidFormatException; +import org.docx4j.openpackaging.packages.WordprocessingMLPackage; +import org.docx4j.openpackaging.parts.WordprocessingML.BinaryPartAbstractImage; +import org.docx4j.openpackaging.parts.WordprocessingML.FooterPart; +import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart; +import org.docx4j.openpackaging.parts.WordprocessingML.StyleDefinitionsPart; +import org.docx4j.relationships.Relationship; +import org.docx4j.wml.*; +import org.docx4j.wml.PPrBase.Ind; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.*; +import java.math.BigInteger; +import java.util.List; + +/** + * @program: htmltoword + * @description: html 转 docx + * @author: corey + * @create: 2020-04-29 14:10 + **/ +public class HtmlToWord { + private static ObjectFactory factory; + private static WordprocessingMLPackage wordMLPackage; + + /** + * 将一段富文本字符串转为一个字节数组 + * + * @param data + * @return + */ + public static byte[] resolveHtml(String data) { + Document document = Jsoup.parseBodyFragment(data, "UTF-8"); + ByteArrayOutputStream out = null; + try { + wordMLPackage = WordprocessingMLPackage.createPackage(); + factory = Context.getWmlObjectFactory(); + Relationship relationship = createFooterPart(); + createFooterReference(relationship); + MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart(); + alterStyleSheet(); + // 添加固定元素 + HtmlUtils.addElement(document); + Elements elements = document.select("[" + CommonConStant.COMMONATTR + "]"); + for (Element em : elements) { + String type = em.attr(CommonConStant.COMMONATTR); + if (em.childNodeSize() > 0) { + if (em.childNodeSize() == 2) { + em.childNode(1); + } + } + switch (em.attr(CommonConStant.COMMONATTR)) { + case "title": + documentPart.addStyledParagraphOfText("Title", em.text()); + break; + case "subtitle": + documentPart.addStyledParagraphOfText("Subtitle", em.text()); + break; + case "imgurl": + String imgSrc = em.attr("src"); + File file = new File(imgSrc); + byte[] bytes = convertImageToByteArray(file); + addImageToPackage(wordMLPackage, bytes); + break; + case "imgbase64": + break; + case "table": + Tbl table = addTable(em); + documentPart.addObject(table); + break; + case "h1": + P tmp = documentPart.addStyledParagraphOfText("Heading1", em.text()); + //setNum(1, tmpstyle); + setNum1(1, tmp); + break; + case "h2": + P tmp1 = documentPart.addStyledParagraphOfText("Heading2", em.text()); + //setNum(2, tmpstyle1); + setNum1(2, tmp1); + break; + case "h3": + P tmp2 = documentPart.addStyledParagraphOfText("Heading3", em.text()); + //setNum(3, tmpstyle2); + setNum1(3, tmp2); + break; + case "paragraph": + P p = addParapraph(em.text()); + //设置首行缩进 + setFirstLine(p, "400"); + documentPart.getContent().add(p); + break; + default: + documentPart.addParagraphOfText(em.text()); + break; + } + } + addPageBreak(documentPart); + out = new ByteArrayOutputStream(); + wordMLPackage.save(out); + return out.toByteArray(); + } catch (Exception e) { + e.printStackTrace(); + return null; + } finally { + if (out != null) { + try { + out.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + public static void setNum(int level,Style style) { + ObjectFactory factory = Context.getWmlObjectFactory(); + //Create and add to style + PPr ppr = factory.createPPr(); + style.setPPr(ppr); + PPrBase.NumPr numPr = factory.createPPrBaseNumPr(); + PPrBase.NumPr.Ilvl ilvlElement = factory.createPPrBaseNumPrIlvl(); + numPr.setIlvl(ilvlElement); + ilvlElement.setVal(BigInteger.valueOf(level)); + PPrBase.NumPr.NumId numIdElement = factory.createPPrBaseNumPrNumId(); + numPr.setNumId(numIdElement); + numIdElement.setVal(BigInteger.valueOf(level)); + ppr.setNumPr(numPr); + } + public static void setNum1(int level,P p){ + + + /*ObjectFactory factory = new org.docx4j.wml.ObjectFactory(); + P p = factory.createP();*/ + + /*org.docx4j.wml.Text t = factory.createText(); + t.setValue(em.text());*/ + + /*org.docx4j.wml.R run = factory.createR(); + run.getContent().add(t); + + p.getContent().add(run);*/ + + /*org.docx4j.wml.PPr ppr = factory.createPPr(); + + p.setPPr(ppr);*/ + org.docx4j.wml.PPr ppr =p.getPPr(); + // Create and add + PPrBase.NumPr numPr = factory.createPPrBaseNumPr(); + ppr.setNumPr(numPr); + + // The element + PPrBase.NumPr.Ilvl ilvlElement = factory.createPPrBaseNumPrIlvl(); + numPr.setIlvl(ilvlElement); + ilvlElement.setVal(BigInteger.valueOf(level)); + + // The element + PPrBase.NumPr.NumId numIdElement = factory.createPPrBaseNumPrNumId(); + numPr.setNumId(numIdElement); + numIdElement.setVal(BigInteger.valueOf(level)); + + //wordMLPackage.getMainDocumentPart().addObject(p); + } + + /** + * @param @param text + * @param @return 设定文件 + * @return P 返回类型 + * @throws + * @Title: addParapraph + * @Description: (文本转段落) + */ + private static P addParapraph(String text) { + factory = Context.getWmlObjectFactory(); + P paragraph = factory.createP(); + Text t = factory.createText(); + t.setValue(text); + R run = factory.createR(); + run.getContent().add(t); + paragraph.getContent().add(run); + RPr runProperties = factory.createRPr(); + run.setRPr(runProperties); + return paragraph; + } + + /** + * @param @param p + * @param @param str 设定文件 + * @return void 返回类型 + * @throws + * @Title: setFirstLine + */ + private static void setFirstLine(P p, String str) { + PPr ppr = getPPr(p); + Ind ind = ppr.getInd(); + if (ind == null) { + ind = new Ind(); + ppr.setInd(ind); + } + ind.setFirstLine(new BigInteger(str)); + } + + ; + + private static PPr getPPr(P p) { + PPr ppr = p.getPPr(); + if (ppr == null) { + ppr = new PPr(); + p.setPPr(ppr); + } + return ppr; + } + + /** + * table @param @return 设定文件 @return Tbl 返回类型 @throws + */ + private static Tbl addTable(Element table) { + factory = Context.getWmlObjectFactory(); + Tbl tbl = factory.createTbl(); + addBorders(tbl); + Elements trs = table.getElementsByTag("tr"); + for (Element tr : trs) { + Tr fTr = addTableTr(tr); + tbl.getContent().add(fTr); + } + return tbl; + } + + /** + * tr @param @return 设定文件 @return Tr 返回类型 @throws + */ + private static Tr addTableTr(Element tr) { + Elements tds = tr.getElementsByTag("th").isEmpty() ? tr.getElementsByTag("td") : tr.getElementsByTag("th"); + Tr ftr = factory.createTr(); + for (int i = 0, j = tds.size(); i < j; i++) { + Tc ftd = factory.createTc(); + setCellWidth(ftd, 1000); + ftd.getContent().add(wordMLPackage.getMainDocumentPart().createParagraphOfText(tds.get(i).text())); + ftr.getContent().add(ftd); + } + return ftr; + } + + /** + * 本方法创建一个单元格属性集对象和一个表格宽度对象. 将给定的宽度设置到宽度对象然后将其添加到 属性集对象. 最后将属性集对象设置到单元格中. + */ + private static void setCellWidth(Tc tableCell, int width) { + TcPr tableCellProperties = new TcPr(); + TblWidth tableWidth = new TblWidth(); + tableWidth.setW(BigInteger.valueOf(width)); + tableCellProperties.setTcW(tableWidth); + tableCell.setTcPr(tableCellProperties); + } + + /** + * 本方法为表格添加边框 + */ + private static void addBorders(Tbl table) { + table.setTblPr(new TblPr()); + CTBorder border = new CTBorder(); + border.setColor("auto"); + border.setSz(new BigInteger("4")); + border.setSpace(new BigInteger("0")); + border.setVal(STBorder.SINGLE); + + TblBorders borders = new TblBorders(); + borders.setBottom(border); + borders.setLeft(border); + borders.setRight(border); + borders.setTop(border); + borders.setInsideH(border); + borders.setInsideV(border); + table.getTblPr().setTblBorders(borders); + } + + /** + * 将图片从文件对象转换成字节数组. + * + * @param file 将要转换的文件 + * @return 包含图片字节数据的字节数组 + * @throws FileNotFoundException + * @throws IOException + */ + private static byte[] convertImageToByteArray(File file) throws FileNotFoundException, IOException { + InputStream is = new FileInputStream(file); + long length = file.length(); + // 不能使用long类型创建数组, 需要用int类型. + if (length > Integer.MAX_VALUE) { + System.out.println("File too large!!"); + } + byte[] bytes = new byte[(int) length]; + int offset = 0; + int numRead = 0; + while (offset < bytes.length && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) { + offset += numRead; + } + // 确认所有的字节都没读取 + if (offset < bytes.length) { + System.out.println("Could not completely read file " + file.getName()); + } + is.close(); + return bytes; + } + + /** + * Docx4j拥有一个由字节数组创建图片部件的工具方法, 随后将其添加到给定的包中. 为了能将图片添加 到一个段落中, 我们需要将图片转换成内联对象. + * 这也有一个方法, 方法需要文件名提示, 替换文本, 两个id标识符和一个是嵌入还是链接到的指示作为参数. 一个id用于文档中绘图对象不可见的属性, + * 另一个id用于图片本身不可见的绘制属性. 最后我们将内联 对象添加到段落中并将段落添加到包的主文档部件. + * + * @param wordMLPackage 要添加图片的包 + * @param bytes 图片对应的字节数组 + * @throws Exception 不幸的createImageInline方法抛出一个异常(没有更多具体的异常类型) + */ + private static void addImageToPackage(WordprocessingMLPackage wordMLPackage, byte[] bytes) throws Exception { + BinaryPartAbstractImage imagePart = BinaryPartAbstractImage.createImagePart(wordMLPackage, bytes); + + int docPrId = 1; + int cNvPrId = 2; + Inline inline = imagePart.createImageInline("Filename hint", "Alternative text", docPrId, cNvPrId, false); + + P paragraph = addInlineImageToParagraph(inline); + + wordMLPackage.getMainDocumentPart().addObject(paragraph); + } + + /** + * 创建一个对象工厂并用它创建一个段落和一个可运行块R. 然后将可运行块添加到段落中. 接下来创建一个图画并将其添加到可运行块R中. 最后我们将内联 + * 对象添加到图画中并返回段落对象. + * + * @param inline 包含图片的内联对象. + * @return 包含图片的段落 + */ + private static P addInlineImageToParagraph(Inline inline) { + // 添加内联对象到一个段落中 + ObjectFactory factory = new ObjectFactory(); + P paragraph = factory.createP(); + R run = factory.createR(); + paragraph.getContent().add(run); + Drawing drawing = factory.createDrawing(); + run.getContent().add(drawing); + drawing.getAnchorOrInline().add(inline); + return paragraph; + } + + /** + * This method alters the default style sheet that is part of each document. + *

+ * To do this, we first retrieve the style sheet from the package and then get + * the Styles object from it. From this object, we get the list of actual styles + * and iterate over them. We check against all styles we want to alter and apply + * the alterations if applicable. + * + * @param + */ + public static void alterStyleSheet() { + StyleDefinitionsPart styleDefinitionsPart = wordMLPackage.getMainDocumentPart().getStyleDefinitionsPart(); + Styles styles = null; + try { + styles = styleDefinitionsPart.getContents(); + } catch (Docx4JException e) { + e.printStackTrace(); + } + + List