问题:需要将大量已有的word文档导入到web项目中并在网站上展示,逐篇手工编辑录入不现实,因此通过程序批量处理。
解决思路:通过读取word文档处理成html,再获取html富文本内容,拼接成sql,导入数据库。
要点
1、程序会递归读取指定的word文件夹,只要该文件夹(含子文件夹)下有word文档即可;程序中有过滤非word文件的代码,可根据需要修改;
2、可同时处理word2003和word2007+版本的word文档;
3、由word2007文档生成的html中,中文是unicode编码的,存入数据库或用浏览器直接打开,都不影响页面显示;
4、对word文档中图片做了处理,存储到单独的文件夹,导入mysql或其它数据库后,正确显示图片,要注意路径处理;
5、如果只处理word2007文档,可以不生成html而直接获取富文本内容,但2003版本不可以;因此统一将word文档都生成html页面,再用java读取html文档,获取body元素下的富文本内容。
不足之处,欢迎交流和指正。
可能用到的pom如下(代码中还用到了commons-lang的StringUtils和junit的@Test,如项目中没有需另行引入):
<!--poi处理word文档-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.15</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.xdocreport.document</artifactId>
<version>2.0.1</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.core</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
<version>1.0.6</version>
</dependency>
完整的java代码如下:
package test;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts Word documents to HTML and generates SQL INSERT statements for them.
 *
 * <p>Both the binary Word 97-2003 format ({@code .doc}) and the OOXML format
 * ({@code .docx}) are handled. Embedded pictures are extracted into a separate,
 * randomly named directory per document. Run {@link #handleWordToSql()} to process
 * every Word document below the configured source directory.
 *
 * <p>All input and output locations are hard-coded Windows paths; adjust the
 * constants below before running on another machine.
 *
 * @author ls 2019/3/19
 */
public class Word2Html {

    /** Charset of the generated Word 2003 HTML files and of the SQL file. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /** Directory that is scanned recursively for Word documents. */
    private static final String WORD_ROOT = "F:\\word文档\\";

    /** Directory below which one picture directory per document is created. */
    private static final String IMAGE_ROOT = "F:\\images\\";

    /** Directory that receives the generated HTML files. */
    private static final String HTML_ROOT = "F:\\html\\";

    /** Replacement for {@link #IMAGE_ROOT} inside the generated SQL. */
    private static final String IMAGE_URL_PREFIX = "/images/";

    /** File that receives the generated INSERT statements. */
    private static final String SQL_FILE = "F:\\articleSql.sql";

    /**
     * Prefix that {@link #readHtml(File)} inserts in front of image sources.
     * NOTE(review): this does not match IMAGE_ROOT, so pictures of .doc files are
     * probably not resolvable as-is - confirm the intended access path.
     */
    private static final String DEFAULT_IMAGE_DIRECTORY = "D:/test";

    /** Primary key used for the first generated INSERT statement. */
    private static final int FIRST_ARTICLE_ID = 161;

    /** Matches the characters removed by {@link #replaceBreak(String)}. */
    private static final Pattern BREAKS = Pattern.compile("[\\t\\r\\n]");

    /** Matches the start of an image tag including the whitespace that follows it. */
    private static final Pattern IMG_TAG = Pattern.compile("<img\\s+");

    /**
     * Converts every Word document below {@link #WORD_ROOT} to HTML and writes one
     * INSERT statement per successfully converted document to {@link #SQL_FILE}.
     *
     * <p>A document that fails to convert is reported and skipped so that one broken
     * file does not abort the whole batch.
     */
    @Test
    public void handleWordToSql() {
        List<String> fileNames = new ArrayList<>();
        getAllFileName(WORD_ROOT, fileNames);
        Map<String, String> wordFiles = handleFileName(fileNames);

        // Insertion-ordered so that ids are assigned in the order the files were processed.
        Map<String, String> contentsMap = new LinkedHashMap<>();
        for (Map.Entry<String, String> wordFile : wordFiles.entrySet()) {
            String name = wordFile.getKey();
            String fullPath = wordFile.getValue();
            String lowerName = name.toLowerCase(Locale.ROOT);
            String articleName = name.substring(0, name.lastIndexOf('.'));
            String htmlFile = HTML_ROOT + articleName + ".html";
            // Each document gets its own picture directory to avoid name clashes.
            String imageDir = IMAGE_ROOT + generateImageName() + "\\";

            String content = "";
            try {
                if (lowerName.endsWith(".docx")) {
                    content = word2007ToHtml(imageDir, fullPath, htmlFile);
                } else if (lowerName.endsWith(".doc")) {
                    content = word2003ToHtml(imageDir, fullPath, htmlFile);
                } else {
                    System.out.println("word文档格式不正确->" + name);
                }
            } catch (TransformerException | IOException | ParserConfigurationException e) {
                e.printStackTrace();
            } catch (RuntimeException e) {
                // The converters throw unchecked exceptions for corrupt or mislabeled files.
                System.out.println("word文档格式不正确->" + name);
                e.printStackTrace();
            }
            if (StringUtils.isNotBlank(content)) {
                contentsMap.put(articleName, content);
            }
        }
        System.out.println("文章总条数: " + contentsMap.size());
        handleInsertSql(contentsMap);
    }

    /**
     * Builds one INSERT statement per article and writes them all to {@link #SQL_FILE}.
     *
     * <p>Occurrences of {@link #IMAGE_ROOT} in the content are replaced by
     * {@link #IMAGE_URL_PREFIX}. Title and content are escaped with backslashes
     * (MySQL string-literal syntax).
     *
     * @param contentsMap article title mapped to its HTML content
     * @return the generated SQL, one statement per line
     */
    public String handleInsertSql(Map<String, String> contentsMap) {
        StringBuilder sql = new StringBuilder();
        int id = FIRST_ARTICLE_ID;
        for (Map.Entry<String, String> article : contentsMap.entrySet()) {
            String content = article.getValue() == null ? "" : article.getValue();
            // Rewrite paths before escaping; escaping doubles the backslashes of IMAGE_ROOT.
            content = content.replace(IMAGE_ROOT, IMAGE_URL_PREFIX);
            sql.append("INSERT INTO `article` VALUES (")
                    .append(id)
                    .append(", '2019-03-19 14:24:56', '2019-03-19 10:53:56', '0', '网络文章', \"")
                    .append(escapeSql(content))
                    .append("\", '0', '', '', null, null, null, '")
                    .append(escapeSql(article.getKey()))
                    .append("', '1', null, '0', null)")
                    .append(" ;")
                    .append("\r\n");
            id++;
        }
        String data = sql.toString();
        writeFile(new StringBuffer(data));
        return data;
    }

    /**
     * Escapes backslashes and both quote characters with a backslash so the value can
     * be embedded in a quoted SQL string literal.
     */
    private static String escapeSql(String value) {
        if (value == null) {
            return "";
        }
        return value.replace("\\", "\\\\").replace("'", "\\'").replace("\"", "\\\"");
    }

    /**
     * Removes every entry that is not a Word document from {@code list} and maps the
     * remaining paths by file name.
     *
     * <p>Only names ending in {@code .doc} or {@code .docx} (case-insensitive) are kept;
     * names starting with {@code ~$} (Word lock files) are dropped. Documents with the
     * same file name in different directories share one key, so the last one wins.
     *
     * @param list full paths; modified in place
     * @return file name mapped to full path, in list order
     */
    public Map<String, String> handleFileName(List<String> list) {
        Map<String, String> map = new LinkedHashMap<>();
        if (list == null || list.isEmpty()) {
            System.out.println("没有需要处理的文件!");
            return map;
        }
        list.removeIf(path -> !isWordFile(path));
        for (String path : list) {
            map.put(fileNameOf(path), path);
        }
        return map;
    }

    /** Tells whether the last path segment names a .doc or .docx file that is not a lock file. */
    private static boolean isWordFile(String path) {
        if (path == null) {
            return false;
        }
        String name = fileNameOf(path).toLowerCase(Locale.ROOT);
        return !name.startsWith("~$") && (name.endsWith(".doc") || name.endsWith(".docx"));
    }

    /** Returns the part of {@code path} after the last slash or backslash. */
    private static String fileNameOf(String path) {
        int separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        return path.substring(separator + 1);
    }

    /**
     * Collects the full paths of all files and directories below {@code path}, recursively.
     *
     * <p>Entries of a directory are added before the entries of its sub-directories.
     * Nothing is added when {@code path} does not denote a readable directory.
     *
     * @param path directory to scan, with or without a trailing separator
     * @param listFileName receives the collected paths
     */
    public void getAllFileName(String path, List<String> listFileName) {
        File[] children = new File(path).listFiles();
        if (children == null) {
            return;
        }
        boolean hasSeparator = path.endsWith("\\") || path.endsWith("/");
        String prefix = hasSeparator ? path : path + File.separator;
        for (File child : children) {
            listFileName.add(prefix + child.getName());
        }
        for (File child : children) {
            if (child.isDirectory()) {
                getAllFileName(child.getAbsolutePath() + File.separator, listFileName);
            }
        }
    }

    /**
     * Converts a Word 97-2003 document to HTML.
     *
     * <p>The HTML is written to {@code outPutFile} as UTF-8, the pictures are saved to
     * {@code imgPath}, and the content of the HTML body is returned with tabs and line
     * breaks removed.
     *
     * @param imgPath directory for the extracted pictures; created when missing
     * @param fileName full path of the .doc file
     * @param outPutFile full path of the HTML file to write
     * @return the body content as produced by {@link #readHtml(File)}
     * @throws TransformerException if the HTML cannot be serialized
     * @throws IOException if the document cannot be read or a picture cannot be written
     * @throws ParserConfigurationException if no DOM document can be created
     */
    public String word2003ToHtml(String imgPath, String fileName, String outPutFile)
            throws TransformerException, IOException, ParserConfigurationException {
        WordToHtmlConverter converter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        // Pictures are referenced by their suggested name; the files are saved below.
        converter.setPicturesManager(
                (content, pictureType, suggestedName, widthInches, heightInches) -> suggestedName);

        try (InputStream in = new FileInputStream(fileName)) {
            HWPFDocument wordDocument = new HWPFDocument(in);
            converter.processDocument(wordDocument);
            savePictures(wordDocument, imgPath);
        }

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        Document htmlDocument = converter.getDocument();
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // The serializer emitted UTF-8 bytes, so they must be decoded as UTF-8.
        writeFile(new String(out.toByteArray(), UTF_8), outPutFile);
        return replaceBreak(readHtml(new File(outPutFile)));
    }

    /**
     * Writes every picture of the document into {@code imgPath}. A picture whose file
     * cannot be opened is reported and skipped.
     */
    private void savePictures(HWPFDocument wordDocument, String imgPath) throws IOException {
        List<Picture> pictures = wordDocument.getPicturesTable().getAllPictures();
        if (pictures == null || pictures.isEmpty()) {
            return;
        }
        File directory = new File(imgPath);
        // A failure here surfaces as FileNotFoundException when the first file is opened.
        directory.mkdirs();
        for (Picture picture : pictures) {
            File target = new File(directory, picture.suggestFullFileName());
            try (OutputStream pictureOut = new FileOutputStream(target)) {
                picture.writeImageContent(pictureOut);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Converts a Word 2007+ document to an XHTML fragment.
     *
     * <p>The fragment is written to {@code htmlName} and returned. Pictures are
     * extracted below {@code imgPath}.
     *
     * @param imgPath directory for the extracted pictures
     * @param fileName full path of the .docx file
     * @param htmlName full path of the HTML file to write
     * @return the XHTML fragment, or an empty string when the file is missing, empty
     *         or not a .docx file
     * @throws IOException if the document cannot be read or the HTML cannot be written
     */
    public String word2007ToHtml(String imgPath, String fileName, String htmlName) throws IOException {
        File source = new File(fileName);
        if (!source.exists()) {
            System.out.println("Sorry File does not Exists!");
            return "";
        }
        if (!source.getName().toLowerCase(Locale.ROOT).endsWith(".docx")) {
            System.out.println("Enter only MS Office 2007+ files");
            return "";
        }
        if (source.length() == 0) {
            return "";
        }

        // Convert once into memory; the same bytes go to the file and to the caller.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try (InputStream in = new FileInputStream(source)) {
            // 1) Load the Word document.
            XWPFDocument document = new XWPFDocument(in);
            // 2) Configure where pictures are stored and how they are referenced.
            File imageFolderFile = new File(imgPath);
            XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
            options.setExtractor(new FileImageExtractor(imageFolderFile));
            options.setIgnoreStylesIfUnused(false);
            options.setFragment(true);
            // 3) Convert to XHTML.
            XHTMLConverter.getInstance().convert(document, buffer, options);
        }

        File htmlFile = new File(htmlName);
        ensureParentDirectory(htmlFile);
        try (OutputStream out = new FileOutputStream(htmlFile)) {
            buffer.writeTo(out);
        }
        // NOTE(review): decoded with the platform charset because the converter's
        // output encoding is not visible here - confirm before switching to UTF-8.
        return buffer.toString();
    }

    /**
     * Returns the lines between the {@code <body} line and the {@code </body} line of
     * a UTF-8 HTML file, prefixing image sources with {@code D:/test}.
     *
     * @param file HTML file to read
     * @return the body lines, each followed by a line feed; see {@link #readHtml(File, String)}
     */
    public String readHtml(File file) {
        return readHtml(file, DEFAULT_IMAGE_DIRECTORY);
    }

    /**
     * Returns the lines between the {@code <body} line and the {@code </body} line of
     * a UTF-8 HTML file.
     *
     * <p>The lines containing the opening and closing body tag are dropped entirely.
     * In every image tag whose first attribute is {@code src}, {@code imageDirectory}
     * is inserted directly after the opening quote of the attribute value.
     *
     * @param file HTML file to read
     * @param imageDirectory text to put in front of each image source; null means none
     * @return the body lines, each followed by a line feed; whatever was read so far
     *         when reading fails, and an empty string when no body is found
     */
    public String readHtml(File file, String imageDirectory) {
        StringBuilder body = new StringBuilder();
        if (file == null) {
            return body.toString();
        }
        String directory = imageDirectory == null ? "" : imageDirectory;
        try (BufferedReader htmlReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), UTF_8))) {
            String line;
            // Skip everything up to and including the line that opens the body.
            while ((line = htmlReader.readLine()) != null) {
                if (line.toLowerCase(Locale.ROOT).contains("<body")) {
                    break;
                }
            }
            while ((line = htmlReader.readLine()) != null) {
                String lowerCaseLine = line.toLowerCase(Locale.ROOT);
                if (lowerCaseLine.contains("</body")) {
                    break;
                }
                if (lowerCaseLine.contains("src")) {
                    line = prefixImageSources(line, directory);
                }
                body.append(line).append("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return body.toString();
    }

    /**
     * Inserts {@code directory} after the first five characters (expected to be
     * {@code src="}) of every segment following an image tag start, and normalizes
     * the whitespace after {@code <img} to a single space.
     */
    private static String prefixImageSources(String line, String directory) {
        String[] segments = IMG_TAG.split(line);
        for (int i = 0; i < segments.length; i++) {
            String segment = segments[i];
            // Five characters cover the attribute name, the equals sign and the quote.
            if (segment.length() >= 5 && segment.toLowerCase(Locale.ROOT).startsWith("src")) {
                segments[i] = segment.substring(0, 5) + directory + segment.substring(5);
            }
        }
        return String.join("<img ", segments);
    }

    /**
     * Removes all tabs, carriage returns and line feeds.
     *
     * @param str text to clean; may be null
     * @return the cleaned text, or an empty string for null
     */
    public String replaceBreak(String str) {
        return str == null ? "" : BREAKS.matcher(str).replaceAll("");
    }

    /**
     * Writes {@code content} to {@code path} as UTF-8, replacing an existing file.
     * Missing parent directories are created. I/O failures are reported, not thrown.
     *
     * @param content text to write
     * @param path full path of the target file
     */
    public static void writeFile(String content, String path) {
        try {
            writeUtf8(content, new File(path));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the generated SQL to {@link #SQL_FILE} as UTF-8, replacing an existing
     * file. I/O failures are reported, not thrown.
     *
     * @param sb SQL text to write
     */
    public static void writeFile(StringBuffer sb) {
        try {
            writeUtf8(sb.toString(), new File(SQL_FILE));
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        System.out.println("sql文件写入完成");
    }

    /** Writes {@code content} to {@code file} as UTF-8, creating parent directories first. */
    private static void writeUtf8(String content, File file) throws IOException {
        ensureParentDirectory(file);
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), UTF_8))) {
            writer.write(content);
        }
    }

    /** Creates the parent directory of {@code file} when it does not exist yet. */
    private static void ensureParentDirectory(File file) {
        File parent = file.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.isDirectory()) {
            parent.mkdirs();
        }
    }

    /**
     * Generates a random six-digit number.
     *
     * @return a number between 100000 and 999999 as text
     */
    public static String randomCode() {
        int num = (int) ((Math.random() * 9 + 1) * 100000);
        return String.valueOf(num);
    }

    /**
     * Generates a random name of the form {@code <current millis>_<six random digits>}.
     *
     * @return the generated name
     */
    public static String generateImageName() {
        return System.currentTimeMillis() + "_" + randomCode();
    }
}
运行handleWordToSql测试方法即可!
效果图:
原word文档
处理后的html
对应的html图片文件夹
生成的sql