Docxtohtml.
由于工作中遇到将office的word合并将合并的word转成Html5
的任务。
需要处理的相关问题如下:
- 按照给定的excel将指定的多个word文档按照顺序合并
- 将合并后的word文档按照word中的字体、样式转化,并将word中的图片base64处理
- 将上述过程整理成一个简单小程序,如GUI程序,便于资源组直接进行处理
工具清单
Tool | DESC | Link |
---|---|---|
python=3.7 | base code | https://docs.python.org/3.7/ |
base64 | img base64转码 | https://docs.python.org/3.7/library/base64.html?highlight=base64#module-base64 |
tkinter | pyth0n GUI | https://docs.python.org/3.7/library/tkinter.html?highlight=tkinter#module-tkinter |
docx=0.2.4 | 加载word文档 | https://pypi.org/project/docx/ |
docxcompose=1.3.4 | 操作word文档 | https://pypi.org/project/docxcompose/ |
pywin32=304 | office的python宏调用包 | https://pypi.org/project/pywin32/ |
lmxl=4.9.0 | 解析html的包 | https://pypi.org/project/lxml/ |
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# DESC:将docx转化成html
import os
import base64
import shutil
import tkinter as tk
from tkinter import filedialog
from lxml import etree
from win32com import client as wc
from docx import Document
from docxcompose.composer import Composer
代码清单
合并word文档[主要针对docx]
def merge_doc(docx_list: list, docx_tar: str, docx_list_src: str):
"""
docx_list: word文档名称列表 ["word1.docx", "word2.docx"]
docx_tar: 合并后word文档存储目标目录
docx_list_src: 需要合并word文档的源文件目录
"""
if len(docx_list) == 0:
return
# 将第一个word文档作为合并基文档
base_docx = os.path.join(docx_list_src, docx_list[0])
tar_doc = Document(base_docx)
tar_composer = Composer(tar_doc)
# 合并后续文档
for next_doc in docx_list[1:]:
next_doc_path = os.path.join(docx_list_src, next_doc)
tar_composer.append(Document(next_doc_path))
# 存储合并文档
docx_tar_file = os.path.join(docx_tar, "introduce.docx")
tar_composer.save(docx_tar_file)
word文档转Html
def convert_docx_html_pywin(docx_file_path: str) -> str:
"""
docx_file_path: word原文档
"""
# 调用windows的word的宏
word = wc.Dispatch('Word.Application')
doc = word.Documents.Open(docx_file_path)
html_file_path = os.path.join(os.path.dirname(docx_file_path),
os.path.basename(docx_file_path).split(".")[0] + ".html")
# 选择 10 单个另存为方式
doc.SaveAs(html_file_path, 10)
doc.Close()
word.Quit()
# 转化编码, 去掉byte编码转存utf8格式
fp = open(html_file_path, "rb")
html_gb = fp.read()
fp.close()
fp = open(html_file_path, "wb")
fp.write(html_gb.decode("gbk").replace("charset=gb2312", "charset=utf-8").encode("utf-8"))
fp.close()
return html_file_path
将图片转成base64编码
def convert_pic_base64(img_path: str) -> str:
"""
img_path: 图片路径
return: 返回base64编码串
"""
fp = open(img_path, "rb")
img_content = fp.read()
img_base64 = base64.b64encode(img_content)
img_base64_str = str(img_base64, "utf-8") # b'' 转utf8字符串
img_base64_tmp = f"data:image/{os.path.splitext(img_path)[-1][1:]};base64,{img_base64_str}"
return img_base64_tmp
将转化后的html中的图片换成base64
def process_html_img_tag(html_tar_file: str):
"""
html_tar_file: word转存html的原始文件
"""
# 当前转化的图片路径
dir_path = os.path.dirname(html_tar_file)
# 将html中img的src进行修改
html_content = etree.parse(html_tar_file, etree.HTMLParser())
tmp_files = []
for img_tag in html_content.xpath("//img"):
if img_tag.attrib["src"] == "":
continue
src_img_path = os.path.join(dir_path, img_tag.attrib["src"])
tmp_files.append(os.path.dirname(src_img_path))
src_img_b64code = convert_pic_base64(src_img_path)
img_tag.attrib["src"] = src_img_b64code
# 去除转换后html中存在的system url和public id的信息
html_content.docinfo.system_url = None
html_content.docinfo.public_id = None
# 将lxml处理后的字节html内容转存utf8的html文件
f = open(html_tar_file, "w", encoding="utf-8")
# method=html 去除非html的信息
f.write(etree.tostring(html_content, encoding="utf-8", method="html", pretty_print=True).decode("utf-8"))
f.close()
for p in tmp_files:
shutil.rmtree(p)
# word转html总成
def convert(docx_path: str):
html_path = convert_docx_html_pywin(docx_path)
process_html_img_tag(html_path)
return html_path.replace("\\", "/")
基于tkinter的简单GUI设计
window = tk.Tk()
window.title("Word转Html工具")
window.geometry('600x300')
# 设置输入 1 2
path_var = tk.StringVar()
entry = tk.Entry(window, textvariable=path_var, width=60)
entry.place(x=20, y=10, anchor='nw')
path_var1 = tk.StringVar()
entry1 = tk.Entry(window, textvariable=path_var1, width=60)
entry1.place(x=20, y=60, anchor='nw')
def choose_file_click():
"""
选择文件事件定义
"""
# 设置可以选择的文件类型,不属于这个类型的,无法被选中
file_types = [("word文件", "*.docx")]
file_name = filedialog.askopenfilename(title='选择单个文件',
filetypes=file_types,
initialdir='./') # 打开当前程序工作目录
path_var.set(file_name)
def convert_file_click():
"""
word转html事件定义
"""
src_file_path = path_var.get()
convert_fil_path = convert(src_file_path)
path_var1.set(convert_fil_path)
def clear_clik():
"""
清空所有事件定义
"""
path_var.set("")
path_var1.set("")
choose_btn = tk.Button(window, text='选择', command=choose_file_click)
choose_btn.place(x=450, y=10, anchor='nw')
convert_btn = tk.Button(window, text='转化', command=convert_file_click)
convert_btn.place(x=450, y=60, anchor='nw')
clear_btn = tk.Button(window, text='清空', command=clear_clik)
clear_btn.place(x=450, y=100, anchor='nw')
window.mainloop()
word合并以及转html测试
word合并
效果
if __name__ == '__main__':
docs = ["xx.docx", "yy.docx"]
tar = r""
src = r""
merge_doc(docs, tar, src)
合并效果中/docx 2/中字体样式存在部分合并失效。目前未有很好的解决方法。
如果您有好的解决办法,请留言,不胜感激!!
docx转html效果
docx转化后的html文件
<p class="MsoNormal" style="text-align:justify;text-justify:inter-ideograph;line-height:normal">
<span style='font-size:10.5pt;font-family:"Rounded Mplus 1c"'>
<img width="601" height="159" id="image1.png" src="data:image/gif;base64,R0lGODlhWQKfAHcAMSH...+GcAQEAOw==">
</span>
</p>
GUI
效果以及测试