Word 文档(Docx)合并、转 Html5 并内嵌 Base64 图片的 Python GUI 工具

Word 文档(Docx)合并、转 Html5 并内嵌 Base64 图片的 Python GUI 工具

工作中遇到一个需求:将多份 Office Word 文档按指定顺序合并,再统一转为 HTML5 格式。

具体需要解决以下三个问题:

  1. 按照给定的 Excel 列表,将多个 Word 文档按顺序合并
  2. 合并后保留原文档的字体和样式,并将文档中的图片转为 Base64 内嵌
  3. 将整套流程封装为简单的 GUI 程序,方便资源组直接使用

工具清单

Tool 说明 Link
python=3.7 基础语言 https://docs.python.org/3.7/
base64 图片 Base64 转码 https://docs.python.org/3.7/library/base64.html?highlight=base64#module-base64
tkinter Python GUI 框架 https://docs.python.org/3.7/library/tkinter.html?highlight=tkinter#module-tkinter
docx=0.2.4 加载 Word 文档(注意:代码中的 `from docx import Document` 实际由 python-docx 包提供,安装 docxcompose 时会作为依赖一并安装) https://pypi.org/project/docx/
docxcompose=1.3.4 合并操作 Word 文档 https://pypi.org/project/docxcompose/
pywin32=304 通过 COM 接口调用 Office(Word)的 Python 包 https://pypi.org/project/pywin32/
lxml=4.9.0 解析 HTML https://pypi.org/project/lxml/
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# 将 docx 转化为 html

import os
import base64
import shutil
import tkinter as tk
from tkinter import filedialog
from lxml import etree
from win32com import client as wc
from docx import Document
from docxcompose.composer import Composer

代码清单

合并 Word 文档(主要针对 docx)

def merge_doc(docx_list: list, docx_tar: str, docx_list_src: str,
              docx_tar_name: str = "introduce.docx"):
    """Merge several Word documents, in order, into a single .docx file.

    The first entry of ``docx_list`` is loaded as the base document and every
    following entry is appended to it with ``Composer.append``.

    Args:
        docx_list: File names of the documents to merge, in merge order,
            e.g. ``["word1.docx", "word2.docx"]``.
        docx_tar: Directory the merged document is saved into.
        docx_list_src: Directory that contains the files named in
            ``docx_list``.
        docx_tar_name: File name of the merged document. Defaults to
            ``"introduce.docx"``, the name that used to be hard-coded.

    Returns:
        The path of the saved merged document, or ``None`` when
        ``docx_list`` is empty (nothing is written in that case).
    """
    if not docx_list:
        return None

    first_name, *remaining_names = docx_list

    # The first document is the base that all the others are appended to.
    base_docx = os.path.join(docx_list_src, first_name)
    tar_composer = Composer(Document(base_docx))

    # Append the remaining documents in the order given.
    for next_name in remaining_names:
        next_doc_path = os.path.join(docx_list_src, next_name)
        tar_composer.append(Document(next_doc_path))

    # Save the merged result and tell the caller where it went.
    docx_tar_file = os.path.join(docx_tar, docx_tar_name)
    tar_composer.save(docx_tar_file)
    return docx_tar_file

Word 文档转 Html

def convert_docx_html_pywin(docx_file_path: str) -> str:
    """Convert a Word document to an HTML file stored next to it.

    Word is driven through COM (``win32com``) to save the document as HTML;
    the saved file is then re-encoded from GBK to UTF-8 in place.

    Args:
        docx_file_path: Path of the source Word document.

    Returns:
        Absolute path of the generated ``.html`` file.

    Raises:
        UnicodeDecodeError: If the file Word produced is not valid GBK. The
            HTML file is left exactly as Word wrote it in that case.
    """
    # The file access is done by a separate Word process, so hand it absolute
    # paths instead of paths relative to this process's working directory.
    docx_file_path = os.path.abspath(docx_file_path)
    # splitext drops only the final extension, so "report.v2.docx" maps to
    # "report.v2.html" rather than being cut at the first dot.
    html_file_path = os.path.splitext(docx_file_path)[0] + ".html"

    word = wc.Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(docx_file_path)
        try:
            # 10 is wdFormatFilteredHTML in Word's WdSaveFormat enumeration.
            doc.SaveAs(html_file_path, 10)
        finally:
            doc.Close()
    finally:
        # Always shut Word down, even if opening or saving failed, so no
        # orphaned Word process is left behind.
        word.Quit()

    # Re-encode GBK -> UTF-8. Decode fully before reopening for writing so a
    # decode error cannot leave a truncated file behind.
    with open(html_file_path, "rb") as fp:
        html_gbk = fp.read()
    html_utf8 = (html_gbk.decode("gbk")
                 .replace("charset=gb2312", "charset=utf-8")
                 .encode("utf-8"))
    with open(html_file_path, "wb") as fp:
        fp.write(html_utf8)
    return html_file_path

图片转 Base64 编码

def convert_pic_base64(img_path: str) -> str:
    """Return the image at ``img_path`` as a Base64 ``data:`` URI.

    The MIME subtype is derived from the lower-cased file extension, with a
    small mapping for extensions that differ from their registered subtype
    (for example ``.jpg`` -> ``image/jpeg``).

    Args:
        img_path: Path of the image file to embed.

    Returns:
        A string of the form ``data:image/<subtype>;base64,<payload>``.
    """
    # Extensions whose MIME subtype is not simply the extension itself.
    subtype_by_ext = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff", "svg": "svg+xml"}
    ext = os.path.splitext(img_path)[-1][1:].lower()
    subtype = subtype_by_ext.get(ext, ext)

    # The context manager closes the file even if reading fails.
    with open(img_path, "rb") as fp:
        img_base64_str = base64.b64encode(fp.read()).decode("ascii")

    return f"data:image/{subtype};base64,{img_base64_str}"

替换 Html 中的图片为 Base64

def process_html_img_tag(html_tar_file: str):
    """Inline every local image of an HTML file as a Base64 ``data:`` URI.

    The file is parsed with lxml, each ``<img src>`` that points at a local
    file is replaced by the result of ``convert_pic_base64``, the document is
    written back as UTF-8, and the image directories that were inlined are
    removed afterwards.

    Args:
        html_tar_file: Path of the HTML file to rewrite in place.
    """
    # Image paths in the HTML are resolved relative to the HTML file.
    dir_path = os.path.dirname(html_tar_file)
    html_content = etree.parse(html_tar_file, etree.HTMLParser())

    # A set, because many images usually share one directory and each
    # directory can only be removed once.
    img_dirs = set()
    for img_tag in html_content.xpath("//img"):
        src = img_tag.attrib.get("src", "")
        # Skip tags without a source and sources that are not local files.
        if not src or src.lower().startswith(("data:", "http:", "https:")):
            continue
        src_img_path = os.path.join(dir_path, src)
        img_tag.attrib["src"] = convert_pic_base64(src_img_path)
        img_dirs.add(os.path.dirname(src_img_path))

    # Drop the system URL and public ID from the document's doctype.
    html_content.docinfo.system_url = None
    html_content.docinfo.public_id = None

    # Serialize with method="html" and write the result back as UTF-8 text.
    serialized = etree.tostring(html_content, encoding="utf-8", method="html",
                                pretty_print=True).decode("utf-8")
    with open(html_tar_file, "w", encoding="utf-8") as f:
        f.write(serialized)

    html_dir = os.path.abspath(dir_path)
    for img_dir in img_dirs:
        # Never delete the directory that holds the HTML file itself; that
        # would happen for an image stored right next to the HTML.
        if os.path.abspath(img_dir) == html_dir:
            continue
        shutil.rmtree(img_dir)


# Word -> HTML pipeline entry point
def convert(docx_path: str):
    """Convert a Word document to HTML with inlined images.

    Runs ``convert_docx_html_pywin`` on ``docx_path``, then
    ``process_html_img_tag`` on the generated file.

    Returns:
        The path of the generated HTML file with every backslash replaced by
        a forward slash.
    """
    generated_html = convert_docx_html_pywin(docx_path)
    process_html_img_tag(generated_html)
    path_parts = generated_html.split("\\")
    return "/".join(path_parts)

基于 tkinter 的 GUI 设计

# Main application window with a fixed title and a 600x300 initial size.
window = tk.Tk()
window.title("Word转Html工具")
window.geometry('600x300')

# Entry 1: path of the source file (backed by path_var)
path_var = tk.StringVar()
entry = tk.Entry(window, textvariable=path_var, width=60)
entry.place(x=20, y=10, anchor='nw')

# Entry 2: path of the conversion result (backed by path_var1)
path_var1 = tk.StringVar()
entry1 = tk.Entry(window, textvariable=path_var1, width=60)
entry1.place(x=20, y=60, anchor='nw')


def choose_file_click():
    """Let the user pick a .docx file and store its path in ``path_var``.

    If the dialog is cancelled, the path that is already in the entry is
    left untouched.
    """
    file_types = [("word文件", "*.docx")]
    file_name = filedialog.askopenfilename(title='选择单个文件',
                                           filetypes=file_types,
                                           initialdir='./')
    # The dialog yields an empty value when cancelled; do not wipe a
    # previously selected path in that case.
    if file_name:
        path_var.set(file_name)


def convert_file_click():
    """Convert the file named in ``path_var`` and show the result path.

    On success the generated HTML path is written to ``path_var1``. When no
    source path has been entered, or the conversion raises, a message box is
    shown and ``path_var1`` is left unchanged.
    """
    # Local import: only this handler needs message boxes.
    from tkinter import messagebox

    src_file_path = path_var.get().strip()
    if not src_file_path:
        messagebox.showwarning("提示", "请先选择要转换的 Word 文件")
        return

    try:
        convert_file_path = convert(src_file_path)
    except Exception as exc:
        # GUI callback boundary: without this, Tk only prints the traceback
        # to stderr and the user sees nothing happen.
        messagebox.showerror("转换失败", str(exc))
        return

    path_var1.set(convert_file_path)


def clear_click():
    """Reset both path entries (source and result) to empty strings."""
    for entry_var in (path_var, path_var1):
        entry_var.set("")


# Three buttons, each bound to its click handler through `command`.
choose_btn = tk.Button(window, text='选择', command=choose_file_click)
choose_btn.place(x=450, y=10, anchor='nw')
convert_btn = tk.Button(window, text='转化', command=convert_file_click)
convert_btn.place(x=450, y=60, anchor='nw')
clear_btn = tk.Button(window, text='清空', command=clear_click)
clear_btn.place(x=450, y=100, anchor='nw')

# Enter the Tk event loop.
window.mainloop()

Word 合并及转 Html 测试

合并效果

if __name__ == '__main__':
    # Merge demo: the file names and both directories are placeholders to be
    # filled in before running.
    docs = ["xx.docx", "yy.docx"]
    tar = src = r""
    merge_doc(docx_list=docs, docx_tar=tar, docx_list_src=src)

word docx 源文件 1 word docx 源文件 2 word docx 合并结果

注意:在合并结果中,源文件 2(docx 2)的字体样式存在部分失效的情况,目前尚无很好的解决方法。如果您有好的建议,欢迎留言,不胜感激!

Docx 转 Html 效果

word docx 原文件

转化后的 Html 文件:

转化后的 Html 结果

<p class="MsoNormal" style="text-align:justify;text-justify:inter-ideograph;line-height:normal">
    <span style='font-size:10.5pt;font-family:"Rounded Mplus 1c"'>
        <img width="601" height="159" id="image1.png" src="data:image/gif;base64,R0lGODlhWQKfAHcAMSH...+GcAQEAOw==">
    </span>
</p>

GUI 效果及测试

GUI 启动界面 GUI 运行结果

avatar
hzzhu