Docxtohtml

Posted on 2022-07-03

由于工作中遇到将office的word合并将合并的word转成Html5的任务。

需要处理的相关问题如下：

按照给定的excel将指定的多个word文档按照顺序合并
将合并后的word文档按照word中的字体、样式转化，并将word中的图片base64处理
将上述过程整理成一个简单小程序，如GUI程序，便于资源组直接进行处理

工具清单

Tool	DESC	Link
python=3.7	base code	https://docs.python.org/3.7/
base64	img base64转码	https://docs.python.org/3.7/library/base64.html?highlight=base64#module-base64
tkinter	pyth0n GUI	https://docs.python.org/3.7/library/tkinter.html?highlight=tkinter#module-tkinter
docx=0.2.4	加载word文档	https://pypi.org/project/docx/
docxcompose=1.3.4	操作word文档	https://pypi.org/project/docxcompose/
pywin32=304	office的python宏调用包	https://pypi.org/project/pywin32/
lmxl=4.9.0	解析html的包	https://pypi.org/project/lxml/

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# DESC：将docx转化成html

import os
import base64
import shutil
import tkinter as tk
from tkinter import filedialog
from lxml import etree
from win32com import client as wc
from docx import Document
from docxcompose.composer import Composer

代码清单

合并word文档[主要针对docx]

def merge_doc(docx_list: list, docx_tar: str, docx_list_src: str):
    """
    docx_list: word文档名称列表 ["word1.docx", "word2.docx"]
    docx_tar: 合并后word文档存储目标目录
    docx_list_src: 需要合并word文档的源文件目录
    """
    if len(docx_list) == 0:
        return
    # 将第一个word文档作为合并基文档
    base_docx = os.path.join(docx_list_src, docx_list[0])
    tar_doc = Document(base_docx)
    tar_composer = Composer(tar_doc)
    # 合并后续文档
    for next_doc in docx_list[1:]:
        next_doc_path = os.path.join(docx_list_src, next_doc)
        tar_composer.append(Document(next_doc_path))
    # 存储合并文档    
    docx_tar_file = os.path.join(docx_tar, "introduce.docx")
    tar_composer.save(docx_tar_file)

word文档转Html

def convert_docx_html_pywin(docx_file_path: str) -> str:
    """
    docx_file_path: word原文档
    """
    # 调用windows的word的宏
    word = wc.Dispatch('Word.Application')
    doc = word.Documents.Open(docx_file_path)
    html_file_path = os.path.join(os.path.dirname(docx_file_path),
                                  os.path.basename(docx_file_path).split(".")[0] + ".html")
    # 选择 10 单个另存为方式
    doc.SaveAs(html_file_path, 10)
    doc.Close()
    word.Quit()

    # 转化编码， 去掉byte编码转存utf8格式
    fp = open(html_file_path, "rb")
    html_gb = fp.read()
    fp.close()
    fp = open(html_file_path, "wb")
    fp.write(html_gb.decode("gbk").replace("charset=gb2312", "charset=utf-8").encode("utf-8"))
    fp.close()
    return html_file_path

将图片转成base64编码

def convert_pic_base64(img_path: str) -> str:
    """
    img_path: 图片路径
    return： 返回base64编码串
    """
    fp = open(img_path, "rb")
    img_content = fp.read()
    img_base64 = base64.b64encode(img_content)

    img_base64_str = str(img_base64, "utf-8") # b'' 转utf8字符串
    img_base64_tmp = f"data:image/{os.path.splitext(img_path)[-1][1:]};base64,{img_base64_str}"
    return img_base64_tmp

将转化后的html中的图片换成base64

def process_html_img_tag(html_tar_file: str):
    """
    html_tar_file: word转存html的原始文件
    """
    # 当前转化的图片路径
    dir_path = os.path.dirname(html_tar_file)
    # 将html中img的src进行修改
    html_content = etree.parse(html_tar_file, etree.HTMLParser())
    tmp_files = []
    for img_tag in html_content.xpath("//img"):
        if img_tag.attrib["src"] == "":
            continue
        src_img_path = os.path.join(dir_path, img_tag.attrib["src"])
        tmp_files.append(os.path.dirname(src_img_path))
        src_img_b64code = convert_pic_base64(src_img_path)
        img_tag.attrib["src"] = src_img_b64code

    # 去除转换后html中存在的system url和public id的信息
    html_content.docinfo.system_url = None
    html_content.docinfo.public_id = None
    # 将lxml处理后的字节html内容转存utf8的html文件
    f = open(html_tar_file, "w", encoding="utf-8")
    # method=html 去除非html的信息
    f.write(etree.tostring(html_content, encoding="utf-8", method="html", pretty_print=True).decode("utf-8"))
    f.close()

    for p in tmp_files:
        shutil.rmtree(p)

# word转html总成
def convert(docx_path: str):
    html_path = convert_docx_html_pywin(docx_path)
    process_html_img_tag(html_path)
    return html_path.replace("\\", "/")

基于tkinter的简单GUI设计

window = tk.Tk()
window.title("Word转Html工具")
window.geometry('600x300')

# 设置输入 1 2
path_var = tk.StringVar()
entry = tk.Entry(window, textvariable=path_var, width=60)
entry.place(x=20, y=10, anchor='nw')

path_var1 = tk.StringVar()
entry1 = tk.Entry(window, textvariable=path_var1, width=60)
entry1.place(x=20, y=60, anchor='nw')


def choose_file_click():
    """
    选择文件事件定义
    """
    # 设置可以选择的文件类型，不属于这个类型的，无法被选中
    file_types = [("word文件", "*.docx")]
    file_name = filedialog.askopenfilename(title='选择单个文件',
                                           filetypes=file_types,
                                           initialdir='./')  # 打开当前程序工作目录
    path_var.set(file_name)


def convert_file_click():
    """
    word转html事件定义
    """
    src_file_path = path_var.get()
    convert_fil_path = convert(src_file_path)
    path_var1.set(convert_fil_path)


def clear_clik():
    """
    清空所有事件定义
    """
    path_var.set("")
    path_var1.set("")


choose_btn = tk.Button(window, text='选择', command=choose_file_click)
choose_btn.place(x=450, y=10, anchor='nw')
convert_btn = tk.Button(window, text='转化', command=convert_file_click)
convert_btn.place(x=450, y=60, anchor='nw')
clear_btn = tk.Button(window, text='清空', command=clear_clik)
clear_btn.place(x=450, y=100, anchor='nw')

window.mainloop()

word合并以及转html测试

`word合并`效果

if __name__ == '__main__':
    docs = ["xx.docx", "yy.docx"]
    tar = r""
    src = r""
    merge_doc(docs, tar, src)

word docx src 1 word docx src 2 word docx merge result

合并效果中/docx 2/中字体样式存在部分合并失效。目前未有很好的解决方法。

如果您有好的解决办法，请留言，不胜感激！！

`docx转html效果`

word docx

docx转化后的html文件

converted html result

<p class="MsoNormal" style="text-align:justify;text-justify:inter-ideograph;line-height:normal">
    <span style='font-size:10.5pt;font-family:"Rounded Mplus 1c"'>
        <img width="601" height="159" id="image1.png" src="data:image/gif;base64,R0lGODlhWQKfAHcAMSH...+GcAQEAOw==">
    </span>
</p>

`GUI`效果以及测试

gui start result gui worked result