去除pdf转word后空格和长空格，统一处理格式

飞熊 • 2023年8月8日下午8:42 • Python • 阅读 392

# encoding: utf-8
'''
@author: JHC 
@license: None
@contact: JHC000abc@gmail.com
@file: test2.py
@time: 2022/08/21/ 17:52
@desc:去除pdf转word后 空格和长空格，统一处理格式
'''
import os
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml.ns import qn
from docx.shared import Pt, Inches



def get_file_list(path):
    '''
    Collect the paths of all Word documents under a directory tree.

    A file is selected when its extension is ``.doc`` or ``.docx``; the
    comparison ignores case, so ``REPORT.DOCX`` is selected as well.
    :param path: root directory that is walked recursively
    :return: list of paths, each being the containing directory joined
             with the file name; empty when nothing matches
    '''
    wanted_exts = (".doc", ".docx")
    file_lis = []
    for folder, _subdirs, names in os.walk(path):
        for name in names:
            # splitext only reports a real extension, so a file that is
            # literally named "doc" or "docx" is not mistaken for a document.
            if os.path.splitext(name)[1].lower() in wanted_exts:
                file_lis.append(os.path.join(folder, name))
    return file_lis

def _clean_paragraph_texts(texts):
    '''
    Strip tab characters and drop redundant blank paragraphs.

    Tabs are removed before the blank check, so a paragraph that held
    nothing but tabs is treated as blank instead of surviving as an empty
    line in the output. Only the first blank paragraph is kept; it acts as
    the separator between the leading (centred) part and the rest.
    :param texts: iterable of paragraph text strings in document order
    :return: list of cleaned texts containing at most one empty string
    '''
    cleaned = []
    blank_kept = False
    for text in texts:
        text = text.replace("\t", "")
        if text:
            cleaned.append(text)
        elif not blank_kept:
            blank_kept = True
            cleaned.append(text)
    return cleaned


def _write_formatted_doc(texts, out_file):
    '''
    Write the given paragraph texts to a new document with uniform styling.

    Paragraphs before the first blank entry are centred, later ones are
    left-aligned. A blank entry is written as an empty, unstyled paragraph.
    :param texts: list of paragraph texts as produced by _clean_paragraph_texts
    :param out_file: path the new document is saved to
    :return:
    '''
    new_doc = Document()
    before_first_blank = True
    for text in texts:
        para = new_doc.add_paragraph()
        if not text:
            before_first_blank = False
            continue

        run = para.add_run(text)
        # Typeface: SimSun, also written to the East Asian font slot so the
        # Chinese characters use it.
        run.font.name = u"宋体"
        run._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
        run.font.size = Pt(12)

        fmt = para.paragraph_format
        fmt.line_spacing = Pt(20)
        # No extra space before or after the paragraph.
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)
        # First-line indent of twice the font size (two character widths).
        fmt.first_line_indent = run.font.size * 2
        if before_first_blank:
            fmt.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        else:
            fmt.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
    new_doc.save(out_file)


def read_txt_lis(file_lis):
    '''
    Rewrite each document with redundant blank lines removed and a
    uniform format applied.

    1. Read the paragraph texts of the source document, strip tab
       characters and keep only the first blank paragraph.
    2. Write the texts to a new document with fixed font, spacing, indent
       and alignment, saved under the same file name in a "res" directory
       next to the source file.
    :param file_lis: list of paths of the documents to process
    :return:
    '''
    for file in file_lis:
        src_dir, file_name = os.path.split(file)
        out_path = os.path.join(src_dir, "res")
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(out_path, exist_ok=True)

        source_doc = Document(file)
        texts = _clean_paragraph_texts(p.text for p in source_doc.paragraphs)
        _write_formatted_doc(texts, os.path.join(out_path, file_name))

if __name__ == '__main__':
    import sys

    # Root directory to process: the first command-line argument when one
    # is given, otherwise the original hard-coded default.
    path = sys.argv[1] if len(sys.argv) > 1 else R"E:\Desktop\第一组"
    file_lis = get_file_list(path)
    read_txt_lis(file_lis)