杞人的优雅 杞人的优雅

用python将doc文件转换成docx文件

in 默认分类read (772) 站长qwertyuildy 文章转载请注明来源!
import os 
from win32com import client as wc
# COM automation object for Microsoft Word, obtained through win32com;
# the conversion code below opens and saves documents through it.
word = wc.Dispatch('Word.Application')
# Directory where the converted .docx files are saved
docxPath = 'D:\\wyj\\技术文档\\python-doc文件转换成docx文件\\docx'
# Directory containing the source .doc files
path = "D:\\wyj\\技术文档\\python-doc文件转换成docx文件"
'''
获取doc文件名,创建docx文件路径
'''
def getFileName(path, docxDir=None):
    """Return the names of the ``.doc`` files located directly inside *path*.

    Only the top level of *path* is inspected (sub-directories are not
    searched), because the converter later opens each file as
    ``path + file name``. As a side effect the output directory is created
    when it does not exist yet, and the resulting list is printed.

    Args:
        path: Directory that holds the source ``.doc`` files.
        docxDir: Directory that will receive the converted files. Defaults
            to the module-level ``docxPath`` setting.

    Returns:
        A list of bare file names (no directory part) whose extension is
        ``.doc``, compared case-insensitively. The list is empty when *path*
        does not exist or contains no such file.
    """
    if docxDir is None:
        # Looked up at call time so the module-level setting stays the default.
        docxDir = docxPath

    # os.walk yields nothing for a missing/unreadable directory; the first
    # item it yields describes the top level of *path*.
    topLevel = next(os.walk(path), None)
    if topLevel is None:
        return []

    # Create the output directory for the .docx files if needed.
    os.makedirs(docxDir, exist_ok=True)

    # splitext looks at the LAST dot only, so "report.v2.doc" is accepted and
    # names without any extension are skipped instead of raising IndexError.
    filesList = [
        file for file in topLevel[2]
        if os.path.splitext(file)[1].lower() == '.doc'
    ]
    print(filesList)
    return filesList
'''
将doc文件转换成docx文件
'''
def docToDocx(fileNameList, wordApp=None, srcDir=None, docxDir=None):
    """Convert each named ``.doc`` file to ``.docx`` through Word automation.

    Every document is opened, saved under the same base name with a
    ``.docx`` extension in the output directory, and then closed before the
    next one is processed. An empty list is a no-op.

    Args:
        fileNameList: Bare file names (no directory part) to convert.
        wordApp: Word application automation object exposing
            ``Documents.Open``. Defaults to the module-level ``word``.
        srcDir: Directory containing the source files. Defaults to the
            module-level ``path``.
        docxDir: Directory receiving the converted files. Defaults to the
            module-level ``docxPath``.

    Raises:
        Any error raised by Word while opening or saving propagates to the
        caller; the document that was open at that moment is still closed.
    """
    # Fall back to the module-level settings, looked up at call time.
    if wordApp is None:
        wordApp = word
    if srcDir is None:
        srcDir = path
    if docxDir is None:
        docxDir = docxPath

    for fileName in fileNameList:
        print("开始处理     文件名:"+fileName)
        doc = wordApp.Documents.Open(os.path.join(srcDir, fileName))
        try:
            # Swap the extension instead of slicing a fixed number of
            # characters off the end of the name.
            baseName = os.path.splitext(fileName)[0]
            docxNamePath = os.path.join(docxDir, baseName + '.docx')
            # 12 is Word's save-format code for .docx.
            doc.SaveAs(docxNamePath, 12, False, "", True, "", False, False, False, False)
            # Report success only after the save actually happened.
            print('转换完成!'+docxNamePath)
        finally:
            # Close every document, not just the last one, otherwise the
            # files stay locked by Word.
            doc.Close()
# Script entry: collect the .doc file names under `path`, convert them, and
# always quit the Word application afterwards, even if a step raises.
try:
    fileNameList = getFileName(path)
    docToDocx(fileNameList)
finally:
    word.Quit()

1.将doc转为docx
在 Python 3.8 中使用 win32com 需要先安装 pypiwin32:pip install pypiwin32

from win32com import client as wc

word = wc.Dispatch("Word.Application")
doc = word.Documents.Open(路径 + '名称.doc')
doc.SaveAs(路径 + '名称.docx', 12)  # 12 表示 docx 格式(wdFormatXMLDocument)
doc.Close()
word.Quit()

2.读取段落

from docx import Document
docStr = Document(docName)  # 打开文档
for paragraph in docStr.paragraphs:
    parStr = paragraph.text
    # paragraph.style.name == 'Heading 1'  一级标题
    # paragraph.paragraph_format.alignment == 1  居中显示
    # paragraph.style.next_paragraph_style.paragraph_format.alignment == 1  下一段居中显示
    # paragraph.style.font.color

3.读取表格

numTables = docStr.tables
for table in numTables:
    # 行列个数
    row_count = len(table.rows)
    col_count = len(table.columns)
    for i in range(row_count):
        row = table.rows[i].cells
        # i行j列内容:row[j].text

或者:

row_count = len(table.rows)
col_count = len(table.columns)
for i in range(row_count):
    for j in range(col_count):
        print(table.cell(i,j).text)

4.按样式读取
读取标题

for p in doc.paragraphs:
    if p.style.name=='Heading 1':
        print(p.text)
import re
for p in doc.paragraphs:
    if re.match(r"^Heading \d+$",p.style.name):
        print(p.text)

读取正文

for p in doc.paragraphs:
    if p.style.name=='Normal':
        print(p.text)

获取docx支持的样式

from docx.enum.style import WD_STYLE_TYPE
s = doc.styles  # 文档中的全部样式
for i in s:
    if i.type==WD_STYLE_TYPE.PARAGRAPH:
        print(i.name)

5.获取文字格式信息
paragraph 对象 里还有更小的 run 对象,run 对象才包含了段落对象的文字信息。
paragraph.text 方法也是通过 run 对象的方法获取到文字信息的:

paragraph.text 方法源码:

def text(self):
    """Return the paragraph's text: the text of every run, concatenated in order.

    Reads ``self.runs`` and the ``text`` attribute of each run; returns an
    empty string when there are no runs.
    """
    # Kept as an explicit loop to stay faithful to the quoted source; only
    # the inconsistent indentation (a syntax error) is repaired.
    text = ''
    for run in self.runs:
        text += run.text
    return text

文字的字体、大小、下划线等信息都包含在 run 对象中(不清楚的看前面的博客):

获取段落的 run 对象列表

runs = par0.runs
print(runs)

获取 run 对象

run_0 = runs[0]
print(run_0.text) # 获取 run 对象文字信息

打印结果:
坚持因地制宜,差异化打造特色小镇,
文档 段落 和 run 对象示意:
获取文字格式信息:

# 获取文字格式信息
print('字体名称:',run_0.font.name)
# 字体名称: 宋体
print('字体大小:',run_0.font.size)
# 字体大小: 152400
print('是否加粗:',run_0.font.bold)
# 是否加粗: None
print('是否斜体:',run_0.font.italic)
# 是否斜体: True
print('字体颜色:',run_0.font.color.rgb)
# 字体颜色: FF0000
print('字体高亮:',run_0.font.highlight_color)
# 字体高亮: YELLOW (7)
print('下划线:',run_0.font.underline)
# 下划线: True
print('删除线:',run_0.font.strike)
# 删除线: None
print('双删除线:',run_0.font.double_strike)
# 双删除线: None
print('下标:',run_0.font.subscript)
# 下标: None
print('上标:',run_0.font.superscript)
# 上标: None
LIK2

6.设置首行缩进

from docx.shared import Inches,Pt
par2 = doc.add_paragraph('段落文本')
# 左缩进,0.5 英寸
par2.paragraph_format.left_indent = Inches(0.5)
# 右缩进,20 磅
par2.paragraph_format.right_indent = Pt(20)
# 首行缩进
par2.paragraph_format.first_line_indent = Inches(1)

查看首行缩进单位

from docx import Document
from docx.shared import Inches
from docx.oxml.ns import qn

from docx.shared import Cm, Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

from docx.shared import RGBColor

myDocument = Document('2020年建交集团3月分析报告.docx')

for paragraph in myDocument.paragraphs:
    print(paragraph.paragraph_format.first_line_indent)
    print(dir(paragraph))
jrotty WeChat Pay

微信打赏

jrotty Alipay

支付宝打赏

文章二维码

扫描二维码,在手机上阅读!

默认分类
最后由qwertyuildy修改于2022-10-07 15:07

此处评论已关闭

前篇 后篇
雷姆
拉姆