Tag Archives: Python Word to Excel

Python: How to Batch-Read Form Information from Word Documents and Export It to an Excel File

1. Walk through all the files in the folder and pick out the .doc files (the Python package python-docx can only open .docx files, so the .doc files have to be identified first and then converted to .docx)

import os

def list_files_doc(path):
    """Collect the legacy ``.doc`` files found under *path*.

    The directory tree is walked recursively with ``os.walk``. The name of
    each match is printed as it is found, and the complete list is printed
    once the walk has finished.

    Args:
        path: Directory to search.

    Returns:
        A list with the path of every file whose extension is ``.doc``
        (compared case-insensitively). ``.docx`` files are not included.
    """
    files_doc = []
    for root, _dirs, names in os.walk(path):
        for file in names:
            # splitext copes with names that contain no dot or several dots,
            # where indexing the result of split(".") would either raise
            # IndexError or pick the wrong piece.
            if os.path.splitext(file)[1].lower() == ".doc":
                print(file)
                # Join with the directory currently being walked rather than
                # the top-level path, so files in sub-folders get a path that
                # actually exists.
                files_doc.append(os.path.join(root, file))

    print(' List of files in doc format: {} '.format(files_doc))
    return files_doc


# Demo entry point: list the .doc files in the sample folder.
if __name__ == "__main__":
    list_files_doc("E:\\python_myfile\\read_excel")

 

2. Convert .doc files to .docx files

from win32com import client as wc   #Import module

def doc2docx(doc_files):
    """Convert Word ``.doc`` files to ``.docx`` through the Word COM interface.

    Every file is opened in Word and saved again under its original path
    with an ``x`` appended (``report.doc`` becomes ``report.docx``). A
    completion message is printed at the end.

    Args:
        doc_files: List of paths to the ``.doc`` files to convert.
    """
    # Only launch Word when there is something to convert.
    if doc_files:
        word = wc.Dispatch("Word.Application")  # Open the Word application
        try:
            for doc_file in doc_files:
                doc = word.Documents.Open(doc_file)  # Open the Word file
                try:
                    # Save as a file with the suffix ".docx"; the second
                    # argument, 12, is the file-format code for docx.
                    doc.SaveAs("{}x".format(doc_file), 12)
                finally:
                    # Close the original document even if saving failed.
                    doc.Close()
        finally:
            # Always shut Word down so a failed conversion does not leave
            # the application running in the background.
            word.Quit()
    print("The doc file is converted to docx completed ")

# Demo entry point: convert a single sample document.
if __name__ == "__main__":
    doc2docx(["E:\\python_myfile\\read_excel\\user1 information.doc"])

 

 

3. Read table information from the .docx files

import os

from docx import Document

def get_data_from_docx_files(path):
    """Read the form table of every ``.docx`` file found under *path*.

    The directory tree is walked recursively with ``os.walk``. For each
    ``.docx`` file the first table is read: the text of cell (0, 1) is
    stored as the name and the text of cell (0, 3) as the sex. Each record
    is printed as it is extracted.

    Args:
        path: Directory to search.

    Returns:
        A list with one dict per document that contains a table, holding
        the keys ``" name "`` and ``" sex "``. Documents without any table
        are skipped.
    """
    print(path)
    data = []
    for root, _dirs, names in os.walk(path):
        for file in names:
            # splitext is safe for names with no dot or with several dots.
            if os.path.splitext(file)[1].lower() != ".docx":
                continue

            # Open the file from the directory being walked; the bare file
            # name would only resolve if that directory were the CWD.
            document = Document(os.path.join(root, file))  # Read in the file
            tables = document.tables  # Get the table set in the file
            if not tables:
                # Nothing to extract from a document without tables.
                continue

            table = tables[0]
            name = table.cell(0, 1).text
            sex = table.cell(0, 3).text
            # NOTE(review): the keys carry surrounding spaces; they are kept
            # unchanged because consumers look records up by these exact keys.
            info = {" name ": name, " sex ": sex}
            print(info)
            data.append(info)
    return data

# Demo entry point: extract the form data from the sample folder.
if __name__ == "__main__":
    get_data_from_docx_files("E:\\python_myfile\\read_excel")

 

4. Export the information to an Excel sheet

import xlwt

def output_excel(header, data, result_excel):
    """Write the extracted records to an ``.xls`` workbook.

    The header labels go into the first row, one per column. Every record
    then occupies one row below it, with the name in the first column and
    the sex in the second. Each record is printed as it is written.

    Args:
        header: Sequence of column titles for the first row.
        data: Sequence of dicts with the keys ``' name '`` and ``' sex '``.
        result_excel: Path the workbook is saved to.
    """
    # Create a Workbook object, which is equivalent to creating an Excel file.
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    # cell_overwrite_ok indicates whether a cell may be overwritten; it is a
    # parameter of Worksheet instantiation and defaults to False.
    sheet = book.add_sheet(' test ', cell_overwrite_ok=True)

    # Write header
    for col, title in enumerate(header):
        sheet.write(0, col, title)

    # Write content, starting on the second row
    for row, val in enumerate(data, start=1):
        print(val)
        # NOTE(review): the keys carry surrounding spaces; they are kept
        # unchanged because callers pass records built with these exact keys.
        sheet.write(row, 0, val[' name '])  # first column
        sheet.write(row, 1, val[' sex '])  # second column

    book.save(result_excel)

# Demo entry point: write two sample records to results.xls.
if __name__ == "__main__":
    output_excel(
        [' name ', ' gender '],
        [
            {' name ': ' Danny ', ' sex ': ' female '},
            {' name ': ' Merry ', ' sex ': ' male '},
        ],
        'results.xls',
    )