1. Walk the folder and collect the .doc files. The python-docx package can only open .docx files, so the legacy .doc files must be found first and then converted to .docx.
import os


def list_files_doc(path):
    """Collect every legacy Word ``.doc`` file found under *path*.

    The directory tree rooted at *path* is walked recursively. Each match
    is printed as it is found, and the full list is printed at the end.

    Args:
        path: Directory to search.

    Returns:
        A list of paths, one per ``.doc`` file, each joined with the
        directory the file actually lives in. Empty if nothing matches.
    """
    files_doc = []
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Word leaves "~$name.doc" owner/lock files next to open
            # documents; they are not real documents.
            if filename.startswith("~$"):
                continue
            # splitext inspects only the final extension, so names with no
            # dot or with several dots are classified correctly.
            if os.path.splitext(filename)[1].lower() != ".doc":
                continue
            print(filename)
            # Join with the directory being walked, not the root, so files
            # in sub-folders get a path that really exists.
            files_doc.append(os.path.join(dirpath, filename))
    print("List of files in doc format: {}".format(files_doc))
    return files_doc


if __name__ == "__main__":
    list_files_doc("E:\\python_myfile\\read_excel")
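2. Convert the .doc files to .docx files (this step drives Microsoft Word through COM automation, so it requires Windows with Word and the pywin32 package installed)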
import os

from win32com import client as wc

# Word's WdSaveFormat value for the Open XML document format (.docx).
WD_FORMAT_XML_DOCUMENT = 12


def doc2docx(doc_files):
    """Convert legacy ``.doc`` files to ``.docx`` using Microsoft Word.

    Each input file is opened in a Word instance started through COM and
    saved next to the original with a ``.docx`` extension. The original
    ``.doc`` file is left in place.

    Args:
        doc_files: Iterable of paths to ``.doc`` files.

    Returns:
        A list with the path of every ``.docx`` file that was written.
    """
    converted = []
    word = wc.Dispatch("Word.Application")  # start (or attach to) Word
    try:
        for doc_file in doc_files:
            # Hand Word an absolute path: it runs in its own process and
            # does not share Python's working directory.
            source = os.path.abspath(doc_file)
            target = os.path.splitext(source)[0] + ".docx"
            doc = word.Documents.Open(source)
            try:
                doc.SaveAs(target, WD_FORMAT_XML_DOCUMENT)
            finally:
                # Close the document even if saving failed so that Word
                # does not keep the file locked.
                doc.Close()
            converted.append(target)
    finally:
        # Always shut Word down; otherwise a failed conversion leaves an
        # invisible WINWORD process running.
        word.Quit()
    print("The doc file is converted to docx completed")
    return converted


if __name__ == "__main__":
    doc2docx(["E:\\python_myfile\\read_excel\\user1 information.doc"])
3. Read the table information from each .docx file
import os

from docx import Document


def get_data_from_docx_files(path):
    """Extract the name and sex fields from every ``.docx`` file under *path*.

    The directory tree is walked recursively. For each document, the first
    table is read: the text of cell (row 0, column 1) is taken as the name
    and cell (row 0, column 3) as the sex. Documents that contain no table
    are reported and skipped.

    Args:
        path: Directory to search.

    Returns:
        A list of ``{"name": ..., "sex": ...}`` dicts, one per document
        that was read.

    Note:
        A first table with fewer than four columns makes ``table.cell``
        fail; the expected form layout is assumed, not validated.
    """
    print(path)
    data = []
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Skip Word's "~$name.docx" owner/lock files; they are not
            # valid documents and cannot be opened.
            if filename.startswith("~$"):
                continue
            if os.path.splitext(filename)[1].lower() != ".docx":
                continue
            # Open via the directory being walked; a bare filename would
            # only resolve if the process happened to run from that folder.
            full_path = os.path.join(dirpath, filename)
            document = Document(full_path)
            tables = document.tables
            if not tables:
                print("No table found in {}, skipped".format(full_path))
                continue
            table = tables[0]
            name = table.cell(0, 1).text
            sex = table.cell(0, 3).text
            info = {"name": name, "sex": sex}
            print(info)
            data.append(info)
    return data


if __name__ == "__main__":
    get_data_from_docx_files("E:\\python_myfile\\read_excel")
4. Export the information to an Excel (.xls) sheet
import xlwt


def output_excel(header, data, result_excel, keys=("name", "sex")):
    """Write *data* to an ``.xls`` workbook with a header row.

    Args:
        header: Column titles, written left to right in the first row.
        data: Iterable of dicts, one per output row.
        result_excel: Path of the workbook file to create.
        keys: Dict keys to write for each row, in column order. The
            default reproduces the original two-column name/sex layout.

    Raises:
        KeyError: If a row dict lacks one of *keys*.
    """
    # A Workbook object is the in-memory equivalent of one Excel file.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    # "test" is the sheet name; cell_overwrite_ok=True allows a cell to be
    # written more than once (the default, False, forbids it).
    sheet = book.add_sheet("test", cell_overwrite_ok=True)

    # Header goes in row 0.
    for col, title in enumerate(header):
        sheet.write(0, col, title)

    # Data rows start at row 1, directly below the header.
    for row, val in enumerate(data, start=1):
        print(val)
        for col, key in enumerate(keys):
            sheet.write(row, col, val[key])

    book.save(result_excel)


if __name__ == "__main__":
    output_excel(
        ["name", "gender"],
        [
            {"name": "Danny", "sex": "female"},
            {"name": "Merry", "sex": "male"},
        ],
        "results.xls",
    )