#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#word2html.py, written by Rusty Japikse, at Concepts NREC (3/06).
#This code is available as open source under the LGPL license
#
#About:
#   word2html.py is a python based (duh!) script to convert MS Word files to clean html files.
#   This script was developed for a document conversion program in order to place the documents
#   in Plone, a Content Management System.
#
#   This script accepts a list of input files and converts all the word files to HTML, saving them
#   on a user's desktop in a folder named HTML. Images are placed within a sub-folder, which is
#   unsurprisingly named images. Each file's images are placed in their own folder; this folder
#   is named FILE_NAME_images (where FILE_NAME is the name of the file being converted). Image urls
#   are rewritten and unneeded files produced by MS Word are deleted. A number of configuration options
#   are available, allowing for selective formatting of certain file types (refer to the
#   CONFIGURATION VARIABLES and read the source code).
#
#   One of the neat things about this script is that all of the garbage formatting that is applied
#   to a MS Word HTML document is stripped out through regular expressions, leaving behind just the
#   HTML tags. Empty paragraphs and the like are also removed.
#   Also, the converted file is saved with a lowercase, no-blank name (spaces are mapped to an
#   underscore e.g. 'my homepage.doc' > my_homepage.htm)
#
#   Also, if you really want, it should be easy to configure it to automatically upload certain
#   documents to a ftp server; this option is turned off, but perusing the source code for
#   AUTO_FTP should get you there very quickly. It is used this way with an intranet site.
#
#   As this script is open source there are no guarantees; if it breaks things, you get to
#   keep the pieces!
#
#Installation:
#   If you have python installed, then there is little that you need to do in order to install this
#   script - however, this script can be made much more functional through the use of one of two
#   hacks that allow it to accept dragged and dropped files, even directories of dragged and dropped
#   files for document conversion.
#   1st Approach - convert this script to a stand alone executable. I have had the best success with
#       Pyinstaller (http://pyinstaller.hpcf.upr.edu/). Windows passes dragged and dropped files as a list
#       of pathnames to the program, allowing everything to work swimmingly. Even better, you can then
#       distribute this script to computers without python installed. Or, place the script on a network
#       share and distribute a link - same difference, but easier deployment and upgrading.
#   2nd Approach - Make your python file be recognized as an executable by Windows (requires a small
#       registry hack). You can download the appropriate registry keys from http://www.japikse.com/resources/
#       Note: this approach only passes short pathnames to the converter, mangling the document's title
#       in some instances (this might be different on WinXP, I'm presently working on Win2k).
#
#Usage:
#   1. Set the configuration variables
#   2. Read the help file / documentation at http://www.japikse.com/resources/scripts/word2html.html
#   3. (Test then...) Deploy!
##############################CONFIGURATION VARIABLES####################################################################################
#HELP_URL is the help page that the script opens when it is started without any input files.
#Set this to an internal help file, None, or leave it as is
#("http://www.japikse.com/resources/scripts/word2html.html").
#Double clicking on the word-to-html.bat file will bring up this help; if None is set, then very
#basic help will be displayed.
HELP_URL='http://www.japikse.com/resources/scripts/word2html.html'
######################################################################################################################################### import re, sys, os, win32com.client, os.path, shutil, time, md5, csv, webbrowser from sets import Set from stat import ST_MTIME, ST_CTIME class CleanHTML: """Upon initialization, this class runs a series of regular expresions to clean up MS Word HTML into basic stripped down html. Additional methods reformat image links, return a list of images used in the document, and conditionally insert a company logo at the top of a particular document type (internal memorandums - controlled by the INSERT_IM_LOGO variable).""" def __init__(self,dst_img_dir=None,html=None): #variables to be overriden self.dst_img_dir=dst_img_dir self.html=html def pareCSS(self): #note: many of the regular expressions below were copied from #the Word Unmunger (http://freshmeat.net/projects/wordunmunger/) #That code had the following copyright notification: # # Copyright (c) 2003 Luke Francl # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # Using pre gets around potential recursion limit problems, but doesn't work with Ynicode self.pre_regexp() regexp=[(r"",""),(r"",''), (r"",''), (r"",""), (r"",''), (r"",''), (r"",''), (r"",''), (r"",''), (r"",''), (r"",''), (r"\s*?style='.*?'",''), (r"\s*?class=[A-Za-z0-9]*",''), (r"",''), (r"",''), (r"",''), (r"",''), (r"",''), (r"",''), (r' v:shapes=".*?"',''), (r'',''), (r'(?<=<.{2}) width=\d{1,4}(?=.*?>)',''), (r' cellspacing=\d{1,2}',''), (r' cellpadding=\d{1,2}',''), (r' ',' '), (r'\xd7','x'), (r'\x93','"'), (r'\x92',"'"), (r'\x94','"'), (r'\x99','TM'), (r'\xa0|\xd2|\x96',''), (r'û',' '), (r'',''), (r'',''), (r'<[ph12345]{1,2}[^>]*>\s*',''), (r']*>',''), (r'[\r\n][\r\n\t]*[\r\n]','\n'), (r'
',''), (r']*>',''), (r'\xb7','•')] self.regexp_run(regexp) self.post_regexp() def pre_regexp(self): #hook to allow for regular expressions to be run before all of the MS Word styles are stripped pass def post_regexp(self): #hook to allow for regular expressions to be run after all of the MS Word styles are stripped pass def regexp_run(self,regexp): #execute a list of regular expressions (e.g. [(regexp 1,substitute 1), (regexp 2,substitute 2), ...] for i in regexp: r=re.compile(i[0], re.DOTALL) self.html=re.sub(r,i[1],self.html) def __ol_list(self): #turn MS Word faked lists into html lists #

\d{1,2}(.*?)

pass def __ul_list(self): #turn MS Word faked lists into html lists #

\d{1,2}(.*?)

pass def fix_img_links(self): """change the image links to point to their new location""" if self.dst_img_dir: self.regexp_run([(r'src="./.*?_files','src="images/%s' %os.path.basename(self.dst_img_dir))]) class ploneFTP: def connect(self,username, password): import ftplib self.ftp = ftplib.FTP("find") self.ftp.login(username, password) def putDOC(self, doc_path=None, remote_parent=None): """copy an internal memorandum to the intranet server""" if doc_path: self.out_path=doc_path if remote_parent: self.put_path=remote_parent doc_name=os.path.basename(self.out_path) tmp=os.path.split(os.path.splitext(self.out_path)[0]) self.loc_p=os.path.join(tmp[0],'images',tmp[1]+'_images') self.rem_img=self.put_path+'/images' self.ftp.cwd(self.put_path) try: self.ftp.delete(self.put_path+'/'+doc_name) except: pass print 'uploading', doc_name, 'to the intranet server' self.ftp.storlines('STOR %s' %doc_name, open(os.path.join(self.out_path),'r')) self.__putImgDir() def __mk_target_dir(self): #make the target directory, removing any previous instances rem_d=os.path.basename(self.loc_p) self.rem_d=os.path.join(self.rem_img,rem_d).replace('\\','/') nlist=self.ftp.nlst(self.rem_img) if nlist.count(rem_d): self.__rm_dir() self.ftp.mkd(self.rem_d) def __putImgDir(self): #copy an directory and it's contents (to a depth of one only) to the server if os.path.isdir(self.loc_p): self.__mk_target_dir() files=os.listdir(self.loc_p) self.ftp.cwd(self.rem_d) for f in files: self.ftp.storbinary('STOR %s' %f, open(os.path.join(self.loc_p,f),'rb')) def __rm_dir(self): #remove a given directory and any files it contains (works to a depth of 1) nlist=self.ftp.nlst(self.rem_d) nlist.remove('.'); nlist.remove('..') for i in nlist: self.ftp.delete(os.path.join(self.rem_d,i).replace('\\','/')) self.ftp.rmd(self.rem_d) class FTP(ploneFTP): def connect(self, cnt=0): self.target() username, password = self.__get_password() try: ploneFTP.connect(self, username, password) except: cnt+=1 print '\n\nYour username 
and/or password appear to be incorrect (try %i of 3)' %cnt if cnt<4: self.connect(cnt) else: print '\nUnable to FTP files to the intranet site, please transfer your files manually' def __get_password(self): #get the user's password username=os.environ['USERNAME'].lower() user_tmp=raw_input('Is your intranet username: %s? (return or type your username): ' %username) if len(user_tmp): username=user_tmp password=raw_input('\nPlease enter your password for the intranet: ') return username, password def target(self): """generate the target url and path based upon the file type being uploaded, this class is generally overriden by a document specific sub-class""" self.put_path='/Plone/Members/'+os.environ['USERNAME'].lower() #some default ftp server self.url='http://find/Members/%s/%s/document_view' %(os.environ['USERNAME'].lower(), self.fname_new) #url for the default ftp server class doc(CleanHTML): """Basic class used to handle the conversion of a single Word document to HTML""" def __init__(self,doc_path): CleanHTML.__init__(self) self.path=doc_path os_stat=os.stat(doc_path) self.m_time=time.asctime(time.gmtime((os_stat[ST_MTIME]))) self.c_time=time.asctime(time.gmtime((os_stat[ST_CTIME]))) self.fname_doc=os.path.basename(doc_path) self.fname_new=self.fname_doc.replace(' ','_').lower()[:-4]+'.htm' try: self.md5=md5.new(open(self.fname_doc,'rb').read()).hexdigest() except: self.md5='[md5 failed to generate - short pathname bug]' def WordSaveHTML(self,out_dir): """open the word document and save it as a html file""" self.out_path=os.path.join(out_dir,self.fname_new) self.out_dir=out_dir app = win32com.client.Dispatch("Word.Application") doc=app.Documents.Open(self.path) #get the document subject and category for possible later use self.subject=doc.BuiltInDocumentProperties('Subject').__str__() self.category=doc.BuiltInDocumentProperties('Category').__str__() try: doc.SaveAs(self.out_path, FileFormat=8, AddToRecentFiles=0) except: print 'MS Word failed to save', 
self.out_path doc.Close() self.__move_images() self.html=open(self.out_path,'r').read() def __move_images(self): #move the image directory and trash useless files included by MS Word src_img_dir=self.out_path[:-4]+'_files' if os.path.isdir(src_img_dir): self.dst_img_dir=os.path.join(self.out_dir,'images',self.fname_new[:-4]+'_images') if os.path.isdir(self.dst_img_dir): shutil.rmtree(self.dst_img_dir, ignore_errors=True) shutil.move(src_img_dir,self.dst_img_dir) def fixHTML(self): """clean and refomate the MS Word HTML - basically string together calls to the CleanHTML class""" self.pareCSS() self.fix_img_links() self.delete_files() def delete_files(self): """Delete all the unused files in the document's image directory, if this means that all files are deleted from a directory, then the directory is also removed as a cleanup measure.""" #generate a list of images used in the HTML (checks for jpg, png, tif, tiff, bmp, and gif) r=re.compile(r'(?P.*?)', re.DOTALL) m=re.search(r,self.html) self.html=m.group('body') tmpl=""" $TITLE $SOURCE_COMMENT$BODY """ tmpl=Template(tmpl) html_d={'BODY':self.html} html_d['FILENAME']=self.fname_doc src_c='''\n''' src_c=Template(src_c) src_tmp={'doc_name':self.fname_doc, 'date':time.asctime(), 'user_name':self.user, 'path':self.path, 'md5':self.md5} html_d['SOURCE_COMMENT']=src_c.substitute(src_tmp) html_d['DESCRIPTION']=self.__get_desc() html_d['TITLE']=self.fname_doc[0:-4].upper() html_d['CONTRIBUTORS']=self.contributors self.html=tmpl.substitute(html_d) def __get_desc(self): #prompt the user to input a document description if self.subject!='None' and len(self.subject.split())>=4: desc=raw_input('The current document description is:\n%s\n(hit return to use this description or enter your own below (min 4 words):\n' %self.subject) if not desc: desc=self.subject; print desc else: desc=raw_input('Please enter a short description for %s (min 4 words):\n\n' %self.fname_doc).strip() if len(desc.strip().split())<4: print 'Please make sure 
to have at least four words in your description, thanks.' self.__get_desc() print 'processing...' return desc def post_write(self): #log the file conversion log=[[time.asctime(), self.user, self.computer, self.path, self.c_time, self.m_time, self.md5]] log_path=r'\\rdj\public$\logs\wordtohtml_testing.log.csv' if not os.path.isfile(log_path): log.insert(0,['Conversion Date', 'User Name', 'Computer Name', 'MS Word File Path', 'MS Word file creation time', 'MS Word file last modification time', 'MS Word file md5']) writer=csv.writer(open(log_path,'ab')) for r in log: writer.writerow(r) def body_div_wrap(self,id_tag): #wrap the body of the document in a
...
tag self.html=self.html.replace('','\n\t
' %id_tag).replace('','\t
\n') class docIM(docPlone,FTP): """IM specfic class used to handle the conversion of an Internal Memorandum (MS Word) to HTML""" def __init__(self,doc_path): docPlone.__init__(self,doc_path) def post_regexp(self): """place the company standard image into an IM file and wrap the body contents in a
tag""" r=re.compile(r'(?<=
).*?INTERNAL\s*MEMORANDUM NO.\s{1,2}\d{1,4}
', re.DOTALL) if not re.findall(r,self.html): print """The formatting of this internal memorandum should have the title (e.g. INTERNAL MEMORANDUM NO. 1234) as a heading 5 in Microsoft Word""" else: r=re.compile(r'()(?=.*?INTERNAL\s*MEMORANDUM NO\.)', re.DOTALL) self.html=r.sub('',self.html) self.html=re.sub('','\n\n',self.html,count=1) self.body_div_wrap('cn') def post_write(self): """FTPs files to the intranet server and logs the document's conversion""" docPlone.post_write(self) #log this document conversion self.connect(); self.putDOC() webbrowser.open(self.url,1) def target(self): """generate the target url and path for an internal memorandum upload""" self.put_path='/Plone/Members/'+self.user #some default ftp server self.url='http://find/Members/%s/%s/document_view' %(self.user, self.fname_new) #url for the default ftp server class docTM(docPlone,FTP): """IM specfic class used to handle the conversion of an Technical Memorandum (MS Word) to HTML""" def __init__(self,doc_path): docPlone.__init__(self,doc_path) def pre_regexp(self): regexp=[(r'(?#swap out class for klass for a later re-substitution of class)','

'), (r']*>','

'), (r']*>','

'), (r']*>','

')] #remove centering on h1-h3 self.regexp_run(regexp) self.__remove_blank_images() def post_regexp(self): #replace klass with class and align paragraphs which only contain an image regexp=[(r'(?<=

)','class'), (r'

(?=]*>

)','

')] self.regexp_run(regexp) #wrap tables in a div align=center tag r=re.compile(r']*>.*?

',re.DOTALL) tables=r.findall(self.html) for i in tables: self.html=self.html.replace(i,'
\n'+i+'
') html_tmp=self.html try: #rewrite TMs from Penny's unique formatting approach into something more suitable self.__get_title() self.__title_info() self.__toc() self.__main_div() self.html=''+self.toc_div+self.ti_div+self.main_div+'' except: print '\nThere was an error processing this TM into the preset TM format.\nProcessing is defaulting back into a generic layout\n' self.html=html_tmp docPlone.post_regexp(self) self.body_div_wrap('cn_tm') def __remove_blank_images(self): """Test to see if the image is a white place holder image, also allows white place holders with narrow borders. This should accomodate the front office's crazy notions of page layout!""" import Image as pil files=os.listdir(self.dst_img_dir) files=[i for i in files if ['.jpg','.gif','.bmp','.tif','.png'].count(i.lower()[-4:])] removed_imgs=[] for i in files: pth=os.path.join(self.dst_img_dir,i) im=pil.open(pth) im_hist=im.histogram() if sum(im_hist)==sum(im_hist[0:5]): im=None os.remove(pth) removed_imgs.append(('<[^>]*%s[^>]*>' %i, '')) if len(removed_imgs): self.regexp_run(removed_imgs) def __get_title(self): #get the title for this TM m=re.search(r']*>([A-Z\s]*?)

',self.html,re.DOTALL) self.subject=' '.join(m.group(1).replace('\n',' ').split()).title() def __title_info(self): """pull the title and title information from the TM html string - this is used to generate the title div""" m=re.search(r'(?P<p[^>]*>TECHNICAL\s* MEMORANDUM[^\1]*?)(?=<[ph123]{1,2}[^>]*>[A-Z\s]+\.\s*\d{1,4}\s*</[ph123]{1,2}>)',self.html,re.DOTALL) #grab the title block self.ti_div='<div id="tm_title">\n'+m.group('title') self.ti_div+='</div>\n' def __toc(self): """pull the Table Of Contents from the TM html string""" m=re.search(r'<p[^>]*>(Table\s*of\s*Contents|TABLE\s*OF\s*CONTENTS)\s*</p>.*?(?=<p[^>]*>TECHNICAL\s*MEMORANDUM NO.\s*\d{1,4}</p>)',self.html,re.DOTALL) self.toc_div=m.group() self.end_toc=m.end() #grab the sub-entries in the TOC so that an indentation may be later applied through CSS r=re.compile(r'<p[^>]*><a\s*href="[^>]*?#_Toc\d*">\d{1,2}[.]\d{1,2}.*?</p>', re.DOTALL) sub_toc=re.findall(r,self.toc_div) for i in sub_toc: self.toc_div=self.toc_div.replace(i,i[0:2]+' class="sub_toc"'+i[2:]) self.toc_div='<div id="tm_toc">\n'+self.toc_div+'</div>\n' r=re.compile(r'(?<=[a-zA-Z\s]{1})\.+\s*[1-9iv]*',re.DOTALL) #remove repeated dots and the following number in the TOC (e.g page..... 
1 self.toc_div=re.sub(r,' ',self.toc_div) def __main_div(self): r=r'<h1><a name=".*?">.*?1\..*?</h1>.*(?=</body>)' r=re.compile(r, re.DOTALL) m=r.search(self.html,self.end_toc) self.main_div='<div id="tm_body">\n'+m.group()+'</div>\n' def post_write(self): """FTPs files to the intranet server and logs the document's conversion""" docPlone.post_write(self) #log this document conversion self.connect(); self.putDOC() webbrowser.open(self.url,1) def target(self): """generate the target url and path for a technical memorandum upload""" self.__confirm_ITAR() if self.itar: pth='itar_tms' else: pth='non_itar_tms' self.put_path='/Plone/administration/internal_documents/tm/%s/' %pth #some default ftp server self.url='http://find/administration/internal_documents/tm/%s/%s/document_view' %(pth, self.fname_new) #url for the default ftp server self.put_path='/Plone/Members/rj/tm_test/%s/' %pth #some default ftp server self.url='http://find/Members/rj/tm_test/%s/%s/document_view' %(pth, self.fname_new) #url for the default ftp server def __confirm_ITAR(self): #confirm the document properties setting for ITAR Status msg='Please confirm that this document is ' if self.category.lower().count('itar'): msg+='ITAR (y/n)' resp=raw_input(msg) if resp.lower().startswith('y'): self.itar=True elif resp.lower().startswith('n'): self.itar=False else: self.confirm_ITAR() else: msg+='not ITAR (y/n)' resp=raw_input(msg) if resp.lower().startswith('y'): self.itar=False elif resp.lower().startswith('n'): self.itar=True else: self.confirm_ITAR() class ConversionSession: """Determine and store variables related to this conversion session (username, date, desktop path, etc.)""" def __init__(self): #get session information (username, output path, etc.), create paths as needed self.out_dir=os.path.join(os.environ['USERPROFILE'],r'Desktop\HTML') self.out_dir_images=os.path.join(self.out_dir,'images') try: os.makedirs(self.out_dir_images) except: pass def convert(self,file_list): """Convert all the word 
files in the input list (usually provided through dropping files on the word-to-html.bat file / shortcut)""" file_list=[f for f in file_list if f.lower().endswith('.doc')] num_convert=len(file_list) if not num_convert: print 'Of the files you dropped onto the Word to HTML converter program, none of them appear to be MS Word documents' for doc_path in file_list: #convert all the inbound word documents d=doc_map(doc_path) #get the correct document object for processing this file msg='processing '+d.fname_doc if num_convert>1: cnt+=1; msg+=' (%i of %i)' %(cnt, num_convert) print msg d.WordSaveHTML(self.out_dir) d.fixHTML() d.writeHTML() #delete me! #webbrowser.open(d.out_path,1) print 'Conversion of %s has finished' %d.fname_doc if num_convert>1: print 'All document conversions have finished...' def doc_map(doc_path): """map a document type to a document's class, returns an initialized document object""" #return true if the file is likely an IM or TM fname_doc=os.path.basename(doc_path) if re.match(r'im\d{1,4}.*?.doc', fname_doc, re.IGNORECASE): return docIM(doc_path) elif re.match(r't{0,1}m\d{1,4}.*?.doc', fname_doc, re.IGNORECASE): return docTM(doc_path) else: return docPlone(doc_path) #return doc(doc_path) if __name__ == "__main__": s=ConversionSession() if len(sys.argv)==1: print 'Opening the help file at %s' %HELP_URL webbrowser.open(help_url, new=1) else: s.convert(sys.argv[1:]) print 'closing...' time.sleep(4)