from __future__ import with_statement import os, sys import htmlentitydefs import itertools import operator import posixpath import re import sgmllib import tempfile import pysvn class FixupParser (sgmllib.SGMLParser): """Fix-up parser to scan the contents file generated by HTMLHelp, generate a suitable HTML output file for use within standard HTML files and to provide a root-to-leaf mapping for use as a breadcrumb trail in individual pages. """ def __init__ (self, infile, outfile): sgmllib.SGMLParser.__init__ (self) self.infile = infile self.outfile = outfile self.inside_li = False self.inside_object = False self.link_name = self.link_url = "" self.trail = [] self.contents_map = {} def start (self): self.feed (self.infile.read ()) def output (self, text): self.outfile.write (text + "\n") def start_ul (self, attrs): """If we're starting a list, close any unclosed list item and add the latest (ie this) url/name pair to the trail.""" if self.inside_li: self.end_li () self.output ("") if self.trail: self.trail.pop () def start_li (self, attrs): """If we're starting a list item, make a note of the fact so we can track objects within it.""" if self.inside_li: self.end_li () self.output ("
  • ") self.inside_li = True def end_li (self): """If we're finishing a list item, make a note so no objects are tracked which are outside a list item.""" self.output ("
  • ") self.inside_li = False def start_object (self, attrs): """The text/sitemap objects hold the real indexing info. Note that we're inside such an object so that we pick up its parameters.""" attrs = dict (attrs) if attrs.get ("type") == "text/sitemap": if self.inside_object: self.end_object () self.link_name = self.link_url = "" self.inside_object = True def end_object (self): """At the end of an object tag, add the trail so far to the entry for this item's index and output an appropriate href.""" if self.inside_object: self.contents_map[self.link_url] = self.trail[:] if self.trail and self.trail[-1][0] <> self.link_url: self.contents_map[self.link_url].append (("", self.link_name)) self.output ('%s' % (self.link_url, self.link_name)) self.inside_object = False """An object's param items are where the indexing info is stored. A "Name" param holds the name of the page; a "Local" item holds the slightly mungified URL which we strip before storing.""" def start_param (self, attrs): UNWANTED_PREAMBLE = "mk:@MSITStore:PyWin32.chm::/" if self.inside_object: attrs = dict (attrs) if attrs.get ("name") == "Name": self.link_name = attrs.get ("value", "") elif attrs.get ("name") == "Local": link_url = attrs.get ("value") if link_url: self.link_url = link_url[len (UNWANTED_PREAMBLE):] else: self.link_url = "" UNWANTED_MARKUP = ["html", "body", "head"] UNWANTED_RE = re.compile ("|".join ("<%s>|" % (markup, markup) for markup in UNWANTED_MARKUP), re.IGNORECASE) UNWANTED_TITLE_RE = re.compile (r".*", re.IGNORECASE) UNWANTED_GENERATOR = r'' UNWANTED_HR_RE = re.compile (r"
    ", re.IGNORECASE) def munged_text (text): # # Fix up entity & character defs so they end with semicolons # for entitydef in htmlentitydefs.entitydefs.keys (): text = re.sub (r"(&%s)(?!;)" % entitydef, "\g<1>;", text) text = re.sub (r"(&#\d+)(?!;)", "\g<1>;", text) text = re.sub (r"[^<]*", "", text, re.IGNORECASE) text = UNWANTED_RE.sub ("", text) text = UNWANTED_TITLE_RE.sub ("", text) text = text.replace (UNWANTED_GENERATOR, "") text = UNWANTED_HR_RE.sub ("", text) text = u"\n".join (line + u"" if line.lower ().startswith (u"
  • ") and u"
  • " not in line.lower () else line for line in text.splitlines ()) return text def relpath (target_url, current_url): target_path, target_file = posixpath.split (target_url) current_path, current_file = posixpath.split (current_url) start_list = current_path.split (os.path.sep) path_list = target_path.split (os.path.sep) i = len (os.path.commonprefix ([start_list, path_list])) rel_list = [os.path.pardir] * (len (start_list) - i) + path_list[i:] return posixpath.join ("/".join (rel_list), target_file) INDEX_CONTENT = """

    PyWin32 Documentation

    This documentation is generated from the .chm file which is shipped with the PyWin32 extensions for Python. Apart from absolutely essential cleanups to make the HTML display properly, no changes have been made.

    Updated 29th October 2009: Now includes pywin32-214 documentation

    """ # # Navigation is a separate string so that it can be # excluded from, eg, the contents page. # NAVIGATION_HTML = """ """ HTML = """ %(title)s %(navigation)s
    %(content)s
    """ def fixup_isapi_links (text): return text.replace ('href="/', 'href="../../../') SPECIAL_PROCESSING = { "html/isapi/doc/isapi.html" : fixup_isapi_links } ARGS = set (["nogenerate", "debug", "nosvn"]) def main (args=[]): if not set (args) <= ARGS: raise RuntimeError ("Arguments %s not recognised; should only be %s" % (", ".join (set (args).difference (ARGS)), ", ".join (ARGS))) chm_filepath = "./PyWin32.chm" ## os.path.join (sys.prefix, "lib", "site-packages", "PyWin32.chm") html_tempdir = os.path.join (tempfile.gettempdir (), "pywin32-docs-htmlhelp") html2_tempdir = "." css_filename = "pywin32.css" toc_filename = "contents.html" changes_filename = "changes.html" if "nogenerate" not in args: print "Decompiling .chm..." if not os.path.exists (html_tempdir): os.mkdir (html_tempdir) os.system ("hh.exe -decompile %s %s" % (html_tempdir, chm_filepath)) print "Writing index.html..." with open (os.path.join (html2_tempdir, "index.html"), "w") as outfile: title = "PyWin32 Documentation" navigation = "" root_path = "" content = INDEX_CONTENT outfile.write (HTML % locals ()) print "Generating contents..." with open (os.path.join (html_tempdir, "pywin32.hhc")) as infile: handle, filename = tempfile.mkstemp () with open (filename, "w") as outfile: parser = FixupParser (infile, outfile) parser.start () contents_map = parser.contents_map print "Writing table of contents..." with open (os.path.join (html2_tempdir, toc_filename), "w") as outfile: title = "PyWin32 Documentation" content = open (filename).read () root_path = "" css_filename = css_filename navigation = "" outfile.write (HTML % locals ()) for html_dirname, dirnames, filenames in os.walk (html_tempdir, topdown=True): if "debug" in args: filenames = filenames[:30] html2_dirname = os.path.join (html2_tempdir, html_dirname[1+len (html_tempdir):]) print html_dirname, "=>", html2_dirname if not os.path.exists (html2_dirname): os.mkdir (html2_dirname) for filename in filenames: if not filename.lower ().endswith ((".txt", ".html")): continue html_filepath = os.path.join (html_dirname, filename) html2_filepath = os.path.join (html2_dirname, filename) depth = html2_filepath.count ("\\") - 1 print " %s (%d)" % (html_filepath, depth) root_path = "../" * depth relative_filepath = html_filepath[1+len (html_tempdir):].replace ("\\", "/") content = unicode (open (html_filepath).read (), "cp1252") content = munged_text (content) special_processing = SPECIAL_PROCESSING.get (relative_filepath) if special_processing: content = special_processing (content) for title in re.findall (r"

    ([^<]+)

    ", content, re.IGNORECASE): break else: title = filename breadcrumb_trail = contents_map.get (relative_filepath, []) breadcrumbs = u" > ".join (u'%s' % (relpath (url, relative_filepath) if url else name, name) for (url, name) in breadcrumb_trail) navigation = NAVIGATION_HTML % locals () if filename.lower ().endswith (".txt"): html = content else: html = HTML % locals () open (html2_filepath, "w").write (html.encode ("utf8")) if "nosvn" not in args: print "Finding changes..." UNCHANGED = [pysvn.wc_status_kind.normal, pysvn.wc_status_kind.ignored] EXCLUDE_FROM_CHANGES = ["changes.html", "convert_to_html.py", "pywin32.chm"] svn = pysvn.Client () svn.add ([i.path for i in svn.status (".") if i.path.endswith (".html") and i.text_status == pysvn.wc_status_kind.unversioned]) changes = sorted ((i for i in svn.status (".") if i.text_status not in UNCHANGED), key=operator.attrgetter ("text_status")) content = ["

    PyWin32 Documentation Changes

    "] first = True for status, items in itertools.groupby (changes, operator.attrgetter ("text_status")): if not first: content.append ("") content.append ("

    %s

    " % status) content.append ("") print "Writing changes..." with open (os.path.join (html2_tempdir, changes_filename), "w") as outfile: title = "PyWin32 Documentation Changes" content = "\n".join (content) root_path = "" css_filename = css_filename navigation = "" outfile.write (HTML % locals ()) if __name__ == '__main__': main (sys.argv[1:])