#!/usr/bin/env python import md5 import sgmllib def escape(data, escape_quote = False): data = data.replace("&", "&") data = data.replace("<", "<") if escape_quote: data = data.replace('"', """) return data class Parser(sgmllib.SGMLParser): from htmlentitydefs import entitydefs def __init__(self, fp, verbose = False): sgmllib.SGMLParser.__init__(self, verbose) self.fp = fp def reset(self): sgmllib.SGMLParser.reset(self) self.hashes = md5.new(), md5.new() self.passthrough = False self.row = None def feed(self, data): sgmllib.SGMLParser.feed(self, data) self.hashes[0].update(data) def write(self, data): self.fp.write(data) self.hashes[1].update(data) def digests(self): return [hash.hexdigest() for hash in self.hashes] # handle passthrough in generic overrides def handle_starttag(self, tag, method, attrs): sgmllib.SGMLParser.handle_starttag(self, tag, method, attrs) if self.passthrough: self.__write_tag(tag, attrs) def unknown_starttag(self, tag, attrs): sgmllib.SGMLParser.unknown_starttag(self, tag, attrs) if self.passthrough: self.__write_tag(tag, attrs) def handle_endtag(self, tag, method): if self.passthrough: self.__write_tag("/" + tag) sgmllib.SGMLParser.handle_endtag(self, tag, method) def unknown_endtag(self, tag): if self.passthrough: self.__write_tag("/" + tag) sgmllib.SGMLParser.unknown_endtag(self, tag) def handle_data(self, data): if self.passthrough: self.write(data) def __write_tag(self, tag, attrs = ()): self.write("<%s%s>" % (tag, "".join( [' %s="%s"' % (name, escape(value, True)) for name, value in attrs]))) # handle everything else in tag-specific overrides def start_table(self, attrs): for name, value in attrs: if name == "summary": if value == "methods and the premissions they require": self.passthrough = True self.row = 0 break if self.passthrough: self.write("\n \n ") def end_table(self): if self.passthrough: self.write("\n \n\n") self.passthrough = False def start_tr(self, attrs): if self.passthrough: if self.row == 29: self.passthrough = False self.row_tagged = False elif self.row == 29: self.passthrough = True def end_tr(self): if self.passthrough: self.row += 1 def start_th(self, attrs): if self.passthrough: if not self.row_tagged: self.write("ID\n ") self.row_tagged = True def start_td(self, attrs): if self.passthrough: if not self.row_tagged: self.write("se%03d\n " % self.row) self.row_tagged = True if __name__ == "__main__": import os import sys import urllib version = "1.4.2" src = "http://java.sun.com/j2se/" + version \ + "/docs/guide/security/permissions.html" dst = "throwpoints-%s.html" % version if os.path.exists(dst): print "%s: file exists" % dst sys.exit(1) parser = Parser(open(dst, "w")) parser.feed(urllib.urlopen(src).read()) parser.close() digests = parser.digests() if digests[1] == "3c40052647c417dead97068a32f51911": status = "PASS" elif digests[0] == "c4b9248859682e65ad71788acfc03b78": status = "FAIL (processing)" else: status = "FAIL (input = %s)" % digests[0] print "status:", status