% python validate.py 1 - http://foo.com/bill_reid/ parsing... done - validating... done -> Valid 2 - http://foo.com/bill_reid/fr parsing... done - validating... done -> Valid 3 - http://foo.com/bill_reid/fr/copyright parsing... done - validating... done -> Valid 4 - http://foo.com/bill_reid/en parsing... done - validating... done -> Valid 5 - http://foo.com/bill_reid/en/copyright parsing... done - validating... done -> Valid 6 - http://foo.com/bill_reid/en/contact parsing... done - validating... done -> Valid ... 776 - http://foo.com/bill_reid/fr/qui/parcours/age_moyen/1 parsing... done - validating... done -> Valid 777 - http://foo.com/bill_reid/fr/qui/parcours/age_moyen/2 parsing... done - validating... done -> Valid ------------------------------------- URLs parsed: 777 URLS with invalid HTML: 3 http://foo.com/bill_reid/fr/dans_la_salle_de_classe/grade9/print http://foo.com/bill_reid/en/art/guided_journey/beyond_haida/3 http://foo.com/bill_reid/fr/art/voyage_guide/haida/3
import HTMLParser import urllib import sys import urlparse ################################################## # config base_url = 'http://foo.com/bill_reid/' depth = 100 w3c_validator = 'http://validator.w3.org/' ################################################## # classes and functions # HTML parser class class parseLinks(HTMLParser.HTMLParser): def handle_starttag(self, tag, attrs): if tag == 'a': for name,value in attrs: if name == 'href': url = url_normalize(value) if url != "" and not(l.has_key(url)): l[url] = True; # HTML parsing function (use the class) def parse_links(url): try: lParser = parseLinks() lParser.feed(urllib.urlopen(url).read()) lParser.close() except: pass # clean/normalize/reject url def url_normalize(url): url= url.strip() # check it's not an email address if url.startswith('mailto:'): return "" # remove any anchor url = url.partition('#')[0] # check it's not an outside-of-the-tree link url = urlparse.urljoin(current_url, url) if not(url.startswith(base_url)): return "" # check it's an HTML page if urllib.urlopen(url).info().gettype() != 'text/html': return "" return url # W3C validation def url_w3c_validate(url): return urllib.urlopen(w3c_validator + 'check?uri=' + url).info().getheader('x-w3c-validator-status') == 'Valid' ################################################## # main ################################################## l = {base_url: True} l_error = [] n = 0 for i in range(depth): for url in l.copy(): if l[url]: n += 1 current_url = url print n, print "-", print current_url, print " parsing...", parse_links(url) print "done -", print "validating...", is_valid = url_w3c_validate(url) print "done ->", if is_valid: print "Valid" else: l_error.append(url) print "Invalid" l[url] = False #report print """ ------------------------------------- URLs parsed: %d URLS with invalid HTML: %d""" % (len(l), len(l_error)) for url in l_error: print url
(localhost for base_url won't work with the public W3C validator, which has to fetch each page itself.) python validate.py
Need to remove the session id from the URLs? Want to exclude some URLs? Modify url_normalize().
On Ubuntu
w3c-markup-validator
Update w3c_validator in the Python script