mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
fbb9223b1a
Signed-off-by: Sven Dowideit <SvenDowideit@home.org.au>
79 lines
2.1 KiB
Python
Executable file
79 lines
2.1 KiB
Python
Executable file
#!/usr/bin/env python
|
|
|
|
""" I honestly don't even know how the hell this works, just use it. """
|
|
__author__ = "Scott Stamp <scott@hypermine.com>"
|
|
|
|
from HTMLParser import HTMLParser
|
|
from urlparse import urljoin
|
|
from sys import setrecursionlimit
|
|
import re
|
|
import requests
|
|
|
|
# Parser.handle_starttag fetches and parses each newly discovered link from
# inside the parse callback, so the call stack grows with the depth of the
# crawl; raise the interpreter's recursion limit to make room for that.
setrecursionlimit(10000)
|
|
# Base URL of the site to crawl: the first page fetched, the base every href
# is resolved against, and the prefix stripped from page URLs in the report.
root = 'http://localhost:8000'
|
|
|
|
|
|
class DataHolder(object):
    """Mutable single-value cell whose storage attribute name is configurable.

    Calling the instance (or ``set``) stores a value and returns that same
    value, which lets a caller assign and test in a single expression, e.g.
    ``if holder(re.search(...)): use(holder.match)``.
    """

    def __init__(self, value=None, attr_name='value'):
        """Store ``value`` under the attribute named ``attr_name``."""
        self._attr_name = attr_name
        self.set(value)

    def __call__(self, value):
        """Shorthand for ``set(value)``; returns ``value``."""
        return self.set(value)

    def set(self, value):
        """Assign ``value`` to the configured attribute and return it."""
        setattr(self, self._attr_name, value)
        return value

    def get(self):
        """Return the value currently held in the configured attribute."""
        return getattr(self, self._attr_name)
|
|
|
|
|
|
class Parser(HTMLParser):
    """HTML parser that crawls the site under ``root`` and records anchors.

    Each instance parses one page, identified by ``origin``.  For every start
    tag whose ``href`` points into the site, the text after ``#`` (if any) is
    recorded in ``anchors[origin]`` and the linked page is fetched and fed to
    a fresh ``Parser`` -- recursively, from inside the tag callback.  Every
    ``id`` and ``name`` attribute value seen is collected into ``ids``.

    All bookkeeping lives in class attributes, so it is shared by every
    instance created during a crawl.
    """

    # Values of every id="" / name="" attribute seen on any parsed page.
    ids = set()
    # Page URLs (fragment removed) that have already been fetched.
    crawled = set()
    # Page URL -> set of fragment names referenced by links on that page.
    anchors = {}
    # Never read or written by this class; kept so the class namespace is
    # unchanged for any external user.
    pages = set()
    # Kept for backward compatibility; the match is now held in a local.
    save_match = DataHolder(attr_name='match')

    def __init__(self, origin):
        """Create a parser for the page whose URL is ``origin``."""
        self.origin = origin
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Record ids and anchors from one start tag and follow in-site links.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; ``tag`` itself is not inspected.
        """
        attrs = dict(attrs)

        # HTMLParser reports a valueless attribute (<a href>) as None; there
        # is nothing to resolve in that case, so only act on a real value.
        href = attrs.get('href')
        if href:
            # In-site links are absolute URLs under root, site-absolute
            # paths, or same-page fragments.  "//host/..." is a
            # protocol-relative link to another host, so it is excluded.
            in_site = (
                href.startswith(root)
                or (href.startswith('/') and not href.startswith('//'))
                or re.match(r'#\S', href)
            )
            if in_site:
                # Text after the last '#'; skip an empty fragment ("/page#")
                # because it can never match an id and is not a real anchor.
                fragment_match = re.search(r'.*#(.*?)$', href)
                if fragment_match and fragment_match.group(1):
                    self.anchors.setdefault(self.origin, set()).add(
                        fragment_match.group(1))

                # Same-page fragments need no fetch.  For everything else,
                # drop the fragment before de-duplicating so a page is
                # fetched once no matter how many anchors link into it.
                if not href.startswith('#'):
                    page_url = urljoin(root, href).split('#', 1)[0]
                    if page_url not in self.crawled:
                        self.crawled.add(page_url)
                        Parser(page_url).feed(requests.get(page_url).content)

        if 'id' in attrs:
            self.ids.add(attrs['id'])
        # explicit <a name=""></a> references
        if 'name' in attrs:
            self.ids.add(attrs['name'])
|
|
|
|
|
|
# Fetch the root page and parse it; Parser follows in-site links from inside
# its tag callback and accumulates results in class-level collections.
r = requests.get(root)
parser = Parser(root)
parser.feed(r.content)

# Report every fragment referenced by a link that has no matching id/name
# attribute among those collected during the crawl.
for anchor in sorted(parser.anchors):
    # Keys of parser.anchors are page URLs; skip any that contain "/#".
    if re.match(r'.*/#.*', anchor):
        continue
    # Sort the names so the report is stable from run to run.
    for anchor_name in sorted(parser.anchors[anchor]):
        if anchor_name not in parser.ids:
            # Single-argument parenthesised form prints identically under
            # both the Python 2 statement and the Python 3 function.
            print('Missing - ({0}): #{1}'.format(
                anchor.replace(root, ''), anchor_name))
|