CODE
#!/usr/bin/env python
from BeautifulSoup import BeautifulSoup
import os
import re
import sys
import urllib2
# Note: In order to keep a more functional style, as well as
# simplify programming, a node in a tree is simply a 3-tuple
# in the form (node, parent, [children]).
_wikiurl = "http://en.wikipedia.org/wiki/"
# We must pretend to be Google (or another browser, but I
# picked Googlebot); otherwise, Wikipedia gives us a
# 403 Forbidden error.
_opener = urllib2.build_opener()
_opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")]
_wiki_href = re.compile(r"^\/wiki\/.*$")
_wiki_article = re.compile(r"^\/wiki\/([A-Z][a-z0-9%_\(\)-]*)$")
def links(article):
"""Returns a list of all the articles linked from a particular article."""
page = _opener.open(_wikiurl + article)
soup = BeautifulSoup(page)
wikis = []
links = soup.findAll(name="a", attrs={'href': _wiki_href})
for link in links:
matched = _wiki_article.match(link['href'])
if matched:
wikis.append(matched.group(1))
return wikis
def race(start, target):
"""Begins the wiki race with the given starting article and target."""
cur = _race(target, (start, None, links(start)), set())
path = [cur[0]]
while cur[1] is not None:
cur = cur[1]
path.append(cur[0])
path.reverse()
return path
def _race(target, tree, checked):
print "Checking " + tree[0]
checked.add(tree[0])
if tree[0] == target:
print >>sys.stderr, " Found target!"
return tree
else:
if len(tree[2]) > 0:
print >>sys.stderr, " Descending into child links"
for child in tree[2]:
if child not in checked:
return _race(target, (child, tree, links(child)), checked)
else:
print >>sys.stderr, " Child links exhausted"
parent = tree[1]
parent[2].remove(tree[0])
return _race(target, parent, checked)
def main():
usage = "Usage: %s Wiki_Start Wiki_End" % os.path.basename(sys.argv[0])
if len(sys.argv[1:]) < 2:
print >>sys.stderr, usage
return 1
start = sys.argv[1]
goal = sys.argv[2]
print "Finding a path from '%s' to '%s'" % (start, goal)
path = race(start, goal)
print "Here is the path:"
for p in path:
print " " + p
return 0
if __name__ == "__main__":
sys.exit(main())
from BeautifulSoup import BeautifulSoup
import os
import re
import sys
import urllib2
# Note: In order to keep a more functional style, as well as
# simplify programming, a node in a tree is simply a 3-tuple
# in the form (node, parent, [children]).
_wikiurl = "http://en.wikipedia.org/wiki/"
# We must pretend to be Google (or another browser, but I
# picked Googlebot); otherwise, Wikipedia gives us a
# 403 Forbidden error.
_opener = urllib2.build_opener()
_opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")]
_wiki_href = re.compile(r"^\/wiki\/.*$")
_wiki_article = re.compile(r"^\/wiki\/([A-Z][a-z0-9%_\(\)-]*)$")
def links(article):
"""Returns a list of all the articles linked from a particular article."""
page = _opener.open(_wikiurl + article)
soup = BeautifulSoup(page)
wikis = []
links = soup.findAll(name="a", attrs={'href': _wiki_href})
for link in links:
matched = _wiki_article.match(link['href'])
if matched:
wikis.append(matched.group(1))
return wikis
def race(start, target):
"""Begins the wiki race with the given starting article and target."""
cur = _race(target, (start, None, links(start)), set())
path = [cur[0]]
while cur[1] is not None:
cur = cur[1]
path.append(cur[0])
path.reverse()
return path
def _race(target, tree, checked):
print "Checking " + tree[0]
checked.add(tree[0])
if tree[0] == target:
print >>sys.stderr, " Found target!"
return tree
else:
if len(tree[2]) > 0:
print >>sys.stderr, " Descending into child links"
for child in tree[2]:
if child not in checked:
return _race(target, (child, tree, links(child)), checked)
else:
print >>sys.stderr, " Child links exhausted"
parent = tree[1]
parent[2].remove(tree[0])
return _race(target, parent, checked)
def main():
usage = "Usage: %s Wiki_Start Wiki_End" % os.path.basename(sys.argv[0])
if len(sys.argv[1:]) < 2:
print >>sys.stderr, usage
return 1
start = sys.argv[1]
goal = sys.argv[2]
print "Finding a path from '%s' to '%s'" % (start, goal)
path = race(start, goal)
print "Here is the path:"
for p in path:
print " " + p
return 0
if __name__ == "__main__":
sys.exit(main())
You can run it if you want, but it takes a long time.
If anyone has any other solutions or tweaks for the program, please post.