# Collect the name and contact e-mail address of every entry listed on the
# educa.ch search page and print one dict per entry.
import re

try:  # Python 3 module layout
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from urllib import urlopen
    from urlparse import urljoin

# Base against which the relative detail-page links are resolved.
url = "http://www.educa.ch/dyn/"
# Listing page that enumerates all entries.
search_url = "http://www.educa.ch/dyn/79362.asp?action=search"

# NOTE(review): the encoding served by the site is not known from this script.
# latin-1 maps every byte to a code point, so decoding can never fail and the
# ASCII markup the patterns rely on is preserved; non-ASCII names may come out
# wrong if the pages are really UTF-8 -- confirm against the live site.
PAGE_ENCODING = "latin-1"

# One listing entry: an anchor, an image, the entry name as plain text, and
# later a link whose onclick opens the detail page ("<digits>.asp?id=<digits>").
# Group 1 is the entry name, group 2 the relative detail-page link.
# NOTE(review): '.' does not match newlines here; if an entry's name and its
# link sit on different lines of the page, add re.DOTALL -- confirm against
# the live page.
ENTRY_RE = re.compile(
    r'<a name="\d+"></a><br><img [^>]+>([^<]+).*?'
    r'<a href="#\d+" onclick="javascript: window.open\('
    r"'(\d+\.asp\?id=\d+)'"
)

# First mailto: link on a detail page; group 1 is the address.
MAILTO_RE = re.compile(r'<a href="mailto:(.*?)">')


def fetch(page_url):
    """Download ``page_url`` and return its body decoded as text."""
    response = urlopen(page_url)
    try:
        raw = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    return raw.decode(PAGE_ENCODING)


def find_entries(html):
    """Return a list of ``(name, relative_link)`` pairs found in ``html``.

    An empty list is returned when the markup contains no entries.
    """
    return ENTRY_RE.findall(html)


def find_email(phtml):
    """Return the first ``mailto:`` address in ``phtml``, or ``None``."""
    memail = MAILTO_RE.search(phtml)
    return memail.group(1) if memail else None


def build_record(capname, alk, phtml):
    """Build the result dict for one entry.

    ``capname`` is the entry name, ``alk`` the absolute detail-page URL and
    ``phtml`` that page's markup. The ``'email'`` key is only present when
    the page contains a ``mailto:`` link.
    """
    data = {'url': alk, 'cname': capname}
    email = find_email(phtml)
    if email is not None:
        data['email'] = email
    return data


def main():
    """Fetch the listing, visit every detail page and print each record."""
    html = fetch(search_url)
    for capname, lk in find_entries(html):
        alk = urljoin(url, lk)
        # Single-argument print() behaves the same on Python 2 and 3.
        print(build_record(capname, alk, fetch(alk)))


if __name__ == "__main__":
    main()
# PS: I didn't try this, so you'll possibly need to debug the regexp.
# PPS: I couldn't find how to turn on Python formatting on the page, or how to remove the smiley…