listing15-2.py 896 B

1234567891011121314151617181920212223242526272829303132333435
  1. from urllib.request import urlopen
  2. from html.parser import HTMLParser
  3. def isjob(url):
  4. try:
  5. a, b, c, d = url.split('/')
  6. except ValueError:
  7. return False
  8. return a == d == '' and b == 'jobs' and c.isdigit()
  9. class Scraper(HTMLParser):
  10. in_link = False
  11. def handle_starttag(self, tag, attrs):
  12. attrs = dict(attrs)
  13. url = attrs.get('href', '')
  14. if tag == 'a' and isjob(url):
  15. self.url = url
  16. self.in_link = True
  17. self.chunks = []
  18. def handle_data(self, data):
  19. if self.in_link:
  20. self.chunks.append(data)
  21. def handle_endtag(self, tag):
  22. if tag == 'a' and self.in_link:
  23. print('{} ({})'.format(''.join(self.chunks), self.url))
  24. self.in_link = False
  25. text = urlopen('http://python.org/jobs').read().decode()
  26. parser = Scraper()
  27. parser.feed(text)
  28. parser.close()