1234567891011121314151617181920212223242526272829303132333435 |
- from urllib.request import urlopen
- from html.parser import HTMLParser
def isjob(url):
    """Return True if *url* has the form ``/jobs/<digits>/``.

    The path must split on ``/`` into exactly four pieces: an empty
    leading piece, the literal ``jobs``, an all-digit id, and an empty
    trailing piece. Anything else yields False.
    """
    pieces = url.split('/')
    if len(pieces) != 4:
        return False
    leading, section, job_id, trailing = pieces
    # Both outer pieces are empty only when the path starts and ends with '/'.
    if leading or trailing:
        return False
    return section == 'jobs' and job_id.isdigit()
class Scraper(HTMLParser):
    """Print the text and URL of every job link in the HTML fed to it.

    A job link is an ``<a>`` element whose ``href`` is accepted by
    ``isjob``. Text seen inside such an element is collected and, when the
    closing ``</a>`` arrives, printed as ``"<text> (<url>)"``.
    """

    # True while the parser is inside an <a> element that points at a job.
    in_link = False

    def handle_starttag(self, tag, attrs):
        """Start collecting text when a job link opens."""
        if tag != 'a':
            return
        # HTMLParser reports a valueless attribute (``<a href>``) as None.
        # Treat a missing or empty href as "not a job link" instead of
        # handing a non-string to isjob.
        url = dict(attrs).get('href')
        if not url or not isjob(url):
            return
        self.url = url
        self.in_link = True
        self.chunks = []

    def handle_data(self, data):
        """Record text that appears inside the current job link."""
        if self.in_link:
            self.chunks.append(data)

    def handle_endtag(self, tag):
        """Print the collected link text and URL when the job link closes."""
        if tag == 'a' and self.in_link:
            print('{} ({})'.format(''.join(self.chunks), self.url))
            self.in_link = False
# Download the python.org job board and print every job link found on it.
# The ``with`` block closes the HTTP response even if reading or decoding
# fails.
with urlopen('http://python.org/jobs') as response:
    # Decode with the charset the server declares; fall back to UTF-8,
    # which is what a bare ``.decode()`` would have used.
    charset = response.headers.get_content_charset() or 'utf-8'
    text = response.read().decode(charset)

parser = Scraper()
parser.feed(text)
parser.close()
|