|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +'练习内建模块之HTMLParser' |
| 5 | + |
| 6 | +__author__ = 'sergiojune' |
| 7 | +from html.parser import HTMLParser |
| 8 | +import requests |
| 9 | + |
| 10 | + |
class MyHTMLParser(HTMLParser):
    """Echo every parsing event to stdout in a markup-like form.

    Demonstration subclass: each handler prints a textual rendering of the
    event it receives (start tag with its attributes, end tag, text,
    comment, named entity reference, numeric character reference).
    """

    def __init__(self, *, convert_charrefs=False):
        """Create the parser.

        Args:
            convert_charrefs: Forwarded to ``HTMLParser``. When it is true
                the base class converts character references to text on
                its own and never calls ``handle_entityref`` or
                ``handle_charref``, which would leave the two handlers
                below unreachable. It therefore defaults to ``False``
                here; pass ``True`` to get the base-class behaviour.
        """
        super().__init__(convert_charrefs=convert_charrefs)

    def handle_starttag(self, tag, attrs):
        """Print an opening tag followed by its ``(name, value)`` pairs."""
        print(f'<{tag}>', list(attrs))

    def handle_endtag(self, tag):
        """Print a closing tag."""
        print(f'</{tag}>')

    def handle_data(self, data):
        """Print the text found between tags."""
        print(data)

    def handle_comment(self, data):
        """Print a comment wrapped in ``<!--`` and ``-->``."""
        print('<!--', data, '-->')

    def handle_entityref(self, name):
        """Print a named entity reference, e.g. ``&nbsp;``."""
        print(f'&{name};')

    def handle_charref(self, name):
        """Print a numeric character reference, e.g. ``&#1234;``."""
        print(f'&#{name};')
| 30 | + |
| 31 | + |
# Demo run: push a small HTML document through MyHTMLParser so that every
# handler output can be seen on stdout.
parser = MyHTMLParser()
parser.feed('''<html>
<head></head>
<body>
<!-- test html parser -->
 <p>Some <a href=\"#\">html</a> HTML tutorial...<br>END</p>
</body></html>''')
# close() makes the parser process whatever input it is still buffering, so
# text after the last tag cannot be silently dropped.
parser.close()
| 39 | + |
| 40 | + |
| 41 | +# 作业:找一个网页,例如https://www.python.org/events/python-events/,用浏览器查看源码并复制,然后尝试解析一下HTML,输出Python官网发布的会议时间、名称和地点。 |
class DealHTML(HTMLParser):
    """Print event links, dates and locations found in an HTML page.

    Only tags that carry exactly one attribute are inspected. Such a tag
    starts a capture when:

    * its attribute value contains ``python-events`` (event link),
    * its attribute name contains ``datetime`` (event date), or
    * its attribute value contains ``location`` (event location).

    While a capture is active every piece of text is printed; the next
    closing tag of any name is printed as well and ends the capture.
    """

    def __init__(self):
        super().__init__()
        # Capture flags, set by handle_starttag and cleared by handle_endtag.
        self.thing = False    # inside an event link
        self.time = False     # inside an event date
        self.address = False  # inside an event location

    def handle_starttag(self, tag, attrs):
        """Start a capture when the tag's single attribute matches a marker."""
        if len(attrs) != 1:
            return
        name, value = attrs[0]
        # A valueless attribute (e.g. <input disabled>) is reported with a
        # value of None; treat it as empty text so the substring tests
        # below cannot raise TypeError.
        value = value or ''
        if 'python-events' in value:
            print('<href=%s>' % value, end='')
            self.thing = True
        if 'datetime' in name:
            print('<%s>' % name, end='')
            self.time = True
        if 'location' in value:
            print('<%s>' % value, end='')
            self.address = True

    def handle_data(self, data):
        """Print text once while any capture is active."""
        if self.thing or self.time or self.address:
            print(data, end='')

    def handle_endtag(self, tag):
        """Print the closing tag once and end every active capture."""
        if not (self.thing or self.time or self.address):
            return
        print('</%s>' % tag)
        if self.address:
            # The location is the last field of an event: add a blank line
            # to separate it from the next event.
            print('')
        self.thing = self.time = self.address = False
| 80 | + |
| 81 | + |
# Download the python.org events page and print the events found in it.
# The timeout stops the script from hanging forever on a stalled connection,
# and raise_for_status() fails loudly instead of parsing an HTTP error page.
_reply = requests.get('https://www.python.org/events/python-events/', timeout=10)
_reply.raise_for_status()
response = _reply.text
dh = DealHTML()
dh.feed(response)
# close() makes the parser process any input it is still buffering.
dh.close()
0 commit comments