Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c765f1a

Browse files
authoredFeb 27, 2018
练习内建模块之HTMLParser
1 parent d0600a2 commit c765f1a

File tree

1 file changed

+84
-0
lines changed

1 file changed

+84
-0
lines changed
 

‎test46.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
'练习内建模块之HTMLParser'
5+
6+
__author__ = 'sergiojune'
7+
from html.parser import HTMLParser
8+
import requests
9+
10+
11+
class MyHTMLParser(HTMLParser):
    """Echo every parse event of an HTML document to stdout.

    Each handler prints the piece of markup it was called for, so feeding
    a document shows the order in which ``HTMLParser`` reports start tags,
    end tags, text, comments and character references.
    """

    def __init__(self, *, convert_charrefs=False):
        """Create the parser.

        Args:
            convert_charrefs: Forwarded to ``HTMLParser``. It defaults to
                ``False`` here because, when it is true (the base-class
                default), references such as ``&nbsp;`` are decoded into
                the text given to ``handle_data`` and ``handle_entityref``
                / ``handle_charref`` are never called.
        """
        super().__init__(convert_charrefs=convert_charrefs)

    def handle_starttag(self, tag, attrs):
        """Print the start tag followed by its ``(name, value)`` attribute list."""
        print('<%s>' % tag, attrs)

    def handle_endtag(self, tag):
        """Print the end tag."""
        print('</%s>' % tag)

    def handle_data(self, data):
        """Print text found between tags."""
        print(data)

    def handle_comment(self, data):
        """Print an HTML comment, re-wrapped in its delimiters."""
        print('<!--', data, '-->')

    def handle_entityref(self, name):
        """Print a named character reference such as ``&nbsp;``."""
        print('&%s;' % name)

    def handle_charref(self, name):
        """Print a numeric character reference such as ``&#1234;``."""
        print('&#%s;' % name)
30+
31+
32+
# Demo: run a small hand-written document through the echoing parser.
# The document is spelled out one source line per literal; adjacent
# literals are concatenated by the compiler into a single string.
parser = MyHTMLParser()
parser.feed(
    '<html>\n'
    '<head></head>\n'
    '<body>\n'
    '<!-- test html parser -->\n'
    '<p>Some <a href=\"#\">html</a> HTML&nbsp;tutorial...<br>END</p>\n'
    '</body></html>'
)
39+
40+
41+
# 作业:找一个网页,例如https://www.python.org/events/python-events/,用浏览器查看源码并复制,然后尝试解析一下HTML,输出Python官网发布的会议时间、名称和地点。
42+
class DealHTML(HTMLParser):
    """Print event links, dates and locations found in an HTML page.

    Only start tags carrying exactly one attribute are inspected. A match
    raises one of three flags; while a flag is raised, text is echoed, and
    the next end tag of any kind lowers it again.
    """

    def __init__(self):
        """Create the parser with all three capture flags lowered."""
        super().__init__()
        # Each flag is 1 while text should be echoed for that kind of field.
        self.thing = 0    # event link
        self.time = 0     # event date
        self.address = 0  # event location

    def handle_starttag(self, tag, attrs):
        """Raise a capture flag when the tag's single attribute matches.

        Args:
            tag: Tag name (unused; matching is done on the attribute only).
            attrs: List of ``(name, value)`` pairs for the tag.
        """
        if len(attrs) != 1:
            return
        name, value = attrs[0]
        # A valueless attribute (e.g. ``<input disabled>``) is reported as
        # ``(name, None)``; treat it as empty so the substring tests below
        # do not raise TypeError.
        value = value or ''
        if 'python-events' in value:  # event link
            print('<href=%s>' % value, end='')
            self.thing = 1
        if 'datetime' in name:  # event date
            print('<%s>' % name, end='')
            self.time = 1
        if 'location' in value:  # event location
            print('<%s>' % value, end='')
            self.address = 1

    def handle_data(self, data):
        """Echo *data* once for every capture flag that is currently raised."""
        for flag in (self.thing, self.time, self.address):
            if flag:
                print(data, end='')

    def handle_endtag(self, tag):
        """Print the end tag for each raised flag and lower that flag."""
        if self.thing:
            print('</%s>' % tag)
            self.thing = 0
        if self.time:
            print('</%s>' % tag)
            self.time = 0
        if self.address:
            print('</%s>' % tag)
            # The location is the last field of an event: add a blank line.
            print('')
            self.address = 0
80+
81+
82+
# Fetch the live events page and run it through the extractor.
# A timeout is passed because requests otherwise waits indefinitely for
# a server that never answers.
response = requests.get(
    'https://www.python.org/events/python-events/', timeout=10
).text
dh = DealHTML()
dh.feed(response)
# close() makes the parser process any text it is still buffering.
dh.close()

0 commit comments

Comments
 (0)
Please sign in to comment.