
A First Look at Python Web Scraping

I accidentally deleted this blog post. Leaving a placeholder here to fill in later.

In the meantime, here is the source code of a simple scraper:

import re  # regular expressions
import urllib.error  # fetching web pages
import urllib.request
import xlwt  # writing Excel files
from bs4 import BeautifulSoup  # HTML parsing

findTitle = re.compile(r'<strong>.*?</strong>')  # regex that matches the title inside <strong>
findCategory = re.compile(r'<p>.*?<br/>')  # regex that matches the category text before <br/>


# Crawl the page and extract the data
def getData(baseurl):
    datalist = []
    html = askURL(baseurl)
    # Parse the returned HTML
    bs = BeautifulSoup(html, "html.parser")

    # Inspection shows that the show information we need lives inside <div class="a0">,
    # so find every <div class="a0"> tag on the page
    for item in bs.find_all('div', class_="a0"):
        data = []  # holds all the information for one show
        item = str(item)

        # Skip the first <div class="a0">, which is just the list header
        if item == '<div class="a0">收藏数最多的剧</div>':
            continue

        # Locate and extract the fields we need
        title = re.findall(findTitle, item)[0]
        title = re.sub("</*strong>", "", title)
        category = re.findall(findCategory, item)[0].replace("<p>", "")
        category = category.replace('<br/>', '')

        # Store the fields for this show
        data.append(title)
        data.append(category)

        # Append this show's record to the result list
        datalist.append(data)

    return datalist


# Fetch the HTML content of a given URL
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 Edg/80.0.361.69"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode()
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html


# Save the data to an Excel file
def saveData(savepath, datalist):
    workbook = xlwt.Workbook(encoding="utf-8")
    worksheet = workbook.add_sheet('sheet1')
    row = 0
    for item in datalist:
        worksheet.write(row, 0, item[0])  # column 0: title
        worksheet.write(row, 1, item[1])  # column 1: category
        row = row + 1
    workbook.save(savepath)


if __name__ == "__main__":
    baseurl = "http://www.rrys2019.com/html/top/total_fav_list.html"
    savepath = r"C:\Users\38487\Desktop\Top50.xls"
    # Crawl the page
    datalist = getData(baseurl)
    # Save the results
    saveData(savepath, datalist)
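
The extraction above runs regexes over each tag converted to a string. As a rough alternative sketch (my own assumption about the page layout, inferred only from the tags the regexes target: the title inside <strong> and the category as the text before the first <br/> in <p>), the same fields could be pulled by walking the parsed tree with BeautifulSoup instead:

def getDataWithBs(baseurl):
    # Hypothetical variant, not part of the original script: reuses askURL()
    # and BeautifulSoup from above, but navigates the parsed tags directly.
    datalist = []
    bs = BeautifulSoup(askURL(baseurl), "html.parser")
    for item in bs.find_all('div', class_="a0"):
        strong = item.find('strong')  # title is assumed to live in a <strong> tag
        p = item.find('p')            # category is assumed to precede the first <br/>
        if strong is None or p is None:
            continue  # skips the header div, which has no <strong>/<p> children
        title = strong.get_text(strip=True)
        # contents[0] is the text node before the first <br/> (assumption)
        category = str(p.contents[0]).strip() if p.contents else ""
        datalist.append([title, category])
    return datalist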