Python爬取知乎图片

Python爬取知乎图片

作为一名测试人员,电脑上各种各样的测试图片数据是必不可少的。那么,如何去获取这些图片资源呢?

获取数据的方案有很多种。作为测试人员,可以利用Python爬虫来爬取知乎回答内的所有图片资源,用到的是Selenium和BeautifulSoup等相关库。具体做法是:访问对应的问答URL地址,通过Selenium操作页面滚动以加载更多回答;然后提取页面上所有图片的路径,进行转码并存储到本地文件;最后根据存储的图片路径,批量下载图片并保存到本地文件夹。

下载图片预览

存放图片文件夹

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import urllib.request
import html.parser
import ssl

# SECURITY NOTE: this swaps the ssl module's default HTTPS context factory for
# the unverified one, so HTTPS clients that rely on that default (such as
# urllib.request, used below to download images) skip certificate
# verification for the whole process. Both names are private ssl internals.
# NOTE(review): presumably added to work around local certificate errors;
# confirm it is still needed before reusing this script elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context


# Zhihu pages this script has been pointed at. The original code loaded every
# one of them in turn but only scraped the page that was loaded last, so the
# last entry is used as the default target.
_CANDIDATE_URLS = (
    'https://www.zhihu.com/question/292901966/answer/800705040',  # 01 Aragaki Yui
    "https://www.zhihu.com/question/35931586",  # 02 What does your everyday outfit look like?
    "https://www.zhihu.com/question/61235373",  # 03
    "https://www.zhihu.com/question/28481779",  # 04 What is it like to have long legs?
    "https://www.zhihu.com/question/19671417",  # 05 How to pose well for photos?
    "https://www.zhihu.com/question/20196263",  # 06
    "https://www.zhihu.com/question/46458423",  # 07 How should short-haired girls pose for photos?
    "https://www.zhihu.com/question/26037846",  # 08 What is it like to be in good shape?
    "https://www.zhihu.com/question/333026642/answer/780949078",  # 09
)


def _scroll_and_expand(driver, times):
    """Scroll to the page bottom and click the "more" button up to *times* times.

    Stops early as soon as the button cannot be found or clicked, which is
    treated as "nothing more to load".
    """
    # Local imports: the file only imports `webdriver` at module level.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    for n in range(times):
        # Jump to the bottom of the document, then wait for content to load.
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(2)
        try:
            # find_element_by_css_selector was removed in Selenium 4.
            driver.find_element(By.CSS_SELECTOR, 'button.QuestionMainAction').click()
        except WebDriverException:
            # Only Selenium errors end the loop; Ctrl-C still propagates.
            break
        print('Page' + str(n))  # report the page counter
        time.sleep(1)


def _collect_noscript_markup(soup):
    """Return the unescaped text of every <noscript> node, one node per line."""
    joined = ''.join(node.get_text() + '\n' for node in soup.find_all('noscript'))
    # html.parser.unescape was removed in Python 3.9; html.unescape replaces it.
    return html.unescape(joined)


def _download_images(markup, meta_path, image_dir):
    """Download every <img src> found in *markup* and log index/URL pairs.

    Each image is saved as ``<image_dir><index>.jpg`` and a matching
    ``index<TAB>url`` line is written to *meta_path*. A failed download is
    reported and skipped so one bad URL does not abort the run; its index is
    still consumed so file names keep matching the metadata file.

    Returns the number of image URLs processed.
    """
    img_nodes = BeautifulSoup(markup, 'html.parser').find_all('img')
    count = 0
    with open(meta_path, 'w', encoding='utf-8') as img_meta:
        for img in img_nodes:
            img_url = img.get('src')
            if img_url is None:
                continue
            img_meta.write(str(count) + "\t" + img_url + "\n")
            try:
                # Images are fetched one at a time.
                urllib.request.urlretrieve(img_url, image_dir + str(count) + ".jpg")
            except (OSError, ValueError) as err:
                # OSError covers URLError/HTTPError; ValueError covers
                # malformed or scheme-less URLs.
                print('下载失败: ' + img_url + ' (' + str(err) + ')')
            count += 1
    return count


def main(url=_CANDIDATE_URLS[-1], scroll_rounds=5000):
    """Scrape a Zhihu question/answer page and download the images it embeds.

    Steps:
      1. Open *url* in Safari via Selenium and keep scrolling / clicking the
         "more" button (at most *scroll_rounds* times).
      2. Save the prettified page HTML to ./output/rawfile/raw_result.txt.
      3. Save the unescaped contents of all <noscript> nodes to
         ./output/rawfile/noscript_meta.txt.
      4. Download every <img src> found in that markup into
         ./output/image/output0/ and list them in
         ./output/rawfile/img_meta.txt.

    Args:
        url: Page to scrape. Defaults to the page the original script ended on.
        scroll_rounds: Upper bound on scroll/click iterations.
    """
    import os

    # The output folders are created up front so the writes below cannot fail
    # on a fresh checkout.
    os.makedirs('./output/rawfile', exist_ok=True)
    os.makedirs('./output/image/output0', exist_ok=True)

    # ---- Open the page in the browser and load as much content as possible
    driver = webdriver.Safari()
    try:
        driver.get(url)
        _scroll_and_expand(driver, scroll_rounds)
        result_raw = driver.page_source  # raw page HTML
    finally:
        # Always release the browser session, even if scraping failed.
        driver.quit()

    # ---- Persist the page, structured (prettified) by BeautifulSoup
    result_soup = BeautifulSoup(result_raw, 'html.parser')
    with open('./output/rawfile/raw_result.txt', 'w', encoding='utf-8') as raw_file:
        raw_file.write(result_soup.prettify())
    print('HTML数据存储成功!')

    # ---- Decode the <noscript> contents and persist them
    noscript_all = _collect_noscript_markup(result_soup)
    with open('./output/rawfile/noscript_meta.txt', 'w', encoding='utf-8') as noscript_meta:
        noscript_meta.write(noscript_all)
    print('转码后<noscript>成功!')

    # ---- Download the images
    _download_images(noscript_all, "./output/rawfile/img_meta.txt", "./output/image/output0/")
    print("下载成功!")


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
#

评论

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×