作为一名测试人员,电脑上各种各样的测试图片数据是必不可少的。那么,如何获取这些图片资源呢?
获取数据的方案有很多种。作为测试人员,可以利用 Python 爬虫来爬取知乎回答内的所有图片资源,用到的是 Selenium 和 BeautifulSoup 等相关库。基本思路是:访问对应的问答 URL 地址,通过 Selenium 操作页面滚动以加载更多回答,然后把页面上的所有图片路径提取出来,进行转码和存储,最后根据已存储的图片路径批量下载图片,并保存到本地文件夹。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
| from bs4 import BeautifulSoup from selenium import webdriver import time import urllib.request import html.parser import ssl
# Swap the ssl module's default HTTPS context factory for the unverified one,
# so that HTTPS requests relying on the default context (such as the urllib
# image downloads below) do not validate server certificates.
# NOTE(review): this relies on private ssl attributes and disables certificate
# verification for the whole process -- acceptable for a throwaway scraper,
# unsafe anywhere the downloaded content must be trusted.
ssl._create_default_https_context = ssl._create_unverified_context
def main(url: str = "https://www.zhihu.com/question/333026642/answer/780949078",
         max_pages: int = 5000) -> None:
    """Scrape every answer image from a Zhihu question page.

    Steps:
      1. Open *url* in Safari, then repeatedly scroll to the bottom and click
         the ``button.QuestionMainAction`` button until it can no longer be
         found or clicked (or *max_pages* rounds have been done).
      2. Save the prettified page HTML to ``./output/rawfile/raw_result.txt``.
      3. Collect the text of all ``<noscript>`` nodes, HTML-unescape it and
         save it to ``./output/rawfile/noscript_meta.txt``.
      4. Parse the unescaped markup, write ``<index>\\t<src>`` for each
         ``<img>`` with a ``src`` to ``./output/rawfile/img_meta.txt`` and
         download it to ``./output/image/output0/<index>.jpg``.

    Args:
        url: Page to scrape. The original script navigated through several
            question URLs in a row, but only the last page loaded was ever
            scraped; that last URL is kept as the default.
        max_pages: Upper bound on scroll/click rounds.
    """
    # Local imports keep this function self-contained; only stdlib and the
    # selenium package already used by this file are needed.
    import html
    import os

    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    # Other question pages the original script also navigated to (their
    # content was discarded because the next driver.get() replaced it):
    #   https://www.zhihu.com/question/292901966/answer/800705040
    #   https://www.zhihu.com/question/35931586
    #   https://www.zhihu.com/question/61235373
    #   https://www.zhihu.com/question/28481779
    #   https://www.zhihu.com/question/19671417
    #   https://www.zhihu.com/question/20196263
    #   https://www.zhihu.com/question/46458423
    #   https://www.zhihu.com/question/26037846

    # Make sure the output folders exist so open()/urlretrieve() do not fail.
    os.makedirs('./output/rawfile', exist_ok=True)
    os.makedirs('./output/image/output0', exist_ok=True)

    driver = webdriver.Safari()
    try:
        driver.get(url)
        for n in range(max_pages):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(2)  # give the page time to render the newly loaded content
            try:
                # find_element(By...) replaces find_element_by_css_selector,
                # which was removed in Selenium 4.3.
                driver.find_element(By.CSS_SELECTOR, 'button.QuestionMainAction').click()
            except WebDriverException:
                # Button missing or not clickable: stop paging.
                break
            print('Page' + str(n))
            time.sleep(1)
        result_raw = driver.page_source
    finally:
        # Always release the browser session, even if scraping fails.
        driver.quit()

    result_soup = BeautifulSoup(result_raw, 'html.parser')
    result_bf = result_soup.prettify()

    # Explicit UTF-8: the page contains Chinese text, and the platform default
    # encoding is not guaranteed to be able to represent it.
    with open('./output/rawfile/raw_result.txt', 'w', encoding='utf-8') as raw_file:
        raw_file.write(result_bf)
    print('HTML数据存储成功!')

    # The <noscript> nodes are read as text and unescaped before being parsed
    # for <img> tags below.
    noscript_inner_all = ''.join(
        noscript.get_text() + '\n' for noscript in result_soup.find_all('noscript')
    )
    # html.unescape is the documented API (html.parser.unescape is only an
    # incidental re-export).
    noscript_all = html.unescape(noscript_inner_all)
    with open('./output/rawfile/noscript_meta.txt', 'w', encoding='utf-8') as noscript_meta:
        noscript_meta.write(noscript_all)
    print('转码后<noscript>成功!')

    img_soup = BeautifulSoup(noscript_all, 'html.parser')
    with open("./output/rawfile/img_meta.txt", 'w', encoding='utf-8') as img_meta:
        count = 0
        for img in img_soup.find_all('img'):
            img_url = img.get('src')
            if img_url is None:
                continue
            img_meta.write(str(count) + "\t" + img_url + "\n")
            try:
                urllib.request.urlretrieve(
                    img_url, "./output/image/output0/" + str(count) + ".jpg")
            except (OSError, ValueError) as err:
                # One broken or malformed URL must not abort the whole run.
                print('Failed to download ' + img_url + ': ' + str(err))
            # Advance even on failure so file names keep matching img_meta.txt.
            count += 1
    print("下载成功!")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__': main()
|