黑马程序员技术交流社区
标题: 使用Python爬取携程酒店图片 [打印本页]
作者: 专注的一批 时间: 2019-12-25 14:31
标题: 使用Python爬取携程酒店图片
使用selenium模拟登录网页获取图片信息
代码如下:
from selenium import webdriver
from lxml import etree
import numpy as np
import requests
import time,os
# Make sure the download directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() before os.mkdir().
os.makedirs("pic", exist_ok=True)
# Request headers sent with every image download so the request carries a
# desktop-browser User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) '
        'Gecko/20100101 Firefox/66.0'
    ),
}
def get_driver():
    """Create and return a new Chrome WebDriver instance."""
    return webdriver.Chrome()
def get_page_source(driver, url):
    """Load *url* with the ``AHeadUserInfo`` cookie set and return the page HTML.

    The page is opened once, all cookies are cleared, the ``AHeadUserInfo``
    cookie is installed, and the page is loaded a second time.

    Args:
        driver: Selenium WebDriver (or any object with the same ``get``,
            ``delete_all_cookies``, ``add_cookie`` and ``page_source`` API).
        url: Address of the page to open.

    Returns:
        The HTML source of the page after the second load. The original
        returned the result of ``driver.get(url)``, which is always ``None``.
    """
    # First visit; presumably required so the cookie can be attached to this
    # site before the real load -- see the Selenium add_cookie documentation.
    driver.get(url)
    driver.delete_all_cookies()
    driver.add_cookie({
        'name': 'AHeadUserInfo',
        'value': 'VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=0',
    })
    # Second load, now carrying the cookie set above.
    driver.get(url)
    return driver.page_source
# Fetch the image information.
def parse_datas(driver):
    """Save every hotel thumbnail on the current list page, then click "next page".

    Each ``<img>`` found under ``div#hotel_list`` / ``div.hotel_pic`` is
    downloaded into ``./pic/`` under the name
    ``<alt text><last four characters of the image URL>``. An image that
    cannot be downloaded or written is reported and skipped. Finally the
    ``a#downHerf`` link inside ``div#page_info`` is clicked.

    Args:
        driver: Selenium WebDriver currently showing a hotel list page.
    """
    # Random pause of 10-14 seconds (randint's upper bound is exclusive).
    # NOTE(review): the pause now happens BEFORE the DOM snapshot so that the
    # page requested by the previous call's click has time to render; the
    # original slept after reading page_source. Confirm pagination is async.
    time.sleep(np.random.randint(10, 15))
    xhtml = etree.HTML(driver.page_source)

    alt_texts = xhtml.xpath("//div[@id='hotel_list']//div[@class='hotel_pic']/a//img/@alt")
    sources = xhtml.xpath("//div[@id='hotel_list']//div[@class='hotel_pic']/a//img/@src")

    for alt, src in zip(alt_texts, sources):
        # Only add a scheme when the src does not already carry one;
        # prefixing blindly turns "http://..." into "http:http://...".
        if src.startswith(("http://", "https://")):
            pic = src
        else:
            pic = "http:" + src
        # File name = alt text + last four characters of the URL (e.g. ".jpg").
        filename = alt + pic[-4:]

        try:
            # A timeout keeps one stalled download from hanging the whole crawl.
            response = requests.get(pic, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            print('download failed:', pic, err)
            continue

        try:
            with open("./pic/" + filename, 'wb') as f:
                f.write(response.content)
        except (OSError, ValueError):
            # Typically an alt text containing characters that are not valid
            # in a file name; report and carry on with the next image.
            print('==========文件名有误==========')

    # Move on to the next result page. find_element("xpath", ...) is used
    # because find_element_by_xpath was removed in Selenium 4.
    driver.find_element("xpath", '//div[@id="page_info"]//a[@id="downHerf"]').click()
def run(start_url=None, pages=50):
    """Crawl hotel list pages and download their thumbnails.

    Opens a browser, loads the first list page, processes ``pages`` pages
    with ``parse_datas`` and always closes the browser afterwards.

    Args:
        start_url: Address of the first list page. When omitted, the
            module-level ``url`` is used, as in the original code.
        pages: Number of list pages to process (the original hard-coded 50).

    Raises:
        NameError: If ``start_url`` is omitted and no module-level ``url``
            is defined.
    """
    if start_url is None:
        # Resolve the URL before launching the browser so a missing URL does
        # not leave a browser process behind.
        try:
            start_url = url
        except NameError:
            raise NameError(
                "run() needs a start URL: pass start_url or define a "
                "module-level `url`"
            ) from None

    driver = get_driver()
    try:
        get_page_source(driver, start_url)
        for _ in range(pages):
            parse_datas(driver)
    finally:
        # Shut the browser down even when a step above raises.
        driver.quit()
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run()
欢迎光临 黑马程序员技术交流社区 (http://bbs.itheima.com/) |
黑马程序员IT技术论坛 X3.2 |