黑马程序员技术交流社区

标题: 使用Python爬取携程酒店图片 [打印本页]

作者: 专注的一批 时间: 2019-12-25 14:31
标题: 使用Python爬取携程酒店图片

使用selenium模拟登录网页获取图片信息

代码如下：

from selenium import webdriver

from lxml import etree

import numpy as np

import requests

import time,os

# Make sure the download directory exists before any image is written.
# exist_ok=True replaces the exists()/mkdir() pair, which could fail if the
# directory appeared between the check and the creation.
os.makedirs("pic", exist_ok=True)

# 模拟浏览器请求头

# HTTP headers sent with each image download; the User-Agent identifies the
# client as desktop Firefox 66 on Windows 10.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) "
        "Gecko/20100101 Firefox/66.0"
    ),
}

def get_driver():
    """Create a new Chrome WebDriver instance and return it."""
    return webdriver.Chrome()

def get_page_source(driver, url):
    """Open *url* with a preset cookie and return the resulting page source.

    The page is loaded once, the cookie jar is cleared and replaced by a
    single ``AHeadUserInfo`` cookie, and the page is loaded a second time so
    that the request is made with that cookie in place. (Selenium only
    accepts a cookie for the domain that is currently loaded, hence the
    first load.)

    Args:
        driver: WebDriver-like object providing ``get``,
            ``delete_all_cookies``, ``add_cookie`` and ``page_source``.
        url: Address of the page to open.

    Returns:
        The value of ``driver.page_source`` after the second load.
    """
    driver.get(url)
    driver.delete_all_cookies()
    driver.add_cookie({
        'name': 'AHeadUserInfo',
        'value': 'VipGrade=0&VipGradeName=%C6%D5%CD%A8%BB%E1%D4%B1&UserName=&NoReadMessageCount=0',
    })
    # WebDriver.get() returns None, so returning its result (as the original
    # did) never produced the page source. Reload, then return the HTML.
    driver.get(url)
    return driver.page_source

# 获取图片信息

def parse_datas(driver):
    """Download every hotel picture on the current page, then go to the next.

    Parses ``driver.page_source``, pauses for a random 10-14 seconds, saves
    each image found under the ``hotel_list`` container into ``./pic/``
    (file name = the image's ``alt`` text plus the last four characters of
    its URL), and finally clicks the next-page link.

    Args:
        driver: Selenium WebDriver whose current page is a hotel list page.
    """
    xhtml = etree.HTML(driver.page_source)

    # Random pause between pages; np.random.randint excludes the upper
    # bound, so this sleeps 10 to 14 seconds.
    time.sleep(np.random.randint(10, 15))

    img_xpath = "//div[@id='hotel_list']//div[@class='hotel_pic']/a//img"
    alt_texts = xhtml.xpath(img_xpath + "/@alt")
    sources = xhtml.xpath(img_xpath + "/@src")

    # The page is expected to use protocol-relative sources ("//host/...");
    # only add a scheme when the value does not already carry one.
    url_list = [
        src if src.startswith(("http://", "https://")) else "http:" + src
        for src in sources
    ]

    for alt, pic in zip(alt_texts, url_list):
        # Keep the last four characters of the URL as the extension.
        filename = alt + pic[-4:]

        try:
            # A timeout keeps one stalled download from hanging the crawl.
            response = requests.get(pic, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('download failed for {}: {}'.format(pic, exc))
            continue

        try:
            with open("./pic/" + filename, 'wb') as f:
                f.write(response.content)
        except OSError:
            # Typically an alt text containing characters that are not
            # valid in a file name; skip this image and carry on.
            print('==========文件名有误==========')

    # find_element_by_xpath() was removed in Selenium 4.3; the generic
    # find_element("xpath", ...) form works on Selenium 3 and 4 alike.
    driver.find_element(
        "xpath", '//div[@id="page_info"]//a[@id="downHerf"]'
    ).click()

def run(start_url=None, pages=50):
    """Crawl consecutive hotel list pages and download their pictures.

    Args:
        start_url: Address of the first list page. When omitted, the
            module-level ``url`` variable is used, as the original script did.
        pages: Number of list pages to process (previously hard-coded to 50).
    """
    # Resolve the address before launching the browser so that a missing
    # ``url`` fails immediately instead of leaving a browser window open.
    # NOTE(review): no module-level ``url`` is defined in the visible part of
    # this file; define one or pass ``start_url`` explicitly.
    target = url if start_url is None else start_url

    driver = get_driver()
    try:
        get_page_source(driver, target)
        for _ in range(pages):
            parse_datas(driver)
    finally:
        # Always close the browser, even when a page fails part-way through.
        driver.quit()

# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()

欢迎光临黑马程序员技术交流社区 (http://bbs.itheima.com/)

黑马程序员IT技术论坛 X3.2