# -*- coding: utf-8 -*-
# Download.py: a small wrapper around requests that rotates User-Agent
# strings, retries failed requests, and falls back to free HTTP proxies
# scraped from haoip.cc.
import requests
import re
import random
import time


class download():

    def __init__(self):
        # Scrape a list of free proxy addresses (ip:port) from haoip.cc.
        self.iplist = []
        html = requests.get('http://haoip.cc/tiqu.htm', headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"})
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ip in iplistn:
            self.iplist.append(ip.strip())
        # Pool of User-Agent strings; get() picks one at random per request.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        UA = random.choice(self.user_agent_list)
        headers = {'User-Agent': UA}
        if proxy is None:
            # Direct request first; on failure, wait and retry, and once
            # the retries are used up, switch to a random proxy.
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except Exception:
                if num_retries > 0:
                    time.sleep(10)  # was timeout.sleep(10), an AttributeError
                    print('Error fetching the page; retrying in 10s,', num_retries, 'attempts left')
                    return self.get(url, timeout, num_retries=num_retries - 1)
                else:
                    IP = random.choice(self.iplist).strip()
                    proxy = {'http': IP}
                    return self.get(url, timeout, proxy)
        else:
            # Fetch through a randomly chosen proxy; on failure, swap in a
            # new proxy and retry, finally dropping the proxy altogether.
            try:
                IP = random.choice(self.iplist).strip()
                proxy = {'http': IP}
                return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
            except Exception:
                if num_retries > 0:
                    time.sleep(10)
                    IP = random.choice(self.iplist).strip()
                    proxy = {'http': IP}
                    print('Switching proxy; retrying in 10s,', num_retries, 'attempts left')
                    print('Current proxy:', proxy)
                    return self.get(url, timeout, proxy, num_retries - 1)
                else:
                    print('The proxies failed as well; retrying without a proxy')
                    return self.get(url, 3)


request = download()
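
# A minimal usage sketch (hypothetical URL; the scraper below makes the
# same call): get() returns a requests.Response, so callers read .text
# or .content as usual.
#
#     resp = request.get('http://example.com/', 3)
#     print(resp.status_code, len(resp.text))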


# -*- coding: utf-8 -*-
# The scraper script: crawls mzitu.com list pages and galleries using the
# request singleton from Download.py above.
import os
from bs4 import BeautifulSoup
from Download import request


class meitu():

    def start_urls(self, url):
        # Read the highest page number from the pagination links, then
        # walk every list page and every gallery linked from it.
        html = request.get(url, 3)
        max_page = BeautifulSoup(html.text, 'html.parser').find_all('a', class_="page-numbers")[-2].get('href').split('/')[-2]
        for page in range(1, int(max_page) + 1):
            next_page = url + 'page/' + str(page)
            page_html = request.get(next_page, 3)
            bsObj = BeautifulSoup(page_html.text, 'html.parser').find('ul', id='pins').find_all('li')
            all_a = []
            for a in bsObj:
                all_a.append(a.find_all('a')[1])
            for href in all_a:
                title = href.get_text()
                print('Downloading ' + title)
                path = str(title).replace("?", '_')
                self.mkdir(path)
                # Use a fresh name here: the original reassigned `url`,
                # which corrupted next_page on the following iteration.
                gallery_url = href['href']
                self.html(gallery_url)

    def html(self, url):
        # Read how many image pages the gallery has, then fetch each one.
        try:
            html = request.get(url, 3)
            max_span = BeautifulSoup(html.text, 'html.parser').find('div', class_='pagenavi').find_all('span')[-2].get_text()
            for page in range(1, int(max_span) + 1):
                page_url = url + '/' + str(page)
                self.img(page_url)
        except Exception:
            print('Connection failed')

    def img(self, one_mm_url):
        # Extract the real image URL from the page and save the file.
        html = request.get(one_mm_url, 3)
        img_url = BeautifulSoup(html.text, 'html.parser').find('div', class_="main-image").find('img')['src']
        self.save(img_url)

    def save(self, img_url):  # save one image to the current directory
        name = img_url[-9:-4]
        img = request.get(img_url, 3)
        f = open(name + '.jpg', 'wb')  # 'wb', not 'ab': write each file fresh
        f.write(img.content)
        f.close()

    def mkdir(self, path):  # create a per-gallery folder and cd into it
        path = path.strip()
        # The original checked D:\mzitu but created E:\mzitu; use one root.
        isExists = os.path.exists(os.path.join("E:\\mzitu", path))
        if not isExists:
            print('Created a folder named', path)
            os.makedirs(os.path.join("E:\\mzitu", path))
            os.chdir(os.path.join("E:\\mzitu", path))  # switch into it
            return True
        else:
            print('A folder named', path, 'already exists')
            os.chdir(os.path.join("E:\\mzitu", path))  # still switch, so save() writes there
            return False
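
# Resulting layout on disk (given the paths above): one folder per gallery
# under E:\mzitu, each holding files named from the tail of the image URL,
# e.g. E:\mzitu\<gallery title>\<name>.jpg.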


if __name__ == '__main__':
    h = meitu()
    h.start_urls('http://www.mzitu.com/')