1. Packet-capture setup

1.1 Fiddler: the tool listens on port 8888 by default.

1.2 Cheetah Free WiFi (猎豹免费WiFi): turns the Windows machine into a hotspot for the phone to connect to.

1.3 Configure the proxy on the phone: run ipconfig on Windows to find the IP address of the hotspot created by Cheetah Free WiFi, then set the phone's proxy server to that IP, using Fiddler's port (8888).

2. Letvlive.py

import scrapy
import json
from Letv.items import LetvItem


# The class name can be anything; it must inherit from scrapy.Spider (basic spider)
class LetvliveSpider(scrapy.Spider):
    # Spider name; it must be unique within the current project
    name = 'Letvlive'
    # Crawling is restricted to these domains. If this is commented out,
    # there is no domain restriction and any site can be crawled.
    allowed_domains = ['letv.com']

    page = 1
    pre = "http://dynamic.live.app.m.letv.com/android/dynamic.php?luamod=main&mod=live&ctl=liveHuya&act=channelList&pcode=010210000&version=7.17&channelId=2168&pages="
    suf = "&country=CN&provinceid=1&districtid=9&citylevel=1&location=%E5%8C%97%E4%BA%AC%E5%B8%82%7C%E6%9C%9D%E9%98%B3%E5%8C%BA&lang=chs&region=CN"
    # Links in start_urls are not subject to the allowed_domains restriction
    start_urls = [pre + str(page) + suf]

    def parse(self, response):
        json_text = response.text
        # Parse json_text into a Python dict
        python_dict = json.loads(json_text)
        for item in python_dict["body"]["result"]:
            letvItem = LetvItem()
            # Extract the nickname and the screenshot URL
            nick = item["nick"]
            image = item["screenshot"]
            letvItem["nick"] = nick
            letvItem["image"] = image
            print(letvItem)
            # Hand the item over to the pipelines
            yield letvItem

        if python_dict.get("header").get("status") == "1":
            self.page += 1
            new_url = self.pre + str(self.page) + self.suf
            # Scrapy deduplicates scheduled requests: a URL that has already
            # been requested is not requested again. Once every unique page
            # URL has been fetched, the crawl ends on its own.
            yield scrapy.Request(new_url, callback=self.parse)

3. pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline  # built-in image-saving pipeline
import json
import os
from Letv.settings import IMAGES_STORE
# from scrapy.utils.project import get_project_settings


class LetvImagePipeline(ImagesPipeline):
    # IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    # Yield a request for each image URL to download
    def get_media_requests(self, item, info):
        # Image download URL
        image = item["image"]
        # Feed the request into the Scrapy engine so the downloader fetches the image
        yield scrapy.Request(image)

    # Called after the image download completes; `results` carries the stored paths
    def item_completed(self, results, item, info):
        print("results===", results)
        image = [x["path"] for ok, x in results if ok][0]
        print(image)
        # Rename the downloaded file after the streamer's nickname
        old_image_name = IMAGES_STORE + "/" + image
        # e.g. ./images/黑作坊丶小美儿.jpg
        new_image_name = IMAGES_STORE + "/" + item["nick"] + ".jpg"
        print("old_image_name==", old_image_name)
        print("new_image_name==", new_image_name)
        # Rename
        os.rename(old_image_name, new_image_name)
        print(image)
        item["image_path"] = new_image_name
        return item


# Default pipeline: handles the text output
class LetvPipeline(object):
    # Called once when the spider starts
    def open_spider(self, spider):
        self.file = open(spider.name + ".json", "w")

    def process_item(self, item, spider):
        python_dict = dict(item)
        # Python dict --> JSON string
        json_str = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item

    # Called once when the spider finishes
    def close_spider(self, spider):
        self.file.close()

4. settings.py

# Do not obey robots.txt
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'Letv.pipelines.LetvPipeline': 301,       # save text
    'Letv.pipelines.LetvImagePipeline': 300,  # save images
}

# Image storage path. This must be set, and set correctly,
# otherwise no images are downloaded.
IMAGES_STORE = "./images"

5. Run file --- start.py

from scrapy import cmdline

cmdline.execute("scrapy crawl Letvlive".split())
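6. items.py (for reference)

The spider imports LetvItem from Letv.items, but the post never shows items.py. A minimal sketch of what it would need to contain, inferred from the three fields the code above assigns (nick, image, image_path):

import scrapy


class LetvItem(scrapy.Item):
    # Streamer nickname; LetvImagePipeline uses it to rename the screenshot
    nick = scrapy.Field()
    # Screenshot URL downloaded by LetvImagePipeline
    image = scrapy.Field()
    # Final on-disk path, filled in by item_completed()
    image_path = scrapy.Field()

Note that Scrapy's ImagesPipeline requires Pillow; make sure it is installed (pip install Pillow), otherwise the images will not be downloaded.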
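7. Expected API response shape (for reference)

parse() indexes into the JSON several levels deep, so it helps to see the structure it assumes. The field names below come straight from the spider code; the values are illustrative placeholders, not captured data:

# Rough shape of the JSON that parse() expects:
response_shape = {
    "header": {
        "status": "1",  # while this is "1", the spider keeps requesting the next page
    },
    "body": {
        "result": [
            # one entry per live stream
            {"nick": "example_streamer", "screenshot": "http://example.com/shot.jpg"},
        ],
    },
}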