1. Packet-capture setup

1.1 Fiddler: the tool listens on port 8888 by default.

1.2 Cheetah Free WiFi (猎豹免费WiFi): turns the Windows machine into a hotspot for the phone to connect to.

1.3 Configure the proxy on the phone: run ipconfig on Windows to find the IP address of the hotspot created by Cheetah Free WiFi, then set the phone's proxy server to that IP, using Fiddler's port (8888).

2. Letvlive.py

import scrapy
import json
from Letv.items import LetvItem


# The class name can be anything; it must inherit from scrapy.Spider (basic spider)
class LetvliveSpider(scrapy.Spider):
    # Spider name; it must be unique within the current project
    name = 'Letvlive'
    # Crawling is restricted to these domains. If this is commented out,
    # there is no domain restriction and any site can be crawled.
    allowed_domains = ['letv.com']

    page = 1
    pre = "http://dynamic.live.app.m.letv.com/android/dynamic.php?luamod=main&mod=live&ctl=liveHuya&act=channelList&pcode=010210000&version=7.17&channelId=2168&pages="
    suf = "&country=CN&provinceid=1&districtid=9&citylevel=1&location=%E5%8C%97%E4%BA%AC%E5%B8%82%7C%E6%9C%9D%E9%98%B3%E5%8C%BA&lang=chs&region=CN"
    # Links in start_urls are not subject to the allowed_domains restriction
    start_urls = [pre + str(page) + suf]

    def parse(self, response):
        json_text = response.text
        # Parse json_text into a Python dict
        python_dict = json.loads(json_text)
        for item in python_dict["body"]["result"]:
            letvItem = LetvItem()
            # Extract the nickname and the screenshot URL
            nick = item["nick"]
            image = item["screenshot"]
            letvItem["nick"] = nick
            letvItem["image"] = image
            print(letvItem)
            # Hand the item over to the pipelines
            yield letvItem

        if python_dict.get("header").get("status") == "1":
            self.page += 1
            new_url = self.pre + str(self.page) + self.suf
            # Scrapy deduplicates scheduled requests: a URL that has already
            # been requested is not requested again. Once every unique page
            # URL has been fetched, the crawl ends on its own.
            yield scrapy.Request(new_url, callback=self.parse)

3. pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline  # built-in image-saving pipeline
import json
import os
from Letv.settings import IMAGES_STORE
# from scrapy.utils.project import get_project_settings


class LetvImagePipeline(ImagesPipeline):
    # IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    # Yield a request for each image URL to download
    def get_media_requests(self, item, info):
        # Image download URL
        image = item["image"]
        # Feed the request into the Scrapy engine so the downloader fetches the image
        yield scrapy.Request(image)

    # Called after the image download completes; `results` carries the stored paths
    def item_completed(self, results, item, info):
        print("results===", results)
        image = [x["path"] for ok, x in results if ok][0]
        print(image)
        # Rename the downloaded file after the streamer's nickname
        old_image_name = IMAGES_STORE + "/" + image
        # e.g. ./images/黑作坊丶小美儿.jpg
        new_image_name = IMAGES_STORE + "/" + item["nick"] + ".jpg"
        print("old_image_name==", old_image_name)
        print("new_image_name==", new_image_name)
        # Rename
        os.rename(old_image_name, new_image_name)
        print(image)
        item["image_path"] = new_image_name
        return item


# Default pipeline: handles the text output
class LetvPipeline(object):
    # Called once when the spider starts
    def open_spider(self, spider):
        self.file = open(spider.name + ".json", "w")

    def process_item(self, item, spider):
        python_dict = dict(item)
        # Python dict --> JSON string
        json_str = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item

    # Called once when the spider finishes
    def close_spider(self, spider):
        self.file.close()

4. settings.py

# Do not obey robots.txt
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'Letv.pipelines.LetvPipeline': 301,       # save text
    'Letv.pipelines.LetvImagePipeline': 300,  # save images
}

# Image storage path. This must be set, and set correctly,
# otherwise no images are downloaded.
IMAGES_STORE = "./images"

5. Run file --- start.py

from scrapy import cmdline

cmdline.execute("scrapy crawl Letvlive".split())
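6. items.py (for reference)

The spider imports LetvItem from Letv.items, but the post never shows items.py. A minimal sketch of what it would need to contain, inferred from the three fields the code above assigns (nick, image, image_path):

import scrapy


class LetvItem(scrapy.Item):
    # Streamer nickname; LetvImagePipeline uses it to rename the screenshot
    nick = scrapy.Field()
    # Screenshot URL downloaded by LetvImagePipeline
    image = scrapy.Field()
    # Final on-disk path, filled in by item_completed()
    image_path = scrapy.Field()

Note that Scrapy's ImagesPipeline requires Pillow; make sure it is installed (pip install Pillow), otherwise the images will not be downloaded.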
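7. Expected API response shape (for reference)

parse() indexes into the JSON several levels deep, so it helps to see the structure it assumes. The field names below come straight from the spider code; the values are illustrative placeholders, not captured data:

# Rough shape of the JSON that parse() expects:
response_shape = {
    "header": {
        "status": "1",  # while this is "1", the spider keeps requesting the next page
    },
    "body": {
        "result": [
            # one entry per live stream
            {"nick": "example_streamer", "screenshot": "http://example.com/shot.jpg"},
        ],
    },
}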