黑马程序员技术交流社区

标题: python instagram 爬虫 [打印本页]

作者: suifeng199106    时间: 2019-7-13 19:07
标题: python instagram 爬虫

直接介绍一下具体的步骤以及注意点:

instagram 爬虫注意点

下面给出能运行的代码(代码中设置了翻墙(FQ)代理,不需要的话可以去掉 proxies 参数):

# -*- coding:utf-8 -*-
"""Small Instagram scraper: fetch an account profile and its timeline media.

All HTTP traffic goes through the module-level ``proxies`` mapping; drop the
``proxies`` argument in ``_fetch`` if no proxy is needed.
"""
import hashlib
import json
import re
import sys
import urllib.parse

import requests

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
BASE_URL = 'https://www.instagram.com'
# NOTE(review): the posted snippet pointed the two templates below (and the
# sample post URL at the bottom) at unrelated domains, which looks like link
# rewriting by the hosting site. They are rebuilt from BASE_URL here -- confirm.
ACCOUNT_MEDIAS = BASE_URL + "/graphql/query/?query_hash=42323d64886122307be10013ad2dcc44&variables=%s"
ACCOUNT_PAGE = BASE_URL + '/%s'

# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

proxies = {
    'http': 'http://127.0.0.1:1087',
    'https': 'http://127.0.0.1:1087',
}

# Alternative: configure the proxy once on a session so that it does not have
# to be passed to every requests call:
# s = requests.session()
# s.proxies = {'http': '121.193.143.249:80'}

# Captures the JSON assigned to window._sharedData inside a <script> tag.
_SHARED_DATA_RE = re.compile(r"_sharedData\s*=\s*(.*?);</script>", re.S)


def _fetch(url):
    """GET *url* with the default headers and proxy; raise on HTTP errors."""
    response = requests.get(
        url,
        headers=generate_header(),
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response


def get_shared_data(html=''):
    """Return the dict parsed from ``window._sharedData`` in a page.

    :param html: page source to parse; when empty, BASE_URL is downloaded.
    :return: the decoded ``_sharedData`` object.
    :raises ValueError: if the page contains no ``_sharedData`` assignment.
    """
    target_text = html if html else _fetch(BASE_URL).text
    match_result = _SHARED_DATA_RE.search(target_text)
    if match_result is None:
        raise ValueError('window._sharedData not found in page')
    return json.loads(match_result.group(1))


# Legacy helper, kept for reference (disabled in the original snippet):
# def get_rhx_gis():
#     """get the rhx_gis value from sharedData
#     """
#     share_data = get_shared_data()
#     return share_data['rhx_gis']


def get_account(user_name):
    """Fetch the profile page of *user_name* and return its account info.

    :param user_name: account name used to build the profile URL.
    :return: dict produced by ``resolve_account_data``.
    """
    response = _fetch(get_account_link(user_name))
    data = get_shared_data(response.text)
    return resolve_account_data(data)


def get_media_by_user_id(user_id, count=50, max_id=''):
    """Collect up to *count* timeline media entries for a user.

    :param user_id: id of the account whose timeline is read.
    :param count: maximum number of media entries to return.
    :param max_id: pagination cursor to start after ('' for the first page).
    :return: list of dicts produced by ``general_resolve_media``.
    """
    medias = []
    has_next_page = True
    # Stop as soon as enough items are collected; the original `<=` test
    # issued one extra request whose result was then discarded.
    while len(medias) < count and has_next_page:
        # Without explicit separators json.dumps inserts a space after ':'
        # and ',' (its defaults are (', ', ': ')).
        varibles = json.dumps({
            'id': str(user_id),
            'first': count,
            'after': str(max_id),
        }, separators=(',', ':'))
        response = _fetch(get_account_media_link(varibles))
        timeline = json.loads(response.text)['data']['user']['edge_owner_to_timeline_media']
        edges = timeline['edges']
        if not edges:
            return medias
        for item in edges[:count - len(medias)]:
            medias.append(general_resolve_media(item['node']))
        page_info = timeline['page_info']
        max_id = page_info['end_cursor']
        has_next_page = page_info['has_next_page']
    return medias


def get_media_by_url(media_url):
    """Fetch a single post by its page URL and return the resolved media dict."""
    response = _fetch(get_media_url(media_url))
    media_json = json.loads(response.text)
    return general_resolve_media(media_json['graphql']['shortcode_media'])


def get_account_media_link(varibles):
    """Return the media query URL with *varibles* percent-encoded into it."""
    return ACCOUNT_MEDIAS % urllib.parse.quote(varibles)


def get_account_link(user_name):
    """Return the profile page URL for *user_name*."""
    return ACCOUNT_PAGE % user_name


def get_media_url(media_url):
    """Return *media_url* with trailing slashes removed and '/?__a=1' appended."""
    return media_url.rstrip('/') + '/?__a=1'


# Legacy helper, kept for reference (disabled in the original snippet):
# def generate_instagram_gis(varibles):
#     rhx_gis = get_rhx_gis()
#     gis_token = rhx_gis + ':' + varibles
#     x_instagram_token = hashlib.md5(gis_token.encode('utf-8')).hexdigest()
#     return x_instagram_token


def generate_header(gis_token=''):
    """Build the request headers.

    :param gis_token: when non-empty, sent as the 'x-instagram-gis' header.
    :return: dict of HTTP headers.
    """
    # todo: if have session, add the session key:value to header
    header = {
        'user-agent': USER_AGENT,
    }
    if gis_token:
        header['x-instagram-gis'] = gis_token
    return header


def general_resolve_media(media):
    """Flatten a raw media node into a plain dict.

    Optional keys ('title', 'dimensions', 'thumbnail_src', 'video_url') fall
    back to an empty value when missing or falsy, and 'content' is '' when the
    node has no caption edge.

    :param media: raw media node (dict) from the timeline or post response.
    :return: dict with the selected, flattened fields.
    """
    caption_edges = media['edge_media_to_caption']['edges']
    # The original indexed edges[0] unconditionally and raised IndexError
    # whenever the caption edge list was empty.
    content = caption_edges[0]['node']['text'] if caption_edges else ''
    return {
        'id': media['id'],
        # Drops the first five characters of __typename and lowercases the rest.
        'type': media['__typename'][5:].lower(),
        'content': content,
        'title': media.get('title') or '',
        'shortcode': media['shortcode'],
        'preview_url': BASE_URL + '/p/' + media['shortcode'],
        'comments_count': media['edge_media_to_comment']['count'],
        'likes_count': media['edge_media_preview_like']['count'],
        'dimensions': media.get('dimensions') or {},
        'display_url': media['display_url'],
        'owner_id': media['owner']['id'],
        'thumbnail_src': media.get('thumbnail_src') or '',
        'is_video': media['is_video'],
        'video_url': media.get('video_url') or '',
    }


def resolve_account_data(account_data):
    """Extract the account fields from a parsed ``_sharedData`` dict.

    :param account_data: dict returned by ``get_shared_data`` for a profile page.
    :return: dict of profile fields (ids, counts, names, flags, picture URL).
    """
    user = account_data['entry_data']['ProfilePage'][0]['graphql']['user']
    return {
        'country': account_data['country_code'],
        'language': account_data['language_code'],
        'biography': user['biography'],
        'followers_count': user['edge_followed_by']['count'],
        'follow_count': user['edge_follow']['count'],
        'full_name': user['full_name'],
        'id': user['id'],
        'is_private': user['is_private'],
        'is_verified': user['is_verified'],
        'profile_pic_url': user['profile_pic_url_hd'],
        'username': user['username'],
    }


if __name__ == '__main__':
    account = get_account('shaq')
    result = get_media_by_user_id(account['id'], 56)
    media = get_media_by_url(BASE_URL + '/p/Bw3-Q2XhDMf/')
    print(len(result))
    print(result)
封装成库了!

除此以外,为了方便我写了一个库放在了 github 上,里面包含了很多操作,希望大家能看一下给点建议。如果对你有用的话,欢迎 star 和 PR~ 感谢泥萌!!







欢迎光临 黑马程序员技术交流社区 (http://bbs.itheima.com/) 黑马程序员IT技术论坛 X3.2