大数据5期-Aaron，本月技术学习

   腾讯微博 Python 爬取（部分）。为防止拿来主义，本代码已作特殊处理，不可直接运行。内容仅供膜拜，别无他用。
import os

# Entry page of the site; the browser session starts here.
BASE_URL = "http://t.qq.com/"
# When True, discovered links are stored in MongoDB and printed as records;
# when False, only the bare link is printed.
STORE_IN_DB = True
# Running counter checked by the second-level crawl loop.
work_cnt = 0
# The second-level crawl stops once work_cnt exceeds this value.
MAX_CNT = 10000

# The login step further down reads USER_NAME and PASSWD, but the original
# only had them as commented-out placeholders, so the script died with a
# NameError.  Reading them from the environment defines the names and keeps
# credentials out of the source file.
USER_NAME = os.environ.get("QQ_USER_NAME", "")
PASSWD = os.environ.get("QQ_PASSWD", "")

# NOTE(review): MongoClient is presumably pymongo.MongoClient; its import is
# not part of this snippet and must be present for the script to run.
client = MongoClient('mongodb://localhost:27017/')
db = client.tengxunweibo
collection = db.homepage

# Start a Chrome session, log in, then open the logged-in user's follow list
# and parse it into `bs` for the first-level link extraction.
driver = webdriver.Chrome()
driver.get(BASE_URL)
# NOTE(review): inner_frame is never used afterwards in this snippet.  If the
# login form lives inside an iframe, a driver.switch_to.frame(...) call is
# probably needed before the lookups below -- confirm against the live page.
inner_frame = driver.find_element_by_id("login_div")
# Presumably switches the widget to account/password login.
driver.find_element_by_id("switcher_plogin").send_keys(Keys.RETURN)

# The original typed both the user name and the password into the "u" input,
# which concatenates them in the account field and leaves the password empty.
elem = driver.find_element_by_id("u")
elem.send_keys(USER_NAME)
# NOTE(review): "p" is assumed to be the id of the password input on the
# login form -- confirm against the page markup.
password_input = driver.find_element_by_id("p")
password_input.send_keys(PASSWD)
password_input.send_keys(Keys.RETURN)

# Submitting with RETURN does not block until the next page has loaded; wait
# the same fixed interval the crawl loop uses before touching the new page.
time.sleep(5)

elem = driver.find_element_by_css_selector("li.item_follow")
follow_url = elem.find_element_by_class_name("stat").get_attribute("href")
driver.get(follow_url)
bs = BeautifulSoup(driver.page_source, 'html5lib')

# First-level pass: collect the profile slug (last path segment of the first
# link) of every "item" div on the parsed follow-list page.
# links_1 was referenced but never created or filled in the original, so the
# second-level loop had nothing to iterate over.
links_1 = []
for item in bs.findAll('div', {"class": "item"}):
    anchor = item.find('a')
    # Skip entries without a usable link instead of crashing on None['href'].
    if anchor is None or not anchor.get('href'):
        continue
    boss_link = anchor['href'].split('/')[-1]
    # Placeholder anchors carry no profile address.
    if boss_link == 'javascript:void(0)':
        continue
    if boss_link not in links_1:
        if not STORE_IN_DB:
            print(boss_link)
        links_1.append(boss_link)
        if STORE_IN_DB:
            info = {"rank": '1', "homeLink": boss_link}
            print(info)
            # Mirrors the rank-2 handling later in the script, which inserts
            # the record when STORE_IN_DB is set; the original built the
            # record here but never stored it.
            collection.insert_one(info)

# Second-level pass: visit every first-level profile, open the list linked
# from its third "bor6" entry, and page through it collecting profile slugs.
# links_2 was referenced but never created in the original.
links_2 = []
for link in links_1:  # the original carried a commented-out [-6:] slice here
    link = 'http://t.qq.com/' + link
    print('open rank 1 => ' + link)
    # The original announced the page and slept, but never navigated to it,
    # so the same page source was parsed on every iteration.
    driver.get(link)
    time.sleep(5)
    bs = BeautifulSoup(driver.page_source, 'html5lib')
    stat_items = bs.findAll('li', {"class": "bor6"})
    if len(stat_items) < 3:
        continue
    follow_anchor = stat_items[2].find('a')
    if follow_anchor is None or not follow_anchor.get('href'):
        continue
    follow_url = follow_anchor['href']
    driver.get(follow_url)

    while True:
        if work_cnt > MAX_CNT:
            break
        bs = BeautifulSoup(driver.page_source, 'html5lib')

        # Progress marker for this page: if the counter has not moved by the
        # time we want to page forward, the page yielded nothing and paging
        # stops.  The original initialised it to 0 once and never updated it.
        # NOTE(review): re-arming per page is the presumed intent -- confirm.
        noEmpty_1 = work_cnt
        # Per-item progress marker: the loop stops as soon as one item failed
        # to advance the counter.
        noEmpty_2 = work_cnt - 1
        for entry in bs.findAll('li', {"class": "userList"}):
            if work_cnt <= noEmpty_2:
                break
            noEmpty_2 = work_cnt
            if len(entry) == 0:
                continue
            anchor = entry.find('a')
            # The original compared the split result with None, which can
            # never be true; the real failure mode is a missing anchor.
            if anchor is None or not anchor.get('href'):
                continue
            user_link = anchor['href'].split('/')[-1]
            # The original never incremented work_cnt, so the markers above
            # broke the loop after one item and MAX_CNT was unreachable.
            # NOTE(review): counting every usable item (not only new links)
            # is an assumption about the removed line -- confirm.
            work_cnt += 1

            if user_link not in links_2:
                if not STORE_IN_DB:
                    print(user_link)
                links_2.append(user_link)
                if STORE_IN_DB:
                    info = {"rank": '2', "homeLink": user_link}
                    print(info)
                    recordid = collection.insert_one(info).inserted_id

        next_link = bs.findAll('a', {"class": "pageBtn"})
        if len(next_link) == 0:
            break
        # Only the last pager button is checked, against the "next page" label.
        if next_link[-1].text == '下一页 >>':
            if work_cnt <= noEmpty_1:
                break
            print('next page ====> %d\n' % work_cnt)

            # Look the buttons up once instead of twice, then trigger the last.
            page_buttons = driver.find_elements_by_class_name("pageBtn")
            page_buttons[-1].send_keys(Keys.RETURN)
            # Give the next page time to load before re-reading page_source.
            time.sleep(5)
        else:
            break
    print('---------------------------------------------------------------------')

帐号		自动登录	找回密码
密码			加入黑马

大数据5期-Aaron，本月技术学习

0 个回复

浏览过的版块