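# Tencent Weibo crawler in Python (partial). To discourage copy-and-paste reuse, this code has been deliberately altered and cannot be run as-is. It is provided for reference only and serves no other purpose.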
# Crawl Tencent Weibo "following" lists two levels deep with Selenium:
#   rank 1 - accounts followed by the logged-in user
#   rank 2 - accounts followed by each rank-1 account
# Every newly discovered home-page slug is printed and, when STORE_IN_DB is
# true, inserted into the MongoDB collection tengxunweibo.homepage.
#
# NOTE(review): MongoClient, webdriver, Keys, BeautifulSoup and time are used
# below but no import for them is visible in this file - confirm they are
# imported before this point.
BASE_URL = "http://t.qq.com/"
STORE_IN_DB = True  # False: only print discovered slugs, skip MongoDB writes
work_cnt = 0  # number of rank-2 links recorded so far
MAX_CNT = 10000  # paging stops once work_cnt exceeds this
# Login credentials: fill these in locally and never commit real values.
USER_NAME = ""
PASSWD = ""
links_1 = []  # rank-1 slugs, in discovery order
links_2 = []  # rank-2 slugs, in discovery order

client = MongoClient('mongodb://localhost:27017/')
db = client.tengxunweibo
collection = db.homepage

# --- Log in -----------------------------------------------------------------
driver = webdriver.Chrome()
driver.get(BASE_URL)
# NOTE(review): inner_frame is looked up but never used; presumably a
# driver.switch_to.frame(inner_frame) call belongs here - confirm against the
# live login page before adding it.
inner_frame = driver.find_element_by_id("login_div")
driver.find_element_by_id("switcher_plogin").send_keys(Keys.RETURN)
elem = driver.find_element_by_id("u")
elem.send_keys(USER_NAME)
# The password must go into its own input rather than being appended to the
# user-name field.
# NOTE(review): "p" is assumed to be the password input id of the login form
# that also hosts "u" and "switcher_plogin" - confirm.
elem = driver.find_element_by_id("p")
elem.send_keys(PASSWD)
elem.send_keys(Keys.RETURN)

# --- Rank 1: accounts the logged-in user follows ----------------------------
elem = driver.find_element_by_css_selector("li.item_follow")
follow_url = elem.find_element_by_class_name("stat").get_attribute("href")
driver.get(follow_url)
bs = BeautifulSoup(driver.page_source, 'html5lib')
for item in bs.findAll('div', {"class": "item"}):
    anchor = item.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # Entry without a usable link: nothing to record.
        continue
    boss_link = href.split('/')[-1]
    if boss_link == 'javascript:void(0)':
        continue
    if boss_link not in links_1:
        if not STORE_IN_DB:
            print(boss_link)
        # Remember the slug so the rank-2 pass below has something to visit.
        links_1.append(boss_link)
        if STORE_IN_DB:
            info = {"rank": '1', "homeLink": boss_link}
            print(info)
            # Stored the same way as the rank-2 records further down.
            collection.insert_one(info)

# --- Rank 2: accounts followed by each rank-1 account -----------------------
for link in links_1:
    link = BASE_URL + link
    print('open rank 1 => ' + link)
    # Actually navigate to the profile; otherwise the page parsed below is
    # whatever the browser happened to show last.
    driver.get(link)
    time.sleep(5)
    bs = BeautifulSoup(driver.page_source, 'html5lib')
    stat_items = bs.findAll('li', {"class": "bor6"})
    if len(stat_items) < 3:
        continue
    # The third "bor6" entry is the one whose link is opened as the
    # following list.
    follow_url = stat_items[2].find('a')['href']
    driver.get(follow_url)
    noEmpty_1 = 0
    while True:
        if work_cnt > MAX_CNT:
            break
        bs = BeautifulSoup(driver.page_source, 'html5lib')
        noEmpty_2 = work_cnt - 1
        for item in bs.findAll('li', {"class": "userList"}):
            # Stop scanning this page as soon as the previous entry did not
            # add a new record (work_cnt did not advance).
            if work_cnt <= noEmpty_2:
                break
            noEmpty_2 = work_cnt
            if len(item) == 0:
                continue
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            user_link = href.split('/')[-1]
            if user_link not in links_2:
                if not STORE_IN_DB:
                    print(user_link)
                links_2.append(user_link)
                if STORE_IN_DB:
                    info = {"rank": '2', "homeLink": user_link}
                    print(info)
                    recordid = collection.insert_one(info).inserted_id
                # NOTE(review): the counter was never advanced anywhere, which
                # made MAX_CNT and both progress guards ineffective. Counting
                # each newly recorded rank-2 link is the assumed intent -
                # confirm.
                work_cnt += 1
        next_link = bs.findAll('a', {"class": "pageBtn"})
        if len(next_link) == 0:
            break
        # The last pager button reads "next page >>" while more pages remain.
        if next_link[-1].text != '下一页 >>':
            break
        if work_cnt <= noEmpty_1:
            # Nothing has been recorded at all: do not keep paging.
            break
        print('next page ====> %d\n' % work_cnt)
        driver.find_elements_by_class_name("pageBtn")[-1].send_keys(Keys.RETURN)
    print('---------------------------------------------------------------------')
|
|