# Send a browser-like User-Agent so the site does not reject the request as a bot.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
url = 'http://college.gaokao.com/schlist/p'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
# Random 0-2 second pause, also meant to avoid anti-scraping measures.
time.sleep(random.randint(0, 2))
再调用 lxml 获取到整页的学校名称
# Build an lxml element tree from the HTML text of the response.
selector = etree.HTML(
    response.text
)
# All schools on the page: every <dl> directly under an element whose class
# attribute starts with "scores_List".
all_list = selector.xpath(
    '//*[starts-with(@class,"scores_List")]/dl'
)
调用 for 循环获取dl中所有需要的数据
# Walk every <dl> entry and pull out the fields of interest.
# The [6:] / [5:] slices drop a fixed-length prefix from the text node
# (presumably the field label -- confirm against the page markup).
for sel in all_list:
    name = sel.xpath('dt/strong/a/text()')[0]  # school name
    place = sel.xpath('dd/ul/li[1]/text()')[0][6:]  # school location
    # Named school_type so the builtin `type` is not shadowed.
    school_type = sel.xpath('dd/ul/li[3]/text()')[0][5:]  # school type
    nature = sel.xpath('dd/ul/li[5]/text()')[0][5:]  # school nature
    # Some entries have no "features" span; xpath then returns an empty list
    # and indexing it raises IndexError.
    try:
        tese = sel.xpath('dd/ul/li[2]/span/text()')[0]  # school features
    except IndexError:
        tese = ''  # missing value is stored as an empty string
    lishu = sel.xpath('dd/ul/li[4]/text()')[0][5:]  # governing body
最后将爬取的数据保存(保存成CSV文件格式)
# Append one row to school.csv (GBK-encoded). `item`, the row to write, is not
# defined in this snippet and must be supplied by the surrounding code.
# newline='' lets the csv module control line endings itself.
with open('school.csv', 'a', encoding='gbk', newline='') as file:
    writer = csv.writer(file)
    try:
        writer.writerow(item)
    except (UnicodeEncodeError, csv.Error) as e:
        # Best effort: report a row that cannot be encoded or formatted and
        # carry on, instead of hiding every possible error.
        print(e)
最后用函数将全部代码串接
附上完整代码
import requests
from lxml import etree
import time
import random
import csv
def csv_writer(item):
    """Append one row to ``school.csv`` in the current working directory.

    The file is opened in append mode with GBK encoding. A row that cannot
    be encoded as GBK or formatted as CSV is reported with ``print`` and
    skipped, so a single bad record does not abort the crawl.

    Args:
        item: Iterable of field values that make up one CSV row.
    """
    # newline='' lets the csv module control line endings itself.
    with open('school.csv', 'a', encoding='gbk', newline='') as file:
        writer = csv.writer(file)
        try:
            writer.writerow(item)
        except (UnicodeEncodeError, csv.Error) as e:
            print(e)
def spider(url_, timeout=10):
    """Download *url_* and return its body parsed as an lxml HTML tree.

    The request is sent with the module-level ``headers`` dict after a
    random pause of 0-2 seconds, which spaces successive requests apart.

    Args:
        url_: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The result of ``etree.HTML`` applied to the response text.
    """
    time.sleep(random.randint(0, 2))
    res = requests.get(url_, headers=headers, timeout=timeout)
    return etree.HTML(res.text)
def parse(list_url):
    """Scrape one listing page and append every school on it to the CSV.

    Fetches *list_url* through ``spider``, selects each ``<dl>`` directly
    under an element whose class starts with ``scores_List``, extracts six
    fields per entry and passes them to ``csv_writer`` as one row.

    Args:
        list_url: URL of a school-listing page.

    Raises:
        IndexError: If any field other than the school features is missing
            from an entry.
    """
    selector = spider(list_url)
    all_list = selector.xpath('//*[starts-with(@class,"scores_List")]/dl')
    for sel in all_list:
        # The [6:] / [5:] slices drop a fixed-length prefix from the text
        # node (presumably the field label -- confirm against the markup).
        name = sel.xpath('dt/strong/a/text()')[0]  # school name
        place = sel.xpath('dd/ul/li[1]/text()')[0][6:]  # school location
        # Local is named school_type so the builtin `type` is not shadowed.
        school_type = sel.xpath('dd/ul/li[3]/text()')[0][5:]  # school type
        nature = sel.xpath('dd/ul/li[5]/text()')[0][5:]  # school nature
        # Some entries have no "features" span; indexing the empty xpath
        # result raises IndexError, which is mapped to an empty string.
        try:
            tese = sel.xpath('dd/ul/li[2]/span/text()')[0]  # school features
        except IndexError:
            tese = ''
        lishu = sel.xpath('dd/ul/li[4]/text()')[0][5:]  # governing body
        csv_writer([name, place, school_type, nature, tese, lishu])
# Browser-like User-Agent sent with every request made by spider().
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
# Listing pages are addressed by appending the page number to this prefix.
url_ = 'http://college.gaokao.com/schlist/p'
# Page numbers 1 through 106 inclusive.
all_url = [url_ + str(i) for i in range(1, 107)]
for url in all_url:
    parse(url)
将爬取的文件进行整合并进行可视化
柱状图
from pyecharts.charts import Bar
from pyecharts import options as opts
import pandas as pd
# Workbook holding the data to plot; the code reads three columns from it:
# Column1 (x-axis categories), Column2 and Column3 (the two plotted series).
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)
x1 = data['Column1'].tolist()
y1 = data['Column2'].tolist()
y2 = data['Column3'].tolist()

# Build the bar chart one call at a time, then write it out as HTML.
bar = Bar()
bar.add_xaxis(x1)
bar.add_yaxis("本科", y1)
bar.add_yaxis("专科", y2)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="大学", subtitle="情况")
)
bar.render(path='bar.html')
前十折线图
from pyecharts.charts import Line
import pandas as pd
from pyecharts import options as opts
# Workbook holding the data to plot.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)
# Only the first ten rows of each column are plotted.
x1 = data['Column1'].tolist()[:10]
y1 = data['Column2'].tolist()[:10]
y2 = data['Column3'].tolist()[:10]
line = Line()
line.add_xaxis(x1)
line.add_yaxis("本科", y1)
line.add_yaxis("专科", y2)
line.set_global_opts(title_opts=opts.TitleOpts(title="前十"))
# Write the chart to a standalone HTML file.
line.render(path='line.html')
高校数前十名 环形图
from pyecharts.charts import Pie
import pandas as pd
from pyecharts import options as opts
# Workbook holding the data to plot.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# Top ten by number of schools: pair the first ten Column1 labels with the
# first ten Column2 values and draw them as a ring-shaped rose chart.
pie = Pie()
pie.add(
    "",
    [
        [label, value]
        for label, value in zip(
            data['Column1'].values.tolist()[:10],
            data['Column2'].values.tolist()[:10],
        )
    ],
    radius=["30%", "75%"],
    center=["40%", "50%"],
    rosetype="radius",
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(title="高校数量前十名"),
    legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
)
pie.render('高校数量前十名.html')
散点图
import pyecharts.options as opts
from pyecharts.charts import Scatter
import pandas as pd
# Workbook holding the data to plot.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)
# Only the first ten rows of each column are plotted.
x1 = data['Column1'].tolist()[:10]
y1 = data['Column2'].tolist()[:10]
y2 = data['Column3'].tolist()[:10]

# Same fluent call style as the bar chart: each builder call returns the chart.
scatter = (
    Scatter()
    .add_xaxis(x1)
    .add_yaxis('本科', y1)
    .add_yaxis('专科', y2)
    .set_global_opts(title_opts=opts.TitleOpts(title="高校"))
)
scatter.render(path='scatter.html')
Geo
from pyecharts.charts import Geo
import pandas as pd
from pyecharts import options as opts
# Workbook holding the data to plot.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)

# Plot [Column1, Column2] pairs on a map of China with a piecewise visual
# scale capped at 150; per-point labels are hidden.
geo = (
    Geo()
    .add_schema(maptype="china")
    .add(
        "高校分布图",
        [
            [place, count]
            for place, count in zip(
                data['Column1'].values.tolist(),
                data['Column2'].values.tolist(),
            )
        ],
    )
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(is_piecewise=True, max_=150),
        title_opts=opts.TitleOpts(title="各地区高校数量"),
    )
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
)
geo.render(path='geo.html')
Map
from pyecharts.charts import Map
import pandas as pd
from pyecharts import options as opts
# Workbook holding the data to plot.
datafile = r'D:/Program Files/Tencent/QQ/QQ/out2/school.xlsx'
data = pd.read_excel(datafile)
# Named map_chart rather than `map` so the builtin map() stays usable in the
# rest of the module.
map_chart = Map()
# Each data point is a [Column1, Column2] pair.
map_chart.add(
    "高校分布图",
    [list(z) for z in zip(data['Column1'].values.tolist(), data['Column2'].values.tolist())],
)
map_chart.set_global_opts(
    visualmap_opts=opts.VisualMapOpts(max_=150),
    title_opts=opts.TitleOpts(title="各地区高校数量"),
)
map_chart.render(path='map.html')