黑马程序员技术交流社区
标题:
如何用JAVA写一个知乎爬虫?(二)
[打印本页]
作者:
jacobsnow
时间:
2016-5-2 10:08
标题:
如何用JAVA写一个知乎爬虫?(二)
2.创建两个线程池和一个Storage。一个是抓取网页线程池,负责执行request请求,并将返回的网页内容存到Storage中。另一个是解析网页线程池,负责从Storage中取出网页内容并解析:解析出用户资料存入数据库,并解析出该用户关注的人的首页地址,再将这些地址的请求加入抓取网页线程池,如此一直循环下去。
3.关于url去重,我是直接将访问过的链接md5化后存入数据库,每次访问前,查看数据库中是否存在该链接。
/**
 * Entry point of the crawler: wires up a page-fetching thread pool, a
 * page-parsing thread pool and the shared {@link Storage}, submits the first
 * request and then supervises both pools until enough users were crawled.
 */
public class CrawlZhiHu {
    private static final Logger logger = MyLogger.getMyLogger(CrawlZhiHu.class);
    private final Storage storage;

    public CrawlZhiHu(){
        storage = new Storage();
    }

    public static void main(String[] args) throws Exception{
        ZhihuHttpClient zhClient = new ZhihuHttpClient();
        CrawlZhiHu crawlZhiHu= new CrawlZhiHu();
        crawlZhiHu.getZhiHu(zhClient, "https://www.zhihu.com/people/wo-yan-chen-mo/followees");
    }

    /**
     * Starts the crawl at {@code startUrl} and blocks until the number of
     * users read from standard input has been reached and all pending pages
     * have been drained, or until the calling thread is interrupted.
     *
     * @param zhClient HTTP client wrapper shared by every fetch task
     * @param startUrl first URL to fetch
     */
    public void getZhiHu(ZhihuHttpClient zhClient, String startUrl){
        System.out.print("请输入要抓取的用户数量:");
        // The Scanner is intentionally not closed: closing it would close System.in.
        int crawlUserCount = new Scanner(System.in).nextInt();

        ThreadPoolExecutor getWebPagethreadPool = new ThreadPoolExecutor(5, 10, 3, TimeUnit.SECONDS,
                new ArrayBlockingQueue<Runnable>(1000), new ThreadPoolExecutor.DiscardOldestPolicy());
        MyThreadPoolExecutor parseWebPagethreadPool = new MyThreadPoolExecutor(1, 1, 3, TimeUnit.SECONDS,
                new ArrayBlockingQueue<Runnable>(1000), new ThreadPoolExecutor.DiscardOldestPolicy(),storage);

        HttpGet getRequest = new HttpGet(startUrl);
        getWebPagethreadPool.execute(new GetWebPageTask(zhClient,getRequest,storage,getWebPagethreadPool,parseWebPagethreadPool));

        // Monitors reporting the execution status of each thread pool.
        ThreadPoolMonitor et1 = new ThreadPoolMonitor(parseWebPagethreadPool,"解析网页线程池--");
        ThreadPoolMonitor et2 = new ThreadPoolMonitor(getWebPagethreadPool,"获取网页线程池--");
        new Thread(et1).start();
        new Thread(et2).start();

        while(true){
            if(ParseWebPageTask.userCount >= crawlUserCount){
                // Stop accepting new fetch tasks; shutdown() is idempotent, so
                // calling it on every pass is harmless.
                getWebPagethreadPool.shutdown();
                if(getWebPagethreadPool.isTerminated() && storage.getQueue().size() == 0){
                    parseWebPagethreadPool.shutdown();
                    et1.shutdown();
                    et2.shutdown();
                    break;
                }
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error("InterruptedException",e);
                // Honour the interruption instead of looping forever: restore
                // the flag for callers and stop the pools and monitors.
                Thread.currentThread().interrupt();
                getWebPagethreadPool.shutdown();
                parseWebPagethreadPool.shutdown();
                et1.shutdown();
                et2.shutdown();
                break;
            }
        }
    }
}
复制代码
/**
 * Task for the page-fetching pool: executes one GET request, pushes the
 * response body into the shared {@link Storage} and schedules a
 * {@link ParseWebPageTask} on the parsing pool.
 */
public class GetWebPageTask implements Runnable{
    private static Logger logger = MyLogger.getMyLogger(GetWebPageTask.class);
    private HttpGet getMethod = null;
    /** Number of pages fetched with status 200 so far. */
    public static int gwpCount = 0;
    Storage storage = null;
    ZhihuHttpClient zhClient = null;
    ThreadPoolExecutor gwpThreadPool = null;// page-fetching thread pool
    MyThreadPoolExecutor pwpThreadPool = null;// page-parsing thread pool

    public GetWebPageTask(){
    }

    /**
     * @param zhClient      HTTP client wrapper providing the client and its context
     * @param getMethod     request to execute
     * @param storage       queue receiving the fetched page content
     * @param gwpThreadPool page-fetching pool, handed on to the parse task
     * @param pwpThreadPool page-parsing pool on which the parse task is scheduled
     */
    public GetWebPageTask(ZhihuHttpClient zhClient, HttpGet getMethod, Storage storage,ThreadPoolExecutor gwpThreadPool,MyThreadPoolExecutor pwpThreadPool){
        this.zhClient = zhClient;
        this.getMethod = getMethod;
        this.storage = storage;
        this.gwpThreadPool = gwpThreadPool;
        this.pwpThreadPool = pwpThreadPool;
    }

    /**
     * Executes the request, retrying every 100 ms while the server answers
     * 429, and hands a 200 response body over to the parsing pool. Any other
     * status is dropped. Failures are logged, never rethrown.
     */
    public void run(){
        CloseableHttpResponse response = null;
        CloseableHttpClient hc = zhClient.getHttpClient();
        try {
            response = hc.execute(getMethod,zhClient.getContext());
            int status = response.getStatusLine().getStatusCode();
            logger.error("executing request " + getMethod.getURI() + " status:" + status);
            while(status == 429){
                // Release the rejected response before retrying; otherwise each
                // retry would keep one more connection checked out.
                closeResponse(response);
                response = null;
                Thread.sleep(100);
                response = hc.execute(getMethod,zhClient.getContext());
                status = response.getStatusLine().getStatusCode();
            }
            if(status == HttpStatus.SC_OK){
                incrementFetchedCount();
                // Decode explicitly instead of relying on the platform default
                // charset. NOTE(review): assumes the fetched pages are UTF-8.
                String s = IOUtils.toString(response.getEntity().getContent(), "UTF-8");
                storage.push(s);// enqueue
                pwpThreadPool.execute(new ParseWebPageTask(zhClient,this.storage,gwpThreadPool,pwpThreadPool));
            }
        } catch (ClientProtocolException e) {
            logger.error("ClientProtocolException",e);
        } catch (ConnectException e) {
            logger.error("ConnectException",e);
        } catch (IOException e) {
            logger.error("IOException",e);
        } catch (InterruptedException e) {
            logger.error("InterruptedException",e);
            // Keep the interrupt visible to the executor running this task.
            Thread.currentThread().interrupt();
        } catch (NullPointerException e){
            logger.error("NullPointerException",e);
        }finally {
            releaseResources(response);
        }
    }

    /** Increments {@link #gwpCount}; synchronized because tasks run on several pool threads. */
    private static synchronized void incrementFetchedCount(){
        gwpCount++;
    }

    /**
     * Releases everything held for {@code response}. Does nothing when the
     * request failed before a response was obtained.
     */
    private void releaseResources(CloseableHttpResponse response){
        if(response == null){
            return;
        }
        if(response.getEntity() != null){
            try {
                getMethod.releaseConnection();
                response.getEntity().getContent().close();
            } catch (UnsupportedOperationException e) {
                logger.error("UnsupportedOperationException",e);
            } catch (IOException e) {
                logger.error("IOException",e);
            }
        }
        if (response.getStatusLine().getStatusCode() != 200) {
            getMethod.abort();
        }
        closeResponse(response);
    }

    /** Closes {@code response}, logging instead of propagating an I/O failure. */
    private void closeResponse(CloseableHttpResponse response){
        try {
            response.close();
        } catch (IOException e) {
            logger.error("IOException",e);
        }
    }
}
复制代码
解析网页线程,负责解析网页并将解析出的用户资料插入数据库
/**
 * Task for the page-parsing pool: takes one page out of the shared
 * {@link Storage}, and either stores the user profile it describes and
 * requests that user's followee list pages, or (for a followee list page)
 * requests the followees page of every listed user.
 */
public class ParseWebPageTask implements Runnable{
    private static Logger logger = MyLogger.getMyLogger(ParseWebPageTask.class);
    /** Number of parse tasks started so far. */
    public static int pwpCount = 0;
    public static int userCount = 0;
    Storage storage = null;
    ZhihuHttpClient zhClient = null;
    ThreadPoolExecutor gwpThreadPool = null;
    MyThreadPoolExecutor pwpThreadPool = null;

    public ParseWebPageTask(){
    }

    /**
     * @param zhClient      HTTP client wrapper handed on to new fetch tasks
     * @param storage       queue the page content is taken from
     * @param gwpThreadPool page-fetching pool that receives follow-up requests
     * @param pwpThreadPool page-parsing pool, handed on to new fetch tasks
     */
    public ParseWebPageTask(ZhihuHttpClient zhClient,Storage storage,ThreadPoolExecutor gwpThreadPool,MyThreadPoolExecutor pwpThreadPool){
        this.storage = storage;
        this.zhClient = zhClient;
        this.gwpThreadPool = gwpThreadPool;
        this.pwpThreadPool = pwpThreadPool;
    }

    /**
     * Parses one stored page. A page with a {@code <title>} element is treated
     * as a user profile; anything else is treated as a followee list fragment.
     * Every failure is logged, and the database connection is always closed.
     */
    @Override
    public void run() {
        Connection cn = null;
        try {
            pwpCount++;
            String ym = storage.pop();
            Document doc = Jsoup.parse(ym);
            cn = ConnectionManage.getConnection();
            if(doc.select("title").size() != 0){
                User u = parseUserdetail(doc);
                ZhuhuDAO.insetToDB(cn,u);
                // The followee list is requested in pages of 20 entries.
                int pageCount = u.getFollowees()/20 + 1;
                for(int i = 0;i < pageCount;i++){
                    String url = "https://www.zhihu.com/node/ProfileFolloweesListV2?params={%22offset%22:" + 20*i + ",%22order_by%22:%22created%22,%22hash_id%22:%22" + u.getHashId() +"%22}";
                    url = url.replace("{","%7B").replace("}","%7D").replace(" ","%20");
                    // Only add more work while the fetch queue is short.
                    if(gwpThreadPool.getQueue().size() <= 100){
                        dealHref(cn,url);
                    }
                }
            }else {
                Elements es = doc.select(".zm-list-content-medium .zm-list-content-title a");
                for(Element temp:es){
                    String userIndex = temp.attr("href") + "/followees";
                    dealHref(cn,userIndex);
                }
            }
        } catch (Exception e) {
            logger.error("Exception",e);
        } finally {
            // Close here so the connection is returned even when parsing or
            // a database call above throws.
            closeConnection(cn);
        }
    }

    /**
     * Records the MD5 of {@code href} and, if accepted, submits a fetch task
     * for it, provided the parsing pool's queue is not longer than 100.
     *
     * NOTE(review): the link is also accepted when the fetch queue holds at
     * most 50 tasks, regardless of what {@code insertHref} returned —
     * presumably to keep the crawl going when few new links show up; confirm.
     *
     * @param cn   open database connection used for the lookup/insert
     * @param href URL to fetch
     * @throws SQLException if recording the link fails
     */
    public void dealHref(Connection cn,String href) throws SQLException {
        String md5Href = Md5Util.Convert2Md5(href);
        if(ZhuhuDAO.insertHref(cn,md5Href) || gwpThreadPool.getQueue().size() <= 50){
            if(pwpThreadPool.getQueue().size() <= 100){
                try{
                    HttpGet getRequest = new HttpGet(href);
                    gwpThreadPool.execute(new GetWebPageTask(zhClient,getRequest,storage,gwpThreadPool,pwpThreadPool));
                } catch(IllegalArgumentException e){
                    logger.error("IllegalArgumentException",e);
                }
            }
        }
    }

    /**
     * Builds a {@link User} from a profile page.
     *
     * @param doc parsed profile page
     * @return the populated user
     * @throws RuntimeException if one of the numeric counters is missing from
     *         the page or is not an integer
     */
    public User parseUserdetail(Document doc){
        User u = new User();
        u.setLocation(getUserinfo(doc,"location"));
        u.setBusiness(getUserinfo(doc,"business"));
        u.setEmployment(getUserinfo(doc,"employment"));
        u.setPosition(getUserinfo(doc,"position"));
        u.setEducation(getUserinfo(doc,"education"));

        Element titleLink = doc.select(".title-section.ellipsis a").first();
        if(titleLink != null){
            u.setUsername(titleLink.text());
            u.setUrl("https://www.zhihu.com" + titleLink.attr("href"));
        } else {
            logger.error("profile title link not found; username and url left unset");
        }

        u.setAgrees(Integer.valueOf(doc.select(".zm-profile-header-user-agree strong").first().text()));
        u.setThanks(Integer.valueOf(doc.select(".zm-profile-header-user-thanks strong").first().text()));
        Elements followCounters = doc.select(".zm-profile-side-following strong");
        u.setFollowees(Integer.valueOf(followCounters.first().text()));
        u.setFollowers(Integer.valueOf(followCounters.get(1).text()));

        Element opButton = doc.select(".zm-profile-header-op-btns.clearfix button").first();
        if(opButton != null){
            u.setHashId(opButton.attr("data-id"));
        } else {
            // No operation button on the page: fall back to a fixed hash id.
            u.setHashId("843df56056dc14b8dd36ace99be09337");
        }
        return u;
    }

    /**
     * Reads the {@code title} attribute of one item in the profile header's
     * description section.
     *
     * @param doc      parsed profile page
     * @param infoName CSS class of the item, e.g. {@code "location"}
     * @return the attribute value, or an empty string when the item is absent
     */
    public String getUserinfo(Document doc,String infoName){
        Element e = doc.select(".zm-profile-header-user-describe ." + infoName + ".item").first();
        return e == null ? "" : e.attr("title");
    }

    /** Closes {@code cn} if it is non-null and still open, logging any failure. */
    private void closeConnection(Connection cn){
        if(cn == null){
            return;
        }
        try {
            if(!cn.isClosed()){
                cn.close();
            }
        } catch (SQLException e) {
            logger.error("SQLException",e);
        }
    }
}
复制代码
欢迎光临 黑马程序员技术交流社区 (http://bbs.itheima.com/)
黑马程序员IT技术论坛 X3.2