2. 创建两个线程池和一个 Storage。第一个是抓取网页线程池,负责执行 HTTP 请求,并把返回的网页内容存入 Storage;第二个是解析网页线程池,负责从 Storage 中取出网页内容并解析:如果是用户主页,就把解析出的用户资料存入数据库,并为该用户的关注列表生成分页请求;如果是关注列表,就取出其中每个被关注用户的主页地址。新生成的请求再提交给抓取网页线程池,如此循环,直到抓取的用户数量达到指定值为止。
3.关于url去重,我是直接将访问过的链接md5化后存入数据库,每次访问前,查看数据库中是否存在该链接。
- public class CrawlZhiHu {
- private static Logger logger = MyLogger.getMyLogger(CrawlZhiHu.class);
- private Storage storage = null;
- public CrawlZhiHu(){
- storage = new Storage();
- }
- public static void main(String[] args) throws Exception{
- ZhihuHttpClient zhClient = new ZhihuHttpClient();
- CrawlZhiHu crawlZhiHu= new CrawlZhiHu();
- crawlZhiHu.getZhiHu(zhClient, "https://www.zhihu.com/people/wo-yan-chen-mo/followees");
- }
- public void getZhiHu(ZhihuHttpClient zhClient, String startUrl){
- System.out.print("请输入要抓取的用户数量:");
- int crawlUserCount = new Scanner(System.in).nextInt();
- ThreadPoolMonitor et1,et2;//监测线程池执行情况
- ThreadPoolExecutor getWebPagethreadPool = new ThreadPoolExecutor(5, 10, 3, TimeUnit.SECONDS,
- new ArrayBlockingQueue<Runnable>(1000), new ThreadPoolExecutor.DiscardOldestPolicy());
- MyThreadPoolExecutor parseWebPagethreadPool = new MyThreadPoolExecutor(1, 1, 3, TimeUnit.SECONDS,
- new ArrayBlockingQueue<Runnable>(1000), new ThreadPoolExecutor.DiscardOldestPolicy(),storage);
- HttpGet getRequest = new HttpGet(startUrl);
- getWebPagethreadPool.execute(new GetWebPageTask(zhClient,getRequest,storage,getWebPagethreadPool,parseWebPagethreadPool));
- et1 = new ThreadPoolMonitor(parseWebPagethreadPool,"解析网页线程池--");
- et2 = new ThreadPoolMonitor(getWebPagethreadPool,"获取网页线程池--");
- new Thread(et1).start();
- new Thread(et2).start();
- while(true){
- if(ParseWebPageTask.userCount >= crawlUserCount){
- getWebPagethreadPool.shutdown();
- if(getWebPagethreadPool.isTerminated() && storage.getQueue().size() == 0){
- parseWebPagethreadPool.shutdown();
- et1.shutdown();
- et2.shutdown();
- break;
- }
- }
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- e.printStackTrace();
- logger.error("InterruptedException",e);
- }
- }
- }
- }
复制代码
- public class GetWebPageTask implements Runnable{
- private static Logger logger = MyLogger.getMyLogger(GetWebPageTask.class);
- private HttpGet getMethod = null;
- public static int gwpCount = 0;
- Storage storage = null;
- ZhihuHttpClient zhClient = null;
- ThreadPoolExecutor gwpThreadPool = null;//获取网页线程池
- MyThreadPoolExecutor pwpThreadPool = null;//解析网页线程池
- public GetWebPageTask(){
- }
- public GetWebPageTask(ZhihuHttpClient zhClient, HttpGet getMethod, Storage storage,ThreadPoolExecutor gwpThreadPool,MyThreadPoolExecutor pwpThreadPool){
- // TODO Auto-generated constructor stub
- this.zhClient = zhClient;
- this.getMethod = getMethod;
- this.storage = storage;
- this.gwpThreadPool = gwpThreadPool;
- this.pwpThreadPool = pwpThreadPool;
- }
- public void run(){
- CloseableHttpResponse response = null;
- CloseableHttpClient hc = zhClient.getHttpClient();
- try {
- response = hc.execute(getMethod,zhClient.getContext());
- int status = response.getStatusLine().getStatusCode();
- logger.error("executing request " + getMethod.getURI() + " status:" + status);
- while(status == 429){
- Thread.sleep(100);
- response = hc.execute(getMethod,zhClient.getContext());
- status = response.getStatusLine().getStatusCode();
- if(status != 429){
- break;
- }
- }
- if(status == HttpStatus.SC_OK){
- gwpCount++;
- String s = IOUtils.toString(response.getEntity().getContent());
- storage.push(s);//入队
- pwpThreadPool.execute(new ParseWebPageTask(zhClient,this.storage,gwpThreadPool,pwpThreadPool));
- } else if(status == 502 || status == 504 || status == 500){
- return ;
- }
- } catch (ClientProtocolException e) {
- e.printStackTrace();
- logger.error("ClientProtocolException",e);
- } catch (ConnectException e) {
- e.printStackTrace();
- logger.error("ConnectException",e);
- } catch (IOException e) {
- e.printStackTrace();
- logger.error("IOException",e);
- } catch (InterruptedException e) {
- e.printStackTrace();
- logger.error("InterruptedException",e);
- } catch (NullPointerException e){
- e.printStackTrace();
- logger.error("NullPointerException",e);
- }finally {
- if(response.getEntity() != null){
- try {
- getMethod.releaseConnection();
- response.getEntity().getContent().close();
- } catch (UnsupportedOperationException e) {
- e.printStackTrace();
- logger.error("UnsupportedOperationException",e);
- } catch (IOException e) {
- e.printStackTrace();
- logger.error("IOException",e);
- }
- }
- if (response.getStatusLine().getStatusCode() != 200) {
- getMethod.abort();
- }
- }
- }
- }
解析网页任务(ParseWebPageTask):负责解析网页,并将解析出的用户资料插入数据库。
- public class ParseWebPageTask implements Runnable{
- private static Logger logger = MyLogger.getMyLogger(MyThreadPoolExecutor.class);
- public static int pwpCount = 0;
- public static int userCount = 0;
- Storage storage = null;
- ZhihuHttpClient zhClient = null;
- ThreadPoolExecutor gwpThreadPool = null;
- MyThreadPoolExecutor pwpThreadPool = null;
- public ParseWebPageTask(){
- }
- public ParseWebPageTask(ZhihuHttpClient zhClient,Storage storage,ThreadPoolExecutor gwpThreadPool,MyThreadPoolExecutor pwpThreadPool){
- this.storage = storage;
- this.zhClient = zhClient;
- this.gwpThreadPool = gwpThreadPool;
- this.pwpThreadPool = pwpThreadPool;
- }
- @Override
- public void run() {
- // TODO Auto-generated method stub
- try {
- pwpCount++;
- User u = null;
- String ym = storage.pop();
- Document doc = Jsoup.parse(ym);
- Connection cn = ConnectionManage.getConnection();
- if(doc.select("title").size() != 0){
- u = parseUserdetail(doc);
- if(ZhuhuDAO.insetToDB(cn,u)){
- // storage.getResult().getUserVector().add(u);
- }
- for(int i = 0;i < u.getFollowees()/20 + 1;i++){
- String url = "https://www.zhihu.com/node/ProfileFolloweesListV2?params={%22offset%22:" + 20*i + ",%22order_by%22:%22created%22,%22hash_id%22:%22" + u.getHashId() +"%22}";
- url = url.replaceAll("[{]","%7B").replaceAll("[}]","%7D").replaceAll(" ","%20");
- if(gwpThreadPool.getQueue().size() <= 100){
- dealHref(cn,url);
- }
- }
- }else {
- Elements es = doc.select(".zm-list-content-medium .zm-list-content-title a");
- for(Element temp:es){
- String userIndex = temp.attr("href") + "/followees";
- dealHref(cn,userIndex);
- }
- }
- if(!cn.isClosed()){
- cn.close();
- cn = null;
- }
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- logger.error("Exception",e);
- }
- }
- public void dealHref(Connection cn,String href) throws SQLException {
- String md5Href = Md5Util.Convert2Md5(href);
- // if(storage.getResult().getHrefSet().add(href)){
- // if(storage.getResult().getHrefSet().size() >= 10000){
- // storage.getResult().getHrefSet().clear();
- // }
- if(ZhuhuDAO.insertHref(cn,md5Href) || gwpThreadPool.getQueue().size() <= 50){
- if(pwpThreadPool.getQueue().size() <= 100){
- HttpGet getRequest = null;
- try{
- getRequest = new HttpGet(href);
- gwpThreadPool.execute(new GetWebPageTask(zhClient,getRequest,storage,gwpThreadPool,pwpThreadPool));
- } catch(IllegalArgumentException e){
- e.printStackTrace();
- logger.error("IllegalArgumentException",e);
- }
- }
- } else{
- }
- }
- public User parseUserdetail(Document doc){
- User u = new User();
- u.setLocation(getUserinfo(doc,"location"));
- u.setBusiness(getUserinfo(doc,"business"));
- u.setEmployment(getUserinfo(doc,"employment"));
- u.setPosition(getUserinfo(doc,"position"));
- u.setEducation(getUserinfo(doc,"education"));
- try {
- u.setUsername(doc.select(".title-section.ellipsis a").first().text());
- u.setUrl("https://www.zhihu.com" + doc.select(".title-section.ellipsis a").first().attr("href"));
- } catch (NullPointerException e){
- logger.error("NullPointerException",e);
- e.printStackTrace();
- }
- u.setAgrees(Integer.valueOf(doc.select(".zm-profile-header-user-agree strong").first().text()));
- u.setThanks(Integer.valueOf(doc.select(".zm-profile-header-user-thanks strong").first().text()));
- u.setFollowees(Integer.valueOf(doc.select(".zm-profile-side-following strong").first().text()));
- u.setFollowers(Integer.valueOf(doc.select(".zm-profile-side-following strong").get(1).text()));
- try {
- u.setHashId(doc.select(".zm-profile-header-op-btns.clearfix button").first().attr("data-id"));
- }catch (NullPointerException e){
- e.printStackTrace();
- u.setHashId("843df56056dc14b8dd36ace99be09337");
- }
- return u;
- }
- public String getUserinfo(Document doc,String infoName){
- Element e = doc.select(".zm-profile-header-user-describe ." + infoName + ".item").first();
- if(e == null){
- return "";
- } else{
- return e.attr("title");
- }
- }
- }
复制代码
|
|