A股上市公司传智教育(股票代码 003032)旗下技术交流社区北京昌平校区

 找回密码
 加入黑马

QQ登录

只需一步,快速开始

© HM代景康 高级黑马   /  2013-10-26 20:32  /  474 人查看  /  0 人回复  /   0 人收藏 转载请遵从CC协议 禁止商业使用本文

 public void getWebByHomePage() {  startTime = System.currentTimeMillis();
  this.myDomain = getDomain();
  if (myDomain == null) {
  System.out.println("Wrong input!");
  // System.exit(1);
  return;
  }
  System.out.println("Homepage = " + strHomePage);
  addReport("Homepage = " + strHomePage + "!\n");
  System.out.println("Domain = " + myDomain);
  addReport("Domain = " + myDomain + "!\n");
  arrUrls.add(strHomePage);
  arrUrl.add(strHomePage);
  allUrls.put(strHomePage, 0);
  deepUrls.put(strHomePage, 1);
  File fDir = new File(fPath);
  if (!fDir.exists()) {
  fDir.mkdir();
  }
  System.out.println("Start!");
  this.addReport("Start!\n");
  String tmp = getAUrl();
  this.getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
  int i = 0;
  for (i = 0; i < intThreadNum; i++) {
  new Thread(new Processer(this))。start();
  }
  while (true) {
  if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
  long finishTime = System.currentTimeMillis();
  long costTime = finishTime - startTime;
  System.out.println("\n\n\n\n\nFinished!");
  addReport("\n\n\n\n\nFinished!\n");
  System.out.println("Start time = " + startTime + "   "
  + "Finish time = " + finishTime + "   "
  + "Cost time = " + costTime + "ms");
  addReport("Start time = " + startTime + "   "
  + "Finish time = " + finishTime + "   "
  + "Cost time = " + costTime + "ms" + "\n");
  System.out.println("Total url number = "
  + (webSuccessed + webFailed) + "   Successed: "
  + webSuccessed + "   Failed: " + webFailed);
  addReport("Total url number = " + (webSuccessed + webFailed)
  + "   Successed: " + webSuccessed + "   Failed: "
  + webFailed + "\n");
  String strIndex = "";
  String tmpUrl = "";
  while (!arrUrl.isEmpty()) {
  tmpUrl = getUrl();
  strIndex += "Web depth:" + deepUrls.get(tmpUrl)
  + "   Filepath: " + fPath + "/web"
  + allUrls.get(tmpUrl) + ".htm" + "   url:" + tmpUrl
  + "\n\n";
  }
  System.out.println(strIndex);
  try {
  PrintWriter pwIndex = new PrintWriter(new FileOutputStream(
  "fileindex.txt"));
  pwIndex.println(strIndex);
  pwIndex.close();
  } catch (Exception e) {
  System.out.println("生成索引文件失败!");
  }
  break;
  }
  }
  }
  /**
   * Downloads one page and saves its non-empty lines to
   * {@code fPath + "/web" + fileIndex + ".htm"}.
   *
   * <p>While the depth recorded for {@code strUrl} in {@code deepUrls} is below
   * {@code webDepth}, every saved line is also passed to
   * {@link #getUrlByString(String, String)} so that links on the page are queued. The outcome
   * is reported through {@code addReport} and counted with {@code addWebSuccessed()} or
   * {@code addWebFailed()}; no exception escapes this method.
   *
   * @param strUrl URL of the page to fetch
   * @param charset not applied at present: the page is read and written with the platform
   *     default charset (NOTE(review): confirm whether decoding should honour this value)
   * @param fileIndex suffix used to build the output file name
   */
  public void getWebByUrl(String strUrl, String charset, String fileIndex) {
    try {
      System.out.println("Getting web by url: " + strUrl);
      addReport("Getting web by url: " + strUrl + "\n");
      URL url = new URL(strUrl);
      String filePath = fPath + "/web" + fileIndex + ".htm";

      InputStream is = url.openStream();
      try {
        PrintWriter pw = new PrintWriter(
            new OutputStreamWriter(new FileOutputStream(filePath)));
        try {
          BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
          String rLine;
          while ((rLine = bReader.readLine()) != null) {
            // Blank lines are neither saved nor scanned for links.
            if (rLine.length() > 0) {
              pw.println(rLine);
              if (deepUrls.get(strUrl) < webDepth) {
                getUrlByString(rLine, strUrl);
              }
            }
          }
        } finally {
          // Closing flushes the writer, so no per-line flush is needed.
          pw.close();
        }
      } finally {
        // Release the connection even when reading or writing fails part-way.
        is.close();
      }

      System.out.println("Get web successfully! " + strUrl);
      addReport("Get web successfully! " + strUrl + "\n");
      addWebSuccessed();
    } catch (Exception e) {
      System.out.println("Get web failed!       " + strUrl);
      System.err.println("Cause for " + strUrl + ": " + e);
      addReport("Get web failed!       " + strUrl + "\n");
      addWebFailed();
    }
  }
  /**
   * Extracts the domain from {@code strHomePage}.
   *
   * <p>The match must be preceded by {@code http://} plus an optional alphanumeric label and
   * dot, and must end in one of {@code com, cn, net, org, biz, info, cc, tv}. Matching is
   * case-insensitive.
   *
   * @return the first matching substring, or {@code null} when the home page has none
   */
  public String getDomain() {
    Matcher domainMatcher = Pattern
        .compile(
            "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)",
            Pattern.CASE_INSENSITIVE)
        .matcher(strHomePage);
    if (!domainMatcher.find()) {
      return null;
    }
    return domainMatcher.group();
  }
  /**
   * Scans one line of HTML for {@code href} targets that contain {@code myDomain} and queues
   * every target not yet present in {@code allUrls}.
   *
   * <p>Each new URL gets the next file index from {@code getIntWebIndex()} and a depth one
   * greater than the depth recorded for {@code strUrl}.
   *
   * <p>NOTE(review): the check on {@code allUrls} followed by the inserts is not atomic; whether
   * that matters depends on how the shared collections are declared, which is not visible here.
   *
   * @param inputArgs the text to scan (one line of the page)
   * @param strUrl URL of the page the text came from; must already have a depth in
   *     {@code deepUrls}
   */
  public void getUrlByString(String inputArgs, String strUrl) {
    // The domain is quoted so that its dots match literally instead of "any character".
    // String.valueOf keeps the old behaviour for an unset domain instead of throwing.
    String regUrl = "(?<=(href=)[\"]?[\']?)[http://][^\\s\"\'\\?]*("
        + Pattern.quote(String.valueOf(myDomain)) + ")[^\\s\"\'>]*";
    Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(inputArgs);

    // find() resumes after the previous match, so no substring or re-matching is needed.
    while (m.find()) {
      String found = m.group(0);
      if (allUrls.containsKey(found)) {
        continue;
      }
      int depth = deepUrls.get(strUrl) + 1;
      System.out.println("Find a new url,depth:" + depth + " " + found);
      addReport("Find a new url,depth:" + depth + " " + found + "\n");

      // Register index and depth before queueing the URL: workers look both up as soon as
      // they take a URL from arrUrls.
      allUrls.put(found, getIntWebIndex());
      deepUrls.put(found, depth);
      arrUrls.add(found);
      arrUrl.add(found);
    }
  }
  /**
   * Worker task: keeps taking URLs from the enclosing crawler's pending queue and downloading
   * them until the queue is observed empty.
   */
  class Processer implements Runnable {
    /** Crawler handed in at construction time; stored but not read inside this class. */
    GetWeb gw;

    public Processer(GetWeb g) {
      gw = g;
    }

    /** Fetches queued URLs one at a time and returns once {@code arrUrls} is empty. */
    public void run() {
      for (;;) {
        if (arrUrls.isEmpty()) {
          return;
        }
        String next = getAUrl();
        getWebByUrl(next, charset, allUrls.get(next) + "");
      }
    }
  }
  }

评分

参与人数 1黑马币 +3 收起 理由
周志龙 + 3 赞一个!论坛已有该资源

查看全部评分

0 个回复

您需要登录后才可以回帖 登录 | 加入黑马