What this achieves: a Pictures folder is created automatically under the project, and images are crawled level by level by following the site's URLs. Under Pictures, folders named after the URL of each level hold the images found at that URL. At the same time, the file name, path, and URL are inserted into a database for easy indexing.

Step 1: create the persistence class that stores the file name, path, and URL.

package org.amuxia.demo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class JDBCHelper {
    private static final String driver = "com.mysql.jdbc.Driver";
    private static final String DBurl = "jdbc:mysql://127.0.0.1:3306/edupic";
    private static final String user = "root";
    private static final String password = "root";
    private PreparedStatement pstmt = null;
    private Connection spiderconn = null;

    public void insertFilePath(String fileName, String filepath, String url) {
        try {
            Class.forName(driver);
            spiderconn = DriverManager.getConnection(DBurl, user, password);
            String sql = "insert into FilePath (filename,filepath,url) values (?,?,?)";
            pstmt = spiderconn.prepareStatement(sql);
            pstmt.setString(1, fileName);
            pstmt.setString(2, filepath);
            pstmt.setString(3, url);
            pstmt.executeUpdate();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            try {
                // Close resources; guard against nulls in case the connection failed early
                if (pstmt != null) pstmt.close();
                if (spiderconn != null) spiderconn.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
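The insert above assumes a FilePath table already exists in the edupic database. The original post does not show its schema, so the one-off setup class below is only a sketch: the column names come from the INSERT statement, while the column types and lengths are assumptions.

package org.amuxia.demo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// Hypothetical one-off setup: creates the FilePath table that insertFilePath() expects.
// Only the column names are taken from the original SQL; types and lengths are assumed.
public class CreateFilePathTable {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://127.0.0.1:3306/edupic", "root", "root");
             Statement stmt = conn.createStatement()) {
            stmt.executeUpdate(
                "CREATE TABLE IF NOT EXISTS FilePath ("
                + " id INT AUTO_INCREMENT PRIMARY KEY,"
                + " filename VARCHAR(255),"
                + " filepath VARCHAR(500),"
                + " url VARCHAR(500))");
        }
    }
}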
Step 2: create the class that parses URLs and does the crawling.
package org.amuxia.demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetWeb {
    private int webDepth = 5; // crawl depth
    private int intThreadNum = 1; // number of worker threads
    private String strHomePage = ""; // homepage URL
    private String myDomain; // domain being crawled
    private String fPath = "CSDN"; // directory that stores the downloaded pages and images
    private ArrayList<String> arrUrls = new ArrayList<String>(); // URLs not yet processed
    private ArrayList<String> arrUrl = new ArrayList<String>(); // all URLs, kept for building the index
    private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>(); // page number of every URL
    private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>(); // depth of every URL
    private int intWebIndex = 0; // file index of each page, starting from 0
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;

    public static void main(String[] args) {
        GetWeb gw = new GetWeb("http://www.csdn.net/");
        gw.getWebByHomePage();
    }

    public GetWeb(String s) {
        this.strHomePage = s;
    }

    public GetWeb(String s, int i) {
        this.strHomePage = s;
        this.webDepth = i;
    }

    public synchronized void addWebSuccessed() {
        webSuccessed++;
    }

    public synchronized void addWebFailed() {
        webFailed++;
    }

    public synchronized String getAUrl() {
        // Return null when the queue is empty so a worker thread can exit cleanly
        if (arrUrls.isEmpty()) {
            return null;
        }
        String tmpAUrl = arrUrls.get(0);
        arrUrls.remove(0);
        return tmpAUrl;
    }

    public synchronized String getUrl() {
        String tmpUrl = arrUrl.get(0);
        arrUrl.remove(0);
        return tmpUrl;
    }

    public synchronized Integer getIntWebIndex() {
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * Starting from the site provided by the user, crawl all linked pages.
     */
    public void getWebByHomePage() {
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null) {
            System.out.println("Wrong input!");
            return;
        }

        System.out.println("Homepage = " + strHomePage);
        System.out.println("Domain = " + myDomain);
        arrUrls.add(strHomePage);
        arrUrl.add(strHomePage);
        allUrls.put(strHomePage, 0);
        deepUrls.put(strHomePage, 1);

        File fDir = new File(fPath);
        if (!fDir.exists()) {
            fDir.mkdir();
        }

        System.out.println("Starting work");
        String tmp = getAUrl(); // take the next URL
        this.getWebByUrl(tmp, allUrls.get(tmp) + ""); // crawl the page behind that URL
        for (int i = 0; i < intThreadNum; i++) {
            new Thread(new Processer(this)).start();
        }
        while (true) {
            if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                long finishTime = System.currentTimeMillis();
                long costTime = finishTime - startTime;
                System.out.println("\n\n\n\n\nFinished");
                System.out.println("Start time = " + startTime + "  End time = " + finishTime
                        + "  Total crawl time = " + costTime + "ms");
                System.out.println("Total URLs crawled = " + (webSuccessed + webFailed)
                        + "  Successful: " + webSuccessed + "  Failed: " + webFailed);
                String strIndex = "";
                String tmpUrl = "";
                while (!arrUrl.isEmpty()) {
                    tmpUrl = getUrl();
                    strIndex += "Web depth: " + deepUrls.get(tmpUrl) + "  Filepath: " + fPath + "/web"
                            + allUrls.get(tmpUrl) + ".htm" + "  url: " + tmpUrl + "\n\n";
                }
                System.out.println(strIndex);
                try {
                    PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                    pwIndex.println(strIndex);
                    pwIndex.close();
                } catch (Exception e) {
                    System.out.println("Failed to generate the index file!");
                }
                break;
            }
            // Sleep briefly instead of busy-waiting while worker threads are still running
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetch one page, store its images, and extract further links from it.
     *
     * @param strUrl
     * @param fileIndex
     */
    public void getWebByUrl(String strUrl, String fileIndex) {
        try {
            System.out.println("Fetching page: " + strUrl);

            URL url = new URL(strUrl);
            InputStream is = url.openStream();

            // Build a directory name from the URL. replace() is used for the literal
            // characters below; replaceAll("*", ".") and the like would throw a
            // PatternSyntaxException because *, ? and | are regex metacharacters.
            String filename = strUrl.replaceAll("/", "_");
            filename = filename.replace(":", ".")
                    .replace("*", ".")
                    .replace("?", ".")
                    .replace("\"", ".")
                    .replace(">", ".")
                    .replace("<", ".")
                    .replace("|", ".");
            String filePath = fPath + "\\" + filename;
            File file = new File(filePath);
            if (!file.exists()) {
                file.mkdir();
            }

            // Record the file name, path and URL in the database
            JDBCHelper helper = new JDBCHelper();
            helper.insertFilePath(filename, filePath, strUrl);

            // Download the images referenced by this page
            GetPicture getp = new GetPicture();
            getp.get(strUrl, filePath);

            // Read the page line by line and extract further links while within the depth limit
            BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
            String rLine = null;
            while ((rLine = bReader.readLine()) != null) {
                if (rLine.length() > 0 && deepUrls.get(strUrl) < webDepth) {
                    getUrlByString(rLine, strUrl);
                }
            }
            is.close();
            System.out.println("Fetched page successfully: " + strUrl);
            addWebSuccessed();
        } catch (Exception e) {
            System.out.println("Failed to fetch page, please check whether the URL exists: " + strUrl);
            addWebFailed();
        }
    }

    /**
     * Check whether the URL provided by the user contains a recognizable domain.
     *
     * @return the domain, or null if none was found
     */
    public String getDomain() {
        String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv|edu)";
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(strHomePage);
        if (m.find()) {
            return m.group(0);
        }
        return null;
    }

    /**
     * Parse one line of HTML and extract the in-domain links it contains.
     *
     * @param inputArgs
     * @param strUrl
     */
    public void getUrlByString(String inputArgs, String strUrl) {
        String tmpStr = inputArgs;
        // Match absolute href links that stay within the crawled domain
        String regUrl = "(?<=(href=)[\"]?[\']?)http://[^\\s\"\'\\?]*(" + myDomain + ")[^\\s\"\'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(tmpStr);
        boolean blnp = m.find();
        while (blnp) {
            if (!allUrls.containsKey(m.group(0))) {
                System.out.println("Found a new url, depth:" + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                arrUrls.add(m.group(0));
                arrUrl.add(m.group(0));
                allUrls.put(m.group(0), getIntWebIndex());
                deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
            }
            tmpStr = tmpStr.substring(m.end());
            m = p.matcher(tmpStr);
            blnp = m.find();
        }
    }

    /**
     * @author amuxia
     * A worker thread that keeps taking URLs off the queue and crawling them.
     */
    class Processer implements Runnable {
        GetWeb gw;

        public Processer(GetWeb g) {
            this.gw = g;
        }

        public void run() {
            while (!arrUrls.isEmpty()) {
                String tmp = getAUrl();
                if (tmp == null) {
                    break; // another thread drained the queue first
                }
                getWebByUrl(tmp, allUrls.get(tmp) + "");
            }
        }
    }
}
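The second constructor lets you cap the crawl depth instead of using the default of 5. A minimal usage sketch; the URL below is a placeholder, not the address used in the original post:

package org.amuxia.demo;

public class CrawlerDemo {
    public static void main(String[] args) {
        // Placeholder URL; substitute the site you actually intend to crawl.
        // The second argument limits the crawl depth to 3 instead of the default 5.
        GetWeb crawler = new GetWeb("http://www.example.com/", 3);
        crawler.getWebByHomePage();
    }
}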
The URL to crawl is set in the main method of GetWeb (GetWeb gw = new GetWeb("http://www.csdn.net/")). private String fPath = "CSDN"; defines where the crawled images are stored; here they go straight into a CSDN folder under the project, but put them wherever you like, as long as you can find them.

Step 3: download the images.

package org.amuxia.demo;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetPicture {

    public void getHtmlPicture(String httpUrl, String filePath) {
        URL url;
        BufferedInputStream in;
        FileOutputStream file;

        try {
            System.out.println("Downloading image");
            // Extract the image file name from the URL
            String fileName = httpUrl.substring(httpUrl.lastIndexOf("/")).replace("/", "");
            // Open a byte stream to the image URL and copy it to disk
            url = new URL(httpUrl);
            in = new BufferedInputStream(url.openStream());
            file = new FileOutputStream(new File(filePath + "\\" + fileName));
            int t;
            while ((t = in.read()) != -1) {
                file.write(t);
            }
            file.close();
            in.close();
            System.out.println("Image downloaded successfully");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public String getHtmlCode(String httpUrl) throws IOException {
        // Read the whole page into a single string
        StringBuilder content = new StringBuilder();
        URL url = new URL(httpUrl);
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
        String input;
        while ((input = reader.readLine()) != null) {
            content.append(input);
        }
        reader.close();
        return content.toString();
    }

    /**
     * Find all image references in the page and download them.
     *
     * @param url
     * @param filePath
     * @throws IOException
     */
    public void get(String url, String filePath) throws IOException {

        // Two regular expressions: one for relative image paths, one for absolute http:// image URLs
        String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
        String searchImgReg2 = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";

        String content = this.getHtmlCode(url);

        // Relative image paths are resolved against the page URL
        Pattern pattern = Pattern.compile(searchImgReg);
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(url + "/" + matcher.group(3), filePath);
        }

        // Absolute image URLs are downloaded as-is
        pattern = Pattern.compile(searchImgReg2);
        matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(matcher.group(3), filePath);
        }
    }
}
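GetPicture can also be run on its own, which makes it easier to verify the two image regexes before wiring the class into the crawler. A minimal sketch, where the page URL and the test-pics output folder are placeholder assumptions, not part of the original code:

package org.amuxia.demo;

import java.io.File;

public class GetPictureTest {
    public static void main(String[] args) throws Exception {
        // Placeholder page URL and output folder; substitute a real page and directory.
        String pageUrl = "http://www.example.com/";
        File out = new File("test-pics");
        if (!out.exists()) {
            out.mkdir();
        }
        new GetPicture().get(pageUrl, out.getPath());
    }
}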
That's it. Let's see how it turns out.
Screenshots: the console output, the folders named after the URL of each level, and the images inside one of those folders.

As you can see, the basics are working. No extra libraries are needed beyond the MySQL driver jar. And if you don't need to insert anything into the database, that has no effect on the image crawling at all; just drop the first class. Also, some sites have anti-crawler measures in place, so crawling them may fail. Note: it is best to talk to the site owner before crawling, and crawling non-public content is an infringement; this is for testing purposes only.
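One small mitigation for the anti-crawler failures mentioned above is to send a browser-like User-Agent header instead of the default Java one. The sketch below is only an illustration of that idea, not part of the original code; the header value and the helper name are assumptions.

package org.amuxia.demo;

import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

// Hypothetical helper: open a URL with a browser-like User-Agent,
// which some sites require before serving content to crawlers.
public class UrlOpener {
    public static InputStream openWithUserAgent(String address) throws Exception {
        URLConnection conn = new URL(address).openConnection();
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"); // assumed example value
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(10000);
        return conn.getInputStream();
    }
}

getWebByUrl() and GetPicture could read from this stream instead of calling url.openStream() directly if a site rejects the default Java client.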