What this achieves: a Pictures folder is created automatically under the project, and images are crawled level by level by following the site's URLs. Under Pictures, folders named after the URL of each level hold the images found at that URL. At the same time, the file name, path, and URL are inserted into a database for easy indexing.

Step 1: create the persistence class that stores the file name, path, and URL.

package org.amuxia.demo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class JDBCHelper {
    private static final String driver = "com.mysql.jdbc.Driver";
    private static final String DBurl = "jdbc:mysql://127.0.0.1:3306/edupic";
    private static final String user = "root";
    private static final String password = "root";
    private PreparedStatement pstmt = null;
    private Connection spiderconn = null;

    public void insertFilePath(String fileName, String filepath, String url) {
        try {
            Class.forName(driver);
            spiderconn = DriverManager.getConnection(DBurl, user, password);
            String sql = "insert into FilePath (filename,filepath,url) values (?,?,?)";
            pstmt = spiderconn.prepareStatement(sql);
            pstmt.setString(1, fileName);
            pstmt.setString(2, filepath);
            pstmt.setString(3, url);
            pstmt.executeUpdate();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            try {
                // Close resources; guard against nulls in case the connection failed early
                if (pstmt != null) pstmt.close();
                if (spiderconn != null) spiderconn.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
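The insert above assumes a FilePath table already exists in the edupic database. The original post does not show its schema, so the one-off setup class below is only a sketch: the column names come from the INSERT statement, while the column types and lengths are assumptions.

package org.amuxia.demo;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// Hypothetical one-off setup: creates the FilePath table that insertFilePath() expects.
// Only the column names are taken from the original SQL; types and lengths are assumed.
public class CreateFilePathTable {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://127.0.0.1:3306/edupic", "root", "root");
             Statement stmt = conn.createStatement()) {
            stmt.executeUpdate(
                "CREATE TABLE IF NOT EXISTS FilePath ("
                + " id INT AUTO_INCREMENT PRIMARY KEY,"
                + " filename VARCHAR(255),"
                + " filepath VARCHAR(500),"
                + " url VARCHAR(500))");
        }
    }
}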
Step 2: create the class that parses URLs and does the crawling.
package org.amuxia.demo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetWeb {
    private int webDepth = 5; // crawl depth
    private int intThreadNum = 1; // number of worker threads
    private String strHomePage = ""; // homepage URL
    private String myDomain; // domain being crawled
    private String fPath = "CSDN"; // directory that stores the downloaded pages and images
    private ArrayList<String> arrUrls = new ArrayList<String>(); // URLs not yet processed
    private ArrayList<String> arrUrl = new ArrayList<String>(); // all URLs, kept for building the index
    private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>(); // page number of every URL
    private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>(); // depth of every URL
    private int intWebIndex = 0; // file index of each page, starting from 0
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;

    public static void main(String[] args) {
        GetWeb gw = new GetWeb("http://www.csdn.net/");
        gw.getWebByHomePage();
    }

    public GetWeb(String s) {
        this.strHomePage = s;
    }

    public GetWeb(String s, int i) {
        this.strHomePage = s;
        this.webDepth = i;
    }

    public synchronized void addWebSuccessed() {
        webSuccessed++;
    }

    public synchronized void addWebFailed() {
        webFailed++;
    }

    public synchronized String getAUrl() {
        // Return null when the queue is empty so a worker thread can exit cleanly
        if (arrUrls.isEmpty()) {
            return null;
        }
        String tmpAUrl = arrUrls.get(0);
        arrUrls.remove(0);
        return tmpAUrl;
    }

    public synchronized String getUrl() {
        String tmpUrl = arrUrl.get(0);
        arrUrl.remove(0);
        return tmpUrl;
    }

    public synchronized Integer getIntWebIndex() {
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * Starting from the site provided by the user, crawl all linked pages.
     */
    public void getWebByHomePage() {
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null) {
            System.out.println("Wrong input!");
            return;
        }

        System.out.println("Homepage = " + strHomePage);
        System.out.println("Domain = " + myDomain);
        arrUrls.add(strHomePage);
        arrUrl.add(strHomePage);
        allUrls.put(strHomePage, 0);
        deepUrls.put(strHomePage, 1);

        File fDir = new File(fPath);
        if (!fDir.exists()) {
            fDir.mkdir();
        }

        System.out.println("Starting work");
        String tmp = getAUrl(); // take the next URL
        this.getWebByUrl(tmp, allUrls.get(tmp) + ""); // crawl the page behind that URL
        for (int i = 0; i < intThreadNum; i++) {
            new Thread(new Processer(this)).start();
        }
        while (true) {
            if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                long finishTime = System.currentTimeMillis();
                long costTime = finishTime - startTime;
                System.out.println("\n\n\n\n\nFinished");
                System.out.println("Start time = " + startTime + "  End time = " + finishTime
                        + "  Total crawl time = " + costTime + "ms");
                System.out.println("Total URLs crawled = " + (webSuccessed + webFailed)
                        + "  Successful: " + webSuccessed + "  Failed: " + webFailed);
                String strIndex = "";
                String tmpUrl = "";
                while (!arrUrl.isEmpty()) {
                    tmpUrl = getUrl();
                    strIndex += "Web depth: " + deepUrls.get(tmpUrl) + "  Filepath: " + fPath + "/web"
                            + allUrls.get(tmpUrl) + ".htm" + "  url: " + tmpUrl + "\n\n";
                }
                System.out.println(strIndex);
                try {
                    PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                    pwIndex.println(strIndex);
                    pwIndex.close();
                } catch (Exception e) {
                    System.out.println("Failed to generate the index file!");
                }
                break;
            }
            // Sleep briefly instead of busy-waiting while worker threads are still running
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetch one page, store its images, and extract further links from it.
     *
     * @param strUrl
     * @param fileIndex
     */
    public void getWebByUrl(String strUrl, String fileIndex) {
        try {
            System.out.println("Fetching page: " + strUrl);

            URL url = new URL(strUrl);
            InputStream is = url.openStream();

            // Build a directory name from the URL. replace() is used for the literal
            // characters below; replaceAll("*", ".") and the like would throw a
            // PatternSyntaxException because *, ? and | are regex metacharacters.
            String filename = strUrl.replaceAll("/", "_");
            filename = filename.replace(":", ".")
                    .replace("*", ".")
                    .replace("?", ".")
                    .replace("\"", ".")
                    .replace(">", ".")
                    .replace("<", ".")
                    .replace("|", ".");
            String filePath = fPath + "\\" + filename;
            File file = new File(filePath);
            if (!file.exists()) {
                file.mkdir();
            }

            // Record the file name, path and URL in the database
            JDBCHelper helper = new JDBCHelper();
            helper.insertFilePath(filename, filePath, strUrl);

            // Download the images referenced by this page
            GetPicture getp = new GetPicture();
            getp.get(strUrl, filePath);

            // Read the page line by line and extract further links while within the depth limit
            BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
            String rLine = null;
            while ((rLine = bReader.readLine()) != null) {
                if (rLine.length() > 0 && deepUrls.get(strUrl) < webDepth) {
                    getUrlByString(rLine, strUrl);
                }
            }
            is.close();
            System.out.println("Fetched page successfully: " + strUrl);
            addWebSuccessed();
        } catch (Exception e) {
            System.out.println("Failed to fetch page, please check whether the URL exists: " + strUrl);
            addWebFailed();
        }
    }

    /**
     * Check whether the URL provided by the user contains a recognizable domain.
     *
     * @return the domain, or null if none was found
     */
    public String getDomain() {
        String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv|edu)";
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(strHomePage);
        if (m.find()) {
            return m.group(0);
        }
        return null;
    }

    /**
     * Parse one line of HTML and extract the in-domain links it contains.
     *
     * @param inputArgs
     * @param strUrl
     */
    public void getUrlByString(String inputArgs, String strUrl) {
        String tmpStr = inputArgs;
        // Match absolute href links that stay within the crawled domain
        String regUrl = "(?<=(href=)[\"]?[\']?)http://[^\\s\"\'\\?]*(" + myDomain + ")[^\\s\"\'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(tmpStr);
        boolean blnp = m.find();
        while (blnp) {
            if (!allUrls.containsKey(m.group(0))) {
                System.out.println("Found a new url, depth:" + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                arrUrls.add(m.group(0));
                arrUrl.add(m.group(0));
                allUrls.put(m.group(0), getIntWebIndex());
                deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
            }
            tmpStr = tmpStr.substring(m.end());
            m = p.matcher(tmpStr);
            blnp = m.find();
        }
    }

    /**
     * @author amuxia
     * A worker thread that keeps taking URLs off the queue and crawling them.
     */
    class Processer implements Runnable {
        GetWeb gw;

        public Processer(GetWeb g) {
            this.gw = g;
        }

        public void run() {
            while (!arrUrls.isEmpty()) {
                String tmp = getAUrl();
                if (tmp == null) {
                    break; // another thread drained the queue first
                }
                getWebByUrl(tmp, allUrls.get(tmp) + "");
            }
        }
    }
}
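The second constructor lets you cap the crawl depth instead of using the default of 5. A minimal usage sketch; the URL below is a placeholder, not the address used in the original post:

package org.amuxia.demo;

public class CrawlerDemo {
    public static void main(String[] args) {
        // Placeholder URL; substitute the site you actually intend to crawl.
        // The second argument limits the crawl depth to 3 instead of the default 5.
        GetWeb crawler = new GetWeb("http://www.example.com/", 3);
        crawler.getWebByHomePage();
    }
}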
The URL to crawl is set in the main method of GetWeb (GetWeb gw = new GetWeb("http://www.csdn.net/")). private String fPath = "CSDN"; defines where the crawled images are stored; here they go straight into a CSDN folder under the project, but put them wherever you like, as long as you can find them.

Step 3: download the images.

package org.amuxia.demo;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetPicture {

    public void getHtmlPicture(String httpUrl, String filePath) {
        URL url;
        BufferedInputStream in;
        FileOutputStream file;

        try {
            System.out.println("Downloading image");
            // Extract the image file name from the URL
            String fileName = httpUrl.substring(httpUrl.lastIndexOf("/")).replace("/", "");
            // Open a byte stream to the image URL and copy it to disk
            url = new URL(httpUrl);
            in = new BufferedInputStream(url.openStream());
            file = new FileOutputStream(new File(filePath + "\\" + fileName));
            int t;
            while ((t = in.read()) != -1) {
                file.write(t);
            }
            file.close();
            in.close();
            System.out.println("Image downloaded successfully");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public String getHtmlCode(String httpUrl) throws IOException {
        // Read the whole page into a single string
        StringBuilder content = new StringBuilder();
        URL url = new URL(httpUrl);
        BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()));
        String input;
        while ((input = reader.readLine()) != null) {
            content.append(input);
        }
        reader.close();
        return content.toString();
    }

    /**
     * Find all image references in the page and download them.
     *
     * @param url
     * @param filePath
     * @throws IOException
     */
    public void get(String url, String filePath) throws IOException {

        // Two regular expressions: one for relative image paths, one for absolute http:// image URLs
        String searchImgReg = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
        String searchImgReg2 = "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";

        String content = this.getHtmlCode(url);

        // Relative image paths are resolved against the page URL
        Pattern pattern = Pattern.compile(searchImgReg);
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(url + "/" + matcher.group(3), filePath);
        }

        // Absolute image URLs are downloaded as-is
        pattern = Pattern.compile(searchImgReg2);
        matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println(matcher.group(3));
            this.getHtmlPicture(matcher.group(3), filePath);
        }
    }
}
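GetPicture can also be run on its own, which makes it easier to verify the two image regexes before wiring the class into the crawler. A minimal sketch, where the page URL and the test-pics output folder are placeholder assumptions, not part of the original code:

package org.amuxia.demo;

import java.io.File;

public class GetPictureTest {
    public static void main(String[] args) throws Exception {
        // Placeholder page URL and output folder; substitute a real page and directory.
        String pageUrl = "http://www.example.com/";
        File out = new File("test-pics");
        if (!out.exists()) {
            out.mkdir();
        }
        new GetPicture().get(pageUrl, out.getPath());
    }
}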
That's it. Let's see how it turns out.
Screenshots: the console output, the folders named after the URL of each level, and the images inside one of those folders.

As you can see, the basics are working. No extra libraries are needed beyond the MySQL driver jar. And if you don't need to insert anything into the database, that has no effect on the image crawling at all; just drop the first class. Also, some sites have anti-crawler measures in place, so crawling them may fail. Note: it is best to talk to the site owner before crawling, and crawling non-public content is an infringement; this is for testing purposes only.
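One small mitigation for the anti-crawler failures mentioned above is to send a browser-like User-Agent header instead of the default Java one. The sketch below is only an illustration of that idea, not part of the original code; the header value and the helper name are assumptions.

package org.amuxia.demo;

import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

// Hypothetical helper: open a URL with a browser-like User-Agent,
// which some sites require before serving content to crawlers.
public class UrlOpener {
    public static InputStream openWithUserAgent(String address) throws Exception {
        URLConnection conn = new URL(address).openConnection();
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"); // assumed example value
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(10000);
        return conn.getInputStream();
    }
}

getWebByUrl() and GetPicture could read from this stream instead of calling url.openStream() directly if a site rejects the default Java client.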