- import java.io.BufferedReader;
- import java.io.BufferedWriter;
- import java.io.FileWriter;
- import java.io.InputStreamReader;
- import java.net.URL;
- import java.net.URLConnection;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * Crawls user profile pages of the itheima forum and records every user
 * whose technical score is at least {@code MIN_SCORE}.
 *
 * <p>Each qualifying user is written to the output file on its own line as
 * the name label and user name, two tabs, {@code UID:} and the user id, one
 * tab, then the score label followed by the score.
 */
class Url
{
    /** File that receives one line per qualifying user. */
    private static final String OUTPUT_PATH = "f:\\io\\person.txt";

    /** First user id to crawl (inclusive). */
    private static final int FIRST_UID = 98200;

    /** End of the crawled user id range (exclusive). */
    private static final int END_UID = 100000;

    /** Minimum technical score a user needs in order to be recorded. */
    private static final int MIN_SCORE = 25;

    /** Matches the fragment "digits </span>score-label"; group 1 is the score. */
    private static final Pattern SCORE_PATTERN = Pattern.compile("(\\d+) </span>技术分");

    /**
     * Matches the fragment {@code name">user name<}; group 1 is the user name.
     * The name is limited to characters other than '<' so the match stops at
     * the first following tag instead of running to the last '<' on the line.
     */
    private static final Pattern NAME_PATTERN = Pattern.compile("name\">([^<]+)<");

    /**
     * User name most recently found on the page currently being scanned, or
     * {@code null} when the page has not shown one yet.
     */
    static String name = null;

    /**
     * Crawls every user id in the configured range and writes the qualifying
     * users to the output file.
     *
     * <p>A page that cannot be fetched or read is reported on standard error
     * and skipped so one bad user id does not abort the whole crawl. Failures
     * while writing the output file are not caught and end the run.
     *
     * @param args ignored
     * @throws Exception if the output file cannot be opened or written
     */
    public static void main(String[] args) throws Exception
    {
        // try-with-resources closes the writer even when the crawl fails.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(OUTPUT_PATH)))
        {
            for (int x = FIRST_UID; x < END_UID; x++)
            {
                String records;
                try
                {
                    records = fetchRecords(x);
                }
                catch (java.io.IOException e)
                {
                    System.err.println("Skipping UID " + x + ": " + e);
                    continue;
                }
                if (!records.isEmpty())
                {
                    bw.write(records);
                    // Flush per page so results survive an interrupted crawl.
                    bw.flush();
                }
            }
        }
    }

    /**
     * Downloads the profile page of one user and returns its output records.
     *
     * @param uid user id whose profile page is fetched
     * @return the records produced by {@link #scanPage}, possibly empty
     * @throws java.io.IOException if the page cannot be opened or read
     */
    private static String fetchRecords(int uid) throws java.io.IOException
    {
        URL url = new URL("http://bbs.itheima.com/space-uid-" + uid + ".html");
        URLConnection ucc = url.openConnection();
        // Close the reader (and with it the connection stream) after every page.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(ucc.getInputStream(), "utf-8")))
        {
            return scanPage(br, uid);
        }
    }

    /**
     * Scans one page line by line and builds an output record for every score
     * of at least {@code MIN_SCORE} found on it.
     *
     * <p>The remembered user name is cleared first, so a page without a name
     * never inherits the name of the previously scanned page; such a record
     * shows the text {@code null} as its name.
     *
     * @param reader source of the page text; not closed by this method
     * @param uid    user id written into each record
     * @return all records of the page, each ended by the platform line
     *         separator; empty when nothing qualifies
     * @throws java.io.IOException if reading from {@code reader} fails
     */
    static String scanPage(BufferedReader reader, int uid) throws java.io.IOException
    {
        name = null;
        StringBuilder records = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null)
        {
            // Look for the name first: it may share a line with the score.
            String found = extractName(line);
            if (found != null)
            {
                name = found;
            }

            Matcher scoreMatcher = SCORE_PATTERN.matcher(line);
            while (scoreMatcher.find())
            {
                String score = scoreMatcher.group(1);
                if (Integer.parseInt(score) >= MIN_SCORE)
                {
                    records.append("姓名:").append(name)
                           .append("\t\tUID:").append(uid)
                           .append("\t").append("技术分").append(score)
                           .append(System.lineSeparator());
                }
            }
        }
        return records.toString();
    }

    /**
     * Extracts the user name from one line of page text.
     *
     * @param line a single line of the page
     * @return the last user name matched on the line, or {@code null} when
     *         the line contains none
     */
    static String extractName(String line)
    {
        String result = null;
        Matcher nameMatcher = NAME_PATTERN.matcher(line);
        // A later match on the same line overrides an earlier one.
        while (nameMatcher.find())
        {
            result = nameMatcher.group(1);
        }
        return result;
    }
}
复制代码 |