//////////////////////////
/////抓取相关博客部分
link = "";
temple = "";
reg1 = "ttp://blog.sina";
index0 = Precode.indexOf(reg1);
index1 = -1;
index2 = -1;
if(index0>0)
{
temple = Precode.substring(0, index0-1);
index1 = temple.lastIndexOf('"');
index2 = Precode.indexOf('"', index0);
if(index1>0&&index2>index1)
{
link = Precode.substring(index1+1, index2);
if(!NewUrls.contains(link))
BlogUrls.add(link);
}
}
index0 = Precode.indexOf(reg1,index2+1);
while(index0>0)
{
temple = Precode.substring(index2+1, index0-1);
index1 = index2+1+temple.lastIndexOf('"');
index2 = Precode.indexOf('"', index0);
if(index1>0&&index2>index1)
{
link = Precode.substring(index1+1, index2);
if(!BlogUrls.contains(link))
BlogUrls.add(link);
}
index0 = Precode.indexOf(reg1,index2+1);
}
}
/**
 * Returns the first URL in the given list without removing it.
 *
 * @param al list of pending URLs; must contain at least one element
 * @return the element at index 0
 * @throws IndexOutOfBoundsException if the list is empty
 */
static String GetNewUrl(ArrayList<String> al)
{
    return al.get(0);
}
/**
 * Drops the first URL from the given list.
 *
 * @param al list of pending URLs; must contain at least one element
 * @throws IndexOutOfBoundsException if the list is empty
 */
static void removeurl(ArrayList<String> al)
{
    final int headIndex = 0;
    al.remove(headIndex);
}
/**
 * Extracts the text between two markers and removes all whitespace from it.
 *
 * The first occurrence of {@code reg1} is located, then the first occurrence of
 * {@code reg2} that follows it. Both markers are matched as plain text, not as
 * regular expressions.
 *
 * @param reg1    opening marker
 * @param reg2    closing marker
 * @param PreCode text to search (page source)
 * @return the text between the markers with every whitespace character removed,
 *         or an empty string when {@code reg1} is absent or no {@code reg2}
 *         follows it
 */
static String Drawtext1(String reg1,String reg2,String PreCode)
{
    int openAt = PreCode.indexOf(reg1);
    if (openAt < 0)
        return "";
    int contentStart = openAt + reg1.length();
    // Search for the closing marker only after the opening one. Searching from
    // the start of the text could find an earlier occurrence and make
    // substring() throw StringIndexOutOfBoundsException.
    int closeAt = PreCode.indexOf(reg2, contentStart);
    if (closeAt < 0)
        return "";
    // "\\s+" removes exactly the same characters as the previous "\\s*" but
    // does not produce an empty match at every position.
    return PreCode.substring(contentStart, closeAt).replaceAll("\\s+", "");
}
/**
 * Extracts the paragraph text from the section between two markers.
 *
 * The section is first obtained with {@link #Drawtext1}, which also strips all
 * whitespace. Every {@code <p>...</p>} span in that section is then
 * concatenated, and any remaining tags are removed.
 *
 * @param reg1    opening marker of the section
 * @param reg2    closing marker of the section
 * @param PreCode text to search (page source)
 * @return the concatenated paragraph contents with tags removed; an empty
 *         string when the section or its paragraphs are missing
 */
static String Drawtext2(String reg1,String reg2,String PreCode)
{
    String section = Drawtext1(reg1, reg2, PreCode);
    String openTag = "<p>";
    String closeTag = "</p>";
    StringBuilder paragraphs = new StringBuilder();
    int openAt = section.indexOf(openTag);
    // indexOf returns 0 for a match at the very start of the section, which is
    // common because Drawtext1 strips leading whitespace; so test >= 0, not > 0.
    while (openAt >= 0)
    {
        int closeAt = section.indexOf(closeTag, openAt + openTag.length());
        if (closeAt < 0)
            break; // unterminated paragraph: stop collecting
        // The leading "<p>" is kept here and removed by the tag filter below.
        paragraphs.append(section, openAt, closeAt);
        openAt = section.indexOf(openTag, closeAt + closeTag.length());
    }
    // Remove every remaining tag, i.e. anything of the form <...>.
    return paragraphs.toString().replaceAll("<[^>]+>", "");
}
/**
 * Writes the given text to the file {@code <WebIndex>.txt} under {@code fPath}
 * and then increments {@code WebIndex}.
 *
 * Saving is best-effort: an I/O failure is reported on standard error and does
 * not abort the crawl. {@code WebIndex} is incremented whether or not the write
 * succeeded. The file is written with the platform default charset.
 *
 * @param text content to store
 */
static void save(String text)
{
    BufferedWriter bw = null;
    try
    {
        // FileWriter creates the file (or truncates an existing one), so no
        // separate createNewFile() call is needed.
        File file = new File(fPath, WebIndex + ".txt");
        bw = new BufferedWriter(new FileWriter(file));
        bw.write(text);
    }
    catch (IOException e)
    {
        System.err.println("save: cannot write " + WebIndex + ".txt: " + e.getMessage());
    }
    finally
    {
        // Always release the file handle, even when write() failed.
        if (bw != null)
        {
            try
            {
                bw.close();
            }
            catch (IOException e)
            {
                System.err.println("save: cannot close " + WebIndex + ".txt: " + e.getMessage());
            }
        }
    }
    WebIndex++;
}
/**
 * Removes picture annotations from extracted text.
 *
 * Two kinds of annotation are deleted: numbered picture markers such as
 * {@code 【图12】}, and the note {@code 该图取自网络} together with a surrounding
 * pair of parentheses when present.
 *
 * @param input text to clean
 * @return the text with the annotations removed
 */
static String deal01(String input)
{
    // Numbered picture marker, e.g. 【图3】.
    String pic = "【图\\d+】";
    // The parentheses must be matched literally: in a regex a bare "(...)" is a
    // capture group, which removed only the inner phrase and left "()" behind.
    // Both ASCII and full-width parentheses are accepted; the bare phrase is
    // still removed as before.
    String pic2 = "[((]该图取自网络[))]|该图取自网络";
    input = input.replaceAll(pic, "");
    input = input.replaceAll(pic2, "");
    return input;
}
/**
 * Extracts the travel article content from a page's source and saves it.
 *
 * The saved record consists of the page title, a newline, the current URL
 * ({@code Nowurl}), a newline, and the paragraph text of three marked sections
 * of the page. Records shorter than 10 characters are not saved.
 *
 * @param PreCode source of the page
 */
static void DrawTCode(String PreCode)
{
    StringBuilder record = new StringBuilder();
    record.append(Drawtext1("<title>","</title>",PreCode)).append('\n');
    record.append(Nowurl).append('\n');
    record.append(Drawtext2("<!-- 正文页概述信息 begin -->","<!-- 正文页概述信息 end -->",PreCode));
    record.append(Drawtext2("<!-- 正文部分 begin -->","<!-- 正文部分 end -->",PreCode));
    record.append(Drawtext2("<!-- 正文内容 begin -->","<!-- 正文内容 end -->",PreCode));

    String result = record.toString();
    if (result.length() >= 10)
    {
        save(result);
    }
}
/**
 * Extracts the blog post content from a page's source and saves it.
 *
 * The saved record is the page title, a newline, and the text between the
 * body-start and body-end comment markers, after picture annotations have been
 * removed by {@code deal01}.
 *
 * @param PreCode source of the page
 */
static void DrawBCode(String PreCode)
{
    String title = Drawtext1("<title>","</title>",PreCode);
    String body = Drawtext1("<!-- 正文开始 -->","<!-- 正文结束 -->",PreCode);
    String record = title + '\n' + body;
    save(deal01(record));
}
/**
 * Tells whether the text after the last '.' in the URL is exactly "shtml".
 *
 * When the URL contains no '.', the whole URL is compared against "shtml".
 *
 * @param url URL to inspect
 * @return true if the URL's trailing extension is "shtml"
 */
static boolean isshtml(String url)
{
    String extension = url.substring(url.lastIndexOf('.') + 1);
    return "shtml".equals(extension);
}
/**
 * Crawl entry point.
 *
 * Starts from the Sina travel home page, then works through {@code NewUrls}:
 * each page is passed to {@code DrawUrl}, and pages whose URL ends in
 * ".shtml" are additionally passed to {@code DrawTCode}. When that queue is
 * empty, every URL in {@code BlogUrls} is fetched and passed to
 * {@code DrawBCode}.
 *
 * @param args unused
 */
public static void main(String[] args)
{
    Nowurl = "http://travel.sina.com.cn/";
    // NOTE(review): the value 0 is presumably the crawl depth of the seed URL; confirm.
    deepUrls.put(Nowurl, 0);
    DrawUrl(GetWebcode(Nowurl));
    while(!NewUrls.isEmpty())
    {
        Nowurl = GetNewUrl(NewUrls);
        // Download the page once and reuse the source for both steps.
        String pageCode = GetWebcode(Nowurl);
        DrawUrl(pageCode);
        if(isshtml(Nowurl))
        {
            System.out.println(Nowurl);
            DrawTCode(pageCode);
        }
        // Dequeue only after the page has been processed, as before.
        removeurl(NewUrls);
        System.out.println(num);
        num++;
    }
    while(!BlogUrls.isEmpty())
    {
        DrawBCode(GetWebcode(GetNewUrl(BlogUrls)));
        // GetNewUrl only peeks at the head; without this removal the loop
        // would process the first blog URL forever.
        removeurl(BlogUrls);
    }
}
}
|