[原创]从网页源文件中得到链接
*/ --------------------------------------------------------------------------------------
*/ 出自: 编程中国 http://www.bc-cn.net
*/ 作者: 田里兵蜂 QQ:87135637
*/ 时间: 2007-10-25 编程论坛首发
*/ 声明: 尊重作者劳动,转载请保留本段文字
*/ --------------------------------------------------------------------------------------
import java.net.*;
import java.io.*;
import java.util.*;
/**
 * Downloads a web page to a local file and extracts attribute values
 * (for example link targets) that follow a given tag prefix.
 *
 * <p>Intended call order: {@link #setURL(String)}, {@link #setFileName()},
 * {@link #downFile()}, {@link #readFile()}, then {@link #scanLabel} as often as needed.
 */
class ScanPage {
    /** Local file name used when the URL does not end in a usable file name. */
    private static final String DEFAULT_FILE_NAME = "index.html";

    /** Text of the page loaded by {@link #readFile()}; {@code null} until then. */
    private String strPage;
    private String strUrl;
    private String fileName;

    /**
     * Sets the address of the page to download.
     *
     * @param strUrl the page URL
     */
    public void setURL(String strUrl) {
        this.strUrl = strUrl;
    }

    /**
     * Derives the local file name from the URL set with {@link #setURL(String)}.
     *
     * <p>The name is the text after the last {@code '/'} of the URL path, without any
     * query string or fragment and without the slash itself, so the file is created
     * relative to the working directory rather than in the filesystem root. When the
     * URL has no file part (host only, or a trailing slash) {@code index.html} is used.
     *
     * @throws IllegalStateException if no URL has been set
     */
    public void setFileName() {
        if (strUrl == null) {
            throw new IllegalStateException("setURL must be called before setFileName");
        }
        // Drop "?query" and "#fragment": they are not part of the file name.
        int end = strUrl.length();
        int query = strUrl.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = strUrl.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = strUrl.substring(0, end);

        // Slashes belonging to "scheme://" must not be mistaken for path separators.
        int scheme = path.indexOf("://");
        int pathStart = scheme >= 0 ? scheme + 3 : 0;
        int slash = path.lastIndexOf('/');

        String name;
        if (slash >= pathStart) {
            name = path.substring(slash + 1);
        } else if (scheme < 0) {
            name = path; // no scheme and no slash: the whole string is the name
        } else {
            name = "";   // only "scheme://host"
        }
        fileName = name.length() == 0 ? DEFAULT_FILE_NAME : name;
    }

    /**
     * Downloads the page at the configured URL into the file chosen by
     * {@link #setFileName()}. Both streams are closed even when copying fails.
     *
     * @throws IOException if the URL is malformed, cannot be opened, or the file cannot be written
     */
    public void downFile() throws IOException {
        InputStream in = new URL(strUrl).openStream();
        try {
            OutputStream out = new FileOutputStream(fileName);
            try {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }

    /**
     * Reads the downloaded file into memory, joining its lines with {@code '\n'}.
     * The file is decoded with the platform default charset, as {@link FileReader} does.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public void readFile() throws IOException {
        StringBuilder sb = new StringBuilder();
        BufferedReader in = new BufferedReader(new FileReader(fileName));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } finally {
            in.close();
        }
        strPage = sb.toString();
    }

    /**
     * Placeholder: title extraction is not implemented.
     *
     * @return always the empty string
     */
    public String getTitle() {
        return "";
    }

    /**
     * Scans the loaded page for every occurrence of {@code strLabel} and appends the
     * attribute value that follows it to {@code al}.
     *
     * <p>Scanning starts at the {@code <body} tag, or at the beginning of the page when
     * there is none. The label and the type suffix are matched case-insensitively; the
     * collected values keep the case they have in the page. A value may be wrapped in
     * double or single quotes (which are removed) or be unquoted, in which case it ends
     * at the first whitespace character or {@code '>'}. Does nothing when no page has
     * been loaded or the label is empty.
     *
     * @param al       list that receives the matching values as {@code String}s
     * @param strLabel tag prefix to look for, e.g. {@code "<a href="} or {@code "<img src="}
     * @param strType  required suffix of the value, e.g. {@code ".html"}, or {@code "*"}
     *                 to accept every value; must not be {@code null}
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw ArrayList kept so existing callers still compile
    public void scanLabel(ArrayList al, String strLabel, String strType) {
        if (strPage == null || strLabel == null || strLabel.length() == 0) {
            return;
        }
        final int pageLen = strPage.length();
        int pos = indexOfIgnoreCase(strPage, "<body", 0);
        if (pos < 0) {
            pos = 0; // the <body> tag is optional in HTML; scan the whole page
        }
        while ((pos = indexOfIgnoreCase(strPage, strLabel, pos)) >= 0) {
            int start = pos + strLabel.length();
            if (start >= pageLen) {
                break; // page ends right after the label
            }
            char first = strPage.charAt(start);
            int close = isQuote(first) ? strPage.indexOf(first, start + 1) : -1;
            // A closing quote beyond the end of this tag belongs to a later tag:
            // treat the value as unterminated instead of swallowing markup.
            int tagEnd = strPage.indexOf('>', start);
            if (close >= 0 && tagEnd >= 0 && close > tagEnd) {
                close = -1;
            }

            String value;
            if (close >= 0) {
                value = strPage.substring(start + 1, close);
                pos = close + 1;
            } else {
                int end = start;
                while (end < pageLen && !isValueEnd(strPage.charAt(end))) {
                    end++;
                }
                pos = end;
                if (end == start) {
                    continue; // nothing between the label and the delimiter
                }
                value = stripQuotes(strPage.substring(start, end));
            }

            // Keep the value only if it is of the type the caller asked for.
            if ("*".equals(strType) || endsWithIgnoreCase(value, strType)) {
                al.add(value);
            }
        }
    }

    /** Returns the index of the first case-insensitive match of needle at or after from, or -1. */
    private static int indexOfIgnoreCase(String text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int i = Math.max(from, 0); i <= last; i++) {
            if (text.regionMatches(true, i, needle, 0, needle.length())) {
                return i;
            }
        }
        return -1;
    }

    /** Tells whether value ends with suffix, ignoring case. */
    private static boolean endsWithIgnoreCase(String value, String suffix) {
        int offset = value.length() - suffix.length();
        return offset >= 0 && value.regionMatches(true, offset, suffix, 0, suffix.length());
    }

    /** Removes one leading and one trailing quote character, if present. */
    private static String stripQuotes(String s) {
        int begin = 0;
        int end = s.length();
        if (end > begin && isQuote(s.charAt(begin))) {
            begin++;
        }
        if (end > begin && isQuote(s.charAt(end - 1))) {
            end--;
        }
        return s.substring(begin, end);
    }

    /** Tells whether c is a single or double quote. */
    private static boolean isQuote(char c) {
        return c == '"' || c == '\'';
    }

    /** Tells whether c terminates an unquoted attribute value. */
    private static boolean isValueEnd(char c) {
        return c == '>' || Character.isWhitespace(c);
    }
}
/**
 * Command-line entry point: downloads the page whose URL is given as the first
 * argument and prints every link in it whose target ends in {@code .html}.
 */
class ScanApp {
    /**
     * Runs the scan.
     *
     * @param args {@code args[0]} is the URL of the page to scan
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java ScanApp <url>");
            return;
        }
        ScanPage sp = new ScanPage();
        sp.setURL(args[0]);
        sp.setFileName();
        try {
            sp.downFile();
            sp.readFile();
        } catch (IOException ie) {
            System.out.println("文件操作出错");
            System.err.println(ie);
            return; // no page was loaded, so there is nothing to scan
        }
        ArrayList<Object> links = new ArrayList<Object>();
        sp.scanLabel(links, "<a href=", ".html");
        for (Object link : links) {
            System.out.println(link);
        }
    }
}
E:\javawork>java ScanApp http://www.17kyk.com/Html/Book/16/2431/list.html
list.html
429400.html
429401.html
439789.html
429403.html
429404.html
429405.html
429406.html
429407.html
429408.html
429409.html
429411.html
429412.html
说明:
scanLabel 针对 <a href 和 <img src 这 2 个标签进行扫描。
得到所有链接,并存储在 ArrayList 中:
scanLabel(al,"<a href=","*");
scanLabel(al,"<img src=","*");
得到特定类型的链接(按后缀过滤):
scanLabel(al,"<a href=",".html");
scanLabel(al,"<a href=",".asp");
scanLabel(al,"<img src=",".gif");
[此贴子已经被作者于2007-10-25 22:01:33编辑过]