提取网页中的超链接(C#)
2024-07-21 02:18:25
供稿:网友
using System;
using System.Collections;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
/// <summary>
/// Console tool that downloads a web page, extracts the absolute http/https
/// links found in its HTML, and writes them to "hyperlinks.xml".
/// </summary>
public class app
{
    // Absolute http/https URL: dotted host name followed by an optional path/query.
    // Compiled once and reused by gethyperlinks.
    private static readonly Regex LinkPattern = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w ./?%&=-]*)?",
        RegexOptions.IgnoreCase);

    // One of the recognised domain suffixes, immediately followed by "/" or the
    // end of the string. Group 1 is the suffix without the leading dot.
    private static readonly Regex DomainPattern = new Regex(
        @"\.(com|net|cn|org|gov)(/|$)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Entry point: prompts for a page address, fetches the page, extracts its
    /// links and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string strurl = Console.ReadLine();

        // ReadLine returns null at end of input; a blank address cannot be fetched.
        if (strurl == null || strurl.Trim().Length == 0)
        {
            Console.WriteLine("未输入网页地址,程序退出。");
            return;
        }
        strurl = strurl.Trim();

        // Prepend the scheme only when none is present. StartsWith is safe for
        // inputs shorter than the prefix, and an https address is left untouched.
        bool hasScheme =
            strurl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strurl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strurl = @"http://" + strurl;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strcode = getpagesource(strurl);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList allinks = gethyperlinks(strcode);

        Console.WriteLine("正在写入文件,请稍侯...");
        writetoxml(strurl, allinks);
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its body as text.
    /// </summary>
    /// <param name="url">Absolute address of the page to fetch.</param>
    /// <returns>The response body decoded as gb2312.</returns>
    static string getpagesource(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));

        // Request properties must be set before GetResponse() sends the request.
        request.Method = "GET";
        request.KeepAlive = false;

        // Dispose the response and reader so the connection is released even
        // when reading fails.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(
            response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs that occur in the given HTML.
    /// </summary>
    /// <param name="htmlcode">HTML source to scan.</param>
    /// <returns>A sorted list of unique URL strings.</returns>
    static ArrayList gethyperlinks(string htmlcode)
    {
        ArrayList links = new ArrayList();
        foreach (Match match in LinkPattern.Matches(htmlcode))
        {
            string link = match.Value;
            // Skip URLs that were already collected.
            if (!links.Contains(link))
            {
                links.Add(link);
            }
        }
        links.Sort();
        return links;
    }

    /// <summary>
    /// Writes the links to "hyperlinks.xml", one element per link, where the
    /// element name is the link's domain suffix as returned by getdomain.
    /// </summary>
    /// <param name="strurl">Address of the page the links were taken from.</param>
    /// <param name="alhyperlinks">The URL strings to write.</param>
    static void writetoxml(string strurl, ArrayList alhyperlinks)
    {
        // XmlTextWriter rejects comment text containing "--" (present in every
        // "xn--" host name), so break up runs of dashes before writing it.
        string commenturl = strurl;
        while (commenturl.IndexOf("--") >= 0)
        {
            commenturl = commenturl.Replace("--", "- -");
        }

        XmlTextWriter writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8);
        try
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("hyperlinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + commenturl + "的超链接");

            // The output nests a second <hyperlinks> element, which carries the
            // timestamp attribute, inside the root <hyperlinks> element.
            writer.WriteStartElement("hyperlinks");
            writer.WriteStartElement("hyperlinks", null);
            writer.WriteAttributeString("datetime", DateTime.Now.ToString());

            foreach (string link in alhyperlinks)
            {
                writer.WriteElementString(getdomain(link), null, link);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
        finally
        {
            // Always release the file handle, even when writing fails part-way.
            writer.Close();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL (com, net, cn, org or gov, in the
    /// letter case found in the URL), or "other" when none of these is present.
    /// </summary>
    /// <param name="strurl">The URL to classify.</param>
    /// <returns>The suffix without its leading dot, or "other".</returns>
    static string getdomain(string strurl)
    {
        Match match = DomainPattern.Match(strurl);
        if (!match.Success)
        {
            return "other";
        }
        return match.Groups[1].Value;
    }
}
菜鸟学堂: