// 首页 > 开发 > 综合 > 正文
//
// 提取网页中的超链接(C#)
//
// 2024-07-21 02:18:25
// 字体:
// 来源:转载
// 供稿:网友
using System;
using System.Collections;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// Console tool that downloads a web page, extracts the absolute
/// <c>http://</c> links found in its source and saves them to
/// <c>hyperlinks.xml</c> in the current working directory.
/// </summary>
public class app
{
    // Absolute http:// URL: dotted host name followed by an optional path/query part.
    // The hyphen is placed last in the path class so it is always read as a literal.
    private static readonly Regex LinkRegex = new Regex(
        @"http://([\w-]+\.)+[\w-]+(/[\w ./?%&=-]*)?",
        RegexOptions.IgnoreCase);

    // One of the recognised top-level domains, directly followed by '/'.
    private static readonly Regex DomainRegex = new Regex(
        @"\.(com|net|cn|org|gov)/",
        RegexOptions.IgnoreCase);

    // Every '-' that is immediately followed by another '-'.
    private static readonly Regex DoubleHyphenRegex = new Regex("-(?=-)");

    /// <summary>
    /// Entry point: asks for a page address, downloads the page, extracts its
    /// links and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string strUrl = Console.ReadLine();

        // ReadLine returns null at end of input; an empty address cannot be fetched either.
        if (strUrl == null || strUrl.Trim().Length == 0)
        {
            Console.WriteLine("未输入网页地址。");
            return;
        }

        strUrl = NormalizeUrl(strUrl);

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strUrl);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperlinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strUrl, alLinks);
    }

    /// <summary>
    /// Trims the address and prepends <c>http://</c> unless it already starts
    /// with an http or https scheme (compared case-insensitively).
    /// </summary>
    /// <param name="input">Address as typed by the user; must not be null.</param>
    /// <returns>The address with a scheme prefix.</returns>
    private static string NormalizeUrl(string input)
    {
        string url = input.Trim();

        // StartsWith is safe for inputs shorter than the prefix, unlike Substring(0, 7).
        if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }

        return "http://" + url;
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    private static string GetPageSource(string url)
    {
        Uri uri = new Uri(url);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

        // Configure the request before GetResponse(): that call sends it,
        // so anything set afterwards cannot influence the exchange.
        request.Method = "GET";
        request.KeepAlive = false;

        // NOTE(review): the body is always decoded as GB2312, whatever charset the
        // server declares; confirm that this code page is available on the target runtime.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(
            response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute <c>http://</c> URLs contained in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null is treated as empty.</param>
    /// <returns>The distinct URLs as strings, sorted in ordinal order.</returns>
    private static ArrayList GetHyperlinks(string htmlCode)
    {
        ArrayList links = new ArrayList();
        if (htmlCode == null)
        {
            return links;
        }

        // Hashtable lookup replaces a linear scan of the result list per match.
        Hashtable seen = new Hashtable();
        foreach (Match match in LinkRegex.Matches(htmlCode))
        {
            string link = match.Value;
            if (!seen.ContainsKey(link))
            {
                seen.Add(link, null);
                links.Add(link);
            }
        }

        // Ordinal comparison keeps the order independent of the current culture.
        links.Sort(StringComparer.Ordinal);
        return links;
    }

    /// <summary>
    /// Writes the links to <c>hyperlinks.xml</c>, one element per link, named
    /// after the link's domain suffix.
    /// </summary>
    /// <param name="strUrl">Address of the page the links were extracted from.</param>
    /// <param name="alHyperlinks">Links to write; every item must be a string.</param>
    private static void WriteToXml(string strUrl, ArrayList alHyperlinks)
    {
        XmlTextWriter writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8);
        try
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("hyperlinks", null, "urls.dtd", null);

            // "--" is not allowed inside an XML comment and makes WriteComment throw,
            // so consecutive hyphens in the address are separated by a space.
            writer.WriteComment("提取自" + DoubleHyphenRegex.Replace(strUrl, "- ") + "的超链接");

            writer.WriteStartElement("hyperlinks");
            writer.WriteStartElement("hyperlinks", null);
            writer.WriteAttributeString("datetime", DateTime.Now.ToString());

            foreach (string link in alHyperlinks)
            {
                writer.WriteElementString(GetDomain(link), null, link);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
        finally
        {
            // Release the file handle even when writing fails part-way.
            writer.Close();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL, used as the XML element name.
    /// </summary>
    /// <param name="strUrl">URL to inspect.</param>
    /// <returns>
    /// "com", "net", "cn", "org" or "gov" (lower case) for the first such suffix
    /// that is directly followed by '/', otherwise "other".
    /// </returns>
    private static string GetDomain(string strUrl)
    {
        Match match = DomainRegex.Match(strUrl);
        if (!match.Success)
        {
            return "other";
        }

        // XML names are case-sensitive; lower-casing keeps "COM" and "com" in one element type.
        return match.Groups[1].Value.ToLowerInvariant();
    }
}

// 菜鸟学堂:
// 发表评论 共有条评论
// 用户名: 密码:
// 验证码: 匿名发表