首页 > 编程 > .NET > 正文

asp.net(c#)做一个网页数据采集工具

2024-07-10 12:42:48
字体:
来源:转载
供稿:网友
通过这个软件一两天就完成了几千产品数据的录入,可见很多工作不是一味用人工去做,作为一个程序员,就是要让很多让那些经常做重复性的、繁琐的工作中的人解放出来。下面只是写了一些核心代码,而且采集必须要和对应网站相挂钩,作者:郑少群

代码如下:
//提取产品列表页中产品最终页的网页
private void button1_Click(object sender, EventArgs e)
{
if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "")
{
MessageBox.Show("网址和域名不能为空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
return;
}
try
{
string Html = inc.GetHtml("http://study.pctoday.net.cn");
//ArrayList al = inc.GetMatchesStr(Html, "<a[^>]*?>.*?</a>");
ArrayList al = inc.GetMatchesStr(Html, @"href/s*=/s*(?:[/'/""/s](?<1>[^/""/']*)[/'/""])");//提取链接


" title="Replica Watches:">Replica Watches Buy Full Quality Popular Luxury Watches at Amazing Price, Your One Stop Discount Swiss Watches StoreExclusive Replica Rolex Watches, Tag Heuer Watches Replica, Cartier Watches online Sale!
StringBuilder sb = new StringBuilder();
foreach (object var in al)
{
string a = var.ToString().Replace("/"", "").Replace("'", "");
a = Regex.Replace(a, "href=", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
if (a.StartsWith("/"))
a = textBox2.Text.Trim() + a;
if (!a.StartsWith("http://"))
a = "http://" + a;
sb.Append(a + "/r/n");
}
textBox5.Text = sb.ToString();//把提取到网址输出到一个textBox,每个链接占一行



MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);

}
catch (Exception err)
{
MessageBox.Show("提取出错!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
}

}




//把采集的产品页面html代码进行字符串处理,提取需要的代码,最后保存到本地一个access数据库中,同时提取产品图片地址并自动现在图片到本地images文件夹下

private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
//填充产品表
Database.ExecuteNonQuery("delete from Tb_Product");
DataTable dt2 = new DataTable();
OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings);
OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn);
OleDbCommandBuilder cb = new OleDbCommandBuilder(da);
da.Fill(dt2);
dt2.Rows.Clear();

BackgroundWorker worker = (BackgroundWorker)sender;//这个是做一个进度条

string[] Urls = textBox5.Text.Trim().ToLower().Replace("/r/n", ",").Split(',');
DataTable dt = new DataTable();
StringBuilder ErrorStr = new StringBuilder();
string html = "", ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images//";

//循环每次采集网址
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表