复制代码 代码如下:
 
// Click handler: extracts the links to product detail pages from a product listing page.
// Visible behavior: refuses to run when textBox1 or textBox2 is blank, downloads the
// hard-coded URL below through inc.GetHtml, collects the matches of the href pattern
// through inc.GetMatchesStr, writes sb's content to textBox5 and shows the number of
// matches in a message box. Any exception is reported in a message box instead of
// propagating.
// NOTE(review): textBox1/textBox2 are only validated; the visible lines never use their
// values afterwards (the URL is hard-coded) - confirm against the complete source.
private void button1_Click(object sender, EventArgs e) 
{ 
if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "") 
{ 
MessageBox.Show("网址和域名不能为空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information); 
return; 
} 
try 
{ 
string Html = inc.GetHtml("http://study.pctoday.net.cn"); 
//ArrayList al = inc.GetMatchesStr(Html, "<a[^>]*?>.*?</a>"); 
// NOTE(review): the /s and /' sequences in this pattern look like backslash escapes
// (\s, \') whose backslashes were converted to slashes - confirm before relying on it.
ArrayList al = inc.GetMatchesStr(Html, @"href/s*=/s*(?:[/'/""/s](?<1>[^/""/']*)[/'/""])");// extract the links
// NOTE(review): one or more lines appear to be missing at this point - sb and a are used
// below without a visible declaration and the next line starts in the middle of an
// expression. Presumably a StringBuilder declaration and a loop over al were lost;
// restore them from the original source.
" + a; 
// NOTE(review): /r/n is appended literally; presumably a CR LF line break was intended.
sb.Append(a + "/r/n"); 
} 
textBox5.Text = sb.ToString();// write the extracted URLs to the text box, one link per line
MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information); 
} 
catch (Exception err) 
{ 
MessageBox.Show("提取出错!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information); 
} 
} 
// DoWork handler: processes the HTML of every collected product page with plain string
// operations, extracts the wanted fragments and saves them to the Tb_Product table
// (a local Access database according to the original comment). The original comment
// also says the product image URL is extracted and the image downloaded into the local
// images folder; that part is not visible here (see the truncation note below).
//
// Visible steps: delete all rows of Tb_Product, load the table into dt2 and clear its
// rows, split the text of textBox5 into URLs, and for each URL fetch the page and fill
// ProductName, ModelId and Introduce of a new row. Per-URL exceptions are appended to
// ErrorStr. After the loop the rows are written with da.Update, then DataBind and
// ShowError are called.
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e) 
{ 
// Fill the product table
Database.ExecuteNonQuery("delete from Tb_Product"); 
DataTable dt2 = new DataTable(); 
OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings); 
OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn); 
// cb is not referenced again in the visible lines; presumably it exists so that
// da.Update below has generated insert commands - confirm.
OleDbCommandBuilder cb = new OleDbCommandBuilder(da); 
// Fill then clear: dt2 keeps the columns of Tb_Product but no rows.
da.Fill(dt2); 
dt2.Rows.Clear(); 
BackgroundWorker worker = (BackgroundWorker)sender;// used to drive a progress bar
// NOTE(review): /r/n is replaced literally; presumably CR LF was intended, otherwise the
// text is split on commas only. ToLower also lower-cases the URLs themselves.
// NOTE(review): this reads a control from the DoWork handler - confirm which thread
// this runs on.
string[] Urls = textBox5.Text.Trim().ToLower().Replace("/r/n", ",").Split(','); 
// dt is not used in the visible lines.
DataTable dt = new DataTable(); 
StringBuilder ErrorStr = new StringBuilder(); 
string html = "", ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images//"; 
// Process each collected URL in turn
for (int i = 0; i < Urls.Length; i++) 
{ 
try 
{ 
if (!worker.CancellationPending) 
{ 
// An empty entry returns from the whole handler, so the da.Update / DataBind /
// ShowError calls after the loop are skipped.
// NOTE(review): confirm that skipping just this entry was not intended.
if (Urls[i] == "") 
return; 
html = inc.GetHtml(Urls[i]);// fetch the HTML of this URL
DataRow NewRow = dt2.NewRow(); 
// Product name: the text between <title> and </title>
string ProductName = html.Substring(html.IndexOf("<title>") + 7); 
NewRow["ProductName"] = ProductName.Remove(ProductName.IndexOf("</title>")).Trim(); 
// Product model number: the part of the product name after the Model: marker
NewRow["ModelId"] = NewRow["ProductName"].ToString().Substring(NewRow["ProductName"].ToString().IndexOf("Model:") + 6).Trim(); 
// Product introduction; these extraction rules have to be adapted to the HTML of
// each target site. Starts 26 characters after the start of the Product Details
// marker and ends with the first closing table tag after that point.
string Introduce = html.Substring(html.IndexOf("Product Details") + 26); 
// NOTE(review): the statement below has no terminating semicolon in this copy.
Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim() 
NewRow["Introduce"] = Introduce; 
// NOTE(review): lines appear to be missing at this point - the next line starts in the
// middle of an expression, ProductImage has no visible declaration and the closing
// brace after it has no visible opener. Presumably the image extraction and download
// code was lost; restore it from the original source.
" + ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("http://www.vevb.com/") + 1) + "/r/n"); 
} 
dt2.Rows.Add(NewRow); 
//Thread.Sleep(100); 
worker.ReportProgress((i + 1) * 100 / Urls.Length, i); 
// NOTE(review): the site URL used as separator here (and in LastIndexOf above) looks
// like a page-scraping substitution for a single slash - confirm.
// NOTE(review): this sets a control property from the DoWork handler - confirm which
// thread this runs on.
toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "http://www.vevb.com/" + Urls.Length.ToString();// progress indicator
} 
} 
catch (Exception err) 
{ 
// Record the failure for this URL and continue with the next one.
ErrorStr.Append("采集错误:" + err.Message + ";网址:" + Urls[i] + "/r/n"); 
} 
} 
da.Update(dt2); 
DataBind(dt2); 
ShowError(ErrorStr.ToString()); 
} 
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// (Original header: generates a static HTML page from an ASPX page; author: Zheng Shaoqun.)
/// </summary>
/// <param name="url">Absolute URL to request. The response is cast to
/// <see cref="HttpWebResponse"/>, so an HTTP(S) address is expected.</param>
/// <returns>The response body decoded with the character set announced by the server,
/// or with UTF-8 when that character set is missing or not recognised.</returns>
/// <exception cref="WebException">The request failed.</exception>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // Dispose response, stream and reader even when reading throws, so that the
    // underlying connection is always released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        // CharacterSet may be null or empty, or name an encoding this runtime does not
        // know; Encoding.GetEncoding throws in those cases, so fall back to UTF-8.
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                encoding = Encoding.GetEncoding(charset.Trim().Trim('"'));
            }
            catch (ArgumentException)
            {
                encoding = Encoding.UTF8;
            }
        }

        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Extracts the distinct matches of a regular expression from a piece of text
/// (used in this file to pull URLs out of downloaded HTML).
/// </summary>
/// <param name="htmlCode">Text to search.</param>
/// <param name="strRegex">Regular-expression pattern; it is applied with
/// <see cref="RegexOptions.IgnoreCase"/> and <see cref="RegexOptions.Multiline"/>.</param>
/// <returns>An <see cref="ArrayList"/> of strings holding the full text of every match,
/// with exact duplicates removed, sorted with the list's default comparer.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList unique = new ArrayList();

    // Hashed lookup of the values already collected; replaces a linear scan of the
    // result list per match, which was quadratic on pages with many links.
    Hashtable seen = new Hashtable();

    Regex pattern = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    foreach (Match match in pattern.Matches(htmlCode))
    {
        string text = match.Value;

        // Filter out repeated values (exact, case-sensitive comparison).
        if (!seen.ContainsKey(text))
        {
            seen.Add(text, null);
            unique.Add(text);
        }
    }

    unique.Sort();
    return unique;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and writes it to the file
/// <paramref name="Path"/>.
/// </summary>
/// <param name="Url">Absolute HTTP(S) address of the resource to download.</param>
/// <param name="Path">Destination file path. The file is created, or replaced when it
/// already exists.</param>
/// <exception cref="WebException">The request failed.</exception>
/// <exception cref="IOException">The destination file could not be written.</exception>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // Dispose the response and its stream as well as the file, so the connection is
    // released even when copying fails part-way.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream source = response.GetResponseStream())
    // FileMode.Create truncates an existing file; OpenOrCreate would leave stale
    // trailing bytes behind when the new content is shorter than the old file.
    using (FileStream target = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[1024];
        int read;
        while ((read = source.Read(buffer, 0, buffer.Length)) > 0)
        {
            target.Write(buffer, 0, read);
        }
    }
}
新闻热点
疑难解答
图片精选