首页 > 学院 > 开发设计 > 正文

C#实现网页爬虫

2019-11-14 13:30:18
字体:
来源:转载
供稿:网友

HTTP请求工具类(功能:1、获取网页html;2、下载网络图片;):

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP request helpers: (1) fetch the HTML of a page, (2) download a remote file
    /// (e.g. an image) into the "download" folder next to the executable.
    /// </summary>
    public class HttpRequestUtil
    {
        /// <summary>User-Agent header sent with every request.</summary>
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>Name of the folder (under the application start-up path) that receives downloads.</summary>
        private const string DownloadFolderName = "download";

        /// <summary>
        /// Fetches <paramref name="url"/> with an HTTP GET and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute http/https URL of the page.</param>
        /// <returns>The response body as a string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="url"/> is not an http/https URL.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageHtml(string url)
        {
            HttpWebRequest request = CreateRequest(url);

            // Nothing is sent until GetResponse() is called; the default method is GET.
            // The using blocks release the connection even when reading throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into the "download" folder under
        /// <see cref="Application.StartupPath"/>. The local file name is the last segment of the
        /// URL path. If a file with that name already exists the call returns without downloading.
        /// </summary>
        /// <param name="url">Absolute http/https URL of the file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="url"/> is not an http/https URL or its path has no file name.
        /// </exception>
        /// <exception cref="WebException">The request failed.</exception>
        /// <exception cref="IOException">The file could not be written.</exception>
        public static void HttpDownloadFile(string url)
        {
            if (url == null)
            {
                throw new ArgumentNullException("url");
            }

            string fileName = GetFileNameFromUrl(url);
            string directory = Path.Combine(Application.StartupPath, DownloadFolderName);

            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(directory);

            string filePathName = Path.Combine(directory, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = CreateRequest(url);
            request.Proxy = null;

            // Download into a uniquely named temporary file and rename it at the end, so an
            // interrupted transfer never leaves a truncated file that the File.Exists check
            // above would later mistake for a finished download.
            string tempPathName = filePathName + "." + Guid.NewGuid().ToString("N") + ".part";
            bool completed = false;
            try
            {
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (Stream fileStream = new FileStream(tempPathName, FileMode.Create))
                {
                    responseStream.CopyTo(fileStream);
                }

                File.Move(tempPathName, filePathName);
                completed = true;
            }
            finally
            {
                if (!completed)
                {
                    TryDelete(tempPathName);
                }
            }
        }

        /// <summary>Creates an HTTP request for <paramref name="url"/> with the shared User-Agent.</summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // WebRequest.Create returns a different request type for ftp:, file:, etc.
                throw new ArgumentException("Only http/https URLs are supported: " + url, "url");
            }

            request.UserAgent = UserAgent;
            return request;
        }

        /// <summary>Returns the last path segment of <paramref name="url"/>, ignoring query and fragment.</summary>
        private static string GetFileNameFromUrl(string url)
        {
            string fileName = Path.GetFileName(new Uri(url).AbsolutePath);
            if (string.IsNullOrEmpty(fileName))
            {
                throw new ArgumentException("The URL does not contain a file name: " + url, "url");
            }

            return fileName;
        }

        /// <summary>Best-effort removal of a leftover temporary file; never throws for I/O reasons.</summary>
        private static void TryDelete(string path)
        {
            try
            {
                File.Delete(path);
            }
            catch (IOException)
            {
                // Cleanup only: the original download error is the one the caller should see.
            }
            catch (UnauthorizedAccessException)
            {
                // Same as above.
            }
        }
    }
}
View Code

多线程爬取网页代码:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
    /// <summary>
    /// Crawler window. button1 starts a crawl of 100 list pages spread over 10 worker threads,
    /// button2 stops it, button3 pauses it and button4 resumes it. Each list page is scanned for
    /// profile links, each profile page for image URLs, and each image is downloaded through
    /// HttpRequestUtil. Progress is shown in lblPage, lblPerson, lblNum and lblSpeed.
    /// </summary>
    public partial class Form1 : Form
    {
        private const string SiteRoot = "http://tt.mop.com";
        private const int WorkerCount = 10;
        private const int PagesPerWorker = 10;
        private const int TotalPages = WorkerCount * PagesPerWorker;

        // Shared, immutable Regex instances; Regex matching is safe to call from several threads.
        private static readonly Regex ProfileLinkRegex =
            new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");
        private static readonly Regex ImageRegex =
            new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

        List<Thread> threadList = new List<Thread>();

        // State of the crawl started by the most recent button1 click; null before the first run.
        private CrawlRun currentRun;

        /// <summary>
        /// Per-run state. The counters are read and written on the UI thread only, so they need
        /// no locking; workers touch only Gate and Cancelled.
        /// </summary>
        private sealed class CrawlRun
        {
            // Signalled = running, reset = paused.
            public readonly ManualResetEventSlim Gate = new ManualResetEventSlim(true);
            public readonly System.Diagnostics.Stopwatch Clock = System.Diagnostics.Stopwatch.StartNew();
            public volatile bool Cancelled;
            public int Pages;
            public int Persons;
            public int Images;

            /// <summary>Blocks while the run is paused; returns false once the run was cancelled.</summary>
            public bool WaitIfPaused()
            {
                Gate.Wait();
                return !Cancelled;
            }
        }

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Start: launches the worker threads for a fresh run.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            button3.Enabled = true;
            button2.Enabled = true;
            button1.Enabled = false;
            lblPage.Text = "已完成页数:0";

            // Make sure nothing left over from an earlier run keeps updating the labels.
            StopCurrentRun();
            threadList.Clear();

            CrawlRun run = new CrawlRun();
            currentRun = run;

            for (int i = 1; i <= WorkerCount; i++)
            {
                // Copy the loop variable: every worker needs its own number.
                int workerNumber = i;
                Thread worker = new Thread(delegate() { CrawlPages(run, workerNumber); });

                // Background threads do not keep the process alive after the form is closed.
                worker.IsBackground = true;
                threadList.Add(worker);
                worker.Start();
            }
        }

        /// <summary>Stop: cancels the current run and resets the buttons.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            StopCurrentRun();
            button1.Enabled = true;
            button2.Enabled = false;
            button3.Enabled = false;
            button4.Enabled = false;
        }

        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            StopCurrentRun();
        }

        /// <summary>Pause: workers block at their next checkpoint until resumed or stopped.</summary>
        private void button3_Click(object sender, EventArgs e)
        {
            if (currentRun != null)
            {
                currentRun.Gate.Reset();
            }

            button3.Enabled = false;
            button4.Enabled = true;
        }

        /// <summary>Resume: releases workers blocked by a pause.</summary>
        private void button4_Click(object sender, EventArgs e)
        {
            if (currentRun != null)
            {
                currentRun.Gate.Set();
            }

            button3.Enabled = true;
            button4.Enabled = false;
        }

        /// <summary>
        /// Asks the workers of the current run to finish. Cancellation is cooperative: a worker
        /// notices it at its next checkpoint, i.e. after the HTTP call it is currently in.
        /// </summary>
        private void StopCurrentRun()
        {
            CrawlRun run = currentRun;
            if (run == null)
            {
                return;
            }

            // Set the flag first, then open the gate so paused workers wake up and see it.
            run.Cancelled = true;
            run.Gate.Set();
        }

        /// <summary>Worker body: crawls the list pages assigned to worker <paramref name="workerNumber"/>.</summary>
        private void CrawlPages(CrawlRun run, int workerNumber)
        {
            for (int j = 1; j <= PagesPerWorker; j++)
            {
                if (!run.WaitIfPaused())
                {
                    return;
                }

                // Local to this worker, so concurrent workers cannot overwrite each other's page number.
                int pageIndex = (workerNumber - 1) * PagesPerWorker + j;
                try
                {
                    CrawlListPage(run, pageIndex);
                }
                catch (Exception ex)
                {
                    // Best effort: a failing page must not stop the remaining pages.
                    System.Diagnostics.Debug.WriteLine("Page " + pageIndex + " failed: " + ex);
                }

                PostToUi(run, delegate() { ReportPageFinished(run); });
            }
        }

        /// <summary>Fetches one list page, follows every site-relative profile link and downloads its images.</summary>
        private void CrawlListPage(CrawlRun run, int pageIndex)
        {
            string pageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + "/c44/0/1_" + pageIndex.ToString() + ".html");

            foreach (Match link in ProfileLinkRegex.Matches(pageHtml))
            {
                if (!run.WaitIfPaused())
                {
                    return;
                }

                // Group 2 of the pattern is the href value.
                string url = link.Groups[2].Value;
                if (!url.StartsWith("/", StringComparison.Ordinal))
                {
                    continue;
                }

                string imgPageHtml = HttpRequestUtil.GetPageHtml(SiteRoot + url);
                PostToUi(run, delegate()
                {
                    run.Persons++;
                    lblPerson.Text = "已完成条数:" + run.Persons.ToString();
                });

                foreach (Match image in ImageRegex.Matches(imgPageHtml))
                {
                    if (!run.WaitIfPaused())
                    {
                        return;
                    }

                    // Group 1 of the pattern is the src value.
                    string imgUrl = image.Groups[1].Value;
                    if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    try
                    {
                        HttpRequestUtil.HttpDownloadFile(imgUrl);
                        PostToUi(run, delegate() { ReportImageDownloaded(run); });
                    }
                    catch (Exception ex)
                    {
                        // Best effort: skip images that cannot be downloaded.
                        System.Diagnostics.Debug.WriteLine("Download failed for " + imgUrl + ": " + ex);
                    }

                    Thread.Sleep(1);
                }
            }
        }

        /// <summary>UI thread: counts a finished list page and announces completion after the last one.</summary>
        private void ReportPageFinished(CrawlRun run)
        {
            run.Pages++;
            lblPage.Text = "已完成页数:" + run.Pages.ToString();

            if (run.Pages == TotalPages)
            {
                button1.Enabled = true;
                MessageBox.Show("完成!");
            }
        }

        /// <summary>UI thread: counts a downloaded image and refreshes the download-rate label.</summary>
        private void ReportImageDownloaded(CrawlRun run)
        {
            run.Images++;
            lblNum.Text = "已下载图片数" + run.Images.ToString();

            double seconds = run.Clock.Elapsed.TotalSeconds;
            if (seconds > 0)
            {
                lblSpeed.Text = "速度:" + (run.Images / seconds).ToString("0.0") + "张/秒";
            }
        }

        /// <summary>
        /// Queues <paramref name="update"/> on the UI thread. The update is dropped when the run
        /// has been cancelled or the form is gone, so workers never fail on a closed window.
        /// </summary>
        private void PostToUi(CrawlRun run, Action update)
        {
            if (run.Cancelled)
            {
                return;
            }

            try
            {
                BeginInvoke(new Action(delegate()
                {
                    if (!run.Cancelled && !IsDisposed)
                    {
                        update();
                    }
                }));
            }
            catch (InvalidOperationException)
            {
                // The window handle no longer exists (ObjectDisposedException derives from
                // InvalidOperationException); there is nothing left to update.
            }
        }
    }
}
View Code

截图:

 


发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表