首页 > 编程 > .NET > 正文

asp.net 抓取网页源码三种实现方法

2024-07-10 12:45:18
字体:
来源:转载
供稿:网友

方法1(比较推荐):使用 HttpWebRequest 抓取

/// <summary>
/// Downloads the page at <paramref name="url"/> with an <c>HttpWebRequest</c> and returns its source text.
/// When the response body starts with a byte order mark, the matching encoding is used;
/// otherwise the bytes are decoded with <c>Encoding.Default</c>.
/// </summary>
/// <param name="url">Address of the page to download. It is also sent as the Referer header.</param>
/// <returns>The page source as a string.</returns>
public static string GetHtmlSource2(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Accept = "*/*"; // accept any content type
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)"; // present the request as coming from IE
    request.AllowAutoRedirect = true; // follow redirects
    // Assign request.CookieContainer = new CookieContainer() here if the target site needs cookies.
    request.Referer = url; // use the requested page itself as the referrer

    // Dispose the response, stream and reader even when reading fails,
    // so the underlying connection is always released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.Default, true))
    {
        return reader.ReadToEnd();
    }
}

方法2:使用 WebRequest 抓取,按 UTF-8 解码

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.IO;
using System.Text;
using System.Net;

namespace MySql
{
    /// <summary>
    /// Helper for downloading the text of a web resource.
    /// </summary>
    public class GetHttpData
    {
        /// <summary>
        /// Requests <paramref name="Url"/> with a 50 second timeout and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="Url">Address of the resource to download.</param>
        /// <returns>
        /// The response body, or <c>null</c> when obtaining the response fails
        /// (the failure is swallowed on purpose: callers get a best-effort result).
        /// </returns>
        public static string GetHttpData2(string Url)
        {
            WebRequest request = WebRequest.Create(Url);
            request.Timeout = 50000; // milliseconds

            WebResponse response;
            try
            {
                response = request.GetResponse();
            }
            catch (Exception)
            {
                // Best-effort contract: a failed request yields null rather than an exception.
                return null;
            }

            // Read outside of any finally block and dispose deterministically,
            // so a failure while reading cannot leak the response or the reader.
            using (response)
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }
    }
}

方法3:使用 WebClient 抓取,可指定编码或根据 meta 标签自动识别

/// <summary>
/// Downloads <paramref name="url"/> with a <c>WebClient</c> and decodes the bytes into text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSets">
/// Optional: when exactly one non-empty name is supplied it is used as the page encoding.
/// Otherwise (nothing, null, or an empty string) the encoding is taken from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <returns>
/// The decoded page text. UTF-8 is used when no charset is given or found, or when the
/// charset name is not a supported encoding. Returns an empty string when the download fails.
/// </returns>
public static string getHtml(string url, params string[] charSets)
{
    try
    {
        // Only a single explicit charset is honoured; null is treated like "not supplied".
        string charSet = null;
        if (charSets != null && charSets.Length == 1)
        {
            charSet = charSets[0];
        }

        // Some pages cannot be fetched this way (they need cookies, custom headers, ...);
        // in that case add e.g. client.Headers.Add("Cookie", cookie) or supply a
        // NetworkCredential instead of the default credentials.
        byte[] pageBytes;
        using (WebClient client = new WebClient())
        {
            client.Credentials = CredentialCache.DefaultCredentials;
            pageBytes = client.DownloadData(url);
        }

        if (string.IsNullOrEmpty(charSet))
        {
            // Provisional decode used only to locate the meta charset declaration.
            string sniffText = Encoding.Default.GetString(pageBytes);
            // Handles both <meta charset="utf-8"> and
            // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">;
            // the optional quote is skipped so it never ends up in the captured name.
            Match charSetMatch = Regex.Match(
                sniffText,
                "<meta[^>]*charset\\s*=\\s*[\"']?([\\w\\-]+)",
                RegexOptions.IgnoreCase);
            if (charSetMatch.Success)
            {
                charSet = charSetMatch.Groups[1].Value;
            }
        }

        Encoding pageEncoding = Encoding.UTF8;
        if (!string.IsNullOrEmpty(charSet))
        {
            try
            {
                pageEncoding = Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 fallback.
            }
            catch (NotSupportedException)
            {
                // Encoding not available on this platform: keep the UTF-8 fallback.
            }
        }

        return pageEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Best-effort contract: any failure yields an empty string.
        return "";
    }
}
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表