首页 > 学院 > 开发设计 > 正文

dotNet使用HttpWebRequest模拟浏览器

2019-11-17 02:13:26
字体:
来源:转载
供稿:网友

dotNet使用HttpWebRequest模拟浏览器

在编写网络爬虫时,HttpWebRequest可以完成绝大多数网站的抓取。为了更好地使用这一技术,我将常用的几个功能进行了封装,以方便调用。这个类已经在多个项目中得到使用,主要解决了Cookies相关的一些问题;如果有其它方面的问题可以提出来,我会进一步完善。

目前HttpHelper包含了以下几个方面:

  • GetHttpContent:通过Get或Post来获取网页的Html
  • SetCookie:根据响应头中的Set-Cookie设置Cookie,能识别HttpOnly
  • GetAllCookies:将CookieContainer转换为键值对,方便存储和跨程序间调用
  • ConvertToCookieContainer:将键值对转换回CookieContainer供程序调用
  • BuildPostData:通过一个需要post的html构建出postdata

代码如下:

  using System;
  using System.Collections.Generic;
  using System.Collections.Specialized;
  using System.IO;
  using System.IO.Compression;
  using System.Linq;
  using System.Net;
  using System.Net.Security;
  using System.Security.Cryptography.X509Certificates;
  using System.Text;
  using System.Text.RegularExpressions;
  using System.Collections;
  using HtmlAgilityPack;

  namespace TNIdea.Common.Helper
  {
      public class HttpHelper
      {
          /// <summary>
          /// Pattern that extracts the charset name from an HTML <c>meta</c> tag or an
          /// XML declaration; the name is captured in the <c>Charset</c> group.
          /// </summary>
          /// <remarks>
          /// The character classes use <c>\s</c> (whitespace). The published listing showed
          /// <c>/s</c>, which would stop the capture at the first 's' or '/' character
          /// (e.g. "iso-8859-1" would be captured as "i").
          /// </remarks>
          public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""'>]+)""?)|(xml.*?encoding=""?(?<Charset>[^\s"">]+)""?)";

          // Maximum number of leading body bytes scanned while looking for the end of <head>.
          private const int HeadProbeLimit = 10000;

          // Marker that ends the head probe early.
          private const string HeadCloseTag = "</head>";

          /// <summary>
          /// Fetches the content of a web page, using POST when <paramref name="postData"/>
          /// is supplied and GET otherwise.
          /// </summary>
          /// <param name="url">Url to request.</param>
          /// <param name="postData">Data to post; when null or whitespace a GET request is made.</param>
          /// <param name="cookies">
          /// Cookie container passed to the request. When not null, the cookies found on the
          /// response are added to it so that the caller can observe them.
          /// </param>
          /// <param name="userAgent">Browser identification string.</param>
          /// <param name="referer">Referring page.</param>
          /// <param name="cookiesDomain">Domain used for the response cookies; when empty the host of the response URI is used.</param>
          /// <param name="encode">Encoding used to decode the body; when null it is detected from the response.</param>
          /// <returns>
          /// The decoded body. An empty string is returned when the request or the cookie
          /// handling fails; when reading or decoding the body fails, the exception text is
          /// returned instead (behaviour kept from the original implementation).
          /// </returns>
          public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
          {
              try
              {
                  HttpWebResponse httpResponse;
                  if (!string.IsNullOrWhiteSpace(postData))
                      httpResponse = CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer);
                  else
                      httpResponse = CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer);

                  if (httpResponse == null)
                      return string.Empty;

                  try
                  {
                      // Harvest the cookies before the body is consumed and the response is
                      // closed, so that no response property is read after Close().
                      // The original assigned a fresh container to the by-value parameter,
                      // which meant the caller never saw these cookies.
                      if (cookies != null)
                      {
                          if (string.IsNullOrWhiteSpace(cookiesDomain))
                              cookiesDomain = httpResponse.ResponseUri.Host;

                          // SetCookie parses the Set-Cookie header (HttpOnly aware); fall back
                          // to the cookies exposed by the framework when it yields nothing.
                          CookieCollection httpHeaderCookies = SetCookie(httpResponse, cookiesDomain);
                          cookies.Add(httpHeaderCookies ?? httpResponse.Cookies);
                      }

                      using (Stream responseStream = OpenDecodedStream(httpResponse))
                      {
                          try
                          {
                              return ReadResponseText(responseStream, httpResponse, encode);
                          }
                          catch (Exception ex)
                          {
                              return ex.ToString();
                          }
                      }
                  }
                  finally
                  {
                      // Always release the connection, including when setup above throws.
                      httpResponse.Close();
                  }
              }
              catch
              {
                  return string.Empty;
              }
          }

          /// <summary>
          /// Returns the response body stream, wrapped in a decompressing stream when the
          /// Content-Encoding header names gzip or deflate.
          /// </summary>
          private static Stream OpenDecodedStream(HttpWebResponse response)
          {
              Stream body = response.GetResponseStream();

              // Compare against upper-case literals: the value is upper-cased first, so the
              // original "GZip" label could never match.
              string contentEncoding = (response.ContentEncoding ?? string.Empty).Trim().ToUpperInvariant();
              switch (contentEncoding)
              {
                  case "GZIP":
                      return new GZipStream(body, CompressionMode.Decompress);
                  case "DEFLATE":
                      return new DeflateStream(body, CompressionMode.Decompress);
                  default:
                      return body;
              }
          }

          /// <summary>
          /// Reads the whole body and decodes it in one pass, detecting the encoding from
          /// the response when <paramref name="encode"/> is null.
          /// </summary>
          private static string ReadResponseText(Stream responseStream, HttpWebResponse response, Encoding encode)
          {
              using (var raw = new MemoryStream())
              {
                  string head = ReadHeadProbe(responseStream, raw);

                  if (encode == null)
                      encode = ResolveEncoding(response, head);

                  // Decode prefix and remainder together so that a multi-byte character
                  // straddling the probe boundary is not corrupted.
                  responseStream.CopyTo(raw);
                  return encode.GetString(raw.GetBuffer(), 0, (int)raw.Length);
              }
          }

          /// <summary>
          /// Copies bytes from <paramref name="source"/> into <paramref name="sink"/> until
          /// "&lt;/head&gt;" has been seen, <see cref="HeadProbeLimit"/> bytes were read, or
          /// the stream ends. Returns the bytes read so far as text, one char per byte.
          /// </summary>
          private static string ReadHeadProbe(Stream source, MemoryStream sink)
          {
              var probe = new StringBuilder();
              bool headClosed = false;

              while (!headClosed && probe.Length < HeadProbeLimit)
              {
                  // Keep the int: ReadByte() signals end of stream with -1, which a cast to
                  // byte would turn into 255.
                  int next = source.ReadByte();
                  if (next < 0)
                      break;

                  sink.WriteByte((byte)next);
                  probe.Append((char)next);

                  if (next == '>' && probe.Length >= HeadCloseTag.Length)
                  {
                      string tail = probe.ToString(probe.Length - HeadCloseTag.Length, HeadCloseTag.Length);
                      headClosed = string.Equals(tail, HeadCloseTag, StringComparison.OrdinalIgnoreCase);
                  }
              }

              return probe.ToString();
          }

          /// <summary>
          /// Chooses the encoding for the body. A missing charset, "ISO-8859-1" or "zh-cn"
          /// on the response is treated as unreliable: the charset declared in the document
          /// head is used instead, or GB2312 when none is declared. Any other value is used
          /// as is. UTF-8 is returned when the chosen name cannot be resolved, so the result
          /// is never null.
          /// </summary>
          private static Encoding ResolveEncoding(HttpWebResponse response, string head)
          {
              try
              {
                  string declared = response.CharacterSet;
                  if (string.IsNullOrEmpty(declared) || declared == "ISO-8859-1" || declared == "zh-cn")
                  {
                      Match match = Regex.Match(head, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                      return Encoding.GetEncoding(match.Success ? match.Groups["Charset"].Value : "GB2312");
                  }

                  return Encoding.GetEncoding(declared);
              }
              catch (ArgumentException)
              {
                  // Unknown or malformed charset name.
                  return Encoding.UTF8;
              }
              catch (NotSupportedException)
              {
                  // Code page not available on this platform.
                  return Encoding.UTF8;
              }
          }


          /// <summary>
          /// Creates an HTTP request using the GET method.
          /// </summary>
          /// <param name="url"></param>
          /// <param name="timeout"></param>
          /// <param name="userAgent"></param>
          /// <param name="cookies"></param>
          /// <param name="referer"></param>
          /// <returns></returns>
          public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
          {
              HttpWebRequest request = null;
              if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
              {
                  // Validate the server certificate (certificates not issued by a third-party authority, e.g. self-generated ones, are not verified; this returns true)
                  ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
                  request = WebRequest.Create(url) as HttpWebRequest;
                  //request.ProtocolVersion = HttpVersion.Version10;    // HTTP version; the default is 1.1, this would set it to 1.0
              }
              else
              {
                  request = WebRequest.Create(url) as HttpWebRequest;
              }

              request.Referer = referer;
              request.Method = "GET";

              // Set the proxy UserAgent and the timeout
              // NOTE(review): the published listing is cut off in the middle of the next statement;
              // the rest of this method and of the class is not available here.
              if (string.IsNullOrWhiteSpace(userAgent))
                  userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表