在编写网络爬虫时,HttpWebRequest几乎可以完成绝大多数网站的抓取。为了更好地使用这一技术,我将常用的几个功能进行了封装,以方便调用。这个类已经在多个项目中得到使用,主要解决了Cookies相关的一些问题;如果有其它方面的问题,欢迎提出来,我会进一步完善。
目前HttpHelper包含了以下几个方面:
代码如下:
using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Net.Security;
using System.Security.Cryptography.X509Certificates;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using HtmlAgilityPack;

namespace TNIdea.Common.Helper
{
    public class HttpHelper
    {
        /// <summary>
        /// Pattern that extracts the declared charset from an HTML <c>meta</c> tag or an
        /// XML declaration. The value is captured in the named group <c>Charset</c>.
        /// </summary>
        public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""'>]+)""?)|(xml.*?encoding=""?(?<Charset>[^\s"">]+)""?)";

        /// <summary>
        /// Maximum number of bytes read while looking for the closing <c>&lt;/head&gt;</c> tag.
        /// </summary>
        private const int HeadProbeLimit = 10000;

        /// <summary>
        /// Downloads a page and returns its body decoded as text.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="postData">Form data; when non-blank the request is sent through
        /// <c>CreatePostHttpResponse</c>, otherwise through <c>CreateGetHttpResponse</c>.</param>
        /// <param name="cookies">Cookie container handed to the request builder. When it is not
        /// null, cookies found on the response are added to it.</param>
        /// <param name="userAgent">User-Agent value forwarded to the request builder.</param>
        /// <param name="referer">Referer value forwarded to the request builder.</param>
        /// <param name="cookiesDomain">Domain used when collecting response cookies; when blank,
        /// the host of the response URI is used.</param>
        /// <param name="encode">Encoding used to decode the body. When null it is derived from the
        /// response charset, then from the charset declared in the document head, then GB2312,
        /// and finally UTF-8.</param>
        /// <returns>
        /// The decoded body. Returns <see cref="string.Empty"/> when the request itself fails, and
        /// the exception text when reading or decoding the body fails (kept for backward
        /// compatibility with existing callers).
        /// </returns>
        public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null)
        {
            try
            {
                using (HttpWebResponse httpResponse = !string.IsNullOrWhiteSpace(postData)
                    ? CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer)
                    : CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer))
                using (Stream responseStream = OpenDecodedStream(httpResponse))
                {
                    string content;
                    try
                    {
                        content = ReadAndDecode(responseStream, httpResponse.CharacterSet, encode);
                    }
                    catch (Exception ex)
                    {
                        // Historical contract: body read/decode failures surface as the exception text.
                        return ex.ToString();
                    }

                    // Harvest cookies while the response is still open; the previous code replaced
                    // the local parameter with a fresh container, so the caller never saw them.
                    if (cookies != null)
                    {
                        if (string.IsNullOrWhiteSpace(cookiesDomain))
                        {
                            cookiesDomain = httpResponse.ResponseUri.Host;
                        }

                        // SetCookie is defined elsewhere in this class; per the original comment it
                        // also picks up HttpOnly cookies. Fall back to the parsed collection on null.
                        CookieCollection httpHeaderCookies = SetCookie(httpResponse, cookiesDomain);
                        cookies.Add(httpHeaderCookies ?? httpResponse.Cookies);
                    }

                    return content;
                }
            }
            catch
            {
                // Best effort: any failure outside body decoding yields an empty result.
                return string.Empty;
            }
        }

        /// <summary>
        /// Returns the response stream, wrapped in a decompressor when the Content-Encoding
        /// header is gzip or deflate (compared case-insensitively).
        /// </summary>
        private static Stream OpenDecodedStream(HttpWebResponse response)
        {
            string contentEncoding = response.ContentEncoding ?? string.Empty;
            Stream raw = response.GetResponseStream();

            if (contentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase))
            {
                return new GZipStream(raw, CompressionMode.Decompress);
            }

            if (contentEncoding.Equals("deflate", StringComparison.OrdinalIgnoreCase))
            {
                return new DeflateStream(raw, CompressionMode.Decompress);
            }

            return raw;
        }

        /// <summary>
        /// Reads the whole stream and decodes it once, resolving the encoding from the document
        /// head when <paramref name="encode"/> is null.
        /// </summary>
        private static string ReadAndDecode(Stream stream, string declaredCharset, Encoding encode)
        {
            using (var raw = new MemoryStream())
            {
                string head = ReadHead(stream, raw);
                if (encode == null)
                {
                    encode = ResolveEncoding(declaredCharset, head);
                }

                // Buffer the remainder and decode everything in one pass so a multi-byte
                // character straddling the probe boundary is not split.
                stream.CopyTo(raw);
                return encode.GetString(raw.GetBuffer(), 0, (int)raw.Length);
            }
        }

        /// <summary>
        /// Reads bytes into <paramref name="sink"/> until the closing head tag, the probe limit,
        /// or end of stream, and returns them as text with one char per byte.
        /// </summary>
        private static string ReadHead(Stream stream, MemoryStream sink)
        {
            const string HeadCloseTag = "</head>";
            var head = new StringBuilder();

            while (head.Length < HeadProbeLimit)
            {
                // Keep the int: ReadByte signals end of stream with -1, which a byte cast hides.
                int next = stream.ReadByte();
                if (next < 0)
                {
                    break;
                }

                sink.WriteByte((byte)next);
                head.Append((char)next);

                if (next == '>' && head.Length >= HeadCloseTag.Length
                    && head.ToString(head.Length - HeadCloseTag.Length, HeadCloseTag.Length)
                           .Equals(HeadCloseTag, StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }
            }

            return head.ToString();
        }

        /// <summary>
        /// Chooses the encoding for the body. A missing, "ISO-8859-1" or "zh-cn" response charset
        /// is treated as undeclared: the charset declared in the head is used, or GB2312 when the
        /// head declares none. UTF-8 is the last resort when a name cannot be resolved.
        /// </summary>
        private static Encoding ResolveEncoding(string declaredCharset, string head)
        {
            bool undeclared = string.IsNullOrWhiteSpace(declaredCharset)
                || declaredCharset.Equals("ISO-8859-1", StringComparison.OrdinalIgnoreCase)
                || declaredCharset.Equals("zh-cn", StringComparison.OrdinalIgnoreCase);

            Encoding resolved;
            if (undeclared)
            {
                Match match = Regex.Match(head, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                resolved = match.Success
                    ? TryGetEncoding(match.Groups["Charset"].Value)
                    : TryGetEncoding("GB2312");
            }
            else
            {
                resolved = TryGetEncoding(declaredCharset);
            }

            return resolved ?? Encoding.UTF8;
        }

        /// <summary>
        /// Looks up an encoding by name; returns null when the name is blank or unknown.
        /// </summary>
        private static Encoding TryGetEncoding(string name)
        {
            if (string.IsNullOrWhiteSpace(name))
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(name);
            }
            catch (ArgumentException)
            {
                return null;
            }
            catch (NotSupportedException)
            {
                return null;
            }
        }


        /// <summary>
        /// Creates an HTTP GET request.
        /// </summary>
        /// <param name="url"></param>
        /// <param name="timeout"></param>
        /// <param name="userAgent"></param>
        /// <param name="cookies"></param>
        /// <param name="referer"></param>
        /// <returns></returns>
        public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")
        {
            HttpWebRequest request = null;
            if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))
            {
                // Server certificate validation hook (per the original note: certificates not issued
                // by a trusted third-party CA, e.g. self-generated ones, are not verified; returns true).
                ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
                request = WebRequest.Create(url) as HttpWebRequest;
                //request.ProtocolVersion = HttpVersion.Version10; // HTTP version; default is 1.1, this would set 1.0
            }
            else
            {
                request = WebRequest.Create(url) as HttpWebRequest;
            }

            request.Referer = referer;
            request.Method = "GET";

            // Set the User-Agent and timeout
            if (string.IsNullOrWhiteSpace(userAgent))
                userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64
            // NOTE(review): the listing is truncated at this point in the source; the rest of the class is not available.
新闻热点
疑难解答