首页 > 学院 > 开发设计 > 正文

网页中对图像的采集

2019-11-17 01:35:08
字体:
来源:转载
供稿:网友

网页中对图像的采集

有时我们需要采集一些信息到自己的数据库,本地磁盘,我们经常使用的是WebClient,WebRequest等等,今天主要说一下,对于一个URI地址,采集这个页面上所有的图像资源,下面是源代码,供大家参考,学习。

   /// <summary>
   /// Downloads a web page and collects the markup of every &lt;img&gt; tag found on it,
   /// rewriting relative image addresses into absolute ones.
   /// </summary>
   public class WebPageImage
   {
       // Matches a complete <img ...> tag that carries a src attribute.
       // Case-insensitive so that <IMG SRC=...> is found as well.
       private static readonly Regex ImgTagRegex = new Regex(
           @"<img\b[^<>]*?\bsrc\s*=\s*[""']?\s*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?\s*>",
           RegexOptions.IgnoreCase);

       // Captures only the charset token of a <meta ... charset=...> declaration, so that
       // attributes following it inside the same tag never end up in the encoding name.
       private static readonly Regex MetaCharsetRegex = new Regex(
           @"<meta\b[^<>]*?\bcharset\s*=\s*[""']?\s*(?<cs>[\w.:\-]+)",
           RegexOptions.IgnoreCase);

       /// <summary>
       /// Gets the markup of all images on a web page.
       /// </summary>
       /// <param name="url">Absolute address of the page.</param>
       /// <param name="charSet">Encoding name of the page; null or "" means detect it from the page's meta tag.</param>
       /// <returns>The &lt;img&gt; tags of the page, each followed by "&lt;br /&gt;" and a line break.</returns>
       /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
       public string getImages(string url, string charSet)
       {
           if (url == null)
           {
               throw new ArgumentNullException("url");
           }

           string html = getHtml(url, charSet);
           return getPictures(html, url);
       }

       /// <summary>
       /// Gets the markup of all images on a web page, detecting the page encoding automatically.
       /// </summary>
       /// <param name="url">Absolute address of the page.</param>
       /// <returns>The &lt;img&gt; tags of the page, each followed by "&lt;br /&gt;" and a line break.</returns>
       public string getImages(string url)
       {
           return getImages(url, "");
       }

       /// <summary>
       /// Returns the host part of an absolute URL.
       /// </summary>
       string doman(string url)
       {
           Uri u = new Uri(url);
           return u.Host;
       }

       /// <summary>
       /// Downloads a page and decodes it to text.
       /// </summary>
       /// <param name="url">Address of the page.</param>
       /// <param name="charSet">Encoding name of the page; null or "" means detect it from the page's meta tag.</param>
       /// <returns>The decoded page content.</returns>
       string getHtml(string url, string charSet)
       {
           // Some pages cannot be fetched this way (they may need cookies, custom headers, ...).
           // Such cases have to be handled individually, e.g. client.Headers.Add("Cookie", cookie).
           byte[] pageBytes;
           using (WebClient client = new WebClient())
           {
               // Use the credentials of the current security context; replace with a
               // NetworkCredential if the server requires an explicit user name and password.
               client.Credentials = CredentialCache.DefaultCredentials;
               pageBytes = client.DownloadData(url);
           }

           // First pass with the system default encoding, only good enough to read the ASCII meta tag.
           string html = Encoding.Default.GetString(pageBytes);

           bool sniffed = false;
           if (string.IsNullOrEmpty(charSet))
           {
               Match declared = MetaCharsetRegex.Match(html);
               if (!declared.Success)
               {
                   return html;
               }

               charSet = declared.Groups["cs"].Value;
               sniffed = true;
           }

           Encoding target;
           try
           {
               target = Encoding.GetEncoding(charSet);
           }
           catch (ArgumentException)
           {
               // A bogus charset declared by the page must not abort the download;
               // an invalid charset passed in by the caller is still reported.
               if (!sniffed)
               {
                   throw;
               }

               return html;
           }

           return target.Equals(Encoding.Default) ? html : target.GetString(pageBytes);
       }

       /// <summary>
       /// Extracts every &lt;img&gt; tag from the page content.
       /// </summary>
       /// <param name="data">Page content.</param>
       /// <param name="url">Address of the page, used to resolve relative image addresses.</param>
       /// <returns>The tags, each followed by "&lt;br /&gt;" and a line break.</returns>
       string getPictures(string data, string url)
       {
           StringBuilder result = new StringBuilder();
           foreach (Match tag in ImgTagRegex.Matches(data))
           {
               pictures p = new pictures(tag.Value, url);
               result.Append(p.GetHtml).Append("<br />").Append(Environment.NewLine);
           }

           return result.ToString();
       }

       /// <summary>
       /// A single &lt;img&gt; tag together with helpers for reading its attributes.
       /// </summary>
       public class pictures
       {
           private string _html;
           private readonly Uri _baseUri;

           /// <summary>
           /// Wraps an &lt;img&gt; tag and makes its src attribute absolute.
           /// </summary>
           /// <param name="strHtml">Markup of the tag.</param>
           /// <param name="baseUrl">Absolute address of the page the tag was found on.</param>
           public pictures(string strHtml, string baseUrl)
           {
               _html = strHtml ?? string.Empty;

               // Keep the complete page address (port and path included) so that relative
               // image paths resolve against the page's directory, as a browser would do.
               _baseUri = new Uri(baseUrl);
               setSrc();
           }

           /// <summary>Markup of the tag, with the src attribute already made absolute.</summary>
           public string GetHtml
           {
               get { return _html; }
           }

           /// <summary>Value of the alt attribute, or "" when the tag has none.</summary>
           public string Alt
           {
               get { return GetAttribute("alt")[0]; }
           }

           /// <summary>Value of the src attribute, or "" when the tag has none.</summary>
           public string Src
           {
               get { return GetAttribute("src")[0]; }
           }

           /// <summary>
           /// Converts a relative path into an absolute address.
           /// </summary>
           /// <param name="baseUrl">Absolute base address.</param>
           /// <param name="u">Relative path to convert.</param>
           /// <returns>The absolute address.</returns>
           public string absUrl(string baseUrl, string u)
           {
               Uri ub = new Uri(baseUrl);
               Uri ua = new Uri(ub, u);
               return ua.AbsoluteUri;
           }

           /// <summary>
           /// Rewrites the tag's src attribute to an absolute address when it is relative.
           /// </summary>
           private void setSrc()
           {
               string src = GetAttribute("src")[0];
               if (src.Length == 0 || isHttpUrl(src))
               {
                   return;
               }

               // Leave the tag untouched when the value cannot be turned into a URI.
               Uri absolute;
               if (!Uri.TryCreate(_baseUri, src, out absolute))
               {
                   return;
               }

               // A MatchEvaluator is used so that '$' inside the address is never
               // interpreted as a regex substitution token.
               string replacement = "src=\"" + absolute.AbsoluteUri + "\"";
               _html = attributeRegex("src").Replace(
                   _html,
                   delegate(Match m) { return replacement; },
                   1);
           }

           /// <summary>
           /// Tells whether a value already is an absolute http/https address.
           /// The comparison ignores case without altering the value itself.
           /// </summary>
           private static bool isHttpUrl(string value)
           {
               return value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                   || value.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
           }

           /// <summary>
           /// Builds the expression that matches one attribute and captures its value in group "v".
           /// Double-quoted, single-quoted and unquoted values are supported; the look-behind
           /// keeps e.g. "src" from matching inside "data-src".
           /// </summary>
           private static Regex attributeRegex(string strAttributeName)
           {
               string pattern = string.Format(
                   @"(?<![\w-]){0}\s*=\s*(?:""(?<v>[^""]*)""|'(?<v>[^']*)'|(?<v>[^\s""'<>]+))",
                   Regex.Escape(strAttributeName));
               return new Regex(pattern, RegexOptions.IgnoreCase);
           }

           /// <summary>
           /// Gets the values of an attribute of the tag.
           /// </summary>
           /// <param name="strAttributeName">Attribute name.</param>
           /// <returns>All values found; a single "" when the attribute is absent.</returns>
           private string[] GetAttribute(string strAttributeName)
           {
               List<string> lstAttribute = new List<string>();
               foreach (Match m in attributeRegex(strAttributeName).Matches(_html))
               {
                   lstAttribute.Add(m.Groups["v"].Value);
               }

               if (lstAttribute.Count == 0)
               {
                   lstAttribute.Add("");
               }

               return lstAttribute.ToArray();
           }
       }
   }

调用:

new WebPageImage().getImages("http://www.sina.com")

结果:


发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表