首页 > 编程 > .NET > 正文

Asp.net 数据采集基类(远程抓取,分解,保存,匹配)

2024-07-10 13:05:52
字体:
来源:转载
供稿:网友

/*############################################
版权声明:
文章内容为本站编辑、创作。你可以任意转载、发布、使用,但请务必以明文标注文章原始出处及本声明
http://www.opent.cn  作者:浪淘沙
############################################*/

using system;
using system.data;
using system.configuration;
using system.web;
using system.web.security;
using system.web.ui;
using system.web.ui.webcontrols;
using system.web.ui.webcontrols.webparts;
using system.web.ui.htmlcontrols;
using msxml2;
using system.text.regularexpressions;
namespace ec
{
    /// <summary>
    /// 远程文件抓取类
    /// </summary>
    public class getremoteobj
    {
     
        #region 构造与析构函数
        /// <summary>
        /// Creates a new instance. The body is empty; no state is initialized.
        /// </summary>
        public getremoteobj()
        {
            //
            // TODO: add constructor logic here
            //
        }
        /// <summary>
        /// Finalizer. Its only action is to call dispose().
        /// </summary>
        ~getremoteobj()
        {
            dispose();
        }
        #endregion

        #region idisposable 成员

        public void dispose()
        {          
            gc.suppressfinalize(this);
        }

        #endregion

        #region 日期随机函数
        /**********************************
         * 函数名称:daterndname
         * 功能说明:日期随机函数
         * 参    数:ra:随机数
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          random ra = new random();
         *          string s = o.daterndname(ra);
         *          response.write(s);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 日期随机函数
        /// </summary>
        /// <param name="ra">随机数</param>
        /// <returns></returns>
        public  string daterndname(random ra)
        {
            datetime d = datetime.now;
            string s = null, y, m, dd, h, mm, ss;
            y = d.year.tostring();
            m = d.month.tostring();
            if (m.length < 2) m = "0" + m;
            dd = d.day.tostring();
            if (dd.length < 2) dd = "0" + dd;
            h = d.hour.tostring();
            if (h.length < 2) h = "0" + h;
            mm = d.minute.tostring();
            if (mm.length < 2) mm = "0" + mm;
            ss = d.second.tostring();
            if (ss.length < 2) ss = "0" + ss;
            s += y + m + dd + h + mm + ss;
            s += ra.next(100, 999).tostring();
            return s;
        }
        #endregion

        #region 取得文件后缀
        /**********************************
         * 函数名称:getfileextends
         * 功能说明:取得文件后缀
         * 参    数:filename:文件名称
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"/xrssfile/2007-2/23/200722311844445.gif";
         *          string s = o.getfileextends(url);
         *          response.write(s);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 取得文件后缀
        /// </summary>
        /// <param name="filename">文件名称</param>
        /// <returns></returns>
        public string getfileextends(string filename)
        {
            string ext = null;
            if (filename.indexof('.') > 0)
            {
                string[] fs = filename.split('.');
                ext = fs[fs.length - 1];
            }
            return ext;
        }
        #endregion

        #region 获取远程文件源代码
        /**********************************
         * 函数名称:getremotehtmlcode
         * 功能说明:获取远程文件源代码
         * 参    数:url:远程url
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"http://www.baidu.com";
         *          string s = o.getremotehtmlcode(url);
         *          response.write(s);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 获取远程文件源代码
        /// </summary>
        /// <param name="url">远程url</param>
        /// <returns></returns>
        public string getremotehtmlcode(string url)
        {
            string s = "";
            msxml2.xmlhttp _xmlhttp = new msxml2.xmlhttpclass();
            _xmlhttp.open("get", url, false, null, null);
            _xmlhttp.send("");
            if (_xmlhttp.readystate == 4)
            {
                s = system.text.encoding.default.getstring((byte[])_xmlhttp.responsebody);
            }
            return s;
        }

        #endregion

        #region 保存远程文件
        /**********************************
         * 函数名称:remotesave
         * 功能说明:保存远程文件
         * 参    数:url:远程url;path:保存到的路径
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string s = "";
         *          string url = @"/xrssfile/2007-2/23/200722311844445.gif";
         *          string path =server.mappath("html/");
         *          s = o.remotesave(url,path);
         *          response.write(s);
         *          o.dispose();        
         * ******************************/
        /// <summary>
        /// 保存远程文件
        /// </summary>
        /// <param name="url">远程url</param>
        /// <param name="path">保存到的路径</param>
        /// <returns></returns>
        public string remotesave(string url, string path)
        {
            random ra = new random();
            string stringfilename = daterndname(ra) + "." + getfileextends(url);
            string stringfilepath = path + stringfilename;
            msxml2.xmlhttp _xmlhttp = new msxml2.xmlhttpclass();
            _xmlhttp.open("get", url, false, null, null);
            _xmlhttp.send("");
            if (_xmlhttp.readystate == 4)
            {
                if (system.io.file.exists(stringfilepath))
                    system.io.file.delete(stringfilepath);
                system.io.filestream fs = new system.io.filestream(stringfilepath, system.io.filemode.createnew);
                system.io.binarywriter w = new system.io.binarywriter(fs);
                w.write((byte[])_xmlhttp.responsebody);
                w.close();
                fs.close();
            }
            else
                throw new exception(_xmlhttp.statustext);
            return stringfilename;
        }
        #endregion

        #region 替换网页中的换行和引号
        /**********************************
         * 函数名称:replaceenter
         * 功能说明:替换网页中的换行和引号
         * 参    数:htmlcode:html源代码
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"http://www.baidu.com";
         *          string htmlcode = o.getremotehtmlcode(url);
         *          string s = o.replaceenter(htmlcode);
         *          response.write(s);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 替换网页中的换行和引号
        /// </summary>
        /// <param name="htmlcode">html源代码</param>
        /// <returns></returns>
        public string replaceenter(string htmlcode)
        {
            string s = "";
            if (htmlcode == null || htmlcode == "")
                s = "";
            else
                s = htmlcode.replace("/"", "");
            s = s.replace("/r/n", "");
            return s;
        }

        #endregion              

        #region 执行正则提取出值
        /**********************************
         * 函数名称:getregvalue
         * 功能说明:执行正则提取出值(返回第一个匹配项,无匹配时返回空字符串)
         * 参    数:regexstring:正则表达式;remotestr:html源代码
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"http://www.baidu.com";
         *          string htmlcode = o.getremotehtmlcode(url);
         *          string s = o.replaceenter(htmlcode);
         *          string reg="<title>.+?</title>";
         *          string getvalue=o.getregvalue(reg,s);
         *          response.write(getvalue);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 执行正则提取出值
        /// </summary>
        /// <param name="regexstring">正则表达式</param>
        /// <param name="remotestr">htmlcode源代码</param>
        /// <returns></returns>
        public string  getregvalue(string regexstring, string remotestr)
        {
            string matchvale = "";
            regex r = new regex(regexstring);
            match m = r.match(remotestr);
            if (m.success)
            {
                matchvale = m.value;
            }
            return matchvale;
        }
        #endregion       

        #region 替换html源代码
        /**********************************
         * 函数名称:removehtml
         * 功能说明:去除html源代码中的标签
         * 参    数:htmlcode:html源代码
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"http://www.baidu.com";
         *          string htmlcode = o.getremotehtmlcode(url);
         *          string s = o.replaceenter(htmlcode);
         *          string reg="<title>.+?</title>";
         *          string getvalue=o.getregvalue(reg,s);
         *          string text=o.removehtml(getvalue);
         *          response.write(text);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 替换html源代码
        /// </summary>
        /// <param name="htmlcode">html源代码</param>
        /// <returns></returns>
        public string removehtml(string htmlcode)
        {
            string matchvale = htmlcode;         
            foreach (match s in regex.matches(htmlcode, "<.+?>"))
            {
                matchvale = matchvale.replace(s.value, "");
            }
            return matchvale;       
        }

        #endregion

        #region 匹配页面的链接
        /**********************************
         * 函数名称:gethref
         * 功能说明:匹配页面的链接
         * 参    数:htmlcode:html源代码
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"http://www.baidu.com";
         *          string htmlcode = o.getremotehtmlcode(url);
         *          string s = o.gethref(htmlcode);
         *          response.write(s);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 获取页面的链接正则
        /// </summary>
        /// <param name="htmlcode"></param>
        /// <returns></returns>
        public string gethref(string htmlcode)
        {
            string matchvale = "";
            string reg = @"(h|h)(r|r)(e|e)(f|f) *= *('|"")?((/w|//|//|/.|:|-|_)+)('|""| *|>)?";          
            foreach(match m in regex.matches(htmlcode,reg))
            {
                matchvale += (m.value).tolower().replace("href=", "").trim() + "||";
            }
            return matchvale;        
        }
        #endregion

        #region 匹配页面的图片地址
        /**********************************
         * 函数名称:getimgsrc
         * 功能说明:匹配页面的图片地址
         * 参    数:htmlcode:html源代码;imghttp:要补充的http前缀。例如图片地址为相对路径 <img src="/info/bb/x.gif"> 时,需要补充 http://www.baidu.com/;当图片地址已包含http信息时,可以为空
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"http://www.baidu.com";
         *          string htmlcode = o.getremotehtmlcode(url);
         *          string s = o.getimgsrc(htmlcode,"http://www.baidu.com/");
         *          response.write(s);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 匹配页面的图片地址
        /// </summary>
        /// <param name="htmlcode"></param>
        /// <param name="imghttp">要补充的http://路径信息</param>
        /// <returns></returns>
        public string getimgsrc(string htmlcode, string imghttp)
        {
            string matchvale = "";
            string reg = @"<img.+?>";
            foreach (match m in regex.matches(htmlcode, reg))
            {
                matchvale += getimg((m.value).tolower().trim(), imghttp) + "||";
            }
            return matchvale;
        }
        /// <summary>
        /// 匹配<img src="" />中的图片路径实际链接
        /// </summary>
        /// <param name="imgstring"><img src="" />字符串</param>
        /// <returns></returns>
        public string getimg(string imgstring, string imghttp)
        {
            string matchvale = "";
            string reg = @"src=.+/.(bmp|jpg|gif|png|)";
            foreach (match m in regex.matches(imgstring.tolower(), reg))
            {
                matchvale += (m.value).tolower().trim().replace("src=","");
            }
            return (imghttp+matchvale);
        }

        #endregion

        #region 替换通过正则获取字符串所带的正则首尾匹配字符串
        /**********************************
         * 函数名称:regreplace
         * 功能说明:替换通过正则获取字符串所带的正则首尾匹配字符串
         * 参    数:regvalue:要替换的值;regstart:正则匹配的首字符串;regend:正则匹配的尾字符串
         * 调用示例:
         *          getremoteobj o = new getremoteobj();
         *          string url = @"http://www.baidu.com";
         *          string htmlcode = o.getremotehtmlcode(url);
         *          string s = o.regreplace(htmlcode,"<title>","</title>");
         *          response.write(s);
         *          o.dispose();
         * ********************************/
        /// <summary>
        /// 替换通过正则获取字符串所带的正则首尾匹配字符串
        /// </summary>
        /// <param name="regvalue">要替换的值</param>
        /// <param name="regstart">正则匹配的首字符串</param>
        /// <param name="regend">正则匹配的尾字符串</param>
        /// <returns></returns>
        public string regreplace(string regvalue, string regstart,string regend)
        {
            string s = regvalue;
            if (regvalue != "" && regvalue != null)
            {
                if (regstart != "" && regstart != null)
                {
                    s = s.replace(regstart, "");
                }
                if (regend != "" && regend != null)
                {
                    s = s.replace(regend, "");
                }
            }
            return s;
        }
        #endregion


    }
}

 
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表