首页 > 学院 > 开发设计 > 正文

随手正则写的CSDN【只看楼主】功能

2019-11-14 16:44:33
字体:
来源:转载
供稿:网友

写这段代码的时候没注意到CSDN其实早就有【只看楼主】这个功能了,写完才发现。

现把代码贴出来吧,虽然有很多解析HTML的开源类库如:http://htmlagilitypack.codeplex.com/,但我一直习惯于正则匹配。

截图:

呵呵,起码还能看吧@——#

  // Matches the page-jump <select> element and captures its inner markup in "val".
  private static readonly Regex JumpMenuRegex = new Regex(
      "<select class=\"jumpMenu\" name=\"jumpMenu\">(?<val>.*?)</select>",
      RegexOptions.Singleline | RegexOptions.IgnoreCase);

  // Matches a single <option> element and captures its text in "val".
  private static readonly Regex JumpOptionRegex = new Regex(
      "<option.*?>(?<val>.*?)</option>",
      RegexOptions.Singleline | RegexOptions.IgnoreCase);

  // Matches the title <span> and captures its text in "title".
  private static readonly Regex ArticleTitleRegex = new Regex(
      "<span class=\"title text_overflow\">(?<title>.*?)</span>",
      RegexOptions.Singleline | RegexOptions.IgnoreCase);

  // Matches the author link and captures its text in "value".
  private static readonly Regex AuthorNameRegex = new Regex(
      "<a class=\"p-author\" href=\"#\">(?<value>.*?)</a>");

  /// <summary>
  /// Click handler: downloads the thread at the URL typed into <c>txtCsdnUrl</c>,
  /// extracts the matching author's posts from every page and shows them in
  /// <c>richTextBox1</c>. Shows a message box when the URL box is empty.
  /// </summary>
  /// <param name="sender">Event source (unused).</param>
  /// <param name="e">Event data (unused).</param>
  private void button1_Click(object sender, EventArgs e)
  {
      string url = txtCsdnUrl.Text.Trim();
      if (string.IsNullOrEmpty(url))
      {
          MessageBox.Show("请输入地址");
          return;
      }

      string htmlSource = GetHtmlSource(url);
      int pageCount = GetPageCount(htmlSource);
      StringBuilder context = new StringBuilder();

      if (pageCount > 1)
      {
          // Multi-page thread: fetch every page through the "?page=N" query string.
          for (int i = 1; i <= pageCount; i++)
          {
              context.Append(GetLZArticle(GetHtmlSource(url + "?page=" + i)));
          }
      }
      else
      {
          // Single page (or no page menu found): the page already downloaded is enough.
          context.Append(GetLZArticle(htmlSource));
      }

      richTextBox1.Text = context.ToString();
  }

  /// <summary>
  /// Downloads the resource at <paramref name="Url"/> and returns its body decoded as UTF-8.
  /// </summary>
  /// <param name="Url">Address to download.</param>
  /// <returns>The full response body as a string.</returns>
  public string GetHtmlSource(string Url)
  {
      // WebClient and the response stream are both IDisposable; release them deterministically.
      using (WebClient client = new WebClient())
      {
          client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");

          using (Stream data = client.OpenRead(Url))
          using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
          {
              return reader.ReadToEnd();
          }
      }
  }

  /// <summary>
  /// Reads the total page count of a thread from the last option of its page-jump menu.
  /// URL format: http://bbs.csdn.net/topics/390730011?page=2
  /// </summary>
  /// <param name="HtmlSource">HTML of a thread page.</param>
  /// <returns>
  /// The number parsed from the last <c>&lt;option&gt;</c> of the jump menu, or 0 when
  /// the menu is missing, has no options, or the last option is not an integer.
  /// </returns>
  public int GetPageCount(string HtmlSource)
  {
      string menuHtml = JumpMenuRegex.Match(HtmlSource).Groups["val"].Value;
      MatchCollection options = JumpOptionRegex.Matches(menuHtml);

      // No jump menu on the page: report 0 instead of indexing at -1.
      if (options.Count == 0)
      {
          return 0;
      }

      int pageCount;
      int.TryParse(options[options.Count - 1].Groups["val"].Value, out pageCount);
      return pageCount;
  }

  /// <summary>
  /// Extracts the article title from a thread page.
  /// </summary>
  /// <param name="HtmlSource">HTML of a thread page.</param>
  /// <returns>
  /// Text of the first <c>&lt;span class="title text_overflow"&gt;</c>, or an empty
  /// string when there is none.
  /// </returns>
  public string GetArticleTitle(string HtmlSource)
  {
      return ArticleTitleRegex.Match(HtmlSource).Groups["title"].Value;
  }

  /// <summary>
  /// Extracts the author name from a thread page.
  /// </summary>
  /// <param name="HtmlSource">HTML of a thread page.</param>
  /// <returns>
  /// Text of the first <c>&lt;a class="p-author" href="#"&gt;</c>, or an empty string
  /// when there is none.
  /// </returns>
  public string GetAuthorName(string HtmlSource)
  {
      return AuthorNameRegex.Match(HtmlSource).Groups["value"].Value;
  }

  /// <summary>
  /// Collects the bodies of all posts on the page whose <c>data-username</c> equals the
  /// name returned by <see cref="GetAuthorName"/> (presumably the thread starter).
  /// </summary>
  /// <param name="HtmlSource">HTML of a thread page.</param>
  /// <returns>
  /// The matching post bodies, each followed by a separator line, trimmed, with
  /// <c>&lt;br /&gt;</c> replaced by CR LF. Empty when nothing matches.
  /// </returns>
  public string GetLZArticle(string HtmlSource)
  {
      string authorName = GetAuthorName(HtmlSource);

      // The name is page content: escape it so regex metacharacters in it are matched literally.
      Regex regex = new Regex(
          "<td valign=\"top\" class=\"post_info .*?\" data-username=\"" + Regex.Escape(authorName) + "\".*?>.*?<div class=\"post_body\">(?<value>.*?)</div>.*?</td>",
          RegexOptions.Singleline | RegexOptions.IgnoreCase);

      StringBuilder result = new StringBuilder();

      // Run the match once and walk the results, rather than re-matching per iteration.
      foreach (Match post in regex.Matches(HtmlSource))
      {
          result.Append(post.Groups["value"].Value);
          result.Append("--------------------分隔线--------------------");
      }

      return result.ToString().Trim().Replace("<br />", "\r\n");
  }

代码都在这里了。


发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表