首页 > 编程 > .NET > 正文

C#中过滤HTML的正则表达式

2024-07-10 12:40:29
字体:
来源:转载
供稿:网友

实现代码

/// <summary>
/// Removes HTML markup from a string and returns the remaining text, HTML-encoded.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup.</param>
/// <returns>
/// The input with script blocks, tags and comment remnants removed, a handful of
/// common entities decoded, all other numeric entities dropped, any remaining
/// angle brackets and CR/LF pairs stripped, and the result HTML-encoded and trimmed.
/// Returns an empty string for null or empty input.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.IgnoreCase;

    // Remove script blocks. Singleline lets '.' cross line breaks so that
    // scripts spanning several lines are removed as well.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", options | RegexOptions.Singleline);

    // Remove HTML tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", options);

    // Collapse a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", options);

    // Remove what is left of HTML comments that the tag pattern did not consume.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", options);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", options);

    // Decode the common named / numeric entities.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "  ", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", options);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", options);

    // Drop every other numeric entity.
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", options);

    // string.Replace returns a new string; the result has to be assigned back.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // WebUtility.HtmlEncode does not need an active HttpContext, so the method
    // also works outside of an ASP.NET request.
    return System.Net.WebUtility.HtmlEncode(Htmlstring).Trim();
}

C#过滤HTML标签及空格

/// <summary>
/// Removes every <c>&lt;...&gt;</c> tag and every character matched by the
/// pattern's second alternative (a space) from the input.
/// </summary>
/// <param name="HTMLStr">Text to filter; may be null or empty.</param>
/// <returns>The filtered text, or an empty string when the input is null or empty.</returns>
public static string FilterHTML(string HTMLStr)
{
    // Guard first: nothing to filter.
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return "";
    }

    const string tagOrSpace = "<[^>]*>| ";
    string filtered = System.Text.RegularExpressions.Regex.Replace(HTMLStr, tagOrSpace, "");
    return filtered;
}

写一个静态方法移除HTML标签

#region
/// <summary>
/// Strips HTML tags from the given text.
/// </summary>
/// <param name="HTMLStr">Text that may contain HTML tags.</param>
/// <returns>The text with every <c>&lt;...&gt;</c> sequence removed.</returns>
public static string ParseTags(string HTMLStr)
{
    // Anything from '<' up to the next '>' counts as a tag.
    const string tagPattern = "<[^>]*>";
    string withoutTags = System.Text.RegularExpressions.Regex.Replace(HTMLStr, tagPattern, "");
    return withoutTags;
}
#endregion

取出文本中的图片地址

#region
/// <summary>
/// Extracts the address of the first image referenced in the text.
/// </summary>
/// <param name="HTMLStr">Text that may contain <c>&lt;img&gt;</c> tags; may be null or empty.</param>
/// <returns>
/// The <c>src</c> attribute value of the first <c>&lt;img&gt;</c> tag, with its original
/// casing and without surrounding quotes, or an empty string when no such tag is found.
/// </returns>
public static string GetImgUrl(string HTMLStr)
{
    if (string.IsNullOrEmpty(HTMLStr))
    {
        return string.Empty;
    }

    // src must be preceded by whitespace so that attributes merely ending in
    // "src" (e.g. data-src) are not picked up. The value may be double-quoted,
    // single-quoted or unquoted; the quotes are never part of the captured url.
    const string imgSrcPattern =
        @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))";

    // Match case-insensitively instead of lower-casing the input, which would
    // corrupt case-sensitive URLs. The static Match overload uses the regex cache,
    // so the pattern is not rebuilt on every call.
    Match m = Regex.Match(HTMLStr, imgSrcPattern, RegexOptions.IgnoreCase);
    return m.Success ? m.Groups["url"].Value : string.Empty;
}
#endregion
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表