本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:
使用方法:创建 HtmlToText 的实例并调用其 Convert 方法即可,例如:string text = new HtmlToText().Convert(html);
C#代码如下:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// Block-level tags are translated to line breaks (table cells to tabs),
/// the content of script, noscript, style and object elements is dropped,
/// runs of whitespace are collapsed outside of &lt;pre&gt; sections, and
/// HTML entities are decoded in the final result.
/// <see cref="Convert"/> keeps its parsing state in instance fields, so a
/// single instance must not be used by concurrent callers.
/// </remarks>
class HtmlToText
{
    // Static data tables
    // Maps a (lower-case) tag name to the text written when that tag is seen.
    protected static Dictionary<string, string> _tags;
    // Tags whose entire inner content is discarded.
    protected static HashSet<string> _ignoreTags;
    // Ignored tags whose body is raw text rather than markup; their end tag
    // is located by a plain search instead of by parsing nested tags.
    private static readonly HashSet<string> _rawTextTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };

        _rawTextTags = new HashSet<string>
        {
            "script",
            "style"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated
    /// as an empty string.</param>
    /// <returns>Resulting plain text</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last, so decoded characters are never
        // mistaken for markup or collapsed as whitespace.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the tag at the current position
    /// and returns information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true if a '/' was found after the
    /// tag name and outside of a quoted attribute value.</param>
    /// <returns>The lower-case tag name, including a leading '/' for end
    /// tags, or an empty string if the current character is not '&lt;'.</returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }
            // Invariant lower-casing keeps the table lookups independent of
            // the current culture (e.g. Turkish dotless i).
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }
            // Skip the closing '>'
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the given tag, up to and including its
    /// matching end tag (or to the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the tag that was just opened.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (_rawTextTags.Contains(tag))
        {
            // Script and style bodies are not markup: text such as
            // "if (a<b)" must not be parsed as a tag, so search for the
            // end tag directly.
            int end = _html.IndexOf("<" + endTag, _pos, StringComparison.OrdinalIgnoreCase);
            if (end < 0)
            {
                _pos = _html.Length;
                return;
            }
            _pos = end;
            bool ignored;
            ParseTag(out ignored);
            return;
        }

        // Track nesting of the same element only; other nested tags are
        // simply skipped and must not start a new search for the end tag.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (inner == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    /// <summary>
    /// Returns true if the current position is at the end of the string.
    /// </summary>
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    /// <summary>
    /// Safely returns the character at the current position, or the NUL
    /// character when the position is at the end of the input.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character; the
    /// position never moves past the end of the input.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or
    /// the start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. Does nothing if the
    /// current character is not a single or double quote.
    /// </summary>
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value; a line break also ends the value so that an
            // unterminated quote cannot swallow the rest of the document.
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead(); // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write, one character at a time</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line; after trimming, a line holding only
            // whitespace is empty
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at the
                // very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine();
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any pending
        /// partial line first.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
希望本文所述对大家的C#程序设计有所帮助。
新闻热点
疑难解答