首页 > 编程 > C# > 正文

C#使用iTextSharp将PDF转成文本的方法

2020-01-24 01:51:37
字体:
来源:转载
供稿:网友

本文实例讲述了C#使用iTextSharp将PDF转成文本的方法。分享给大家供大家参考。具体实现方法如下:

using System;
using System.Diagnostics;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

/// <summary>
/// Command-line example that writes every string token found in the page
/// content streams of a PDF file to a text file, one token per line.
/// </summary>
public class ParsingPDF
{
    /// <summary>
    /// Parses the PDF using <c>PRTokeniser</c> and writes each string token
    /// of every page to the destination file.
    /// </summary>
    /// <param name="src">The path to the original PDF file.</param>
    /// <param name="dest">The path to the resulting text file (created or overwritten).</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="src"/> or <paramref name="dest"/> is null.
    /// </exception>
    public void parsePdf(String src, String dest)
    {
        if (src == null)
        {
            throw new ArgumentNullException("src");
        }
        if (dest == null)
        {
            throw new ArgumentNullException("dest");
        }

        // Open the PDF first so that a bad input file does not leave an
        // empty output file behind.
        PdfReader reader = new PdfReader(src);
        try
        {
            // The using block flushes and closes the writer (and its underlying
            // FileStream) even when tokenising throws part-way through.
            using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
            {
                int pageCount = reader.NumberOfPages;

                // Pages are numbered starting at 1.
                for (int pg = 1; pg <= pageCount; pg++)
                {
                    // We can inspect the syntax of the imported page.
                    byte[] streamBytes = reader.GetPageContent(pg);
                    PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                    while (tokenizer.NextToken())
                    {
                        // Only string tokens are emitted; every other token
                        // type in the content stream is skipped.
                        if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                        {
                            output.WriteLine(tokenizer.StringValue);
                        }
                    }
                }
            }
        }
        finally
        {
            // Release the reader's handle on the source file.
            reader.Close();
        }
    }

    /// <summary>
    /// Main method. Expects the input PDF path and, optionally, the output
    /// text path. When the output path is omitted, the input file name with
    /// a ".txt" extension is used (relative to the current directory).
    /// </summary>
    /// <param name="args">Command-line arguments: <c>infile.pdf [outfile.txt]</c>.</param>
    static void Main(string[] args)
    {
        if (args.Length < 1 || args.Length > 2)
        {
            Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
            return;
        }

        string pdfPath = args[0];
        string textPath = args.Length == 2
            ? args[1]
            : Path.GetFileNameWithoutExtension(pdfPath) + ".txt";

        try
        {
            // Stopwatch is monotonic; DateTime.Now can jump with clock changes.
            Stopwatch timer = Stopwatch.StartNew();
            ParsingPDF example = new ParsingPDF();
            example.parsePdf(pdfPath, textPath);
            timer.Stop();
            Console.WriteLine("Parsing completed in {0:0.00} seconds.", timer.Elapsed.TotalSeconds);
        }
        catch (Exception ex)
        {
            // Top-level handler: report the failure and signal it to the caller
            // through a non-zero process exit code.
            Console.WriteLine("ERROR: " + ex.Message);
            Environment.ExitCode = 1;
        }
    }

    /// <summary>
    /// Render listener that writes the text it is handed to a
    /// <see cref="StreamWriter"/>, wrapping each text chunk and each text
    /// block in angle brackets. Not used by <see cref="parsePdf"/>.
    /// </summary>
    public class MyTextRenderListener : IRenderListener
    {
        /// <summary>The writer to which the information will be written.</summary>
        protected StreamWriter output;

        /// <summary>
        /// Creates a render listener that will look for text.
        /// </summary>
        /// <param name="output">The writer that receives the extracted text.</param>
        public MyTextRenderListener(StreamWriter output)
        {
            this.output = output;
        }

        /// <summary>Writes an opening "&lt;" when a text block begins.</summary>
        public void BeginTextBlock()
        {
            output.Write("<");
        }

        /// <summary>Writes a closing "&gt;" and a line break when a text block ends.</summary>
        public void EndTextBlock()
        {
            output.WriteLine(">");
        }

        /// <summary>Images are ignored.</summary>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
        }

        /// <summary>Writes the text of the chunk wrapped in "&lt;" and "&gt;".</summary>
        public void RenderText(TextRenderInfo renderInfo)
        {
            output.Write("<");
            output.Write(renderInfo.GetText());
            output.Write(">");
        }
    } // class MyTextRenderListener
} // class ParsingPDF

希望本文所述对大家的C#程序设计有所帮助。

发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表