1.PDF中文本字符串格式中关键值信息抓取(已完成)
简介:这种解析方式比较传统,也最简单,主要是熟练使用正则表达式(Regular Expression)做语义识别和验证。例如抓取下面红色圈内的关键信息:
// Usage example: run the meeting-date extraction once.
string mettingData = GetMeetingData();

/// <summary>
/// Searches every page of the PDF for text that looks like a meeting announcement
/// containing a Chinese-formatted date, then narrows the hits down with a list of
/// progressively less specific date/time patterns.
/// </summary>
/// <returns>
/// The value produced by <c>GetMeetingDateFilter</c>: the matched date/time text,
/// or a single space when nothing valid was found.
/// </returns>
public string GetMeetingData()
{
    // Coarse pattern: "会议" followed within a few characters by "<year>年<month>月<day>日",
    // plus up to 15 trailing characters so that a time of day is captured as well.
    // (Escapes are backslashes; forward slashes would match a literal '/'.)
    string patternAll = @"(?<NDAandCAMDate>会\s*议\s*.{2,15}\d{2,4}\s*年\s*\d{1,2}\s*月\s*\d{1,2}\s*日.{0,15})";

    // Shared building blocks of the fine-grained patterns.
    const string datePart = @"(?<year>\d{2,4})年(?<month>\d{1,2})月(?<day>\d{1,2})日";
    // Optional weekday in parentheses, e.g. "(星期三)" or "(周日)". Half-width and
    // full-width parentheses are both accepted, and Sunday ("日"/"天") is included.
    const string weekdayPart = @"((\(|（)(星期|周)(一|二|三|四|五|六|七|日|天)(\)|）))?";
    const string morningHour = @"(上午)?(?<hour>\d{1,2})";
    const string afternoonHour = @"下午(?<hour>\d{1,2})";

    // Ordered from most to least specific: the first pattern that matches wins.
    List<string> patternFilter = new List<string>
    {
        datePart + weekdayPart + morningHour + @"(\:|点|时)(?<minute>\d{1,2})",
        datePart + weekdayPart + afternoonHour + @"(\:|点|时)(?<minute>\d{1,2})",
        datePart + weekdayPart + morningHour + @"点半",
        datePart + weekdayPart + afternoonHour + @"点半",
        datePart + weekdayPart + morningHour + @"(点|时)",
        datePart + weekdayPart + afternoonHour + @"(点|时)",
        datePart,
    };

    PdfAnalyzer pa = new PdfAnalyzer();
    PDFNet.Initialize();

    // NOTE(review): `item` is not declared in the visible code; it is assumed to be
    // the PDF file path supplied by the enclosing scope.
    using (PDFDoc doc = new PDFDoc(item))
    {
        doc.InitSecurityHandler();
        List<PdfString> foundAll = pa.RegexSearchAllPages(doc, patternAll);

        // Pass the list of filter patterns (not the coarse search pattern).
        return GetMeetingDateFilter(foundAll, patternFilter);
    }
}

/// <summary>
/// Applies each filter pattern, in order, to every candidate string (with spaces
/// removed) and returns the first match accepted by <c>IsValid</c>.
/// </summary>
/// <param name="foundAll">Candidate strings found by the coarse search.</param>
/// <param name="patternAll">Regex patterns to try, most specific first.</param>
/// <returns>The matched text, or a single space when no candidate yields a valid match.</returns>
private string GetMeetingDateFilter(List<PdfString> foundAll, List<string> patternAll)
{
    const string notFound = " ";
    if (foundAll == null || patternAll == null)
    {
        return notFound;
    }

    // Build each Regex once instead of once per candidate string.
    List<Regex> filters = new List<Regex>(patternAll.Count);
    foreach (string pattern in patternAll)
    {
        filters.Add(new Regex(pattern));
    }

    foreach (PdfString pdfString in foundAll)
    {
        // PDF text extraction may put spaces between glyphs; strip them before matching.
        string candidate = pdfString.ToString().Replace(" ", "");

        foreach (Regex filter in filters)
        {
            Match ma = filter.Match(candidate);
            if (ma.Success && IsValid(ma))
            {
                // NOTE(review): returns the raw matched text; confirm whether callers
                // expect a normalized date built from the year/month/day/hour/minute groups.
                return ma.Value;
            }
        }
    }

    return notFound;
}
注解:
a.第一次通过 pa.RegexSearchAllPages(doc, patternAll); 搜索所有关于时间数据的信息
b.第二次通过正则匹配获取带有关键词信息Meeting Data
2.PDF类似表格形式关键值数据抓取。(已完成)
简介:这种格式需要用到封装好的数据结构 PdfString 类和 PdfAnalyzer 类,根据给定关键词在指定范围内提取数据,例如提取下面的数据。
/// <summary>
/// Extracts a percentage value from a table-like PDF by position: it takes the x
/// coordinate of the first <paramref name="ricCode"/> hit and the y coordinate of the
/// first "Premium"/"premium" hit, then returns the first percentage whose position is
/// within 10 units of that point on both axes.
/// </summary>
/// <param name="path">Path of the PDF file to analyse.</param>
/// <param name="ricCode">
/// Search pattern for the code. It is passed to the regex search unescaped, so regex
/// metacharacters in it (e.g. '.') are interpreted as such.
/// </param>
/// <returns>
/// The matched percentage without the '%' sign, or an empty string when a search term
/// is missing, no percentage is close enough, or an exception occurs (all cases are logged).
/// </returns>
private string GetPremium(string path, string ricCode)
{
    // Maximum distance, per axis, between the anchor point and an accepted value.
    const int positionTolerance = 10;

    string result = string.Empty;
    try
    {
        PDFNet.Initialize();

        // `new` never yields null, so no null check is needed; failures to open the
        // file surface as exceptions and are handled by the catch block below.
        using (PDFDoc doc = new PDFDoc(path))
        {
            doc.InitSecurityHandler();

            PdfAnalyzer pa = new PdfAnalyzer();
            List<PdfString> listX1 = pa.RegexSearchAllPages(doc, ricCode);
            // [Pp] rather than [P|p]: inside a character class '|' is a literal character.
            List<PdfString> listY1 = pa.RegexSearchAllPages(doc, @"[Pp]remium");
            // A decimal number followed by a percent sign, e.g. "12.34%".
            List<PdfString> listResult = pa.RegexSearchAllPages(doc, @"(?<Result>\d+\.\d+%)");

            if (listX1.Count == 0 || listY1.Count == 0 || listResult.Count == 0)
            {
                string msg = string.Format("({0}),([P|p]remium) exist missing value ,so Gearing is empty value.", ricCode);
                Logger.Log(msg, Logger.LogType.Warning);
                return result;
            }

            // Anchor point: x of the first code hit, y of the first "Premium" hit.
            int x1 = System.Convert.ToInt32(listX1[0].Position.x1);
            int y1 = System.Convert.ToInt32(listY1[0].Position.y1);

            foreach (var item in listResult)
            {
                int deltaX = Math.Abs(x1 - System.Convert.ToInt32(item.Position.x1));
                int deltaY = Math.Abs(y1 - System.Convert.ToInt32(item.Position.y1));

                if (deltaX <= positionTolerance && deltaY <= positionTolerance)
                {
                    return item.ToString().Replace("%", "");
                }
            }

            Logger.Log(string.Format("stock code:{0},extract premium failed .", ricCode), Logger.LogType.Error);
            return result;
        }
    }
    catch (Exception ex)
    {
        // ricCode is passed as a format argument (not concatenated into the format
        // string) so that braces in it cannot cause a FormatException here.
        string msg = string.Format("PDF analysis failed for {0}! Action: Need manually input gearing and premium \r\n error msg:{1}", ricCode, ex.Message);
        Logger.Log(msg, Logger.LogType.Warning);
        return result;
    }
}
3.需要PDF中大量数据转换到Excel中去 (已完成)
简介:基于2的延伸,加入自动模糊匹配行和列的边界范围,再根据位置坐标排序,提取出正确的数据信息。如图:
/// <summary>
/// Extracts tabular values from the PDF at <c>config.FilePath1</c> and writes them to a
/// CSV file named after today's date in <c>config.OutputFolder</c>, replacing any
/// existing file of the same name. Returns without output when the "コード" title
/// cannot be located.
/// </summary>
private void StartExtractFile()
{
    PDFNet.Initialize();

    using (PDFDoc doc = new PDFDoc(config.FilePath1))
    {
        doc.InitSecurityHandler();

        // Locate the column title on the given page; its position anchors the extraction.
        string patternTitle = @"コード";
        int page = 3;
        PdfString ricPosition = GetRicPosition(doc, patternTitle, page);
        if (ricPosition == null)
        {
            return;
        }

        // Four consecutive digits for the code column.
        string patternRic = @"\d{4}";
        // Optionally signed number that may contain ',' and '.' separators.
        string patternValue = @"(\-|\+)?\d+(\,|\.|\d)+";
        List<LineFound> bulkFile = GetValue(doc, ricPosition, patternRic, patternValue);

        int indexOK = 0;
        List<List<string>> bulkFileFilter = FilterBulkFile(bulkFile, indexOK);

        string filePath = Path.Combine(config.OutputFolder, string.Format("Type1ExtractedFromPdf{0}.csv", DateTime.Now.ToString("dd-MM-yyyy")));
        if (File.Exists(filePath))
        {
            File.Delete(filePath);
        }

        XlsOrCsvUtil.GenerateStringCsv(filePath, bulkFileFilter);
        AddResult(Path.GetFileNameWithoutExtension(filePath), filePath, "type1");
    }
}

// NOTE(review): the remainder of this method is cut off in the source text; the
// visible part is reproduced unchanged.
private List<List<string>> FilterBulkFile(List<LineFound> bulkFile, int indexOK) { List<List<string>> result = new List<List<string>>(); if (bulkFile == null || bulkFile.Count == 0) { Logger.Log("no value data extract from pdf"); return
新闻热点
疑难解答