C#解析PDF

本帖最后由 sunrise2 于 2014-7-21 22:44 编辑

   C#解析PDF的方式有很多，比较好用的有iTextSharp和PdfBox。
      PDF内容页如果是图片类型，例如扫描件，则需要进行OCR（光学字符识别）。
      对于文本内容的PDF文档，我目前仅发现能以字符串的形式读取其内容，无法读取其中的表格。据说PDF文档结构中没有表格的概念，因此自然读不到表格；如果确实如此，则PDF中表格内容的解析，只能对获取到的字符串按照一定的逻辑自行处理了。
   iTextSharp是一个C#开源项目，PdfBox为Java开源项目，借助于IKVM可以在.Net平台下使用。

   Pdf转换Image，使用的是GhostScript，可以以API的方式调用，也可以以Windows命令行的方式调用。
OCR使用的是Asprise，识别效果较好（商业软件），另外还可以使用MS的ImageScanning（2007）或OneNote（2010）（需要依赖Office组件），以及Tesseract（HP->Google）（效果很差）。
iTextSharp辅助类

using System;
using System.Collections.Generic;
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.IO;
namespace eyuan
{
/// <summary>
/// Helper methods for reading PDF files through iTextSharp.
/// Failures are logged via <c>LogHandler.LogWrite</c> and reported to the
/// caller through sentinel return values instead of exceptions.
/// </summary>
public static class ITextSharpHandler
{
    /// <summary>
    /// Extracts the text of every page of a PDF file.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to read.</param>
    /// <returns>
    /// The text of all pages, each page followed by a line break.
    /// Returns <see cref="string.Empty"/> when the file does not exist or cannot be opened.
    /// If extraction fails part-way, the text collected up to that point is returned.
    /// </returns>
    public static string ReadPdf(string fileName)
    {
        PdfReader reader = OpenReader(fileName);
        if (reader == null)
        {
            return string.Empty;
        }

        StringBuilder sbFileContent = new StringBuilder();
        try
        {
            // Page numbers are 1-based.
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
            }
        }
        catch (Exception ex)
        {
            LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
        }
        finally
        {
            reader.Close();
        }

        return sbFileContent.ToString();
    }

    /// <summary>
    /// Gets the number of pages in a PDF file.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to inspect.</param>
    /// <returns>
    /// The page count, or -1 when the file does not exist or cannot be opened.
    /// </returns>
    public static int GetPdfPageCount(string fileName)
    {
        PdfReader reader = OpenReader(fileName);
        if (reader == null)
        {
            return -1;
        }

        try
        {
            return reader.NumberOfPages;
        }
        finally
        {
            // The reader was previously left open on this path.
            reader.Close();
        }
    }

    /// <summary>
    /// Opens a <c>PdfReader</c> for the given file, logging any failure.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to open.</param>
    /// <returns>An open reader the caller must close, or null when the file is missing or fails to load.</returns>
    private static PdfReader OpenReader(string fileName)
    {
        if (!File.Exists(fileName))
        {
            LogHandler.LogWrite(@"指定的PDF文件不存在：" + fileName);
            return null;
        }

        try
        {
            return new PdfReader(fileName);
        }
        catch (Exception ex)
        {
            // No reader exists when the constructor throws, so there is nothing to close here.
            LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
            return null;
        }
    }
}
}

复制代码

PDFBox辅助类

using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace eyuan
{
/// <summary>
/// Helper for extracting text from PDF files with the PDFBox component.
/// </summary>
public static class PdfBoxHandler
{
    /// <summary>
    /// Reads the text content of a PDF file using PDFBox.
    /// </summary>
    /// <param name="input">Path of the PDF file.</param>
    /// <returns>
    /// The extracted text, or null when the file does not exist,
    /// cannot be loaded, or text extraction fails.
    /// </returns>
    public static string ReadPdf(string input)
    {
        // Guard: nothing to do when the file is missing.
        if (!File.Exists(input))
        {
            LogHandler.LogWrite(@"指定的PDF文件不存在：" + input);
            return null;
        }

        // Step 1: load the document.
        PDDocument document;
        try
        {
            document = PDDocument.load(input);
        }
        catch (Exception loadError)
        {
            // load() threw before a document was produced, so there is nothing to close.
            LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", new string[] { input, loadError.ToString() }));
            return null;
        }

        // Step 2: extract the text; the document is closed whether or not this succeeds.
        string extractedText = null;
        try
        {
            PDFTextStripper textStripper = new PDFTextStripper();
            extractedText = textStripper.getText(document);
        }
        catch (Exception parseError)
        {
            LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", new string[] { input, parseError.ToString() }));
        }
        finally
        {
            if (document != null)
            {
                document.close();
            }
        }

        return extractedText;
    }
}
}

复制代码

另外附上PDF转Image，然后对Image进行OCR的代码。

调用示例

// Usage example: render the PDF pages to JPEG with Ghostscript, then run OCR.
// fileName, fileContent and GhostScriptImageName are expected to be declared by the surrounding code.
GhostscriptHandler ghostscriptHandler = new GhostscriptHandler();
// Build a unique output image name from the GhostScriptImageName format string.
string tempJpgFileName = string.Format(GhostScriptImageName, Guid.NewGuid().ToString());
// NOTE(review): GetPdfPageCount returns -1 when the PDF is missing or cannot be opened;
// consider checking for that before converting.
int pdfPageCount = ITextSharpHandler.GetPdfPageCount(fileName);
// Presumably converts pages 1..pdfPageCount to "jpeg" at 100x100 resolution - TODO confirm against GhostscriptHandler.Convert.
ghostscriptHandler.Convert(fileName, tempJpgFileName, 1, pdfPageCount, "jpeg", 100, 100);
// NOTE(review): this passes the original PDF path to the OCR call, not the image written above
// (tempJpgFileName) - confirm which input AspriseOCRHandler.ReadImage expects.
fileContent = AspriseOCRHandler.ReadImage(fileName);

复制代码

hejin67410 · hejin67410

谢谢分享

帐号		自动登录	找回密码
密码			加入黑马

C#解析PDF

1 个回复