C#解析PDF - 黑马程序员技术交流社区 - 黑马程序员IT技术论坛

using System;
using System.Collections.Generic;
using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.IO;
namespace eyuan
{
/// <summary>
/// PDF helpers built on iTextSharp: plain-text extraction and page counting.
/// Failures are reported through LogHandler.LogWrite and signalled to the
/// caller with a sentinel return value instead of an exception.
/// </summary>
public static class ITextSharpHandler
{
    /// <summary>
    /// Reads the text content of every page of a PDF file.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to read.</param>
    /// <returns>
    /// The extracted text, with a line terminator appended after each page.
    /// An empty string when the file does not exist or cannot be opened.
    /// If extraction fails part-way, the error is logged and the text
    /// gathered from the pages processed so far is returned.
    /// </returns>
    public static string ReadPdf(string fileName)
    {
        PdfReader reader = OpenReader(fileName);
        if (reader == null)
        {
            return string.Empty;
        }

        StringBuilder sbFileContent = new StringBuilder();
        try
        {
            // iTextSharp page numbers are 1-based.
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
            }
        }
        catch (Exception ex)
        {
            // Best effort: log and fall through so the partial text is still returned.
            LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
        }
        finally
        {
            reader.Close();
        }

        return sbFileContent.ToString();
    }

    /// <summary>
    /// Gets the number of pages in a PDF file.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to inspect.</param>
    /// <returns>
    /// The page count, or -1 when the file does not exist or cannot be opened.
    /// </returns>
    public static int GetPdfPageCount(string fileName)
    {
        PdfReader reader = OpenReader(fileName);
        if (reader == null)
        {
            return -1;
        }

        try
        {
            return reader.NumberOfPages;
        }
        finally
        {
            // The reader must be closed on the success path too; otherwise
            // every call leaves the underlying file open.
            reader.Close();
        }
    }

    /// <summary>
    /// Opens a PdfReader for the given path, logging the reason on failure.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to open.</param>
    /// <returns>
    /// An open reader that the caller must close, or null when the file
    /// does not exist or the reader could not be constructed.
    /// </returns>
    private static PdfReader OpenReader(string fileName)
    {
        if (!File.Exists(fileName))
        {
            LogHandler.LogWrite(@"指定的PDF文件不存在：" + fileName);
            return null;
        }

        try
        {
            return new PdfReader(fileName);
        }
        catch (Exception ex)
        {
            // No reader instance exists when the constructor throws, so there
            // is nothing to close here.
            LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
            return null;
        }
    }
}
}

复制代码

using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace eyuan
{
/// <summary>
/// PDF text extraction built on the PDFBox component.
/// </summary>
public static class PdfBoxHandler
{
    /// <summary>
    /// Parses a PDF file with PDFBox and returns its text.
    /// </summary>
    /// <param name="input">Path of the PDF file.</param>
    /// <returns>
    /// The text content of the PDF, or null when the file does not exist,
    /// cannot be loaded, or cannot be parsed (each failure is logged).
    /// </returns>
    public static string ReadPdf(string input)
    {
        // Guard: nothing to do for a missing file.
        if (!File.Exists(input))
        {
            LogHandler.LogWrite(@"指定的PDF文件不存在：" + input);
            return null;
        }

        // Step 1: load the document.
        PDDocument document;
        try
        {
            document = PDDocument.load(input);
        }
        catch (Exception loadError)
        {
            // The assignment above never completed, so no document needs closing.
            LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", input, loadError.ToString()));
            return null;
        }

        // Step 2: extract the text, always releasing the document afterwards.
        string extractedText = null;
        try
        {
            extractedText = new PDFTextStripper().getText(document);
        }
        catch (Exception parseError)
        {
            LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", input, parseError.ToString()));
        }
        finally
        {
            if (document != null)
            {
                document.close();
            }
        }

        return extractedText;
    }
}
}

复制代码

// Fallback for PDFs without extractable text: render the pages to images, then run OCR.
GhostscriptHandler ghostscriptHandler = new GhostscriptHandler();
string tempJpgFileName = string.Format(GhostScriptImageName, Guid.NewGuid().ToString());
// GetPdfPageCount returns -1 when the file is missing or cannot be opened.
// NOTE(review): that value is passed straight to Convert below — confirm it is handled.
int pdfPageCount = ITextSharpHandler.GetPdfPageCount(fileName);
// Presumably renders pages 1..pdfPageCount as JPEG into tempJpgFileName — confirm the argument meanings against GhostscriptHandler.
ghostscriptHandler.Convert(fileName, tempJpgFileName, 1, pdfPageCount, "jpeg", 100, 100);
// NOTE(review): ReadImage receives the PDF path (fileName), and tempJpgFileName is not used
// after Convert — confirm whether the generated image path was intended here.
fileContent = AspriseOCRHandler.ReadImage(fileName);

复制代码