Thursday, March 19, 2009

PDF to text

This approach requires the full version of Adobe Acrobat to be installed on your PC so that you can access the Adobe APIs (which means it doesn't technically qualify as a free way to do it). Here is the code I used to read the contents of a PDF. You will have to add a reference to the Adobe APIs in your project:

Dim objPDFPage As AcroPDPage
Dim objPDFDoc As New AcroPDDoc
Dim objPDFAVDoc As AcroAVDoc
Dim objAcroApp As AcroApp
Dim objPDFRectTemp As Object
Dim objPDFRect As New AcroRect
Dim lngTextRangeCount As Long
Dim objPDFTextSelection As AcroPDTextSelect
Dim temptextcount As Long
Dim strText As String
Dim lngPageCount As Long
Dim Fora As Long
lngPageCount = objPDFDoc.GetNumPages
For Fora = 0 To lngPageCount - 1
objPDFPage = objPDFDoc.AcquirePage(Fora)
objPDFRectTemp = objPDFPage.GetSize
objPDFRect.Left = 0
objPDFRect.right = objPDFRectTemp.x
objPDFRect.Top = objPDFRectTemp.y
objPDFRect.bottom = 0
' objPDFTextSelection = objPDFDoc.CreateTextSelect(lngPageCount, objPDFRect)
objPDFTextSelection = objPDFDoc.CreateTextSelect(Fora, objPDFRect)
' Get The Text Of The Range
temptextcount = objPDFTextSelection.GetNumText
For lngTextRangeCount = 1 To objPDFTextSelection.GetNumText
doctextdoctext = doctext & objPDFTextSelection.GetText(lngTextRangeCount - 1)
doctextdoctext = doctext & vbCrLf
doctype = "PDF"

