Text Extraction in PDFs
How to Extract Text from PDF Documents
Develop digital document solutions by integrating PDF data-extraction functionality into large-scale enterprise workflows and applications with the Adobe PDF Library.
Get Free Trial
C++
C#
Java
C++
#include "TextExtract.h"
#include <cstddef>
#include <cstring>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
static void EnumerateAcroFormField(CosObj fieldObj, std::string prefix, std::vector &returnText);
//==============================================================================================================================
// TextExtract(PDDoc) - Creates a TextExtract object for the given document.
//
// inPDoc is stored as-is; this constructor neither opens nor closes it.
// The WordFinder configuration is filled in before the WordFinder is created, because
// PDDocCreateWordFinderEx() is handed a pointer to wfConfig.
//==============================================================================================================================
TextExtract::TextExtract(PDDoc inPDoc) {
    pDoc = inPDoc;
    // This has to be a call. Written as "void SetupWordFinderParams();" it is only a local
    // function declaration, and wfConfig would reach PDDocCreateWordFinderEx() uninitialized.
    SetupWordFinderParams();
    wordFinder = PDDocCreateWordFinderEx(pDoc, WF_LATEST_VERSION, true, &wfConfig);
}
//==============================================================================================================================
// ~TextExtract() - Destroys the WordFinder held by this object, unless the handle is null.
//==============================================================================================================================
TextExtract::~TextExtract() {
    // Nothing to release when no WordFinder handle is held.
    if (wordFinder == nullptr) {
        return;
    }
    PDWordFinderDestroy(wordFinder);
}
//==============================================================================================================================
// SetupWordFinderParams() - Fills wfConfig with the settings used to create the WordFinder.
//
// The record is zeroed, its size is recorded in recSize, and then every option is assigned explicitly
// (all false / NULL / 0) so that each one can be changed individually to suit the caller's needs.
//==============================================================================================================================
void TextExtract::SetupWordFinderParams() {
    // Start from an all-zero record and stamp its own size into it.
    memset(&wfConfig, 0, sizeof(PDWordFinderConfigRec));
    wfConfig.recSize = sizeof(PDWordFinderConfigRec);

    // Optional lookup tables: none supplied.
    wfConfig.decomposeTbl = NULL;
    wfConfig.decomposeTblSize = 0;
    wfConfig.charTypeTbl = NULL;
    wfConfig.charTypeTblSize = 0;

    // Document structure, ordering and gap handling.
    wfConfig.disableTaggedPDF = false;
    wfConfig.noXYSort = false;
    wfConfig.ignoreCharGaps = false;
    wfConfig.ignoreLineGaps = false;
    wfConfig.noAnnots = false;

    // Character and encoding handling.
    wfConfig.preserveSpaces = false;
    wfConfig.noLigatureExp = false;
    wfConfig.noEncodingGuess = false;
    wfConfig.unknownToStdEnc = false;
    wfConfig.noHyphenDetection = false;
    wfConfig.trustNBSpace = false;
    wfConfig.preserveRedundantChars = false;
    wfConfig.disableCharReordering = false;

    // Extra per-word information and geometry.
    wfConfig.noExtCharOffset = false;
    wfConfig.noStyleInfo = false;
    wfConfig.noSkewedQuads = false;
    wfConfig.noTextRenderMode3 = false;
    wfConfig.preciseQuad = false;
}
//==============================================================================================================================
// GetText() - Gets the text for the entire document.
//==============================================================================================================================
std::vector TextExtract::GetText() {
std::vector returnText;
ASInt32 numPages = PDDocGetNumPages(pDoc);
for (ASInt32 pageNum = 0; pageNum < numPages; ++pageNum) {
std::vector pageText = GetText(pageNum);
returnText.insert(returnText.end(), std::make_move_iterator(pageText.begin()),
std::make_move_iterator(pageText.end()));
}
return returnText;
}
//==============================================================================================================================
// GetText() - Gets the text on a specified page.
//==============================================================================================================================
std::vector TextExtract::GetText(ASInt32 pageNum) {
std::vector returnText;
PDWordFinderAcquireWordList(wordFinder, pageNum, &wordArray, NULL, NULL, &numWords);
for (ASInt32 wordNum = 0; wordNum < numWords; ++wordNum) {
PDWord pdWord = PDWordFinderGetNthWord(wordFinder, wordNum);
ASText asTextWord = ASTextNew();
PDWordGetASText(pdWord, 0, asTextWord);
// Get the endian neutral UTF-8 string.
ASUTF8Val *utf8String = reinterpret_cast(ASTextGetUnicodeCopy(asTextWord, kUTF8));
PDTextExtractRec record;
record.text = reinterpret_cast(utf8String);
ASTextDestroy(asTextWord);
ASfree(utf8String);
returnText.emplace_back(record);
}
PDWordFinderReleaseWordList(wordFinder, pageNum);
return returnText;
}
//==============================================================================================================================
// GetTextAndDetails() - Gets the text and detail info for the entire document.
//==============================================================================================================================
std::vector TextExtract::GetTextAndDetails() {
std::vector returnText;
ASInt32 numPages = PDDocGetNumPages(pDoc);
for (ASInt32 pageNum = 0; pageNum < numPages; ++pageNum) {
std::vector pageText = GetTextAndDetails(pageNum);
returnText.insert(returnText.end(), std::make_move_iterator(pageText.begin()),
std::make_move_iterator(pageText.end()));
}
return returnText;
}
//==============================================================================================================================
// GetTextAndDetails() - Gets the text and detail info for a specific page.
//==============================================================================================================================
std::vector TextExtract::GetTextAndDetails(ASInt32 pageNum) {
std::vector returnText;
PDWordFinderAcquireWordList(wordFinder, pageNum, &wordArray, NULL, NULL, &numWords);
for (ASInt32 wordNum = 0; wordNum < numWords; ++wordNum) {
PDWord pdWord = PDWordFinderGetNthWord(wordFinder, wordNum);
ASText asTextWord = ASTextNew();
PDWordGetASText(pdWord, 0, asTextWord);
// Get the endian neutral UTF-8 string.
ASUTF8Val *utf8String = reinterpret_cast(ASTextGetUnicodeCopy(asTextWord, kUTF8));
PDTextAndDetailsExtractRec record;
record.text = reinterpret_cast(utf8String);
ASTextDestroy(asTextWord);
ASfree(utf8String);
ASInt32 numQuads = PDWordGetNumQuads(pdWord);
// A Word typically has only 1 quad, but can have more than one for hyphenated words, words on a curve, etc.
for (ASInt32 quadNum = 0; quadNum < numQuads; ++quadNum) {
ASFixedQuad wordQuad;
PDWordGetNthQuad(pdWord, quadNum, &wordQuad);
DLQuadFloat floatQuad;
floatQuad.bl.h = ASFixedToFloat(wordQuad.bl.h);
floatQuad.br.h = ASFixedToFloat(wordQuad.br.h);
floatQuad.tl.h = ASFixedToFloat(wordQuad.tl.h);
floatQuad.tr.h = ASFixedToFloat(wordQuad.tr.h);
floatQuad.bl.v = ASFixedToFloat(wordQuad.bl.v);
floatQuad.br.v = ASFixedToFloat(wordQuad.br.v);
floatQuad.tl.v = ASFixedToFloat(wordQuad.tl.v);
floatQuad.tr.v = ASFixedToFloat(wordQuad.tr.v);
record.boundingQuads.emplace_back(floatQuad);
}
PDStyle pdStyle;
ASInt16 transTbl[100];
PDColorValueRec pdStyleColor;
ASInt16 iRet = PDWordGetStyleTransition(pdWord, transTbl, 100);
if (iRet) {
for (int styleIndex = 0; styleIndex < iRet; ++styleIndex) {
DLStyle dlstyle;
dlstyle.charIndex = transTbl[styleIndex];
pdStyle = PDWordGetNthCharStyle(wordFinder, pdWord, styleIndex);
PDStyleGetColor(pdStyle, &pdStyleColor);
switch (pdStyleColor.space) {
case PDDeviceGray:
dlstyle.colorValues.DLSpace = "DeviceGray";
break;
case PDDeviceRGB:
dlstyle.colorValues.DLSpace = "DeviceRGB";
break;
case PDDeviceCMYK:
dlstyle.colorValues.DLSpace = "DeviceCMYK";
break;
default:
dlstyle.colorValues.DLSpace = "Invalid";
}
dlstyle.colorValues.DLColor[0] = ASFixedToFloat(pdStyleColor.value[0]);
dlstyle.colorValues.DLColor[1] = ASFixedToFloat(pdStyleColor.value[1]);
dlstyle.colorValues.DLColor[2] = ASFixedToFloat(pdStyleColor.value[2]);
dlstyle.colorValues.DLColor[3] = ASFixedToFloat(pdStyleColor.value[3]);
dlstyle.fontsize = ASFixedToFloat(PDStyleGetFontSize(pdStyle));
PDFont pdFont = PDStyleGetFont(pdStyle);
char fontNameBuf[PSNAMESIZE];
PDFontGetName(pdFont, fontNameBuf, PSNAMESIZE);
ASBool fontEmbedded = PDFontIsEmbedded(pdFont);
ASBool fontSubset = false;
char *fontNameStart = 0;
// Subset test: a font was subset if the 7th character is '+' (a plus-sign),
// according to Acrobat/Reader and industry norms.
if (fontEmbedded) {
if ((strlen(fontNameBuf)) > 7 && (fontNameBuf[6] == '+'))
fontSubset = true;
}
if (fontSubset)
fontNameStart = fontNameBuf + 7; // skip the "ABCDEF+"
else
fontNameStart = fontNameBuf;
dlstyle.fontname = fontNameStart;
record.styles.emplace_back(dlstyle);
}
}
returnText.emplace_back(record);
}
PDWordFinderReleaseWordList(wordFinder, pageNum);
return returnText;
}
//==============================================================================================================================
// GetAcroFormFieldData() - Gets the AcroForm field data.
//==============================================================================================================================
std::vector TextExtract::GetAcroFormFieldData() {
std::vector returnText;
CosObj rootObj = CosDocGetRoot(PDDocGetCosDoc(pDoc));
CosObj acroFormObj = CosDictGet(rootObj, ASAtomFromString("AcroForm"));
if (CosObjGetType(acroFormObj) == CosNull) {
return returnText;
} else {
CosObj fieldsObj = CosDictGet(acroFormObj, ASAtomFromString("Fields"));
if ((CosObjGetType(fieldsObj) != CosArray) || (CosArrayLength(fieldsObj) == 0)) {
return returnText;
} else {
for (ASInt32 fieldIndex = 0; fieldIndex < CosArrayLength(fieldsObj); ++fieldIndex) {
CosObj fieldObj = CosArrayGet(fieldsObj, fieldIndex);
EnumerateAcroFormField(fieldObj, "", returnText);
}
}
}
return returnText;
}
static void EnumerateAcroFormField(CosObj fieldObj, std::string prefix, std::vector &returnText) {
std::string field_name;
ASTCount textLength;
if (CosObjGetType(fieldObj) == CosDict) {
if (CosDictKnown(fieldObj, ASAtomFromString("T"))) {
CosObj entryObj = CosDictGet(fieldObj, ASAtomFromString("T"));
if (CosObjGetType(entryObj) == CosString) {
std::string name_part(CosStringValue(entryObj, &textLength));
if (prefix == "") {
field_name = name_part;
} else {
std::ostringstream stringStream;
stringStream << prefix << "." << name_part;
field_name = stringStream.str();
}
// Process the Kids
CosObj kidsObj = CosDictGet(fieldObj, ASAtomFromString("Kids"));
if (CosObjGetType(kidsObj) == CosArray) {
for (ASInt32 kidIndex = 0; kidIndex < CosArrayLength(kidsObj); ++kidIndex) {
CosObj fieldObj = CosArrayGet(kidsObj, kidIndex);
EnumerateAcroFormField(fieldObj, field_name, returnText);
}
}
// Process this node
CosObj nameObj = CosDictGet(fieldObj, ASAtomFromString("FT"));
if (CosObjGetType(nameObj) == CosName) {
if (CosNameValue(nameObj) == ASAtomFromString("Tx")) {
PDAcroFormExtractRec record;
record.fieldName = field_name;
if (CosDictKnown(fieldObj, ASAtomFromString("V"))) {
CosObj entryValueObj = CosDictGet(fieldObj, ASAtomFromString("V"));
std::string textString(CosStringValue(entryValueObj, &textLength));
ASText asText = ASTextFromSizedPDText(textString.c_str(), textLength);
char *textStr = reinterpret_cast(ASTextGetUnicodeCopy(asText, kUTF8));
record.text = textStr;
}
returnText.emplace_back(record);
}
}
}
}
}
}
//==============================================================================================================================
// GetAnnotationText() - Gets the Annotation text.
//==============================================================================================================================
std::vector TextExtract::GetAnnotationText() {
std::vector returnText;
ASInt32 numPages = PDDocGetNumPages(pDoc);
const size_t buffersize = 1000;
static char contentBuffer[buffersize];
for (ASInt32 pageNum = 0; pageNum < numPages; ++pageNum) {
PDPage annotPage = PDDocAcquirePage(pDoc, pageNum);
int numAnnots = PDPageGetNumAnnots(annotPage);
// Extract each annotation's text content (if any)
for (int annotIndex = 0; annotIndex < numAnnots; ++annotIndex) {
PDAnnot annotation = PDPageGetAnnot(annotPage, annotIndex);
ASAtom subtype = PDAnnotGetSubtype(annotation);
if (subtype == ASAtomFromString("Text") || subtype == ASAtomFromString("FreeText")) {
PDTextAnnot nextAsText = CastToPDTextAnnot(annotation);
PDTextAnnotGetContents(nextAsText, contentBuffer, buffersize);
PDAnnotationExtractRec record;
record.type = ASAtomGetString(subtype);
record.text = contentBuffer;
returnText.emplace_back(record);
}
}
PDPageRelease(annotPage);
}
return returnText;
}
C#
using System;
using System.Collections.Generic;
using Datalogics.PDFL;
namespace TextExtract
{
    /// <summary>
    /// Sample that extracts the text of a PDF with the WordFinder and writes it to a text file.
    /// Tagged and untagged documents are post-processed with slightly different rules.
    /// </summary>
    class TextExtract
    {
        // Characters removed from a hyphenated word in an untagged PDF:
        // ASCII hyphen, Unicode soft hyphen (U+00AD) and Unicode hyphen (U+2010).
        private static readonly char[] UntaggedHyphens = { '-', '\u00ad', '\u2010' };

        // Characters removed from a hyphenated word in a tagged PDF:
        // Unicode soft hyphen (U+00AD) and Unicode hyphen (U+2010).
        private static readonly char[] TaggedHyphens = { '\u00ad', '\u2010' };

        /// <summary>
        /// Opens the input document (first command-line argument, or the bundled sample),
        /// decides whether it is tagged, and runs the matching extraction routine.
        /// </summary>
        /// <param name="args">Optional: args[0] is the path of the PDF to read.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("TextExtract Sample:");
            // ReSharper disable once UnusedVariable
            using (Library lib = new Library())
            {
                Console.WriteLine("Initialized the library.");
                // This is a tagged PDF.
                String sInput = Library.ResourceDirectory + "Sample_Input/pdf_intro.pdf";
                if (args.Length > 0)
                    sInput = args[0];
                // This is an untagged PDF.
                //Resources/Sample_Input/constitution.pdf"

                // Dispose the document when done, including when extraction throws.
                using (Document doc = new Document(sInput))
                {
                    Console.WriteLine("Input file: " + sInput);

                    // Determine if the PDF is tagged. We'll use a slightly different set of rules
                    // for parsing tagged and untagged PDFs.
                    //
                    // We'll determine if the PDF is tagged by examining the MarkInfo
                    // dictionary of the document. First, check for the existence of the MarkInfo dict.
                    // "as" is used so that an entry of an unexpected type is treated as absent
                    // instead of throwing InvalidCastException.
                    bool docIsTagged = false;
                    PDFDict markInfoDict = doc.Root.Get("MarkInfo") as PDFDict;
                    if (markInfoDict != null)
                    {
                        PDFBoolean markedEntry = markInfoDict.Get("Marked") as PDFBoolean;
                        if (markedEntry != null && markedEntry.Value)
                            docIsTagged = true;
                    }

                    WordFinderConfig wordConfig = new WordFinderConfig();
                    wordConfig.IgnoreCharGaps = false;
                    wordConfig.IgnoreLineGaps = false;
                    wordConfig.NoAnnots = false;
                    wordConfig.NoEncodingGuess = false;
                    // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
                    wordConfig.UnknownToStdEnc = false;
                    wordConfig.DisableTaggedPDF = false; // legacy mode WordFinder creation
                    wordConfig.NoXYSort = true;
                    wordConfig.PreserveSpaces = false;
                    wordConfig.NoLigatureExp = false;
                    wordConfig.NoHyphenDetection = false;
                    wordConfig.TrustNBSpace = false;
                    wordConfig.NoExtCharOffset = false; // text extraction efficiency
                    wordConfig.NoStyleInfo = false; // text extraction efficiency

                    WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.Latest, wordConfig);
                    if (docIsTagged)
                        ExtractTextTagged(doc, wordFinder);
                    else
                        ExtractTextUntagged(doc, wordFinder);
                }
            }
        }

        /// <summary>Returns true when every bit of <paramref name="flag"/> is set on the word.</summary>
        private static bool HasAttribute(Word word, WordAttributeFlags flag)
        {
            return (word.Attributes & flag) == flag;
        }

        /// <summary>
        /// Removes every occurrence of the given hyphen characters from a word by splitting on
        /// them and joining all the pieces. Works for any number of pieces, including one.
        /// </summary>
        private static string RemoveHyphens(string word, char[] hyphens)
        {
            return string.Concat(word.Split(hyphens));
        }

        /// <summary>
        /// Extracts the text of an untagged PDF, page by page, into "TextExtract-untagged-out.txt".
        /// </summary>
        /// <param name="doc">The open document; only its page count is read here.</param>
        /// <param name="wordFinder">WordFinder created for <paramref name="doc"/>.</param>
        static void ExtractTextUntagged(Document doc, WordFinder wordFinder)
        {
            int nPages = doc.NumPages;
            // "using" closes the output file even if extraction throws part-way through.
            using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-untagged-out.txt"))
            {
                Console.WriteLine("Writing TextExtract-untagged-out.txt");
                for (int i = 0; i < nPages; i++)
                {
                    IList<Word> pageWords = wordFinder.GetWordList(i);
                    System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();
                    foreach (Word wInfo in pageWords)
                    {
                        string s = wInfo.Text;
                        bool lastOnLine = HasAttribute(wInfo, WordAttributeFlags.LastWordOnLine);

                        // Check for hyphenated words that break across a line.
                        if (HasAttribute(wInfo, WordAttributeFlags.HasSoftHyphen) && lastOnLine)
                        {
                            // Remove the hyphen and combine the parts of the word before adding to the extracted text.
                            // Note that we pass in the Unicode character for soft hyphen as well as the regular hyphen.
                            //
                            // In untagged PDF, it's not uncommon to find a mixture of hard and soft hyphens that may
                            // not be used for their intended purposes.
                            // (Soft hyphens are intended only for words that break across lines.)
                            //
                            // For the purposes of this sample, we'll remove all hyphens. In practice, you may need to check
                            // words against a dictionary to determine if the hyphenated word is actually one word or two.
                            // Note we remove ascii hyphen, Unicode soft hyphen(\u00ad) and Unicode hyphen(0x2010).
                            textToExtract.Append(RemoveHyphens(s, UntaggedHyphens));
                        }
                        else
                            textToExtract.Append(s);

                        // Check for space adjacency and add a space if necessary.
                        if (HasAttribute(wInfo, WordAttributeFlags.AdjacentToSpace))
                            textToExtract.Append(" ");

                        // Check for a line break and add one if necessary
                        if (lastOnLine)
                            textToExtract.Append("\n");
                    }
                    logfile.WriteLine("");
                    logfile.WriteLine(textToExtract.ToString());

                    // Release requested WordList
                    foreach (Word word in pageWords)
                        word.Dispose();
                }
                Console.WriteLine("Extracted " + nPages + " pages.");
            }
        }

        /// <summary>
        /// Extracts the text of a tagged PDF, page by page, into "TextExtract-tagged-out.txt".
        /// </summary>
        /// <param name="doc">The open document; only its page count is read here.</param>
        /// <param name="wordFinder">WordFinder created for <paramref name="doc"/>.</param>
        static void ExtractTextTagged(Document doc, WordFinder wordFinder)
        {
            int nPages = doc.NumPages;
            // "using" closes the output file even if extraction throws part-way through.
            using (System.IO.StreamWriter logfile = new System.IO.StreamWriter("TextExtract-tagged-out.txt"))
            {
                Console.WriteLine("Writing TextExtract-tagged-out.txt");
                for (int i = 0; i < nPages; i++)
                {
                    IList<Word> pageWords = wordFinder.GetWordList(i);
                    System.Text.StringBuilder textToExtract = new System.Text.StringBuilder();
                    foreach (Word wInfo in pageWords)
                    {
                        string s = wInfo.Text;

                        // In most tagged PDFs, soft hyphens are used only to break words across lines, so we'll
                        // check for any soft hyphens and remove them from our text output.
                        //
                        // Note that we're not checking for the LastWordOnLine flag, unlike untagged PDF. For Tagged PDF,
                        // words are not flagged as being the last on the line if they are not at the end of a sentence.
                        if (HasAttribute(wInfo, WordAttributeFlags.HasSoftHyphen))
                        {
                            // Remove the hyphen and combine the parts of the word before adding to the extracted text.
                            // Note that we pass in the Unicode character for soft hyphen(\u00ad) and hyphen(0x2010).
                            textToExtract.Append(RemoveHyphens(s, TaggedHyphens));
                        }
                        else
                            textToExtract.Append(s);

                        // Check for space adjacency and add a space if necessary.
                        if (HasAttribute(wInfo, WordAttributeFlags.AdjacentToSpace))
                            textToExtract.Append(" ");

                        // Check for a line break and add one if necessary.
                        // Normally this is accomplished using WordAttributeFlags.LastWordOnLine,
                        // but for tagged PDFs, the LastWordOnLine flag is set according to the
                        // tags in the PDF, not according to visual line breaks in the document.
                        //
                        // To preserve the visual line breaks in the document, we'll check whether
                        // the word is the last word in the region. If you instead prefer to
                        // break lines according to the tags in the PDF, use
                        // (wInfo.Attributes & WordAttributeFlags.LastWordOnLine) == WordAttributeFlags.LastWordOnLine,
                        // similar to the untagged case.
                        if (wInfo.IsLastWordInRegion)
                            textToExtract.Append("\n");
                    }
                    logfile.WriteLine("");
                    logfile.WriteLine(textToExtract.ToString());

                    // Release requested WordList
                    foreach (Word word in pageWords)
                        word.Dispose();
                }
                Console.WriteLine("Extracted " + nPages + " pages.");
            }
        }
    }
}
Java
package com.datalogics.PDFL.Samples;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.*;
import com.datalogics.PDFL.*;
/**
 * Sample that extracts the text of a PDF with the WordFinder and writes it to a UTF-8 text file.
 * Tagged and untagged documents are post-processed with slightly different rules.
 */
public class TextExtract {

    /**
     * Opens the input document (first command-line argument, or the bundled sample),
     * decides whether it is tagged, and runs the matching extraction routine.
     *
     * @param args optional; args[0] is the path of the PDF to read
     * @throws Throwable anything raised by the library or by file I/O
     */
    public static void main(String[] args) throws Throwable
    {
        System.out.println("TextExtract sample:");
        Library lib = new Library();
        try {
            // This is an untagged PDF.
            // "../../Resources/Sample_Input/constitution.pdf"
            // This is a tagged PDF.
            String sInput = "../../Resources/Sample_Input/pdf_intro.pdf";
            if (args.length > 0)
                sInput = args[0];
            System.out.println("Reading " + sInput);
            Document doc = new Document(sInput);
            try {
                System.out.println("Opened document " + sInput);

                // Determine if the PDF is tagged. We'll use a slightly different set of rules
                // for parsing tagged and untagged PDFs.
                //
                // We'll determine if the PDF is tagged by examining the MarkInfo
                // dictionary of the document. First, check for the existence of the MarkInfo dict.
                // instanceof is used so that an entry of an unexpected type is treated as absent
                // instead of throwing ClassCastException.
                boolean docIsTagged = false;
                Object markInfo = doc.getRoot().get("MarkInfo");
                if (markInfo instanceof PDFDict)
                {
                    Object marked = ((PDFDict) markInfo).get("Marked");
                    if (marked instanceof PDFBoolean && ((PDFBoolean) marked).getValue())
                        docIsTagged = true;
                }

                WordFinderConfig wordConfig = new WordFinderConfig();
                wordConfig.setIgnoreCharGaps(false);
                wordConfig.setIgnoreLineGaps(false);
                wordConfig.setNoAnnots(false);
                wordConfig.setNoEncodingGuess(false);
                // Std Roman treatment for custom encoding; overrides the noEncodingGuess option
                wordConfig.setUnknownToStdEnc(false);
                wordConfig.setDisableTaggedPDF(false); // legacy mode WordFinder creation
                wordConfig.setNoXYSort(true);
                wordConfig.setPreserveSpaces(false);
                wordConfig.setNoLigatureExp(false);
                wordConfig.setNoHyphenDetection(false);
                wordConfig.setTrustNBSpace(false);
                wordConfig.setNoExtCharOffset(false); // text extraction efficiency
                wordConfig.setNoStyleInfo(false); // text extraction efficiency

                WordFinder wordFinder = new WordFinder(doc, WordFinderVersion.LATEST, wordConfig);
                if (docIsTagged)
                    ExtractTextTagged(doc, wordFinder);
                else
                    ExtractTextUntagged(doc, wordFinder);
            }
            finally {
                // Close the document even when extraction fails.
                doc.close();
            }
        }
        finally {
            lib.delete();
        }
    }

    /**
     * Removes every match of hyphenRegex from word. This gives the same text as splitting the
     * word on the pattern and concatenating all of the pieces.
     */
    private static String removeHyphens(String word, String hyphenRegex)
    {
        return word.replaceAll(hyphenRegex, "");
    }

    /**
     * Extracts the text of an untagged PDF, page by page, into "TextExtract-untagged-out.txt".
     *
     * @param doc        the open document; only its page count is read here
     * @param wordFinder WordFinder created for doc
     * @throws Throwable anything raised by the library or by file I/O
     */
    static void ExtractTextUntagged(Document doc, WordFinder wordFinder) throws Throwable
    {
        int nPages = doc.getNumPages();
        FileOutputStream logfile = new FileOutputStream("TextExtract-untagged-out.txt");
        System.out.println("Writing TextExtract-untagged-out.txt");
        OutputStreamWriter logwriter = new OutputStreamWriter(logfile, "UTF-8");
        try {
            for (int i = 0; i < nPages; i++)
            {
                List<Word> pageWords = wordFinder.getWordList(i);
                StringBuilder textToExtract = new StringBuilder();
                for (Word wInfo : pageWords)
                {
                    String s = wInfo.getText();
                    boolean lastOnLine = wInfo.getAttributes().contains(WordAttributeFlags.LAST_WORD_ON_LINE);

                    // Check for hyphenated words that break across a line.
                    if (wInfo.getAttributes().contains(WordAttributeFlags.HAS_SOFT_HYPHEN) && lastOnLine)
                    {
                        // Remove the hyphen and combine the parts of the word before adding to the extracted text.
                        // Note that we pass in the Unicode character for soft hyphen as well as the regular hyphen.
                        //
                        // In untagged PDF, it's not uncommon to find a mixture of hard and soft hyphens that may
                        // not be used for their intended purposes.
                        // (Soft hyphens are intended only for words that break across lines.)
                        //
                        // For the purposes of this sample, we'll remove all hyphens. In practice, you may need to check
                        // words against a dictionary to determine if the hyphenated word is actually one word or two.
                        textToExtract.append(removeHyphens(s, "-|\u00ad"));
                    }
                    else
                        textToExtract.append(s);

                    // Check for space adjacency or last word in region and add a space if necessary.
                    // LastWordInRegion is true if the WordFinder determined that this is the last word in a region.
                    // This may be set for words that are visually separated when viewing the PDF,
                    // but are not separated by a space. Here, it's used in conjunction with
                    // WordAttributes.AdjacentToSpace to determine where to insert spaces when
                    // post-processing WordFinder results.
                    if (wInfo.getAttributes().contains(WordAttributeFlags.ADJACENT_TO_SPACE) || wInfo.getIsLastWordInRegion())
                        textToExtract.append(" ");

                    // Check for a line break and add one if necessary
                    if (lastOnLine)
                        textToExtract.append("\n");
                }
                logwriter.write("\n");
                logwriter.write(textToExtract.toString());
                logwriter.write("\n");

                // Release requested WordList
                for (Word word : pageWords)
                    word.delete();
            }
            System.out.println("Extracted " + nPages + " pages.");
        }
        finally {
            // Flushes and closes the writer and the underlying file, even on failure.
            logwriter.close();
        }
    }

    /**
     * Extracts the text of a tagged PDF, page by page, into "TextExtract-tagged-out.txt".
     *
     * @param doc        the open document; only its page count is read here
     * @param wordFinder WordFinder created for doc
     * @throws Throwable anything raised by the library or by file I/O
     */
    static void ExtractTextTagged(Document doc, WordFinder wordFinder) throws Throwable
    {
        int nPages = doc.getNumPages();
        FileOutputStream logfile = new FileOutputStream("TextExtract-tagged-out.txt");
        System.out.println("Writing TextExtract-tagged-out.txt");
        OutputStreamWriter logwriter = new OutputStreamWriter(logfile, "UTF-8");
        try {
            for (int i = 0; i < nPages; i++)
            {
                List<Word> pageWords = wordFinder.getWordList(i);
                StringBuilder textToExtract = new StringBuilder();
                for (Word wInfo : pageWords)
                {
                    String s = wInfo.getText();

                    // In most tagged PDFs, soft hyphens are used only to break words across lines, so we'll
                    // check for any soft hyphens and remove them from our text output.
                    //
                    // Note that we're not checking for the LAST_WORD_ON_LINE flag, unlike untagged PDF. For Tagged PDF,
                    // words are not flagged as being the last on the line if they are not at the end of a sentence.
                    if (wInfo.getAttributes().contains(WordAttributeFlags.HAS_SOFT_HYPHEN))
                    {
                        // Remove the hyphen and combine the parts of the word before adding to the extracted text.
                        // Note that we pass in the Unicode character for soft hyphen.
                        textToExtract.append(removeHyphens(s, "\u00ad"));
                    }
                    else
                        textToExtract.append(s);

                    // Check for space adjacency or last word in region and add a space if necessary.
                    // LastWordInRegion is true if the WordFinder determined that this is the last word in a region.
                    // This may be set for words that are visually separated when viewing the PDF,
                    // but are not separated by a space. Here, it's used in conjunction with
                    // WordAttributes.AdjacentToSpace to determine where to insert spaces when
                    // post-processing WordFinder results.
                    if (wInfo.getAttributes().contains(WordAttributeFlags.ADJACENT_TO_SPACE) || wInfo.getIsLastWordInRegion())
                        textToExtract.append(" ");

                    // Check for a line break and add one if necessary
                    if (wInfo.getAttributes().contains(WordAttributeFlags.LAST_WORD_ON_LINE))
                        textToExtract.append("\n");
                }
                logwriter.write("\n");
                logwriter.write(textToExtract.toString());
                logwriter.write("\n");

                // Release requested WordList
                for (Word word : pageWords)
                    word.delete();
            }
            System.out.println("Extracted " + nPages + " pages.");
        }
        finally {
            // Flushes and closes the writer and the underlying file, even on failure.
            logwriter.close();
        }
    }
}