From e682c9529f5d735ec098b4936a7e0f9e7ddd10e9 Mon Sep 17 00:00:00 2001 From: Jeremy Echols Date: Wed, 15 Jul 2015 12:41:55 -0700 Subject: [PATCH] pdftotext: Add -bbox-layout option Adds layout information for blocks and lines in addition to words --- utils/pdftotext.1 | 4 ++ utils/pdftotext.cc | 114 +++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 98 insertions(+), 20 deletions(-) diff --git a/utils/pdftotext.1 b/utils/pdftotext.1 index 0199b03..b53f82f 100644 --- a/utils/pdftotext.1 +++ b/utils/pdftotext.1 @@ -71,6 +71,10 @@ headers. Generate an XHTML file containing bounding box information for each word in the file. .TP +.B \-bbox-layout +Generate an XHTML file containing bounding box information for each +block, line, and word in the file. +.TP .BI \-enc " encoding-name" Sets the encoding to use for text output. This defaults to "UTF-8". .TP diff --git a/utils/pdftotext.cc b/utils/pdftotext.cc index 15c741d..c76d5a9 100644 --- a/utils/pdftotext.cc +++ b/utils/pdftotext.cc @@ -56,10 +56,14 @@ #include "PDFDocEncoding.h" #include "Error.h" #include +#include +#include static void printInfoString(FILE *f, Dict *infoDict, const char *key, const char *text1, const char *text2, UnicodeMap *uMap); static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char *fmt); +void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last); +void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last); static int firstPage = 1; static int lastPage = 0; @@ -69,6 +73,7 @@ static int y = 0; static int w = 0; static int h = 0; static GBool bbox = gFalse; +static GBool bboxLayout = gFalse; static GBool physLayout = gFalse; static double fixedPitch = 0; static GBool rawOrder = gFalse; @@ -116,6 +121,8 @@ static const ArgDesc argDesc[] = { "don't insert page breaks between pages"}, {"-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. 
Sets -htmlmeta"}, + {"-bbox-layout", argFlag, &bboxLayout, 0, + "like -bbox but with extra layout bounding box data. Sets -htmlmeta"}, {"-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)"}, {"-upw", argString, userPassword, sizeof(userPassword), @@ -176,6 +183,9 @@ int main(int argc, char *argv[]) { // parse args ok = parseArgs(argDesc, &argc, argv); + if (bboxLayout) { + bbox = gTrue; + } if (bbox) { htmlMeta = gTrue; } @@ -352,27 +362,12 @@ int main(int argc, char *argv[]) { textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta); if (textOut->isOk()) { - fprintf(f, "\n"); - for (int page = firstPage; page <= lastPage; ++page) { - fprintf(f, " \n",doc->getPageMediaWidth(page), doc->getPageMediaHeight(page)); - doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse); - TextWordList *wordlist = textOut->makeWordList(); - const int word_length = wordlist != NULL ? wordlist->getLength() : 0; - TextWord *word; - double xMinA, yMinA, xMaxA, yMaxA; - if (word_length == 0) - fprintf(stderr, "no word list\n"); - - for (int i = 0; i < word_length; ++i) { - word = wordlist->get(i); - word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA); - const std::string myString = myXmlTokenReplace(word->getText()->getCString()); - fprintf(f," %s\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str()); - } - fprintf(f, " \n"); - delete wordlist; + if (bboxLayout) { + printDocBBox(f, doc, textOut, firstPage, lastPage); + } + else { + printWordBBox(f, doc, textOut, firstPage, lastPage); } - fprintf(f, "\n"); } if (f != stdout) { fclose(f); @@ -492,3 +487,82 @@ static void printInfoDate(FILE *f, Dict *infoDict, const char *key, const char * } obj.free(); } + +void printLine(FILE *f, TextLine *line) { /* Writes one text line to f: the line's bounding box, accumulated from its words' boxes, followed by the per-word output buffered in wordXML. */ + double xMin, yMin, xMax, yMax; + double lineXMin = 0, lineYMin = 0, lineXMax = 0, lineYMax = 0; + TextWord *word; + std::stringstream wordXML; + wordXML << std::fixed << std::setprecision(6); + + for (word = 
line->getWords(); word; word = word->getNext()) { + word->getBBox(&xMin, &yMin, &xMax, &yMax); + + if (lineXMin == 0 || lineXMin > xMin) lineXMin = xMin; /* NOTE(review): 0 doubles as "not set yet", so a word whose xMin/yMin is exactly 0 is treated as unset */ + if (lineYMin == 0 || lineYMin > yMin) lineYMin = yMin; + if (lineXMax < xMax) lineXMax = xMax; + if (lineYMax < yMax) lineYMax = yMax; + + const std::string myString = myXmlTokenReplace(word->getText()->getCString()); + wordXML << " " << myString << "\n"; + } + fprintf(f, " \n", + lineXMin, lineYMin, lineXMax, lineYMax); + fputs(wordXML.str().c_str(), f); /* fputs, not fprintf: the buffer holds document text, which must never be used as a format string (a '%' in a word would be parsed as a conversion) */ + fprintf(f, " \n"); +} + +void printDocBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) { /* For each page in [first, last]: renders it through textOut, takes the resulting TextPage, walks flows -> blocks -> lines printing each block's bounding box and passing each line to printLine, then releases the page with decRefCnt(). */ + double xMin, yMin, xMax, yMax; + TextPage *textPage; + TextFlow *flow; + TextBlock *blk; + TextLine *line; + + fprintf(f, "\n"); + for (int page = first; page <= last; ++page) { + fprintf(f, " \n",doc->getPageMediaWidth(page), doc->getPageMediaHeight(page)); + doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse); + textPage = textOut->takeText(); + for (flow = textPage->getFlows(); flow; flow = flow->getNext()) { + fprintf(f, " \n"); + for (blk = flow->getBlocks(); blk; blk = blk->getNext()) { + blk->getBBox(&xMin, &yMin, &xMax, &yMax); + fprintf(f, " \n", xMin, yMin, xMax, yMax); + for (line = blk->getLines(); line; line = line->getNext()) { + printLine(f, line); + } + fprintf(f, " \n"); + } + fprintf(f, " \n"); + } + fprintf(f, " \n"); + textPage->decRefCnt(); + } + fprintf(f, "\n"); +} + +void printWordBBox(FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last) { /* For each page in [first, last]: renders it through textOut and prints the bounding box and text of every word returned by makeWordList(); writes "no word list" to stderr when that list is null or empty. */ + fprintf(f, "\n"); + for (int page = first; page <= last; ++page) { + fprintf(f, " \n",doc->getPageMediaWidth(page), doc->getPageMediaHeight(page)); + doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse); + TextWordList *wordlist = textOut->makeWordList(); + const int word_length = wordlist != NULL ? 
wordlist->getLength() : 0; + TextWord *word; + double xMinA, yMinA, xMaxA, yMaxA; + if (word_length == 0) + fprintf(stderr, "no word list\n"); + + for (int i = 0; i < word_length; ++i) { + word = wordlist->get(i); + word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA); + const std::string myString = myXmlTokenReplace(word->getText()->getCString()); + fprintf(f," %s\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str()); + } + fprintf(f, " \n"); + delete wordlist; + } + fprintf(f, "\n"); +} -- 1.8.3.1