From 1d9244c1cbebc32c21495921a3c70bd7f5c95e4f Mon Sep 17 00:00:00 2001
From: Daniel Garcia
Date: Tue, 15 Jun 2010 16:08:00 +0200
Subject: [PATCH] poppler_page_get_text_layout in poppler/glib

Returns an array of PopplerRectangle items and each Rectangle is a text
character position. The position in this array represents the offset in
the text returned by poppler_page_get_text
---
 glib/poppler-page.cc |  106 ++++++++++++++++++++++++++++++++++++++++++++++++-
 glib/poppler-page.h  |    4 ++-
 2 files changed, 107 insertions(+), 3 deletions(-)

diff --git a/glib/poppler-page.cc b/glib/poppler-page.cc
index 14601a6..6778419 100644
--- a/glib/poppler-page.cc
+++ b/glib/poppler-page.cc
@@ -1722,14 +1722,116 @@ poppler_annot_mapping_free (PopplerAnnotMapping *mapping)
   g_slice_free (PopplerAnnotMapping, mapping);
 }
 
-void 
+void
 poppler_page_get_crop_box (PopplerPage *page, PopplerRectangle *rect)
 {
   PDFRectangle* cropBox = page->page->getCropBox ();
-  
+
   rect->x1 = cropBox->x1;
   rect->x2 = cropBox->x2;
   rect->y1 = cropBox->y1;
   rect->y2 = cropBox->y2;
 }
 
+/**
+ * poppler_page_get_text_layout:
+ * @page: A #PopplerPage
+ * @areas: return location for an array of #PopplerRectangle
+ * @n_areas: return location for the number of items in @areas
+ *
+ * Obtains the layout of the text of @page as an array of
+ * #PopplerRectangle items, one for each character and one for the
+ * space or line break following each word. The position of a rectangle
+ * in the array represents the offset in the text returned by
+ * poppler_page_get_text().
+ *
+ * The array stored in @areas must be freed with g_free () when done.
+ *
+ * Return value: %TRUE if the page contains text, %FALSE otherwise. If
+ * the page has no text, @areas is set to %NULL and @n_areas to 0.
+ **/
+gboolean
+poppler_page_get_text_layout (PopplerPage       *page,
+                              PopplerRectangle **areas,
+                              guint             *n_areas)
+{
+  TextPage *text;
+  TextWordList *wordlist;
+  TextWord *word, *nextword;
+  PopplerRectangle *rect;
+  int i, j, offset = 0;
+  gdouble x1, y1, x2, y2;
+  gdouble x3, y3, x4, y4;
+
+  g_return_val_if_fail (POPPLER_IS_PAGE (page), FALSE);
+  g_return_val_if_fail (areas != NULL, FALSE);
+  g_return_val_if_fail (n_areas != NULL, FALSE);
+
+  *areas = NULL;
+  *n_areas = 0;
+
+  text = poppler_page_get_text_page (page);
+  wordlist = text->makeWordList (gFalse);
+
+  // Nothing to lay out on a page without words
+  if (wordlist->getLength () <= 0)
+    {
+      delete wordlist;
+      return FALSE;
+    }
+
+  // Getting the array size: one rectangle per character plus one per
+  // word for the separator (space or line break) that follows it
+  for (i = 0; i < wordlist->getLength (); i++)
+    {
+      word = wordlist->get (i);
+      *n_areas += word->getLength () + 1;
+    }
+
+  // Creating areas
+  *areas = g_new (PopplerRectangle, *n_areas);
+
+  // Calculating each char position
+  for (i = 0; i < wordlist->getLength (); i++)
+    {
+      word = wordlist->get (i);
+      for (j = 0; j < word->getLength (); j++)
+        {
+          rect = *areas + offset;
+          word->getCharBBox (j,
+                             &(rect->x1),
+                             &(rect->y1),
+                             &(rect->x2),
+                             &(rect->y2));
+          offset++;
+        }
+
+      // adding spaces and line breaks
+      rect = *areas + offset;
+      word->getBBox (&x1, &y1, &x2, &y2);
+      nextword = word->getNext ();
+      if (nextword)
+        {
+          nextword->getBBox (&x3, &y3, &x4, &y4);
+          // the space goes from the end of this word to the start of the
+          // next one, with the same height as this word.
+          rect->x1 = x2;
+          rect->y1 = y1;
+          rect->x2 = x3;
+          rect->y2 = y2;
+        }
+      else
+        {
+          // end of line: empty rectangle at the (x2, y2) corner of the word
+          rect->x1 = x2;
+          rect->y1 = y2;
+          rect->x2 = x2;
+          rect->y2 = y2;
+        }
+      offset++;
+    }
+
+  delete wordlist;
+
+  return TRUE;
+}
diff --git a/glib/poppler-page.h b/glib/poppler-page.h
index 20dc20f..eaa9547 100644
--- a/glib/poppler-page.h
+++ b/glib/poppler-page.h
@@ -114,7 +114,9 @@ GList *poppler_page_get_annot_mapping (PopplerPage *pa
 void poppler_page_free_annot_mapping (GList *list);
 void poppler_page_get_crop_box (PopplerPage *page,
                                 PopplerRectangle *rect);
-
+gboolean poppler_page_get_text_layout (PopplerPage *page,
+                                       PopplerRectangle **areas,
+                                       guint *n_areas);
 /* A rectangle on a page, with coordinates in PDF points.
  */
 #define POPPLER_TYPE_RECTANGLE (poppler_rectangle_get_type ())
-- 
1.7.1.333.g6339f67