From 1d56254bafab049f74ba30d3fddb4f2b6961d4ae Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Thu, 26 Sep 2013 17:50:51 +0300 Subject: [PATCH v8 11/15] glib: Expose inline attributes of structure elements Allows obtaining inline text attributes from structure elements. The text is divided into "spans", which are groups of consecutive glyphs that share their attributes. Each one of those is represented by a PopplerTextSpan, which gives information about the text font and color, and the link target for links. The list of PopplerTextSpans is created lazily when first used. --- glib/poppler-private.h | 1 + glib/poppler-structure-element.cc | 191 ++++++++++++++++++++++++++++++++++++++ glib/poppler-structure-element.h | 26 ++++++ 3 files changed, 218 insertions(+) diff --git a/glib/poppler-private.h b/glib/poppler-private.h index e965a5a..1443091 100644 --- a/glib/poppler-private.h +++ b/glib/poppler-private.h @@ -111,6 +111,7 @@ struct _PopplerStructureElement gchar *alt_text; gchar *actual_text; gchar *language; + GList *text_spans; }; GList *_poppler_document_get_layers (PopplerDocument *document); diff --git a/glib/poppler-structure-element.cc b/glib/poppler-structure-element.cc index b2eafdf..0a05e4a 100644 --- a/glib/poppler-structure-element.cc +++ b/glib/poppler-structure-element.cc @@ -339,6 +339,16 @@ _poppler_structelement_type_to_poppler_structure_element_kind (StructElement::Ty } +static void _poppler_text_span_free (gpointer data) +{ + PopplerTextSpan *span = (PopplerTextSpan*) data; + g_free (span->text); + g_free (span->font_name); + g_free (span->link_target); + g_slice_free (PopplerTextSpan, data); +} + + typedef struct _PopplerStructureElementClass PopplerStructureElementClass; struct _PopplerStructureElementClass { @@ -382,6 +392,7 @@ poppler_structure_element_finalize (GObject *object) g_free (poppler_structure_element->title); g_free (poppler_structure_element->id); g_object_unref (poppler_structure_element->document); + g_list_free_full
(poppler_structure_element->text_spans, _poppler_text_span_free); G_OBJECT_CLASS (poppler_structure_element_parent_class)->finalize (object); } @@ -665,3 +676,183 @@ poppler_structure_element_get_text (PopplerStructureElement *poppler_structure_e } return poppler_structure_element->text; } + + +class SpanBuilder { +public: + SpanBuilder(): + font(), text(), link(), + map(globalParams->getTextEncoding()), + glist(NULL), + flags(0), + color(0) + {} + + ~SpanBuilder() { + map->decRefCnt(); + g_list_free_full (glist, _poppler_text_span_free); + } + + void process(const MCOpArray& ops) { + for (MCOpArray::const_iterator i = ops.begin(); i != ops.end(); ++i) + process(*i); + } + + void process(const MCOp& op) { + if (op.type == MCOp::Unichar) { + int n = map->mapUnicode(op.unichar, buf, sizeof(buf)); + text.append(buf, n); + return; + } + + Guint oldFlags = flags; + + if (op.type == MCOp::Flags) { + if (op.flags & MCOp::FlagFontBold) + flags |= POPPLER_TEXT_SPAN_BOLD; + else + flags &= ~POPPLER_TEXT_SPAN_BOLD; + + if (op.flags & MCOp::FlagFontFixed) + flags |= POPPLER_TEXT_SPAN_FIXED_WIDTH; + else + flags &= ~POPPLER_TEXT_SPAN_FIXED_WIDTH; + + if (op.flags & MCOp::FlagFontItalic) + flags |= POPPLER_TEXT_SPAN_ITALIC; + else + flags &= ~POPPLER_TEXT_SPAN_ITALIC; + } + + if (op.type == MCOp::Color && (color = op.color.rgbPixel ())) { + flags |= POPPLER_TEXT_SPAN_COLOR; + } else { + flags &= ~POPPLER_TEXT_SPAN_COLOR; + } + + if (op.type == MCOp::FontName) { + if (op.value) { + flags |= POPPLER_TEXT_SPAN_FONT; + font.append(op.value); + } else { + flags &= ~POPPLER_TEXT_SPAN_FONT; + } + } + + if (flags != oldFlags) + newSpan(); + } + + void newSpan() { + // If there is no text, do not append a new PopplerTextSpan + // and keep the attributes/flags for the next span. 
+ if (text.getLength ()) { + PopplerTextSpan *span = g_slice_new0 (PopplerTextSpan); + span->color = color; + span->flags = flags; + span->text = _poppler_goo_string_to_utf8 (&text); + text.clear(); + + if (font.getLength()) { + span->font_name = _poppler_goo_string_to_utf8 (&font); + font.clear(); + } + + if (link.getLength()) { + assert(flags & POPPLER_TEXT_SPAN_LINK); + span->link_target = _poppler_goo_string_to_utf8 (&link); + } + + glist = g_list_append (glist, span); + } + + // Link is always cleared + link.clear(); + } + + GList* end() { + GList *result = glist; + glist = NULL; + return result; + } + +private: + GooString font; + GooString text; + GooString link; + UnicodeMap *map; + GList *glist; + char buf[8]; + Guint flags; + Guint color; +}; + + +/** + * poppler_structure_element_get_text_spans: + * @poppler_structure_element: A #PopplerStructureElement + * + * Obtains the text enclosed by an element, as a #GList of #PopplerTextSpan + * structures. Each item in the list is a piece of text which share the same + * attributes, plus its attributes. + * + * Return value: (transfer none) (element-type PopplerTextSpan): A #GList + * of #PopplerTextSpan structures. 
+ */ +GList* +poppler_structure_element_get_text_spans (PopplerStructureElement *poppler_structure_element) +{ + g_return_val_if_fail (POPPLER_IS_STRUCTURE_ELEMENT (poppler_structure_element), NULL); + g_assert (poppler_structure_element->elem); + + if (!poppler_structure_element->elem->isContent ()) + return NULL; + + if (!poppler_structure_element->text_spans) + { + SpanBuilder builder; + builder.process(poppler_structure_element->elem->getMCOps ()); + poppler_structure_element->text_spans = builder.end(); + } + return poppler_structure_element->text_spans; +} + +/** + * poppler_text_span_is_fixed_width: + * @poppler_text_span: a #PopplerTextSpan + */ +gboolean +poppler_text_span_is_fixed_width (PopplerTextSpan *poppler_text_span) +{ + return (poppler_text_span->flags & POPPLER_TEXT_SPAN_FIXED_WIDTH); +} + +/** + * poppler_text_span_is_serif_font: + * @poppler_text_span: a #PopplerTextSpan + */ +gboolean +poppler_text_span_is_serif_font (PopplerTextSpan *poppler_text_span) +{ + return (poppler_text_span->flags & POPPLER_TEXT_SPAN_SERIF_FONT); +} + +/** + * poppler_text_span_is_bold: + * @poppler_text_span: a #PopplerTextSpan + */ +gboolean +poppler_text_span_is_bold (PopplerTextSpan *poppler_text_span) +{ + return (poppler_text_span->flags & POPPLER_TEXT_SPAN_BOLD); +} + +/** + * poppler_text_span_is_link: + * @poppler_text_span: a #PopplerTextSpan + */ +gboolean +poppler_text_span_is_link (PopplerTextSpan *poppler_text_span) +{ + return (poppler_text_span->flags & POPPLER_TEXT_SPAN_LINK); +} diff --git a/glib/poppler-structure-element.h b/glib/poppler-structure-element.h index b1b679e..a4200e1 100644 --- a/glib/poppler-structure-element.h +++ b/glib/poppler-structure-element.h @@ -82,6 +82,25 @@ typedef enum { POPPLER_STRUCTURE_ELEMENT_FORM, } PopplerStructureElementKind; +typedef struct _PopplerTextSpan PopplerTextSpan; +struct _PopplerTextSpan { + gchar *text; + gchar *font_name; + gchar *link_target; + guint flags; + guint color; /* 0x00RRGGBB */ +}; + +enum
{ + POPPLER_TEXT_SPAN_FIXED_WIDTH = (1 << 0), + POPPLER_TEXT_SPAN_SERIF_FONT = (1 << 1), + POPPLER_TEXT_SPAN_ITALIC = (1 << 2), + POPPLER_TEXT_SPAN_BOLD = (1 << 3), + POPPLER_TEXT_SPAN_LINK = (1 << 4), + POPPLER_TEXT_SPAN_COLOR = (1 << 5), + POPPLER_TEXT_SPAN_FONT = (1 << 6), +}; + GType poppler_structure_element_get_type (void) G_GNUC_CONST; PopplerStructureElementKind poppler_structure_element_get_kind (PopplerStructureElement *poppler_structure_element); @@ -95,6 +114,8 @@ const gchar *poppler_structure_element_get_abbreviation const gchar *poppler_structure_element_get_language (PopplerStructureElement *poppler_structure_element); const gchar *poppler_structure_element_get_text (PopplerStructureElement *poppler_structure_element, gboolean recursive); +GList *poppler_structure_element_get_text_spans + (PopplerStructureElement *poppler_structure_element); const gchar *poppler_structure_element_get_alt_text (PopplerStructureElement *poppler_structure_element); const gchar *poppler_structure_element_get_actual_text (PopplerStructureElement *poppler_structure_element); @@ -107,6 +128,11 @@ PopplerStructureElement *poppler_structure_element_iter_get_element gboolean poppler_structure_element_iter_next (PopplerStructureElementIter *iter); void poppler_structure_element_iter_free (PopplerStructureElementIter *iter); +gboolean poppler_text_span_is_fixed_width (PopplerTextSpan *poppler_text_span); +gboolean poppler_text_span_is_serif_font (PopplerTextSpan *poppler_text_span); +gboolean poppler_text_span_is_bold (PopplerTextSpan *poppler_text_span); +gboolean poppler_text_span_is_link (PopplerTextSpan *poppler_text_span); + G_END_DECLS #endif /* !__POPPLER_STRUCTURE_ELEMENT_H__ */ -- 1.8.4