From ae512bd2b4475fd24189e92a39e87cc99149c375 Mon Sep 17 00:00:00 2001 From: Adrian Perez de Castro Date: Thu, 13 Jun 2013 21:50:53 +0300 Subject: [PATCH v5 10/10] Tagged-PDF: Heuristics in poppler-glib for data/layout table identification Add functions in poppler-glib which, used on a PopplerStructureElement of type POPPLER_STRUCTURE_TABLE, determine whether a table is used for layout or contains actual data. The heuristic is quite simple so far: - poppler_structure_element_is_data_table(): checks for table headings and that THead/TBody elements are present. - poppler_structure_element_is_layout_table(): negates the result of the previous function. The idea is that in the latter case a different heuristic could be implemented later on, and at some point tables could be reported as "data tables", "layout tables", or as undetermined when the heuristics cannot tell for sure. --- glib/poppler-structure-element.cc | 71 +++++++++++++++++++++++++++++++++++++++ glib/poppler-structure-element.h | 2 ++ 2 files changed, 73 insertions(+) diff --git a/glib/poppler-structure-element.cc b/glib/poppler-structure-element.cc index cb8e5eb..48e6be7 100644 --- a/glib/poppler-structure-element.cc +++ b/glib/poppler-structure-element.cc @@ -769,6 +769,77 @@ poppler_structure_element_is_block (PopplerStructureElement *poppler_structure_e return poppler_structure_element->elem->isBlock (); } + +static guint +data_table_score (const StructElement *elem, gboolean *has_th) +{ + g_assert (elem); + g_assert (has_th); + + guint score = 0; + for (unsigned i = 0; i < elem->getNumElements (); i++) + score += data_table_score (elem->getElement (i), has_th); + + switch (elem->getType ()) + { + case StructElement::THead: score++; break; + case StructElement::TBody: score++; break; + case StructElement::TH: *has_th = TRUE; break; + default: break; + } + + return score; +} + +/** + * poppler_structure_element_is_data_table: + * @poppler_structure_element: A #PopplerStructureElement + * + * Note that there is no proper metadata in PDF
documents which identify + * data tables, so heuristics are used to determine whether a table is + * likely to contain data. + * + * Return value: Whether an element is a %POPPLER_STRUCTURE_ELEMENT_TABLE + * and the table contains series of data. + */ +gboolean +poppler_structure_element_is_data_table (PopplerStructureElement *poppler_structure_element) +{ + g_return_val_if_fail (POPPLER_IS_STRUCTURE_ELEMENT (poppler_structure_element), FALSE); + g_assert (poppler_structure_element->elem); + + if (poppler_structure_element->elem->getType () != StructElement::Table) + return FALSE; + + /* + * Data tables are likely to have table-header cells, and to have at least + * part of their contents divided into THead and/or TBody elements. The + * scoring function counts the latter and also sets has_th to TRUE when it + * finds a TH element. If the score is more than zero and there are header + * cells, assume this is a data table; && evaluates the call before has_th. + */ + gboolean has_th = FALSE; + return data_table_score (poppler_structure_element->elem, &has_th) && has_th; +} + +/** + * poppler_structure_element_is_layout_table: + * @poppler_structure_element: A #PopplerStructureElement + * + * Note that there is no proper metadata in PDF documents which identify + * layout tables, so heuristics are used to determine whether a table is + * likely to be used for layout purposes. + * + * Return value: Whether an element is a %POPPLER_STRUCTURE_ELEMENT_TABLE + * and the table is used as aid for layout of page elements.
+ */ +gboolean +poppler_structure_element_is_layout_table (PopplerStructureElement *poppler_structure_element) +{ + g_return_val_if_fail (POPPLER_IS_STRUCTURE_ELEMENT (poppler_structure_element), FALSE); + return !poppler_structure_element_is_data_table (poppler_structure_element) && poppler_structure_element->elem->getType () == StructElement::Table; /* non-table elements are never layout tables; the data-table check runs first so its elem assertion still guards the dereference */ +} + /** * poppler_structure_element_get_n_children: * @poppler_structure_element: A #PopplerStructureElement diff --git a/glib/poppler-structure-element.h b/glib/poppler-structure-element.h index b3076d2..474bcba 100644 --- a/glib/poppler-structure-element.h +++ b/glib/poppler-structure-element.h @@ -337,6 +337,8 @@ gint poppler_structure_element_get_page (Poppl gboolean poppler_structure_element_is_content (PopplerStructureElement *poppler_structure_element); gboolean poppler_structure_element_is_inline (PopplerStructureElement *poppler_structure_element); gboolean poppler_structure_element_is_block (PopplerStructureElement *poppler_structure_element); +gboolean poppler_structure_element_is_data_table (PopplerStructureElement *poppler_structure_element); +gboolean poppler_structure_element_is_layout_table (PopplerStructureElement *poppler_structure_element); guint poppler_structure_element_get_n_children (PopplerStructureElement *poppler_structure_element); PopplerStructureElement *poppler_structure_element_get_child (PopplerStructureElement *poppler_structure_element, guint index); -- 1.8.3.1