From 7aaee60139601f74c0252d0db06f2d8092d98546 Mon Sep 17 00:00:00 2001
From: Evangelos Rigas
Date: Mon, 6 Aug 2018 08:12:46 +0100
Subject: [PATCH 1/3] Add support for PDF subtype property (core, glib backend)

Parse /GTS_PDF(A,E,UA,VT,X)Version from the PDF Information Dictionary into
three enums: PDFSubtype, PDFSubtypePart, and PDFSubtypeConformance.

Use C++11 regex library for cross-platform compatibility.
---
 4 files changed, 402 insertions(+)

diff --git a/glib/poppler-document.cc b/glib/poppler-document.cc
index b343eb90..56a77e25 100644
--- a/glib/poppler-document.cc
+++ b/glib/poppler-document.cc
@@ -58,6 +58,9 @@ enum {
 	PROP_FORMAT,
 	PROP_FORMAT_MAJOR,
 	PROP_FORMAT_MINOR,
+	PROP_FORMAT_SUBTYPE,
+	PROP_FORMAT_PART,
+	PROP_FORMAT_CONF,
 	PROP_AUTHOR,
 	PROP_SUBJECT,
 	PROP_KEYWORDS,
@@ -808,6 +811,76 @@ convert_page_mode (Catalog::PageMode pageMode)
     }
 }
 
+static PopplerPDFSubtype
+convert_pdf_subtype (PDFSubtype pdfSubtype)
+{
+  switch (pdfSubtype)
+    {
+    case subtypePDFA:
+      return POPPLER_PDF_SUBTYPE_PDF_A;
+    case subtypePDFE:
+      return POPPLER_PDF_SUBTYPE_PDF_E;
+    case subtypePDFUA:
+      return POPPLER_PDF_SUBTYPE_PDF_UA;
+    case subtypePDFVT:
+      return POPPLER_PDF_SUBTYPE_PDF_VT;
+    case subtypePDFX:
+      return POPPLER_PDF_SUBTYPE_PDF_X;
+    case subtypeNone:
+      return POPPLER_PDF_SUBTYPE_NONE;
+    default:
+      return POPPLER_PDF_SUBTYPE_UNSET;
+    }
+}
+
+static PopplerPDFPart
+convert_pdf_subtype_part (PDFSubtypePart pdfSubtypePart)
+{
+  switch (pdfSubtypePart)
+    {
+    case subtypePart1:
+      return POPPLER_PDF_SUBTYPE_PART_1;
+    case subtypePart2:
+      return POPPLER_PDF_SUBTYPE_PART_2;
+    case subtypePart3:
+      return POPPLER_PDF_SUBTYPE_PART_3;
+    case subtypePart4:
+      return POPPLER_PDF_SUBTYPE_PART_4;
+    case subtypePart5:
+      return POPPLER_PDF_SUBTYPE_PART_5;
+    case subtypePartNone:
+      return POPPLER_PDF_SUBTYPE_PART_NONE;
+    default:
+      return POPPLER_PDF_SUBTYPE_PART_UNSET;
+    }
+}
+
+static PopplerPDFConformance
+convert_pdf_subtype_conformance (PDFSubtypeConformance pdfSubtypeConf)
+{
+  switch (pdfSubtypeConf)
+    {
+    case subtypeConfA:
+      return POPPLER_PDF_SUBTYPE_CONF_A;
+    case subtypeConfB:
+      return POPPLER_PDF_SUBTYPE_CONF_B;
+    case subtypeConfG:
+      return POPPLER_PDF_SUBTYPE_CONF_G;
+    case subtypeConfN:
+      return POPPLER_PDF_SUBTYPE_CONF_N;
+    case subtypeConfP:
+      return POPPLER_PDF_SUBTYPE_CONF_P;
+    case subtypeConfPG:
+      return POPPLER_PDF_SUBTYPE_CONF_PG;
+    case subtypeConfU:
+      return POPPLER_PDF_SUBTYPE_CONF_U;
+    case subtypeConfNone:
+      return POPPLER_PDF_SUBTYPE_CONF_NONE;
+    default:
+      return POPPLER_PDF_SUBTYPE_CONF_UNSET;
+    }
+}
+
 /**
  * poppler_document_get_pdf_version_string:
  * @document: A #PopplerDocument
@@ -1352,6 +1425,61 @@ poppler_document_get_permissions (PopplerDocument *document)
   return (PopplerPermissions)flag;
 }
 
+/**
+ * poppler_document_get_pdf_subtype:
+ * @document: A #PopplerDocument
+ *
+ * Returns a #PopplerPDFSubtype with the subtype of the PDF document.
+ *
+ * Return value: a #PopplerPDFSubtype enumerator
+ *
+ * Since: 0.68
+ **/
+PopplerPDFSubtype
+poppler_document_get_pdf_subtype (PopplerDocument *document)
+{
+  g_return_val_if_fail (POPPLER_IS_DOCUMENT (document), POPPLER_PDF_SUBTYPE_NONE);
+
+  return convert_pdf_subtype (document->doc->getPDFSubtype ());
+}
+
+/**
+ * poppler_document_get_pdf_part:
+ * @document: A #PopplerDocument
+ *
+ * Returns a #PopplerPDFPart showing the part of the conforming standard that
+ * the @document adheres to.
+ *
+ * Return value: a #PopplerPDFPart enumerator
+ *
+ * Since: 0.68
+ **/
+PopplerPDFPart
+poppler_document_get_pdf_part (PopplerDocument *document)
+{
+  g_return_val_if_fail (POPPLER_IS_DOCUMENT (document), POPPLER_PDF_SUBTYPE_PART_NONE);
+
+  return convert_pdf_subtype_part (document->doc->getPDFSubtypePart ());
+}
+
+/**
+ * poppler_document_get_pdf_conformance:
+ * @document: A #PopplerDocument
+ *
+ * Returns a #PopplerPDFConformance based on the conformance level of the PDF document.
+ *
+ * Return value: a #PopplerPDFConformance enumerator
+ *
+ * Since: 0.68
+ **/
+PopplerPDFConformance
+poppler_document_get_pdf_conformance (PopplerDocument *document)
+{
+  g_return_val_if_fail (POPPLER_IS_DOCUMENT (document), POPPLER_PDF_SUBTYPE_CONF_NONE);
+
+  return convert_pdf_subtype_conformance (document->doc->getPDFSubtypeConformance ());
+}
+
 /**
  * poppler_document_get_metadata:
  * @document: A #PopplerDocument
@@ -1446,6 +1574,15 @@ poppler_document_get_property (GObject *object,
     case PROP_PERMISSIONS:
       g_value_set_flags (value, poppler_document_get_permissions (document));
       break;
+    case PROP_FORMAT_SUBTYPE:
+      g_value_set_enum (value, poppler_document_get_pdf_subtype (document));
+      break;
+    case PROP_FORMAT_PART:
+      g_value_set_enum (value, poppler_document_get_pdf_part (document));
+      break;
+    case PROP_FORMAT_CONF:
+      g_value_set_enum (value, poppler_document_get_pdf_conformance (document));
+      break;
     case PROP_METADATA:
       g_value_take_string (value, poppler_document_get_metadata (document));
       break;
@@ -1712,6 +1849,48 @@ poppler_document_class_init (PopplerDocumentClass *klass)
                                                        POPPLER_PERMISSIONS_FULL,
                                                        G_PARAM_READABLE));
 
+  /**
+   * PopplerDocument:format-subtype:
+   *
+   * Document PDF subtype
+   */
+  g_object_class_install_property (G_OBJECT_CLASS (klass),
+                                   PROP_FORMAT_SUBTYPE,
+                                   g_param_spec_enum ("format-subtype",
+                                                      "PDF Format Subtype",
+                                                      "The PDF subtype of the document (i.e. PDF/A)",
+                                                      POPPLER_TYPE_PDF_SUBTYPE,
+                                                      POPPLER_PDF_SUBTYPE_UNSET,
+                                                      G_PARAM_READABLE));
+
+  /**
+   * PopplerDocument:format-part:
+   *
+   * Document PDF subtype part
+   */
+  g_object_class_install_property (G_OBJECT_CLASS (klass),
+                                   PROP_FORMAT_PART,
+                                   g_param_spec_enum ("format-part",
+                                                      "PDF Format Part",
+                                                      "The part of PDF conformance",
+                                                      POPPLER_TYPE_PDF_PART,
+                                                      POPPLER_PDF_SUBTYPE_PART_UNSET,
+                                                      G_PARAM_READABLE));
+
+  /**
+   * PopplerDocument:format-conformance:
+   *
+   * Document PDF subtype conformance
+   */
+  g_object_class_install_property (G_OBJECT_CLASS (klass),
+                                   PROP_FORMAT_CONF,
+                                   g_param_spec_enum ("format-conformance",
+                                                      "PDF Format Conformance",
+                                                      "The conformance level of PDF subtype",
+                                                      POPPLER_TYPE_PDF_CONFORMANCE,
+                                                      POPPLER_PDF_SUBTYPE_CONF_UNSET,
+                                                      G_PARAM_READABLE));
+
   /**
    * PopplerDocument:metadata:
    *
diff --git a/glib/poppler-document.h b/glib/poppler-document.h
index a7fcea1d..a034d7fe 100644
--- a/glib/poppler-document.h
+++ b/glib/poppler-document.h
@@ -165,7 +165,78 @@ typedef enum /*< flags >*/
 
 } PopplerPermissions;
 
+/**
+ * PopplerPDFSubtype:
+ * @POPPLER_PDF_SUBTYPE_UNSET: Null
+ * @POPPLER_PDF_SUBTYPE_PDF_A: ISO 19005 - Document management -- Electronic document file format for long-term preservation (PDF/A)
+ * @POPPLER_PDF_SUBTYPE_PDF_E: ISO 24517 - Document management -- Engineering document format using PDF (PDF/E)
+ * @POPPLER_PDF_SUBTYPE_PDF_UA: ISO 14289 - Document management applications -- Electronic document file format enhancement for accessibility (PDF/UA)
+ * @POPPLER_PDF_SUBTYPE_PDF_VT: ISO 16612 - Graphic technology -- Variable data exchange (PDF/VT)
+ * @POPPLER_PDF_SUBTYPE_PDF_X: ISO 15930 - Graphic technology -- Prepress digital data exchange (PDF/X)
+ * @POPPLER_PDF_SUBTYPE_NONE: PDF is not compliant with the above standards
+ *
+ * PDF Subtype
+ */
+typedef enum
+{
+  POPPLER_PDF_SUBTYPE_UNSET,
+  POPPLER_PDF_SUBTYPE_PDF_A,
+  POPPLER_PDF_SUBTYPE_PDF_E,
+  POPPLER_PDF_SUBTYPE_PDF_UA,
+  POPPLER_PDF_SUBTYPE_PDF_VT,
+  POPPLER_PDF_SUBTYPE_PDF_X,
+  POPPLER_PDF_SUBTYPE_NONE
+} PopplerPDFSubtype;
+
+/**
+ * PopplerPDFPart:
+ * @POPPLER_PDF_SUBTYPE_PART_UNSET: Null
+ * @POPPLER_PDF_SUBTYPE_PART_1: 1
+ * @POPPLER_PDF_SUBTYPE_PART_2: 2
+ * @POPPLER_PDF_SUBTYPE_PART_3: 3
+ * @POPPLER_PDF_SUBTYPE_PART_4: 4
+ * @POPPLER_PDF_SUBTYPE_PART_5: 5
+ * @POPPLER_PDF_SUBTYPE_PART_NONE: No part available
+ *
+ * PDF Subtype Part
+ */
+typedef enum
+{
+  POPPLER_PDF_SUBTYPE_PART_UNSET,
+  POPPLER_PDF_SUBTYPE_PART_1,
+  POPPLER_PDF_SUBTYPE_PART_2,
+  POPPLER_PDF_SUBTYPE_PART_3,
+  POPPLER_PDF_SUBTYPE_PART_4,
+  POPPLER_PDF_SUBTYPE_PART_5,
+  POPPLER_PDF_SUBTYPE_PART_NONE
+} PopplerPDFPart;
 
+/**
+ * PopplerPDFConformance:
+ * @POPPLER_PDF_SUBTYPE_CONF_UNSET: Null
+ * @POPPLER_PDF_SUBTYPE_CONF_A: Level A (accessible) conformance (PDF/A)
+ * @POPPLER_PDF_SUBTYPE_CONF_B: Level B (basic) conformance (PDF/A)
+ * @POPPLER_PDF_SUBTYPE_CONF_G: Level G (external graphical content) (PDF/X)
+ * @POPPLER_PDF_SUBTYPE_CONF_N: Level N (external ICC Profile) (PDF/X)
+ * @POPPLER_PDF_SUBTYPE_CONF_P: Level P (ICC Profile) (PDF/X)
+ * @POPPLER_PDF_SUBTYPE_CONF_PG: Level PG (conjunction of P and G) (PDF/X)
+ * @POPPLER_PDF_SUBTYPE_CONF_U: Level U (Unicode) conformance (PDF/A)
+ * @POPPLER_PDF_SUBTYPE_CONF_NONE: No conformance level available
+ *
+ * PDF Subtype Conformance
+ */
+typedef enum
+{
+  POPPLER_PDF_SUBTYPE_CONF_UNSET,
+  POPPLER_PDF_SUBTYPE_CONF_A,
+  POPPLER_PDF_SUBTYPE_CONF_B,
+  POPPLER_PDF_SUBTYPE_CONF_G,
+  POPPLER_PDF_SUBTYPE_CONF_N,
+  POPPLER_PDF_SUBTYPE_CONF_P,
+  POPPLER_PDF_SUBTYPE_CONF_PG,
+  POPPLER_PDF_SUBTYPE_CONF_U,
+  POPPLER_PDF_SUBTYPE_CONF_NONE
+} PopplerPDFConformance;
 
 GType              poppler_document_get_type               (void) G_GNUC_CONST;
 PopplerDocument   *poppler_document_new_from_file          (const char      *uri,
@@ -230,6 +301,9 @@ gboolean           poppler_document_is_linearized          (PopplerDocument *doc
 PopplerPageLayout  poppler_document_get_page_layout        (PopplerDocument *document);
 PopplerPageMode    poppler_document_get_page_mode          (PopplerDocument *document);
 PopplerPermissions poppler_document_get_permissions        (PopplerDocument *document);
+PopplerPDFSubtype  poppler_document_get_pdf_subtype        (PopplerDocument *document);
+PopplerPDFPart     poppler_document_get_pdf_part           (PopplerDocument *document);
+PopplerPDFConformance poppler_document_get_pdf_conformance (PopplerDocument *document);
 gchar             *poppler_document_get_metadata           (PopplerDocument *document);
 
 /* Attachments */
diff --git a/poppler/PDFDoc.cc b/poppler/PDFDoc.cc
index 631f9a70..0b34d8b6 100644
--- a/poppler/PDFDoc.cc
+++ b/poppler/PDFDoc.cc
@@ -40,6 +40,7 @@
 // Copyright (C) 2018 Ben Timby
 // Copyright (C) 2018 Evangelos Foutras
 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, . Work sponsored by the LiMux project of the city of Munich
+// Copyright (C) 2018 Evangelos Rigas
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -62,6 +63,7 @@
 #include <stddef.h>
 #include <string.h>
 #include <time.h>
+#include <regex>
 #include <sys/stat.h>
 #include "goo/glibc.h"
 #include "goo/gstrtod.h"
@@ -318,6 +320,9 @@ GBool PDFDoc::setup(GooString *ownerPassword, GooString *userPassword) {
     }
   }
 
+  // Extract PDF Subtype information
+  extractPDFSubtype();
+
   // done
   return gTrue;
 }
@@ -482,6 +487,108 @@ GBool PDFDoc::checkEncryption(GooString *ownerPassword, GooString *userPassword)
   return ret;
 }
 
+void PDFDoc::extractPDFSubtype() {
+  pdfSubtype = subtypeNull;
+  pdfPart = subtypePartNull;
+  pdfConformance = subtypeConfNull;
+
+  // Info dictionary keys that announce a PDF subtype, in lookup order
+  static const struct {
+    const char *key;
+    PDFSubtype subtype;
+  } subtypeKeys[] = {
+    { "GTS_PDFA1Version", subtypePDFA },
+    { "GTS_PDFEVersion", subtypePDFE },
+    { "GTS_PDFUAVersion", subtypePDFUA },
+    { "GTS_PDFVTVersion", subtypePDFVT },
+    { "GTS_PDFXVersion", subtypePDFX }
+  };
+
+  // Look each key up only once: getDocInfoStringEntry() hands ownership of
+  // the returned string to the caller, so it has to be deleted below
+  GooString *pdfSubtypeVersion = nullptr;
+  for (const auto &entry : subtypeKeys) {
+    pdfSubtypeVersion = getDocInfoStringEntry(entry.key);
+    if (pdfSubtypeVersion != nullptr) {
+      pdfSubtype = entry.subtype;
+      break;
+    }
+  }
+
+  if (pdfSubtypeVersion == nullptr) {
+    // No subtype key in the Info dictionary
+    pdfSubtype = subtypeNone;
+    pdfPart = subtypePartNone;
+    pdfConformance = subtypeConfNone;
+    return;
+  }
+
+  // Copy the version string so the GooString can be released right away
+  const std::string pdfsubver(pdfSubtypeVersion->getCString(),
+                              pdfSubtypeVersion->getLength());
+  delete pdfSubtypeVersion;
+
+  // Execute regex to extract PDF part and conformance levels
+  const std::regex regex("PDF/(A|X|VT|E|UA)-([[:digit:]])([[:alpha:]]+)?", std::regex::extended);
+  std::smatch match;
+  if (!std::regex_search(pdfsubver, match, regex)) {
+    // Unrecognised version string: part and conformance stay Null
+    return;
+  }
+
+  // Second match contains the PDF part (1, 2, 3, 4 or 5)
+  if (match[2].matched) {
+    switch (std::stoi(match.str(2))) {
+      case 1:
+        pdfPart = subtypePart1;
+        break;
+      case 2:
+        pdfPart = subtypePart2;
+        break;
+      case 3:
+        pdfPart = subtypePart3;
+        break;
+      case 4:
+        pdfPart = subtypePart4;
+        break;
+      case 5:
+        pdfPart = subtypePart5;
+        break;
+      default:
+        pdfPart = subtypePartNone;
+        break;
+    }
+  } else {
+    pdfPart = subtypePartNone;
+  }
+
+  // Third match contains the PDF conformance (A, B, G, N, P, PG or U)
+  if (match[3].matched) {
+    // Lowercase a stack copy, as the conformance may appear in both cases
+    GooString conf(match.str(3).c_str());
+    conf.lowerCase();
+    if (conf.cmp("a") == 0) {
+      pdfConformance = subtypeConfA;
+    } else if (conf.cmp("b") == 0) {
+      pdfConformance = subtypeConfB;
+    } else if (conf.cmp("g") == 0) {
+      pdfConformance = subtypeConfG;
+    } else if (conf.cmp("n") == 0) {
+      pdfConformance = subtypeConfN;
+    } else if (conf.cmp("p") == 0) {
+      pdfConformance = subtypeConfP;
+    } else if (conf.cmp("pg") == 0) {
+      pdfConformance = subtypeConfPG;
+    } else if (conf.cmp("u") == 0) {
+      pdfConformance = subtypeConfU;
+    } else {
+      pdfConformance = subtypeConfNone;
+    }
+  } else {
+    pdfConformance = subtypeConfNone;
+  }
+}
+
 std::vector<FormWidgetSignature*> PDFDoc::getSignatureWidgets()
 {
   int num_pages = getNumPages();
diff --git a/poppler/PDFDoc.h b/poppler/PDFDoc.h
index 1678d167..f79e034d 100644
--- a/poppler/PDFDoc.h
+++ b/poppler/PDFDoc.h
@@ -31,6 +31,7 @@
 // Copyright (C) 2015 André Esser
 // Copyright (C) 2016 Jakub Alba
 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, . Work sponsored by the LiMux project of the city of Munich
+// Copyright (C) 2018 Evangelos Rigas
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -74,6 +75,38 @@ enum PDFWriteMode {
   writeForceIncremental
 };
 
+enum PDFSubtype {
+  subtypeNull,
+  subtypePDFA,
+  subtypePDFE,
+  subtypePDFUA,
+  subtypePDFVT,
+  subtypePDFX,
+  subtypeNone
+};
+
+enum PDFSubtypePart {
+  subtypePartNull,
+  subtypePart1,
+  subtypePart2,
+  subtypePart3,
+  subtypePart4,
+  subtypePart5,
+  subtypePartNone
+};
+
+enum PDFSubtypeConformance {
+  subtypeConfNull,
+  subtypeConfA,
+  subtypeConfB,
+  subtypeConfG,
+  subtypeConfN,
+  subtypeConfP,
+  subtypeConfPG,
+  subtypeConfU,
+  subtypeConfNone
+};
+
 //------------------------------------------------------------------------
 // PDFDoc
 //------------------------------------------------------------------------
@@ -273,6 +306,11 @@ public:
   GooString *getDocInfoCreatDate() { return getDocInfoStringEntry("CreationDate"); }
   GooString *getDocInfoModDate() { return getDocInfoStringEntry("ModDate"); }
 
+  // Return the PDF subtype, part, and conformance
+  PDFSubtype getPDFSubtype() { return pdfSubtype; }
+  PDFSubtypePart getPDFSubtypePart() { return pdfPart; }
+  PDFSubtypeConformance getPDFSubtypeConformance() { return pdfConformance; }
+
   // Return the PDF version specified by the file.
   int getPDFMajorVersion() { return pdfMajorVersion; }
   int getPDFMinorVersion() { return pdfMinorVersion; }
@@ -346,6 +384,7 @@ private:
   GBool checkFooter();
   void checkHeader();
   GBool checkEncryption(GooString *ownerPassword, GooString *userPassword);
+  void extractPDFSubtype();
   // Get the offset of the start xref table.
   Goffset getStartXRef(GBool tryingToReconstruct = gFalse);
   // Get the offset of the entries in the main XRef table of a
@@ -365,6 +404,9 @@ private:
   void *guiData;
   int pdfMajorVersion;
   int pdfMinorVersion;
+  PDFSubtype pdfSubtype;
+  PDFSubtypePart pdfPart;
+  PDFSubtypeConformance pdfConformance;
   Linearization *linearization;
   // linearizationState = 0: unchecked
   // linearizationState = 1: checked and valid
-- 
2.18.0