From 8e27b343e0b8a7e8710bdc7748ddac3a02b0a45c Mon Sep 17 00:00:00 2001 From: ulatekh Date: Sun, 29 Jul 2018 09:07:42 -0700 Subject: [PATCH] Now pdftohtml takes a "-pg" parameter, with a list of page ranges. This is more flexible and powerful than -f and -l, which only allow one page range. --- utils/pdftohtml.cc | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 3 deletions(-) diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index 04aeb1bc..32a2d265 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -66,9 +66,13 @@ #include "DateInfo.h" #include "goo/gfile.h" #include "Win32Console.h" +#include +#include static int firstPage = 1; static int lastPage = 0; +static GooString pageRange; +static boost::icl::interval_set setPageRanges; static GBool rawOrder = gTrue; GBool printCommands = gTrue; static GBool printHelp = gFalse; @@ -102,6 +106,8 @@ static const ArgDesc argDesc[] = { "first page to convert"}, {"-l", argInt, &lastPage, 0, "last page to convert"}, + {"-pg", argGooString,&pageRange, 0, + "range of pages to convert (e.g. \"1-2,5-6,19-\")"}, /*{"-raw", argFlag, &rawOrder, 0, "keep strings in content stream order"},*/ {"-q", argFlag, &errQuiet, 0, @@ -291,7 +297,7 @@ int main(int argc, char *argv[]) { } delete tmp; } else if (fileName->cmp("fd://0") == 0) { - error(errCommandLine, -1, "You have to provide an output filename when reading form stdin."); + error(errCommandLine, -1, "You have to provide an output filename when reading from stdin."); goto error; } else { p = fileName->getCString() + fileName->getLength() - 4; @@ -337,6 +343,98 @@ int main(int argc, char *argv[]) { goto error; } + // get page ranges + if (pageRange.getLength() == 0) + { + setPageRanges.add(boost::icl::interval::type( + firstPage, lastPage + 1)); + } + else + { + // Prepare to build a set of page-ranges from a + // comma-delimited string. 
+ std::vector vecPageRanges; + char const *pszPageRanges = pageRange.getCString(); + int iPageCount = doc->getNumPages(); + boost::split(vecPageRanges, pszPageRanges, + boost::is_any_of(",")); + for(std::string const &rstrPageRange : vecPageRanges) + { + // A range is two numbers, separated by a dash. + // If there's no first number, the beginning of + // the range is the first page in the document. + // If there's no second number, the end of the + // range is the last page in the document. + std::vector vecPageRange; + char const *pszPageRange = rstrPageRange.c_str(); + boost::split(vecPageRange, pszPageRange, + boost::is_any_of("-")); + if (vecPageRange.size() < 1 || vecPageRange.size() > 2) + { + error(errCommandLine, -1, + "Invalid page range given: \"{0:s}\"", + pszPageRange); + goto error; + } + + // Parse the first number in the range. + // If it's empty, it's the first page in the document. + int iFirstPage = 0; + { + std::string const &rstrFirstPage = vecPageRange[0]; + char const *pszFirstPage = rstrFirstPage.c_str(); + char const *pszFirstPageEnd + = pszFirstPage + rstrFirstPage.size(); + if (pszFirstPage == pszFirstPageEnd) + iFirstPage = 1; + else + { + char /* const */ *pszValueEnd = nullptr; + iFirstPage = ::strtol(pszFirstPage, &pszValueEnd, 10); + if (pszValueEnd != pszFirstPageEnd) + { + error(errCommandLine, -1, + "Page-range value \"{0:s}\" is not a valid number", + pszFirstPage); + goto error; + } + } + } + + // Parse the last number in the range. + // If it's missing, it's a one-page range. + // If it's empty, it's the last page in the document. 
+ int iLastPage = 0; + if (vecPageRange.size() < 2) + iLastPage = iFirstPage; + else + { + std::string const &rstrLastPage = vecPageRange[1]; + char const *pszLastPage = rstrLastPage.c_str(); + char const *pszLastPageEnd + = pszLastPage + rstrLastPage.size(); + if (pszLastPage == pszLastPageEnd) + iLastPage = iPageCount; + else + { + char /* const */ *pszValueEnd = nullptr; + iLastPage = ::strtol(pszLastPage, &pszValueEnd, 10); + if (pszValueEnd != pszLastPageEnd) + { + error(errCommandLine, -1, + "Page-range value \"{0:s}\" is not a valid number", + pszLastPage); + goto error; + } + } + } + + // Accumulate this range of pages. + setPageRanges.add(boost::icl::interval::type( + iFirstPage, iLastPage + 1)); + } + } + info = doc->getDocInfo(); if (info.isDict()) { docTitle = getInfoString(info.getDict(), "Title"); @@ -390,8 +488,13 @@ int main(int argc, char *argv[]) { if (htmlOut->isOk()) { - doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, - gTrue, gFalse, gFalse); + for (auto const &rPageRange : setPageRanges) + { + int iFirstPage = rPageRange.lower(); + int iLastPage = rPageRange.upper() - 1; + doc->displayPages(htmlOut, iFirstPage, iLastPage, 72 * scale, 72 * scale, 0, + gTrue, gFalse, gFalse); + } htmlOut->dumpDocOutline(doc); } -- 2.14.4