From 653056bd23e5850e56e84d692625fb9c0d58dc21 Mon Sep 17 00:00:00 2001 From: ulatekh Date: Sat, 4 Aug 2018 11:46:13 -0700 Subject: [PATCH] Now pdftohtml takes a "-pg" parameter, with a list of page ranges. This is more flexible and powerful than -f and -l, which only allows one page range. --- goo/GooIntervalSet.h | 304 +++++++++++++++++++++++++++++++++++++++++++++++++++ utils/pdftohtml.1 | 3 + utils/pdftohtml.cc | 126 ++++++++++++++++++++- 3 files changed, 430 insertions(+), 3 deletions(-) create mode 100644 goo/GooIntervalSet.h diff --git a/goo/GooIntervalSet.h b/goo/GooIntervalSet.h new file mode 100644 index 00000000..7e1111e6 --- /dev/null +++ b/goo/GooIntervalSet.h @@ -0,0 +1,304 @@ +#ifndef GooIntervalSet_H +#define GooIntervalSet_H + +// This file (C) 2004-2009,2018 Steven Boswell. All rights reserved. +// Released to the public under the GNU General Public License v2. +// See the file COPYING for more information. + +// GooIntervalSet tracks a 1-dimensional region of arbitrary values. +// It's implemented by a sorted set of Extents. +// It's a cut-down version of the SetRegion2D class from the y4mdenoise +// tool in the mjpegtools project. + +#include +#include + + + +// The interval-set class. Parameterized by the numeric type to use for +// point indices, and the numeric type to use to count the contained +// number of points. +template +class GooIntervalSet +{ +public: + // The type of each extent: a (start, end) pair, where end is one past the last point of the extent. + typedef std::pair Extent; + + // The container type that stores our extents. + typedef std::set > Extents; + +private: + // The extents that make up the interval-set. + Extents m_setExtents; + +public: + // Default constructor. Creates an empty interval-set; the extent container is built with the given allocator. + GooIntervalSet(std::allocator const &a_rAlloc + = std::allocator()); + + // Don't allow copy or assignment. 
+ GooIntervalSet ( + const GooIntervalSet &a_rOther) = delete; + GooIntervalSet ( + GooIntervalSet &&a_rOther) = delete; + GooIntervalSet const &operator = ( + const GooIntervalSet &a_rOther) = delete; + GooIntervalSet &operator = ( + const GooIntervalSet &&a_rOther) = delete; + + // Destructor. + virtual ~GooIntervalSet(); + + // Return the total number of points contained by the + // interval-set. + inline SIZE getNumberOfPoints (void) const; + + // Clear the interval-set, emptying it of all extents. + void clear (void); + + // Add the given extent to the interval-set. + // Note that a_tnEnd is technically one past the end of the + // extent. + void add (INDEX a_tnStart, INDEX a_tnEnd); + + // Allow our client to iterate through the extents & get their + // values. + typedef typename Extents::iterator iterator; + typedef typename Extents::const_iterator const_iterator; + const_iterator begin (void) const { return m_setExtents.begin(); } + const_iterator end (void) const { return m_setExtents.end(); } + + // Return the position of the first extent whose beginning + // is >= the given point. + const_iterator lower_bound (INDEX a_tnX) const + { Extent oExtent (a_tnX, a_tnX); + return m_setExtents.lower_bound (oExtent); } + + // Return the position of the first extent whose beginning + // is > the given point. NOTE(review): the search key is (a_tnX, a_tnX), so an extent that begins at a_tnX and ends later may also compare above the key -- confirm against the set's ordering. + const_iterator upper_bound (INDEX a_tnX) const + { Extent oExtent (a_tnX, a_tnX); + return m_setExtents.upper_bound (oExtent); } + +private: + // The total number of points contained by the interval-set. + SIZE m_tnPoints; +}; + + + +// Default constructor. +template +GooIntervalSet::GooIntervalSet ( + std::allocator const &a_rAlloc) + : m_setExtents (std::less(), a_rAlloc) +{ + // No points yet. + m_tnPoints = 0; +} + + + +// Destructor. +template +GooIntervalSet::~GooIntervalSet() +{ +} + + + +// Return the total number of points contained by the interval-set. 
+template +inline SIZE +GooIntervalSet::getNumberOfPoints (void) const +{ + // Just report the separately-maintained running total. + return m_tnPoints; +} + + + +// Clear the interval-set, emptying it of all extents. +template +void +GooIntervalSet::clear (void) +{ + // Drop every extent and reset the point count. + m_setExtents.clear(); + m_tnPoints = 0; +} + + + +// Add the extent [a_tnStart, a_tnEnd) to the interval-set, folding in any existing extents it overlaps. +template +void +GooIntervalSet::add (INDEX a_tnStart, INDEX a_tnEnd) +{ + Extent oKey; + // An extent being searched for. + Extent oInserted; + // The extent being added, modified to account for the extents + // already present in the interval-set. + typename Extents::iterator itStart, itEnd; + // The range of existing extents that gets removed because of + // the addition of the new extent. + typename Extents::iterator itHere; + // An extent being examined and/or modified. + + // Make sure they gave us a non-empty extent. + assert (a_tnStart < a_tnEnd); + + // The extent we'll be inserting starts as the extent they asked + // to add. That may get modified based on the nature of the extents + // already present in the interval-set. + oInserted.first = a_tnStart; + oInserted.second = a_tnEnd; + + // If there are no existing extents, just add it. NOTE(review): this path returns without increasing m_tnPoints, so getNumberOfPoints() does not count an extent added to an empty set -- looks unintended. + if (m_setExtents.empty()) + { + #ifndef NDEBUG + std::pair oInsertResult = + #endif // NDEBUG + m_setExtents.insert (oInserted); + assert (oInsertResult.second); + + return; + } + + // Find the first extent that may get removed because of this new + // extent. + // (That's the one before the first existing extent + // that's > a_tnStart.) NOTE(review): if upper_bound() returns begin(), the decrement below moves before begin(), which std::set does not define; the end() test that follows seems to assume it yields end(). + oKey.first = a_tnStart; + itStart = m_setExtents.upper_bound (oKey); + --itStart; + + // Does the found extent intersect the new extent? + if (itStart != m_setExtents.end() + && (*itStart).second >= a_tnStart) + { + // The found extent intersects with the new extent. + + // If the found extent contains the new extent, exit now; + // the interval-set already contains the new extent, and no + // modifications are necessary. 
+ if ((*itStart).second >= a_tnEnd) + return; + + // The found extent will be removed, and the inserted extent + // will start in the same location (which we know is less than + // or equal to the inserted extent's current start, thanks to + // the search we did earlier). + oInserted.first = (*itStart).first; + + // If the next extent in the interval-set doesn't intersect the + // one being added, we can modify this extent & be done, without + // having to do a 2nd upper-bound search. + itEnd = itStart; + ++itEnd; + if (itEnd == m_setExtents.end() + || (*itEnd).first > a_tnEnd) + { + // We can modify this one extent & be done. Keep the + // largest end. + if (oInserted.second < (*itStart).second) + oInserted.second = (*itStart).second; + + // Adjust the number of points we contain. + m_tnPoints += (oInserted.second - oInserted.first) + - ((*itStart).second - (*itStart).first); + + // Modify the extent. + // (Erase + hinted re-insert stands in for '*itStart = oInserted'.) + typename Extents::iterator itHint = itStart; + ++itHint; + m_setExtents.erase(itStart); + m_setExtents.insert(itHint, oInserted); + + // We're done. + return; + } + } + else + { + // The found extent doesn't intersect with the new extent. + // Therefore, it won't get modified or removed by the addition + // of the new extent. Move past it. + ++itStart; + } + + // Find the last extent that may get removed because of this new + // extent. (Start by searching for the first existing extent + // that's > a_tnEnd, then move back one. NOTE(review): the same begin() decrement concern applies to --itEnd below.) + oKey.first = a_tnEnd; + itEnd = m_setExtents.upper_bound (oKey); + --itEnd; + + // Does the found extent intersect the new extent? + if (itEnd != m_setExtents.end() + && (*itEnd).first <= a_tnEnd) + { + // Yes. That extent will get replaced, and its endpoint may be + // used by the inserted extent. + if (oInserted.second < (*itEnd).second) + oInserted.second = (*itEnd).second; + } + + // In either case, move ahead again, to get back to the end of the + // range we'll be removing. 
+ ++itEnd; + + // We now have the actual extent to be inserted, and the range of + // existing extents to be removed. + + // Run through the extents to be removed, count the number of points + // they represent, and subtract that from our separately-maintained + // total number of points in the interval-set. + for (itHere = itStart; itHere != itEnd; ++itHere) + m_tnPoints -= (*itHere).second - (*itHere).first; + + // If the range to be replaced has at least one existing item, then + // move the start of the range forward by one, and overwrite the + // old start with the extent to be inserted, i.e. avoid a memory + // allocation if at all possible. NOTE(review): the code below now erases and re-inserts instead of overwriting, so this rationale may no longer hold. + if (itStart != itEnd) + { + // Keep track of the location of the extent to be overwritten. + itHere = itStart; + + // Move past the extent that'll be overwritten. + ++itStart; + + // Remove all extents that were found to conflict with the one + // being inserted. + m_setExtents.erase (itStart, itEnd); + + // Store the new extent. + // (Erase + hinted re-insert stands in for '*itHere = oInserted'.) + typename Extents::iterator itHint = itHere; + ++itHint; + m_setExtents.erase(itHere); + m_setExtents.insert(itHint, oInserted); + } + + // Otherwise, no extents are being removed, and we just insert the + // new extent. + else + { + #ifndef NDEBUG + std::pair oInsertResult = + #endif // NDEBUG + m_setExtents.insert (oInserted); + assert (oInsertResult.second); + } + + // The interval-set now contains this many more points. + m_tnPoints += oInserted.second - oInserted.first; +} + + + +#endif // GooIntervalSet_H diff --git a/utils/pdftohtml.1 b/utils/pdftohtml.1 index 5de42880..96096a09 100644 --- a/utils/pdftohtml.1 +++ b/utils/pdftohtml.1 @@ -28,6 +28,9 @@ first page to print .B \-l last page to print .TP +.B \-pg +range of pages to convert (e.g. 
"1-2,5-6,19-") +.TP .B \-q do not print any messages or errors .TP diff --git a/utils/pdftohtml.cc b/utils/pdftohtml.cc index 04aeb1bc..3ef935ad 100644 --- a/utils/pdftohtml.cc +++ b/utils/pdftohtml.cc @@ -43,6 +43,7 @@ #endif #include #include "parseargs.h" +#include "goo/GooIntervalSet.h" #include "goo/GooString.h" #include "goo/gmem.h" #include "Object.h" @@ -69,6 +70,8 @@ static int firstPage = 1; static int lastPage = 0; +static GooString pageRange; +static GooIntervalSet setPageRanges; static GBool rawOrder = gTrue; GBool printCommands = gTrue; static GBool printHelp = gFalse; @@ -102,6 +105,8 @@ static const ArgDesc argDesc[] = { "first page to convert"}, {"-l", argInt, &lastPage, 0, "last page to convert"}, + {"-pg", argGooString,&pageRange, 0, + "range of pages to convert (e.g. \"1-2,5-6,19-\")"}, /*{"-raw", argFlag, &rawOrder, 0, "keep strings in content stream order"},*/ {"-q", argFlag, &errQuiet, 0, @@ -291,7 +296,7 @@ int main(int argc, char *argv[]) { } delete tmp; } else if (fileName->cmp("fd://0") == 0) { - error(errCommandLine, -1, "You have to provide an output filename when reading form stdin."); + error(errCommandLine, -1, "You have to provide an output filename when reading from stdin."); goto error; } else { p = fileName->getCString() + fileName->getLength() - 4; @@ -337,6 +342,116 @@ int main(int argc, char *argv[]) { goto error; } + // get page ranges + if (pageRange.getLength() == 0) + { + setPageRanges.add(firstPage, lastPage + 1); + } + else + { + // Prepare to build a set of page-ranges from a + // comma-delimited string. + std::vector vecPageRanges; + char *pszPageRanges = pageRange.getCString(); + char *pszPageRangesSave = nullptr; + int iPageCount = doc->getNumPages(); + for(;;) + { + char *pszPageRange = strtok_r(pszPageRanges, ",", + &pszPageRangesSave); + if (pszPageRange == nullptr) + break; + pszPageRanges = nullptr; + + // A range is two numbers, separated by a dash. 
+ std::vector vecPageRange; + char *pszPageNumber = pszPageRange; + char *pszPageNumberSave = nullptr; + // (If there's no first number, the beginning of + // the range is the first page in the document.) + if (pszPageRange[0] == '-') + vecPageRange.push_back(std::string()); + // (If there's no second number, the end of the + // range is the last page in the document.) This is checked before the strtok_r() loop below starts splitting pszPageRange. + bool bNoSecondNumber = (pszPageRange[strlen(pszPageRange)-1] + == '-'); + for(;;) + { + pszPageNumber = strtok_r(pszPageNumber, "-", + &pszPageNumberSave); + if (pszPageNumber == nullptr) + break; + vecPageRange.push_back(pszPageNumber); + pszPageNumber = nullptr; + } + if (bNoSecondNumber) + vecPageRange.push_back(std::string()); + + if (vecPageRange.size() < 1 || vecPageRange.size() > 2) + { + error(errCommandLine, -1, + "Invalid page range given: \"{0:s}\"", + pszPageRange); + goto error; + } + + // Parse the first number in the range. + // If it's empty, it's the first page in the document. + int iFirstPage = 0; + { + std::string const &rstrFirstPage = vecPageRange[0]; + char const *pszFirstPage = rstrFirstPage.c_str(); + char const *pszFirstPageEnd + = pszFirstPage + rstrFirstPage.size(); + if (pszFirstPage == pszFirstPageEnd) + iFirstPage = 1; + else + { + char /* const */ *pszValueEnd = nullptr; + iFirstPage = ::strtol(pszFirstPage, &pszValueEnd, 10); + if (pszValueEnd != pszFirstPageEnd) + { + error(errCommandLine, -1, + "Page-range value \"{0:s}\" is not a valid number", + pszFirstPage); + goto error; + } + } + } + + // Parse the last number in the range. + // If it's missing, it's a one-page range. + // If it's empty, it's the last page in the document. 
+ int iLastPage = 0; + if (vecPageRange.size() < 2) + iLastPage = iFirstPage; + else + { + std::string const &rstrLastPage = vecPageRange[1]; + char const *pszLastPage = rstrLastPage.c_str(); + char const *pszLastPageEnd + = pszLastPage + rstrLastPage.size(); + if (pszLastPage == pszLastPageEnd) + iLastPage = iPageCount; + else + { + char /* const */ *pszValueEnd = nullptr; + iLastPage = ::strtol(pszLastPage, &pszValueEnd, 10); + if (pszValueEnd != pszLastPageEnd) + { + error(errCommandLine, -1, + "Page-range value \"{0:s}\" is not a valid number", + pszLastPage); + goto error; + } + } + } + + // Accumulate this range of pages (add() takes a one-past-the-end value). NOTE(review): nothing in this hunk checks that iFirstPage <= iLastPage or that both lie within 1..iPageCount -- confirm whether that is validated elsewhere. + setPageRanges.add(iFirstPage, iLastPage + 1); + } + } + info = doc->getDocInfo(); if (info.isDict()) { docTitle = getInfoString(info.getDict(), "Title"); @@ -390,8 +505,13 @@ int main(int argc, char *argv[]) { if (htmlOut->isOk()) { - doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, - gTrue, gFalse, gFalse); + for (auto const &rPageRange : setPageRanges) + { + int iFirstPage = rPageRange.first; + int iLastPage = rPageRange.second - 1; + doc->displayPages(htmlOut, iFirstPage, iLastPage, 72 * scale, 72 * scale, 0, + gTrue, gFalse, gFalse); + } htmlOut->dumpDocOutline(doc); } -- 2.14.4