I have some really large PDF files of manuscript materials with OCR. 10,000 pages worth and I don't have enough room for them. They are in the 50M to 500M range per file. And they need to be up this week for a grant deadline. So I'm looking around the google for some way to shrink the pdf files. I came across this: askubuntu
pdf2ps input.pdf output.ps
ps2pdf output.ps output.pdf
Okay, about 8× compression! Great, convert to postscript and back. The project manager asks "is the OCR still good?" I rip the text. a n d i t h as a l l t h i s w e i r d spaces in it.
Long and the short of it: I found that if I use pdftops
from
Poppler to create the postscript
file but use ps2pdf
from Ghostscript
to create the new PDF file; I still get 8× compression, and the OCR
looks the same as before[^footnote]. I guess poppler is better at writing
postscript, and that ghostscript is better at writing PDF.
This script automates the process of running poppler and then ghostscript on a PDF to get this magic.
[^footnote]: as I check more files in the batch, I'm finding that for some this process does introduce extra spaces between letters as well; however, it seems to be to a lesser degree than doing both steps with ghostscript.
from __future__ import division
import sys
import argparse
import tempfile
import subprocess
import os
import shutil
import contextlib
First argument is the PDF file you want to shrink.
Second argument is optional.
If one argument is given, the large file is replaced with a new smaller file if the compression ratio is greater than 1.2.
If two arguments are given, the second argument is the path to put the output file.
def main(argv=None):
    """Shrink a PDF by running poppler and then ghostscript on it.

    usage: pdftrick [-h] [-t TEMPDIR] before [after]

    one weird PDF trick

    positional arguments:
      before                PDF (before)
      after                 PDF (after)

    optional arguments:
      -h, --help            show this help message and exit
      -t TEMPDIR, --tempdir TEMPDIR
                            needs a lot of temp space

    Args:
        argv: ``None`` to parse ``sys.argv[1:]``, a list/tuple of argument
            strings to parse instead, or an already-parsed namespace-like
            object exposing ``before``, ``after`` and ``tempdir``.

    Raises:
        Exception: if ``pdftops`` or ``ps2pdf`` cannot be found.
    """
    parser = argparse.ArgumentParser(description="one weird PDF trick")
    parser.add_argument('before', nargs=1, help="PDF (before)",
                        type=extant_file)
    parser.add_argument('after', nargs="?", help="PDF (after)")
    parser.add_argument('-t', '--tempdir', help="needs a lot of temp space",
                        required=False)

    # A list of strings must actually be parsed; previously only the
    # ``argv is None`` case was, so ``main(['in.pdf'])`` blew up on
    # ``argv.tempdir``.  Anything else is taken to be pre-parsed, which
    # keeps the old calling convention working.
    if argv is None or isinstance(argv, (list, tuple)):
        argv = parser.parse_args(argv)

    if argv.tempdir:
        # mkdtemp() below creates the scratch directory under this location
        tempfile.tempdir = argv.tempdir

    # check that we have the tools we are wrapping
    if not which('pdftops'):  # use poppler to create a .ps
        raise Exception("need pdftops from poppler")
    if not which('ps2pdf'):  # and use ghostscript to create a .pdf
        raise Exception("need ps2pdf from ghostscript")

    with make_temp_directory(prefix='popgho') as tempdir:
        main_with_temp(tempdir, argv)
def main_with_temp(tempdir, argv):
    """Convert the PDF to postscript and back inside ``tempdir``.

    Runs ``pdftops`` on ``argv.before[0]`` and ``ps2pdf`` on the result,
    then either moves the new PDF to ``argv.after``, overwrites the
    original when the size ratio exceeds 1.2, or discards the new file.

    Args:
        tempdir: directory that receives the intermediate files.
        argv: parsed arguments; ``before`` (one-element list) and ``after``
            are read here.

    Raises:
        subprocess.CalledProcessError: if either external tool exits
            with a non-zero status.
    """
    os.environ.update({'TMPDIR': tempdir})  # for ghostscript

    source_pdf = argv.before[0]
    ps_path = os.path.join(tempdir, 'poppler.ps')
    shrunk_pdf = os.path.join(tempdir, 'ghost.pdf')

    # swallow all stderr and stdout
    with open(os.devnull, "w") as sink:
        subprocess.check_call(['pdftops', source_pdf, ps_path],
                              stdout=sink, stderr=sink)
        subprocess.check_call(['ps2pdf', ps_path, shrunk_pdf],
                              stdout=sink, stderr=sink, env=os.environ)

    ratio = os.path.getsize(source_pdf) / os.path.getsize(shrunk_pdf)

    if argv.after:
        # explicit destination: always keep the new file
        shutil.move(shrunk_pdf, argv.after)
        report = "compression: {1}; created: {0}".format(argv.after, ratio)
    elif ratio > 1.2:
        # no destination given: replace the original only if it pays off
        shutil.move(shrunk_pdf, source_pdf)
        report = "compression: {1}; overwrite: {0}".format(source_pdf, ratio)
    else:
        os.remove(shrunk_pdf)
        report = "compression: {0}; not worth it, deleted new file".format(
            ratio)
    print(report)

    os.remove(ps_path)
like the unix which
command
def which(program):
    """Locate an executable, like the unix ``which`` command.

    Args:
        program: a bare command name to look up on ``PATH``, or a path
            with a directory component, which is checked directly.

    Returns:
        The path of the first matching executable file, or ``None`` when
        there is none.
    """
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, _fname = os.path.split(program)
    if fpath:
        # A directory component was given: do not search PATH.
        if is_exe(program):
            return program
    else:
        # PATH can be unset (stripped-down service/cron environments);
        # fall back to os.defpath instead of raising KeyError.
        search_path = os.environ.get("PATH", os.defpath)
        for path in search_path.split(os.pathsep):
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file
    return None
Type
for argparse - checks that file exists but does not open.
def extant_file(x):
    """Type for argparse - checks that the file exists but does not open it.

    Args:
        x: path given on the command line.

    Returns:
        ``x`` unchanged when the path exists.

    Raises:
        argparse.ArgumentTypeError: when ``x`` does not exist.
    """
    if not os.path.exists(x):
        # ArgumentTypeError is what argparse expects from a ``type=``
        # callable; ArgumentError needs (argument, message) and raised a
        # TypeError when called with the message alone.
        raise argparse.ArgumentTypeError("{0} does not exist".format(x))
    return x
way to clean up a temporary folder
@contextlib.contextmanager
def make_temp_directory(prefix):
    """Create a temporary directory and remove it when the block exits.

    Args:
        prefix: prefix passed to ``tempfile.mkdtemp`` for the directory name.

    Yields:
        The path of the newly created directory.
    """
    temp_dir = tempfile.mkdtemp(prefix=prefix)
    try:
        yield temp_dir
    finally:
        # Clean up even when the with-body raises; otherwise a failed
        # conversion would leave its (large) scratch files behind.
        shutil.rmtree(temp_dir)
main() idiom for importing into REPL for debugging
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Copyright © 2014, Regents of the University of California All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.