diff options
author | Alon Levy <alon@pobox.com> | 2013-03-27 12:21:10 +0200 |
---|---|---|
committer | Alon Levy <alon@pobox.com> | 2013-04-15 09:51:21 +0300 |
commit | a80ebf3b64dce807d84ab3993984c211f55b47db (patch) | |
tree | 2e8eaebf18414ee511c0dc476a2be8d78253e46f /mediagoblin/media_types | |
parent | 3cadb4a6cd1d5cfdef8712d00e4594345a15b4a7 (diff) | |
download | mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.tar.lz mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.tar.xz mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.zip |
add pdf media type
The new media type supports pdf and a subset of media recognized by libreoffice via
unoconv.
Every document added goes through:
* conversion to pdf with unoconv if not already a pdf
* creation of thumbnail and medium sized image, and pdfinfo generates
some information (even for unoconv produces docs - should fix this)
Poppler (pdftocairo, pdfinfo) is used. http://poppler.freedesktop.org/
A working but uglified pdf.js integration exists, which is enabled by
setting pdf.pdf_js=true mediagoblin_local.ini (disabled in mediagoblin.ini)
Adds one test to the test_submission test suite, and another separate test_pdf suite.
The tests are only run if media_types.pdf.processing.check_prerequisites passes, so
the test suite will not require any extra package.
TODO: make test suite say 'skipped' in that case instead of just 'ok'
Signed-off-by: Alon Levy <alon@pobox.com>
Diffstat (limited to 'mediagoblin/media_types')
-rw-r--r-- | mediagoblin/media_types/pdf/__init__.py | 29 | ||||
-rw-r--r-- | mediagoblin/media_types/pdf/migrations.py | 17 | ||||
-rw-r--r-- | mediagoblin/media_types/pdf/models.py | 58 | ||||
-rw-r--r-- | mediagoblin/media_types/pdf/processing.py | 276 |
4 files changed, 380 insertions, 0 deletions
diff --git a/mediagoblin/media_types/pdf/__init__.py b/mediagoblin/media_types/pdf/__init__.py new file mode 100644 index 00000000..a6d23c93 --- /dev/null +++ b/mediagoblin/media_types/pdf/__init__.py @@ -0,0 +1,29 @@ +# GNU MediaGoblin -- federated, autonomous media hosting +# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from mediagoblin.media_types.pdf.processing import process_pdf, \ + sniff_handler + + +MEDIA_MANAGER = { + "human_readable": "PDF", + "processor": process_pdf, # alternately a string, + # 'mediagoblin.media_types.image.processing'? + "sniff_handler": sniff_handler, + "display_template": "mediagoblin/media_displays/pdf.html", + "default_thumb": "images/media_thumbs/pdf.jpg", + "accepted_extensions": [ + "pdf"]} diff --git a/mediagoblin/media_types/pdf/migrations.py b/mediagoblin/media_types/pdf/migrations.py new file mode 100644 index 00000000..f54c23ea --- /dev/null +++ b/mediagoblin/media_types/pdf/migrations.py @@ -0,0 +1,17 @@ +# GNU MediaGoblin -- federated, autonomous media hosting +# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +MIGRATIONS = {} diff --git a/mediagoblin/media_types/pdf/models.py b/mediagoblin/media_types/pdf/models.py new file mode 100644 index 00000000..c39262d1 --- /dev/null +++ b/mediagoblin/media_types/pdf/models.py @@ -0,0 +1,58 @@ +# GNU MediaGoblin -- federated, autonomous media hosting +# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +from mediagoblin.db.base import Base + +from sqlalchemy import ( + Column, Float, Integer, String, DateTime, ForeignKey) +from sqlalchemy.orm import relationship, backref + + +BACKREF_NAME = "pdf__media_data" + + +class PdfData(Base): + __tablename__ = "pdf__mediadata" + + # The primary key *and* reference to the main media_entry + media_entry = Column(Integer, ForeignKey('core__media_entries.id'), + primary_key=True) + get_media_entry = relationship("MediaEntry", + backref=backref(BACKREF_NAME, uselist=False, + cascade="all, delete-orphan")) + pages = Column(Integer) + + # These are taken from what pdfinfo can do, perhaps others make sense too + pdf_author = Column(String) + pdf_title = Column(String) + # note on keywords: this is the pdf parsed string, it should be considered a cached + # value like the rest of these values, since they can be deduced at query time / client + # side too. + pdf_keywords = Column(String) + pdf_creator = Column(String) + pdf_producer = Column(String) + pdf_creation_date = Column(DateTime) + pdf_modified_date = Column(DateTime) + pdf_version_major = Column(Integer) + pdf_version_minor = Column(Integer) + pdf_page_size_width = Column(Float) # unit: pts + pdf_page_size_height = Column(Float) + pdf_pages = Column(Integer) + + +DATA_MODEL = PdfData +MODELS = [PdfData] diff --git a/mediagoblin/media_types/pdf/processing.py b/mediagoblin/media_types/pdf/processing.py new file mode 100644 index 00000000..51862c7e --- /dev/null +++ b/mediagoblin/media_types/pdf/processing.py @@ -0,0 +1,276 @@ +# GNU MediaGoblin -- federated, autonomous media hosting +# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +import chardet +import os +import Image +import logging +import dateutil.parser +from subprocess import STDOUT, check_output, call, CalledProcessError + +from mediagoblin import mg_globals as mgg +from mediagoblin.processing import (create_pub_filepath, + FilenameBuilder, BadMediaFail) +from mediagoblin.tools.translate import fake_ugettext_passthrough as _ + +_log = logging.getLogger(__name__) + +# TODO - cache (memoize) util + +# This is a list created via uniconv --show and hand removing some types that +# we already support via other media types better. +unoconv_supported = [ + 'bib', # - BibTeX [.bib] + #bmp - Windows Bitmap [.bmp] + 'csv', # - Text CSV [.csv] + 'dbf', # - dBASE [.dbf] + 'dif', # - Data Interchange Format [.dif] + 'doc6', # - Microsoft Word 6.0 [.doc] + 'doc95', # - Microsoft Word 95 [.doc] + 'docbook', # - DocBook [.xml] + 'doc', # - Microsoft Word 97/2000/XP [.doc] + 'docx7', # - Microsoft Office Open XML [.docx] + 'docx', # - Microsoft Office Open XML [.docx] + #emf - Enhanced Metafile [.emf] + 'eps', # - Encapsulated PostScript [.eps] + 'fodp', # - OpenDocument Presentation (Flat XML) [.fodp] + 'fods', # - OpenDocument Spreadsheet (Flat XML) [.fods] + 'fodt', # - OpenDocument Text (Flat XML) [.fodt] + #gif - Graphics Interchange Format [.gif] + 'html', # - HTML Document (OpenOffice.org Writer) [.html] + #jpg - Joint Photographic Experts Group [.jpg] + 'latex', # - LaTeX 2e [.ltx] + 'mediawiki', # - MediaWiki [.txt] + 'met', # - OS/2 Metafile [.met] + 'odd', # - OpenDocument Drawing [.odd] + 'odg', # - ODF Drawing (Impress) [.odg] + 'odp', # - ODF Presentation [.odp] + 'ods', # - ODF Spreadsheet [.ods] + 'odt', # - ODF Text Document [.odt] + 'ooxml', # - Microsoft Office Open XML [.xml] + 'otg', # - OpenDocument Drawing Template [.otg] + 'otp', # - ODF Presentation Template [.otp] + 'ots', # - ODF Spreadsheet Template [.ots] + 'ott', # - Open Document Text [.ott] + #pbm - Portable Bitmap [.pbm] + #pct - Mac Pict [.pct] + 'pdb', # - AportisDoc (Palm) [.pdb] + #pdf - Portable Document Format [.pdf] + #pgm - Portable Graymap [.pgm] + #png - Portable Network Graphic [.png] + 'pot', # - Microsoft PowerPoint 97/2000/XP Template [.pot] + 'potm', # - Microsoft PowerPoint 2007/2010 XML Template [.potm] + #ppm - Portable Pixelmap [.ppm] + 'pps', # - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps] + 'ppt', # - Microsoft PowerPoint 97/2000/XP [.ppt] + 'pptx', # - Microsoft PowerPoint 2007/2010 XML [.pptx] + 'psw', # - Pocket Word [.psw] + 'pwp', # - PlaceWare [.pwp] + 'pxl', # - Pocket Excel [.pxl] + #ras - Sun Raster Image [.ras] + 'rtf', # - Rich Text Format [.rtf] + 'sda', # - StarDraw 5.0 (OpenOffice.org Impress) [.sda] + 'sdc3', # - StarCalc 3.0 [.sdc] + 'sdc4', # - StarCalc 4.0 [.sdc] + 'sdc', # - StarCalc 5.0 [.sdc] + 'sdd3', # - StarDraw 3.0 (OpenOffice.org Impress) [.sdd] + 'sdd4', # - StarImpress 4.0 [.sdd] + 'sdd', # - StarImpress 5.0 [.sdd] + 'sdw3', # - StarWriter 3.0 [.sdw] + 'sdw4', # - StarWriter 4.0 [.sdw] + 'sdw', # - StarWriter 5.0 [.sdw] + 'slk', # - SYLK [.slk] + 'stc', # - OpenOffice.org 1.0 Spreadsheet Template [.stc] + 'std', # - OpenOffice.org 1.0 Drawing Template [.std] + 'sti', # - OpenOffice.org 1.0 Presentation Template [.sti] + 'stw', # - Open Office.org 1.0 Text Document Template [.stw] + #svg - Scalable Vector Graphics [.svg] + 'svm', # - StarView Metafile [.svm] + 'swf', # - Macromedia Flash (SWF) [.swf] + 'sxc', # - OpenOffice.org 1.0 Spreadsheet [.sxc] + 'sxd3', # - StarDraw 3.0 [.sxd] + 'sxd5', # - StarDraw 5.0 [.sxd] + 'sxd', # - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd] + 'sxi', # - OpenOffice.org 1.0 Presentation [.sxi] + 'sxw', # - Open Office.org 1.0 Text Document [.sxw] + #text - Text Encoded [.txt] + #tiff - Tagged Image File Format [.tiff] + #txt - Text [.txt] + 'uop', # - Unified Office Format presentation [.uop] + 'uos', # - Unified Office Format spreadsheet [.uos] + 'uot', # - Unified Office Format text [.uot] + 'vor3', # - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor] + 'vor4', # - StarWriter 4.0 Template [.vor] + 'vor5', # - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor] + 'vor', # - StarCalc 5.0 Template [.vor] + #wmf - Windows Metafile [.wmf] + 'xhtml', # - XHTML Document [.html] + 'xls5', # - Microsoft Excel 5.0 [.xls] + 'xls95', # - Microsoft Excel 95 [.xls] + 'xls', # - Microsoft Excel 97/2000/XP [.xls] + 'xlt5', # - Microsoft Excel 5.0 Template [.xlt] + 'xlt95', # - Microsoft Excel 95 Template [.xlt] + 'xlt', # - Microsoft Excel 97/2000/XP Template [.xlt] + #xpm - X PixMap [.xpm] +] + +def is_unoconv_working(): + try: + output = check_output([where('unoconv'), '--show'], stderr=STDOUT) + except CalledProcessError, e: + _log.warn(_('unoconv failing to run, check log file')) + return False + if 'ERROR' in output: + return False + return True + +def supported_extensions(cache=[None]): + if cache[0] == None: + cache[0] = 'pdf' + # TODO: must have libreoffice-headless installed too, need to check for it + if where('unoconv') and is_unoconv_working(): + cache.extend(unoconv_supported) + return cache + +def where(name): + for p in os.environ['PATH'].split(os.pathsep): + fullpath = os.path.join(p, name) + if os.path.exists(fullpath): + return fullpath + return None + +def check_prerequisites(): + if not where('pdfinfo'): + _log.warn('missing pdfinfo') + return False + if not where('pdftocairo'): + _log.warn('missing pdfcairo') + return False + return True + +def sniff_handler(media_file, **kw): + if not check_prerequisites(): + return False + if kw.get('media') is not None: + name, ext = os.path.splitext(kw['media'].filename) + clean_ext = ext[1:].lower() + + if clean_ext in supported_extensions(): + return True + + return False + +def create_pdf_thumb(original, thumb_filename, width, height): + # Note: pdftocairo adds '.png', remove it + thumb_filename = thumb_filename[:-4] + executable = where('pdftocairo') + args = [executable, '-scale-to', str(min(width, height)), + '-singlefile', '-png', original, thumb_filename] + _log.debug('calling {0}'.format(repr(' '.join(args)))) + call(executable=executable, args=args) + +def pdf_info(original): + """ + Extract dictionary of pdf information. This could use a library instead + of a process. + + Note: I'm assuming pdfinfo output is sanitized (integers where integers are + expected, etc.) - if this is wrong then an exception will be raised and caught + leading to the dreaded error page. It seems a safe assumption. + """ + ret_dict = {} + pdfinfo = where('pdfinfo') + try: + lines = check_output(executable=pdfinfo, + args=[pdfinfo, original]).split(os.linesep) + except CalledProcessError: + _log.debug('pdfinfo could not read the pdf file.') + raise BadMediaFail() + + info_dict = dict([[part.strip() for part in l.strip().split(':', 1)] + for l in lines if ':' in l]) + + for date_key in [('pdf_mod_date', 'ModDate'), + ('pdf_creation_date', 'CreationDate')]: + if date_key in info_dict: + ret_dict[date_key] = dateutil.parser.parse(info_dict[date_key]) + for db_key, int_key in [('pdf_pages', 'Pages')]: + if int_key in info_dict: + ret_dict[db_key] = int(info_dict[int_key]) + + # parse 'PageSize' field: 595 x 842 pts (A4) + page_size_parts = info_dict['Page size'].split() + ret_dict['pdf_page_size_width'] = float(page_size_parts[0]) + ret_dict['pdf_page_size_height'] = float(page_size_parts[2]) + + for db_key, str_key in [('pdf_keywords', 'Keywords'), + ('pdf_creator', 'Creator'), ('pdf_producer', 'Producer'), + ('pdf_author', 'Author'), ('pdf_title', 'Title')]: + ret_dict[db_key] = info_dict.get(str_key, None) + ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \ + map(int, info_dict['PDF version'].split('.')) + + return ret_dict + +def process_pdf(proc_state): + """Code to process a pdf file. Will be run by celery. + + A Workbench() represents a local tempory dir. It is automatically + cleaned up when this function exits. + """ + entry = proc_state.entry + workbench = proc_state.workbench + + queued_filename = proc_state.get_queued_filename() + name_builder = FilenameBuilder(queued_filename) + + media_files_dict = entry.setdefault('media_files', {}) + + # Copy our queued local workbench to its final destination + original_dest = name_builder.fill('{basename}{ext}') + proc_state.copy_original(original_dest) + + # Create a pdf if this is a different doc, store pdf for viewer + ext = queued_filename.rsplit('.', 1)[-1].lower() + if ext == 'pdf': + pdf_filename = queued_filename + else: + pdf_filename = queued_filename.rsplit('.', 1)[0] + '.pdf' + unoconv = where('unoconv') + call(executable=unoconv, + args=[unoconv, '-v', '-f', 'pdf', queued_filename]) + if not os.path.exists(pdf_filename): + _log.debug('unoconv failed to convert file to pdf') + raise BadMediaFail() + proc_state.store_public(keyname=u'pdf', local_file=pdf_filename) + + pdf_info_dict = pdf_info(pdf_filename) + + for name, width, height in [ + (u'thumb', mgg.global_config['media:thumb']['max_width'], + mgg.global_config['media:thumb']['max_height']), + (u'medium', mgg.global_config['media:medium']['max_width'], + mgg.global_config['media:medium']['max_height']), + ]: + filename = name_builder.fill('{basename}.%s.png' % name) + path = workbench.joinpath(filename) + create_pdf_thumb(pdf_filename, path, width, height) + assert(os.path.exists(path)) + proc_state.store_public(keyname=name, local_file=path) + + proc_state.delete_queue_file() + + entry.media_data_init(**pdf_info_dict) + entry.save() |