add pdf media type

The new media type supports pdf and a subset of media recognized by libreoffice via unoconv. Every document added goes through: * conversion to pdf with unoconv if not already a pdf * creation of thumbnail and medium sized image, and pdfinfo generates some information (even for unoconv-produced docs - should fix this). Poppler (pdftocairo, pdfinfo) is used. http://poppler.freedesktop.org/ A working but uglified pdf.js integration exists, which is enabled by setting pdf.pdf_js=true in mediagoblin_local.ini (disabled in mediagoblin.ini). Adds one test to the test_submission test suite, and another separate test_pdf suite. The tests are only run if media_types.pdf.processing.check_prerequisites passes, so the test suite will not require any extra package. TODO: make test suite say 'skipped' in that case instead of just 'ok'. Signed-off-by: Alon Levy <alon@pobox.com>
author: Alon Levy <alon@pobox.com> 2013-03-27 12:21:10 +0200
committer: Alon Levy <alon@pobox.com> 2013-04-15 09:51:21 +0300
commit: a80ebf3b64dce807d84ab3993984c211f55b47db (patch)
tree: 2e8eaebf18414ee511c0dc476a2be8d78253e46f /mediagoblin/media_types
parent: 3cadb4a6cd1d5cfdef8712d00e4594345a15b4a7 (diff)
download: mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.tar.lz
mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.tar.xz
mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.zip
4 files changed, 380 insertions, 0 deletions
diff --git a/mediagoblin/media_types/pdf/__init__.py b/mediagoblin/media_types/pdf/__init__.py
new file mode 100644
index 00000000..a6d23c93
--- /dev/null
+++ b/mediagoblin/media_types/pdf/__init__.py
@@ -0,0 +1,29 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from mediagoblin.media_types.pdf.processing import process_pdf, \
+    sniff_handler
+
+
+# Descriptor for the PDF media type: human-readable name, the processing
+# entry point, the sniffing hook, and the template / default thumbnail paths.
+MEDIA_MANAGER = {
+    "human_readable": "PDF",
+    "processor": process_pdf, # alternately a string,
+                                # 'mediagoblin.media_types.image.processing'?
+    "sniff_handler": sniff_handler,
+    "display_template": "mediagoblin/media_displays/pdf.html",
+    "default_thumb": "images/media_thumbs/pdf.jpg",
+    # NOTE(review): only "pdf" is listed here, while sniff_handler in
+    # processing.py also accepts unoconv-convertible extensions -- confirm
+    # this is intended.
+    "accepted_extensions": [
+        "pdf"]}
diff --git a/mediagoblin/media_types/pdf/migrations.py b/mediagoblin/media_types/pdf/migrations.py
new file mode 100644
index 00000000..f54c23ea
--- /dev/null
+++ b/mediagoblin/media_types/pdf/migrations.py
@@ -0,0 +1,17 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+MIGRATIONS = {}
diff --git a/mediagoblin/media_types/pdf/models.py b/mediagoblin/media_types/pdf/models.py
new file mode 100644
index 00000000..c39262d1
--- /dev/null
+++ b/mediagoblin/media_types/pdf/models.py
@@ -0,0 +1,58 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+from mediagoblin.db.base import Base
+
+from sqlalchemy import (
+    Column, Float, Integer, String, DateTime, ForeignKey)
+from sqlalchemy.orm import relationship, backref
+
+
+BACKREF_NAME = "pdf__media_data"
+
+
+class PdfData(Base):
+    """
+    Metadata recorded for one PDF media entry.
+
+    There is at most one row per media entry: the primary key is also the
+    foreign key to core__media_entries.id, and the backref on MediaEntry is
+    declared with uselist=False.
+    """
+    __tablename__ = "pdf__mediadata"
+
+    # The primary key *and* reference to the main media_entry
+    media_entry = Column(Integer, ForeignKey('core__media_entries.id'),
+        primary_key=True)
+    # Exposes this row on MediaEntry under the attribute named BACKREF_NAME;
+    # the cascade removes the row together with its media entry.
+    get_media_entry = relationship("MediaEntry",
+        backref=backref(BACKREF_NAME, uselist=False,
+                        cascade="all, delete-orphan"))
+    # NOTE(review): both `pages` and `pdf_pages` (below) exist; pdf_info() in
+    # processing.py only fills 'pdf_pages' -- confirm whether `pages` is needed.
+    pages = Column(Integer)
+
+    # These are taken from what pdfinfo can do, perhaps others make sense too
+    pdf_author = Column(String)
+    pdf_title = Column(String)
+    # note on keywords: this is the pdf parsed string, it should be considered a cached
+    # value like the rest of these values, since they can be deduced at query time / client
+    # side too.
+    pdf_keywords = Column(String)
+    pdf_creator = Column(String)
+    pdf_producer = Column(String)
+    pdf_creation_date = Column(DateTime)
+    pdf_modified_date = Column(DateTime)
+    # The "PDF version" value split at the dot, e.g. 1.4 -> major 1, minor 4
+    pdf_version_major = Column(Integer)
+    pdf_version_minor = Column(Integer)
+    pdf_page_size_width = Column(Float) # unit: pts
+    pdf_page_size_height = Column(Float) # presumably pts as well -- confirm
+    pdf_pages = Column(Integer)
+
+
+DATA_MODEL = PdfData
+MODELS = [PdfData]
diff --git a/mediagoblin/media_types/pdf/processing.py b/mediagoblin/media_types/pdf/processing.py
new file mode 100644
index 00000000..51862c7e
--- /dev/null
+++ b/mediagoblin/media_types/pdf/processing.py
@@ -0,0 +1,276 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+import chardet
+import os
+import Image
+import logging
+import dateutil.parser
+from subprocess import STDOUT, check_output, call, CalledProcessError
+
+from mediagoblin import mg_globals as mgg
+from mediagoblin.processing import (create_pub_filepath,
+                                    FilenameBuilder, BadMediaFail)
+from mediagoblin.tools.translate import fake_ugettext_passthrough as _
+
+_log = logging.getLogger(__name__)
+
+# TODO - cache (memoize) util
+
+# This list was created from the output of `unoconv --show`, removing by hand
+# the types that we already support better via other media types (those
+# entries are left commented out below).
+unoconv_supported = [
+  'bib', #      - BibTeX [.bib]
+  #bmp      - Windows Bitmap [.bmp]
+  'csv', #      - Text CSV [.csv]
+  'dbf', #      - dBASE [.dbf]
+  'dif', #      - Data Interchange Format [.dif]
+  'doc6', #     - Microsoft Word 6.0 [.doc]
+  'doc95', #    - Microsoft Word 95 [.doc]
+  'docbook', #  - DocBook [.xml]
+  'doc', #      - Microsoft Word 97/2000/XP [.doc]
+  'docx7', #    - Microsoft Office Open XML [.docx]
+  'docx', #     - Microsoft Office Open XML [.docx]
+  #emf      - Enhanced Metafile [.emf]
+  'eps', #      - Encapsulated PostScript [.eps]
+  'fodp', #     - OpenDocument Presentation (Flat XML) [.fodp]
+  'fods', #     - OpenDocument Spreadsheet (Flat XML) [.fods]
+  'fodt', #     - OpenDocument Text (Flat XML) [.fodt]
+  #gif      - Graphics Interchange Format [.gif]
+  'html', #     - HTML Document (OpenOffice.org Writer) [.html]
+  #jpg      - Joint Photographic Experts Group [.jpg]
+  'latex', #    - LaTeX 2e [.ltx]
+  'mediawiki', # - MediaWiki [.txt]
+  'met', #      - OS/2 Metafile [.met]
+  'odd', #      - OpenDocument Drawing [.odd]
+  'odg', #      - ODF Drawing (Impress) [.odg]
+  'odp', #      - ODF Presentation [.odp]
+  'ods', #      - ODF Spreadsheet [.ods]
+  'odt', #      - ODF Text Document [.odt]
+  'ooxml', #    - Microsoft Office Open XML [.xml]
+  'otg', #      - OpenDocument Drawing Template [.otg]
+  'otp', #      - ODF Presentation Template [.otp]
+  'ots', #      - ODF Spreadsheet Template [.ots]
+  'ott', #      - Open Document Text [.ott]
+  #pbm      - Portable Bitmap [.pbm]
+  #pct      - Mac Pict [.pct]
+  'pdb', #      - AportisDoc (Palm) [.pdb]
+  #pdf      - Portable Document Format [.pdf]
+  #pgm      - Portable Graymap [.pgm]
+  #png      - Portable Network Graphic [.png]
+  'pot', #      - Microsoft PowerPoint 97/2000/XP Template [.pot]
+  'potm', #     - Microsoft PowerPoint 2007/2010 XML Template [.potm]
+  #ppm      - Portable Pixelmap [.ppm]
+  'pps', #      - Microsoft PowerPoint 97/2000/XP (Autoplay) [.pps]
+  'ppt', #      - Microsoft PowerPoint 97/2000/XP [.ppt]
+  'pptx', #     - Microsoft PowerPoint 2007/2010 XML [.pptx]
+  'psw', #      - Pocket Word [.psw]
+  'pwp', #      - PlaceWare [.pwp]
+  'pxl', #      - Pocket Excel [.pxl]
+  #ras      - Sun Raster Image [.ras]
+  'rtf', #      - Rich Text Format [.rtf]
+  'sda', #      - StarDraw 5.0 (OpenOffice.org Impress) [.sda]
+  'sdc3', #     - StarCalc 3.0 [.sdc]
+  'sdc4', #     - StarCalc 4.0 [.sdc]
+  'sdc', #      - StarCalc 5.0 [.sdc]
+  'sdd3', #     - StarDraw 3.0 (OpenOffice.org Impress) [.sdd]
+  'sdd4', #     - StarImpress 4.0 [.sdd]
+  'sdd', #      - StarImpress 5.0 [.sdd]
+  'sdw3', #     - StarWriter 3.0 [.sdw]
+  'sdw4', #     - StarWriter 4.0 [.sdw]
+  'sdw', #      - StarWriter 5.0 [.sdw]
+  'slk', #      - SYLK [.slk]
+  'stc', #      - OpenOffice.org 1.0 Spreadsheet Template [.stc]
+  'std', #      - OpenOffice.org 1.0 Drawing Template [.std]
+  'sti', #      - OpenOffice.org 1.0 Presentation Template [.sti]
+  'stw', #      - Open Office.org 1.0 Text Document Template [.stw]
+  #svg      - Scalable Vector Graphics [.svg]
+  'svm', #      - StarView Metafile [.svm]
+  'swf', #      - Macromedia Flash (SWF) [.swf]
+  'sxc', #      - OpenOffice.org 1.0 Spreadsheet [.sxc]
+  'sxd3', #     - StarDraw 3.0 [.sxd]
+  'sxd5', #     - StarDraw 5.0 [.sxd]
+  'sxd', #      - OpenOffice.org 1.0 Drawing (OpenOffice.org Impress) [.sxd]
+  'sxi', #      - OpenOffice.org 1.0 Presentation [.sxi]
+  'sxw', #      - Open Office.org 1.0 Text Document [.sxw]
+  #text     - Text Encoded [.txt]
+  #tiff     - Tagged Image File Format [.tiff]
+  #txt      - Text [.txt]
+  'uop', #      - Unified Office Format presentation [.uop]
+  'uos', #      - Unified Office Format spreadsheet [.uos]
+  'uot', #      - Unified Office Format text [.uot]
+  'vor3', #     - StarDraw 3.0 Template (OpenOffice.org Impress) [.vor]
+  'vor4', #     - StarWriter 4.0 Template [.vor]
+  'vor5', #     - StarDraw 5.0 Template (OpenOffice.org Impress) [.vor]
+  'vor', #      - StarCalc 5.0 Template [.vor]
+  #wmf      - Windows Metafile [.wmf]
+  'xhtml', #    - XHTML Document [.html]
+  'xls5', #     - Microsoft Excel 5.0 [.xls]
+  'xls95', #    - Microsoft Excel 95 [.xls]
+  'xls', #      - Microsoft Excel 97/2000/XP [.xls]
+  'xlt5', #     - Microsoft Excel 5.0 Template [.xlt]
+  'xlt95', #    - Microsoft Excel 95 Template [.xlt]
+  'xlt', #      - Microsoft Excel 97/2000/XP Template [.xlt]
+  #xpm      - X PixMap [.xpm]
+]
+
+def is_unoconv_working():
+    """
+    Report whether the unoconv executable can be found and run.
+
+    Runs ``unoconv --show`` and treats any of the following as "not
+    working": the executable is not on PATH, it cannot be launched, it exits
+    with a non-zero status, or its combined stdout/stderr contains 'ERROR'.
+
+    Returns True if unoconv looks usable, False otherwise.
+    """
+    unoconv = where('unoconv')
+    if unoconv is None:
+        # Without this guard check_output() would be handed None as argv[0]
+        # and fail with an exception that is not handled below.
+        return False
+    try:
+        output = check_output([unoconv, '--show'], stderr=STDOUT)
+    except (CalledProcessError, OSError):
+        # OSError covers a file that exists but cannot be executed.
+        _log.warn(_('unoconv failing to run, check log file'))
+        return False
+    return 'ERROR' not in output
+
+def supported_extensions(cache=[None]):
+    """
+    Return the list of file extensions this media type accepts.
+
+    The list is built on the first call and memoised in the mutable default
+    argument ``cache`` (deliberately shared between calls). It always starts
+    with 'pdf'; the unoconv formats are appended only when unoconv is found
+    on PATH and reported as working.
+    """
+    if cache[0] is not None:
+        # Already computed by an earlier call.
+        return cache
+
+    cache[0] = 'pdf'
+    # TODO: must have libreoffice-headless installed too, need to check for it
+    unoconv_usable = where('unoconv') and is_unoconv_working()
+    if unoconv_usable:
+        cache.extend(unoconv_supported)
+    return cache
+
+def where(name):
+    """
+    Locate the executable ``name`` on the PATH environment variable.
+
+    Returns the full path of the first PATH entry that contains a regular,
+    executable file called ``name``, or None if there is no such entry or
+    PATH is not set.
+    """
+    # .get() so that an unset PATH means "not found" instead of a KeyError.
+    for directory in os.environ.get('PATH', '').split(os.pathsep):
+        candidate = os.path.join(directory, name)
+        # A directory or a non-executable file with the right name is not a
+        # usable program, so a bare existence check is not enough.
+        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
+            return candidate
+    return None
+
+def check_prerequisites():
+    """
+    Check that the Poppler tools used by this media type are on PATH.
+
+    Returns True when both pdfinfo and pdftocairo are found; otherwise logs
+    a warning naming the first missing tool and returns False.
+    """
+    for tool in ('pdfinfo', 'pdftocairo'):
+        if not where(tool):
+            # Report the real tool name (the message used to say 'pdfcairo').
+            _log.warn('missing {0}'.format(tool))
+            return False
+    return True
+
+def sniff_handler(media_file, **kw):
+    """
+    Decide whether this media type can handle an upload.
+
+    ``media_file`` is not inspected; the decision is based only on the
+    extension of ``kw['media'].filename``. Returns True when the
+    prerequisites are installed and the lower-cased extension is one of
+    supported_extensions(), False otherwise (including when no 'media'
+    keyword is given).
+    """
+    if not check_prerequisites():
+        return False
+
+    media = kw.get('media')
+    if media is None:
+        return False
+
+    extension = os.path.splitext(media.filename)[1]
+    # Drop the leading dot and compare case-insensitively.
+    return extension[1:].lower() in supported_extensions()
+
+def create_pdf_thumb(original, thumb_filename, width, height):
+    """
+    Render a PNG image of the pdf ``original`` with pdftocairo.
+
+    ``thumb_filename`` is the desired output path; its last four characters
+    are dropped before it is handed to pdftocairo, which adds '.png' itself.
+    The image is scaled to the smaller of ``width`` and ``height``. The exit
+    status of pdftocairo is not checked here.
+    """
+    # Note: pdftocairo adds '.png', remove it
+    output_root = thumb_filename[:-4]
+    pdftocairo = where('pdftocairo')
+    scale = str(min(width, height))
+    command = [pdftocairo, '-scale-to', scale,
+               '-singlefile', '-png', original, output_root]
+    _log.debug('calling {0}'.format(repr(' '.join(command))))
+    call(command, executable=pdftocairo)
+
+def pdf_info(original):
+    """
+    Extract dictionary of pdf information by running pdfinfo on ``original``.
+    This could use a library instead of a process.
+
+    The returned keys are named after the PdfData columns: pdf_pages,
+    pdf_page_size_width/height, pdf_version_major/minor, the string fields
+    (None when pdfinfo does not report them) and, when present and parseable,
+    pdf_modified_date and pdf_creation_date.
+
+    Raises BadMediaFail if pdfinfo exits with an error.
+
+    Note: I'm assuming pdfinfo output is sanitized (integers where integers are
+    expected, etc.) - if this is wrong then an exception will be raised and caught
+    leading to the dreaded error page. It seems a safe assumption. Dates are
+    the exception: an unparseable date is logged and skipped.
+    """
+    ret_dict = {}
+    pdfinfo = where('pdfinfo')
+    try:
+        lines = check_output(executable=pdfinfo,
+                                args=[pdfinfo, original]).split(os.linesep)
+    except CalledProcessError:
+        _log.debug('pdfinfo could not read the pdf file.')
+        raise BadMediaFail()
+
+    # Lines look like "Key:   value"; split on the first colon only because
+    # values (dates, for instance) may contain colons themselves.
+    info_dict = dict([[part.strip() for part in l.strip().split(':', 1)]
+                      for l in lines if ':' in l])
+
+    # The loop used to iterate over the (db_key, pdfinfo_key) tuples without
+    # unpacking them, so the membership test never matched and no date was
+    # ever returned. The db key also has to be the model's column name.
+    for db_key, date_key in [('pdf_modified_date', 'ModDate'),
+                             ('pdf_creation_date', 'CreationDate')]:
+        if date_key not in info_dict:
+            continue
+        try:
+            ret_dict[db_key] = dateutil.parser.parse(info_dict[date_key])
+        except (ValueError, OverflowError):
+            # Dates are optional metadata; do not fail the upload over them.
+            _log.debug('could not parse pdfinfo field {0}'.format(date_key))
+    for db_key, int_key in [('pdf_pages', 'Pages')]:
+        if int_key in info_dict:
+            ret_dict[db_key] = int(info_dict[int_key])
+
+    # parse 'PageSize' field: 595 x 842 pts (A4)
+    page_size_parts = info_dict['Page size'].split()
+    ret_dict['pdf_page_size_width'] = float(page_size_parts[0])
+    ret_dict['pdf_page_size_height'] = float(page_size_parts[2])
+
+    for db_key, str_key in [('pdf_keywords', 'Keywords'),
+        ('pdf_creator', 'Creator'), ('pdf_producer', 'Producer'),
+        ('pdf_author', 'Author'), ('pdf_title', 'Title')]:
+        ret_dict[db_key] = info_dict.get(str_key, None)
+    ret_dict['pdf_version_major'], ret_dict['pdf_version_minor'] = \
+        map(int, info_dict['PDF version'].split('.'))
+
+    return ret_dict
+
+def process_pdf(proc_state):
+    """Code to process a pdf file. Will be run by celery.
+
+    A Workbench() represents a local temporary dir. It is automatically
+    cleaned up when this function exits.
+
+    Steps: store the original upload, convert it to pdf with unoconv when it
+    is not already a pdf (and store that pdf publicly), collect pdfinfo
+    metadata, render the 'thumb' and 'medium' PNG images, delete the queued
+    file and save the metadata on the entry.
+
+    Raises BadMediaFail when the conversion to pdf, the pdfinfo call, or the
+    rendering of an image fails.
+    """
+    entry = proc_state.entry
+    workbench = proc_state.workbench
+
+    queued_filename = proc_state.get_queued_filename()
+    name_builder = FilenameBuilder(queued_filename)
+
+    media_files_dict = entry.setdefault('media_files', {})
+
+    # Copy our queued local workbench to its final destination
+    original_dest = name_builder.fill('{basename}{ext}')
+    proc_state.copy_original(original_dest)
+
+    # Create a pdf if this is a different doc, store pdf for viewer
+    ext = queued_filename.rsplit('.', 1)[-1].lower()
+    if ext == 'pdf':
+        pdf_filename = queued_filename
+    else:
+        pdf_filename = queued_filename.rsplit('.', 1)[0] + '.pdf'
+        unoconv = where('unoconv')
+        call(executable=unoconv,
+             args=[unoconv, '-v', '-f', 'pdf', queued_filename])
+        # unoconv's exit status is not used; the output file is the evidence.
+        if not os.path.exists(pdf_filename):
+            _log.debug('unoconv failed to convert file to pdf')
+            raise BadMediaFail()
+        proc_state.store_public(keyname=u'pdf', local_file=pdf_filename)
+
+    pdf_info_dict = pdf_info(pdf_filename)
+
+    for name, width, height in [
+        (u'thumb', mgg.global_config['media:thumb']['max_width'],
+                   mgg.global_config['media:thumb']['max_height']),
+        (u'medium', mgg.global_config['media:medium']['max_width'],
+                   mgg.global_config['media:medium']['max_height']),
+        ]:
+        filename = name_builder.fill('{basename}.%s.png' % name)
+        path = workbench.joinpath(filename)
+        create_pdf_thumb(pdf_filename, path, width, height)
+        # An explicit check instead of an assert: asserts vanish under -O and
+        # a missing image is a media failure like the unoconv case above.
+        if not os.path.exists(path):
+            _log.debug('pdftocairo failed to create the {0} image'.format(
+                name))
+            raise BadMediaFail()
+        proc_state.store_public(keyname=name, local_file=path)
+
+    proc_state.delete_queue_file()
+
+    entry.media_data_init(**pdf_info_dict)
+    entry.save()
author	Alon Levy <alon@pobox.com>	2013-03-27 12:21:10 +0200
committer	Alon Levy <alon@pobox.com>	2013-04-15 09:51:21 +0300
commit	a80ebf3b64dce807d84ab3993984c211f55b47db (patch)
tree	2e8eaebf18414ee511c0dc476a2be8d78253e46f /mediagoblin/media_types
parent	3cadb4a6cd1d5cfdef8712d00e4594345a15b4a7 (diff)
download	mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.tar.lz mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.tar.xz mediagoblin-a80ebf3b64dce807d84ab3993984c211f55b47db.zip