1 files changed, 222 insertions, 0 deletions
diff --git a/mediagoblin/tools/metadata.py b/mediagoblin/tools/metadata.py
new file mode 100644
index 00000000..bfefcac9
--- /dev/null
+++ b/mediagoblin/tools/metadata.py
@@ -0,0 +1,222 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import copy
+import json
+import re
+from pkg_resources import resource_filename
+
+import dateutil.parser
+from pyld import jsonld
+from jsonschema import validate, FormatChecker, draft4_format_checker
+from jsonschema.compat import str_types
+
+from mediagoblin.tools.pluginapi import hook_handle
+
+
+
+########################################################
+## Set up the MediaGoblin format checker for json-schema
+########################################################
+
+URL_REGEX = re.compile(
+    r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
+    re.IGNORECASE)
+
+def is_uri(instance):
+    """
+    jsonschema uri validator
+    """
+    if not isinstance(instance, str_types):
+        return True
+
+    return URL_REGEX.match(instance)
+
+def is_datetime(instance):
+    """
+    Is a date or datetime readable string.
+    """
+    if not isinstance(instance, str_types):
+        return True
+
+    return dateutil.parser.parse(instance)
+
+
+class DefaultChecker(FormatChecker):
+    """
+    Default MediaGoblin format checker... extended to include a few extra things
+    """
+    checkers = copy.deepcopy(draft4_format_checker.checkers)
+
+
+DefaultChecker.checkers[u"uri"] = (is_uri, ())
+DefaultChecker.checkers[u"date-time"] = (is_datetime, (ValueError, TypeError))
+DEFAULT_CHECKER = DefaultChecker()
+
+# Crappy default schema, checks for things we deem important
+
+DEFAULT_SCHEMA = {
+    "$schema": "http://json-schema.org/schema#",
+
+    "type": "object",
+    "properties": {
+        "license": {
+            "format": "uri",
+            "type": "string",
+        },
+        "dcterms:created": {
+            "format": "date-time",
+            "type": "string",
+        },
+        "dc:created": {
+            "format": "date-time",
+            "type": "string",
+        }
+    },
+}
+
+
+def load_resource(package, resource_path):
+    """
+    Load a resource, return it as a string.
+
+    Args:
+    - package: package or module name.  Eg "mediagoblin.media_types.audio"
+    - resource_path: path to get to this resource, a list of
+      directories and finally a filename.  Will be joined with
+      os.path.sep.
+    """
+    filename = resource_filename(package, os.path.sep.join(resource_path))
+    return file(filename).read()
+
+def load_resource_json(package, resource_path):
+    """
+    Load a resource json file, return a dictionary.
+
+    Args:
+    - package: package or module name.  Eg "mediagoblin.media_types.audio"
+    - resource_path: path to get to this resource, a list of
+      directories and finally a filename.  Will be joined with
+      os.path.sep.
+    """
+    return json.loads(load_resource(package, resource_path))
+
+
+##################################
+## Load the MediaGoblin core files
+##################################
+
+
+BUILTIN_CONTEXTS = {
+    "http://www.w3.org/2013/json-ld-context/rdfa11": load_resource(
+        "mediagoblin", ["static", "metadata", "rdfa11.jsonld"])}
+
+
+_CONTEXT_CACHE = {}
+
+def load_context(url):
+    """
+    A self-aware document loader.  For those contexts MediaGoblin
+    stores internally, load them from disk.
+    """
+    if url in _CONTEXT_CACHE:
+        return _CONTEXT_CACHE[url]
+
+    # See if it's one of our basic ones
+    document = BUILTIN_CONTEXTS.get(url, None)
+
+    # No?  See if we have an internal schema for this
+    if document is None:
+        document = hook_handle(("context_url_data", url))
+
+    # Okay, if we've gotten a document by now... let's package it up
+    if document is not None:
+        document = {'contextUrl': None,
+                    'documentUrl': url,
+                    'document': document}
+
+    # Otherwise, use jsonld.load_document
+    else:
+        document = jsonld.load_document(url)
+
+    # cache
+    _CONTEXT_CACHE[url] = document
+    return document
+
+
+DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"
+
+def compact_json(metadata, context=DEFAULT_CONTEXT):
+    """
+    Compact json with supplied context.
+
+    Note: Free floating" nodes are removed (eg a key just named
+    "bazzzzzz" which isn't specified in the context... something like
+    bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
+    """
+    compacted = jsonld.compact(
+        metadata, context,
+        options={
+            "documentLoader": load_context,
+            # This allows for things like "license" and etc to be preserved
+            "expandContext": context,
+            "keepFreeFloatingNodes": False})
+
+    return compacted
+
+
+def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
+                         schema=DEFAULT_SCHEMA):
+    """
+    compact json with supplied context, check against schema for errors
+
+    raises an exception (jsonschema.exceptions.ValidationError) if
+    there's an error.
+
+    Note: Free floating" nodes are removed (eg a key just named
+    "bazzzzzz" which isn't specified in the context... something like
+    bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
+
+    You may wish to do this validation yourself... this is just for convenience.
+    """
+    compacted = compact_json(metadata, context)
+    validate(metadata, schema, format_checker=DEFAULT_CHECKER)
+
+    return compacted
+
+
+def expand_json(metadata, context=DEFAULT_CONTEXT):
+    """
+    Expand json, but be sure to use our documentLoader.
+
+    By default this expands with DEFAULT_CONTEXT, but if you do not need this,
+    you can safely set this to None.
+
+    # @@: Is the above a good idea?  Maybe it should be set to None by
+    #   default.
+    """
+    options = {
+        "documentLoader": load_context}
+    if context is not None:
+        options["expandContext"] = context
+    return jsonld.expand(metadata, options=options)
+
+
+def rdfa_to_readable(rdfa_predicate):
+    readable = rdfa_predicate.split(u":")[1].capitalize()
+    return readable