-rw-r--r-- | mediagoblin/gmg_commands/batchaddmedia.py | 108
-rw-r--r-- | mediagoblin/static/metadata/rdfa11.jsonld |  48
-rw-r--r-- | mediagoblin/tools/metadata.py             | 213
3 files changed, 274 insertions, 95 deletions
diff --git a/mediagoblin/gmg_commands/batchaddmedia.py b/mediagoblin/gmg_commands/batchaddmedia.py
index 43c24f6d..e540e88c 100644
--- a/mediagoblin/gmg_commands/batchaddmedia.py
+++ b/mediagoblin/gmg_commands/batchaddmedia.py
@@ -24,12 +24,11 @@ from mediagoblin.gmg_commands import util as commands_util
 from mediagoblin.submit.lib import (
     submit_media, get_upload_file_limits,
     FileUploadLimit, UserUploadLimit, UserPastUploadLimit)
-from mediagoblin.tools.translate import lazy_pass_to_ugettext as _
+from mediagoblin.tools.metadata import compact_and_validate
 
-from mediagoblin import mg_globals
-from jsonschema import validate
 from jsonschema.exceptions import ValidationError
+
 
 def parser_setup(subparser):
     subparser.description = """\
 This command allows the administrator to upload many media files at once."""
@@ -49,11 +48,6 @@ Core properties (http://dublincore.org/documents/dces/). Both "location.csv" and
 have provided an example of these files at <url to be added>
 """))
     subparser.add_argument(
-        "-l", "--license",
-        help=(
-            "License these media entry will be released under, if all the same. "
-            "Should be a URL."))
-    subparser.add_argument(
         '--celery',
         action='store_true',
         help="Don't process eagerly, pass off to celery")
@@ -102,14 +96,12 @@ zip files and directories"
     metadata_file_path = os.path.join(dir_path, "metadata.csv")
 
     # check for the location file, if it exists...
-    location_filename = os.path.split(location_file_path)[-1]
     abs_location_filename = os.path.abspath(location_file_path)
     if not os.path.exists(abs_location_filename):
         print "Can't find a file with filename '%s'" % location_file_path
         return
 
     # check for the metadata file, if it exists...
-    metadata_filename = os.path.split(metadata_file_path)[-1]
     abs_metadata_filename = os.path.abspath(metadata_file_path)
     if not os.path.exists(abs_metadata_filename):
         print "Can't find a file with filename '%s'" % metadata_file_path
@@ -132,24 +124,24 @@ zip files and directories"
         contents = all_metadata.read()
         media_metadata = parse_csv_file(contents)
 
-    metadata_context = { 'dcterms':'http://purl.org/dc/terms/',
-                         'xsd': 'http://www.w3.org/2001/XMLSchema#'}
-
     for media_id in media_locations.keys():
         files_attempted += 1
 
-        file_metadata = media_metadata[media_id]
-        sanitized_metadata = check_metadata_format(file_metadata)
-        if sanitized_metadata == {}: continue
+        file_metadata = media_metadata[media_id]
+        try:
+            json_ld_metadata = compact_and_validate(file_metadata)
+        except ValidationError, exc:
+            print "Error with '%s' value '%s': %s" % (
+                media_id, exc.path[0], exc.message)
+            continue
 
-        json_ld_metadata = jsonld.compact(build_json_ld_metadata(file_metadata),
-                                          metadata_context)
         original_location = media_locations[media_id]['media:original']
         url = urlparse(original_location)
 
-        title = sanitized_metadata.get('dcterms:title')
-        description = sanitized_metadata.get('dcterms:description')
-        license = sanitized_metadata.get('dcterms:rights')
+        title = json_ld_metadata.get('dcterms:title')
+        description = json_ld_metadata.get('dcterms:description')
+
+        license = json_ld_metadata.get('license')
 
         filename = url.path.split()[-1]
 
         if url.scheme == 'http':
@@ -219,77 +211,3 @@ def parse_csv_file(file_contents):
 def teardown(temp_files):
     for temp_file in temp_files:
         subprocess.call(['rm','-r',temp_file])
-
-def build_json_ld_metadata(metadata_dict):
-    output_dict = {}
-    for p in metadata_dict.keys():
-        if p in ["dcterms:rights", "dcterms:relation"]:
-            m_type = "xsd:uri"
-        elif p in ["dcterms:date", "dcterms:created"]:
-            m_type = "xsd:date"
-        else:
-            m_type = "xsd:string"
-        description = {"@value": metadata_dict[p],
-                       "@type" : m_type}
-        output_dict[p] = description
-    return output_dict
-
-def check_metadata_format(metadata_dict):
-    schema = {
-        "$schema":"http://json-schema.org/schema#",
-        "properties":{
-            "media:id":{},
-            "dcterms:contributor":{},
-            "dcterms:coverage":{},
-            "dcterms:created":{},
-            "dcterms:creator":{},
-            "dcterms:date":{},
-            "dcterms:description":{},
-            "dcterms:format":{},
-            "dcterms:identifier":{},
-            "dcterms:language":{},
-            "dcterms:publisher":{},
-            "dcterms:relation":{},
-            "dcterms:rights" : {
-                "format":"uri",
-                "type":"string"
-            },
-            "dcterms:source":{},
-            "dcterms:subject":{},
-            "dcterms:title":{},
-            "dcterms:type":{}
-        },
-        "additionalProperties": False,
-        "required":["dcterms:title","media:id"]
-}
-    try:
-        validate(metadata_dict, schema)
-        output_dict = metadata_dict
-        # "media:id" is only for internal use, so we delete it for the output
-        del output_dict['media:id']
-
-    except ValidationError, exc:
-        title = (metadata_dict.get('dcterms:title') or
-                 metadata_dict.get('media:id') or _(u'UNKNOWN FILE'))
-
-        if exc.validator == "additionalProperties":
-            message = _(u'Invalid metadata provided for file "{title}". This \
-script only accepts the Dublin Core metadata terms.'.format(title=title))
-
-        elif exc.validator == "required":
-            message = _(
-u'All necessary metadata was not provided for file "{title}", you must include \
-a "dcterms:title" column for each media file'.format(title=title))
-
-        else:
-            message = _(u'Could not find appropriate metadata for file \
-"{title}".'.format(title=title))
-
-        print _(u"""WARN: {message} \nSkipping File...\n""".format(
-            message=message))
-
-        output_dict = {}
-    except:
-        raise
-
-    return output_dict
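[Editor's note: the check_metadata_format/build_json_ld_metadata pair deleted
above is replaced by compact_and_validate from the new
mediagoblin/tools/metadata.py. A minimal sketch of the new call on one parsed
metadata.csv row -- the row values here are invented, and it assumes a
MediaGoblin checkout with pyld, jsonschema, and python-dateutil importable:

    from jsonschema.exceptions import ValidationError
    from mediagoblin.tools.metadata import compact_and_validate

    # One row of metadata.csv, as parse_csv_file would hand it over
    # (minus the internal "media:id" column).
    file_metadata = {
        "dcterms:title": "Fancy Foot Bath",
        "license": "http://creativecommons.org/licenses/by-sa/3.0/",
        "dcterms:created": "2014-03-01",
    }

    # Success: returns the metadata compacted against the bundled
    # rdfa11 context, with an "@context" key attached.
    json_ld_metadata = compact_and_validate(file_metadata)

    # Failure: a bad value raises ValidationError, which the loop above
    # reports per media_id before skipping the file.
    try:
        compact_and_validate(dict(file_metadata, license="not-a-url"))
    except ValidationError, exc:
        print "Error with value '%s': %s" % (exc.path[0], exc.message)
]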
diff --git a/mediagoblin/static/metadata/rdfa11.jsonld b/mediagoblin/static/metadata/rdfa11.jsonld
new file mode 100644
index 00000000..b2557233
--- /dev/null
+++ b/mediagoblin/static/metadata/rdfa11.jsonld
@@ -0,0 +1,48 @@
+{
+    "@context": {
+        "cat": "http://www.w3.org/ns/dcat#",
+        "qb": "http://purl.org/linked-data/cube#",
+        "grddl": "http://www.w3.org/2003/g/data-view#",
+        "ma": "http://www.w3.org/ns/ma-ont#",
+        "owl": "http://www.w3.org/2002/07/owl#",
+        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+        "rdfa": "http://www.w3.org/ns/rdfa#",
+        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+        "rif": "http://www.w3.org/2007/rif#",
+        "rr": "http://www.w3.org/ns/r2rml#",
+        "skos": "http://www.w3.org/2004/02/skos/core#",
+        "skosxl": "http://www.w3.org/2008/05/skos-xl#",
+        "wdr": "http://www.w3.org/2007/05/powder#",
+        "void": "http://rdfs.org/ns/void#",
+        "wdrs": "http://www.w3.org/2007/05/powder-s#",
+        "xhv": "http://www.w3.org/1999/xhtml/vocab#",
+        "xml": "http://www.w3.org/XML/1998/namespace",
+        "xsd": "http://www.w3.org/2001/XMLSchema#",
+        "prov": "http://www.w3.org/ns/prov#",
+        "sd": "http://www.w3.org/ns/sparql-service-description#",
+        "org": "http://www.w3.org/ns/org#",
+        "gldp": "http://www.w3.org/ns/people#",
+        "cnt": "http://www.w3.org/2008/content#",
+        "dcat": "http://www.w3.org/ns/dcat#",
+        "earl": "http://www.w3.org/ns/earl#",
+        "ht": "http://www.w3.org/2006/http#",
+        "ptr": "http://www.w3.org/2009/pointers#",
+        "cc": "http://creativecommons.org/ns#",
+        "ctag": "http://commontag.org/ns#",
+        "dc": "http://purl.org/dc/terms/",
+        "dc11": "http://purl.org/dc/elements/1.1/",
+        "dcterms": "http://purl.org/dc/terms/",
+        "foaf": "http://xmlns.com/foaf/0.1/",
+        "gr": "http://purl.org/goodrelations/v1#",
+        "ical": "http://www.w3.org/2002/12/cal/icaltzd#",
+        "og": "http://ogp.me/ns#",
+        "rev": "http://purl.org/stuff/rev#",
+        "sioc": "http://rdfs.org/sioc/ns#",
+        "v": "http://rdf.data-vocabulary.org/#",
+        "vcard": "http://www.w3.org/2006/vcard/ns#",
+        "schema": "http://schema.org/",
+        "describedby": "http://www.w3.org/2007/05/powder-s#describedby",
+        "license": "http://www.w3.org/1999/xhtml/vocab#license",
+        "role": "http://www.w3.org/1999/xhtml/vocab#role"
+    }
+}
\ No newline at end of file
diff --git a/mediagoblin/tools/metadata.py b/mediagoblin/tools/metadata.py
new file mode 100644
index 00000000..7de5a514
--- /dev/null
+++ b/mediagoblin/tools/metadata.py
@@ -0,0 +1,213 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import copy
+import json
+import re
+from pkg_resources import resource_filename
+
+import dateutil.parser
+from pyld import jsonld
+from jsonschema import validate, FormatChecker, draft4_format_checker
+from jsonschema.compat import str_types
+
+from mediagoblin.tools.pluginapi import hook_handle
+
+
+
+########################################################
+## Set up the MediaGoblin format checker for json-schema
+########################################################
+
+URL_REGEX = re.compile(
+    r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
+    re.IGNORECASE)
+
+def is_uri(instance):
+    """
+    jsonschema uri validator
+    """
+    if not isinstance(instance, str_types):
+        return True
+
+    return URL_REGEX.match(instance)
+
+def is_datetime(instance):
+    """
+    Is a date or datetime readable string.
+    """
+    if not isinstance(instance, str_types):
+        return True
+
+    return dateutil.parser.parse(instance)
+
+
+class DefaultChecker(FormatChecker):
+    """
+    Default MediaGoblin format checker... extended to include a few extra things
+    """
+    checkers = copy.deepcopy(draft4_format_checker.checkers)
+
+
+DefaultChecker.checkers[u"uri"] = (is_uri, ())
+DefaultChecker.checkers[u"date-time"] = (is_datetime, (ValueError, TypeError))
+DEFAULT_CHECKER = DefaultChecker()
+
+# Crappy default schema, checks for things we deem important
+
+DEFAULT_SCHEMA = {
+    "$schema": "http://json-schema.org/schema#",
+
+    "type": "object",
+    "properties": {
+        "license": {
+            "format": "uri",
+            "type": "string",
+        },
+        "dcterms:created": {
+            "format": "date-time",
+            "type": "string",
+        }
+    },
+}
+
+
+def load_resource(package, resource_path):
+    """
+    Load a resource, return it as a string.
+
+    Args:
+     - package: package or module name.  Eg "mediagoblin.media_types.audio"
+     - resource_path: path to get to this resource, a list of
+       directories and finally a filename.  Will be joined with
+       os.path.sep.
+    """
+    filename = resource_filename(package, os.path.sep.join(resource_path))
+    return file(filename).read()
+
+def load_resource_json(package, resource_path):
+    """
+    Load a resource json file, return a dictionary.
+
+    Args:
+     - package: package or module name.  Eg "mediagoblin.media_types.audio"
+     - resource_path: path to get to this resource, a list of
+       directories and finally a filename.  Will be joined with
+       os.path.sep.
+    """
+    return json.loads(load_resource(package, resource_path))
+
+
+##################################
+## Load the MediaGoblin core files
+##################################
+
+
+BUILTIN_CONTEXTS = {
+    "http://www.w3.org/2013/json-ld-context/rdfa11": load_resource(
+        "mediagoblin", ["static", "metadata", "rdfa11.jsonld"])}
+
+
+_CONTEXT_CACHE = {}
+
+def load_context(url):
+    """
+    A self-aware document loader.  For those contexts MediaGoblin
+    stores internally, load them from disk.
+    """
+    if url in _CONTEXT_CACHE:
+        return _CONTEXT_CACHE[url]
+
+    # See if it's one of our basic ones
+    document = BUILTIN_CONTEXTS.get(url, None)
+
+    # No?  See if we have an internal schema for this
+    if document is None:
+        document = hook_handle(("context_url_data", url))
+
+    # Okay, if we've gotten a document by now... let's package it up
+    if document is not None:
+        document = {'contextUrl': None,
+                    'documentUrl': url,
+                    'document': document}
+
+    # Otherwise, use jsonld.load_document
+    else:
+        document = jsonld.load_document(url)
+
+    # cache
+    _CONTEXT_CACHE[url] = document
+    return document
+
+
+DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"
+
+def compact_json(metadata, context=DEFAULT_CONTEXT):
+    """
+    Compact json with supplied context.
+
+    Note: "Free floating" nodes are removed (eg a key just named
+      "bazzzzzz" which isn't specified in the context)... something like
+      bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
+    """
+    compacted = jsonld.compact(
+        metadata, context,
+        options={
+            "documentLoader": load_context,
+            # This allows for things like "license" and etc to be preserved
+            "expandContext": context,
+            "keepFreeFloatingNodes": False})
+
+    return compacted
+
+
+def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
+                         schema=DEFAULT_SCHEMA):
+    """
+    compact json with supplied context, check against schema for errors
+
+    raises an exception (jsonschema.exceptions.ValidationError) if
+    there's an error.
+
+    Note: "Free floating" nodes are removed (eg a key just named
+      "bazzzzzz" which isn't specified in the context)... something like
+      bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
+
+    You may wish to do this validation yourself... this is just for convenience.
+    """
+    compacted = compact_json(metadata, context)
+    validate(metadata, schema, format_checker=DEFAULT_CHECKER)
+
+    return compacted
+
+
+def expand_json(metadata, context=DEFAULT_CONTEXT):
+    """
+    Expand json, but be sure to use our documentLoader.
+
+    By default this expands with DEFAULT_CONTEXT, but if you do not need this,
+    you can safely set this to None.
+
+    # @@: Is the above a good idea?  Maybe it should be set to None by
+    #   default.
+    """
+    options = {
+        "documentLoader": load_context}
+    if context is not None:
+        options["expandContext"] = context
+    return jsonld.expand(metadata, options=options)