-rw-r--r--  mediagoblin/gmg_commands/batchaddmedia.py  108
-rw-r--r--  mediagoblin/static/metadata/rdfa11.jsonld   48
-rw-r--r--  mediagoblin/tools/metadata.py              213
3 files changed, 274 insertions, 95 deletions
diff --git a/mediagoblin/gmg_commands/batchaddmedia.py b/mediagoblin/gmg_commands/batchaddmedia.py
index 43c24f6d..e540e88c 100644
--- a/mediagoblin/gmg_commands/batchaddmedia.py
+++ b/mediagoblin/gmg_commands/batchaddmedia.py
@@ -24,12 +24,11 @@ from mediagoblin.gmg_commands import util as commands_util
from mediagoblin.submit.lib import (
submit_media, get_upload_file_limits,
FileUploadLimit, UserUploadLimit, UserPastUploadLimit)
-from mediagoblin.tools.translate import lazy_pass_to_ugettext as _
+from mediagoblin.tools.metadata import compact_and_validate
-from mediagoblin import mg_globals
-from jsonschema import validate
from jsonschema.exceptions import ValidationError
+
def parser_setup(subparser):
subparser.description = """\
This command allows the administrator to upload many media files at once."""
@@ -49,11 +48,6 @@ Core properties (http://dublincore.org/documents/dces/). Both "location.csv" and
have provided an example of these files at <url to be added>
"""))
subparser.add_argument(
- "-l", "--license",
- help=(
- "License these media entry will be released under, if all the same. "
- "Should be a URL."))
- subparser.add_argument(
'--celery',
action='store_true',
help="Don't process eagerly, pass off to celery")
@@ -102,14 +96,12 @@ zip files and directories"
metadata_file_path = os.path.join(dir_path, "metadata.csv")
# check for the location file, if it exists...
- location_filename = os.path.split(location_file_path)[-1]
abs_location_filename = os.path.abspath(location_file_path)
if not os.path.exists(abs_location_filename):
print "Can't find a file with filename '%s'" % location_file_path
return
# check for the metadata file, if it exists...
- metadata_filename = os.path.split(metadata_file_path)[-1]
abs_metadata_filename = os.path.abspath(metadata_file_path)
if not os.path.exists(abs_metadata_filename):
print "Can't find a file with filename '%s'" % metadata_file_path
@@ -132,24 +124,24 @@ zip files and directories"
contents = all_metadata.read()
media_metadata = parse_csv_file(contents)
- metadata_context = { 'dcterms':'http://purl.org/dc/terms/',
- 'xsd': 'http://www.w3.org/2001/XMLSchema#'}
-
for media_id in media_locations.keys():
files_attempted += 1
- file_metadata = media_metadata[media_id]
- sanitized_metadata = check_metadata_format(file_metadata)
- if sanitized_metadata == {}: continue
+ file_metadata = media_metadata[media_id]
+ try:
+ json_ld_metadata = compact_and_validate(file_metadata)
+ except ValidationError, exc:
+ print "Error with '%s' value '%s': %s" % (
+ media_id, exc.path[0], exc.message)
+ continue
- json_ld_metadata = jsonld.compact(build_json_ld_metadata(file_metadata),
- metadata_context)
original_location = media_locations[media_id]['media:original']
url = urlparse(original_location)
- title = sanitized_metadata.get('dcterms:title')
- description = sanitized_metadata.get('dcterms:description')
- license = sanitized_metadata.get('dcterms:rights')
+ title = json_ld_metadata.get('dcterms:title')
+ description = json_ld_metadata.get('dcterms:description')
+
+ license = json_ld_metadata.get('license')
filename = url.path.split()[-1]
if url.scheme == 'http':
@@ -219,77 +211,3 @@ def parse_csv_file(file_contents):
def teardown(temp_files):
for temp_file in temp_files:
subprocess.call(['rm','-r',temp_file])
-
-def build_json_ld_metadata(metadata_dict):
- output_dict = {}
- for p in metadata_dict.keys():
- if p in ["dcterms:rights", "dcterms:relation"]:
- m_type = "xsd:uri"
- elif p in ["dcterms:date", "dcterms:created"]:
- m_type = "xsd:date"
- else:
- m_type = "xsd:string"
- description = {"@value": metadata_dict[p],
- "@type" : m_type}
- output_dict[p] = description
- return output_dict
-
-def check_metadata_format(metadata_dict):
- schema = {
- "$schema":"http://json-schema.org/schema#",
- "properties":{
- "media:id":{},
- "dcterms:contributor":{},
- "dcterms:coverage":{},
- "dcterms:created":{},
- "dcterms:creator":{},
- "dcterms:date":{},
- "dcterms:description":{},
- "dcterms:format":{},
- "dcterms:identifier":{},
- "dcterms:language":{},
- "dcterms:publisher":{},
- "dcterms:relation":{},
- "dcterms:rights" : {
- "format":"uri",
- "type":"string"
- },
- "dcterms:source":{},
- "dcterms:subject":{},
- "dcterms:title":{},
- "dcterms:type":{}
- },
- "additionalProperties": False,
- "required":["dcterms:title","media:id"]
-}
- try:
- validate(metadata_dict, schema)
- output_dict = metadata_dict
- # "media:id" is only for internal use, so we delete it for the output
- del output_dict['media:id']
-
- except ValidationError, exc:
- title = (metadata_dict.get('dcterms:title') or
- metadata_dict.get('media:id') or _(u'UNKNOWN FILE'))
-
- if exc.validator == "additionalProperties":
- message = _(u'Invalid metadata provided for file "{title}". This \
-script only accepts the Dublin Core metadata terms.'.format(title=title))
-
- elif exc.validator == "required":
- message = _(
-u'All necessary metadata was not provided for file "{title}", you must include \
-a "dcterms:title" column for each media file'.format(title=title))
-
- else:
- message = _(u'Could not find appropriate metadata for file \
-"{title}".'.format(title=title))
-
- print _(u"""WARN: {message} \nSkipping File...\n""".format(
- message=message))
-
- output_dict = {}
- except:
- raise
-
- return output_dict
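For reference, a minimal sketch of the new per-row flow above. The
compact_and_validate call and the error handling mirror the hunk; the
sample row and its values are invented for illustration:

from jsonschema.exceptions import ValidationError
from mediagoblin.tools.metadata import compact_and_validate

# Hypothetical row parsed out of metadata.csv; "license" is
# deliberately not a URI so the error path fires.
file_metadata = {
    "media:id": "1",
    "dcterms:title": "Fuzzy Cat",
    "license": "not-a-uri"}

try:
    json_ld_metadata = compact_and_validate(file_metadata)
except ValidationError, exc:
    # exc.path[0] names the offending property, here "license"
    print "Error with '%s' value '%s': %s" % (
        file_metadata["media:id"], exc.path[0], exc.message)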
diff --git a/mediagoblin/static/metadata/rdfa11.jsonld b/mediagoblin/static/metadata/rdfa11.jsonld
new file mode 100644
index 00000000..b2557233
--- /dev/null
+++ b/mediagoblin/static/metadata/rdfa11.jsonld
@@ -0,0 +1,48 @@
+{
+ "@context": {
+ "cat": "http://www.w3.org/ns/dcat#",
+ "qb": "http://purl.org/linked-data/cube#",
+ "grddl": "http://www.w3.org/2003/g/data-view#",
+ "ma": "http://www.w3.org/ns/ma-ont#",
+ "owl": "http://www.w3.org/2002/07/owl#",
+ "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+ "rdfa": "http://www.w3.org/ns/rdfa#",
+ "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+ "rif": "http://www.w3.org/2007/rif#",
+ "rr": "http://www.w3.org/ns/r2rml#",
+ "skos": "http://www.w3.org/2004/02/skos/core#",
+ "skosxl": "http://www.w3.org/2008/05/skos-xl#",
+ "wdr": "http://www.w3.org/2007/05/powder#",
+ "void": "http://rdfs.org/ns/void#",
+ "wdrs": "http://www.w3.org/2007/05/powder-s#",
+ "xhv": "http://www.w3.org/1999/xhtml/vocab#",
+ "xml": "http://www.w3.org/XML/1998/namespace",
+ "xsd": "http://www.w3.org/2001/XMLSchema#",
+ "prov": "http://www.w3.org/ns/prov#",
+ "sd": "http://www.w3.org/ns/sparql-service-description#",
+ "org": "http://www.w3.org/ns/org#",
+ "gldp": "http://www.w3.org/ns/people#",
+ "cnt": "http://www.w3.org/2008/content#",
+ "dcat": "http://www.w3.org/ns/dcat#",
+ "earl": "http://www.w3.org/ns/earl#",
+ "ht": "http://www.w3.org/2006/http#",
+ "ptr": "http://www.w3.org/2009/pointers#",
+ "cc": "http://creativecommons.org/ns#",
+ "ctag": "http://commontag.org/ns#",
+ "dc": "http://purl.org/dc/terms/",
+ "dc11": "http://purl.org/dc/elements/1.1/",
+ "dcterms": "http://purl.org/dc/terms/",
+ "foaf": "http://xmlns.com/foaf/0.1/",
+ "gr": "http://purl.org/goodrelations/v1#",
+ "ical": "http://www.w3.org/2002/12/cal/icaltzd#",
+ "og": "http://ogp.me/ns#",
+ "rev": "http://purl.org/stuff/rev#",
+ "sioc": "http://rdfs.org/sioc/ns#",
+ "v": "http://rdf.data-vocabulary.org/#",
+ "vcard": "http://www.w3.org/2006/vcard/ns#",
+ "schema": "http://schema.org/",
+ "describedby": "http://www.w3.org/2007/05/powder-s#describedby",
+ "license": "http://www.w3.org/1999/xhtml/vocab#license",
+ "role": "http://www.w3.org/1999/xhtml/vocab#role"
+ }
+}
\ No newline at end of file
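This file mirrors the W3C RDFa 1.1 initial context, which load_context
(in mediagoblin/tools/metadata.py below) serves from disk for the URL
http://www.w3.org/2013/json-ld-context/rdfa11, avoiding a network
fetch. As a rough illustration with invented data, such a context lets
pyld compact full IRIs down to the short prefixed keys used in the
code:

from pyld import jsonld

context = {"@context": {"dcterms": "http://purl.org/dc/terms/"}}  # excerpt
document = {"http://purl.org/dc/terms/title": "Fuzzy Cat"}
print jsonld.compact(document, context)
# roughly: {u'@context': {...}, u'dcterms:title': u'Fuzzy Cat'}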
diff --git a/mediagoblin/tools/metadata.py b/mediagoblin/tools/metadata.py
new file mode 100644
index 00000000..7de5a514
--- /dev/null
+++ b/mediagoblin/tools/metadata.py
@@ -0,0 +1,213 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import copy
+import json
+import re
+from pkg_resources import resource_filename
+
+import dateutil.parser
+from pyld import jsonld
+from jsonschema import validate, FormatChecker, draft4_format_checker
+from jsonschema.compat import str_types
+
+from mediagoblin.tools.pluginapi import hook_handle
+
+
+
+########################################################
+## Set up the MediaGoblin format checker for json-schema
+########################################################
+
+URL_REGEX = re.compile(
+ r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
+ re.IGNORECASE)
+
+def is_uri(instance):
+ """
+ jsonschema uri validator
+ """
+ if not isinstance(instance, str_types):
+ return True
+
+ return URL_REGEX.match(instance)
+
+def is_datetime(instance):
+ """
+ Is the instance a string readable as a date or datetime?
+ """
+ if not isinstance(instance, str_types):
+ return True
+
+ return dateutil.parser.parse(instance)
+
+
+class DefaultChecker(FormatChecker):
+ """
+ Default MediaGoblin format checker... extended to include a few extra things
+ """
+ checkers = copy.deepcopy(draft4_format_checker.checkers)
+
+
+DefaultChecker.checkers[u"uri"] = (is_uri, ())
+DefaultChecker.checkers[u"date-time"] = (is_datetime, (ValueError, TypeError))
+DEFAULT_CHECKER = DefaultChecker()
+
+# Crappy default schema, checks for things we deem important
+
+DEFAULT_SCHEMA = {
+ "$schema": "http://json-schema.org/schema#",
+
+ "type": "object",
+ "properties": {
+ "license": {
+ "format": "uri",
+ "type": "string",
+ },
+ "dcterms:created": {
+ "format": "date-time",
+ "type": "string",
+ }
+ },
+}
+
+
+def load_resource(package, resource_path):
+ """
+ Load a resource, return it as a string.
+
+ Args:
+ - package: package or module name. Eg "mediagoblin.media_types.audio"
+ - resource_path: path to get to this resource, a list of
+ directories and finally a filename. Will be joined with
+ os.path.sep.
+ """
+ filename = resource_filename(package, os.path.sep.join(resource_path))
+ return file(filename).read()
+
+def load_resource_json(package, resource_path):
+ """
+ Load a resource json file, return a dictionary.
+
+ Args:
+ - package: package or module name. Eg "mediagoblin.media_types.audio"
+ - resource_path: path to get to this resource, a list of
+ directories and finally a filename. Will be joined with
+ os.path.sep.
+ """
+ return json.loads(load_resource(package, resource_path))
+
+
+##################################
+## Load the MediaGoblin core files
+##################################
+
+
+BUILTIN_CONTEXTS = {
+ "http://www.w3.org/2013/json-ld-context/rdfa11": load_resource(
+ "mediagoblin", ["static", "metadata", "rdfa11.jsonld"])}
+
+
+_CONTEXT_CACHE = {}
+
+def load_context(url):
+ """
+ A self-aware document loader. For those contexts MediaGoblin
+ stores internally, load them from disk.
+ """
+ if url in _CONTEXT_CACHE:
+ return _CONTEXT_CACHE[url]
+
+ # See if it's one of our basic ones
+ document = BUILTIN_CONTEXTS.get(url, None)
+
+ # No? See if we have an internal schema for this
+ if document is None:
+ document = hook_handle(("context_url_data", url))
+
+ # Okay, if we've gotten a document by now... let's package it up
+ if document is not None:
+ document = {'contextUrl': None,
+ 'documentUrl': url,
+ 'document': document}
+
+ # Otherwise, use jsonld.load_document
+ else:
+ document = jsonld.load_document(url)
+
+ # cache
+ _CONTEXT_CACHE[url] = document
+ return document
+
+
+DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"
+
+def compact_json(metadata, context=DEFAULT_CONTEXT):
+ """
+ Compact json with supplied context.
+
+ Note: "Free floating" nodes are removed (eg a key just named
+ "bazzzzzz", which isn't specified in the context); something like
+ bazzzzzz:blerp will stay, though. This is jsonld.compact behavior.
+ """
+ compacted = jsonld.compact(
+ metadata, context,
+ options={
+ "documentLoader": load_context,
+ # This allows for things like "license" to be preserved
+ "expandContext": context,
+ "keepFreeFloatingNodes": False})
+
+ return compacted
+
+
+def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
+ schema=DEFAULT_SCHEMA):
+ """
+ Compact json with the supplied context and check it against the
+ schema for errors.
+
+ Raises an exception (jsonschema.exceptions.ValidationError) if
+ there's an error.
+
+ Note: "Free floating" nodes are removed (eg a key just named
+ "bazzzzzz", which isn't specified in the context); something like
+ bazzzzzz:blerp will stay, though. This is jsonld.compact behavior.
+
+ You may wish to do this validation yourself... this is just for convenience.
+ """
+ compacted = compact_json(metadata, context)
+ validate(compacted, schema, format_checker=DEFAULT_CHECKER)
+
+ return compacted
+
+
+def expand_json(metadata, context=DEFAULT_CONTEXT):
+ """
+ Expand json, but be sure to use our documentLoader.
+
+ By default this expands with DEFAULT_CONTEXT, but if you do not need this,
+ you can safely set this to None.
+
+ # @@: Is the above a good idea? Maybe it should be set to None by
+ # default.
+ """
+ options = {
+ "documentLoader": load_context}
+ if context is not None:
+ options["expandContext"] = context
+ return jsonld.expand(metadata, options=options)
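Putting the pieces together, a short usage sketch of the new module;
the metadata values are hypothetical, the functions are as defined
above:

from mediagoblin.tools.metadata import compact_and_validate, expand_json

# Eg one row's worth of metadata out of a metadata.csv
metadata = {
    "dcterms:title": u"Fuzzy Cat",
    "dcterms:created": u"2014-04-21",
    "license": u"http://creativecommons.org/licenses/by-sa/3.0/"}

# Compact against the bundled rdfa11 context and check the result
# against DEFAULT_SCHEMA; raises jsonschema.exceptions.ValidationError
# on bad values, eg a "license" that isn't a URI.
compacted = compact_and_validate(metadata)

# Expand back out to full IRIs (a list of node objects), reusing the
# same offline document loader, so no network fetch is needed.
expanded = expand_json(compacted)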