-rw-r--r-- | mediagoblin/gmg_commands/batchaddmedia.py | 108
-rw-r--r-- | mediagoblin/static/metadata/rdfa11.jsonld |  48
-rw-r--r-- | mediagoblin/tools/metadata.py             | 213
3 files changed, 274 insertions, 95 deletions
diff --git a/mediagoblin/gmg_commands/batchaddmedia.py b/mediagoblin/gmg_commands/batchaddmedia.py
index 43c24f6d..e540e88c 100644
--- a/mediagoblin/gmg_commands/batchaddmedia.py
+++ b/mediagoblin/gmg_commands/batchaddmedia.py
@@ -24,12 +24,11 @@ from mediagoblin.gmg_commands import util as commands_util
 from mediagoblin.submit.lib import (
     submit_media, get_upload_file_limits,
     FileUploadLimit, UserUploadLimit, UserPastUploadLimit)
-from mediagoblin.tools.translate import lazy_pass_to_ugettext as _
+from mediagoblin.tools.metadata import compact_and_validate
 
-from mediagoblin import mg_globals
-from jsonschema import validate
 from jsonschema.exceptions import ValidationError
+
 
 def parser_setup(subparser):
     subparser.description = """\
 This command allows the administrator to upload many media files at once."""
@@ -49,11 +48,6 @@ Core properties (http://dublincore.org/documents/dces/). Both "location.csv" and
 have provided an example of these files at <url to be added>
 """))
     subparser.add_argument(
-        "-l", "--license",
-        help=(
-            "License these media entry will be released under, if all the same. "
-            "Should be a URL."))
-    subparser.add_argument(
         '--celery',
         action='store_true',
         help="Don't process eagerly, pass off to celery")
@@ -102,14 +96,12 @@ zip files and directories"
     metadata_file_path = os.path.join(dir_path, "metadata.csv")
 
     # check for the location file, if it exists...
-    location_filename = os.path.split(location_file_path)[-1]
     abs_location_filename = os.path.abspath(location_file_path)
     if not os.path.exists(abs_location_filename):
         print "Can't find a file with filename '%s'" % location_file_path
         return
 
     # check for the metadata file, if it exists...
-    metadata_filename = os.path.split(metadata_file_path)[-1]
     abs_metadata_filename = os.path.abspath(metadata_file_path)
     if not os.path.exists(abs_metadata_filename):
         print "Can't find a file with filename '%s'" % metadata_file_path
@@ -132,24 +124,24 @@ zip files and directories"
         contents = all_metadata.read()
         media_metadata = parse_csv_file(contents)
 
-    metadata_context = { 'dcterms':'http://purl.org/dc/terms/',
-                         'xsd': 'http://www.w3.org/2001/XMLSchema#'}
-
     for media_id in media_locations.keys():
         files_attempted += 1
 
-        file_metadata = media_metadata[media_id]
-        sanitized_metadata = check_metadata_format(file_metadata)
-        if sanitized_metadata == {}: continue
+        file_metadata = media_metadata[media_id]
+        try:
+            json_ld_metadata = compact_and_validate(file_metadata)
+        except ValidationError, exc:
+            print "Error with '%s' value '%s': %s" % (
+                media_id, exc.path[0], exc.message)
+            continue
 
-        json_ld_metadata = jsonld.compact(build_json_ld_metadata(file_metadata),
-                                          metadata_context)
         original_location = media_locations[media_id]['media:original']
         url = urlparse(original_location)
 
-        title = sanitized_metadata.get('dcterms:title')
-        description = sanitized_metadata.get('dcterms:description')
-        license = sanitized_metadata.get('dcterms:rights')
+        title = json_ld_metadata.get('dcterms:title')
+        description = json_ld_metadata.get('dcterms:description')
+
+        license = json_ld_metadata.get('license')
 
         filename = url.path.split()[-1]
 
         if url.scheme == 'http':
@@ -219,77 +211,3 @@ def parse_csv_file(file_contents):
 def teardown(temp_files):
     for temp_file in temp_files:
         subprocess.call(['rm','-r',temp_file])
-
-def build_json_ld_metadata(metadata_dict):
-    output_dict = {}
-    for p in metadata_dict.keys():
-        if p in ["dcterms:rights", "dcterms:relation"]:
-            m_type = "xsd:uri"
-        elif p in ["dcterms:date", "dcterms:created"]:
-            m_type = "xsd:date"
-        else:
-            m_type = "xsd:string"
-        description = {"@value": metadata_dict[p],
-                       "@type" : m_type}
-        output_dict[p] = description
-    return output_dict
-
-def check_metadata_format(metadata_dict):
-    schema = {
-        "$schema":"http://json-schema.org/schema#",
-        "properties":{
-            "media:id":{},
-            "dcterms:contributor":{},
-            "dcterms:coverage":{},
-            "dcterms:created":{},
-            "dcterms:creator":{},
-            "dcterms:date":{},
-            "dcterms:description":{},
-            "dcterms:format":{},
-            "dcterms:identifier":{},
-            "dcterms:language":{},
-            "dcterms:publisher":{},
-            "dcterms:relation":{},
-            "dcterms:rights" : {
-                "format":"uri",
-                "type":"string"
-            },
-            "dcterms:source":{},
-            "dcterms:subject":{},
-            "dcterms:title":{},
-            "dcterms:type":{}
-        },
-        "additionalProperties": False,
-        "required":["dcterms:title","media:id"]
-}
-    try:
-        validate(metadata_dict, schema)
-        output_dict = metadata_dict
-        # "media:id" is only for internal use, so we delete it for the output
-        del output_dict['media:id']
-
-    except ValidationError, exc:
-        title = (metadata_dict.get('dcterms:title') or
-                 metadata_dict.get('media:id') or _(u'UNKNOWN FILE'))
-
-        if exc.validator == "additionalProperties":
-            message = _(u'Invalid metadata provided for file "{title}". This \
-script only accepts the Dublin Core metadata terms.'.format(title=title))
-
-        elif exc.validator == "required":
-            message = _(
-u'All necessary metadata was not provided for file "{title}", you must include \
-a "dcterms:title" column for each media file'.format(title=title))
-
-        else:
-            message = _(u'Could not find appropriate metadata for file \
-"{title}".'.format(title=title))
-
-        print _(u"""WARN: {message} \nSkipping File...\n""".format(
-            message=message))
-
-        output_dict = {}
-    except:
-        raise
-
-    return output_dict
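[Editor's note: the check_metadata_format/build_json_ld_metadata pair deleted
above is replaced by compact_and_validate from the new
mediagoblin/tools/metadata.py. A minimal sketch of the new call on one parsed
metadata.csv row -- the row values here are invented, and it assumes a
MediaGoblin checkout with pyld, jsonschema, and python-dateutil importable:

    from jsonschema.exceptions import ValidationError
    from mediagoblin.tools.metadata import compact_and_validate

    # One row of metadata.csv, as parse_csv_file would hand it over
    # (minus the internal "media:id" column).
    file_metadata = {
        "dcterms:title": "Fancy Foot Bath",
        "license": "http://creativecommons.org/licenses/by-sa/3.0/",
        "dcterms:created": "2014-03-01",
    }

    # Success: returns the metadata compacted against the bundled
    # rdfa11 context, with an "@context" key attached.
    json_ld_metadata = compact_and_validate(file_metadata)

    # Failure: a bad value raises ValidationError, which the loop above
    # reports per media_id before skipping the file.
    try:
        compact_and_validate(dict(file_metadata, license="not-a-url"))
    except ValidationError, exc:
        print "Error with value '%s': %s" % (exc.path[0], exc.message)
]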
diff --git a/mediagoblin/static/metadata/rdfa11.jsonld b/mediagoblin/static/metadata/rdfa11.jsonld
new file mode 100644
index 00000000..b2557233
--- /dev/null
+++ b/mediagoblin/static/metadata/rdfa11.jsonld
@@ -0,0 +1,48 @@
+{
+    "@context": {
+        "cat": "http://www.w3.org/ns/dcat#",
+        "qb": "http://purl.org/linked-data/cube#",
+        "grddl": "http://www.w3.org/2003/g/data-view#",
+        "ma": "http://www.w3.org/ns/ma-ont#",
+        "owl": "http://www.w3.org/2002/07/owl#",
+        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+        "rdfa": "http://www.w3.org/ns/rdfa#",
+        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+        "rif": "http://www.w3.org/2007/rif#",
+        "rr": "http://www.w3.org/ns/r2rml#",
+        "skos": "http://www.w3.org/2004/02/skos/core#",
+        "skosxl": "http://www.w3.org/2008/05/skos-xl#",
+        "wdr": "http://www.w3.org/2007/05/powder#",
+        "void": "http://rdfs.org/ns/void#",
+        "wdrs": "http://www.w3.org/2007/05/powder-s#",
+        "xhv": "http://www.w3.org/1999/xhtml/vocab#",
+        "xml": "http://www.w3.org/XML/1998/namespace",
+        "xsd": "http://www.w3.org/2001/XMLSchema#",
+        "prov": "http://www.w3.org/ns/prov#",
+        "sd": "http://www.w3.org/ns/sparql-service-description#",
+        "org": "http://www.w3.org/ns/org#",
+        "gldp": "http://www.w3.org/ns/people#",
+        "cnt": "http://www.w3.org/2008/content#",
+        "dcat": "http://www.w3.org/ns/dcat#",
+        "earl": "http://www.w3.org/ns/earl#",
+        "ht": "http://www.w3.org/2006/http#",
+        "ptr": "http://www.w3.org/2009/pointers#",
+        "cc": "http://creativecommons.org/ns#",
+        "ctag": "http://commontag.org/ns#",
+        "dc": "http://purl.org/dc/terms/",
+        "dc11": "http://purl.org/dc/elements/1.1/",
+        "dcterms": "http://purl.org/dc/terms/",
+        "foaf": "http://xmlns.com/foaf/0.1/",
+        "gr": "http://purl.org/goodrelations/v1#",
+        "ical": "http://www.w3.org/2002/12/cal/icaltzd#",
+        "og": "http://ogp.me/ns#",
+        "rev": "http://purl.org/stuff/rev#",
+        "sioc": "http://rdfs.org/sioc/ns#",
+        "v": "http://rdf.data-vocabulary.org/#",
+        "vcard": "http://www.w3.org/2006/vcard/ns#",
+        "schema": "http://schema.org/",
+        "describedby": "http://www.w3.org/2007/05/powder-s#describedby",
+        "license": "http://www.w3.org/1999/xhtml/vocab#license",
+        "role": "http://www.w3.org/1999/xhtml/vocab#role"
+    }
+}
\ No newline at end of file
diff --git a/mediagoblin/tools/metadata.py b/mediagoblin/tools/metadata.py
new file mode 100644
index 00000000..7de5a514
--- /dev/null
+++ b/mediagoblin/tools/metadata.py
@@ -0,0 +1,213 @@
+# GNU MediaGoblin -- federated, autonomous media hosting
+# Copyright (C) 2011, 2012 MediaGoblin contributors.  See AUTHORS.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import copy
+import json
+import re
+from pkg_resources import resource_filename
+
+import dateutil.parser
+from pyld import jsonld
+from jsonschema import validate, FormatChecker, draft4_format_checker
+from jsonschema.compat import str_types
+
+from mediagoblin.tools.pluginapi import hook_handle
+
+
+
+########################################################
+## Set up the MediaGoblin format checker for json-schema
+########################################################
+
+URL_REGEX = re.compile(
+    r'^[a-z]+://([^/:]+|([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]+)?(\/.*)?$',
+    re.IGNORECASE)
+
+def is_uri(instance):
+    """
+    jsonschema uri validator
+    """
+    if not isinstance(instance, str_types):
+        return True
+
+    return URL_REGEX.match(instance)
+
+def is_datetime(instance):
+    """
+    Is a date or datetime readable string.
+    """
+    if not isinstance(instance, str_types):
+        return True
+
+    return dateutil.parser.parse(instance)
+
+
+class DefaultChecker(FormatChecker):
+    """
+    Default MediaGoblin format checker... extended to include a few extra things
+    """
+    checkers = copy.deepcopy(draft4_format_checker.checkers)
+
+
+DefaultChecker.checkers[u"uri"] = (is_uri, ())
+DefaultChecker.checkers[u"date-time"] = (is_datetime, (ValueError, TypeError))
+DEFAULT_CHECKER = DefaultChecker()
+
+# Crappy default schema, checks for things we deem important
+
+DEFAULT_SCHEMA = {
+    "$schema": "http://json-schema.org/schema#",
+
+    "type": "object",
+    "properties": {
+        "license": {
+            "format": "uri",
+            "type": "string",
+        },
+        "dcterms:created": {
+            "format": "date-time",
+            "type": "string",
+        }
+    },
+}
+
+
+def load_resource(package, resource_path):
+    """
+    Load a resource, return it as a string.
+
+    Args:
+     - package: package or module name.  Eg "mediagoblin.media_types.audio"
+     - resource_path: path to get to this resource, a list of
+       directories and finally a filename.  Will be joined with
+       os.path.sep.
+    """
+    filename = resource_filename(package, os.path.sep.join(resource_path))
+    return file(filename).read()
+
+def load_resource_json(package, resource_path):
+    """
+    Load a resource json file, return a dictionary.
+
+    Args:
+     - package: package or module name.  Eg "mediagoblin.media_types.audio"
+     - resource_path: path to get to this resource, a list of
+       directories and finally a filename.  Will be joined with
+       os.path.sep.
+    """
+    return json.loads(load_resource(package, resource_path))
+
+
+##################################
+## Load the MediaGoblin core files
+##################################
+
+
+BUILTIN_CONTEXTS = {
+    "http://www.w3.org/2013/json-ld-context/rdfa11": load_resource(
+        "mediagoblin", ["static", "metadata", "rdfa11.jsonld"])}
+
+
+_CONTEXT_CACHE = {}
+
+def load_context(url):
+    """
+    A self-aware document loader.  For those contexts MediaGoblin
+    stores internally, load them from disk.
+    """
+    if url in _CONTEXT_CACHE:
+        return _CONTEXT_CACHE[url]
+
+    # See if it's one of our basic ones
+    document = BUILTIN_CONTEXTS.get(url, None)
+
+    # No?  See if we have an internal schema for this
+    if document is None:
+        document = hook_handle(("context_url_data", url))
+
+    # Okay, if we've gotten a document by now... let's package it up
+    if document is not None:
+        document = {'contextUrl': None,
+                    'documentUrl': url,
+                    'document': document}
+
+    # Otherwise, use jsonld.load_document
+    else:
+        document = jsonld.load_document(url)
+
+    # cache
+    _CONTEXT_CACHE[url] = document
+    return document
+
+
+DEFAULT_CONTEXT = "http://www.w3.org/2013/json-ld-context/rdfa11"
+
+def compact_json(metadata, context=DEFAULT_CONTEXT):
+    """
+    Compact json with supplied context.
+
+    Note: "Free floating" nodes are removed (eg a key just named
+      "bazzzzzz" which isn't specified in the context)... something like
+      bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
+    """
+    compacted = jsonld.compact(
+        metadata, context,
+        options={
+            "documentLoader": load_context,
+            # This allows for things like "license" and etc to be preserved
+            "expandContext": context,
+            "keepFreeFloatingNodes": False})
+
+    return compacted
+
+
+def compact_and_validate(metadata, context=DEFAULT_CONTEXT,
+                         schema=DEFAULT_SCHEMA):
+    """
+    compact json with supplied context, check against schema for errors
+
+    raises an exception (jsonschema.exceptions.ValidationError) if
+    there's an error.
+
+    Note: "Free floating" nodes are removed (eg a key just named
+      "bazzzzzz" which isn't specified in the context)... something like
+      bazzzzzz:blerp will stay though.  This is jsonld.compact behavior.
+
+    You may wish to do this validation yourself... this is just for convenience.
+    """
+    compacted = compact_json(metadata, context)
+    validate(metadata, schema, format_checker=DEFAULT_CHECKER)
+
+    return compacted
+
+
+def expand_json(metadata, context=DEFAULT_CONTEXT):
+    """
+    Expand json, but be sure to use our documentLoader.
+
+    By default this expands with DEFAULT_CONTEXT, but if you do not need this,
+    you can safely set this to None.
+
+    # @@: Is the above a good idea?  Maybe it should be set to None by
+    #   default.
+    """
+    options = {
+        "documentLoader": load_context}
+    if context is not None:
+        options["expandContext"] = context
+    return jsonld.expand(metadata, options=options)