Diffstat (limited to 'mediagoblin/gmg_commands/batchaddmedia.py')
-rw-r--r-- | mediagoblin/gmg_commands/batchaddmedia.py | 136
1 file changed, 60 insertions(+), 76 deletions(-)
diff --git a/mediagoblin/gmg_commands/batchaddmedia.py b/mediagoblin/gmg_commands/batchaddmedia.py
index 55ed865b..88fa3e5a 100644
--- a/mediagoblin/gmg_commands/batchaddmedia.py
+++ b/mediagoblin/gmg_commands/batchaddmedia.py
@@ -14,19 +14,18 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-from __future__ import print_function
+from __future__ import print_function, unicode_literals
 
-import codecs
 import csv
 import os
-import sys
+import shutil
+import tempfile
 
 import requests
 import six
-
 from six.moves.urllib.parse import urlparse
 
-from mediagoblin.db.models import LocalUser
+from mediagoblin.db.models import LocalUser, MediaEntry
 from mediagoblin.gmg_commands import util as commands_util
 from mediagoblin.submit.lib import (
     submit_media, FileUploadLimit, UserUploadLimit, UserPastUploadLimit)
@@ -38,21 +37,21 @@ from jsonschema.exceptions import ValidationError
 def parser_setup(subparser):
     subparser.description = """\
 This command allows the administrator to upload many media files at once."""
-    subparser.epilog = _(u"""For more information about how to properly run this
+    subparser.epilog = _("""For more information about how to properly run this
 script (and how to format the metadata csv file), read the MediaGoblin
 documentation page on command line uploading
 <http://docs.mediagoblin.org/siteadmin/commandline-upload.html>""")
     subparser.add_argument(
         'username',
-        help=_(u"Name of user these media entries belong to"))
+        help=_("Name of user these media entries belong to"))
     subparser.add_argument(
         'metadata_path',
         help=_(
-u"""Path to the csv file containing metadata information."""))
+"""Path to the csv file containing metadata information."""))
     subparser.add_argument(
         '--celery',
         action='store_true',
-        help=_(u"Don't process eagerly, pass off to celery"))
+        help=_("Don't process eagerly, pass off to celery"))
 
 
 def batchaddmedia(args):
@@ -69,7 +68,7 @@ def batchaddmedia(args):
         LocalUser.username==args.username.lower()
     ).first()
     if user is None:
-        print(_(u"Sorry, no user by username '{username}' exists".format(
+        print(_("Sorry, no user by username '{username}' exists".format(
             username=args.username)))
         return
 
@@ -77,7 +76,7 @@ def batchaddmedia(args):
         metadata_path = args.metadata_path
 
     else:
-        error = _(u'File at {path} not found, use -h flag for help'.format(
+        error = _('File at {path} not found, use -h flag for help'.format(
             path=args.metadata_path))
         print(error)
         return
@@ -85,19 +84,12 @@ def batchaddmedia(args):
     abs_metadata_filename = os.path.abspath(metadata_path)
     abs_metadata_dir = os.path.dirname(abs_metadata_filename)
 
-    def maybe_unicodeify(some_string):
-        # this is kinda terrible
-        if some_string is None:
-            return None
-        else:
-            return six.text_type(some_string)
-
-    with codecs.open(
-            abs_metadata_filename, 'r', encoding='utf-8') as all_metadata:
-        contents = all_metadata.read()
-        media_metadata = parse_csv_file(contents)
+    all_metadata = open(abs_metadata_filename, 'r')
+    media_metadata = csv.DictReader(all_metadata)
+    for index, file_metadata in enumerate(media_metadata):
+        if six.PY2:
+            file_metadata = {k.decode('utf-8'): v.decode('utf-8') for k, v in file_metadata.items()}
 
-    for media_id, file_metadata in media_metadata.items():
         files_attempted += 1
         # In case the metadata was not uploaded initialize an empty dictionary.
         json_ld_metadata = compact_and_validate({})
@@ -108,6 +100,7 @@ def batchaddmedia(args):
 
         ### Pull the important media information for mediagoblin from the
         ### metadata, if it is provided.
+        slug = file_metadata.get('slug')
         title = file_metadata.get('title') or file_metadata.get('dc:title')
         description = (file_metadata.get('description') or
                 file_metadata.get('dc:description'))
@@ -117,7 +110,8 @@ def batchaddmedia(args):
         try:
             json_ld_metadata = compact_and_validate(file_metadata)
         except ValidationError as exc:
-            error = _(u"""Error with media '{media_id}' value '{error_path}': {error_msg}
+            media_id = file_metadata.get('id') or index
+            error = _("""Error with media '{media_id}' value '{error_path}': {error_msg}
 Metadata was not uploaded.""".format(
                 media_id=media_id,
                 error_path=exc.path[0],
@@ -125,12 +119,36 @@ Metadata was not uploaded.""".format(
             print(error)
             continue
 
+        if slug and MediaEntry.query.filter_by(actor=user.id, slug=slug).count():
+            # Avoid re-importing media from a previous batch run. Note that this
+            # check isn't quite robust enough, since it requires that a slug is
+            # specified. Probably needs to be based on "location" since this is
+            # the only required field.
+            error = '{}: {}'.format(
+                slug, _('An entry with that slug already exists for this user.'))
+            print(error)
+            continue
+
         url = urlparse(original_location)
         filename = url.path.split()[-1]
 
-        if url.scheme == 'http':
+        if url.scheme.startswith('http'):
             res = requests.get(url.geturl(), stream=True)
-            media_file = res.raw
+            if res.headers.get('content-encoding'):
+                # The requests library's "raw" method does not deal with content
+                # encoding. Alternative could be to use iter_content(), and
+                # write chunks to the temporary file.
+                raise NotImplementedError('URL-based media with content-encoding (eg. gzip) are not currently supported.')
+
+            # To avoid loading the media into memory all at once, we write it to
+            # a file before importing. This currently requires free space up to
+            # twice the size of the media file. Memory use can be tested by
+            # running something like `ulimit -Sv 200000` before running
+            # `batchaddmedia` to upload a file larger than 200MB.
+            media_file = tempfile.TemporaryFile()
+            shutil.copyfileobj(res.raw, media_file)
+            if six.PY2:
+                media_file.seek(0)
 
         elif url.scheme == '':
             path = url.path
@@ -142,76 +160,42 @@ Metadata was not uploaded.""".format(
         try:
             media_file = open(file_abs_path, 'rb')
         except IOError:
-            print(_(u"""\
+            print(_("""\
 FAIL: Local file {filename} could not be accessed.
 {filename} will not be uploaded.""".format(filename=filename)))
             continue
         try:
-            submit_media(
+            entry = submit_media(
                 mg_app=app,
                 user=user,
                 submitted_file=media_file,
                 filename=filename,
-                title=maybe_unicodeify(title),
-                description=maybe_unicodeify(description),
-                collection_slug=maybe_unicodeify(collection_slug),
-                license=maybe_unicodeify(license),
+                title=title,
+                description=description,
+                collection_slug=collection_slug,
+                license=license,
                 metadata=json_ld_metadata,
-                tags_string=u"")
-            print(_(u"""Successfully submitted {filename}!
+                tags_string="")
+            if slug:
+                # Slug is automatically set by submit_media, so overwrite it
+                # with the desired slug.
+                entry.slug = slug
+                entry.save()
+            print(_("""Successfully submitted {filename}!
 Be sure to look at the Media Processing Panel on your website to be sure it
 uploaded successfully.""".format(filename=filename)))
             files_uploaded += 1
         except FileUploadLimit:
             print(_(
-u"FAIL: This file is larger than the upload limits for this site."))
+"FAIL: This file is larger than the upload limits for this site."))
         except UserUploadLimit:
             print(_(
                 "FAIL: This file will put this user past their upload limits."))
         except UserPastUploadLimit:
             print(_("FAIL: This user is already past their upload limits."))
+        finally:
+            media_file.close()
     print(_(
         "{files_uploaded} out of {files_attempted} files successfully submitted".format(
             files_uploaded=files_uploaded,
             files_attempted=files_attempted)))
-
-
-def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
-    # csv.py doesn't do Unicode; encode temporarily as UTF-8:
-    # TODO: this probably won't be necessary in Python 3
-    csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
-                            dialect=dialect, **kwargs)
-    for row in csv_reader:
-        # decode UTF-8 back to Unicode, cell by cell:
-        yield [six.text_type(cell, 'utf-8') for cell in row]
-
-def utf_8_encoder(unicode_csv_data):
-    for line in unicode_csv_data:
-        yield line.encode('utf-8')
-
-def parse_csv_file(file_contents):
-    """
-    The helper function which converts the csv file into a dictionary where each
-    item's key is the provided value 'id' and each item's value is another
-    dictionary.
-    """
-    list_of_contents = file_contents.split('\n')
-    key, lines = (list_of_contents[0].split(','),
-                  list_of_contents[1:])
-    objects_dict = {}
-
-    # Build a dictionary
-    for index, line in enumerate(lines):
-        if line.isspace() or line == u'': continue
-        if (sys.version_info[0] == 3):
-            # Python 3's csv.py supports Unicode out of the box.
-            reader = csv.reader([line])
-        else:
-            reader = unicode_csv_reader([line])
-        values = next(reader)
-        line_dict = dict([(key[i], val)
-                          for i, val in enumerate(values)])
-        media_id = line_dict.get('id') or index
-        objects_dict[media_id] = (line_dict)
-
-    return objects_dict
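
For context on the change above: the deleted parse_csv_file/unicode_csv_reader/utf_8_encoder helpers are replaced by the standard library's csv.DictReader, which yields one dict per row, keyed by the CSV header row. A minimal sketch of that reading pattern, using a hypothetical metadata.csv (the column values are made up; per the commit's own comment, only "location" is required):

    import csv

    # metadata.csv (hypothetical contents):
    #   location,title,slug
    #   media/pic.jpg,A picture,a-picture
    with open('metadata.csv', 'r') as all_metadata:
        for index, file_metadata in enumerate(csv.DictReader(all_metadata)):
            # Each row arrives as a dict, e.g. {'location': 'media/pic.jpg', ...}
            print(index, file_metadata.get('title'), file_metadata.get('slug'))

Note that the commit opens the metadata file without a with-block and never explicitly closes it; DictReader itself only needs an iterable of lines.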
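Similarly, the remote-URL branch now streams the download to a temporary file instead of handing submit_media the raw urllib3 stream. A standalone sketch of that technique, assuming the response carries no content-encoding (the helper name is made up for illustration):

    import shutil
    import tempfile

    import requests

    def download_to_tempfile(url):
        # Stream the body to disk in chunks so the whole file is never
        # held in memory at once.
        res = requests.get(url, stream=True)
        media_file = tempfile.TemporaryFile()
        shutil.copyfileobj(res.raw, media_file)
        # Rewind so the caller reads from the start; the commit itself
        # only does this under six.PY2.
        media_file.seek(0)
        return media_file

This trades memory for disk: as the commit's comment notes, it can briefly require free space up to twice the size of the media file.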