# GNU MediaGoblin -- federated, autonomous media hosting # Copyright (C) 2011, 2012 MediaGoblin contributors. See AUTHORS. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from PIL import Image import soundfile import numpy SPECTROGRAM_MAX_FREQUENCY = 8000 # Old spectrogram.py sets upper limit to 22050 but # usually there isn't much detail in higher frequencies SPECTROGRAM_MIN_FREQUENCY = 20 SPECTROGRAM_DB_RANGE = 110 # Color palette copied from old spectrogram.py SPECTROGRAM_COLORS = [(58 / 4, 68 / 4, 65 / 4), (80 / 2, 100 / 2, 153 / 2), (90, 180, 100), (224, 224, 44), (255, 60, 30), (255, 255, 255)] # The purpose of this table is to give more horizontal # real estate to shorter sounds files. # Format: (pixels, (range_min, range_max)) # For sounds with a duration >= _range_min_ and < _range_max_ # give _pixel_ horizontal pixels for each second of audio. SPECTROGRAM_WIDTH_PERSECOND = [(240, ( 0, 20)), (120, ( 20, 30)), ( 60, ( 30, 60)), ( 30, ( 60, 120)), ( 15, (120, 240)), ( 6, (240, 100000))] # Upper limit is arbitrary. Sounds with longer # duration will still get assigned to the last bucket SPECTROGRAM_HEIGHT = 500 class AudioBlocksFFT: def __init__(self, fileName, blockSize, overlap, minFreq, maxFreq, numBins = None, windowFunction = numpy.hanning): self.audioData = soundfile.SoundFile(fileName, 'r') self.numChannels = self.audioData.channels self.sampleRate = self.audioData.samplerate self.minFreq = minFreq self.maxFreq = maxFreq self.blockSize = blockSize self.numBins = numBins self.overlap = overlap self.windowValues = windowFunction(blockSize) self.peakFFTValue = 0 try: # PySoundFile V0.10.0 adds SoundFile.frames property and deprecates __len__() self.totalSamples = self.audioData.frames except AttributeError: self.totalSamples = len(self.audioData) def peakFFTAmplitude(self): """ Peak amplitude of FFT for all blocks """ return self.peakFFTValue def totalSeconds(self): """ Total length in seconds """ return self.totalSamples / self.sampleRate def _filterFreqRange(self, fftAmplitude): """ Given a FFT amplitudes array keep only bins between minFreq, maxFreq """ nyquistFreq = self.sampleRate // 2 numBins = len(fftAmplitude) sliceWidth = nyquistFreq / numBins startIdx = int(self.minFreq / sliceWidth) endIdx = int(self.maxFreq / sliceWidth) if numBins <= endIdx: fftAmplitude = numpy.pad(fftAmplitude, (0, 1 + endIdx - numBins), 'constant', constant_values=(0)) else: fftAmplitude = fftAmplitude[:endIdx + 1] return fftAmplitude[startIdx:] def _resizeAmplitudeArray(self, amplitudeValues, newSize): """ Resize amplitude values array """ if len(amplitudeValues) == newSize: return amplitudeValues if newSize > len(amplitudeValues): # Resize up result = numpy.zeros(newSize) for idx in range(0, newSize): srcIdx = (idx * len(amplitudeValues)) // newSize result[idx] = amplitudeValues[srcIdx] return result # Resize down keeping peaks result = numpy.zeros(newSize) idx = 0 for slice in numpy.array_split(amplitudeValues, newSize): result[idx] = slice.max() idx = idx + 1 return result def __iter__(self): """ Read a block of audio data and compute FFT amplitudes """ self.audioData.seek(0) for fileBlock in self.audioData.blocks(blocksize = self.blockSize, overlap = self.overlap): # Mix down all channels to mono audioBlock = fileBlock[:,0] for channel in range(1, self.numChannels): audioBlock = numpy.add(audioBlock, fileBlock[:,channel]) # On the last block it may be necessary to pad with zeros if len(audioBlock) < self.blockSize: audioBlock = numpy.pad(audioBlock, (0, self.blockSize - len(audioBlock)), 'constant', constant_values=(0)) # Compute FFT amplitude of this block fftAmplitude = self._filterFreqRange(numpy.abs(numpy.fft.rfft(audioBlock * self.windowValues))) self.peakFFTValue = max(self.peakFFTValue, fftAmplitude.max()) # Resize if requested if not self.numBins is None: fftAmplitude = self._resizeAmplitudeArray(fftAmplitude, self.numBins) yield (fftAmplitude, self.audioData.tell() / self.sampleRate) class SpectrogramColorMap: def __init__(self, columnData): self.columnData = columnData self.width = len(columnData) self.height = len(columnData[0]) self._buildColorPalette() def _colorBetween(self, beginColor, endColor, step): """ Interpolate between two colors """ rS, gS, bS = beginColor rE, gE, bE = endColor r = int(numpy.sqrt((1.0 - step) * (rS * rS) + step * (rE * rE))) g = int(numpy.sqrt((1.0 - step) * (gS * gS) + step * (gE * gE))) b = int(numpy.sqrt((1.0 - step) * (bS * bS) + step * (bE * bE))) r = r if r < 256 else 255 g = g if g < 256 else 255 b = b if b < 256 else 255 return (r, g, b) def _buildColorPalette(self): """ Build color palette """ colorPoints = SPECTROGRAM_COLORS self.colors = [] for i in range(1, len(colorPoints)): for p in range(0, 200): self.colors.append(self._colorBetween(colorPoints[i - 1], colorPoints[i], p / 200)) def getColorData(self, progressCallback = None): """ Map spectrogram data to pixel colors """ pixels = [self.colors[0]] * (self.width * self.height) for x in range(0, self.width): for y in range(0, self.height): idx = x + self.width * y amplitudeVal = self.columnData[x][self.height - y - 1] colorIdx = int(len(self.colors) * amplitudeVal) colorIdx = colorIdx if colorIdx > 0 else 0 colorIdx = colorIdx if colorIdx < len(self.colors) else len(self.colors) - 1 pixels[idx] = self.colors[colorIdx] if progressCallback: progressCallback(100 * x / self.width) return pixels def drawSpectrogram(audioFileName, imageFileName, fftSize = 1024, fftOverlap = 0, progressCallback = None): """ Draw a spectrogram of the audio file """ # Fraction of total work for each step STEP_PERCENTAGE_FFT = 40 STEP_PERCENTAGE_NORMALIZE = 5 STEP_PERCENTAGE_ACCUMULATE = 10 STEP_PERCENTAGE_DRAW = 40 # Give last 5% to saving the file PERCENTAGE_REPORT_STEP = 2 nextReportedPercentage = PERCENTAGE_REPORT_STEP def wrapProgressCallback(percentage): nonlocal nextReportedPercentage percentage = int(percentage) if percentage >= nextReportedPercentage: if progressCallback: progressCallback(percentage) nextReportedPercentage = (1 + percentage // PERCENTAGE_REPORT_STEP) * PERCENTAGE_REPORT_STEP def mapColorsProgressCallback(percentage): wrapProgressCallback(STEP_PERCENTAGE_FFT + STEP_PERCENTAGE_NORMALIZE + STEP_PERCENTAGE_ACCUMULATE + (STEP_PERCENTAGE_DRAW * (percentage / 100))) imageWidthLookup = SPECTROGRAM_WIDTH_PERSECOND imageHeight = SPECTROGRAM_HEIGHT # Load audio file and compute FFT amplitudes fftBlocksSource = AudioBlocksFFT(audioFileName, fftSize, overlap = fftOverlap, minFreq = SPECTROGRAM_MIN_FREQUENCY, maxFreq = SPECTROGRAM_MAX_FREQUENCY, numBins = imageHeight) soundLength = fftBlocksSource.totalSeconds() fftAmplitudeBlocks = [] for fftAmplitude, positionSeconds in fftBlocksSource: fftAmplitudeBlocks.append(fftAmplitude) wrapProgressCallback(STEP_PERCENTAGE_FFT * (positionSeconds / soundLength)) totalProgress = STEP_PERCENTAGE_FFT # Normalize FFT amplitude and convert to log scale specRange = SPECTROGRAM_DB_RANGE for i in range(0, len(fftAmplitudeBlocks)): normalized = numpy.divide(fftAmplitudeBlocks[i], fftBlocksSource.peakFFTAmplitude()) fftAmplitudeBlocks[i] = ((20*(numpy.log10(normalized + 1e-60))).clip(-specRange, 0.0) + specRange)/specRange wrapProgressCallback(totalProgress + STEP_PERCENTAGE_NORMALIZE * (i / len(fftAmplitudeBlocks))) totalProgress = totalProgress + STEP_PERCENTAGE_NORMALIZE # Compute spectrogram width in pixels imageWidthPerSecond, lengthRage = imageWidthLookup[-1] for widthPerSecond, lengthLimit in imageWidthLookup: limitLow, limitHigh = lengthLimit if soundLength > limitLow and soundLength <= limitHigh: imageWidthPerSecond = widthPerSecond break imageWidth = int(imageWidthPerSecond * soundLength) # Compute spectrogram values columnValues = numpy.zeros(imageHeight) spectrogram = [] x = 0 for idx in range(0, len(fftAmplitudeBlocks)): newX = (idx * imageWidth) // len(fftAmplitudeBlocks) if newX != x: # Save column spectrogram.append(numpy.copy(columnValues)) x = newX columnValues.fill(0) columnValues = numpy.maximum(columnValues, fftAmplitudeBlocks[idx]) wrapProgressCallback(totalProgress + STEP_PERCENTAGE_ACCUMULATE * (idx / len(fftAmplitudeBlocks))) spectrogram.append(numpy.copy(columnValues)) totalProgress = totalProgress + STEP_PERCENTAGE_ACCUMULATE # Draw spectrogram imageWidth = len(spectrogram) colorData = SpectrogramColorMap(spectrogram).getColorData(progressCallback = mapColorsProgressCallback) totalProgress = totalProgress + STEP_PERCENTAGE_DRAW # Save final image image = Image.new('RGB', (imageWidth, imageHeight)) image.putdata(colorData) image.save(imageFileName) if progressCallback: progressCallback(100) if __name__ == "__main__": import sys def printProgress(p): sys.stdout.write(f"\rProgress : {p}%") sys.stdout.flush() if not (len(sys.argv) == 2 or len(sys.argv) == 3): print(f"Usage:\n{sys.argv[0]} input_file [output_file]") exit() audioFile = sys.argv[1] if 3 == len(sys.argv): outputFile = sys.argv[2] else: outputFile = 'spectrogram.png' sys.stdout.write(f"Input : {audioFile}\nOutput : {outputFile}\n") drawSpectrogram(audioFile, outputFile, progressCallback = printProgress) sys.stdout.write("\nDone!\n")