2

I am trying to apply some image processing techniques to a spectrogram that was created from an audio file. In this example, I would like to apply a de-noising algorithm to the spectrogram and then invert it back to audio. How would this be done correctly, so that I can manipulate a spectrogram and then return to audio without losing much of the initial quality of the signal? Obviously I am doing something wrong here, so any help would be hugely appreciated.

I used part of the code found here: How can I save a Librosa spectrogram plot as a specific sized image?

Here is the code I'm working on:

!pip install librosa --upgrade

import librosa
import matplotlib.pyplot as plt
import numpy as np
import librosa.display
from IPython.display import Audio,display
from scipy.io.wavfile import write
import skimage.io
from skimage.color import rgb2gray
import cv2



def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale the array ``X`` into the range ``[min, max]``.

    The parameters are named ``min``/``max`` for backward compatibility with
    existing callers, even though they shadow the builtins.

    Parameters
    ----------
    X : np.ndarray
        Input array (any shape).
    min, max : float
        Lower and upper bound of the output range.

    Returns
    -------
    np.ndarray
        Rescaled array of the same shape as ``X``.
    """
    x_min = X.min()
    x_range = X.max() - x_min
    if x_range == 0:
        # Constant input: the naive expression would divide by zero and
        # produce NaNs; map everything to the lower bound instead.
        return np.full_like(X, min, dtype=float)
    X_std = (X - x_min) / x_range
    return X_std * (max - min) + min

def spectrogram_image(y, sr, out, hop_length, n_mels):
    """Compute a magnitude spectrogram of ``y``, save it as an 8-bit PNG.

    The STFT magnitude is converted to dB, min-max scaled to [0, 255],
    flipped vertically (low frequencies at the bottom) and inverted so that
    black pixels correspond to high energy.

    Parameters
    ----------
    y : np.ndarray
        Audio time series.
    sr : int
        Sample rate (currently unused; kept for interface compatibility).
    out : str
        Path of the PNG file to write.
    hop_length : int
        STFT hop length; ``n_fft`` is fixed at ``2 * hop_length``.
    n_mels : int
        Unused — the function computes a linear STFT, not a mel spectrogram.
        Kept so existing callers keep working.

    Returns
    -------
    np.ndarray
        The uint8 image that was written to ``out``.
    """
    # n_fft = 2*hop_length gives hop_length + 1 frequency bins.
    stft = np.abs(librosa.stft(y=y, n_fft=hop_length * 2, hop_length=hop_length))
    amp2db = librosa.amplitude_to_db(stft, ref=np.max)

    # min-max scale to fit inside 8-bit range.
    # BUG FIX: the file imports numpy as `np`; the original used the bare
    # name `numpy`, which raises NameError at runtime.
    img = scale_minmax(amp2db, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0)  # put low frequencies at the bottom in image
    img = 255 - img             # invert. make black==more energy

    # save as PNG
    skimage.io.imsave(out, img)

    return img


if __name__ == '__main__':
    # settings
    hop_length = 512   # number of samples per time-step in spectrogram
    n_mels = 128       # unused by spectrogram_image; kept from the original
    time_steps = 384   # number of time-steps. Width of image

    # load audio. Using example from librosa
    path = librosa.util.example_audio_file()
    y, sr = librosa.load(path)
    out = 'out.png'

    # extract a fixed length window
    start_sample = 0  # starting at beginning
    length_samples = time_steps * hop_length
    window = y[start_sample:start_sample + length_samples]

    # convert to PNG
    img_png = spectrogram_image(window, sr=sr, out=out,
                                hop_length=hop_length, n_mels=n_mels)
    print('wrote file', out)

    # ---- image-domain denoising ------------------------------------------
    # NOTE: everything below was previously dedented out of the __main__
    # guard, so it ran on import and referenced the undefined name `numpy`.
    # fastNlMeansDenoisingColored needs a 3-channel image, so expand first.
    converted_img = cv2.cvtColor(img_png, cv2.COLOR_GRAY2BGR)
    dst = cv2.fastNlMeansDenoisingColored(converted_img, None, 10, 10, 7, 21)
    dst = rgb2gray(dst)  # back to a single channel, floats in [0, 1]

    # spectrogram_image flipped the rows for display; flip back so that
    # frequency bin 0 is row 0 again before converting to audio.
    dst = np.flip(dst, axis=0)

    fig = plt.figure(figsize=(32, 16))
    plt.subplot(211), plt.imshow(img_png)
    plt.subplot(212), plt.imshow(dst)
    plt.show()

    # ---- back to audio ----------------------------------------------------
    # Undo the image transforms: re-invert (the image made black==energy)
    # and rescale [0, 1] back to an approximate dB span, then dB -> amplitude.
    # BUG FIX: the original applied amplitude_to_db a *second* time here;
    # the inverse operation, db_to_amplitude, is required instead.
    db = scale_minmax(1.0 - dst, -80.0, 0.0)
    mag = librosa.db_to_amplitude(db)

    # The phase was discarded by np.abs() in spectrogram_image, so istft on
    # a magnitude-only array cannot reconstruct the signal; Griffin-Lim
    # estimates a phase consistent with the magnitudes.
    y_hat = librosa.griffinlim(mag, hop_length=hop_length)

    audio1 = Audio(y_hat, rate=sr)
    display(audio1)

    write("/content/XXX.wav", sr, y_hat)
davidjb
  • 8,247
  • 3
  • 32
  • 42
eskay
  • 21
  • 2
  • You should drop the conversion to an image, as it introduces 8-bit quantization, and the min-max scaling would have to be reversed. Instead, operate on the floating-point spectrograms. You might also want to skip the log step and work with linear spectrograms. – Jon Nordby Jun 12 '20 at 19:51
  • And you need to keep the phase around, so that you can use it when converting the spectrogram to audio again (after having manipulated the magnitude part of the spectrogram) – Jon Nordby Jun 12 '20 at 19:52

0 Answers0