I am trying to apply image-processing techniques to a spectrogram created from an audio file. In this example I would like to apply a de-noising algorithm to the spectrogram and then invert it back to audio. How can I do this correctly, so that I can manipulate the spectrogram and then return to audio without losing much of the original signal quality? Obviously I am doing something wrong here, so any help would be hugely appreciated.
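Conceptually, the round trip I think I am after is: take the STFT, denoise only the magnitude, keep the original phase, recombine and invert. Something like this rough sketch (just my own understanding, not working code from anywhere, and I am not sure the phase handling is right):
import numpy as np
import librosa

y, sr = librosa.load(librosa.example('nutcracker'))
S = librosa.stft(y, n_fft=1024, hop_length=512)   # complex STFT
magnitude, phase = np.abs(S), np.angle(S)         # split magnitude and phase
magnitude_dn = magnitude                          # <-- the de-noising would happen here
S_dn = magnitude_dn * np.exp(1j * phase)          # recombine with the original phase
y_hat = librosa.istft(S_dn, hop_length=512)       # back to audio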
I used part of the code found here: How can I save a Librosa spectrogram plot as a specific sized image?
Here is the code I'm working on:
!pip install librosa --upgrade
import librosa
import matplotlib.pyplot as plt
import numpy as np
import librosa.display
from IPython.display import Audio,display
from scipy.io.wavfile import write
import skimage.io
from skimage.color import rgb2gray
import cv2
def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled
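# (hypothetical helper, not taken from the linked question) to get back from the 8-bit
# image to dB values I assume I need the inverse of scale_minmax, which also means the
# original min/max of the dB matrix would have to be kept somewhere:
def unscale_minmax(X, x_min, x_max, min=0.0, max=1.0):
    X_std = (X - min) / (max - min)
    return X_std * (x_max - x_min) + x_min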
def spectrogram_image(y, sr, out, hop_length, n_mels):
    # note: n_mels is currently unused here; this builds a linear-frequency STFT, not a mel spectrogram
    stft = np.abs(librosa.stft(y=y, n_fft=hop_length * 2, hop_length=hop_length))
    amp2db = librosa.amplitude_to_db(stft, ref=np.max)

    # min-max scale to fit inside 8-bit range
    img = scale_minmax(amp2db, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0)   # put low frequencies at the bottom of the image
    img = 255 - img              # invert so that black == more energy

    # save as PNG
    skimage.io.imsave(out, img)
    return img
if __name__ == '__main__':
    # settings
    hop_length = 512   # number of samples per time-step in the spectrogram
    n_mels = 128       # number of frequency bins (height of the image)
    time_steps = 384   # number of time-steps (width of the image)

    # load audio, using the librosa example file
    # (librosa.util.example_audio_file() no longer exists in recent librosa versions)
    path = librosa.example('nutcracker')
    y, sr = librosa.load(path)
    out = 'out.png'

    # extract a fixed-length window
    start_sample = 0   # starting at the beginning
    length_samples = time_steps * hop_length
    window = y[start_sample:start_sample + length_samples]

    # convert to PNG
    img_png = spectrogram_image(window, sr=sr, out=out, hop_length=hop_length, n_mels=n_mels)
    print('wrote file', out)
    # denoise the spectrogram image with OpenCV's non-local means
    converted_img = cv2.cvtColor(img_png, cv2.COLOR_GRAY2BGR)
    dst = cv2.fastNlMeansDenoisingColored(converted_img, None, 10, 10, 7, 21)
    dst = rgb2gray(dst)   # back to a single-channel float image in [0, 1]
    # dst = scale_minmax(dst, 0, 1.0).astype(np.float64)
    dst = np.flip(dst, axis=0)   # do I need this?

    fig = plt.figure(figsize=(32, 16))
    plt.subplot(211), plt.imshow(img_png)
    plt.subplot(212), plt.imshow(dst)
    plt.show()

    # back to audio -- this is the part where it goes wrong
    y = librosa.amplitude_to_db(dst)
    y_hat = librosa.istft(y)
    # y_hat = librosa.griffinlim(y)
    audio1 = Audio(y_hat, rate=sr)
    display(audio1)
    write("/content/XXX.wav", sr, y_hat)