I am processing some audio data: I have an audio file that I created by splitting a larger file on silence using pydub.
However, if I take this exported file, convert the AudioSegment's samples to a NumPy array, and re-write it using soundfile, the resulting file plays at about half the speed of the original. What could be going wrong?
import soundfile as sf
import numpy as np
from pydub import AudioSegment, effects
from pydub.silence import split_on_silence
from pathlib import Path
# original_audio_mp3 is the path to a large .mp3 file with a sample rate of 44100 Hz (44.1 kHz)
desired_sample_rate = 16000
sound = AudioSegment.from_file(original_audio_mp3)
if sound.frame_rate != desired_sample_rate:
    sound = sound.set_frame_rate(desired_sample_rate)  # resample to 16000 Hz
sound = effects.normalize(sound) # normalize audio file
dBFS = sound.dBFS # get decibels relative to full scale
sound_chunks = split_on_silence(
    sound,
    min_silence_len=200,       # measured in ms
    silence_thresh=dBFS - 30,  # anything 30 dB below the file's dBFS is treated as "silence"
)
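(The chunk-export step isn't shown above; a minimal sketch of how "audio_segment_0.wav" and its siblings get written, assuming simple sequential naming:)

# sketch: write each detected chunk to its own .wav file
for i, chunk in enumerate(sound_chunks):
    chunk.export(f"audio_segment_{i}.wav", format="wav")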
# this "audio_segment_0.wav" file came from the above code.
audio_file_path = Path("audio_segment_0.wav")
raw_audio = AudioSegment.from_file(audio_file_path).set_frame_rate(16000)
raw_audio = effects.normalize(raw_audio)  # normalize the chunk
# append 200 ms of silence to the beginning and end of the file
silence = AudioSegment.silent(duration=200, frame_rate=16000)
raw_audio_w_silence = silence + raw_audio + silence
# export it
raw_audio_w_silence.export("pydub_audio.wav", format = 'wav') # the output from this sounds completely OK.
# read audio, manipulate and write with soundfile
new_audio = AudioSegment.from_file("pydub_audio.wav").set_frame_rate(16000)
new_audio_signal = np.array(new_audio.get_array_of_samples(), dtype=np.float32) / 32768.0  # scale 16-bit samples to [-1.0, 1.0]
# the file written here plays at about half the speed of the pydub export above.
sf.write("soundfile_export.wav", data=new_audio_signal, samplerate=new_audio.frame_rate, format='wav')
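To narrow this down, here is a small diagnostic sketch I can run on both exports; it assumes the two files above are on disk and just compares their reported durations and channel layouts:

# sketch: compare durations and channel counts of the two exports
import soundfile as sf
from pydub import AudioSegment

for path in ("pydub_audio.wav", "soundfile_export.wav"):
    info = sf.info(path)
    seg = AudioSegment.from_file(path)
    print(f"{path}: {info.duration:.2f} s, {info.channels} channel(s) per soundfile, "
          f"{seg.channels} channel(s) / sample_width {seg.sample_width} per pydub")

I wonder whether the channel count matters here: get_array_of_samples() flattens stereo audio into one interleaved array, so writing that as a 1-D (mono) signal would double the apparent number of frames.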