I've been following this guide on generating an SRT subtitle file from video/audio files using Mozilla DeepSpeech.
Following the guide, I've been able to remove the silent portions of the audio and split the .wav file into multiple segmented .wav files using the pyAudioAnalysis library.
However, I'm having difficulty understanding how to read through the multiple segmented files and generate a subtitle .srt file using Mozilla DeepSpeech. I've attached an image of the segmented audio files above.
Most of my current code follows the guide, but the guide doesn't explain the functions in much detail.
SilenceRemoval Function
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS

def silenceRemoval(input_file, smoothing_window=1.0, weight=0.2):
    print("Running silenceRemoval function\n")
    [fs, x] = aIO.read_audio_file(input_file)
    # Returns a list of [start, end] times (in seconds) of the non-silent segments
    segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05, smoothing_window, weight)
    for i, s in enumerate(segmentLimits):
        # Encode the segment's start/end times in the output file name
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(input_file[0:-4], s[0], s[1])
        # wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
        write_file("audio", strOut, ".wav", x[int(fs * s[0]):int(fs * s[1])], fs)
    print("\nsilenceRemoval function completed")
Writing the .wav file into multiple segments
import os
import scipy.io.wavfile as wavfile

def write_file(output_file_path, input_file_name, name_attribute, sig, fs):
    """
    Write one audio segment to a wave file.
    Args:
        - output_file_path (str) : path of the folder to save the resulting wave file to.
        - input_file_name (str) : name of the processed wave file.
        - name_attribute (str) : attribute to add to the output file name.
        - sig (array) : signal/audio array.
        - fs (int) : sampling rate.
    """
    # set up the output file name from the input name plus the attribute
    fname = os.path.basename(input_file_name).split(".wav")[0] + name_attribute
    fpath = os.path.join(output_file_path, fname)
    wavfile.write(filename=fpath, rate=fs, data=sig)
    print("Writing data to " + fpath + ".")
main() calling the functions
from deepspeech import Model

video_name = "Videos\MIB_Sample.mp4"
audio_name = video_name + ".wav"

# DeepSpeech Model and Scorer
ds = Model("deepspeech-0.9.3-models.pbmm")
scorer = ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")

def main():
    # Extract audio from input video file
    extractAudio(video_name, audio_name)
    print("Splitting on silent parts in audio file")
    silenceRemoval(audio_name)
    generateSRT(audio_name)
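extractAudio isn't shown above; in my version it's essentially just an ffmpeg call that converts the video's audio track to the 16 kHz, mono, 16-bit PCM .wav that DeepSpeech expects, roughly:

import subprocess as sp

def extractAudio(input_file, audio_file_name):
    # Convert the video's audio track to 16 kHz, mono, 16-bit PCM wav for DeepSpeech
    command = ["ffmpeg", "-hide_banner", "-loglevel", "warning",
               "-i", input_file,
               "-ac", "1", "-ar", "16000", "-acodec", "pcm_s16le",
               audio_file_name]
    sp.call(command)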
generateSRT() function
import subprocess as sp

def generateSRT(audio_file_name):
    command = ["deepspeech", "--model", ds,
               "--scorer", scorer,
               "--audio", audio_file_name]
    try:
        ret = sp.call(command, shell=True)
        print("generating subtitles")
    except Exception as e:
        print("Error: ", str(e))
        exit(1)
I'm currently trying to generate subtitles from the single extracted audio file, but I'm getting this error:
Error: expected str, bytes or os.PathLike object, not Model
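My guess is that the error comes from passing the Model object ds (and the return value of enableExternalScorer) into the subprocess command instead of the actual file paths, so I was planning to try something along these lines instead (untested, and it still only prints a transcript rather than writing an .srt):

import subprocess as sp

MODEL_PATH = "deepspeech-0.9.3-models.pbmm"
SCORER_PATH = "deepspeech-0.9.3-models.scorer"

def generateSRT(audio_file_name):
    # Pass the model/scorer file paths (strings) on the command line,
    # not the in-memory Model object
    command = ["deepspeech", "--model", MODEL_PATH,
               "--scorer", SCORER_PATH,
               "--audio", audio_file_name]
    try:
        sp.call(command, shell=True)
        print("generating subtitles")
    except Exception as e:
        print("Error: ", str(e))
        exit(1)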
I'd appreciate any help on how to loop through the folder containing the segmented audio files, run each one through Mozilla DeepSpeech, and generate an .srt file in another output folder.
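In case it helps clarify what I'm after, this is the rough structure I have in mind for that loop, pieced together from the DeepSpeech Python API docs (completely untested; the folder names, helper names, and SRT formatting are just my guesses):

import os
import wave
import numpy as np
from deepspeech import Model

def seconds_to_srt_time(t):
    # Format seconds as an SRT timestamp, e.g. 2.25 -> "00:00:02,250"
    hours, rem = divmod(int(t), 3600)
    minutes, secs = divmod(rem, 60)
    millis = int(round((t - int(t)) * 1000))
    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, millis)

def generate_srt_from_segments(segment_dir="audio", srt_path="subtitles/MIB_Sample.srt"):
    ds = Model("deepspeech-0.9.3-models.pbmm")
    ds.enableExternalScorer("deepspeech-0.9.3-models.scorer")

    def segment_times(name):
        # File names look like "MIB_Sample.mp4_0.500-2.250.wav" (see silenceRemoval above)
        start, end = name[:-4].rsplit("_", 1)[1].split("-")
        return float(start), float(end)

    # Process the segments in chronological order
    segments = sorted((f for f in os.listdir(segment_dir) if f.endswith(".wav")),
                      key=lambda f: segment_times(f)[0])

    os.makedirs(os.path.dirname(srt_path), exist_ok=True)
    with open(srt_path, "w") as srt:
        for index, name in enumerate(segments, start=1):
            start, end = segment_times(name)
            with wave.open(os.path.join(segment_dir, name), "rb") as w:
                audio = np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16)
            text = ds.stt(audio)  # transcribe this segment with DeepSpeech
            srt.write("{}\n{} --> {}\n{}\n\n".format(
                index, seconds_to_srt_time(start), seconds_to_srt_time(end), text))

Thank you!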