How to use Wave file as input in VOSK speech recognition?

Question

I have a project that needs to take a recorded audio file, extract the text from it, and then compare the extracted text with a reference text to verify it. My problem is that I can't use a recorded file as input: the code doesn't read the file.

The `init` function is the foundation of the code: it parses the command-line arguments and loads the model.

The `verify` function checks whether the recognized speech matches the expected text.

import argparse
import json
import os
import queue
import random
import sys
from difflib import SequenceMatcher
import numpy as np
import sounddevice as sd
import vosk

# Queue shared between the audio callback (which puts raw audio blocks on it)
# and verify() (which takes them off and feeds them to the recognizer).
q = queue.Queue()

def int_or_str(text):
    """Return *text* converted to ``int`` when possible, else *text* unchanged.

    Used as an argparse ``type=`` callable so that an option can be given
    either as a number or as an arbitrary string.
    """
    try:
        parsed = int(text)
    except ValueError:
        # Not a valid integer literal: hand the original value back as-is.
        parsed = text
    return parsed


def callback(indata, frames, time, status):
    """Receive one audio block from the input stream and queue it.

    Invoked from a separate thread for every audio block. Any status
    reported for the block is written to stderr; the block itself is copied
    to ``bytes`` and placed on the module-level queue ``q``.
    """
    if status:
        print(status, file=sys.stderr)
    block = bytes(indata)
    q.put(block)



def init():
    """Parse the command line and load the VOSK model.

    Command-line options:
        -l / --list-devices  print the available audio devices and exit
        -f / --filename      accepted for CLI compatibility (see note below)
        -m / --model         path to the model directory (default: "model")
        -d / --device        input device, numeric ID or substring
        -r / --samplerate    sampling rate; defaults to the input device's
                             default sample rate

    Returns:
        A ``(model, args)`` tuple: the loaded ``vosk.Model`` and the parsed
        argparse namespace, with ``args.model`` and ``args.samplerate``
        filled in.

    Exits the process (via ``parser.exit``) when devices are listed, when the
    model directory does not exist, on Ctrl-C, or on any other error during
    setup.
    """
    # First pass: only handle --list-devices so it works without other options.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument(
        '-l', '--list-devices', action='store_true',
        help='show list of audio devices and exit')
    args, remaining = parser.parse_known_args()
    if args.list_devices:
        print(sd.query_devices())
        parser.exit(0)

    # Second pass: the full option set, inheriting --list-devices for --help.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        parents=[parser])
    parser.add_argument(
        '-f', '--filename', type=str, metavar='FILENAME',
        help='audio file to store recording to')
    parser.add_argument(
        '-m', '--model', type=str, metavar='MODEL_PATH',
        help='Path to the model')
    parser.add_argument(
        '-d', '--device', type=int_or_str,
        help='input device (numeric ID or substring)')
    parser.add_argument(
        '-r', '--samplerate', type=int, help='sampling rate')
    args = parser.parse_args(remaining)

    try:
        if args.model is None:
            args.model = "model"
        if not os.path.exists(args.model):
            print("Please download a model for your language from https://alphacephei.com/vosk/models")
            print("and unpack as 'model' in the current folder.")
            parser.exit(0)
        if args.samplerate is None:
            device_info = sd.query_devices(args.device, 'input')
            # soundfile expects an int, sounddevice provides a float:
            args.samplerate = int(device_info['default_samplerate'])

        model = vosk.Model(args.model)

        # --filename is deliberately NOT opened here. Nothing in this function
        # writes to, returns, or closes such a handle, and opening the path
        # with "wb" would truncate an existing file (e.g. a recording the
        # user meant to keep) while leaking the file descriptor.
    except KeyboardInterrupt:
        print('\nDone')
        parser.exit(0)
    except Exception as e:
        parser.exit(type(e).__name__ + ': ' + str(e))

    return model, args
def verify(random_sentence, model, args):
    """Listen on the input device and check one utterance against a sentence.

    Opens a raw 16-bit mono input stream, feeds queued audio blocks to a
    ``vosk.KaldiRecognizer`` until it reports a complete utterance, and then
    compares the recognized text with ``random_sentence``.

    Args:
        random_sentence: the expected text, printed as a prompt and used as
            the comparison target.
        model: a loaded ``vosk.Model``.
        args: parsed options; ``args.samplerate`` and ``args.device`` are read.

    The utterance counts as a true case when the ``SequenceMatcher`` ratio
    between the expected and recognized text is above 0.65, otherwise as a
    false case. The tallies are printed; nothing is returned.
    """
    # Each counter is updated on its own: an augmented assignment to a tuple
    # target (``a, b += 1``) is a SyntaxError in Python.
    T_num, F_num, num_word = 0, 0, 1
    with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, dtype='int16',
                           channels=1, callback=callback):
        rec = vosk.KaldiRecognizer(model, args.samplerate)
        print("{}) ".format(num_word), random_sentence, end='\n')
        print('=' * 30, end='\n')
        while True:
            data = q.get()
            if not rec.AcceptWaveform(data):
                # Utterance not finished yet; keep feeding audio.
                continue
            res = json.loads(rec.FinalResult())
            # Normalize Arabic yeh to Persian yeh before comparing.
            recognized = res['text'].replace('ي', 'ی')
            if SequenceMatcher(None, random_sentence, recognized).ratio() > 0.65:
                T_num += 1
            else:
                F_num += 1
            num_word += 1
            # Only a single utterance is evaluated per call.
            break

    print('=' * 30)
    print('True Cases : {}\n False Cases : {}'.format(T_num, F_num))


if __name__ == "__main__":
    # Parse the CLI options and load the model, then run one verification.
    model, args = init()
    # NOTE(review): `random_sentences` is not defined anywhere in the visible
    # code, so this call raises NameError as written. It presumably should be
    # the expected sentence to compare against — define it before this call.
    verify(random_sentences, model, args)

score 0 · Answer 1 · answered Sep 07 '21 at 07:33

I have been working on a similar project. I modified the code from VOSK Git repo and wrote the following function that takes file name / path as the input and outputs the captured text. Sometimes, when there is a long pause (~seconds) in the audio file, the returned text would be an empty string. To remedy this problem, I had to write additional code that picks out the longest string that was captured. I could make do with this fix.

def get_text_from_voice(filename):
    """Transcribe a WAV file with VOSK and return the recognized text.

    Args:
        filename: path to a 16-bit mono PCM WAV file.

    Returns:
        The non-empty utterance texts joined with single spaces, including
        the text flushed by ``FinalResult()`` once the file is exhausted.
        If no utterance text was recognized, the longest partial result seen
        while decoding is returned; if there is none, the empty string.

    Exits with status 1 when the ``model`` directory is missing from the
    current folder or the file is not 16-bit mono uncompressed PCM.
    """
    if not os.path.exists("model"):
        print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
        raise SystemExit(1)

    utterances = []
    partials = []
    # The context manager guarantees the wave file is closed on every path,
    # including the early exit below.
    with wave.open(filename, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print ("Audio file must be WAV format mono PCM.")
            raise SystemExit(1)

        model = Model("model")
        rec = KaldiRecognizer(model, wf.getframerate())
        rec.SetWords(True)

        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                # Read the result once and reuse it for both printing and
                # storage instead of querying the recognizer a second time.
                result = rec.Result()
                print(result)
                utterances.append(json.loads(result).get("text", ""))
            else:
                partial = rec.PartialResult()
                print(partial)
                partials.append(json.loads(partial).get("partial", ""))

        # Flush the recognizer so audio received after the last detected
        # endpoint is not dropped.
        final = rec.FinalResult()
        print(final)
        utterances.append(json.loads(final).get("text", ""))

    # A long pause can yield an empty utterance; skip those rather than
    # letting an empty first segment hide everything recognized after it.
    txt_str = " ".join(text for text in utterances if text)
    if not txt_str:
        # Fall back to the longest partial hypothesis, if any.
        txt_str = max(partials, key=len, default='')

    return txt_str

Make sure that the correct model is present in the same directory as `model`, or change the code to point at the model's path. Also note that this function only accepts audio files in WAV format containing 16-bit mono PCM data.

How to use Wave file as input in VOSK speech recognition?

1 Answers1