
I am trying to perform some real-time speech-to-text transcription with the NAO and Pepper robots using IBM Watson's speech-to-text (STT) service.

I have tried to follow the example provided in IBM's GitHub repository for the Python SDK (found here: https://github.com/watson-developer-cloud/python-sdk/blob/master/examples/microphone-speech-to-text.py), but I am running into some issues: the websocket does not properly accept the buffer of audio data that I send to it.

According to the IBM websocket documentation (found here: https://cloud.ibm.com/docs/services/speech-to-text?topic=speech-to-text-audio-formats#audio-formats), PCM data must be in 16-bit format.
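
As a sanity check on the format itself, here is a minimal sketch (my own, not from the SDK) of the byte/int16 round trip I believe audio/l16 implies:

import numpy as np

# Fake front-channel samples, as 16-bit signed integers.
samples = np.array([0, 1000, -1000, 32767], dtype=np.int16)

# What I understand audio/l16 to mean on the wire: two raw bytes per
# sample, in the machine's (little-endian) byte order.
raw_bytes = samples.tostring()   # .tobytes() on newer numpy
assert len(raw_bytes) == 2 * len(samples)

# The inverse is what I do with the NAOqi buffer further down.
recovered = np.fromstring(raw_bytes, dtype=np.int16)
assert (recovered == samples).all()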

When I inspected the code used to retrieve the robot's buffer (found here: https://stackoverflow.com/questions/24243757/nao-robot-remote-audio-problems), I saw that the buffer returns data as a string representation of byte data. This data is then transformed into 16-bit integer data using numpy. However, the IBM websocket is not handling this data properly when I pass it in through an AudioSource backed by a Queue.
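
My suspicion, which I have not been able to confirm, is that the websocket's AudioSource wants raw bytes rather than numpy arrays, so the queued chunk might need converting back first, roughly like this (a hypothetical variant of the queueing step in my processRemote below, not tested):

# Convert the front channel back to a raw byte string before queueing
# it for the websocket's AudioSource.
chunk = aSoundData[0].tostring()
self.queue.put(chunk)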

The code I use to retrieve the buffer and send it to the websocket is below:

# -*- coding: utf-8 -*-
####################################################################################
# Retrieve robot audio buffer from NAO/Pepper
#
# Audio data from the buffer is then processed and converted
# into a .wav file
#
# The .wav file is then read by IBM Watson's speech-to-text
# (STT) service
#
# The resulting transcription of the .wav file is then saved
#
# SoundReceiverModule inspired by the work of Alexandre Mazel
# from https://stackoverflow.com/questions/24243757/nao-robot-remote-audio-problems
####################################################################################
from __future__ import print_function
from naoqi import ALModule, ALBroker, ALProxy
import numpy as np
import time
import sys
import os
import wave
from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import RecognizeCallback, AudioSource
import json
from threading import Thread
from Queue import Queue

#--------------------------------------------------------------------------------------------
# Module for Watson Speech To Text Real Time Streaming
#--------------------------------------------------------------------------------------------
class MyRecognizeCallback(RecognizeCallback):
    def __init__(self):
        RecognizeCallback.__init__(self)
        self.transcript = ''

    def on_connected(self):
        print('Connected to Watson Speech to Text')

    def on_listening(self):
        print('Listening for audio...')

    # def on_data(self, data):
    #     results = data['results'][0]['alternatives'][0]['transcript']
    #     print('User: ', end='')
    #     print(results)

    # def on_hypothesis(self, hypothesis):
    #     print('Hypothesis: ', end='')
    #     print(hypothesis)

    def on_transcription(self, transcript):
        self.transcript = transcript[0]['transcript'].encode('ascii', 'ignore')
        print('User transcript: ', end='')
        print(self.transcript)

    def get_transcript(self):
        return self.transcript

    def on_error(self, error):
        print('Error received: {}'.format(error))

    def on_inactivity_timeout(self, error):
        print('Inactivity timeout: {}'.format(error))


#--------------------------------------------------------------------------------------------
# Module for remote processing of audio data from NAO
#--------------------------------------------------------------------------------------------
class SoundReceiverModule(ALModule):
    """
    Use this object to get call back from the ALMemory of the naoqi world.
    Your callback needs to be a method with two parameter (variable name, value).
    """

    def __init__(self, strModuleName, myRecognizeCallback, speech_to_text):
        try:
            ALModule.__init__(self, strModuleName)
            self.BIND_PYTHON(self.getName(), "callback")
            self.myRecognizeCallback = myRecognizeCallback
            self.speech_to_text = speech_to_text
            self.outfile = None
            self.wavfileName = None
            self.transcript = ''
            self.queue = Queue()
            self.audioSource = AudioSource(self.queue, True, True)

        except BaseException as err:
            print("ERR: SoundReceiverModule: loading error: %s" % str(err))

    def get_transcript(self):
        return self.transcript

    def listen(self):
        audio = ALProxy('ALAudioDevice')
        nNbrChannelFlag = 3  # ALL_Channels: 0, AL::LEFTCHANNEL: 1, AL::RIGHTCHANNEL: 2, AL::FRONTCHANNEL: 3, AL::REARCHANNEL: 4
        nDeinterleave = 0
        nSampleRate = 16000
        audio.setClientPreferences(self.getName(), nSampleRate, nNbrChannelFlag, nDeinterleave)  # setting the same values as the defaults generates a bug!?

        strFilenameOut = os.path.join(os.getcwd(), 'out.raw')  # '\out.raw' only resolved correctly on Windows
        self.outfile = open(strFilenameOut, 'wb')  # 'wb' truncates any existing file

        # start remote processing
        audio.subscribe(self.getName())

        # run the blocking websocket call in a background thread, as in
        # IBM's microphone example, so that listen() returns immediately
        recognize_thread = Thread(target=self.speech_to_text.recognize_using_websocket,
            kwargs={
                'audio': self.audioSource,
                'content_type': 'audio/l16;rate=16000',
                'recognize_callback': self.myRecognizeCallback,
                'interim_results': True,
                'max_alternatives': 3})
        recognize_thread.start()

        print( "INF: SoundReceiver: started!" )
        print("INF: Writing sound to '%s'" % strFilenameOut)

    def stop(self):
        print("INF: SoundReceiver: stopping...")
        audio = ALProxy("ALAudioDevice")
        audio.unsubscribe(self.getName())        
        print("INF: SoundReceiver: stopped!")
        print('')

        if self.outfile is not None:
            self.outfile.close()

            self.wavfileName = self.rawToWav(self.outfile)

            # self.transcript = self.process_raw_audio_data(self.wavfileName)

        else:
            print("outfile not saved properly")


    def processRemote(self, nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, buffer):
        """
        This is THE method that receives all the sound buffers from the "ALAudioDevice" module
        """
        # self.queue.put(buffer)

        # the buffer arrives as a string of raw bytes; interpret it as 16-bit samples
        aSoundDataInterlaced = np.fromstring(buffer, dtype=np.int16)

        # deinterlace into one row per channel (column-major order)
        aSoundData = np.reshape(aSoundDataInterlaced, (nbOfChannels, nbrOfSamplesByChannel), 'F')

        # queue the front-channel samples for the websocket's AudioSource
        self.queue.put(aSoundData[0])

        # also append the samples to the raw file for the .wav conversion
        aSoundData[0].tofile(self.outfile)


    # convert raw file to wav file
    def rawToWav(self, raw):

        if not os.path.isfile(raw.name):
            print("file not in path...")
            return

        print("Converting .raw file to .wav file...")
        wav = wave.open(raw.name.replace(".raw", ".wav"), "wb")
        wav.setframerate(16000)
        wav.setnchannels(1)
        wav.setsampwidth(2)


        f = open(raw.name, 'rb')
        sample = f.read(4096)

        while sample != "":
            wav.writeframes(sample)
            sample = f.read(4096)

        path = raw.name.replace(".raw", ".wav")

        f.close()

        return path

    def version( self ):
        return "0.6"

#---------------------------------------------------------------------------------------------------------------------
#                                           Main function for testing purposes
#---------------------------------------------------------------------------------------------------------------------
def main():
    """ Main entry point

    """
    NAO_IP = "192.168.20.151" # Nao IP address
    pip   = NAO_IP
    pport = 9559

    # We need this broker to be able to construct
    # NAOqi modules and subscribe to other modules.
    # The broker must stay alive until the program exits.
    myBroker = ALBroker("myBroker",
       "0.0.0.0",   # listen to anyone
       0,           # find a free port and use it
       pip,         # parent broker IP
       pport)       # parent broker port

    # initialize Watson Speech to Text
    speech_to_text = SpeechToTextV1(
        iam_apikey='xyz',
        url='https://stream.watsonplatform.net/speech-to-text/api')

    myRecognizeCallback = MyRecognizeCallback()

    # initialize SoundReceiver; it must be a module-level global so that
    # NAOqi can find it by name when dispatching processRemote callbacks
    global SoundReceiver
    SoundReceiver = SoundReceiverModule("SoundReceiver", myRecognizeCallback, speech_to_text)

    # Start SoundReceiver and Watson Speech to Text
    leds = ALProxy('ALLeds')
    leds.setIntensity('EarLeds', 1)
    SoundReceiver.listen()
    time.sleep(4)
    SoundReceiver.stop()
    leds.setIntensity('EarLeds', 0.5)


if __name__ == "__main__":
    main()
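
For what it is worth, this is the kind of standalone sanity check I plan to try next, to separate the Watson plumbing from the robot side (the apikey and file name are placeholders):

from ibm_watson import SpeechToTextV1
from ibm_watson.websocket import AudioSource

stt = SpeechToTextV1(
    iam_apikey='xyz',  # placeholder
    url='https://stream.watsonplatform.net/speech-to-text/api')

# Stream a known-good recording; if this transcribes correctly, the
# problem is in how I queue the NAOqi buffer, not in the Watson setup.
with open('out.wav', 'rb') as f:
    stt.recognize_using_websocket(
        audio=AudioSource(f),
        content_type='audio/wav',
        recognize_callback=MyRecognizeCallback(),
        interim_results=True)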

I appreciate any help that someone could provide. Thanks

  • Hello welcome to stackoverflow, please be very careful when pasting code that contains credentials! Please change your iam apikey of the stt service since it was visible here and might be abused. – TVK Aug 02 '19 at 08:17
  • Thanks for the heads up, it had slipped my mind. I am going to request a new one ASAP. – Ronald Moore Aug 03 '19 at 21:26
  • Is there any error message? What does IBM endpoint return? – Victor Paléologue Jun 26 '20 at 07:13

0 Answers