I am currently working on Softbanks' robot Pepper and I try to use Watson speech-to-text solution on Pepper's audio buffers remote streaming by using websocket protocol.
I used the answer to that former question NAO robot remote audio problems to find a way to access remotly pepper's audio buffers and that project https://github.com/ibm-dev/watson-streaming-stt to learn how to use websocket protocole to use watson streaming stt.
However, after I open my websocket application, I start sending buffers to watson and after a few sendings, I receive error: 'Unable to transcode from audio/l16;rate=48000;channel=1 to one of: audio/x-float-array; rate=16000; channels=1'
Each time I'm trying to send Pepper's audio buffer to watson, it is unable to understand it.
I compared data I send with data sent in watson streaming stt example (using pyaudio streaming from microphone instead of Pepper's buffer streaming) and I don't see any difference. Both time I'm pretty sure that I am sending a string containing raw chunks of bytes. Which is what Watson asks for in it documentation.
I try to send chunks of 8192 bytes with a sample rate of 48kHz and I can easily convert Pepper's audio buffer in hexa so I don't understand why Watson can't transcode it.
Here is my code:
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import argparse
import base64
import configparser
import json
import threading
import time
from optparse import OptionParser
import naoqi
import numpy as np
import sys
from threading import Thread
import ssl
import websocket
from websocket._abnf import ABNF
CHANNELS = 1
NAO_IP = "172.20.10.12"
class SoundReceiverModule(naoqi.ALModule):
"""
Use this object to get call back from the ALMemory of the naoqi world.
Your callback needs to be a method with two parameter (variable name, value).
"""
def __init__( self, strModuleName, strNaoIp):
try:
naoqi.ALModule.__init__(self, strModuleName );
self.BIND_PYTHON( self.getName(),"callback" );
self.strNaoIp = strNaoIp;
self.outfile = None;
self.aOutfile = [None]*(4-1); # ASSUME max nbr channels = 4
self.FINALS = []
self.RECORD_SECONDS = 20
self.ws_open = False
self.ws_listening = ""
# init data for websocket interfaces
self.headers = {}
self.userpass = "" #userpass and password
self.headers["Authorization"] = "Basic " + base64.b64encode(
self.userpass.encode()).decode()
self.url = ("wss://stream.watsonplatform.net//speech-to-text/api/v1/recognize"
"?model=fr-FR_BroadbandModel")
except BaseException, err:
print( "ERR: abcdk.naoqitools.SoundReceiverModule: loading error: %s" % str(err) );
# __init__ - end
def __del__( self ):
print( "INF: abcdk.SoundReceiverModule.__del__: cleaning everything" );
self.stop();
def start( self ):
audio = naoqi.ALProxy( "ALAudioDevice", self.strNaoIp, 9559 );
self.nNbrChannelFlag = 3; # ALL_Channels: 0, AL::LEFTCHANNEL: 1, AL::RIGHTCHANNEL: 2; AL::FRONTCHANNEL: 3 or AL::REARCHANNEL: 4.
self.nDeinterleave = 0;
self.nSampleRate = 48000;
audio.setClientPreferences( self.getName(), self.nSampleRate, self.nNbrChannelFlag, self.nDeinterleave ); # setting same as default generate a bug !?!
audio.subscribe( self.getName() );
#openning websocket app
self._ws = websocket.WebSocketApp(self.url,
header=self.headers,
on_open = self.on_open,
on_message=self.on_message,
on_error=self.on_error,
on_close=self.on_close)
sslopt={"cert_reqs": ssl.CERT_NONE}
threading.Thread(target=self._ws.run_forever, kwargs = {'sslopt':sslopt}).start()
print( "INF: SoundReceiver: started!" );
def stop( self ):
print( "INF: SoundReceiver: stopping..." );
audio = naoqi.ALProxy( "ALAudioDevice", self.strNaoIp, 9559 );
audio.unsubscribe( self.getName() );
print( "INF: SoundReceiver: stopped!" );
print "INF: WebSocket: closing..."
data = {"action": "stop"}
self._ws.send(json.dumps(data).encode('utf8'))
# ... which we need to wait for before we shutdown the websocket
time.sleep(1)
self._ws.close()
print "INF: WebSocket: closed"
if( self.outfile != None ):
self.outfile.close();
def processRemote( self, nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, buffer ):
"""
This is THE method that receives all the sound buffers from the "ALAudioDevice" module"""
print "receiving buffer"
# self.data_to_send = self.data_to_send + buffer
# print len(self.data_to_send)
#self.data_to_send = ''.join( [ "%02X " % ord( x ) for x in buffer ] ).strip()
self.data_to_send = buffer
#print("buffer type :", type(data))
#print("buffer :", buffer)
#~ print( "process!" );
print( "processRemote: %s, %s, %s, lendata: %s, data0: %s (0x%x), data1: %s (0x%x)" % (nbOfChannels, nbrOfSamplesByChannel, aTimeStamp, len(buffer), buffer[0],ord(buffer[0]),buffer[1],ord(buffer[1])) );
if self.ws_open == True and self.ws_listening == True:
print "sending data"
self._ws.send(self.data_to_send, ABNF.OPCODE_BINARY)
print "data sent"
#print self.data_to_send
aSoundDataInterlaced = np.fromstring( str(buffer), dtype=np.int16 );
#
aSoundData = np.reshape( aSoundDataInterlaced, (nbOfChannels, nbrOfSamplesByChannel), 'F' );
# print "processRemote over"
# processRemote - end
def on_message(self, ws, msg):
print("message")
data = json.loads(msg)
print data
if "state" in data:
if data["state"] == "listening":
self.ws_listening = True
if "results" in data:
if data["results"][0]["final"]:
self.FINALS.append(data)
# This prints out the current fragment that we are working on
print(data['results'][0]['alternatives'][0]['transcript'])
def on_error(self, ws, error):
"""Print any errors."""
print(error)
def on_close(self, ws):
"""Upon close, print the complete and final transcript."""
transcript = "".join([x['results'][0]['alternatives'][0]['transcript']
for x in self.FINALS])
print("transcript :", transcript)
self.ws_open = False
def on_open(self, ws):
"""Triggered as soon a we have an active connection."""
# args = self._ws.args
print "INF: WebSocket: opening"
data = {
"action": "start",
# this means we get to send it straight raw sampling
"content-type": "audio/l16;rate=%d;channel=1" % self.nSampleRate,
"continuous": True,
"interim_results": True,
# "inactivity_timeout": 5, # in order to use this effectively
# you need other tests to handle what happens if the socket is
# closed by the server.
"word_confidence": True,
"timestamps": True,
"max_alternatives": 3
}
# Send the initial control message which sets expectations for the
# binary stream that follows:
self._ws.send(json.dumps(data).encode('utf8'))
# Spin off a dedicated thread where we are going to read and
# stream out audio.
print "INF: WebSocket: opened"
self.ws_open = True
def version( self ):
return "0.6";
def main():
"""initialisation
"""
parser = OptionParser()
parser.add_option("--pip",
help="Parent broker port. The IP address or your robot",
dest="pip")
parser.add_option("--pport",
help="Parent broker port. The port NAOqi is listening to",
dest="pport",
type="int")
parser.set_defaults(
pip=NAO_IP,
pport=9559)
(opts, args_) = parser.parse_args()
pip = opts.pip
pport = opts.pport
# We need this broker to be able to construct
# NAOqi modules and subscribe to other modules
# The broker must stay alive until the program exists
myBroker = naoqi.ALBroker("myBroker",
"0.0.0.0", # listen to anyone
0, # find a free port and use it
pip, # parent broker IP
pport) # parent broker port
"""fin initialisation
"""
global SoundReceiver
SoundReceiver = SoundReceiverModule("SoundReceiver", pip) #thread1
SoundReceiver.start()
try:
while True:
time.sleep(1)
print "hello"
except KeyboardInterrupt:
print "Interrupted by user, shutting down"
myBroker.shutdown()
SoundReceiver.stop()
sys.exit(0)
if __name__ == "__main__":
main()
I would be thankful if anyone had any idea on how to bypass that error or on what to try to get useful info. I first believed that I was sending "wrong" data to watson however after lots of attempts I have no clue on how to fix that problem.
Thank you a lot,
Alex