I am using the standard solution for speech-to-text processing with timestamps (see code below). I know from this post that it is possible to pass arguments to the gcloud command-line tool, such as --format=json.
General question: How do I specify those options in google.cloud.speech? I can't seem to find any documentation on Google's site on how to do this with Python.
Specific question: My aim right now is to write out a dictionary-style JSON file that contains an entry for every word, plus its start and end time. I realise that I could write a hacky solution, but if an option already exists, that would be preferable.
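To make the intent concrete, the file I am after would look roughly like this (the field names are just my own sketch, nothing the library prescribes):

import json

# Hypothetical target layout: one entry per recognised word, timings in seconds.
example = {
    "words": [
        {"word": "hello", "start_time": 0.0, "end_time": 0.42},
        {"word": "world", "start_time": 0.42, "end_time": 0.91},
    ]
}
print(json.dumps(example, indent=2))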
Code:
import argparse
import io


def transcribe_file_with_word_time_offsets(speech_file, language):
    """Transcribe the given audio file synchronously and output the word time
    offsets."""
    print("Start")
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    print("checking credentials")
    # `credentials` is assumed to be defined elsewhere (e.g. a service-account
    # credentials object); omit the argument to use application default credentials.
    client = speech.SpeechClient(credentials=credentials)
    print("Checked")

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()
    print("audio file read")

    audio = types.RecognitionAudio(content=content)

    print("config start")
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language,
        enable_word_time_offsets=True)

    print("Recognizing:")
    response = client.recognize(config, audio)
    print("Recognized")

    for result in response.results:
        alternative = result.alternatives[0]
        print('Transcript: {}'.format(alternative.transcript))

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            print('Word: {}, start_time: {}, end_time: {}'.format(
                word,
                start_time.seconds + start_time.nanos * 1e-9,
                end_time.seconds + end_time.nanos * 1e-9))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('path', help='Audio file to be recognized')
    args = parser.parse_args()

    transcribe_file_with_word_time_offsets(args.path, 'en-US')
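One direction I have been looking at, assuming the response object is a plain protobuf message in this version of the client library (which I have not fully verified), is serialising it directly via google.protobuf.json_format:

from google.protobuf.json_format import MessageToJson

# Sketch, not verified: if `response` is a protobuf message, this should produce
# a JSON string of the full result, including the word time offsets.
with open('transcript.json', 'w') as f:
    f.write(MessageToJson(response))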
And here is the hacky solution:
...
transcript_dict = {'Word': [], 'start_time': [], 'end_time': []}

for result in response.results:
    alternative = result.alternatives[0]
    print('Transcript: {}'.format(alternative.transcript))

    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time

        transcript_dict['Word'].append(word)
        transcript_dict['start_time'].append(
            start_time.seconds + start_time.nanos * 1e-9)
        transcript_dict['end_time'].append(
            end_time.seconds + end_time.nanos * 1e-9)

print(transcript_dict)
...
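If no built-in option turns up, the dictionary above could simply be written out with the standard json module, something like:

import json

# Write the collected words and their timings (in seconds) to a JSON file.
with open('word_timings.json', 'w') as f:
    json.dump(transcript_dict, f, indent=2)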