I am creating a transcriber using openAI whisper API in nodejs and react. I want the user to be able to record an audio file in the browser and transcribe their recording. i am doing this by saving the buffer data of the audio blob they have recorded into an mp3 file, then using the createTranscription() api call i input the fs.createReadStream(recorded_audio_file.mp3) which outputs a 400 error. When i record an audio file using the windows recorder and input that file the api call works just fin. Here is my recorder component in react
import React, { useState, useEffect, useRef } from "react";
import Microphone from "./Microphone/Microphone";
const TSST = () => {
const BASE_URL = process.env.REACT_APP_SERVER_URL || "http://localhost:5000";
const mediaRecorder = useRef(null);
const [stream, setStream] = useState(null);
const [audioChunks, setAudioChunks] = useState([]);
const [audio, setAudio] = useState(null);
const [audioFile, setAudioFile] = useState(null);
const [transcribtion, setTranscription] = useState("");
const [audioBlob, setAudioBlob] = useState("");
const [audioBuffer, setAudioBuffer] = useState("");
useEffect(() => {
const initializeMediaRecorder = async () => {
if ("MediaRecorder" in window) {
try {
const streamData = await navigator.mediaDevices.getUserMedia({ audio: true });
setStream(streamData);
} catch (err) {
console.log(err.message);
}
} else {
console.log("The MediaRecorder API is not supported in your browser.");
}
}
initializeMediaRecorder();
}, [])
const handleStartRecording = () => {
const media = new MediaRecorder(stream, { type: "audio/mp3" });
mediaRecorder.current = media;
mediaRecorder.current.start();
let chunks = [];
mediaRecorder.current.ondataavailable = (e) => {
chunks.push(e.data);
};
setAudioChunks(chunks);
}
const handleStopRecording = () => {
mediaRecorder.current.stop();
mediaRecorder.current.onstop = () => {
const audioBlob = new Blob(audioChunks, { type: "audio/mp3" });
const audioUrl = URL.createObjectURL(audioBlob);
setAudioBlob(audioBlob)
setAudio(audioUrl);
setAudioChunks([]);
let file = new File([audioUrl], "recorded_audio.mp3",{type:"audio/mp3", lastModified:new Date().getTime()});
let container = new DataTransfer();
container.items.add(file);
document.getElementById("audioFile").files = container.files;
setAudioFile(container.files[0]);
console.log(file);
};
}
const handleSubmitRecording = async () => {
try {
// Assuming you have an audio blob called 'audioBlob'
// Convert the audio blob to a base64 string
const reader = new FileReader();
reader.onloadend = async () => {
const base64String = reader.result.split(',')[1]; // Extract base64 data from the result
const res = await fetch(`${BASE_URL}/api/openai/transcriber`, {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ audioBuffer: base64String, lang: "en" })
})
const data = await res.json();
setTranscription(data);
};
reader.readAsDataURL(audioBlob);
} catch (error) {
console.log(error);
} finally {
}
}
return (
<div className="h-[calc(100vh-73px)] flex justify-center items-center">
<div className="w-[40%] flex justify-between items-center">
<div className="flex flex-col">
<Microphone startFunction={ handleStartRecording } stopFunction={ handleStopRecording } />
<button onClick={handleStartRecording} className="w-fit my-10 p-5 bg-gray-200 rounded-lg">Start Recording</button>
<button onClick={handleStopRecording} className="w-fit mb-10 p-5 bg-gray-200 rounded-lg">Stop Recording</button>
<audio className="mb-10" src={audio && audio} controls></audio>
<input id="audioFile" type="file" onChange={ (e) => {setAudioFile(e.target.files[0])}}/>
</div>
<div>
<button className="p-10 bg-yellow-500 rounded-xl" onClick={ handleSubmitRecording } >Submit</button>
</div>
</div>
<div className="w-[40%] flex justify-center items-center">
<textarea value={transcribtion} readOnly className="w-[60%] aspect-square resize-none shadow-lg shadow-black"></textarea>
</div>
</div>
);
};
export default TSST;
here is the api:
export const transcribe = async (req, res) => {
// const lang = JSON.parse(req.body.json).lang;
// const audioBuffer = req.file;
const { audioBuffer, lang} = req.body;
const audioBufferBase64 = Buffer.from(audioBuffer, 'base64');
const fileName = "test.mp3";
const folderName = `./audio/${fileName}`
const writableStream = fs.createWriteStream(folderName); // Replace with your desired file path and extension
writableStream.write(audioBufferBase64);
const readStream = fs.createReadStream(folderName);
readStream.on('data', (data) => {
console.log('Read stream data:', data);
});
try {
const whisperRes = await openai.createTranscription(
readStream,
"whisper-1",
)
const chatResponse = whisperRes.data.text;
console.log(chatResponse)
res.status(200).json({ chatResponse: chatResponse });
} catch (error) {
//console.log(error);
res.status(500).json({ message: error });
}
}
and here is the server call:
import express from "express";
import cors from "cors";
import * as dotenv from "dotenv";
import mongoose from "mongoose";
import multer from "multer";
import { dalle, chatGPT, summarize, translate, transcribe } from "./api/openai.js";
import { getImages, postImage } from "./api/imageShowcase.js";
import { login, signup } from "./api/user.js";
dotenv.config();
const app = express();
const upload = multer();
const storage = multer.memoryStorage();
const uploadMiddleware = multer({ storage: storage });
app.use(cors());
app.use(express.json({limit: '50mb'}));
const atlasURL = process.env.MONGODB_URL;
const PORT = process.env.PORT || 5000;
mongoose.connect(atlasURL)
.then(() => app.listen(PORT, () => console.log(`Successfully connected to port ${PORT}`)))
.catch(error => console.log("There was an error: ", error));
app.get("/", async (req, res) => {
res.send("Server is RUNNING");
})
app.post("/api/openai/transcriber",(req, res) => transcribe(req, res));
the saved mp3 file is working just fine. the apikey is correct. when i record my own mp3 using windows recorder and use the createReadStream of that it works just fine. the saved file data is a buffer of the form
i tried changing the way i save a file, using different formatting methods for the buffer, binary hex, base64. tried uploading the buffer directly to whisper api. tried using axios to post to the api url directly. tried making a promise out of the saving of the mp3 file and then createReadStream and a lot of other little changes. tried to make a readable out of the buffer directly. I view all the similar questions with answers with no avail.