0

I have written the following code, which works OK to convert text to speech and play it through the speaker. I am using Microsoft Cognitive Services.

<!doctype html>
<html lang="en">
<head>
  <!-- The encoding declaration comes first so it is seen before any text is parsed. -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Microsoft Cognitive Services Speech SDK JavaScript Sample for Speech Synthesis</title>
  <style>
    /* Base typography for the whole page. */
    body {
      font-family: 'Segoe UI', -apple-system, BlinkMacSystemFont, 'Roboto', 'Helvetica Neue', sans-serif;
      font-size: 14px;
    }

    /* Thin, collapsed borders for the layout table. */
    table, th, td {
      border: 1px solid #f1f1f1;
      border-collapse: collapse;
    }

    th, td {
      padding: 10px;
    }

    textarea {
      font-family: Arial,sans-serif;
    }

    .mode {
      font-size: 18px;
    }

    /* Marks the word that is currently being spoken. */
    .highlight{
      background-color: yellow;
    }

    /* Emphasize only inputs that are still editable.
       The pseudo-class needs its leading colon: ":not(disabled)" would negate an
       element type named "disabled" and therefore match every input. */
    input:not(:disabled) {
      font-weight: bold;
      color: black;
    }

    button {
      padding: 4px 8px;
      background: #0078d4;
      color: #ffffff;
    }

    /* Greyed-out look for disabled buttons; padding is inherited from the rule above. */
    button:disabled {
      background: #ccc;
      color: #666;
    }

    /* Radio buttons sit on top of their label so the label can extend underneath them. */
    input[type=radio] {
      position: relative;
      z-index: 1;
    }

    input[type=radio] + label {
      padding: 8px 4px 8px 30px;
      margin-left: -30px;
    }

    input[type=radio]:checked + label {
      background: #0078d4;
      color: #ffffff;
    }
  </style>
</head>
<body>
  <!-- Visible by default; hidden by script once window.SpeechSDK is available. -->
  <div id="warning">
    <h1 style="font-weight:500;">Speech SDK not found
      (microsoft.cognitiveservices.speech.sdk.bundle.js missing).</h1>
  </div>
  
  <!-- Settings form and output panes. Hidden by default; the page script reveals it
       (by id) once the Speech SDK has loaded. All element ids are referenced from script. -->
  <div id="content" style="display:none">
    <table>
      <tr>
        <td></td>
        <td><h1 style="font-weight:500;">Microsoft Cognitive Services</h1></td>
      </tr>
      <tr>
        <td style="text-align:right">
          <label for="subscriptionKey">
            <a href="https://learn.microsoft.com/azure/cognitive-services/speech-service/get-started"
               rel="noreferrer noopener"
               target="_blank">Subscription Key</a>
          </label>
        </td>
        <td><input id="subscriptionKey" type="text" size="40" placeholder="YourSubscriptionKey"></td>
      </tr>
      <tr>
        <td style="text-align:right"><label for="regionOptions">Region</label></td>
        <td>
          <!-- see https://aka.ms/csspeech/region for more details -->
          <select id="regionOptions">
            <option value="uksouth">UK South</option>
          </select>
        </td>
      </tr>
      <tr>
        <td style="text-align:right"><label for="voiceOptions">Voice</label></td>
        <td>
          <button type="button" id="updateVoiceListButton">Update Voice List</button>
          <select id="voiceOptions" disabled>
            <option>Please update voice list first.</option>
          </select>
        </td>
      </tr>
      <tr>
        <td style="text-align:right"><label for="isSSML">Is SSML</label></td>
        <td>
          <input type="checkbox" id="isSSML" name="isSSML" value="ssml">
        </td>
      </tr>
      <tr>
        <td style="text-align:right"><label for="synthesisText">Text</label></td>
        <td>
          <textarea id="synthesisText" style="display: inline-block;width:500px;height:100px"
                 placeholder="Input text or ssml for synthesis."></textarea>
        </td>
      </tr>
      <tr>
        <td></td>
        <td>
          <button type="button" id="startSynthesisAsyncButton">Start synthesis</button>
          <button type="button" id="pauseButton">Pause</button>
          <button type="button" id="resumeButton">Resume</button>
        </td>
      </tr>
      <tr>
        <td style="text-align:right;vertical-align:top"><label for="resultsDiv">Results</label></td>
        <td><textarea id="resultsDiv" readonly style="display: inline-block;width:500px;height:50px"></textarea></td>
      </tr>
      <tr>
        <td style="text-align:right;vertical-align:top"><label for="eventsDiv">Events</label></td>
        <td><textarea id="eventsDiv" readonly style="display: inline-block;width:500px;height:200px"></textarea></td>
      </tr>
      <tr>
        <!-- Plain caption: a <label> can only be associated with form controls, not with a <div>. -->
        <td style="text-align:right;vertical-align:top">Highlight</td>
        <td><div id="highlightDiv" style="display: inline-block;width:800px;"></div></td>
      </tr>
    </table>
  </div>

  <!-- Speech SDK reference sdk. -->
  <script src="microsoft.cognitiveservices.speech.sdk.bundle.js"></script>

  <!-- Speech SDK Authorization token -->
  <script>
  // Note: Replace the URL with a valid endpoint to retrieve
  //       authorization tokens for your subscription.
  var authorizationEndpoint = "token.php";

  /**
   * Requests an authorization token from `authorizationEndpoint` and, on success,
   * stores it in the global `authorizationToken`, selects the token's region in the
   * region drop-down and locks the subscription-key field.
   *
   * Does nothing when no endpoint is configured. A failed request or an unparsable
   * response is logged to the console and leaves the page in subscription-key mode.
   */
  function RequestAuthorizationToken() {
    if (!authorizationEndpoint) {
      return;
    }

    var request = new XMLHttpRequest();
    request.open("GET", authorizationEndpoint);
    request.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");

    // Register the handler before sending so the response can never be missed.
    request.onload = function () {
      if (this.status < 200 || this.status >= 300) {
        console.log("Could not get an authorization token, HTTP status: " + this.status);
        return;
      }

      var token;
      try {
        // The response is split on "." and the second segment decoded as JSON.
        // Token segments may use the base64url alphabet, which atob() rejects,
        // so map it back to plain base64 first.
        var payload = this.responseText.split(".")[1].replace(/-/g, "+").replace(/_/g, "/");
        token = JSON.parse(atob(payload));
      } catch (err) {
        console.log("Authorization endpoint did not return a valid token: " + err);
        return;
      }

      // The page's region control is the <select id="regionOptions">. Add the token's
      // region when it is not one of the preset options, otherwise assigning .value
      // would leave the drop-down with nothing selected.
      if (token.region) {
        var known = Array.prototype.some.call(regionOptions.options, function (option) {
          return option.value === token.region;
        });
        if (!known) {
          regionOptions.add(new Option(token.region, token.region));
        }
        regionOptions.value = token.region;
      }

      authorizationToken = this.responseText;
      subscriptionKey.disabled = true;
      subscriptionKey.value = "using authorization token (hit F5 to refresh)";
      // Pass the object as its own argument; string concatenation would print "[object Object]".
      console.log("Got an authorization token:", token);
    };

    request.onerror = function () {
      console.log("Could not reach the authorization endpoint: " + authorizationEndpoint);
    };

    request.send();
  }
  </script>

  <!-- Speech SDK USAGE -->
  <script>
    /**
     * Resolves the Speech SDK dependency.
     *
     * When `window.SpeechSDK` is present, the "content" element is shown, the
     * "warning" element is hidden and `onComplete` is invoked with the SDK object.
     * When it is absent, nothing is changed and `onComplete` is not called.
     */
    function Initialize(onComplete) {
      var sdk = window.SpeechSDK;
      if (!sdk) {
        return;
      }

      var contentPane = document.getElementById('content');
      var warningPane = document.getElementById('warning');
      contentPane.style.display = 'block';
      warningPane.style.display = 'none';

      onComplete(sdk);
    }
  </script>

  <!-- Browser Hooks -->
  <script>
    // Page-wide state shared by the handlers below. Everything is declared with
    // `var` at script level so the other <script> blocks on the page can reach it.

    // Output areas.
    var resultsDiv;
    var eventsDiv;
    var highlightDiv;

    // Buttons.
    var startSynthesisAsyncButton;
    var pauseButton;
    var resumeButton;
    var updateVoiceListButton;

    // Credentials and region for the speech service.
    var subscriptionKey;
    var regionOptions;
    var authorizationToken;

    // Synthesis inputs.
    var voiceOptions;
    var isSsml;
    var synthesisText;

    // SDK handle plus the objects created for the current synthesis request.
    var SpeechSDK;
    var synthesizer;
    var player;

    // Word-boundary events collected for the current utterance, in arrival order.
    var wordBoundaryList = [];

    document.addEventListener("DOMContentLoaded", function () {
      // Resolve the page elements once and keep them in the shared variables.
      var byId = function (id) {
        return document.getElementById(id);
      };

      // Buttons.
      startSynthesisAsyncButton = byId("startSynthesisAsyncButton");
      updateVoiceListButton = byId("updateVoiceListButton");
      pauseButton = byId("pauseButton");
      resumeButton = byId("resumeButton");

      // Inputs.
      subscriptionKey = byId("subscriptionKey");
      regionOptions = byId("regionOptions");
      voiceOptions = byId("voiceOptions");
      isSsml = byId("isSSML");

      // Output areas.
      resultsDiv = byId("resultsDiv");
      eventsDiv = byId("eventsDiv");
      highlightDiv = byId("highlightDiv");

      // Every 50 ms, show the synthesis text in highlightDiv with the word that is
      // currently being played wrapped in a <span class="highlight">.
      setInterval(function () {
        if (player === undefined) {
          return;
        }

        // Both sides of the comparison are brought to milliseconds:
        // currentTime is scaled by 1000, audioOffset (100 ns ticks) is divided by 10000.
        const elapsedMs = player.currentTime * 1000;

        // The list is in arrival order, so the last boundary already passed is the current word.
        let current;
        for (const boundary of wordBoundaryList) {
          if (elapsedMs > boundary.audioOffset / 10000) {
            current = boundary;
          } else {
            break;
          }
        }

        // Build the content from text nodes instead of assigning innerHTML: the text is
        // user input (and may be SSML), so any markup in it must be shown literally
        // rather than parsed as HTML.
        const text = synthesisText.value;
        highlightDiv.textContent = "";
        if (current === undefined) {
          highlightDiv.append(text);
          return;
        }

        const marked = document.createElement("span");
        marked.className = "highlight";
        marked.textContent = current.text;
        highlightDiv.append(
                text.substring(0, current.textOffset),
                marked,
                text.substring(current.textOffset + current.wordLength));
      }, 50);

      // "Update Voice List": download the voices offered in the selected region and
      // fill the voice drop-down, preselecting a Jessa voice when one is offered.
      updateVoiceListButton.addEventListener("click", function () {
        var request = new XMLHttpRequest();
        request.open('GET',
                'https://' + regionOptions.value + ".tts.speech." +
                (regionOptions.value.startsWith("china") ? "azure.cn" : "microsoft.com") +
                        "/cognitiveservices/voices/list", true);

        // Prefer the authorization token; fall back to the subscription key typed by the user.
        if (authorizationToken) {
          request.setRequestHeader("Authorization", "Bearer " + authorizationToken);
        } else {
          if (subscriptionKey.value === "" || subscriptionKey.value === "subscription") {
            alert("Please enter your Microsoft Cognitive Services Speech subscription key!");
            return;
          }
          request.setRequestHeader("Ocp-Apim-Subscription-Key", subscriptionKey.value);
        }

        request.onload = function() {
          if (request.status >= 200 && request.status < 400) {
            const response = this.response;
            const neuralSupport = (response.indexOf("JessaNeural") > 0);
            const defaultVoice = neuralSupport ? "JessaNeural" : "JessaRUS";

            let data;
            try {
              data = JSON.parse(response);
            } catch (err) {
              window.console.log(err);
              eventsDiv.innerHTML += "cannot parse voice list: " + err + "\r\n";
              return;
            }

            // Build all options off-document and attach them in one step. Option
            // elements carry the name as text, so it is never parsed as markup, and
            // the list is not re-parsed for every voice as with `innerHTML +=`.
            const options = document.createDocumentFragment();
            let selectId = 0; // falls back to the first voice when no default matches
            data.forEach((voice, index) => {
              options.appendChild(new Option(voice.Name, voice.Name));
              if (voice.Name.indexOf(defaultVoice) > 0) {
                selectId = index;
              }
            });
            voiceOptions.innerHTML = "";
            voiceOptions.appendChild(options);
            voiceOptions.selectedIndex = selectId;
            voiceOptions.disabled = false;
          } else {
            window.console.log(this);
            eventsDiv.innerHTML += "cannot get voice list, code: " + this.status + " detail: " + this.statusText + "\r\n";
          }
        };

        // onload does not fire for network-level failures; report those as well.
        request.onerror = function () {
          window.console.log(this);
          eventsDiv.innerHTML += "cannot get voice list, network error" + "\r\n";
        };

        request.send();
      });

      // Pause / Resume: forward the request to the player, then enable only the
      // button for the opposite action.
      const showPaused = function (paused) {
        pauseButton.disabled = paused;
        resumeButton.disabled = !paused;
      };

      pauseButton.addEventListener("click", () => {
        player.pause();
        showPaused(true);
      });

      resumeButton.addEventListener("click", () => {
        player.resume();
        showPaused(false);
      });

      // "Start synthesis": build the speech configuration, create a speaker-backed
      // synthesizer, wire its events to the Events/Results panes and speak the text
      // (as SSML when the checkbox is ticked).
      startSynthesisAsyncButton.addEventListener("click", function () {
        synthesisText = document.getElementById("synthesisText");

        // Resolve credentials before touching the UI, so that bailing out on a
        // missing key does not leave the Start button disabled.
        // If we got an authorization token, use the token. Otherwise use the provided subscription key.
        var speechConfig;
        if (authorizationToken) {
          // The region comes from the page's region drop-down (id "regionOptions").
          speechConfig = SpeechSDK.SpeechConfig.fromAuthorizationToken(authorizationToken, regionOptions.value);
        } else {
          if (subscriptionKey.value === "" || subscriptionKey.value === "subscription") {
            alert("Please enter your Microsoft Cognitive Services Speech subscription key!");
            return;
          }
          speechConfig = SpeechSDK.SpeechConfig.fromSubscription(subscriptionKey.value, regionOptions.value);
        }

        startSynthesisAsyncButton.disabled = true;
        resultsDiv.innerHTML = "";
        eventsDiv.innerHTML = "";
        wordBoundaryList = [];

        speechConfig.speechSynthesisVoiceName = voiceOptions.value;
        // The SDK uses Media Source Extensions (https://www.w3.org/TR/media-source/) for playback.
        // Mp3 format is supported in most browsers.
        speechConfig.speechSynthesisOutputFormat = SpeechSDK.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;

        player = new SpeechSDK.SpeakerAudioDestination();
        // Playback has ended: return the controls to their idle state.
        player.onAudioEnd = function (_) {
          window.console.log("playback finished");
          eventsDiv.innerHTML += "playback finished" + "\r\n";
          startSynthesisAsyncButton.disabled = false;
          pauseButton.disabled = true;
          resumeButton.disabled = true;
          wordBoundaryList = [];
        };

        // Route the synthesized audio to the speaker destination created above.
        var audioConfig = SpeechSDK.AudioConfig.fromSpeakerOutput(player);

        synthesizer = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig);

        // The event synthesizing signals that a synthesized audio chunk is received.
        // You will receive one or more synthesizing events as a speech phrase is synthesized.
        // You can use this callback to receive the synthesized audio as a stream.
        synthesizer.synthesizing = function (s, e) {
          window.console.log(e);
          eventsDiv.innerHTML += "(synthesizing) Reason: " + SpeechSDK.ResultReason[e.result.reason] +
                  "Audio chunk length: " + e.result.audioData.byteLength + "\r\n";
        };

        // The synthesis started event signals that the synthesis is started.
        synthesizer.synthesisStarted = function (s, e) {
          window.console.log(e);
          eventsDiv.innerHTML += "(synthesis started)" + "\r\n";
          pauseButton.disabled = false;
        };

        // The event synthesis completed signals that the synthesis is completed.
        synthesizer.synthesisCompleted = function (s, e) {
          console.log(e);
          eventsDiv.innerHTML += "(synthesized)  Reason: " + SpeechSDK.ResultReason[e.result.reason] +
                  " Audio length: " + e.result.audioData.byteLength + "\r\n";
        };

        // The event signals that the service has stopped processing speech.
        // This can happen when an error is encountered.
        // NOTE(review): the capitalised property name is kept exactly as in the original code.
        synthesizer.SynthesisCanceled = function (s, e) {
          const cancellationDetails = SpeechSDK.CancellationDetails.fromResult(e.result);
          let str = "(cancel) Reason: " + SpeechSDK.CancellationReason[cancellationDetails.reason];
          if (cancellationDetails.reason === SpeechSDK.CancellationReason.Error) {
            str += ": " + e.result.errorDetails;
          }
          window.console.log(e);
          eventsDiv.innerHTML += str + "\r\n";
          startSynthesisAsyncButton.disabled = false;
          pauseButton.disabled = true;
          resumeButton.disabled = true;
        };

        // This event signals that word boundary is received. This indicates the audio boundary of each word.
        // The unit of e.audioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to convert to milliseconds.
        synthesizer.wordBoundary = function (s, e) {
          window.console.log(e);
          eventsDiv.innerHTML += "(WordBoundary), Text: " + e.text + ", Audio offset: " + e.audioOffset / 10000 + "ms." + "\r\n";
          wordBoundaryList.push(e);
        };

        // Called once the speak request has finished; reports the outcome and
        // releases the synthesizer.
        const complete_cb = function (result) {
          if (result.reason === SpeechSDK.ResultReason.SynthesizingAudioCompleted) {
            resultsDiv.innerHTML += "synthesis finished";
          } else if (result.reason === SpeechSDK.ResultReason.Canceled) {
            resultsDiv.innerHTML += "synthesis failed. Error detail: " + result.errorDetails;
          }
          window.console.log(result);
          synthesizer.close();
          synthesizer = undefined;
        };

        // Called when the speak request itself fails; the error is shown in the
        // Results pane and the Start button is re-enabled.
        const err_cb = function (err) {
          startSynthesisAsyncButton.disabled = false;
          resultsDiv.innerHTML += err;
          window.console.log(err);
          synthesizer.close();
          synthesizer = undefined;
        };

        if (isSsml.checked) {
          synthesizer.speakSsmlAsync(synthesisText.value,
                  complete_cb,
                  err_cb);
        } else {
          synthesizer.speakTextAsync(synthesisText.value,
                  complete_cb,
                  err_cb);
        }
      });

      // Once the Speech SDK is available: remember it, put the buttons into their
      // idle state and try to obtain an authorization token.
      Initialize(function (speechSdk) {
        SpeechSDK = speechSdk;
        startSynthesisAsyncButton.disabled = false;
        pauseButton.disabled = true;
        resumeButton.disabled = true;

        // Look the save button up explicitly and only touch it when the page has one.
        // Referencing a bare `saveButton` identifier throws a ReferenceError when no
        // such element or variable exists, which would abort this callback before
        // the token request below.
        var saveButton = document.getElementById("saveButton");
        if (saveButton) {
          saveButton.disabled = true;
        }

        // in case we have a function for getting an authorization token, call it.
        if (typeof RequestAuthorizationToken === "function") {
          RequestAuthorizationToken();
        }
      });
    });
  </script>
</body>
</html>

What I would like to do now is add the option to save the synthesized speech data to disk, and prompt the user to download the file. Is this possible using the Microsoft Cognitive Services Speech SDK? How should I modify the code to support a "save" or "download" option?

Cody Gray - on strike
  • 239,200
  • 50
  • 490
  • 574
Andy Rav
  • 1
  • 1
  • 1
  • [How to save a text to speech audio file client side?](https://stackoverflow.com/questions/57020422/how-to-save-a-text-to-speech-audio-file-client-side) – Jason Pan Jan 26 '21 at 09:17
  • After the audio file has been generated, you can create an `<a href>` download link (or similar) in the callback function. – Jason Pan Jan 26 '21 at 09:18

1 Answer

0

We have a sample that shows how to save the TTS audio in the browser using the Azure Speech SDK.

Yulin Li
  • 316
  • 1
  • 3