In this post we will have a look at the Speech Recognition API, the Speech Synthesis API and the HTML5 Form Speech Input API.
Speech Recognition API
The Speech Recognition API allows websites to listen to audio through the microphone and convert the speech to text.
At present only Chrome supports this API. In the future, of course, other browsers will support it.
Let’s have a look at the API
View Demo
/**
 * Starts a continuous speech-recognition session and shows the recognized
 * text in the element with id "output".
 *
 * The active recognizer object is stored in the global `recognizer` so that
 * stop() can end the session; it is reset to null when the session ends.
 * If no recognition constructor is available, an alert is shown instead.
 */
function recognize()
{
    // The standard constructor is `SpeechRecognition`; some browsers only expose
    // a vendor-prefixed name. The lowercase `window.speechRecognition` alias is
    // kept because earlier versions of this snippet published it.
    window.speechRecognition = window.speechRecognition || window.SpeechRecognition || window.webkitSpeechRecognition || window.mozSpeechRecognition;
    if(window.speechRecognition == undefined)
    {
        alert("Speech Recogniztion API Not Supported");
        return;
    }

    var output = document.getElementById("output");
    // Text that is already settled: whatever the output element showed before
    // this session plus every result the engine has marked as final.
    var committedText = output.textContent;

    var session = new window.speechRecognition();
    // Explicit global (instead of an implicit one) so this also works in strict mode.
    window.recognizer = session;

    // If set to false the recognizer stops listening automatically after the first utterance.
    session.continuous = true;
    // Language of the speech, as a BCP 47 language tag.
    session.lang = "en-US";
    // If true, onresult also fires with provisional (interim) hypotheses that may
    // still change; otherwise only final results are delivered.
    session.interimResults = true;

    // Fired when the recognizer starts listening.
    session.onstart = function(){
        alert("Recogniztion API started");
    };

    // Fired whenever the engine delivers new or revised results.
    session.onresult = function(event){
        // event.results holds one entry per recognized phrase; each entry is a list
        // of alternatives, with index 0 being the most probable transcript.
        // event.resultIndex is the first entry that changed in this event.
        var interimText = "";
        for(var index = event.resultIndex; index < event.results.length; index++)
        {
            var result = event.results[index];
            var transcript = result[0].transcript;
            if(result.isFinal)
            {
                committedText += transcript;
            }
            else
            {
                // Interim hypotheses are re-sent on later events, so they are only
                // displayed, never accumulated - appending them would duplicate text.
                interimText += transcript;
            }
        }
        // textContent: transcripts are plain text and must not be parsed as HTML.
        output.textContent = committedText + interimText;
    };

    // Fired when recognition is stopped manually or automatically.
    session.onend = function(){
        // Only clear the global if it still refers to this session, so a newer
        // session started in the meantime is not discarded.
        if(window.recognizer === session)
        {
            window.recognizer = null;
        }
        alert("Recogniztion API stopped");
    };

    session.start();
}
/**
 * Manually stops the speech-recognition session held in the global
 * `recognizer`, if there is one, and tells the user via an alert.
 * Does nothing when no session exists.
 */
function stop()
{
    // `typeof` guard: reading an undeclared global directly would throw a
    // ReferenceError when stop() is called before any session was created.
    if(typeof recognizer === "undefined" || recognizer == null)
    {
        return;
    }
    //stop it manually
    recognizer.stop();
    alert("Recognization API stopped");
}
Speech Synthesis API
The Speech Synthesis API converts text into audio, i.e., it reads text out loud. At present this API is supported only by Chrome, but other browsers will surely support it in the future.
Let’s have a look at the API
Live Demo
// URI and language of the voice that speak() should use. They are filled in
// with the first available voice below and updated by select_voice().
var default_voiceURI = null;
var default_lang = null;
// speechSynthesis.getVoices() can return an empty list until the browser has
// finished loading its voices, so the list is polled until it is populated.
var timer = setInterval(function(){
    // window.speechSynthesis exposes the available voices and controls the
    // queue of SpeechSynthesisUtterance objects.
    if(!("speechSynthesis" in window))
    {
        alert("Speech Synthesis API not supported");
        clearInterval(timer);
        return;
    }

    // Each voice is made for one language; pairing a voice with a different
    // language produces bad pronunciation.
    var available_voices = speechSynthesis.getVoices();
    if(available_voices.length === 0)
    {
        //voices not loaded yet, try again on the next tick
        return;
    }

    // Stop polling before touching the DOM so that a failure below happens
    // once instead of on every tick.
    clearInterval(timer);

    var voice_select = document.getElementById("voices");
    for(var count = 0; count < available_voices.length; count++)
    {
        //lang is the language code the voice is made for.
        //voiceURI is a unique identifier for the voice.
        //name is the displayable name of the voice.
        var voice = available_voices[count];
        if(count === 0)
        {
            //we make the first voice the default.
            default_voiceURI = voice.voiceURI;
            default_lang = voice.lang;
        }
        // Build the <option> through the DOM instead of concatenating HTML, so
        // quotes or markup characters in a voice name or URI cannot break the list.
        var option = document.createElement("option");
        option.setAttribute("value", voice.lang);
        option.setAttribute("data-voice-uri", voice.voiceURI);
        option.textContent = voice.name;
        voice_select.appendChild(option);
    }
}, 50);
/**
 * Reads out the text of the input element with id "text" using the voice and
 * language currently selected in `default_voiceURI` / `default_lang`.
 *
 * Returns without doing anything when SpeechSynthesisUtterance is unavailable.
 * The utterance is also stored in the global `synthesizer`, as the original
 * snippet did, for any code that refers to it.
 */
function speak()
{
    if(!("SpeechSynthesisUtterance" in window))
    {
        return;
    }

    var utterance = new SpeechSynthesisUtterance();
    // Explicit global (instead of an implicit one) so this also works in strict mode.
    window.synthesizer = utterance;

    utterance.text = document.getElementById("text").value;

    // `voice` expects a voice object from getVoices(), not a URI string, so the
    // selected URI is resolved to its voice object first. When no voice matches,
    // `voice` is left unset and the browser picks one itself.
    var installed_voices = speechSynthesis.getVoices();
    for(var index = 0; index < installed_voices.length; index++)
    {
        if(installed_voices[index].voiceURI === default_voiceURI)
        {
            utterance.voice = installed_voices[index];
            break;
        }
    }

    // default_lang stays null until the voice list has been loaded; do not
    // assign null as a language tag.
    if(default_lang != null)
    {
        utterance.lang = default_lang;
    }

    //represents how fast the text will be spoken out. Value between 0.1 to 10.
    utterance.rate = 1;
    //represents the pitch. Value between 0 to 2.
    utterance.pitch = 1;

    //fired when the utterance starts being spoken
    utterance.onstart = function(){
        console.log("Synthesis Started");
    };
    //fired when the utterance is paused
    utterance.onpause = function(){
        console.log("Synthesis Paused");
    };
    //fired when the utterance is resumed after a pause
    utterance.onresume = function(){
        console.log("Synthesis Resumed after Pause");
    };
    //fired when the utterance has finished or was stopped
    utterance.onend = function(){
        console.log("Synthesis Stopped");
    };

    speechSynthesis.speak(utterance);
}
/**
 * Pauses speech output for all queued utterances, unless the synthesizer
 * does not report `paused === false`.
 */
function pause()
{
    var synth = speechSynthesis;
    //only a strict `false` counts as "not paused"
    if(synth.paused !== false)
    {
        return;
    }
    synth.pause();
}
/**
 * Resumes speech output for all queued utterances, but only when the
 * synthesizer reports `paused === true`.
 */
function resume()
{
    var synth = speechSynthesis;
    //only a strict `true` counts as "paused"
    if(synth.paused !== true)
    {
        return;
    }
    synth.resume();
}
/**
 * Stops speech output by cancelling everything queued on speechSynthesis.
 */
function stop()
{
    var synth = speechSynthesis;
    //cancel() applies to all utterances handed to speechSynthesis, not just one
    synth.cancel();
}
/**
 * Copies the language and voice URI of the option currently selected in the
 * element with id "voices" into `default_lang` and `default_voiceURI`.
 * Leaves both unchanged when no option is selected.
 */
function select_voice()
{
    // Use the element returned by getElementById for the index as well, rather
    // than depending on an implicit global named after the element id.
    var voice_select = document.getElementById("voices");
    var sel_element = voice_select.options[voice_select.selectedIndex];
    if(!sel_element)
    {
        //selectedIndex is -1 when nothing is selected
        return;
    }
    default_lang = sel_element.getAttribute("value");
    default_voiceURI = sel_element.getAttribute("data-voice-uri");
}
HTML5 Form Speech Input API
The Form Speech Input API allows us to take speech input in form input fields. It doesn’t require any JavaScript code to populate the input fields.
At present this API is only available in Chrome. It will soon be available in other browsers too.
Let’s have a look at the API:
Live Demo
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Speech Recogniztion API</title>
<script type="text/javascript">
/**
 * Called from the input's onwebkitspeechchange handler with the field's
 * current value; logs the recognized text to the console.
 */
function transcribe(text)
{
console.log(text);
}
// Feature detection: the check treats a missing `webkitSpeech` property on a
// fresh <input> as "speech input not supported".
// NOTE(review): x-webkit-speech was a Chrome-only attribute and appears to have
// been removed from current Chrome releases - confirm before relying on it.
if (document.createElement("input").webkitSpeech === undefined)
{
alert("Speech input is not supported in your browser.");
}
</script>
</head>
<body>
Click on microphone button on input field and then talk.
<!-- x-webkit-speech asks the browser to offer speech input for this field; onwebkitspeechchange passes the field's value to transcribe(). -->
<input type="text" placeholder="Speak Out" x-webkit-speech="x-webkit-speech" onwebkitspeechchange="transcribe(this.value)" />
</body>
</html>
Conclusion
I know that it's strange to use these APIs as they have a huge compatibility problem. Mostly Chrome and Safari support them, so you can display a message asking the user to switch browsers. These APIs can be useful in PhoneGap apps too. Thanks for reading.