Skip to content

Commit b9aaade

Browse files
authored
[speech-commands] Changes to API and model location (#100)
* [speech-commands] Changes to API and model location - startStreaming() --> listen() - stopStreaming() --> stopListening() - isStreaming() --> isListening() - Update model location
1 parent d3fffa1 commit b9aaade

File tree

7 files changed

+131
-72
lines changed

7 files changed

+131
-72
lines changed

speech-commands/README.md

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ A speech command recognizer can be used in two ways:
3939
### Online streaming recognition
4040

4141
To use the speech-command recognizer, first create a recognizer instance,
42-
then start the streaming recognition by calling its `startStreaming()` method.
42+
then start the streaming recognition by calling its `listen()` method.
4343

4444
```js
4545
import * as tf from '@tensorflow/tfjs';
@@ -59,12 +59,13 @@ await recognizer.ensureModelLoaded();
5959
// See the array of words that the recognizer is trained to recognize.
6060
console.log(recognizer.wordLabels());
6161

62-
// `startStreaming()` takes two arguments:
62+
// `listen()` takes two arguments:
6363
// 1. A callback function that is invoked anytime a word is recognized.
6464
// 2. A configuration object with adjustable fields such as
6565
// - includeSpectrogram
6666
// - probabilityThreshold
67-
recognizer.startStreaming(result => {
67+
// - includeEmbedding
68+
recognizer.listen(result => {
6869
// - result.scores contains the probability scores that correspond to
6970
// recognizer.wordLabels().
7071
// - result.spectrogram contains the spectrogram of the recognized word.
@@ -100,7 +101,7 @@ Currently, the supported vocabularies are:
100101
#### Parameters for online streaming recognition
101102

102103
As the example above shows, you can specify optional parameters when calling
103-
`startStreaming()`. The supported parameters are:
104+
`listen()`. The supported parameters are:
104105

105106
* `overlapFactor`: Controls how often the recognizer performs prediction on
106107
spectrograms. Must be >=0 and <1 (default: 0.5). For example,
@@ -114,6 +115,11 @@ As the example above shows, you can specify optional parameters when calling
114115
* `invokeCallbackOnNoiseAndUnknown`: Whether the callback function will be
115116
invoked if the "word" with the maximum probability score is the "unknown"
116117
or "background noise" token. Default: `false`.
118+
* `includeEmbedding`: Whether an internal activation from the underlying model
119+
will be included in the callback argument, in addition to the probability
120+
scores. Note: if this field is set as `true`, the value of
121+
`invokeCallbackOnNoiseAndUnknown` will be overridden to `true` and the
122+
value of `probabilityThreshold` will be overridden to `0`.
117123

118124
### Offline recognition
119125

@@ -154,11 +160,16 @@ tf.tidy(() => {
154160
});
155161
```
156162

163+
Note that you must provide a spectrogram value to the `recognize()` call
164+
in order to perform the offline recognition. If `recognize()` is called
165+
without a first argument, it will perform one-shot online recognition
166+
by collecting a frame of audio via WebAudio.
167+
157168
### Preloading model
158169

159170
By default, a recognizer object will load the underlying
160171
tf.Model via HTTP requests to a centralized location, when its
161-
`startStreaming()` or `recognize()` method is called the first time.
172+
`listen()` or `recognize()` method is called the first time.
162173
You can pre-load the model to reduce the latency of the first calls
163174
to these methods. To do that, use the `ensureModelLoaded()` method of the
164175
recognizer object. The `ensureModelLoaded()` method also "warms up" model after
@@ -236,7 +247,7 @@ await transferRecognizer.train({
236247

237248
// After the transfer learning completes, you can start online streaming
238249
// recognition using the new model.
239-
await transferRecognizer.startStreaming(result => {
250+
await transferRecognizer.listen(result => {
240251
// - result.scores contains the scores for the new vocabulary, which
241252
// can be checked with:
242253
const words = transferRecognizer.wordLabels();

speech-commands/demo/index.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ let transferRecognizer;
4949

5050
// Make sure the tf.Model is loaded through HTTP. If this is not
5151
// called here, the tf.Model will be loaded the first time
52-
// `startStreaming()` is called.
52+
// `listen()` is called.
5353
recognizer.ensureModelLoaded()
5454
.then(() => {
5555
startButton.disabled = false;
@@ -79,7 +79,7 @@ startButton.addEventListener('click', () => {
7979
populateCandidateWords(activeRecognizer.wordLabels());
8080

8181
activeRecognizer
82-
.startStreaming(
82+
.listen(
8383
result => {
8484
plotPredictions(
8585
predictionCanvas, activeRecognizer.wordLabels(), result.scores,
@@ -104,7 +104,7 @@ startButton.addEventListener('click', () => {
104104
stopButton.addEventListener('click', () => {
105105
const activeRecognizer =
106106
transferRecognizer == null ? recognizer : transferRecognizer;
107-
activeRecognizer.stopStreaming()
107+
activeRecognizer.stopListening()
108108
.then(() => {
109109
startButton.disabled = false;
110110
stopButton.disabled = true;

speech-commands/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@tensorflow-models/speech-commands",
3-
"version": "0.1.4",
3+
"version": "0.2.0",
44
"description": "Speech-command recognizer in TensorFlow.js",
55
"main": "dist/index.js",
66
"unpkg": "dist/speech-commands.min.js",

speech-commands/src/browser_fft_recognizer.ts

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ export const UNKNOWN_TAG = '_unknown_';
2626

2727
let streaming = false;
2828

29+
export function getMajorAndMinorVersion(version: string) {
30+
const versionItems = version.split('.');
31+
return versionItems.slice(0, 2).join('.');
32+
}
33+
2934
/**
3035
* Speech-Command Recognizer using browser-native (WebAudio) spectral features.
3136
*/
@@ -35,8 +40,8 @@ export class BrowserFftSpeechCommandRecognizer implements
3540
static readonly DEFAULT_VOCABULARY_NAME = '18w';
3641

3742
readonly MODEL_URL_PREFIX =
38-
`https://storage.googleapis.com/tfjs-speech-commands-models/v${
39-
version}/browser_fft`;
43+
`https://storage.googleapis.com/tfjs-models/tfjs/speech-commands/v${
44+
getMajorAndMinorVersion(version)}/browser_fft`;
4045

4146
private readonly SAMPLE_RATE_HZ = 44100;
4247
private readonly FFT_SIZE = 1024;
@@ -107,7 +112,7 @@ export class BrowserFftSpeechCommandRecognizer implements
107112
/**
108113
* Start streaming recognition.
109114
*
110-
* To stop the recognition, use `stopStreaming()`.
115+
* To stop the recognition, use `stopListening()`.
111116
*
112117
* Example: TODO(cais): Add example code snippet.
113118
*
@@ -132,9 +137,8 @@ export class BrowserFftSpeechCommandRecognizer implements
132137
* @throws Error, if streaming recognition is already started or
133138
* if `config` contains invalid values.
134139
*/
135-
async startStreaming(
136-
callback: RecognizerCallback,
137-
config?: StreamingRecognitionConfig): Promise<void> {
140+
async listen(callback: RecognizerCallback,
141+
config?: StreamingRecognitionConfig): Promise<void> {
138142
if (streaming) {
139143
throw new Error(
140144
'Cannot start streaming again when streaming is ongoing.');
@@ -355,7 +359,7 @@ export class BrowserFftSpeechCommandRecognizer implements
355359
*
356360
* @throws Error if there is no ongoing streaming recognition.
357361
*/
358-
async stopStreaming(): Promise<void> {
362+
async stopListening(): Promise<void> {
359363
if (!streaming) {
360364
throw new Error('Cannot stop streaming when streaming is not ongoing.');
361365
}
@@ -366,7 +370,7 @@ export class BrowserFftSpeechCommandRecognizer implements
366370
/**
367371
* Check if streaming recognition is ongoing.
368372
*/
369-
isStreaming(): boolean {
373+
isListening(): boolean {
370374
return streaming;
371375
}
372376

@@ -397,7 +401,7 @@ export class BrowserFftSpeechCommandRecognizer implements
397401
if (this.model == null) {
398402
throw new Error(
399403
'Model has not been loaded yet. Load model by calling ' +
400-
'ensureModelLoaded(), recognizer(), or startStreaming().');
404+
'ensureModelLoaded(), recognize(), or listen().');
401405
}
402406
return this.model.inputs[0].shape;
403407
}
@@ -479,6 +483,15 @@ export class BrowserFftSpeechCommandRecognizer implements
479483
output.scores = await Promise.all(scorePromises) as Float32Array[];
480484
tf.dispose(unstacked);
481485
}
486+
487+
if (config.includeSpectrogram) {
488+
output.spectrogram = {
489+
data: (input instanceof tf.Tensor ?
490+
await input.data() : input) as Float32Array,
491+
frameSize: this.nonBatchInputShape[1],
492+
};
493+
}
494+
482495
return output;
483496
}
484497

@@ -507,7 +520,7 @@ export class BrowserFftSpeechCommandRecognizer implements
507520
if (this.model == null) {
508521
throw new Error(
509522
'Model has not been loaded yet. Load model by calling ' +
510-
'ensureModelLoaded(), recognizer(), or startStreaming().');
523+
'ensureModelLoaded(), recognize(), or listen().');
511524
}
512525
tf.util.assert(
513526
name != null && typeof name === 'string' && name.length > 1,

0 commit comments

Comments
 (0)