
Commit 331a065

[speech-command] Add includeEmbeddings; Allow recognize() to draw from WebAudio directly (#99)

- Allow `recognize()` to take no argument, in which case the method draws a frame of audio directly from WebAudio.
- Add the config field `includeEmbedding` to the `startStreaming()` and `recognize()` methods.

1 parent 9fbbea6 commit 331a065
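In practice, the two additions look like this — a minimal usage sketch, assuming the published `@tensorflow-models/speech-commands` package and its `create('BROWSER_FFT')` factory (neither is part of this diff):

import * as speechCommands from '@tensorflow-models/speech-commands';

async function demo() {
  const recognizer = speechCommands.create('BROWSER_FFT');
  await recognizer.ensureModelLoaded();

  // New in this commit: a no-argument recognize() draws one frame of
  // audio directly from WebAudio (via the new recognizeOnline() path).
  const online = await recognizer.recognize();
  console.log(online.scores);  // Float32Array of per-word probabilities.

  // New in this commit: includeEmbedding additionally returns the
  // activation of the model's second-last dense layer.
  const result = await recognizer.recognize(null, {includeEmbedding: true});
  console.log(result.embedding.shape);  // [1, embeddingDims]
}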

File tree

3 files changed: +252 -20 lines changed

speech-commands/src/browser_fft_recognizer.ts

Lines changed: 110 additions & 11 deletions
@@ -20,7 +20,7 @@ import * as tf from '@tensorflow/tfjs';
 // tslint:disable:max-line-length
 import {BrowserFftFeatureExtractor, SpectrogramCallback} from './browser_fft_extractor';
 import {loadMetadataJson} from './browser_fft_utils';
-import {RecognizerCallback, RecognizerParams, SpectrogramData, SpeechCommandRecognizer, SpeechCommandRecognizerResult, StreamingRecognitionConfig, TransferLearnConfig, TransferSpeechCommandRecognizer} from './types';
+import {RecognizeConfig, RecognizerCallback, RecognizerParams, SpectrogramData, SpeechCommandRecognizer, SpeechCommandRecognizerResult, StreamingRecognitionConfig, TransferLearnConfig, TransferSpeechCommandRecognizer} from './types';
 import {version} from './version';

 // tslint:enable:max-line-length
@@ -47,6 +47,7 @@ export class BrowserFftSpeechCommandRecognizer implements
   private readonly DEFAULT_SUPPRESSION_TIME_MILLIS = 1000;

   model: tf.Model;
+  modelWithEmbeddingOutput: tf.Model;
   readonly vocabulary: string;
   readonly parameters: RecognizerParams;
   protected words: string[];
@@ -149,15 +150,24 @@ export class BrowserFftSpeechCommandRecognizer implements
     if (config == null) {
       config = {};
     }
-    const probabilityThreshold =
+    let probabilityThreshold =
        config.probabilityThreshold == null ? 0 : config.probabilityThreshold;
+    if (config.includeEmbedding) {
+      // Override probability threshold to 0 if includeEmbedding is true.
+      probabilityThreshold = 0;
+    }
    tf.util.assert(
        probabilityThreshold >= 0 && probabilityThreshold <= 1,
        `Invalid probabilityThreshold value: ${probabilityThreshold}`);
-    const invokeCallbackOnNoiseAndUnknown =
+    let invokeCallbackOnNoiseAndUnknown =
        config.invokeCallbackOnNoiseAndUnknown == null ?
        false :
        config.invokeCallbackOnNoiseAndUnknown;
+    if (config.includeEmbedding) {
+      // Override invokeCallbackOnNoiseAndUnknown to true if
+      // includeEmbedding is true.
+      invokeCallbackOnNoiseAndUnknown = true;
+    }

    if (config.suppressionTimeMillis < 0) {
      throw new Error(
@@ -174,7 +184,18 @@ export class BrowserFftSpeechCommandRecognizer implements
        Math.round(this.FFT_SIZE * (1 - overlapFactor));

    const spectrogramCallback: SpectrogramCallback = async (x: tf.Tensor) => {
-      const y = tf.tidy(() => this.model.predict(x) as tf.Tensor);
+      await this.ensureModelWithEmbeddingOutputCreated();
+
+      let y: tf.Tensor;
+      let embedding: tf.Tensor;
+      if (config.includeEmbedding) {
+        await this.ensureModelWithEmbeddingOutputCreated();
+        [y, embedding] =
+            this.modelWithEmbeddingOutput.predict(x) as tf.Tensor[];
+      } else {
+        y = this.model.predict(x) as tf.Tensor;
+      }
+
      const scores = await y.data() as Float32Array;
      const maxIndexTensor = y.argMax(-1);
      const maxIndex = (await maxIndexTensor.data())[0];
@@ -201,7 +222,7 @@ export class BrowserFftSpeechCommandRecognizer implements
        }
      }
      if (wordDetected) {
-        callback({scores, spectrogram});
+        callback({scores, spectrogram, embedding});
      }
      // Trigger suppression only if the word is neither unknown nor
      // background noise.
@@ -289,6 +310,39 @@ export class BrowserFftSpeechCommandRecognizer implements
    this.parameters.spectrogramDurationMillis = numFrames * frameDurationMillis;
  }

+  /**
+   * Construct a two-output model that includes the following outputs:
+   *
+   * 1. The same softmax probability output as the original model's output
+   * 2. The embedding, i.e., activation from the second-last dense layer of
+   *    the original model.
+   */
+  protected async ensureModelWithEmbeddingOutputCreated() {
+    if (this.modelWithEmbeddingOutput != null) {
+      return;
+    }
+    await this.ensureModelLoaded();
+
+    // Find the second-last dense layer of the original model.
+    let secondLastDenseLayer: tf.layers.Layer;
+    for (let i = this.model.layers.length - 2; i >= 0; --i) {
+      if (this.model.layers[i].getClassName() === 'Dense') {
+        secondLastDenseLayer = this.model.layers[i];
+        break;
+      }
+    }
+    if (secondLastDenseLayer == null) {
+      throw new Error(
+          'Failed to find second-last dense layer in the original model.');
+    }
+    this.modelWithEmbeddingOutput = tf.model({
+      inputs: this.model.inputs,
+      outputs: [
+        this.model.outputs[0], secondLastDenseLayer.output as tf.SymbolicTensor
+      ]
+    });
+  }
+
  private warmUpModel() {
    tf.tidy(() => {
      const x = tf.zeros([1].concat(this.nonBatchInputShape));
@@ -370,15 +424,27 @@ export class BrowserFftSpeechCommandRecognizer implements
   *   - If a `Float32Array`, must have a length divisible by the number
   *     of elements per spectrogram, i.e.,
   *     (# of spectrogram columns) * (# of frequency-domain points per column).
+   * @param config Optional configuration object.
   * @returns Result of the recognition, with the following field:
   *   scores:
   *   - A `Float32Array` if there is only one input example.
   *   - An `Array` of `Float32Array`, if there are multiple input examples.
   */
-  async recognize(input: tf.Tensor|
-                  Float32Array): Promise<SpeechCommandRecognizerResult> {
+  async recognize(input?: tf.Tensor|Float32Array, config?: RecognizeConfig):
+      Promise<SpeechCommandRecognizerResult> {
+    if (config == null) {
+      config = {};
+    }
+
    await this.ensureModelLoaded();

+    if (input == null) {
+      // If `input` is not provided, draw audio data from WebAudio and use it
+      // for recognition.
+      const spectrogramData = await this.recognizeOnline();
+      input = spectrogramData.data;
+    }
+
    let numExamples: number;
    let inputTensor: tf.Tensor;
    let outTensor: tf.Tensor;
@@ -403,16 +469,49 @@ export class BrowserFftSpeechCommandRecognizer implements
      ].concat(this.nonBatchInputShape) as [number, number, number, number]);
    }

-    outTensor = this.model.predict(inputTensor) as tf.Tensor;
+    const output: SpeechCommandRecognizerResult = {scores: null};
+    if (config.includeEmbedding) {
+      // Optional inclusion of embedding (internal activation).
+      await this.ensureModelWithEmbeddingOutputCreated();
+      const outAndEmbedding =
+          this.modelWithEmbeddingOutput.predict(inputTensor) as tf.Tensor[];
+      outTensor = outAndEmbedding[0];
+      output.embedding = outAndEmbedding[1];
+    } else {
+      outTensor = this.model.predict(inputTensor) as tf.Tensor;
+    }
+
    if (numExamples === 1) {
-      return {scores: await outTensor.data() as Float32Array};
+      output.scores = await outTensor.data() as Float32Array;
    } else {
      const unstacked = tf.unstack(outTensor) as tf.Tensor[];
      const scorePromises = unstacked.map(item => item.data());
-      const scores = await Promise.all(scorePromises) as Float32Array[];
+      output.scores = await Promise.all(scorePromises) as Float32Array[];
      tf.dispose(unstacked);
-      return {scores};
    }
+    return output;
+  }
+
+  protected async recognizeOnline(): Promise<SpectrogramData> {
+    return new Promise<SpectrogramData>((resolve, reject) => {
+      const spectrogramCallback: SpectrogramCallback = async (x: tf.Tensor) => {
+        resolve({
+          data: await x.data() as Float32Array,
+          frameSize: this.nonBatchInputShape[1],
+        });
+        return false;
+      };
+      this.audioDataExtractor = new BrowserFftFeatureExtractor({
+        sampleRateHz: this.parameters.sampleRateHz,
+        columnBufferLength: this.parameters.columnBufferLength,
+        columnHopLength: this.parameters.columnBufferLength,
+        numFramesPerSpectrogram: this.nonBatchInputShape[0],
+        columnTruncateLength: this.nonBatchInputShape[1],
+        suppressionTimeMillis: 0,
+        spectrogramCallback
+      });
+      this.audioDataExtractor.start();
+    });
+  }

  createTransfer(name: string): TransferSpeechCommandRecognizer {
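The centerpiece of the changes above is `ensureModelWithEmbeddingOutputCreated()`: a second `tf.model` that shares the original weights but exposes the second-last dense layer's activation as an extra output. Below is a standalone sketch of the same technique; the input shape and the layer sizes (4 hidden units, 17 classes, echoing the test fakes) are illustrative assumptions, not taken from this diff:

import * as tf from '@tensorflow/tfjs';

// A stand-in classifier: flatten -> dense(4, hidden) -> dense(17, softmax).
const model = tf.sequential({
  layers: [
    tf.layers.flatten({inputShape: [43, 232, 1]}),
    tf.layers.dense({units: 4, activation: 'relu'}),
    tf.layers.dense({units: 17, activation: 'softmax'})
  ]
});

// Walk backwards from the layer before the output to find the last
// hidden dense layer, as ensureModelWithEmbeddingOutputCreated() does.
let secondLastDenseLayer: tf.layers.Layer|null = null;
for (let i = model.layers.length - 2; i >= 0; --i) {
  if (model.layers[i].getClassName() === 'Dense') {
    secondLastDenseLayer = model.layers[i];
    break;
  }
}
if (secondLastDenseLayer == null) {
  throw new Error('No hidden dense layer found.');
}

// A functional model reusing the same weights, with two outputs:
// the softmax scores and the hidden activation (the "embedding").
const modelWithEmbedding = tf.model({
  inputs: model.inputs,
  outputs:
      [model.outputs[0], secondLastDenseLayer.output as tf.SymbolicTensor]
});

const [scores, embedding] =
    modelWithEmbedding.predict(tf.zeros([1, 43, 232, 1])) as tf.Tensor[];
console.log(scores.shape);     // [1, 17]
console.log(embedding.shape);  // [1, 4]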

speech-commands/src/browser_fft_recognizer_test.ts

Lines changed: 91 additions & 1 deletion
@@ -218,6 +218,25 @@ describeWithFlags('Browser FFT recognizer', tf.test_util.NODE_ENVS, () => {
    }
  });

+  it('Offline recognize call: includeEmbedding', async () => {
+    setUpFakes();
+
+    // A batch of examples.
+    const numExamples = 3;
+    const spectrogram =
+        tf.zeros([numExamples, fakeNumFrames, fakeColumnTruncateLength, 1]);
+    const recognizer = new BrowserFftSpeechCommandRecognizer();
+    const output =
+        await recognizer.recognize(spectrogram, {includeEmbedding: true});
+    expect(Array.isArray(output.scores)).toEqual(true);
+    expect(output.scores.length).toEqual(3);
+    for (let i = 0; i < 3; ++i) {
+      expect((output.scores[i] as Float32Array).length).toEqual(17);
+    }
+    expect(output.embedding.rank).toEqual(2);
+    expect(output.embedding.shape[0]).toEqual(numExamples);
+  });
+
  it('Offline recognize fails due to incorrect shape', async () => {
    setUpFakes();

@@ -346,12 +365,58 @@ describeWithFlags('Browser FFT recognizer', tf.test_util.NODE_ENVS, () => {
      // spectrogram is not provided by default.
      expect(result.spectrogram).toBeUndefined();

+      // Embedding should not be included by default.
+      expect(result.embedding).toBeUndefined();
+
      if (++numCallbacksCompleted >= numCallbacksToComplete) {
-        recognizer.stopStreaming().then(done);
+        await recognizer.stopStreaming();
+        done();
      }
    }, {overlapFactor: 0, invokeCallbackOnNoiseAndUnknown: true});
  });

+  it('streaming: overlapFactor = 0, includeEmbedding', async done => {
+    setUpFakes();
+    const recognizer = new BrowserFftSpeechCommandRecognizer();
+
+    const numCallbacksToComplete = 2;
+    let numCallbacksCompleted = 0;
+    const tensorCounts: number[] = [];
+    const callbackTimestamps: number[] = [];
+    recognizer.startStreaming(async (result: SpeechCommandRecognizerResult) => {
+      expect((result.scores as Float32Array).length).toEqual(fakeWords.length);
+
+      callbackTimestamps.push(tf.util.now());
+      if (callbackTimestamps.length > 1) {
+        expect(
+            callbackTimestamps[callbackTimestamps.length - 1] -
+            callbackTimestamps[callbackTimestamps.length - 2])
+            .toBeGreaterThanOrEqual(
+                recognizer.params().spectrogramDurationMillis);
+      }
+
+      tensorCounts.push(tf.memory().numTensors);
+
+      // spectrogram is not provided by default.
+      expect(result.spectrogram).toBeUndefined();
+
+      // Embedding is included because includeEmbedding is true.
+      expect(result.embedding.rank).toEqual(2);
+      expect(result.embedding.shape[0]).toEqual(1);
+      // The number of units of the hidden dense layer.
+      expect(result.embedding.shape[1]).toEqual(4);
+
+      if (++numCallbacksCompleted >= numCallbacksToComplete) {
+        await recognizer.stopStreaming();
+        done();
+      }
+    }, {
+      overlapFactor: 0,
+      invokeCallbackOnNoiseAndUnknown: true,
+      includeEmbedding: true
+    });
+  });
+
  it('streaming: overlapFactor = 0.5, includeSpectrogram', async done => {
    setUpFakes();
    const recognizer = new BrowserFftSpeechCommandRecognizer();
@@ -482,6 +547,31 @@ describeWithFlags('Browser FFT recognizer', tf.test_util.NODE_ENVS, () => {
    expect(recognizer.isStreaming()).toEqual(false);
  });

+  it('Online recognize() call succeeds', async () => {
+    setUpFakes();
+    const recognizer = new BrowserFftSpeechCommandRecognizer();
+
+    for (let i = 0; i < 2; ++i) {
+      // No-arg call: online recognition.
+      const output = await recognizer.recognize();
+      expect(output.scores.length).toEqual(fakeWords.length);
+      expect(output.embedding).toBeUndefined();
+    }
+  });
+
+  it('Online recognize() call with includeEmbedding succeeds', async () => {
+    setUpFakes();
+    const recognizer = new BrowserFftSpeechCommandRecognizer();
+
+    for (let i = 0; i < 2; ++i) {
+      // No-arg call: online recognition.
+      const output = await recognizer.recognize(null, {includeEmbedding: true});
+      expect(output.scores.length).toEqual(fakeWords.length);
+      expect(output.embedding.rank).toEqual(2);
+      expect(output.embedding.shape[0]).toEqual(1);
+    }
+  });
+
  it('collectTransferLearningExample default transfer model', async () => {
    setUpFakes();
    const base = new BrowserFftSpeechCommandRecognizer();