Skip to content

Commit 9381dd8

Browse files
authored
Improvements to the speech recognition model (#101)
Several improvements/fixes to the speech recognition model: - Stop the audio extractor immediately after calling `recognizeOnline()`. Otherwise the extractor is in an infinite loop of processing data. - When the extractor is stopped, close the audio stream by calling `this.stream.getTracks()[0].stop();`. Otherwise the browser shows that the microphone is always on (red dot on the browser tab). - Return the original spectrogram data to the user (unnormalized). This makes a large difference in data quality for applications that decide to use only the last K frames out of the 43 frames. Those K frames are not impacted by the mean and stdev of all of the 43 frames. Before, my audio app would react with a delay of ~1sec after I made a sound, even when using only the last 3 frames (70ms of data). The delay was because of the silence that preceded my sound, which shifted the mean and the stdev. - Speed up data collection by 25% by using a queue instead of a circular buffer, which allows us to use the fast built-in `TypedArray.set()` when creating the extractor. Now calling listen with overlapFactor of 0.999 results in ~40 callbacks/sec - before it was ~32. - Update the patch version. Now version is `0.2.1`
1 parent b9aaade commit 9381dd8

9 files changed

+78
-128
lines changed

speech-commands/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@tensorflow-models/speech-commands",
3-
"version": "0.2.0",
3+
"version": "0.2.1",
44
"description": "Speech-command recognizer in TensorFlow.js",
55
"main": "dist/index.js",
66
"unpkg": "dist/speech-commands.min.js",

speech-commands/src/browser_fft_extractor.ts

Lines changed: 29 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121

2222
import * as tf from '@tensorflow/tfjs';
23-
import {getAudioContextConstructor, getAudioMediaStream, normalize} from './browser_fft_utils';
23+
import {getAudioContextConstructor, getAudioMediaStream} from './browser_fft_utils';
2424
import {FeatureExtractor, RecognizerParams} from './types';
2525

2626
export type SpectrogramCallback = (x: tf.Tensor) => Promise<boolean>;
@@ -77,7 +77,7 @@ export interface BrowserFftFeatureExtractorConfig extends RecognizerParams {
7777
*/
7878
export class BrowserFftFeatureExtractor implements FeatureExtractor {
7979
// Number of frames (i.e., columns) per spectrogram used for classification.
80-
readonly numFramesPerSpectrogram: number;
80+
readonly numFrames: number;
8181

8282
// Audio sampling rate in Hz.
8383
readonly sampleRateHz: number;
@@ -92,22 +92,16 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
9292
// consecutive spectrograms and the length of each individual spectrogram.
9393
readonly overlapFactor: number;
9494

95-
protected readonly spectrogramCallback: SpectrogramCallback;
95+
private readonly spectrogramCallback: SpectrogramCallback;
9696

9797
private stream: MediaStream;
9898
// tslint:disable-next-line:no-any
9999
private audioContextConstructor: any;
100100
private audioContext: AudioContext;
101101
private analyser: AnalyserNode;
102-
103102
private tracker: Tracker;
104-
105-
private readonly ROTATING_BUFFER_SIZE_MULTIPLIER = 2;
106103
private freqData: Float32Array;
107-
private rotatingBufferNumFrames: number;
108-
private rotatingBuffer: Float32Array;
109-
110-
private frameCount: number;
104+
private freqDataQueue: Float32Array[];
111105
// tslint:disable-next-line:no-any
112106
private frameIntervalTask: any;
113107
private frameDurationMillis: number;
@@ -144,7 +138,7 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
144138
this.suppressionTimeMillis = config.suppressionTimeMillis;
145139

146140
this.spectrogramCallback = config.spectrogramCallback;
147-
this.numFramesPerSpectrogram = config.numFramesPerSpectrogram;
141+
this.numFrames = config.numFramesPerSpectrogram;
148142
this.sampleRateHz = config.sampleRateHz || 44100;
149143
this.fftSize = config.fftSize || 1024;
150144
this.frameDurationMillis = this.fftSize / this.sampleRateHz * 1e3;
@@ -165,7 +159,7 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
165159
this.audioContextConstructor = getAudioContextConstructor();
166160
}
167161

168-
async start(samples?: Float32Array): Promise<Float32Array[]|void> {
162+
async start(): Promise<Float32Array[]|void> {
169163
if (this.frameIntervalTask != null) {
170164
throw new Error(
171165
'Cannot start already-started BrowserFftFeatureExtractor');
@@ -184,18 +178,11 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
184178
this.analyser.fftSize = this.fftSize * 2;
185179
this.analyser.smoothingTimeConstant = 0.0;
186180
streamSource.connect(this.analyser);
187-
181+
// Reset the queue.
182+
this.freqDataQueue = [];
188183
this.freqData = new Float32Array(this.fftSize);
189-
this.rotatingBufferNumFrames =
190-
this.numFramesPerSpectrogram * this.ROTATING_BUFFER_SIZE_MULTIPLIER;
191-
const rotatingBufferSize =
192-
this.columnTruncateLength * this.rotatingBufferNumFrames;
193-
this.rotatingBuffer = new Float32Array(rotatingBufferSize);
194-
195-
this.frameCount = 0;
196-
197-
const period = Math.max(
198-
1, Math.round(this.numFramesPerSpectrogram * (1 - this.overlapFactor)));
184+
const period =
185+
Math.max(1, Math.round(this.numFrames * (1 - this.overlapFactor)));
199186
this.tracker = new Tracker(
200187
period,
201188
Math.round(this.suppressionTimeMillis / this.frameDurationMillis));
@@ -209,20 +196,16 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
209196
return;
210197
}
211198

212-
const freqDataSlice = this.freqData.slice(0, this.columnTruncateLength);
213-
const bufferPos = this.frameCount % this.rotatingBufferNumFrames;
214-
this.rotatingBuffer.set(
215-
freqDataSlice, bufferPos * this.columnTruncateLength);
216-
this.frameCount++;
217-
199+
this.freqDataQueue.push(this.freqData.slice(0, this.columnTruncateLength));
200+
if (this.freqDataQueue.length > this.numFrames) {
201+
// Drop the oldest frame (least recent).
202+
this.freqDataQueue.shift();
203+
}
218204
const shouldFire = this.tracker.tick();
219205
if (shouldFire) {
220-
const freqData = getFrequencyDataFromRotatingBuffer(
221-
this.rotatingBuffer, this.numFramesPerSpectrogram,
222-
this.columnTruncateLength,
223-
this.frameCount - this.numFramesPerSpectrogram);
206+
const freqData = flattenQueue(this.freqDataQueue);
224207
const inputTensor = getInputTensorFromFrequencyData(
225-
freqData, this.numFramesPerSpectrogram, this.columnTruncateLength);
208+
freqData, [1, this.numFrames, this.columnTruncateLength, 1]);
226209
const shouldRest = await this.spectrogramCallback(inputTensor);
227210
if (shouldRest) {
228211
this.tracker.suppress();
@@ -240,6 +223,9 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
240223
this.frameIntervalTask = null;
241224
this.analyser.disconnect();
242225
this.audioContext.close();
226+
if (this.stream != null && this.stream.getTracks().length > 0) {
227+
this.stream.getTracks()[0].stop();
228+
}
243229
}
244230

245231
setConfig(params: RecognizerParams) {
@@ -255,39 +241,19 @@ export class BrowserFftFeatureExtractor implements FeatureExtractor {
255241
}
256242
}
257243

258-
export function getFrequencyDataFromRotatingBuffer(
259-
rotatingBuffer: Float32Array, numFrames: number, fftLength: number,
260-
frameCount: number): Float32Array {
261-
const size = numFrames * fftLength;
262-
const freqData = new Float32Array(size);
263-
264-
const rotatingBufferSize = rotatingBuffer.length;
265-
const rotatingBufferNumFrames = rotatingBufferSize / fftLength;
266-
while (frameCount < 0) {
267-
frameCount += rotatingBufferNumFrames;
268-
}
269-
const indexBegin = (frameCount % rotatingBufferNumFrames) * fftLength;
270-
const indexEnd = indexBegin + size;
271-
272-
for (let i = indexBegin; i < indexEnd; ++i) {
273-
freqData[i - indexBegin] = rotatingBuffer[i % rotatingBufferSize];
274-
}
244+
export function flattenQueue(queue: Float32Array[]): Float32Array {
245+
const frameSize = queue[0].length;
246+
const freqData = new Float32Array(queue.length * frameSize);
247+
queue.forEach((data, i) => freqData.set(data, i * frameSize));
275248
return freqData;
276249
}
277250

278251
export function getInputTensorFromFrequencyData(
279-
freqData: Float32Array, numFrames: number, fftLength: number,
280-
toNormalize = true): tf.Tensor {
281-
return tf.tidy(() => {
282-
const size = freqData.length;
283-
const tensorBuffer = tf.buffer([size]);
284-
for (let i = 0; i < freqData.length; ++i) {
285-
tensorBuffer.set(freqData[i], i);
286-
}
287-
const output =
288-
tensorBuffer.toTensor().reshape([1, numFrames, fftLength, 1]);
289-
return toNormalize ? normalize(output) : output;
290-
});
252+
freqData: Float32Array, shape: number[]): tf.Tensor {
253+
const vals = new Float32Array(tf.util.sizeFromShape(shape));
254+
// If the data is less than the output shape, the rest is padded with zeros.
255+
vals.set(freqData, vals.length - freqData.length);
256+
return tf.tensor(vals, shape);
291257
}
292258

293259
/**

speech-commands/src/browser_fft_extractor_test.ts

Lines changed: 18 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -17,62 +17,38 @@
1717

1818
import * as tf from '@tensorflow/tfjs';
1919
import {describeWithFlags} from '@tensorflow/tfjs-core/dist/jasmine_util';
20-
import {BrowserFftFeatureExtractor, getFrequencyDataFromRotatingBuffer, getInputTensorFromFrequencyData} from './browser_fft_extractor';
20+
import {BrowserFftFeatureExtractor, flattenQueue, getInputTensorFromFrequencyData} from './browser_fft_extractor';
2121
import * as BrowserFftUtils from './browser_fft_utils';
2222
import {FakeAudioContext, FakeAudioMediaStream} from './browser_test_utils';
2323

2424
const testEnvs = tf.test_util.NODE_ENVS;
2525

26-
describeWithFlags('getFrequencyDataFromRotatingBuffer', testEnvs, () => {
27-
it('getFrequencyDataFromRotatingBuffer', () => {
28-
const rotBuffer = new Float32Array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]);
29-
const numFrames = 3;
30-
const fftLength = 2;
31-
expect(
32-
getFrequencyDataFromRotatingBuffer(rotBuffer, numFrames, fftLength, 0))
33-
.toEqual(new Float32Array([1, 1, 2, 2, 3, 3]));
34-
35-
expect(
36-
getFrequencyDataFromRotatingBuffer(rotBuffer, numFrames, fftLength, 1))
37-
.toEqual(new Float32Array([2, 2, 3, 3, 4, 4]));
38-
expect(
39-
getFrequencyDataFromRotatingBuffer(rotBuffer, numFrames, fftLength, 3))
40-
.toEqual(new Float32Array([4, 4, 5, 5, 6, 6]));
41-
expect(
42-
getFrequencyDataFromRotatingBuffer(rotBuffer, numFrames, fftLength, 4))
43-
.toEqual(new Float32Array([5, 5, 6, 6, 1, 1]));
44-
expect(
45-
getFrequencyDataFromRotatingBuffer(rotBuffer, numFrames, fftLength, 6))
46-
.toEqual(new Float32Array([1, 1, 2, 2, 3, 3]));
26+
describeWithFlags('flattenQueue', testEnvs, () => {
27+
it('3 frames, 2 values each', () => {
28+
const queue = [[1, 1], [2, 2], [3, 3]].map(x => new Float32Array(x));
29+
expect(flattenQueue(queue)).toEqual(new Float32Array([1, 1, 2, 2, 3, 3]));
30+
});
31+
32+
it('2 frames, 2 values each', () => {
33+
const queue = [[1, 1], [2, 2]].map(x => new Float32Array(x));
34+
expect(flattenQueue(queue)).toEqual(new Float32Array([1, 1, 2, 2]));
35+
});
36+
37+
it('1 frame, 2 values each', () => {
38+
const queue = [[1, 1]].map(x => new Float32Array(x));
39+
expect(flattenQueue(queue)).toEqual(new Float32Array([1, 1]));
4740
});
4841
});
4942

5043
describeWithFlags('getInputTensorFromFrequencyData', testEnvs, () => {
51-
it('Unnormalized', () => {
44+
it('6 frames, 2 vals each', () => {
5245
const freqData = new Float32Array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]);
5346
const numFrames = 6;
5447
const fftSize = 2;
5548
const tensor =
56-
getInputTensorFromFrequencyData(freqData, numFrames, fftSize, false);
49+
getInputTensorFromFrequencyData(freqData, [1, numFrames, fftSize, 1]);
5750
tf.test_util.expectArraysClose(tensor, tf.tensor4d(freqData, [1, 6, 2, 1]));
5851
});
59-
60-
it('Normalized', () => {
61-
const freqData = new Float32Array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]);
62-
const numFrames = 6;
63-
const fftSize = 2;
64-
const tensor =
65-
getInputTensorFromFrequencyData(freqData, numFrames, fftSize);
66-
tf.test_util.expectArraysClose(
67-
tensor,
68-
tf.tensor4d(
69-
[
70-
-1.4638501, -1.4638501, -0.8783101, -0.8783101, -0.29277,
71-
-0.29277, 0.29277, 0.29277, 0.8783101, 0.8783101, 1.4638501,
72-
1.4638501
73-
],
74-
[1, 6, 2, 1]));
75-
});
7652
});
7753

7854
describeWithFlags('BrowserFftFeatureExtractor', testEnvs, () => {
@@ -95,7 +71,7 @@ describeWithFlags('BrowserFftFeatureExtractor', testEnvs, () => {
9571
});
9672

9773
expect(extractor.fftSize).toEqual(1024);
98-
expect(extractor.numFramesPerSpectrogram).toEqual(43);
74+
expect(extractor.numFrames).toEqual(43);
9975
expect(extractor.columnTruncateLength).toEqual(225);
10076
expect(extractor.overlapFactor).toBeCloseTo(0);
10177
});

speech-commands/src/browser_fft_recognizer.ts

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import * as tf from '@tensorflow/tfjs';
1919
import {BrowserFftFeatureExtractor, SpectrogramCallback} from './browser_fft_extractor';
20-
import {loadMetadataJson} from './browser_fft_utils';
20+
import {loadMetadataJson, normalize} from './browser_fft_utils';
2121
import {RecognizeConfig, RecognizerCallback, RecognizerParams, SpectrogramData, SpeechCommandRecognizer, SpeechCommandRecognizerResult, StreamingRecognitionConfig, TransferLearnConfig, TransferSpeechCommandRecognizer} from './types';
2222
import {version} from './version';
2323

@@ -41,7 +41,7 @@ export class BrowserFftSpeechCommandRecognizer implements
4141

4242
readonly MODEL_URL_PREFIX =
4343
`https://storage.googleapis.com/tfjs-models/tfjs/speech-commands/v${
44-
getMajorAndMinorVersion(version)}/browser_fft`;
44+
getMajorAndMinorVersion(version)}/browser_fft`;
4545

4646
private readonly SAMPLE_RATE_HZ = 44100;
4747
private readonly FFT_SIZE = 1024;
@@ -137,8 +137,9 @@ export class BrowserFftSpeechCommandRecognizer implements
137137
* @throws Error, if streaming recognition is already started or
138138
* if `config` contains invalid values.
139139
*/
140-
async listen(callback: RecognizerCallback,
141-
config?: StreamingRecognitionConfig): Promise<void> {
140+
async listen(
141+
callback: RecognizerCallback,
142+
config?: StreamingRecognitionConfig): Promise<void> {
142143
if (streaming) {
143144
throw new Error(
144145
'Cannot start streaming again when streaming is ongoing.');
@@ -183,21 +184,22 @@ export class BrowserFftSpeechCommandRecognizer implements
183184
const spectrogramCallback: SpectrogramCallback = async (x: tf.Tensor) => {
184185
await this.ensureModelWithEmbeddingOutputCreated();
185186

187+
const normalizedX = normalize(x);
186188
let y: tf.Tensor;
187189
let embedding: tf.Tensor;
188190
if (config.includeEmbedding) {
189191
await this.ensureModelWithEmbeddingOutputCreated();
190192
[y, embedding] =
191-
this.modelWithEmbeddingOutput.predict(x) as tf.Tensor[];
193+
this.modelWithEmbeddingOutput.predict(normalizedX) as tf.Tensor[];
192194
} else {
193-
y = this.model.predict(x) as tf.Tensor;
195+
y = this.model.predict(normalizedX) as tf.Tensor;
194196
}
195197

196198
const scores = await y.data() as Float32Array;
197199
const maxIndexTensor = y.argMax(-1);
198200
const maxIndex = (await maxIndexTensor.data())[0];
199201
const maxScore = Math.max(...scores);
200-
tf.dispose([y, maxIndexTensor]);
202+
tf.dispose([y, maxIndexTensor, normalizedX]);
201203

202204
if (maxScore < probabilityThreshold) {
203205
return false;
@@ -486,22 +488,25 @@ export class BrowserFftSpeechCommandRecognizer implements
486488

487489
if (config.includeSpectrogram) {
488490
output.spectrogram = {
489-
data: (input instanceof tf.Tensor ?
490-
await input.data() : input) as Float32Array,
491+
data: (input instanceof tf.Tensor ? await input.data() : input) as
492+
Float32Array,
491493
frameSize: this.nonBatchInputShape[1],
492494
};
493495
}
494496

495497
return output;
496498
}
497499

498-
protected async recognizeOnline(): Promise<SpectrogramData> {
500+
private async recognizeOnline(): Promise<SpectrogramData> {
499501
return new Promise<SpectrogramData>((resolve, reject) => {
500502
const spectrogramCallback: SpectrogramCallback = async (x: tf.Tensor) => {
503+
const normalizedX = normalize(x);
504+
await this.audioDataExtractor.stop();
501505
resolve({
502-
data: await x.data() as Float32Array,
506+
data: await normalizedX.data() as Float32Array,
503507
frameSize: this.nonBatchInputShape[1],
504508
});
509+
normalizedX.dispose();
505510
return false;
506511
};
507512
this.audioDataExtractor = new BrowserFftFeatureExtractor({
@@ -611,15 +616,17 @@ class TransferBrowserFftSpeechCommandRecognizer extends
611616
`learning example`);
612617

613618
streaming = true;
614-
return new Promise<SpectrogramData>((resolve, reject) => {
619+
return new Promise<SpectrogramData>(resolve => {
615620
const spectrogramCallback: SpectrogramCallback = async (x: tf.Tensor) => {
616621
if (this.transferExamples == null) {
617622
this.transferExamples = {};
618623
}
619624
if (this.transferExamples[word] == null) {
620625
this.transferExamples[word] = [];
621626
}
622-
this.transferExamples[word].push(x.clone());
627+
const normalizedX = normalize(x);
628+
this.transferExamples[word].push(normalizedX.clone());
629+
normalizedX.dispose();
623630
await this.audioDataExtractor.stop();
624631
streaming = false;
625632
this.collateTransferWords();

speech-commands/src/browser_fft_recognizer_test.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -775,8 +775,7 @@ describeWithFlags('Browser FFT recognizer', tf.test_util.NODE_ENVS, () => {
775775
setUpFakes();
776776
const base = new BrowserFftSpeechCommandRecognizer();
777777
await base.ensureModelLoaded();
778-
await base.listen(
779-
async (result: SpeechCommandRecognizerResult) => {});
778+
await base.listen(async (result: SpeechCommandRecognizerResult) => {});
780779
expect(base.isListening()).toEqual(true);
781780

782781
const transfer = base.createTransfer('xfer1');

speech-commands/src/browser_fft_utils.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,8 @@ export async function loadMetadataJson(url: string):
4040

4141
export function normalize(x: tf.Tensor): tf.Tensor {
4242
return tf.tidy(() => {
43-
const mean = tf.mean(x);
44-
const std = tf.sqrt(tf.mean(tf.square(tf.add(x, tf.neg(mean)))));
45-
return tf.div(tf.add(x, tf.neg(mean)), std);
43+
const {mean, variance} = tf.moments(x);
44+
return x.sub(mean).div(variance.sqrt());
4645
});
4746
}
4847

0 commit comments

Comments
 (0)