SphinxBase 5prealpha
fe_noise.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2013 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38/* This noise removal algorithm is inspired by the following papers
39 * Computationally Efficient Speech Enchancement by Spectral Minina Tracking
40 * by G. Doblinger
41 *
42 * Power-Normalized Cepstral Coefficients (PNCC) for Robust Speech Recognition
43 * by C. Kim.
44 *
45 * For the recent research and state of art see papers about IMRCA and
46 * A Minimum-Mean-Square-Error Noise Reduction Algorithm On Mel-Frequency
47 * Cepstra For Robust Speech Recognition by Dong Yu and others
48 */
49
50#ifdef HAVE_CONFIG_H
51#include <config.h>
52#endif
53
54#include <math.h>
55
58#include "sphinxbase/strfuncs.h"
59#include "sphinxbase/err.h"
60
61#include "fe_noise.h"
62#include "fe_internal.h"
63
64/* Noise supression constants */
65#define SMOOTH_WINDOW 4
66#define LAMBDA_POWER 0.7
67#define LAMBDA_A 0.995
68#define LAMBDA_B 0.5
69#define LAMBDA_T 0.85
70#define MU_T 0.2
71#define MAX_GAIN 20
72#define SLOW_PEAK_FORGET_FACTOR 0.9995
73#define SLOW_PEAK_LEARN_FACTOR 0.9
74#define SPEECH_VOLUME_RANGE 8.0
75
76/* define VAD_DEBUG 1 */
77#ifdef VAD_DEBUG
78static FILE *vad_stats;
79static int64 low_snr = 0;
80static int64 low_volume = 0;
81#endif
82
84 /* Smoothed power */
85 powspec_t *power;
86 /* Noise estimate */
87 powspec_t *noise;
88 /* Signal floor estimate */
89 powspec_t *floor;
90 /* Peak for temporal masking */
91 powspec_t *peak;
92
93 /* Initialize it next time */
94 uint8 undefined;
95 /* Number of items to process */
96 uint32 num_filters;
97
98 /* Sum of slow peaks for VAD */
99 powspec_t slow_peak_sum;
100
101 /* Precomputed constants */
102 powspec_t lambda_power;
103 powspec_t comp_lambda_power;
104 powspec_t lambda_a;
105 powspec_t comp_lambda_a;
106 powspec_t lambda_b;
107 powspec_t comp_lambda_b;
108 powspec_t lambda_t;
109 powspec_t mu_t;
110 powspec_t max_gain;
111 powspec_t inv_max_gain;
112
113 powspec_t smooth_scaling[2 * SMOOTH_WINDOW + 3];
114};
115
116static void
117fe_lower_envelope(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * floor_buf, int32 num_filt)
118{
119 int i;
120
121 for (i = 0; i < num_filt; i++) {
122#ifndef FIXED_POINT
123 if (buf[i] >= floor_buf[i]) {
124 floor_buf[i] =
125 noise_stats->lambda_a * floor_buf[i] + noise_stats->comp_lambda_a * buf[i];
126 }
127 else {
128 floor_buf[i] =
129 noise_stats->lambda_b * floor_buf[i] + noise_stats->comp_lambda_b * buf[i];
130 }
131#else
132 if (buf[i] >= floor_buf[i]) {
133 floor_buf[i] = fe_log_add(noise_stats->lambda_a + floor_buf[i],
134 noise_stats->comp_lambda_a + buf[i]);
135 }
136 else {
137 floor_buf[i] = fe_log_add(noise_stats->lambda_b + floor_buf[i],
138 noise_stats->comp_lambda_b + buf[i]);
139 }
140#endif
141 }
142}
143
144/* update slow peaks, check if max signal level big enough compared to peak */
145static int16
146fe_is_frame_quiet(noise_stats_t *noise_stats, powspec_t *buf, int32 num_filt)
147{
148 int i;
149 int16 is_quiet;
150 powspec_t sum;
151 double smooth_factor;
152
153 sum = 0.0;
154 for (i = 0; i < num_filt; i++) {
155#ifndef FIXED_POINT
156 sum += buf[i];
157#else
158 sum = fe_log_add(sum, buf[i]);
159#endif
160 }
161#ifndef FIXED_POINT
162 sum = log(sum);
163#endif
164 smooth_factor = (sum > noise_stats->slow_peak_sum) ? SLOW_PEAK_LEARN_FACTOR : SLOW_PEAK_FORGET_FACTOR;
165 noise_stats->slow_peak_sum = noise_stats->slow_peak_sum * smooth_factor +
166 sum * (1 - smooth_factor);
167
168#ifdef VAD_DEBUG
169#ifndef FIXED_POINT
170 fprintf(vad_stats, "%.3f %.3f ", noise_stats->slow_peak_sum, sum);
171#else
172 fprintf(vad_stats, "%d %d ", noise_stats->slow_peak_sum, sum);
173#endif
174#endif
175#ifndef FIXED_POINT
176 is_quiet = noise_stats->slow_peak_sum - SPEECH_VOLUME_RANGE > sum;
177#else
178 is_quiet = noise_stats->slow_peak_sum - FLOAT2FIX(SPEECH_VOLUME_RANGE) > sum;
179#endif
180 return is_quiet;
181}
182
183/* temporal masking */
184static void
185fe_temp_masking(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * peak, int32 num_filt)
186{
187 powspec_t cur_in;
188 int i;
189
190 for (i = 0; i < num_filt; i++) {
191 cur_in = buf[i];
192
193#ifndef FIXED_POINT
194 peak[i] *= noise_stats->lambda_t;
195 if (buf[i] < noise_stats->lambda_t * peak[i])
196 buf[i] = peak[i] * noise_stats->mu_t;
197#else
198 peak[i] += noise_stats->lambda_t;
199 if (buf[i] < noise_stats->lambda_t + peak[i])
200 buf[i] = peak[i] + noise_stats->mu_t;
201#endif
202
203 if (cur_in > peak[i])
204 peak[i] = cur_in;
205 }
206}
207
208/* spectral weight smoothing */
209static void
210fe_weight_smooth(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * coefs, int32 num_filt)
211{
212 int i, j;
213 int l1, l2;
214 powspec_t coef;
215
216 for (i = 0; i < num_filt; i++) {
217 l1 = ((i - SMOOTH_WINDOW) > 0) ? (i - SMOOTH_WINDOW) : 0;
218 l2 = ((i + SMOOTH_WINDOW) <
219 (num_filt - 1)) ? (i + SMOOTH_WINDOW) : (num_filt - 1);
220
221#ifndef FIXED_POINT
222 coef = 0;
223 for (j = l1; j <= l2; j++) {
224 coef += coefs[j];
225 }
226 buf[i] = buf[i] * (coef / (l2 - l1 + 1));
227#else
228 coef = MIN_FIXLOG;
229 for (j = l1; j <= l2; j++) {
230 coef = fe_log_add(coef, coefs[j]);
231 }
232 buf[i] = buf[i] + coef - noise_stats->smooth_scaling[l2 - l1 + 1];
233#endif
234
235 }
236}
237
239fe_init_noisestats(int num_filters)
240{
241 int i;
242 noise_stats_t *noise_stats;
243
244 noise_stats = (noise_stats_t *) ckd_calloc(1, sizeof(noise_stats_t));
245
246 noise_stats->power =
247 (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
248 noise_stats->noise =
249 (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
250 noise_stats->floor =
251 (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
252 noise_stats->peak =
253 (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
254
255 noise_stats->undefined = TRUE;
256 noise_stats->num_filters = num_filters;
257
258#ifndef FIXED_POINT
259 noise_stats->lambda_power = LAMBDA_POWER;
260 noise_stats->comp_lambda_power = 1 - LAMBDA_POWER;
261 noise_stats->lambda_a = LAMBDA_A;
262 noise_stats->comp_lambda_a = 1 - LAMBDA_A;
263 noise_stats->lambda_b = LAMBDA_B;
264 noise_stats->comp_lambda_b = 1 - LAMBDA_B;
265 noise_stats->lambda_t = LAMBDA_T;
266 noise_stats->mu_t = MU_T;
267 noise_stats->max_gain = MAX_GAIN;
268 noise_stats->inv_max_gain = 1.0 / MAX_GAIN;
269
270 for (i = 1; i < 2 * SMOOTH_WINDOW + 1; i++) {
271 noise_stats->smooth_scaling[i] = 1.0 / i;
272 }
273#else
274 noise_stats->lambda_power = FLOAT2FIX(log(LAMBDA_POWER));
275 noise_stats->comp_lambda_power = FLOAT2FIX(log(1 - LAMBDA_POWER));
276 noise_stats->lambda_a = FLOAT2FIX(log(LAMBDA_A));
277 noise_stats->comp_lambda_a = FLOAT2FIX(log(1 - LAMBDA_A));
278 noise_stats->lambda_b = FLOAT2FIX(log(LAMBDA_B));
279 noise_stats->comp_lambda_b = FLOAT2FIX(log(1 - LAMBDA_B));
280 noise_stats->lambda_t = FLOAT2FIX(log(LAMBDA_T));
281 noise_stats->mu_t = FLOAT2FIX(log(MU_T));
282 noise_stats->max_gain = FLOAT2FIX(log(MAX_GAIN));
283 noise_stats->inv_max_gain = FLOAT2FIX(log(1.0 / MAX_GAIN));
284
285 for (i = 1; i < 2 * SMOOTH_WINDOW + 3; i++) {
286 noise_stats->smooth_scaling[i] = FLOAT2FIX(log(i));
287 }
288#endif
289
290#ifdef VAD_DEBUG
291 vad_stats = fopen("vad_debug", "w");
292#endif
293
294 return noise_stats;
295}
296
297void
298fe_reset_noisestats(noise_stats_t * noise_stats)
299{
300 if (noise_stats)
301 noise_stats->undefined = TRUE;
302}
303
304void
305fe_free_noisestats(noise_stats_t * noise_stats)
306{
307 ckd_free(noise_stats->power);
308 ckd_free(noise_stats->noise);
309 ckd_free(noise_stats->floor);
310 ckd_free(noise_stats->peak);
311 ckd_free(noise_stats);
312#ifdef VAD_DEBUG
313 fclose(vad_stats);
314 E_INFO("Low SNR [%ld] frames; Low volume [%ld] frames\n", (long)low_snr, (long)low_volume);
315#endif
316
317}
318
323void
324fe_track_snr(fe_t * fe, int32 *in_speech)
325{
326 powspec_t *signal;
327 powspec_t *gain;
328 noise_stats_t *noise_stats;
329 powspec_t *mfspec;
330 int32 i, num_filts;
331 int16 is_quiet;
332 powspec_t lrt, snr;
333
334 if (!(fe->remove_noise || fe->remove_silence)) {
335 *in_speech = TRUE;
336 return;
337 }
338
339 noise_stats = fe->noise_stats;
340 mfspec = fe->mfspec;
341 num_filts = noise_stats->num_filters;
342
343 signal = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t));
344
345 if (noise_stats->undefined) {
346 noise_stats->slow_peak_sum = FIX2FLOAT(0.0);
347 for (i = 0; i < num_filts; i++) {
348 noise_stats->power[i] = mfspec[i];
349#ifndef FIXED_POINT
350 noise_stats->noise[i] = mfspec[i] / noise_stats->max_gain;
351 noise_stats->floor[i] = mfspec[i] / noise_stats->max_gain;
352 noise_stats->peak[i] = 0.0;
353#else
354 noise_stats->noise[i] = mfspec[i] - noise_stats->max_gain;;
355 noise_stats->floor[i] = mfspec[i] - noise_stats->max_gain;
356 noise_stats->peak[i] = MIN_FIXLOG;
357#endif
358 }
359 noise_stats->undefined = FALSE;
360 }
361
362 /* Calculate smoothed power */
363 for (i = 0; i < num_filts; i++) {
364#ifndef FIXED_POINT
365 noise_stats->power[i] =
366 noise_stats->lambda_power * noise_stats->power[i] + noise_stats->comp_lambda_power * mfspec[i];
367#else
368 noise_stats->power[i] = fe_log_add(noise_stats->lambda_power + noise_stats->power[i],
369 noise_stats->comp_lambda_power + mfspec[i]);
370#endif
371 }
372
373 /* Noise estimation and vad decision */
374 fe_lower_envelope(noise_stats, noise_stats->power, noise_stats->noise, num_filts);
375
376 lrt = FLOAT2FIX(0.0);
377 for (i = 0; i < num_filts; i++) {
378#ifndef FIXED_POINT
379 signal[i] = noise_stats->power[i] - noise_stats->noise[i];
380 if (signal[i] < 1.0)
381 signal[i] = 1.0;
382 snr = log(noise_stats->power[i] / noise_stats->noise[i]);
383#else
384 signal[i] = fe_log_sub(noise_stats->power[i], noise_stats->noise[i]);
385 snr = noise_stats->power[i] - noise_stats->noise[i];
386#endif
387 if (snr > lrt)
388 lrt = snr;
389 }
390 is_quiet = fe_is_frame_quiet(noise_stats, signal, num_filts);
391
392#ifdef VAD_DEBUG
393 if (lrt < fe->vad_threshold)
394 low_snr++;
395 else if (is_quiet)
396 low_volume++;
397#endif
398
399#ifndef FIXED_POINT
400 if (fe->remove_silence && (lrt < fe->vad_threshold || is_quiet)) {
401#else
402 if (fe->remove_silence && (lrt < FLOAT2FIX(fe->vad_threshold) || is_quiet)) {
403#endif
404 *in_speech = FALSE;
405 } else {
406 *in_speech = TRUE;
407 }
408
409#ifdef VAD_DEBUG
410#ifndef FIXED_POINT
411 fprintf(vad_stats, "%.3f %d\n", lrt, *in_speech);
412#else
413 fprintf(vad_stats, "%d %d\n", lrt, *in_speech);
414#endif
415#endif
416
417 fe_lower_envelope(noise_stats, signal, noise_stats->floor, num_filts);
418
419 fe_temp_masking(noise_stats, signal, noise_stats->peak, num_filts);
420
421 if (!fe->remove_noise) {
422 /* no need for further calculations if noise cancellation disabled */
423 ckd_free(signal);
424 return;
425 }
426
427 for (i = 0; i < num_filts; i++) {
428 if (signal[i] < noise_stats->floor[i])
429 signal[i] = noise_stats->floor[i];
430 }
431
432 gain = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t));
433#ifndef FIXED_POINT
434 for (i = 0; i < num_filts; i++) {
435 if (signal[i] < noise_stats->max_gain * noise_stats->power[i])
436 gain[i] = signal[i] / noise_stats->power[i];
437 else
438 gain[i] = noise_stats->max_gain;
439 if (gain[i] < noise_stats->inv_max_gain)
440 gain[i] = noise_stats->inv_max_gain;
441 }
442#else
443 for (i = 0; i < num_filts; i++) {
444 gain[i] = signal[i] - noise_stats->power[i];
445 if (gain[i] > noise_stats->max_gain)
446 gain[i] = noise_stats->max_gain;
447 if (gain[i] < noise_stats->inv_max_gain)
448 gain[i] = noise_stats->inv_max_gain;
449 }
450#endif
451
452 /* Weight smoothing and time frequency normalization */
453 fe_weight_smooth(noise_stats, mfspec, gain, num_filts);
454
455 ckd_free(gain);
456 ckd_free(signal);
457}
458
459void
460fe_vad_hangover(fe_t * fe, mfcc_t * feat, int32 is_speech, int32 store_pcm)
461{
462 if (!fe->vad_data->in_speech) {
463 fe_prespch_write_cep(fe->vad_data->prespch_buf, feat);
464 if (store_pcm)
465 fe_prespch_write_pcm(fe->vad_data->prespch_buf, fe->spch);
466 }
467
468 /* track vad state and deal with cepstrum prespeech buffer */
469 if (is_speech) {
470 fe->vad_data->post_speech_frames = 0;
471 if (!fe->vad_data->in_speech) {
472 fe->vad_data->pre_speech_frames++;
473 /* check for transition sil->speech */
474 if (fe->vad_data->pre_speech_frames >= fe->start_speech) {
475 fe->vad_data->pre_speech_frames = 0;
476 fe->vad_data->in_speech = 1;
477 }
478 }
479 } else {
480 fe->vad_data->pre_speech_frames = 0;
481 if (fe->vad_data->in_speech) {
482 fe->vad_data->post_speech_frames++;
483 /* check for transition speech->sil */
484 if (fe->vad_data->post_speech_frames >= fe->post_speech) {
485 fe->vad_data->post_speech_frames = 0;
486 fe->vad_data->in_speech = 0;
487 fe_prespch_reset_cep(fe->vad_data->prespch_buf);
488 fe_prespch_reset_pcm(fe->vad_data->prespch_buf);
489 }
490 }
491 }
492}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
Implementation of logging routines.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
Basic type definitions used in Sphinx.
Miscellaneous useful string functions.
Structure for the front-end computation.
Definition: fe_internal.h:117