SphinxBase 5prealpha
fe.h
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
38/*
39 * fe.h
40 *
41 * $Log: fe.h,v $
42 * Revision 1.11 2005/02/05 02:15:02 egouvea
43 * Removed fe_process(), never used
44 *
45 * Revision 1.10 2004/12/10 16:48:55 rkm
46 * Added continuous density acoustic model handling
47 *
48 *
49 */
50
/*
 * Win32 compatibility shims: srand48()/lrand48() are mapped onto the C
 * library's srand()/rand(). GNUWINCE builds are excluded.
 * NOTE(review): rand() may cover a narrower range than lrand48() does on
 * other platforms -- confirm callers do not depend on the full range.
 */
#if defined(_WIN32) && !defined(GNUWINCE)
#define srand48(x) srand(x)
#define lrand48() rand()
#endif
55
56#ifndef _NEW_FE_H_
57#define _NEW_FE_H_
58
59/* Win32/WinCE DLL gunk */
60#include <sphinxbase/sphinxbase_export.h>
61
62#include <sphinxbase/cmd_ln.h>
63#include <sphinxbase/fixpoint.h>
64
65#ifdef __cplusplus
66extern "C" {
67#endif
68#if 0
69/* Fool Emacs. */
70}
71#endif
72
/*
 * Name of the build's byte order as a string: "big" when WORDS_BIGENDIAN
 * is defined, "little" otherwise. Used as the default of -input_endian.
 */
#ifdef WORDS_BIGENDIAN
#define NATIVE_ENDIAN "big"
#else
#define NATIVE_ENDIAN "little"
#endif
78
/*
 * Default values for the front-end options. Most of these are turned into
 * default strings with ARG_STRINGIFY() inside
 * waveform_to_cepstral_command_line_macro(), so they must remain plain
 * preprocessor literals (no parentheses, casts or enum constants).
 */

/** Default for -samprate (sampling rate). */
#define DEFAULT_SAMPLING_RATE 16000
/** Default for -frate (frame rate). */
#define DEFAULT_FRAME_RATE 100
/**
 * Default frame shift. Not referenced elsewhere in this header.
 * NOTE(review): equals DEFAULT_SAMPLING_RATE / DEFAULT_FRAME_RATE,
 * presumably a count of samples -- confirm.
 */
#define DEFAULT_FRAME_SHIFT 160
/** Default for -wlen (Hamming window length). */
#define DEFAULT_WINDOW_LENGTH 0.025625
/** Default for -nfft (size of FFT). */
#define DEFAULT_FFT_SIZE 512
/** Default for -ncep (number of cep coefficients). */
#define DEFAULT_NUM_CEPSTRA 13
/** Default for -nfilt (number of filter banks). */
#define DEFAULT_NUM_FILTERS 40

/** Default for -vad_prespeech. */
#define DEFAULT_PRE_SPEECH 20
/** Default for -vad_postspeech. */
#define DEFAULT_POST_SPEECH 50
/** Default for -vad_startspeech. */
#define DEFAULT_START_SPEECH 10

/** Default for -lowerf (lower edge of filters). */
#define DEFAULT_LOWER_FILT_FREQ 133.33334
/** Default for -upperf (upper edge of filters). */
#define DEFAULT_UPPER_FILT_FREQ 6855.4976
/** Default for -alpha (preemphasis parameter). */
#define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
/** Default for -warp_type (warping function type). */
#define DEFAULT_WARP_TYPE "inverse_linear"
/**
 * Default for -seed; a value less than zero means "pick our own".
 * Deliberately left unparenthesized: it is stringified for the option
 * table, where "(-1)" would not be a usable integer default.
 */
#define SEED -1
112
/**
 * Brace-enclosed initializers for the front-end command-line options.
 *
 * Each entry has four fields: option name, ARG_* type, default value as a
 * string (or NULL), and help text. The macro expands to a comma-separated
 * list with no trailing comma and no terminating entry, so it is meant to
 * be spliced into an option-table initializer.
 * NOTE(review): the element type is presumably arg_t from cmd_ln.h --
 * confirm.
 *
 * The last entry carries no line continuation, so the macro does not
 * depend on whatever line follows it.
 */
#define waveform_to_cepstral_command_line_macro() \
  { "-logspec", \
    ARG_BOOLEAN, \
    "no", \
    "Write out logspectral files instead of cepstra" }, \
  \
  { "-smoothspec", \
    ARG_BOOLEAN, \
    "no", \
    "Write out cepstral-smoothed logspectral files" }, \
  \
  { "-transform", \
    ARG_STRING, \
    "legacy", \
    "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
  \
  { "-alpha", \
    ARG_FLOAT32, \
    ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
    "Preemphasis parameter" }, \
  \
  { "-samprate", \
    ARG_FLOAT32, \
    ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
    "Sampling rate" }, \
  \
  { "-frate", \
    ARG_INT32, \
    ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
    "Frame rate" }, \
  \
  { "-wlen", \
    ARG_FLOAT32, \
    ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
    "Hamming window length" }, \
  \
  { "-nfft", \
    ARG_INT32, \
    ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
    "Size of FFT" }, \
  \
  { "-nfilt", \
    ARG_INT32, \
    ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
    "Number of filter banks" }, \
  \
  { "-lowerf", \
    ARG_FLOAT32, \
    ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
    "Lower edge of filters" }, \
  \
  { "-upperf", \
    ARG_FLOAT32, \
    ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
    "Upper edge of filters" }, \
  \
  { "-unit_area", \
    ARG_BOOLEAN, \
    "yes", \
    "Normalize mel filters to unit area" }, \
  \
  { "-round_filters", \
    ARG_BOOLEAN, \
    "yes", \
    "Round mel filter frequencies to DFT points" }, \
  \
  { "-ncep", \
    ARG_INT32, \
    ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
    "Number of cep coefficients" }, \
  \
  { "-doublebw", \
    ARG_BOOLEAN, \
    "no", \
    "Use double bandwidth filters (same center freq)" }, \
  \
  { "-lifter", \
    ARG_INT32, \
    "0", \
    "Length of sin-curve for liftering, or 0 for no liftering." }, \
  \
  { "-vad_prespeech", \
    ARG_INT32, \
    ARG_STRINGIFY(DEFAULT_PRE_SPEECH), \
    "Num of speech frames to keep before silence to speech." }, \
  \
  { "-vad_startspeech", \
    ARG_INT32, \
    ARG_STRINGIFY(DEFAULT_START_SPEECH), \
    "Num of speech frames to trigger vad from silence to speech." }, \
  \
  { "-vad_postspeech", \
    ARG_INT32, \
    ARG_STRINGIFY(DEFAULT_POST_SPEECH), \
    "Num of silence frames to keep after from speech to silence." }, \
  \
  { "-vad_threshold", \
    ARG_FLOAT32, \
    "2.0", \
    "Threshold for decision between noise and silence frames. Log-ratio between signal level and noise level." }, \
  \
  { "-input_endian", \
    ARG_STRING, \
    NATIVE_ENDIAN, \
    "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
  \
  { "-warp_type", \
    ARG_STRING, \
    DEFAULT_WARP_TYPE, \
    "Warping function type (or shape)" }, \
  \
  { "-warp_params", \
    ARG_STRING, \
    NULL, \
    "Parameters defining the warping function" }, \
  \
  { "-dither", \
    ARG_BOOLEAN, \
    "no", \
    "Add 1/2-bit noise" }, \
  \
  { "-seed", \
    ARG_INT32, \
    ARG_STRINGIFY(SEED), \
    "Seed for random number generator; if less than zero, pick our own" }, \
  \
  { "-remove_dc", \
    ARG_BOOLEAN, \
    "no", \
    "Remove DC offset from each frame" }, \
  \
  { "-remove_noise", \
    ARG_BOOLEAN, \
    "yes", \
    "Remove noise with spectral subtraction in mel-energies" }, \
  \
  { "-remove_silence", \
    ARG_BOOLEAN, \
    "yes", \
    "Enables VAD, removes silence frames from processing" }, \
  \
  { "-verbose", \
    ARG_BOOLEAN, \
    "no", \
    "Show input filenames" }
258
259
260#ifdef FIXED_POINT
262typedef fixed32 mfcc_t;
263
265#define FLOAT2MFCC(x) FLOAT2FIX(x)
267#define MFCC2FLOAT(x) FIX2FLOAT(x)
269#define MFCCMUL(a,b) FIXMUL(a,b)
270#define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
271#else /* !FIXED_POINT */
272
274typedef float32 mfcc_t;
276#define FLOAT2MFCC(x) (x)
278#define MFCC2FLOAT(x) (x)
280#define MFCCMUL(a,b) ((a)*(b))
281#define MFCCLN(x,in,out) log(x)
282#endif /* !FIXED_POINT */
283
/**
 * Structure for the front-end computation.
 * struct fe_s is only forward-declared here, so users of this header
 * handle it through fe_t pointers.
 */
typedef struct fe_s fe_t;
288
/**
 * Front-end status codes: zero means success, every error is negative.
 * FE_SUCCESS and FE_OUTPUT_FILE_SUCCESS intentionally share the value 0.
 */
enum fe_error_e {
    FE_SUCCESS = 0,
    FE_OUTPUT_FILE_SUCCESS = 0,
    FE_CONTROL_FILE_ERROR = -1,
    FE_START_ERROR = -2,
    FE_UNKNOWN_SINGLE_OR_BATCH = -3,
    FE_INPUT_FILE_OPEN_ERROR = -4,
    FE_INPUT_FILE_READ_ERROR = -5,
    FE_MEM_ALLOC_ERROR = -6,
    FE_OUTPUT_FILE_WRITE_ERROR = -7,
    FE_OUTPUT_FILE_OPEN_ERROR = -8,
    FE_ZERO_ENERGY_ERROR = -9,
    FE_INVALID_PARAM_ERROR = -10
};
306
314SPHINXBASE_EXPORT
315fe_t* fe_init_auto(void);
316
324SPHINXBASE_EXPORT
325arg_t const *fe_get_args(void);
326
337SPHINXBASE_EXPORT
338fe_t *fe_init_auto_r(cmd_ln_t *config);
339
347SPHINXBASE_EXPORT
348const cmd_ln_t *fe_get_config(fe_t *fe);
349
353SPHINXBASE_EXPORT
354void fe_start_stream(fe_t *fe);
355
360SPHINXBASE_EXPORT
361int fe_start_utt(fe_t *fe);
362
376SPHINXBASE_EXPORT
377int fe_get_output_size(fe_t *fe);
378
392SPHINXBASE_EXPORT
393void fe_get_input_size(fe_t *fe, int *out_frame_shift,
394 int *out_frame_size);
395
401SPHINXBASE_EXPORT
402uint8 fe_get_vad_state(fe_t *fe);
403
418SPHINXBASE_EXPORT
419int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
420
426SPHINXBASE_EXPORT
427fe_t *fe_retain(fe_t *fe);
428
436SPHINXBASE_EXPORT
437int fe_free(fe_t *fe);
438
439/*
440 * Do same as fe_process_frames, but also returns
441 * voiced audio. Output audio is valid till next
442 * fe_process_frames call.
443 *
444 * DO NOT MIX fe_process_frames calls
445 *
446 * @param voiced_spch Output: obtain voiced audio samples here
447 *
448 * @param voiced_spch_nsamps Output: shows voiced_spch length
449 *
450 * @param out_frameidx Output: index of the utterance start
451 */
452SPHINXBASE_EXPORT
453int fe_process_frames_ext(fe_t *fe,
454 int16 const **inout_spch,
455 size_t *inout_nsamps,
456 mfcc_t **buf_cep,
457 int32 *inout_nframes,
458 int16 *voiced_spch,
459 int32 *voiced_spch_nsamps,
460 int32 *out_frameidx);
461
511SPHINXBASE_EXPORT
512int fe_process_frames(fe_t *fe,
513 int16 const **inout_spch,
514 size_t *inout_nsamps,
515 mfcc_t **buf_cep,
516 int32 *inout_nframes,
517 int32 *out_frameidx);
518
534SPHINXBASE_EXPORT
535int fe_process_utt(fe_t *fe,
536 int16 const *spch,
537 size_t nsamps,
538 mfcc_t ***cep_block,
539 int32 *nframes
540 );
541
545SPHINXBASE_EXPORT
546void fe_free_2d(void *arr);
547
551SPHINXBASE_EXPORT
552int fe_mfcc_to_float(fe_t *fe,
553 mfcc_t **input,
554 float32 **output,
555 int32 nframes);
556
560SPHINXBASE_EXPORT
561int fe_float_to_mfcc(fe_t *fe,
562 float32 **input,
563 mfcc_t **output,
564 int32 nframes);
565
589SPHINXBASE_EXPORT
590int fe_logspec_to_mfcc(fe_t *fe,
591 const mfcc_t *fr_spec,
592 mfcc_t *fr_cep
593 );
594
603SPHINXBASE_EXPORT
604int fe_logspec_dct2(fe_t *fe,
605 const mfcc_t *fr_spec,
606 mfcc_t *fr_cep
607 );
608
617SPHINXBASE_EXPORT
618int fe_mfcc_dct3(fe_t *fe,
619 const mfcc_t *fr_cep,
620 mfcc_t *fr_spec
621 );
622
623#ifdef __cplusplus
624}
625#endif
626
627
628#endif
Command-line and other configuration parsing and handling.
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.
Definition: fe_internal.h:117