SphinxBase 5prealpha
cont_seg.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2013 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37/*
38 * cont_seg.c -- Continuously listen and segment input speech into utterances.
39 *
40 * HISTORY
41 *
42 * 05-Nov-13 Created from adseg and fileseg
43 *
44 */
45
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>

#if defined(_WIN32) && !defined(__CYGWIN__)
#include <windows.h>
#else
#include <sys/select.h>
#endif

#include <sphinxbase/prim_type.h>
#include <sphinxbase/ad.h>
#include <sphinxbase/fe.h>
#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/ckd_alloc.h>
#include <sphinxbase/err.h>
64
65#define BLOCKSIZE 1024
66
67static const arg_t cont_args_def[] = {
68 waveform_to_cepstral_command_line_macro(),
69 /* Argument file. */
70 {"-argfile",
72 NULL,
73 "Argument file giving extra arguments."},
74 {"-adcdev",
76 NULL,
77 "Name of audio device to use for input."},
78 {"-inmic",
80 "no",
81 "Transcribe audio from microphone."},
82 {"-infile",
84 NULL,
85 "Name of audio file to use for input."},
86 {"-singlefile",
88 FALSE,
89 "Write a single cleaned file."},
90 {NULL, 0, NULL, NULL}
91};
92
93static fe_t *fe;
94static cmd_ln_t *config;
95static int (*read_audio) (int16 * buf, int len);
96static ad_rec_t *ad;
97static const char *infile_path;
98static FILE *infile;
99static int32 singlefile;
100
101/* Sleep for specified msec */
102static void
103sleep_msec(int32 ms)
104{
105#if (defined(_WIN32) && !defined(GNUWINCE)) || defined(_WIN32_WCE)
106 Sleep(ms);
107#else
108 /* ------------------- Unix ------------------ */
109 struct timeval tmo;
110
111 tmo.tv_sec = 0;
112 tmo.tv_usec = ms * 1000;
113
114 select(0, NULL, NULL, NULL, &tmo);
115#endif
116}
117
118static int
119read_audio_file(int16 * buf, int len)
120{
121 if (!infile) {
122 E_FATAL("Failed to read audio from file\n");
123 return -1;
124 }
125 return fread(buf, sizeof(int16), len, infile);
126}
127
128static int
129read_audio_adev(int16 * buf, int len)
130{
131 int k;
132
133 if (!ad) {
134 E_FATAL("Failed to read audio from mic\n");
135 return -1;
136 }
137 while ((k = ad_read(ad, buf, len)) == 0)
138 /* wait until something is read */
139 sleep_msec(50);
140
141 return k;
142}
143
144void
145segment_audio()
146{
147 FILE *file;
148 int16 pcm_buf[BLOCKSIZE];
149 mfcc_t **cep_buf;
150 int16 *voiced_buf = NULL;
151 int32 voiced_nsamps, out_frameidx, uttstart = 0;
152 char file_name[1024];
153 uint8 cur_vad_state, vad_state, writing;
154 int uttno, uttlen, sample_rate;
155 int32 nframes, nframes_tmp;
156 int16 frame_size, frame_shift, frame_rate;
157 size_t k;
158
159 sample_rate = (int) cmd_ln_float32_r(config, "-samprate");
160 frame_rate = cmd_ln_int32_r(config, "-frate");
161 frame_size =
162 (int32) (cmd_ln_float32_r(config, "-wlen") * sample_rate + 0.5);
163 frame_shift =
164 (int32) (sample_rate / cmd_ln_int32_r(config, "-frate") + 0.5);
165 nframes = (BLOCKSIZE - frame_size) / frame_shift;
166 cep_buf =
167 (mfcc_t **) ckd_calloc_2d(nframes, fe_get_output_size(fe),
168 sizeof(mfcc_t));
169
170 uttno = 0;
171 uttlen = 0;
172 cur_vad_state = 0;
173 voiced_nsamps = 0;
174 writing = 0;
175 file = NULL;
176 fe_start_stream(fe);
177 fe_start_utt(fe);
178 while ((k = read_audio(pcm_buf, BLOCKSIZE)) > 0) {
179 int16 const *pcm_buf_tmp;
180 pcm_buf_tmp = &pcm_buf[0];
181 while (k) {
182 nframes_tmp = nframes;
183 fe_process_frames_ext(fe, &pcm_buf_tmp, &k, cep_buf,
184 &nframes_tmp, voiced_buf,
185 &voiced_nsamps, &out_frameidx);
186 if (out_frameidx > 0) {
187 uttstart = out_frameidx;
188 }
189 vad_state = fe_get_vad_state(fe);
190 if (!cur_vad_state && vad_state) {
191 /* silence->speech transition, time to start new file */
192 uttno++;
193 if (!singlefile) {
194 sprintf(file_name, "%s%04d.raw", infile_path, uttno);
195 if ((file = fopen(file_name, "wb")) == NULL)
196 E_FATAL_SYSTEM("Failed to open '%s' for writing",
197 file_name);
198 } else {
199 sprintf(file_name, "%s.raw", infile_path);
200 if ((file = fopen(file_name, "ab")) == NULL)
201 E_FATAL_SYSTEM("Failed to open '%s' for writing",
202 file_name);
203 }
204 writing = 1;
205 }
206
207 if (writing && file && voiced_nsamps > 0) {
208 fwrite(voiced_buf, sizeof(int16), voiced_nsamps, file);
209 uttlen += voiced_nsamps;
210 }
211
212 if (cur_vad_state && !vad_state) {
213 /* speech -> silence transition, time to finish file */
214 fclose(file);
215 printf("Utterance %04d: file %s start %.1f sec length %d samples ( %.2f sec )\n",
216 uttno,
217 file_name,
218 ((double) uttstart) / frame_rate,
219 uttlen,
220 ((double) uttlen) / sample_rate);
221 fflush(stdout);
222 fe_end_utt(fe, cep_buf[0], &nframes_tmp);
223 writing = 0;
224 uttlen = 0;
225 voiced_nsamps = 0;
226 fe_start_utt(fe);
227 }
228 cur_vad_state = vad_state;
229 }
230 }
231
232 if (writing) {
233 fclose(file);
234 printf("Utterance %04d: file %s start %.1f sec length %d samples ( %.2f sec )\n",
235 uttno,
236 file_name,
237 ((double) uttstart) / frame_rate,
238 uttlen,
239 ((double) uttlen) / sample_rate);
240 fflush(stdout);
241 }
242 fe_end_utt(fe, cep_buf[0], &nframes);
243 ckd_free_2d(cep_buf);
244}
245
246int
247main(int argc, char *argv[])
248{
249 int i;
250 int16 buf[2048];
251
252 config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE);
253
254 if (config && cmd_ln_str_r(config, "-argfile"))
255 config = cmd_ln_parse_file_r(config, cont_args_def,
256 cmd_ln_str_r(config, "-argfile"), FALSE);
257
258 if (config == NULL || (cmd_ln_str_r(config, "-infile") == NULL && cmd_ln_boolean_r(config, "-inmic") == FALSE)) {
259 E_INFO("Specify '-infile <file.wav>' to segment a file or '-inmic yes' to segment audio from microphone.\n");
260 cmd_ln_free_r(config);
261 return 1;
262 }
263
264
265 singlefile = cmd_ln_boolean_r(config, "-singlefile");
266 if ((infile_path = cmd_ln_str_r(config, "-infile")) != NULL) {
267 if ((infile = fopen(infile_path, "rb")) == NULL) {
268 E_FATAL_SYSTEM("Failed to read audio from '%s'", infile_path);
269 return 1;
270 }
271 read_audio = &read_audio_file;
272 /* skip wav header */
273 read_audio(buf, 44);
274 }
275 else if cmd_ln_boolean_r(config, "-inmic") {
276 if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
277 (int) cmd_ln_float32_r(config,
278 "-samprate"))) ==
279 NULL) {
280 E_FATAL("Failed to open audio device\n");
281 return 1;
282 }
283 read_audio = &read_audio_adev;
284 printf("Start recording ...\n");
285 fflush(stdout);
286 if (ad_start_rec(ad) < 0)
287 E_FATAL("Failed to start recording\n");
288
289 /* TODO remove this thing */
290 for (i = 0; i < 5; i++) {
291 sleep_msec(200);
292 read_audio(buf, 2048);
293 }
294 printf("You may speak now\n");
295 fflush(stdout);
296 }
297
298 fe = fe_init_auto_r(config);
299 if (fe == NULL)
300 return 1;
301
302 segment_audio();
303
304 if (ad)
305 ad_close(ad);
306 if (infile)
307 fclose(infile);
308
309 fe_free(fe);
310 cmd_ln_free_r(config);
311 return 0;
312}
generic live audio interface for recording and playback
SPHINXBASE_EXPORT ad_rec_t * ad_open_dev(const char *dev, int32 samples_per_sec)
Open a specific audio device for recording.
Definition: ad_alsa.c:187
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1046
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by delimiting on " \r\t\n" and putting each token into an argv[] for cmd_ln_parse().
Definition: cmd_ln.c:764
#define ARG_STRING
String argument (optional).
Definition: cmd_ln.h:114
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition: cmd_ln.h:118
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:556
Implementation of logging routines.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
#define E_FATAL_SYSTEM(...)
Print error text; Call perror(""); exit(errno);.
Definition: err.h:90
Basic type definitions used in Sphinx.
Audio recording structure.
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.
Definition: fe_internal.h:117