SphinxBase 5prealpha
sphinx_fe.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37#include <stdio.h>
38#include <stdlib.h>
39#include <string.h>
40#include <time.h>
41#include <assert.h>
42
43#ifdef HAVE_CONFIG_H
44#include <config.h>
45#endif
46
47#include <sphinxbase/fe.h>
48#include <sphinxbase/strfuncs.h>
49#include <sphinxbase/pio.h>
50#include <sphinxbase/filename.h>
51#include <sphinxbase/cmd_ln.h>
52#include <sphinxbase/err.h>
54#include <sphinxbase/byteorder.h>
56
57#include "sphinx_wave2feat.h"
58#include "cmd_ln_defn.h"
59
60typedef struct audio_type_s {
61 char const *name;
62 int (*detect)(sphinx_wave2feat_t *wtf);
63 int (*decode)(sphinx_wave2feat_t *wtf);
65
66typedef struct output_type_s {
67 char const *name;
68 int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
69 int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
71
76 char *infile;
77 char *outfile;
78 FILE *infh;
79 FILE *outfh;
80 short *audio;
81 mfcc_t **feat;
84 int veclen;
88};
89
91typedef struct RIFFHeader{
92 char rifftag[4]; /* "RIFF" string */
93 int32 TotalLength; /* Total length */
94 char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
95 int32 RemainingLength; /* Remaining length */
96 int16 data_format; /* data format tag, 1 = PCM */
97 int16 numchannels; /* Number of channels in file */
98 int32 SamplingFreq; /* Sampling frequency */
99 int32 BytesPerSec; /* Average bytes/sec */
100 int16 BlockAlign; /* Block align */
101 int16 BitsPerSample; /* 8 or 16 bit */
102 char datatag[4]; /* "data" string */
103 int32 datalength; /* Raw data length */
104} MSWAV_hdr;
105
111static int
112detect_riff(sphinx_wave2feat_t *wtf)
113{
114 FILE *fh;
115 MSWAV_hdr hdr;
116
117 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
118 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
119 return -1;
120 }
121 if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
122 E_ERROR("Failed to read RIFF header");
123 fclose(fh);
124 return -1;
125 }
126 /* Make sure it is actually a RIFF file. */
127 if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
128 fclose(fh);
129 return FALSE;
130 }
131 if (cmd_ln_int32_r(wtf->config, "-nchans") != hdr.numchannels) {
132 E_ERROR("Number of channels %d does not match configured value in file '%s'\n", hdr.numchannels, wtf->infile);
133 fclose(fh);
134 return -1;
135 }
136 if (cmd_ln_float32_r(wtf->config, "-samprate") != hdr.SamplingFreq) {
137 E_ERROR("Sample rate %.1f does not match configured value in file '%s'\n", hdr.SamplingFreq, wtf->infile);
138 fclose(fh);
139 return -1;
140 }
141 wtf->infh = fh;
142
143 return TRUE;
144}
145
146static int
147open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
148{
149 char nist[7];
150 lineiter_t *li;
151 FILE *fh;
152
153 if ((fh = fopen(infile, "rb")) == NULL) {
154 E_ERROR_SYSTEM("Failed to open %s", infile);
155 return -1;
156 }
157 if (fread(&nist, 1, 7, fh) != 7) {
158 E_ERROR_SYSTEM("Failed to read NIST header");
159 fclose(fh);
160 return -1;
161 }
162 /* Is this actually a NIST file? */
163 if (0 != strncmp(nist, "NIST_1A", 7)) {
164 fclose(fh);
165 return FALSE;
166 }
167 /* Rewind, parse lines. */
168 fseek(fh, 0, SEEK_SET);
169 for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
170 char **words;
171 int nword;
172
173 string_trim(li->buf, STRING_BOTH);
174 if (strlen(li->buf) == 0) {
175 lineiter_free(li);
176 break;
177 }
178 nword = str2words(li->buf, NULL, 0);
179 if (nword != 3)
180 continue;
181 words = (char **)ckd_calloc(nword, sizeof(*words));
182 str2words(li->buf, words, nword);
183 if (0 == strcmp(words[0], "sample_rate")) {
184 float samprate = atof_c(words[2]);
185 if (cmd_ln_float32_r(wtf->config, "-samprate") != samprate) {
186 E_ERROR("Sample rate %.1f does not match configured value in file '%s'\n", samprate, infile);
187 lineiter_free(li);
188 fclose(fh);
189 return -1;
190 }
191 }
192 if (0 == strcmp(words[0], "channel_count")) {
193 int nchans = atoi(words[2]);
194 if (cmd_ln_int32_r(wtf->config, "-nchans") != nchans) {
195 E_ERROR("Number of channels %d does not match configured value in file '%s'\n", nchans, infile);
196 lineiter_free(li);
197 fclose(fh);
198 return -1;
199 }
200 }
201 if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
202 const char *endian = (0 == strcmp(words[2], "10")) ? "big" : "little";
203 if (0 != strcmp(cmd_ln_str_r(wtf->config, "-input_endian"), endian)) {
204 E_ERROR("Input endian %s does not match configured value in file '%s'\n", endian, infile);
205 lineiter_free(li);
206 fclose(fh);
207 return -1;
208 }
209 }
210 ckd_free(words);
211 }
212
213 fseek(fh, 1024, SEEK_SET);
214 if (out_fh)
215 *out_fh = fh;
216 else
217 fclose(fh);
218 return TRUE;
219}
220
221#ifdef HAVE_POPEN
222static int
223detect_sph2pipe(sphinx_wave2feat_t *wtf)
224{
225 FILE *fh;
226 char *cmdline;
227 int rv;
228
229 /* Determine if it's NIST file and get parameters. */
230 if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
231 return rv;
232
233 /* Now popen it with sph2pipe. */
234 cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
235 if ((fh = popen(cmdline, "r")) == NULL) {
236 E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
237 ckd_free(cmdline);
238 return -1;
239 }
240
241 wtf->infh = fh;
242 return TRUE;
243}
244#else /* !HAVE_POPEN */
245static int
246detect_sph2pipe(sphinx_wave2feat_t *wtf)
247{
248 E_ERROR("popen() not available, cannot run sph2pipe\n");
249 return -1;
250}
251#endif /* !HAVE_POPEN */
252
258static int
259detect_nist(sphinx_wave2feat_t *wtf)
260{
261 FILE *fh;
262 int rv;
263
264 if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
265 return rv;
266 wtf->infh = fh;
267
268 return TRUE;
269}
270
271
278static int
279detect_raw(sphinx_wave2feat_t *wtf)
280{
281 FILE *fh;
282
283 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
284 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
285 return -1;
286 }
287 wtf->infh = fh;
288 return TRUE;
289}
290
297static int
298detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
299{
300 FILE *fh;
301 int32 len;
302 long flen;
303
304 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
305 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
306 return -1;
307 }
308 if (fread(&len, 4, 1, fh) != 1) {
309 E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
310 fclose(fh);
311 return -1;
312 }
313 fseek(fh, 0, SEEK_END);
314 flen = ftell(fh);
315
316 /* figure out whether to byteswap */
317 flen = (flen / 4) - 1;
318 if (flen != len) {
319 /* First make sure this is an endianness problem, otherwise fail. */
320 SWAP_INT32(&len);
321 if (flen != len) {
322 SWAP_INT32(&len);
323 E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
324 len, flen);
325 return -1;
326 }
327 /* Set the input endianness to the opposite of the machine endianness... */
328 cmd_ln_set_str_r(wtf->config, "-input_endian",
329 (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
330 ? "little" : "big"));
331 }
332
333 fseek(fh, 4, SEEK_SET);
334 wtf->infh = fh;
335 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
336 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
337 }
338 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
339 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
340 wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
341 }
342 else {
343 /* Should not happen. */
344 E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
345 assert(FALSE);
346 }
347
348 return TRUE;
349}
350
351int
352mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
353{
354 int i, j;
355
356 if (whichchan > 0) {
357 for (i = whichchan - 1; i < nsamp; i += nchans)
358 buf[i/nchans] = buf[i];
359 }
360 else {
361 for (i = 0; i < nsamp; i += nchans) {
362 float64 tmp = 0.0;
363 for (j = 0; j < nchans && i + j < nsamp; ++j) {
364 tmp += buf[i + j];
365 }
366 buf[i/nchans] = (int16)(tmp / nchans);
367 }
368 }
369 return i/nchans;
370}
371
376static int
377decode_pcm(sphinx_wave2feat_t *wtf)
378{
379 size_t nsamp;
380 int32 n, nfr, nchans, whichchan;
381 uint32 nfloat;
382
383 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
384 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
385 fe_start_stream(wtf->fe);
386 fe_start_utt(wtf->fe);
387 nfloat = 0;
388 while ((nsamp = fread(wtf->audio, sizeof(int16), wtf->blocksize, wtf->infh)) != 0) {
389 size_t nvec;
390 int16 const *inspeech;
391
392 /* Byteswap stuff here if necessary. */
393 if (wtf->byteswap) {
394 for (n = 0; n < nsamp; ++n)
395 SWAP_INT16(wtf->audio + n);
396 }
397
398 /* Mix or pick channels. */
399 if (nchans > 1)
400 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
401
402 inspeech = wtf->audio;
403 nvec = wtf->featsize;
404 /* Consume all samples. */
405 while (nsamp) {
406 nfr = nvec;
407 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr, NULL);
408 if (nfr) {
409 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
410 return -1;
411 nfloat += n;
412 }
413 }
414 inspeech = wtf->audio;
415 }
416 /* Now process any leftover audio frames. */
417 fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
418 if (nfr) {
419 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
420 return -1;
421 nfloat += n;
422 }
423
424 if (fclose(wtf->infh) == EOF)
425 E_ERROR_SYSTEM("Failed to close input file");
426 wtf->infh = NULL;
427 return nfloat;
428}
429
434static int
435decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
436{
437 int nfloat = 0, n;
438 int featsize = wtf->featsize;
439
440 /* If the input vector length is less than the output length, we
441 * need to do this one frame at a time, because there's empty
442 * space at the end of each vector in wtf->feat. */
443 if (wtf->in_veclen < wtf->veclen)
444 featsize = 1;
445 while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
446 featsize * wtf->in_veclen, wtf->infh)) != 0) {
447 int i, nfr = n / wtf->in_veclen;
448 if (n % wtf->in_veclen) {
449 E_ERROR("Size of file %d not a multiple of veclen %d\n",
450 n, wtf->in_veclen);
451 return -1;
452 }
453 /* Byteswap stuff here if necessary. */
454 if (wtf->byteswap) {
455 for (i = 0; i < n; ++i)
456 SWAP_FLOAT32(wtf->feat[0] + i);
457 }
458 fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
459 for (i = 0; i < nfr; ++i) {
460 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
461 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
462 fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
463 else
464 fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
465 }
466 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
467 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
468 }
469 }
470 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
471 return -1;
472 nfloat += n;
473 }
474
475 if (fclose(wtf->infh) == EOF)
476 E_ERROR_SYSTEM("Failed to close input file");
477 wtf->infh = NULL;
478 return nfloat;
479}
480
481static const audio_type_t types[] = {
482 { "-mswav", &detect_riff, &decode_pcm },
483 { "-nist", &detect_nist, &decode_pcm },
484 { "-raw", &detect_raw, &decode_pcm },
485 { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
486};
487static const int ntypes = sizeof(types)/sizeof(types[0]);
488static const audio_type_t mfcc_type = {
489 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
490};
491
497static int
498output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
499{
500 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
501 E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
502 return -1;
503 }
504 return 0;
505}
506
512static int
513output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
514{
515 int i, nfloat = 0;
516
517 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
518 for (i = 0; i < nfr; ++i) {
519 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
520 E_ERROR_SYSTEM("Writing %d values to %s failed",
521 wtf->veclen, wtf->outfile);
522 return -1;
523 }
524 nfloat += wtf->veclen;
525 }
526 return nfloat;
527}
528
/**
 * HTK parameter kind codes.  They are combined with htk_feature_flag_t
 * bits to form the parameter-kind field of an HTK header.
 */
typedef enum htk_feature_kind_e {
    WAVEFORM  = 0,   /* Sampled PCM audio (rarely used) */
    LPC       = 1,   /* Linear prediction filter coefficients */
    LPCREFC   = 2,   /* Linear prediction reflection coefficients */
    LPCEPSTRA = 3,   /* Cepstral coefficients derived from LPC */
    LPCDELCEP = 4,   /* LPC cepstra plus their deltas */
    IREFC     = 5,   /* LPC reflection coefficients as 16-bit integers */
    MFCC      = 6,   /* Mel-frequency cepstral coefficients */
    FBANK     = 7,   /* Log mel spectrum */
    MELSPEC   = 8,   /* Linear mel spectrum */
    USER      = 9,   /* User-defined kind */
    DISCRETE  = 10,  /* Vector-quantized data */
    PLP       = 11   /* PLP coefficients */
} htk_feature_kind_t;
543
/**
 * HTK parameter qualifier bits, OR-ed onto an htk_feature_kind_t code.
 * Each comment gives the octal value of the bit.
 *
 * NOTE(review): identifiers starting with an underscore followed by an
 * upper-case letter are reserved in C; renaming them would also touch
 * the code that uses them.
 */
typedef enum htk_feature_flag_e {
    _E = 1 << 6,   /* 0000100: has energy */
    _N = 1 << 7,   /* 0000200: absolute energy suppressed */
    _D = 1 << 8,   /* 0000400: has delta coefficients */
    _A = 1 << 9,   /* 0001000: has acceleration (delta-delta) coefficients */
    _C = 1 << 10,  /* 0002000: is compressed */
    _Z = 1 << 11,  /* 0004000: has zero-mean static coefficients (i.e. CMN) */
    _K = 1 << 12,  /* 0010000: has CRC checksum */
    _O = 1 << 13,  /* 0020000: has 0th cepstral coefficient */
    _V = 1 << 14,  /* 0040000: has VQ data */
    _T = 1 << 15   /* 0100000: has third differential coefficients */
} htk_feature_flag_t;
556
560static int
561output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
562{
563 int32 samp_period;
564 int16 samp_size;
565 int16 param_kind;
566 int swap = FALSE;
567
568 /* HTK files are big-endian. */
569 if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
570 swap = TRUE;
571 /* Same file size thing as in Sphinx files (I think) */
572 if (swap) SWAP_INT32(&nfloat);
573 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
574 return -1;
575 /* Sample period in 100ns units. */
576 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
577 if (swap) SWAP_INT32(&samp_period);
578 if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
579 return -1;
580 /* Sample size - veclen * sizeof each sample. */
581 samp_size = wtf->veclen * 4;
582 if (swap) SWAP_INT16(&samp_size);
583 if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
584 return -1;
585 /* Format and flags. */
586 if (cmd_ln_boolean_r(wtf->config, "-logspec")
587 || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
588 param_kind = FBANK; /* log mel-filter bank outputs */
589 else
590 param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
591 if (swap) SWAP_INT16(&param_kind);
592 if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
593 return -1;
594
595 return 0;
596}
597
601static int
602output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
603{
604 int i, j, swap, htk_reorder, nfloat = 0;
605
606 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
607 /* This is possibly inefficient, but probably not a big deal. */
608 swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
609 htk_reorder = (0 == strcmp("htk", wtf->ot->name)
610 && !(cmd_ln_boolean_r(wtf->config, "-logspec")
611 || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
612 for (i = 0; i < nfr; ++i) {
613 if (htk_reorder) {
614 mfcc_t c0 = frames[i][0];
615 memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
616 frames[i][wtf->veclen - 1] = c0;
617 }
618 if (swap)
619 for (j = 0; j < wtf->veclen; ++j)
620 SWAP_FLOAT32(frames[i] + j);
621 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
622 E_ERROR_SYSTEM("Writing %d values to %s failed",
623 wtf->veclen, wtf->outfile);
624 return -1;
625 }
626 nfloat += wtf->veclen;
627 }
628 return nfloat;
629}
630
634static int
635output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
636{
637 int i, j, nfloat = 0;
638
639 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
640 for (i = 0; i < nfr; ++i) {
641 for (j = 0; j < wtf->veclen; ++j) {
642 fprintf(wtf->outfh, "%.5g", MFCC2FLOAT(frames[i][j]));
643 if (j == wtf->veclen - 1)
644 fprintf(wtf->outfh, "\n");
645 else
646 fprintf(wtf->outfh, " ");
647 }
648 nfloat += wtf->veclen;
649 }
650 return nfloat;
651}
652
653static const output_type_t outtypes[] = {
654 { "sphinx", &output_header_sphinx, &output_frames_sphinx },
655 { "htk", &output_header_htk, &output_frames_htk },
656 { "text", NULL, &output_frames_text }
657};
658static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
659
661sphinx_wave2feat_init(cmd_ln_t *config)
662{
664 int i;
665
666 wtf = (sphinx_wave2feat_t *)ckd_calloc(1, sizeof(*wtf));
667 wtf->refcount = 1;
668 wtf->config = cmd_ln_retain(config);
669 wtf->fe = fe_init_auto_r(wtf->config);
670 if (!wtf->fe) {
671 E_FATAL("Failed to create feature extraction\n");
672 }
673
674 wtf->ot = outtypes; /* Default (sphinx) type. */
675 for (i = 0; i < nouttypes; ++i) {
676 output_type_t const *otype = &outtypes[i];
677 if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
678 wtf->ot = otype;
679 break;
680 }
681 }
682 if (i == nouttypes) {
683 E_ERROR("Unknown output type: '%s'\n",
684 cmd_ln_str_r(config, "-ofmt"));
685 sphinx_wave2feat_free(wtf);
686 return NULL;
687 }
688
689 return wtf;
690}
691
692int
693sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
694{
695 if (wtf == NULL)
696 return 0;
697 if (--wtf->refcount > 0)
698 return wtf->refcount;
699
700 if (wtf->audio)
701 ckd_free(wtf->audio);
702 if (wtf->feat)
703 ckd_free_2d(wtf->feat);
704 if (wtf->infile)
705 ckd_free(wtf->infile);
706 if (wtf->outfile)
707 ckd_free(wtf->outfile);
708 if (wtf->infh) {
709 if (fclose(wtf->infh) == EOF)
710 E_ERROR_SYSTEM("Failed to close input file");
711 }
712 if (wtf->outfh) {
713 if (fclose(wtf->outfh) == EOF)
714 E_ERROR_SYSTEM("Failed to close output file");
715 }
716 cmd_ln_free_r(wtf->config);
717 fe_free(wtf->fe);
718 ckd_free(wtf);
719
720 return 0;
721}
722
724sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
725{
726 ++wtf->refcount;
727 return wtf;
728}
729
730static audio_type_t const *
731detect_audio_type(sphinx_wave2feat_t *wtf)
732{
733 audio_type_t const *atype = NULL;
734 int i;
735
736 /* Special case audio type for Sphinx MFCC inputs. */
737 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
738 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
739 int rv = mfcc_type.detect(wtf);
740 if (rv == -1)
741 goto error_out;
742 return &mfcc_type;
743 }
744
745 /* Try to use the type of infile given on the command line. */
746 for (i = 0; i < ntypes; ++i) {
747 int rv;
748 atype = &types[i];
749 if (cmd_ln_boolean_r(wtf->config, atype->name)) {
750 rv = (*atype->detect)(wtf);
751 if (rv == -1)
752 goto error_out;
753 else if (rv == TRUE)
754 break;
755 }
756 }
757 if (i == ntypes) {
758 /* Detect file type of infile and get parameters. */
759 for (i = 0; i < ntypes; ++i) {
760 int rv;
761 atype = &types[i];
762 rv = (*atype->detect)(wtf);
763 if (rv == -1)
764 goto error_out;
765 else if (rv == TRUE)
766 break;
767 }
768 if (i == ntypes)
769 goto error_out;
770 }
771 return atype;
772 error_out:
773 if (wtf->infh)
774 fclose(wtf->infh);
775 wtf->infh = NULL;
776 return NULL;
777}
778
779int
780sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
781 char const *infile, char const *outfile)
782{
783 int nchans, nfloat, veclen;
784 audio_type_t const *atype = NULL;
785 int fshift, fsize;
786
787 E_INFO("Converting %s to %s\n", infile, outfile);
788
789 wtf->infile = ckd_salloc(infile);
790
791 /* Detect input file type. */
792 if ((atype = detect_audio_type(wtf)) == NULL)
793 return -1;
794
795 /* Determine whether to byteswap input. */
796 wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
797 cmd_ln_str_r(wtf->config, "-input_endian"));
798
799 /* Get the output frame size (if not already set). */
800 if (wtf->veclen == 0)
801 wtf->veclen = fe_get_output_size(wtf->fe);
802
803 /* Set up the input and output buffers. */
804 fe_get_input_size(wtf->fe, &fshift, &fsize);
805 /* Want to get at least a whole frame plus shift in here. Also we
806 will either pick or mix multiple channels so we need to read
807 them all at once. */
808 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
809 wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
810 if (wtf->blocksize < (fsize + fshift) * nchans) {
811 E_INFO("Block size of %d too small, increasing to %d\n",
812 wtf->blocksize,
813 (fsize + fshift) * nchans);
814 wtf->blocksize = (fsize + fshift) * nchans;
815 }
816 wtf->audio = (short *)ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
817 wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
818
819 /* Use the maximum of the input and output frame sizes to allocate this. */
820 veclen = wtf->veclen;
821 if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
822
823 wtf->feat = (mfcc_t**)ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
824
825 /* Let's go! */
826 if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
827 E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
828 return -1;
829 }
830 /* Write an empty header, which we'll fill in later. */
831 if (wtf->ot->output_header &&
832 (*wtf->ot->output_header)(wtf, 0) < 0) {
833 E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
834 goto error_out;
835 }
836 wtf->outfile = ckd_salloc(outfile);
837
838 if ((nfloat = (*atype->decode)(wtf)) < 0) {
839 E_ERROR("Failed to convert");
840 goto error_out;
841 }
842
843 if (wtf->ot->output_header) {
844 if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
845 E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
846 goto error_out;
847 }
848 if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
849 E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
850 goto error_out;
851 }
852 }
853
854
855 if (wtf->audio)
856 ckd_free(wtf->audio);
857 if (wtf->feat)
858 ckd_free_2d(wtf->feat);
859 if (wtf->infile)
860 ckd_free(wtf->infile);
861 if (wtf->outfile)
862 ckd_free(wtf->outfile);
863
864 wtf->audio = NULL;
865 wtf->infile = NULL;
866 wtf->feat = NULL;
867 wtf->outfile = NULL;
868
869 if (wtf->outfh)
870 if (fclose(wtf->outfh) == EOF)
871 E_ERROR_SYSTEM("Failed to close output file");
872 wtf->outfh = NULL;
873
874 return 0;
875
876error_out:
877
878 if (wtf->audio)
879 ckd_free(wtf->audio);
880 if (wtf->feat)
881 ckd_free_2d(wtf->feat);
882 if (wtf->infile)
883 ckd_free(wtf->infile);
884 if (wtf->outfile)
885 ckd_free(wtf->outfile);
886
887 wtf->audio = NULL;
888 wtf->infile = NULL;
889 wtf->feat = NULL;
890 wtf->outfile = NULL;
891
892 if (wtf->outfh)
893 if (fclose(wtf->outfh) == EOF)
894 E_ERROR_SYSTEM("Failed to close output file");
895 wtf->outfh = NULL;
896
897 return -1;
898}
899
900void
901build_filenames(cmd_ln_t *config, char const *basename,
902 char **out_infile, char **out_outfile)
903{
904 char const *di, *do_, *ei, *eo;
905
906 di = cmd_ln_str_r(config, "-di");
907 do_ = cmd_ln_str_r(config, "-do");
908 ei = cmd_ln_str_r(config, "-ei");
909 eo = cmd_ln_str_r(config, "-eo");
910
911 *out_infile = string_join(di ? di : "",
912 di ? "/" : "",
913 basename,
914 ei ? "." : "",
915 ei ? ei : "",
916 NULL);
917 *out_outfile = string_join(do_ ? do_ : "",
918 do_ ? "/" : "",
919 basename,
920 eo ? "." : "",
921 eo ? eo : "",
922 NULL);
923 /* Build output directory structure if possible/requested (it is
924 * by default). */
925 if (cmd_ln_boolean_r(config, "-build_outdirs")) {
926 char *dirname = ckd_salloc(*out_outfile);
927 path2dirname(*out_outfile, dirname);
928 build_directory(dirname);
929 ckd_free(dirname);
930 }
931}
932
933static int
934run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
935{
936 hash_table_t *files;
937 hash_iter_t *itor;
938 lineiter_t *li;
939 FILE *ctlfh;
940 int nskip, runlen, npart;
941
942 if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
943 E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
944 return -1;
945 }
946 nskip = cmd_ln_int32_r(wtf->config, "-nskip");
947 runlen = cmd_ln_int32_r(wtf->config, "-runlen");
948 if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
949 /* Count lines in the file. */
950 int partlen, part, nlines = 0;
951 part = cmd_ln_int32_r(wtf->config, "-part");
952 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
953 ++nlines;
954 fseek(ctlfh, 0, SEEK_SET);
955 partlen = nlines / npart;
956 nskip = partlen * (part - 1);
957 if (part == npart)
958 runlen = -1;
959 else
960 runlen = partlen;
961 }
962 if (runlen != -1){
963 E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
964 files = hash_table_new(runlen, HASH_CASE_YES);
965 }
966 else {
967 E_INFO("Processing all remaining utterances at position %d\n", nskip);
968 files = hash_table_new(1000, HASH_CASE_YES);
969 }
970 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
971 char *c, *infile, *outfile;
972
973 if (nskip-- > 0)
974 continue;
975 if (runlen == 0) {
976 lineiter_free(li);
977 break;
978 }
979 --runlen;
980
981 string_trim(li->buf, STRING_BOTH);
982 /* Extract the file ID from the control line. */
983 if ((c = strchr(li->buf, ' ')) != NULL)
984 *c = '\0';
985 if (strlen(li->buf) == 0) {
986 E_WARN("Empty line %d in control file, skipping\n", li->lineno);
987 continue;
988 }
989 build_filenames(wtf->config, li->buf, &infile, &outfile);
990 if (hash_table_lookup(files, infile, NULL) == 0)
991 continue;
992 sphinx_wave2feat_convert_file(wtf, infile, outfile);
993 hash_table_enter(files, infile, outfile);
994 }
995 for (itor = hash_table_iter(files); itor;
996 itor = hash_table_iter_next(itor)) {
997 ckd_free((void *)hash_entry_key(itor->ent));
999 }
1000 hash_table_free(files);
1001 fclose(ctlfh);
1002
1003 return 0;
1004}
1005
1006int
1007main(int argc, char *argv[])
1008{
1009 sphinx_wave2feat_t *wtf;
1010 cmd_ln_t *config;
1011 int rv;
1012
1013 config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE);
1014
1015 if (config && cmd_ln_str_r(config, "-argfile"))
1016 config = cmd_ln_parse_file_r(config, defn,
1017 cmd_ln_str_r(config, "-argfile"), FALSE);
1018 if (config == NULL) {
1019 E_ERROR("Command line parsing failed\n");
1020 return 1;
1021 }
1022
1023 if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1024 E_ERROR("Failed to initialize wave2feat object\n");
1025 return 1;
1026 }
1027
1028 /* If there's a control file run through it, otherwise we will do
1029 * a single file (which is what run_control_file will do
1030 * internally too) */
1031 if (cmd_ln_str_r(config, "-c"))
1032 rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1033 else
1034 rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1035 cmd_ln_str_r(config, "-o"));
1036
1037 sphinx_wave2feat_free(wtf);
1038 cmd_ln_free_r(config);
1039 return rv;
1040}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:244
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition ckd_alloc.c:255
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition cmd_ln.h:334
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition cmd_ln.c:1046
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition cmd_ln.c:989
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by deliminating on " \r\t\n" and putting each tokens into an argv[] for cmd_l...
Definition cmd_ln.c:764
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition cmd_ln.c:949
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition cmd_ln.c:1039
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition cmd_ln.c:556
Implementation of logging routines.
#define E_ERROR(...)
Print error message to error log.
Definition err.h:104
#define E_INFO(...)
Print logging information to standard error stream.
Definition err.h:114
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition err.h:81
#define E_ERROR_SYSTEM(...)
Print error text; Call perror("");.
Definition err.h:99
#define E_WARN(...)
Print warning message to error log.
Definition err.h:109
File names related operation.
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir Caller must have allocate...
Definition filename.c:68
Hash table implementation.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition hash_table.c:688
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition hash_table.c:656
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition hash_table.c:302
#define hash_entry_val(e)
Access macros.
Definition hash_table.h:175
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition hash_table.c:501
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition hash_table.c:646
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition hash_table.c:158
file IO related operations.
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition pio.c:368
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition pio.c:621
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition pio.c:264
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition pio.c:347
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition strfuncs.c:97
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition strfuncs.c:70
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition strfuncs.c:123
@ STRING_BOTH
Both ends of string.
Definition strfuncs.h:73
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition strfuncs.c:55
RIFF 44-byte header structure for MS wav files.
Definition sphinx_fe.c:91
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.
hash_entry_t * ent
Current entry in that table.
Definition hash_table.h:170
Line iterator for files.
Definition pio.h:177
int byteswap
Whether byteswapping is necessary.
Definition sphinx_fe.c:86
int in_veclen
Length of each input vector (for cep<->spec).
Definition sphinx_fe.c:85
cmd_ln_t * config
Configuration parameters.
Definition sphinx_fe.c:74
fe_t * fe
Front end object.
Definition sphinx_fe.c:75
char * infile
Path to input file.
Definition sphinx_fe.c:76
short * audio
Audio buffer.
Definition sphinx_fe.c:80
output_type_t const * ot
Output type object.
Definition sphinx_fe.c:87
char * outfile
Path to output file.
Definition sphinx_fe.c:77
mfcc_t ** feat
Feature buffer.
Definition sphinx_fe.c:81
int featsize
Size of feature buffer.
Definition sphinx_fe.c:83
int veclen
Length of each output vector.
Definition sphinx_fe.c:84
FILE * outfh
Output file handle.
Definition sphinx_fe.c:79
FILE * infh
Input file handle.
Definition sphinx_fe.c:78
int refcount
Reference count.
Definition sphinx_fe.c:73
int blocksize
Size of audio buffer.
Definition sphinx_fe.c:82