2 years ago · fd6bfb9ea3
--- a/ffmpeg-examples/sherpa-ncnn-ffmpeg.cc
+++ b/ffmpeg-examples/sherpa-ncnn-ffmpeg.cc
@@ -75,47 +75,48 @@ extern "C" {
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static const char *filter_descr =
			
 
				+static const char *ffmpeg_filter_descr =
			
 
				     "aresample=16000,aformat=sample_fmts=s16:channel_layouts=mono";
			
 
				 
			
 
				-static AVFormatContext *fmt_ctx;
			
 
				-static AVCodecContext *dec_ctx;
			
 
				-AVFilterContext *buffersink_ctx;
			
 
				-AVFilterContext *buffersrc_ctx;
			
 
				-AVFilterGraph *filter_graph;
			
 
				-static int32_t audio_stream_index = -1;
			
 
				+static AVFormatContext *ffmpeg_fmt_ctx;
			
 
				+static AVCodecContext *ffmpeg_dec_ctx;
			
 
				+static AVFilterContext *ffmpeg_buffersink_ctx;
			
 
				+static AVFilterContext *ffmpeg_buffersrc_ctx;
			
 
				+static AVFilterGraph *ffmpeg_filter_graph;
			
 
				+static int32_t ffmpeg_audio_stream_index = -1;
			
 
				 
			
 
				 static int32_t FFmpegOpenInputFile(const char *filename) {
			
 
				-  const AVCodec *dec;
			
 
				   int32_t ret;
			
 
				-
			
 
				-  if ((ret = avformat_open_input(&fmt_ctx, filename, NULL, NULL)) < 0) {
			
 
				+  if ((ret = avformat_open_input(&ffmpeg_fmt_ctx, filename, NULL, NULL)) < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot open input file %s\n", filename);
			
 
				     return ret;
			
 
				   }
			
 
				 
			
 
				-  if ((ret = avformat_find_stream_info(fmt_ctx, NULL)) < 0) {
			
 
				+  if ((ret = avformat_find_stream_info(ffmpeg_fmt_ctx, NULL)) < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n");
			
 
				     return ret;
			
 
				   }
			
 
				 
			
 
				   /* select the audio stream */
			
 
				-  ret = av_find_best_stream(fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &dec, 0);
			
 
				+  const AVCodec *dec;
			
 
				+  ret =
			
 
				+      av_find_best_stream(ffmpeg_fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &dec, 0);
			
 
				   if (ret < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR,
			
 
				            "Cannot find an audio stream in the input file\n");
			
 
				     return ret;
			
 
				   }
			
 
				-  audio_stream_index = ret;
			
 
				+  ffmpeg_audio_stream_index = ret;
			
 
				 
			
 
				   /* create decoding context */
			
 
				-  dec_ctx = avcodec_alloc_context3(dec);
			
 
				-  if (!dec_ctx) return AVERROR(ENOMEM);
			
 
				-  avcodec_parameters_to_context(dec_ctx,
			
 
				-                                fmt_ctx->streams[audio_stream_index]->codecpar);
			
 
				+  ffmpeg_dec_ctx = avcodec_alloc_context3(dec);
			
 
				+  if (!ffmpeg_dec_ctx) return AVERROR(ENOMEM);
			
 
				+  avcodec_parameters_to_context(
			
 
				+      ffmpeg_dec_ctx,
			
 
				+      ffmpeg_fmt_ctx->streams[ffmpeg_audio_stream_index]->codecpar);
			
 
				 
			
 
				   /* init the audio decoder */
			
 
				-  if ((ret = avcodec_open2(dec_ctx, dec, NULL)) < 0) {
			
 
				+  if ((ret = avcodec_open2(ffmpeg_dec_ctx, dec, NULL)) < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot open audio decoder\n");
			
 
				     return ret;
			
 
				   }
			
@@ -124,73 +125,73 @@ static int32_t FFmpegOpenInputFile(const char *filename) {
 
				 }
			
 
				 
			
 
				 static int32_t FFmpegInitFilters(const char *filters_descr) {
			
 
				-  char args[512];
			
 
				-  int32_t ret = 0;
			
 
				   const AVFilter *abuffersrc = avfilter_get_by_name("abuffer");
			
 
				   const AVFilter *abuffersink = avfilter_get_by_name("abuffersink");
			
 
				   AVFilterInOut *outputs = avfilter_inout_alloc();
			
 
				   AVFilterInOut *inputs = avfilter_inout_alloc();
			
 
				-  static const enum AVSampleFormat out_sample_fmts[] = {AV_SAMPLE_FMT_S16,
			
 
				-                                                        AV_SAMPLE_FMT_NONE};
			
 
				-  static const int32_t out_sample_rates[] = {16000, -1};
			
 
				-  const AVFilterLink *outlink;
			
 
				-  AVRational time_base = fmt_ctx->streams[audio_stream_index]->time_base;
			
 
				+  AVRational time_base =
			
 
				+      ffmpeg_fmt_ctx->streams[ffmpeg_audio_stream_index]->time_base;
			
 
				 
			
 
				-  filter_graph = avfilter_graph_alloc();
			
 
				-  if (!outputs || !inputs || !filter_graph) {
			
 
				+  int32_t ret;
			
 
				+  ffmpeg_filter_graph = avfilter_graph_alloc();
			
 
				+  if (!outputs || !inputs || !ffmpeg_filter_graph) {
			
 
				     ret = AVERROR(ENOMEM);
			
 
				     goto end;
			
 
				   }
			
 
				 
			
 
				   /* buffer audio source: the decoded frames from the decoder will be inserted
			
 
				    * here. */
			
 
				-  if (dec_ctx->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC)
			
 
				-    av_channel_layout_default(&dec_ctx->ch_layout,
			
 
				-                              dec_ctx->ch_layout.nb_channels);
			
 
				+  if (ffmpeg_dec_ctx->ch_layout.order == AV_CHANNEL_ORDER_UNSPEC)
			
 
				+    av_channel_layout_default(&ffmpeg_dec_ctx->ch_layout,
			
 
				+                              ffmpeg_dec_ctx->ch_layout.nb_channels);
			
 
				+  char args[512];
			
 
				   ret = snprintf(args, sizeof(args),
			
 
				                  "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=",
			
 
				-                 time_base.num, time_base.den, dec_ctx->sample_rate,
			
 
				-                 av_get_sample_fmt_name(dec_ctx->sample_fmt));
			
 
				-  av_channel_layout_describe(&dec_ctx->ch_layout, args + ret,
			
 
				+                 time_base.num, time_base.den, ffmpeg_dec_ctx->sample_rate,
			
 
				+                 av_get_sample_fmt_name(ffmpeg_dec_ctx->sample_fmt));
			
 
				+  av_channel_layout_describe(&ffmpeg_dec_ctx->ch_layout, args + ret,
			
 
				                              sizeof(args) - ret);
			
 
				-  ret = avfilter_graph_create_filter(&buffersrc_ctx, abuffersrc, "in", args,
			
 
				-                                     NULL, filter_graph);
			
 
				+  ret = avfilter_graph_create_filter(&ffmpeg_buffersrc_ctx, abuffersrc, "in",
			
 
				+                                     args, NULL, ffmpeg_filter_graph);
			
 
				   if (ret < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot create audio buffer source\n");
			
 
				     goto end;
			
 
				   }
			
 
				 
			
 
				   /* buffer audio sink: to terminate the filter chain. */
			
 
				-  ret = avfilter_graph_create_filter(&buffersink_ctx, abuffersink, "out", NULL,
			
 
				-                                     NULL, filter_graph);
			
 
				+  ret = avfilter_graph_create_filter(&ffmpeg_buffersink_ctx, abuffersink, "out",
			
 
				+                                     NULL, NULL, ffmpeg_filter_graph);
			
 
				   if (ret < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot create audio buffer sink\n");
			
 
				     goto end;
			
 
				   }
			
 
				 
			
 
				-  ret = av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1,
			
 
				-                            AV_OPT_SEARCH_CHILDREN);
			
 
				+  static const enum AVSampleFormat out_sample_fmts[] = {AV_SAMPLE_FMT_S16,
			
 
				+                                                        AV_SAMPLE_FMT_NONE};
			
 
				+  ret = av_opt_set_int_list(ffmpeg_buffersink_ctx, "sample_fmts",
			
 
				+                            out_sample_fmts, -1, AV_OPT_SEARCH_CHILDREN);
			
 
				   if (ret < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot set output sample format\n");
			
 
				     goto end;
			
 
				   }
			
 
				 
			
 
				-  ret =
			
 
				-      av_opt_set(buffersink_ctx, "ch_layouts", "mono", AV_OPT_SEARCH_CHILDREN);
			
 
				+  ret = av_opt_set(ffmpeg_buffersink_ctx, "ch_layouts", "mono",
			
 
				+                   AV_OPT_SEARCH_CHILDREN);
			
 
				   if (ret < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot set output channel layout\n");
			
 
				     goto end;
			
 
				   }
			
 
				 
			
 
				-  ret = av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates,
			
 
				-                            -1, AV_OPT_SEARCH_CHILDREN);
			
 
				+  static const int32_t out_sample_rates[] = {16000, -1};
			
 
				+  ret = av_opt_set_int_list(ffmpeg_buffersink_ctx, "sample_rates",
			
 
				+                            out_sample_rates, -1, AV_OPT_SEARCH_CHILDREN);
			
 
				   if (ret < 0) {
			
 
				     av_log(NULL, AV_LOG_ERROR, "Cannot set output sample rate\n");
			
 
				     goto end;
			
 
				   }
			
 
				 
			
 
				   /*
			
 
				-   * Set the endpoints for the filter graph. The filter_graph will
			
 
				+   * Set the endpoints for the filter graph. The ffmpeg_filter_graph will
			
 
				    * be linked to the graph described by filters_descr.
			
 
				    */
			
 
				 
			
@@ -201,7 +202,7 @@ static int32_t FFmpegInitFilters(const char *filters_descr) {
 
				    * default.
			
 
				    */
			
 
				   outputs->name = av_strdup("in");
			
 
				-  outputs->filter_ctx = buffersrc_ctx;
			
 
				+  outputs->filter_ctx = ffmpeg_buffersrc_ctx;
			
 
				   outputs->pad_idx = 0;
			
 
				   outputs->next = NULL;
			
 
				 
			
@@ -212,19 +213,20 @@ static int32_t FFmpegInitFilters(const char *filters_descr) {
 
				    * default.
			
 
				    */
			
 
				   inputs->name = av_strdup("out");
			
 
				-  inputs->filter_ctx = buffersink_ctx;
			
 
				+  inputs->filter_ctx = ffmpeg_buffersink_ctx;
			
 
				   inputs->pad_idx = 0;
			
 
				   inputs->next = NULL;
			
 
				 
			
 
				-  if ((ret = avfilter_graph_parse_ptr(filter_graph, filters_descr, &inputs,
			
 
				-                                      &outputs, NULL)) < 0)
			
 
				+  if ((ret = avfilter_graph_parse_ptr(ffmpeg_filter_graph, filters_descr,
			
 
				+                                      &inputs, &outputs, NULL)) < 0)
			
 
				     goto end;
			
 
				 
			
 
				-  if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0) goto end;
			
 
				+  if ((ret = avfilter_graph_config(ffmpeg_filter_graph, NULL)) < 0) goto end;
			
 
				 
			
 
				   /* Print summary of the sink buffer
			
 
				    * Note: args buffer is reused to store channel layout string */
			
 
				-  outlink = buffersink_ctx->inputs[0];
			
 
				+  const AVFilterLink *outlink;
			
 
				+  outlink = ffmpeg_buffersink_ctx->inputs[0];
			
 
				   av_channel_layout_describe(&outlink->ch_layout, args, sizeof(args));
			
 
				   fprintf(
			
 
				       stdout,
			
@@ -242,18 +244,16 @@ end:
 
				   return ret;
			
 
				 }
			
 
				 
			
 
				-static void FFmpegDecodeFrame(const AVFrame *frame,
			
 
				-                              const sherpa_ncnn::Recognizer &recognizer,
			
 
				-                              sherpa_ncnn::Stream *s,
			
 
				-                              sherpa_ncnn::Display *display,
			
 
				-                              std::string *last_text, int32_t *segment_index,
			
 
				-                              int32_t *zero_samples) {
			
 
				+static void FFmpegOnDecodedFrame(const AVFrame *frame,
			
 
				+                                 const sherpa_ncnn::Recognizer &recognizer,
			
 
				+                                 sherpa_ncnn::Stream *s,
			
 
				+                                 sherpa_ncnn::Display *display,
			
 
				+                                 std::string *last_text, int32_t *segment_index,
			
 
				+                                 int32_t *zero_samples) {
			
 
				   // TODO: FIXME: Can we directly consume frame by s without buffer?
			
 
				 #define N 3200  // 0.2 s. Sample rate is fixed to 16 kHz
			
 
				   static float samples[N];
			
 
				   static int32_t nb_samples = 0;
			
 
				-  const int16_t *p = (int16_t *)frame->data[0];
			
 
				-
			
 
				   if (frame->nb_samples + nb_samples >= N) {
			
 
				     s->AcceptWaveform(16000, samples, nb_samples);
			
 
				 
			
@@ -284,6 +284,7 @@ static void FFmpegDecodeFrame(const AVFrame *frame,
 
				     nb_samples = 0;
			
 
				   }
			
 
				 
			
 
				+  const int16_t *p = (int16_t *)frame->data[0];
			
 
				   for (int32_t i = 0; i < frame->nb_samples; i++) {
			
 
				     if (p[i] == 0) {
			
 
				       (*zero_samples)++;
			
@@ -604,8 +605,9 @@ for a list of pre-trained models to download.
 
				   fprintf(stdout, "Event:FFmpeg: Open input ok, %s\n", input_url.c_str());
			
 
				   fflush(stdout);
			
 
				 
			
 
				-  if ((ret = FFmpegInitFilters(filter_descr)) < 0) {
			
 
				-    fprintf(stderr, "Init filters %s failed, r0=%d\n", filter_descr, ret);
			
 
				+  if ((ret = FFmpegInitFilters(ffmpeg_filter_descr)) < 0) {
			
 
				+    fprintf(stderr, "Init filters %s failed, r0=%d\n", ffmpeg_filter_descr,
			
 
				+            ret);
			
 
				     exit(1);
			
 
				   }
			
 
				 
			
@@ -617,7 +619,7 @@ for a list of pre-trained models to download.
 
				   int32_t segment_index = 0, zero_samples = 0, asd_segment = 0;
			
 
				   std::unique_ptr<sherpa_ncnn::Display> display = CreateDisplay();
			
 
				   while (1) {
			
 
				-    if ((ret = av_read_frame(fmt_ctx, packet)) < 0) {
			
 
				+    if ((ret = av_read_frame(ffmpeg_fmt_ctx, packet)) < 0) {
			
 
				       break;
			
 
				     }
			
 
				 
			
@@ -640,8 +642,8 @@ for a list of pre-trained models to download.
 
				       zero_samples = 0;
			
 
				     }
			
 
				 
			
 
				-    if (packet->stream_index == audio_stream_index) {
			
 
				-      ret = avcodec_send_packet(dec_ctx, packet);
			
 
				+    if (packet->stream_index == ffmpeg_audio_stream_index) {
			
 
				+      ret = avcodec_send_packet(ffmpeg_dec_ctx, packet);
			
 
				       if (ret < 0) {
			
 
				         av_log(NULL, AV_LOG_ERROR,
			
 
				                "Error while sending a packet to the decoder\n");
			
@@ -649,7 +651,7 @@ for a list of pre-trained models to download.
 
				       }
			
 
				 
			
 
				       while (ret >= 0) {
			
 
				-        ret = avcodec_receive_frame(dec_ctx, frame);
			
 
				+        ret = avcodec_receive_frame(ffmpeg_dec_ctx, frame);
			
 
				         if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
			
 
				           break;
			
 
				         } else if (ret < 0) {
			
@@ -660,7 +662,7 @@ for a list of pre-trained models to download.
 
				 
			
 
				         if (ret >= 0) {
			
 
				           /* push the audio data from decoded frame into the filtergraph */
			
 
				-          if (av_buffersrc_add_frame_flags(buffersrc_ctx, frame,
			
 
				+          if (av_buffersrc_add_frame_flags(ffmpeg_buffersrc_ctx, frame,
			
 
				                                            AV_BUFFERSRC_FLAG_KEEP_REF) < 0) {
			
 
				             av_log(NULL, AV_LOG_ERROR,
			
 
				                    "Error while feeding the audio filtergraph\n");
			
@@ -669,7 +671,7 @@ for a list of pre-trained models to download.
 
				 
			
 
				           /* pull filtered audio from the filtergraph */
			
 
				           while (1) {
			
 
				-            ret = av_buffersink_get_frame(buffersink_ctx, filt_frame);
			
 
				+            ret = av_buffersink_get_frame(ffmpeg_buffersink_ctx, filt_frame);
			
 
				             if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
			
 
				               break;
			
 
				             }
			
@@ -677,8 +679,8 @@ for a list of pre-trained models to download.
 
				               fprintf(stderr, "Error get frame, ret=%d\n", ret);
			
 
				               exit(1);
			
 
				             }
			
 
				-            FFmpegDecodeFrame(filt_frame, recognizer, s.get(), display.get(),
			
 
				-                              &last_text, &segment_index, &zero_samples);
			
 
				+            FFmpegOnDecodedFrame(filt_frame, recognizer, s.get(), display.get(),
			
 
				+                                 &last_text, &segment_index, &zero_samples);
			
 
				             av_frame_unref(filt_frame);
			
 
				           }
			
 
				           av_frame_unref(frame);
			
@@ -689,26 +691,28 @@ for a list of pre-trained models to download.
 
				   }
			
 
				 
			
 
				   // Add some tail padding
			
 
				-  float tail_paddings[4800] = {0};  // 0.3 seconds at 16 kHz sample rate
			
 
				-  s->AcceptWaveform(16000, tail_paddings, 4800);
			
 
				+  if (1) {
			
 
				+    float tail_paddings[4800] = {0};  // 0.3 seconds at 16 kHz sample rate
			
 
				+    s->AcceptWaveform(16000, tail_paddings, 4800);
			
 
				 
			
 
				-  s->InputFinished();
			
 
				+    s->InputFinished();
			
 
				 
			
 
				-  while (recognizer.IsReady(s.get())) {
			
 
				-    recognizer.DecodeStream(s.get());
			
 
				-  }
			
 
				+    while (recognizer.IsReady(s.get())) {
			
 
				+      recognizer.DecodeStream(s.get());
			
 
				+    }
			
 
				 
			
 
				-  auto text = recognizer.GetResult(s.get()).text;
			
 
				-  if (!text.empty() && last_text != text) {
			
 
				-    last_text = text;
			
 
				-    std::transform(text.begin(), text.end(), text.begin(),
			
 
				-                   [](auto c) { return std::tolower(c); });
			
 
				-    display->Print(segment_index, text);
			
 
				+    auto text = recognizer.GetResult(s.get()).text;
			
 
				+    if (!text.empty() && last_text != text) {
			
 
				+      last_text = text;
			
 
				+      std::transform(text.begin(), text.end(), text.begin(),
			
 
				+                     [](auto c) { return std::tolower(c); });
			
 
				+      display->Print(segment_index, text);
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				-  avfilter_graph_free(&filter_graph);
			
 
				-  avcodec_free_context(&dec_ctx);
			
 
				-  avformat_close_input(&fmt_ctx);
			
 
				+  avfilter_graph_free(&ffmpeg_filter_graph);
			
 
				+  avcodec_free_context(&ffmpeg_dec_ctx);
			
 
				+  avformat_close_input(&ffmpeg_fmt_ctx);
			
 
				   av_packet_free(&packet);
			
 
				   av_frame_free(&frame);
			
 
				   av_frame_free(&filt_frame);