@@ -275,46 +275,59 @@ static void FFmpegOnDecodedFrame(const AVFrame *frame,
                                  sherpa_ncnn::Display *display,
                                  std::string *last_text, int32_t *segment_index,
                                  int32_t *zero_samples) {
-  // TODO: FIXME: Can we directly consume frame by s without buffer?
-#define N 3200  // 0.2 s. Sample rate is fixed to 16 kHz
-  static float samples[N];
-  static int32_t nb_samples = 0;
-  if (frame->nb_samples + nb_samples >= N) {
-    s->AcceptWaveform(16000, samples, nb_samples);
-
-    while (recognizer.IsReady(s)) {
-      recognizer.DecodeStream(s);
-    }
+  if (!frame->nb_samples) {
+    return;
+  }

-    bool is_endpoint = recognizer.IsEndpoint(s);
-    auto text = recognizer.GetResult(s).text;
+  // Convert the PCM from int16_t to float. Note that K2 expects samples in
+  // [-1, 1], so we need to divide by 32768.
+#define MAX_SAMPLES 3200  // 0.2 s. Sample rate is fixed to 16 kHz
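+// (3200 samples = 16000 Hz * 0.2 s, i.e. one 200 ms chunk.)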
+  static float samples[MAX_SAMPLES];
+  int32_t nb_samples = 0;

-    if (!text.empty() && *last_text != text) {
-      *last_text = text;
+  if (frame->nb_samples > MAX_SAMPLES) {
+    av_log(NULL, AV_LOG_ERROR, "Too many samples: %d\n", frame->nb_samples);
+    return;
+  }

-      std::transform(text.begin(), text.end(), text.begin(),
-                     [](auto c) { return std::tolower(c); });
+  if (1) {
+    const int16_t *p = (int16_t *)frame->data[0];
+    for (int32_t i = 0; i < frame->nb_samples; i++) {
+      // ASD (active speaker detection).
+      if (p[i] == 0) {
+        (*zero_samples)++;
+      }

-      display->Print(*segment_index, text);
+      // Convert to float [-1, 1].
+      samples[nb_samples++] = p[i] / 32768.;
     }
+  }
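+  // Counting exact-zero samples is a cheap silence heuristic for the ASD
+  // check in the read loop below: a muted or disconnected microphone
+  // typically delivers all-zero PCM.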

-    if (is_endpoint) {
-      if (!text.empty()) {
-        (*segment_index)++;
-      }
+  // Feed samples to K2, which accepts any number of samples.
+  s->AcceptWaveform(16000, samples, nb_samples);

-      recognizer.Reset(s);
-    }
+  while (recognizer.IsReady(s)) {
+    recognizer.DecodeStream(s);
+  }
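+  // IsReady() returns true while the stream has buffered enough feature
+  // frames for another decode step, so this loop drains all pending work.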
+
+  bool is_endpoint = recognizer.IsEndpoint(s);
+  auto text = recognizer.GetResult(s).text;

-    nb_samples = 0;
+  if (!text.empty() && *last_text != text) {
+    *last_text = text;
+
+    std::transform(text.begin(), text.end(), text.begin(),
+                   [](auto c) { return std::tolower(c); });
+
+    display->Print(*segment_index, text);
   }

-  const int16_t *p = (int16_t *)frame->data[0];
-  for (int32_t i = 0; i < frame->nb_samples; i++) {
-    if (p[i] == 0) {
-      (*zero_samples)++;
+  if (is_endpoint) {
+    if (!text.empty()) {
+      (*segment_index)++;
     }
-    samples[nb_samples++] = p[i] / 32768.;
+
+    recognizer.Reset(s);
   }
 }
@@ -677,8 +690,9 @@ for a list of pre-trained models to download.
   std::string last_text;
   int32_t segment_index = 0, zero_samples = 0, asd_segment = 0;
   std::unique_ptr<sherpa_ncnn::Display> display = CreateDisplay();
-  while (1) {
+  while (ret >= 0) {
     if ((ret = av_read_frame(ffmpeg_fmt_ctx.get(), packet.get())) < 0) {
+      av_log(NULL, AV_LOG_ERROR, "Error reading frame, ret=%d\n", ret);
       break;
     }
@@ -688,7 +702,7 @@ for a list of pre-trained models to download.
         packet.get(), [](auto p) { av_packet_unref(p); });
     (void)packet_unref;

-    // Reset the ASD segment when stream unpublish.
+    // Reset the ASD (active speaker detection) segment when the stream is unpublished.
     if (signal_unpublish_sigusr1) {
       signal_unpublish_sigusr1 = 0;
       if (asd_segment != segment_index) {
@@ -700,68 +714,68 @@ for a list of pre-trained models to download.
     if (asd_samples && zero_samples > asd_samples * 16000) {
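+      // zero_samples counts PCM samples that are exactly zero; asd_samples
+      // appears to be a threshold in seconds, hence the scaling by the fixed
+      // 16 kHz sample rate.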
       // When unpublished, there might be some samples left in the buffer.
       if (asd_endpoints && segment_index - asd_segment < asd_endpoints) {
-        fprintf(stdout,
-                "\nEvent:FFmpeg: All silence samples, incorrect microphone?\n");
+        fprintf(stdout, "\nEvent:FFmpeg: Silence, incorrect microphone?\n");
         fflush(stdout);
       }
       zero_samples = 0;
     }

-    if (packet->stream_index == ffmpeg_audio_stream_index) {
-      ret = avcodec_send_packet(ffmpeg_dec_ctx.get(), packet.get());
+    // Ignore packets that are not from the audio stream.
+    if (packet->stream_index != ffmpeg_audio_stream_index) {
+      continue;
+    }
+
+    ret = avcodec_send_packet(ffmpeg_dec_ctx.get(), packet.get());
+    if (ret < 0) {
+      av_log(NULL, AV_LOG_ERROR, "Error feeding decoder packet, ret=%d\n", ret);
+      break;
+    }
+
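+    // avcodec_send_packet()/avcodec_receive_frame() form FFmpeg's push/pull
+    // decoding API: one packet in may yield zero or more decoded frames out.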
+    while (ret >= 0) {
+      ret = avcodec_receive_frame(ffmpeg_dec_ctx.get(), frame.get());
+      if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
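+        // EAGAIN means the decoder needs more input before it can emit
+        // another frame; EOF means it is fully flushed. Neither is a real
+        // error, so clear ret and leave the receive loop.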
+        ret = 0;
+        break;
+      } else if (ret < 0) {
+        av_log(NULL, AV_LOG_ERROR, "Error receiving decoded frame, ret=%d\n", ret);
+        break;
+      }
+
+      // Always free the frame with av_frame_unref() when it is no longer
+      // needed.
+      auto frame_unref = std::unique_ptr<AVFrame, void (*)(AVFrame *)>(
+          frame.get(), [](auto p) { av_frame_unref(p); });
+      (void)frame_unref;
+
+      /* push the audio data from the decoded frame into the filtergraph */
+      ret = av_buffersrc_add_frame_flags(ffmpeg_buffersrc_ctx, frame.get(),
+                                         AV_BUFFERSRC_FLAG_KEEP_REF);
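+      // AV_BUFFERSRC_FLAG_KEEP_REF tells the filtergraph to take its own
+      // reference to the frame, so frame_unref above can still release ours.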
       if (ret < 0) {
-        av_log(NULL, AV_LOG_ERROR,
-               "Error while sending a packet to the decoder, ret=%d\n", ret);
+        av_log(NULL, AV_LOG_ERROR, "Error feeding filter frame, ret=%d\n", ret);
         break;
       }

+      /* pull filtered audio from the filtergraph */
       while (ret >= 0) {
-        ret = avcodec_receive_frame(ffmpeg_dec_ctx.get(), frame.get());
+        ret = av_buffersink_get_frame(ffmpeg_buffersink_ctx, filt_frame.get());
         if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
+          ret = 0;
           break;
-        } else if (ret < 0) {
-          av_log(NULL, AV_LOG_ERROR,
-                 "Error while receiving a frame from the decoder, ret=%d\n",
-                 ret);
-          exit(1);
         }
-
-        // Always free the frame with av_frame_unref() when it is no longer
-        // needed.
-        auto frame_unref = std::unique_ptr<AVFrame, void (*)(AVFrame *)>(
-            frame.get(), [](auto p) { av_frame_unref(p); });
-        (void)frame_unref;
-
-        /* push the audio data from decoded frame into the filtergraph */
-        if (av_buffersrc_add_frame_flags(ffmpeg_buffersrc_ctx, frame.get(),
-                                         AV_BUFFERSRC_FLAG_KEEP_REF) < 0) {
-          av_log(NULL, AV_LOG_ERROR,
-                 "Error while feeding the audio filtergraph\n");
+        if (ret < 0) {
+          fprintf(stderr, "Error getting frame, ret=%d\n", ret);
           break;
         }

-        /* pull filtered audio from the filtergraph */
-        while (1) {
-          ret =
-              av_buffersink_get_frame(ffmpeg_buffersink_ctx, filt_frame.get());
-          if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
-            break;
-          }
-          if (ret < 0) {
-            fprintf(stderr, "Error get frame, ret=%d\n", ret);
-            exit(1);
-          }
-
-          // The filt_frame is an allocated frame that will be filled with data.
-          // The data must be freed using av_frame_unref() / av_frame_free()
-          auto filt_frame_unref = std::unique_ptr<AVFrame, void (*)(AVFrame *)>(
-              filt_frame.get(), [](auto p) { av_frame_unref(p); });
-          (void)filt_frame_unref;
-
-          FFmpegOnDecodedFrame(filt_frame.get(), recognizer, s.get(),
-                               display.get(), &last_text, &segment_index,
-                               &zero_samples);
-        }
+        // The filt_frame is an allocated frame that will be filled with data.
+        // The data must be freed using av_frame_unref() / av_frame_free().
+        auto filt_frame_unref = std::unique_ptr<AVFrame, void (*)(AVFrame *)>(
+            filt_frame.get(), [](auto p) { av_frame_unref(p); });
+        (void)filt_frame_unref;
+
+        FFmpegOnDecodedFrame(filt_frame.get(), recognizer, s.get(),
+                             display.get(), &last_text, &segment_index,
+                             &zero_samples);
       }
     }
   }