Bläddra i källkod

Fix displaying for real-time speech recognition (#77)

* Fix displaying

* Fix displaying for real-time speech recognition

* fix style issues

* small fixes

* Enable endpoint detection for sherpa-ncnn-microphone

* typo fixes
Fangjun Kuang 2 år sedan
förälder
incheckning
1c578309b7

+ 2 - 0
sherpa-ncnn/csrc/alsa.cc

@@ -20,6 +20,8 @@
 
 #include "sherpa-ncnn/csrc/alsa.h"
 
+#include <algorithm>
+
 #include "alsa/asoundlib.h"
 
 namespace sherpa_ncnn {

+ 3 - 3
sherpa-ncnn/csrc/alsa.h

@@ -16,8 +16,8 @@
  * limitations under the License.
  */
 
-#ifndef SHERPA_NCNN_CSRC_MICROPHONE_H_
-#define SHERPA_NCNN_CSRC_MICROPHONE_H_
+#ifndef SHERPA_NCNN_CSRC_ALSA_H_
+#define SHERPA_NCNN_CSRC_ALSA_H_
 
 #include <memory>
 #include <vector>
@@ -57,4 +57,4 @@ class Alsa {
 
 }  // namespace sherpa_ncnn
 
-#endif  // SHERPA_NCNN_CSRC_MICROPHONE_H_
+#endif  // SHERPA_NCNN_CSRC_ALSA_H_

+ 92 - 0
sherpa-ncnn/csrc/display.h

@@ -0,0 +1,92 @@
+/**
+ * Copyright (c)  2023  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SHERPA_NCNN_CSRC_DISPLAY_H_
+#define SHERPA_NCNN_CSRC_DISPLAY_H_
+#include <stdio.h>
+
+#include <string>
+
+namespace sherpa_ncnn {
+
+class Display {
+ public:
+  void Print(int32_t segment_id, const std::string &s) {
+#ifdef _MSC_VER
+    fprintf(stderr, "%d:%s\n", segment_id, s.c_str());
+    return;
+#endif
+    if (last_segment_ == segment_id) {
+      Clear();
+    } else {
+      if (last_segment_ != -1) {
+        fprintf(stderr, "\n\r");
+      }
+      last_segment_ = segment_id;
+      num_previous_lines_ = 0;
+    }
+
+    fprintf(stderr, "\r%d:", segment_id);
+
+    int32_t i = 0;
+    for (size_t n = 0; n < s.size();) {
+      if (s[n] > 0 && s[n] < 0x7f) {
+        fprintf(stderr, "%c", s[n]);
+        ++n;
+      } else {
+        // Each Chinese character occupies 3 bytes for UTF-8 encoding.
+        std::string tmp(s.begin() + n, s.begin() + n + 3);
+        fprintf(stderr, "%s", tmp.data());
+        n += 3;
+      }
+
+      ++i;
+      if (i >= max_word_per_line_ && n + 1 < s.size() && s[n] == ' ') {
+        fprintf(stderr, "\n\r ");
+        ++num_previous_lines_;
+        i = 0;
+      }
+    }
+  }
+
+ private:
+  // Clear the output for the current segment
+  void Clear() {
+    ClearCurrentLine();
+    while (num_previous_lines_ > 0) {
+      GoUpOneLine();
+      ClearCurrentLine();
+      --num_previous_lines_;
+    }
+  }
+
+  // Clear the current line
+  void ClearCurrentLine() const { fprintf(stderr, "\33[2K\r"); }
+
+  // Move the cursor to the previous line
+  void GoUpOneLine() const { fprintf(stderr, "\033[1A\r"); }
+
+ private:
+  int32_t max_word_per_line_ = 80;
+  int32_t num_previous_lines_ = 0;
+  int32_t last_segment_ = -1;
+};
+
+}  // namespace sherpa_ncnn
+
+#endif  // SHERPA_NCNN_CSRC_DISPLAY_H_

+ 6 - 6
sherpa-ncnn/csrc/sherpa-ncnn-alsa.cc

@@ -23,6 +23,7 @@
 #include <cstdint>
 
 #include "sherpa-ncnn/csrc/alsa.h"
+#include "sherpa-ncnn/csrc/display.h"
 #include "sherpa-ncnn/csrc/recognizer.h"
 
 bool stop = false;
@@ -113,7 +114,7 @@ as the device_name.
 
   sherpa_ncnn::EndpointConfig endpoint_config;
   endpoint_config.rule1.min_trailing_silence = 2.4;
-  endpoint_config.rule2.min_trailing_silence = 1.2;
+  endpoint_config.rule2.min_trailing_silence = 0.8;  // <--tune this value !
   endpoint_config.rule3.min_utterance_length = 300;
 
   decoder_conf.endpoint_config = endpoint_config;
@@ -141,6 +142,7 @@ as the device_name.
 
   std::string last_text;
   int32_t segment_index = 0;
+  sherpa_ncnn::Display display;
   while (!stop) {
     const std::vector<float> samples = alsa.Read(chunk);
 
@@ -153,12 +155,10 @@ as the device_name.
     if (!text.empty() && last_text != text) {
       last_text = text;
 
-      // If you want to display in lower case, please uncomment
-      // the following two lines
-      // std::transform(text.begin(), text.end(), text.begin(),
-      //                [](auto c) { return std::tolower(c); });
+      std::transform(text.begin(), text.end(), text.begin(),
+                     [](auto c) { return std::tolower(c); });
 
-      fprintf(stderr, "%d: %s\n", segment_index, text.c_str());
+      display.Print(segment_index, text);
     }
 
     if (!text.empty() && is_endpoint) {

+ 31 - 5
sherpa-ncnn/csrc/sherpa-ncnn-microphone.cc

@@ -21,6 +21,7 @@
 #include <stdlib.h>
 
 #include "portaudio.h"  // NOLINT
+#include "sherpa-ncnn/csrc/display.h"
 #include "sherpa-ncnn/csrc/microphone.h"
 #include "sherpa-ncnn/csrc/recognizer.h"
 
@@ -97,6 +98,18 @@ for a list of pre-trained models to download.
       decoder_conf.method = method;
     }
   }
+
+  decoder_conf.enable_endpoint = true;
+
+  sherpa_ncnn::EndpointConfig endpoint_config;
+  endpoint_config.rule1.min_trailing_silence = 2.4;
+  endpoint_config.rule2.min_trailing_silence = 0.8;  // <--tune this value !
+  endpoint_config.rule3.min_utterance_length = 300;
+
+  decoder_conf.endpoint_config = endpoint_config;
+
+  fprintf(stderr, "%s\n", decoder_conf.ToString().c_str());
+
   knf::FbankOptions fbank_opts;
   fbank_opts.frame_opts.dither = 0;
   fbank_opts.frame_opts.snip_edges = false;
@@ -153,13 +166,26 @@ for a list of pre-trained models to download.
     exit(EXIT_FAILURE);
   }
 
-  int32_t num_tokens = 0;
+  std::string last_text;
+  int32_t segment_index = 0;
+  sherpa_ncnn::Display display;
   while (!stop) {
     recognizer.Decode();
-    auto result = recognizer.GetResult();
-    if (result.text.size() != num_tokens) {
-      num_tokens = result.text.size();
-      fprintf(stderr, "%s\n", result.text.c_str());
+
+    bool is_endpoint = recognizer.IsEndpoint();
+    auto text = recognizer.GetResult().text;
+
+    if (!text.empty() && last_text != text) {
+      last_text = text;
+
+      std::transform(text.begin(), text.end(), text.begin(),
+                     [](auto c) { return std::tolower(c); });
+
+      display.Print(segment_index, text);
+    }
+
+    if (!text.empty() && is_endpoint) {
+      ++segment_index;
     }
 
     Pa_Sleep(20);  // sleep for 20ms

+ 1 - 1
sherpa-ncnn/csrc/sherpa-ncnn.cc

@@ -20,7 +20,7 @@
 #include <stdio.h>
 
 #include <algorithm>
-#include <chrono>
+#include <chrono>  // NOLINT
 #include <iostream>
 
 #include "net.h"  // NOLINT